Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / third_party / WebKit / Source / wtf / text / TextEncoding.cpp
blob6e7bf1cf75abd85b80bb064ae7df152313510cdb
1 /*
2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "config.h"
29 #include "wtf/text/TextEncoding.h"
31 #include "wtf/text/TextEncodingRegistry.h"
32 #include <unicode/unorm.h>
33 #include "wtf/OwnPtr.h"
34 #include "wtf/StdLibExtras.h"
35 #include "wtf/Threading.h"
36 #include "wtf/text/CString.h"
37 #include "wtf/text/WTFString.h"
39 namespace WTF {
41 static const TextEncoding& UTF7Encoding()
43 AtomicallyInitializedStaticReference(const TextEncoding, globalUTF7Encoding, new TextEncoding("UTF-7"));
44 return globalUTF7Encoding;
47 TextEncoding::TextEncoding(const char* name)
48 : m_name(atomicCanonicalTextEncodingName(name))
50 // Aliases are valid, but not "replacement" itself.
51 if (m_name && isReplacementEncoding(name))
52 m_name = 0;
55 TextEncoding::TextEncoding(const String& name)
56 : m_name(atomicCanonicalTextEncodingName(name))
58 // Aliases are valid, but not "replacement" itself.
59 if (m_name && isReplacementEncoding(name))
60 m_name = 0;
63 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
65 if (!m_name)
66 return String();
68 return newTextCodec(*this)->decode(data, length, DataEOF, stopOnError, sawError);
71 CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
73 if (!m_name)
74 return CString();
76 if (string.isEmpty())
77 return "";
79 OwnPtr<TextCodec> textCodec = newTextCodec(*this);
80 CString encodedString;
81 if (string.is8Bit())
82 encodedString = textCodec->encode(string.characters8(), string.length(), handling);
83 else
84 encodedString = textCodec->encode(string.characters16(), string.length(), handling);
85 return encodedString;
88 CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
90 if (!m_name)
91 return CString();
93 if (string.isEmpty())
94 return "";
96 // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
97 // unaffected by NFC. This is effectively the same as saying that all
98 // Latin-1 text is already normalized to NFC.
99 // Source: http://unicode.org/reports/tr15/
100 if (string.is8Bit())
101 return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);
103 const UChar* source = string.characters16();
104 size_t length = string.length();
106 Vector<UChar> normalizedCharacters;
108 UErrorCode err = U_ZERO_ERROR;
109 if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
110 // First try using the length of the original string, since normalization to NFC rarely increases length.
111 normalizedCharacters.grow(length);
112 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
113 if (err == U_BUFFER_OVERFLOW_ERROR) {
114 err = U_ZERO_ERROR;
115 normalizedCharacters.resize(normalizedLength);
116 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
118 ASSERT(U_SUCCESS(err));
120 source = normalizedCharacters.data();
121 length = normalizedLength;
124 return newTextCodec(*this)->encode(source, length, handling);
127 bool TextEncoding::usesVisualOrdering() const
129 if (noExtendedTextEncodingNameUsed())
130 return false;
132 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
133 return m_name == a;
136 bool TextEncoding::isNonByteBasedEncoding() const
138 if (noExtendedTextEncodingNameUsed()) {
139 return *this == UTF16LittleEndianEncoding()
140 || *this == UTF16BigEndianEncoding();
143 return *this == UTF16LittleEndianEncoding()
144 || *this == UTF16BigEndianEncoding()
145 || *this == UTF32BigEndianEncoding()
146 || *this == UTF32LittleEndianEncoding();
149 bool TextEncoding::isUTF7Encoding() const
151 if (noExtendedTextEncodingNameUsed())
152 return false;
154 return *this == UTF7Encoding();
157 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
159 if (isNonByteBasedEncoding())
160 return UTF8Encoding();
161 return *this;
164 // HTML5 specifies that UTF-8 be used in form submission when a form is
165 // is a part of a document in UTF-16 probably because UTF-16 is not a
166 // byte-based encoding and can contain 0x00. By extension, the same
167 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
168 // but it's fraught with problems and we'd rather steer clear of it.
169 const TextEncoding& TextEncoding::encodingForFormSubmission() const
171 if (isNonByteBasedEncoding() || isUTF7Encoding())
172 return UTF8Encoding();
173 return *this;
176 const TextEncoding& ASCIIEncoding()
178 AtomicallyInitializedStaticReference(const TextEncoding, globalASCIIEncoding, new TextEncoding("ASCII"));
179 return globalASCIIEncoding;
182 const TextEncoding& Latin1Encoding()
184 AtomicallyInitializedStaticReference(const TextEncoding, globalLatin1Encoding, new TextEncoding("latin1"));
185 return globalLatin1Encoding;
188 const TextEncoding& UTF16BigEndianEncoding()
190 AtomicallyInitializedStaticReference(const TextEncoding, globalUTF16BigEndianEncoding, new TextEncoding("UTF-16BE"));
191 return globalUTF16BigEndianEncoding;
194 const TextEncoding& UTF16LittleEndianEncoding()
196 AtomicallyInitializedStaticReference(const TextEncoding, globalUTF16LittleEndianEncoding, new TextEncoding("UTF-16LE"));
197 return globalUTF16LittleEndianEncoding;
200 const TextEncoding& UTF32BigEndianEncoding()
202 AtomicallyInitializedStaticReference(const TextEncoding, globalUTF32BigEndianEncoding, new TextEncoding("UTF-32BE"));
203 return globalUTF32BigEndianEncoding;
206 const TextEncoding& UTF32LittleEndianEncoding()
208 AtomicallyInitializedStaticReference(const TextEncoding, globalUTF32LittleEndianEncoding, new TextEncoding("UTF-32LE"));
209 return globalUTF32LittleEndianEncoding;
212 const TextEncoding& UTF8Encoding()
214 AtomicallyInitializedStaticReference(const TextEncoding, globalUTF8Encoding, new TextEncoding("UTF-8"));
215 ASSERT(globalUTF8Encoding.isValid());
216 return globalUTF8Encoding;
219 const TextEncoding& WindowsLatin1Encoding()
221 AtomicallyInitializedStaticReference(const TextEncoding, globalWindowsLatin1Encoding, new TextEncoding("WinLatin1"));
222 return globalWindowsLatin1Encoding;
225 } // namespace WTF