2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include "wtf/text/TextEncoding.h"
31 #include "wtf/text/TextEncodingRegistry.h"
32 #include <unicode/unorm.h>
33 #include "wtf/OwnPtr.h"
34 #include "wtf/StdLibExtras.h"
35 #include "wtf/Threading.h"
36 #include "wtf/text/CString.h"
37 #include "wtf/text/WTFString.h"
41 static const TextEncoding
& UTF7Encoding()
43 AtomicallyInitializedStaticReference(const TextEncoding
, globalUTF7Encoding
, new TextEncoding("UTF-7"));
44 return globalUTF7Encoding
;
// Constructs a TextEncoding from a C-string encoding name. m_name is
// initialized with the result of atomicCanonicalTextEncodingName(name).
// NOTE(review): this listing skips embedded line 52, so the statement
// controlled by the `if` below is not visible here. Judging by the comment it
// presumably invalidates m_name when the literal "replacement" name is
// passed -- confirm against the upstream file.
47 TextEncoding::TextEncoding(const char* name
)
48 : m_name(atomicCanonicalTextEncodingName(name
))
50 // Aliases are valid, but not "replacement" itself.
51 if (m_name
&& isReplacementEncoding(name
))
// Constructs a TextEncoding from a String encoding name. m_name is
// initialized with the result of atomicCanonicalTextEncodingName(name).
// NOTE(review): this listing skips embedded line 60, so the statement
// controlled by the `if` below is not visible here. Judging by the comment it
// presumably invalidates m_name when the literal "replacement" name is
// passed -- confirm against the upstream file.
55 TextEncoding::TextEncoding(const String
& name
)
56 : m_name(atomicCanonicalTextEncodingName(name
))
58 // Aliases are valid, but not "replacement" itself.
59 if (m_name
&& isReplacementEncoding(name
))
// Decodes `length` bytes starting at `data` into a String by delegating to
// the codec returned by newTextCodec(*this). DataEOF, stopOnError and the
// sawError reference are forwarded to the codec's decode() unchanged.
// NOTE(review): embedded lines 65-66 are absent from this listing, so any
// guard that runs before the codec call (e.g. for an encoding with no name)
// is not visible here -- confirm against the upstream file.
63 String
TextEncoding::decode(const char* data
, size_t length
, bool stopOnError
, bool& sawError
) const
68 return newTextCodec(*this)->decode(data
, length
, DataEOF
, stopOnError
, sawError
);
// Encodes `string` into a CString using the codec returned by
// newTextCodec(*this); `handling` is forwarded to the codec's encode().
// NOTE(review): embedded lines 72-78, 81, 83 and 85-86 are absent from this
// listing. The early-exit guards, the condition that selects between the two
// encode() calls below, and the final return are therefore not visible.
// Presumably the choice depends on whether the string is stored as 8-bit or
// 16-bit characters -- confirm against the upstream file.
71 CString
TextEncoding::encode(const String
& string
, UnencodableHandling handling
) const
// A single codec instance is created and used for whichever call below runs.
79 OwnPtr
<TextCodec
> textCodec
= newTextCodec(*this);
80 CString encodedString
;
// Encode from the 8-bit character buffer.
82 encodedString
= textCodec
->encode(string
.characters8(), string
.length(), handling
);
// Encode from the 16-bit character buffer.
84 encodedString
= textCodec
->encode(string
.characters16(), string
.length(), handling
);
// Encodes `string` after bringing 16-bit text into Unicode Normalization
// Form C (NFC) with ICU; `handling` is forwarded to the codec's encode().
// NOTE(review): embedded lines 89-95, 100, 102, 114 and several others are
// absent from this listing. The early-exit guards and the condition guarding
// the characters8() fast path are not visible here; presumably that path is
// taken only for 8-bit strings -- confirm against the upstream file.
88 CString
TextEncoding::normalizeAndEncode(const String
& string
, UnencodableHandling handling
) const
96 // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
97 // unaffected by NFC. This is effectively the same as saying that all
98 // Latin-1 text is already normalized to NFC.
99 // Source: http://unicode.org/reports/tr15/
101 return newTextCodec(*this)->encode(string
.characters8(), string
.length(), handling
);
// From here on the 16-bit buffer is used. `source` and `length` initially
// describe the original characters and are redirected to the normalized
// buffer only when normalization actually runs.
103 const UChar
* source
= string
.characters16();
104 size_t length
= string
.length();
106 Vector
<UChar
> normalizedCharacters
;
// Normalize only when the quick check does not report UNORM_YES.
108 UErrorCode err
= U_ZERO_ERROR
;
109 if (unorm_quickCheck(source
, length
, UNORM_NFC
, &err
) != UNORM_YES
) {
110 // First try using the length of the original string, since normalization to NFC rarely increases length.
111 normalizedCharacters
.grow(length
);
112 int32_t normalizedLength
= unorm_normalize(source
, length
, UNORM_NFC
, 0, normalizedCharacters
.data(), length
, &err
);
// On overflow the first call's return value is used as the new buffer size
// and normalization is run a second time.
// NOTE(review): embedded line 114 is absent from this listing. ICU functions
// return immediately when the incoming UErrorCode already indicates failure,
// so `err` presumably has to be reset before the retry -- confirm.
113 if (err
== U_BUFFER_OVERFLOW_ERROR
) {
115 normalizedCharacters
.resize(normalizedLength
);
116 normalizedLength
= unorm_normalize(source
, length
, UNORM_NFC
, 0, normalizedCharacters
.data(), normalizedLength
, &err
);
118 ASSERT(U_SUCCESS(err
));
// Point the encoder input at the normalized characters.
120 source
= normalizedCharacters
.data();
121 length
= normalizedLength
;
124 return newTextCodec(*this)->encode(source
, length
, handling
);
// Reports whether this encoding uses visual ordering.
// NOTE(review): embedded lines 130 and 133 are absent from this listing, so
// neither return statement is visible. Presumably the result is false when
// noExtendedTextEncodingNameUsed() holds, and otherwise a comparison of
// m_name against the canonical name of "ISO-8859-8" cached in `a` -- confirm
// against the upstream file.
127 bool TextEncoding::usesVisualOrdering() const
129 if (noExtendedTextEncodingNameUsed())
// Canonical name for "ISO-8859-8", looked up once and kept in a function-local static.
132 static const char* const a
= atomicCanonicalTextEncodingName("ISO-8859-8");
136 bool TextEncoding::isNonByteBasedEncoding() const
138 if (noExtendedTextEncodingNameUsed()) {
139 return *this == UTF16LittleEndianEncoding()
140 || *this == UTF16BigEndianEncoding();
143 return *this == UTF16LittleEndianEncoding()
144 || *this == UTF16BigEndianEncoding()
145 || *this == UTF32BigEndianEncoding()
146 || *this == UTF32LittleEndianEncoding();
// Reports whether this encoding equals the file-local UTF7Encoding() instance.
// NOTE(review): embedded line 152 is absent from this listing, so the
// statement controlled by the `if` is not visible. Presumably it returns
// false without constructing the UTF-7 encoding -- confirm against the
// upstream file.
149 bool TextEncoding::isUTF7Encoding() const
151 if (noExtendedTextEncodingNameUsed())
154 return *this == UTF7Encoding();
// Returns UTF8Encoding() when isNonByteBasedEncoding() is true.
// NOTE(review): embedded line 161 is absent from this listing, so the value
// returned for byte-based encodings is not visible. Presumably it is *this --
// confirm against the upstream file.
157 const TextEncoding
& TextEncoding::closestByteBasedEquivalent() const
159 if (isNonByteBasedEncoding())
160 return UTF8Encoding();
164 // HTML5 specifies that UTF-8 be used in form submission when a form is
165 // a part of a document in UTF-16, probably because UTF-16 is not a
166 // byte-based encoding and can contain 0x00. By extension, the same
167 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
168 // but it's fraught with problems and we'd rather steer clear of it.
// NOTE(review): embedded line 173 is absent from this listing, so the value
// returned for all other encodings is not visible. Presumably it is *this --
// confirm against the upstream file.
169 const TextEncoding
& TextEncoding::encodingForFormSubmission() const
171 if (isNonByteBasedEncoding() || isUTF7Encoding())
172 return UTF8Encoding();
176 const TextEncoding
& ASCIIEncoding()
178 AtomicallyInitializedStaticReference(const TextEncoding
, globalASCIIEncoding
, new TextEncoding("ASCII"));
179 return globalASCIIEncoding
;
182 const TextEncoding
& Latin1Encoding()
184 AtomicallyInitializedStaticReference(const TextEncoding
, globalLatin1Encoding
, new TextEncoding("latin1"));
185 return globalLatin1Encoding
;
188 const TextEncoding
& UTF16BigEndianEncoding()
190 AtomicallyInitializedStaticReference(const TextEncoding
, globalUTF16BigEndianEncoding
, new TextEncoding("UTF-16BE"));
191 return globalUTF16BigEndianEncoding
;
194 const TextEncoding
& UTF16LittleEndianEncoding()
196 AtomicallyInitializedStaticReference(const TextEncoding
, globalUTF16LittleEndianEncoding
, new TextEncoding("UTF-16LE"));
197 return globalUTF16LittleEndianEncoding
;
200 const TextEncoding
& UTF32BigEndianEncoding()
202 AtomicallyInitializedStaticReference(const TextEncoding
, globalUTF32BigEndianEncoding
, new TextEncoding("UTF-32BE"));
203 return globalUTF32BigEndianEncoding
;
206 const TextEncoding
& UTF32LittleEndianEncoding()
208 AtomicallyInitializedStaticReference(const TextEncoding
, globalUTF32LittleEndianEncoding
, new TextEncoding("UTF-32LE"));
209 return globalUTF32LittleEndianEncoding
;
212 const TextEncoding
& UTF8Encoding()
214 AtomicallyInitializedStaticReference(const TextEncoding
, globalUTF8Encoding
, new TextEncoding("UTF-8"));
215 ASSERT(globalUTF8Encoding
.isValid());
216 return globalUTF8Encoding
;
219 const TextEncoding
& WindowsLatin1Encoding()
221 AtomicallyInitializedStaticReference(const TextEncoding
, globalWindowsLatin1Encoding
, new TextEncoding("WinLatin1"));
222 return globalWindowsLatin1Encoding
;