/*
 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "wtf/text/TextCodecUTF8.h"

#include "wtf/text/CString.h"
#include "wtf/text/CharacterNames.h"
#include "wtf/text/StringBuffer.h"
#include "wtf/text/TextCodecASCIIFastPath.h"
#include <limits>

using namespace WTF::Unicode;

// Sentinel that the decoding code in this file compares against to detect a
// byte sequence that is not well-formed UTF-8. It is negative, so it can never
// be mistaken for a decoded code point.
const int nonCharacter = -1;

42 PassOwnPtr
<TextCodec
> TextCodecUTF8::create(const TextEncoding
&, const void*)
44 return adoptPtr(new TextCodecUTF8
);
47 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar
)
49 registrar("UTF-8", "UTF-8");
51 // Additional aliases that originally were present in the encoding
52 // table in WebKit on Macintosh, and subsequently added by
53 // TextCodecICU. Perhaps we can prove some are not used on the web
55 registrar("unicode11utf8", "UTF-8");
56 registrar("unicode20utf8", "UTF-8");
57 registrar("utf8", "UTF-8");
58 registrar("x-unicode20utf8", "UTF-8");
60 // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
61 // and Firefox (24), but not in ICU 4.6.
62 registrar("unicode-1-1-utf-8", "UTF-8");
65 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar
)
67 registrar("UTF-8", create
, 0);
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence that
// starts with |firstByte|, or 0 if |firstByte| cannot start a multi-byte
// sequence.
//
// 0 is returned for:
//   0x00-0x7F  ASCII, which is not a multi-byte sequence at all;
//   0x80-0xBF  continuation bytes, which can never be the first byte;
//   0xF5-0xFF  bytes that never appear in well-formed UTF-8.
// 0xC0 and 0xC1 are reported as 2-byte leads even though they could only
// start overlong encodings; decodeNonASCIISequence() rejects them.
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    static const uint8_t lengths[256] = {
        // 0x00-0x7F: ASCII.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 0x80-0xBF: continuation bytes; not valid as a first byte.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 0xC0-0xDF: lead byte of a 2-byte sequence.
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        // 0xE0-0xEF: lead byte of a 3-byte sequence.
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // 0xF0-0xF4: lead byte of a 4-byte sequence; 0xF5-0xFF: invalid.
        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    return lengths[firstByte];
}

93 static inline int decodeNonASCIISequence(const uint8_t* sequence
, unsigned length
)
95 ASSERT(!isASCII(sequence
[0]));
97 ASSERT(sequence
[0] <= 0xDF);
98 if (sequence
[0] < 0xC2)
100 if (sequence
[1] < 0x80 || sequence
[1] > 0xBF)
102 return ((sequence
[0] << 6) + sequence
[1]) - 0x00003080;
105 ASSERT(sequence
[0] >= 0xE0 && sequence
[0] <= 0xEF);
106 switch (sequence
[0]) {
108 if (sequence
[1] < 0xA0 || sequence
[1] > 0xBF)
112 if (sequence
[1] < 0x80 || sequence
[1] > 0x9F)
116 if (sequence
[1] < 0x80 || sequence
[1] > 0xBF)
119 if (sequence
[2] < 0x80 || sequence
[2] > 0xBF)
121 return ((sequence
[0] << 12) + (sequence
[1] << 6) + sequence
[2]) - 0x000E2080;
124 ASSERT(sequence
[0] >= 0xF0 && sequence
[0] <= 0xF4);
125 switch (sequence
[0]) {
127 if (sequence
[1] < 0x90 || sequence
[1] > 0xBF)
131 if (sequence
[1] < 0x80 || sequence
[1] > 0x8F)
135 if (sequence
[1] < 0x80 || sequence
[1] > 0xBF)
138 if (sequence
[2] < 0x80 || sequence
[2] > 0xBF)
140 if (sequence
[3] < 0x80 || sequence
[3] > 0xBF)
142 return ((sequence
[0] << 18) + (sequence
[1] << 12) + (sequence
[2] << 6) + sequence
[3]) - 0x03C82080;
145 static inline UChar
* appendCharacter(UChar
* destination
, int character
)
147 ASSERT(character
!= nonCharacter
);
148 ASSERT(!U_IS_SURROGATE(character
));
149 if (U_IS_BMP(character
))
150 *destination
++ = static_cast<UChar
>(character
);
152 *destination
++ = U16_LEAD(character
);
153 *destination
++ = U16_TRAIL(character
);
158 void TextCodecUTF8::consumePartialSequenceByte()
160 --m_partialSequenceSize
;
161 memmove(m_partialSequence
, m_partialSequence
+ 1, m_partialSequenceSize
);
164 void TextCodecUTF8::handleError(UChar
*& destination
, bool stopOnError
, bool& sawError
)
169 // Each error generates a replacement character and consumes one byte.
170 *destination
++ = replacementCharacter
;
171 consumePartialSequenceByte();
175 bool TextCodecUTF8::handlePartialSequence
<LChar
>(LChar
*& destination
, const uint8_t*& source
, const uint8_t* end
, bool flush
, bool, bool&)
177 ASSERT(m_partialSequenceSize
);
179 if (isASCII(m_partialSequence
[0])) {
180 *destination
++ = m_partialSequence
[0];
181 consumePartialSequenceByte();
184 int count
= nonASCIISequenceLength(m_partialSequence
[0]);
188 if (count
> m_partialSequenceSize
) {
189 if (count
- m_partialSequenceSize
> end
- source
) {
191 // The new data is not enough to complete the sequence, so
192 // add it to the existing partial sequence.
193 memcpy(m_partialSequence
+ m_partialSequenceSize
, source
, end
- source
);
194 m_partialSequenceSize
+= end
- source
;
197 // An incomplete partial sequence at the end is an error, but it will create
198 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
202 memcpy(m_partialSequence
+ m_partialSequenceSize
, source
, count
- m_partialSequenceSize
);
203 source
+= count
- m_partialSequenceSize
;
204 m_partialSequenceSize
= count
;
206 int character
= decodeNonASCIISequence(m_partialSequence
, count
);
207 if (character
& ~0xff)
210 m_partialSequenceSize
-= count
;
211 *destination
++ = static_cast<LChar
>(character
);
212 } while (m_partialSequenceSize
);
218 bool TextCodecUTF8::handlePartialSequence
<UChar
>(UChar
*& destination
, const uint8_t*& source
, const uint8_t* end
, bool flush
, bool stopOnError
, bool& sawError
)
220 ASSERT(m_partialSequenceSize
);
222 if (isASCII(m_partialSequence
[0])) {
223 *destination
++ = m_partialSequence
[0];
224 consumePartialSequenceByte();
227 int count
= nonASCIISequenceLength(m_partialSequence
[0]);
229 handleError(destination
, stopOnError
, sawError
);
234 if (count
> m_partialSequenceSize
) {
235 if (count
- m_partialSequenceSize
> end
- source
) {
237 // The new data is not enough to complete the sequence, so
238 // add it to the existing partial sequence.
239 memcpy(m_partialSequence
+ m_partialSequenceSize
, source
, end
- source
);
240 m_partialSequenceSize
+= end
- source
;
243 // An incomplete partial sequence at the end is an error.
244 handleError(destination
, stopOnError
, sawError
);
249 memcpy(m_partialSequence
+ m_partialSequenceSize
, source
, count
- m_partialSequenceSize
);
250 source
+= count
- m_partialSequenceSize
;
251 m_partialSequenceSize
= count
;
253 int character
= decodeNonASCIISequence(m_partialSequence
, count
);
254 if (character
== nonCharacter
) {
255 handleError(destination
, stopOnError
, sawError
);
261 m_partialSequenceSize
-= count
;
262 destination
= appendCharacter(destination
, character
);
263 } while (m_partialSequenceSize
);
268 String
TextCodecUTF8::decode(const char* bytes
, size_t length
, FlushBehavior flush
, bool stopOnError
, bool& sawError
)
270 // Each input byte might turn into a character.
271 // That includes all bytes in the partial-sequence buffer because
272 // each byte in an invalid sequence will turn into a replacement character.
273 StringBuffer
<LChar
> buffer(m_partialSequenceSize
+ length
);
275 const uint8_t* source
= reinterpret_cast<const uint8_t*>(bytes
);
276 const uint8_t* end
= source
+ length
;
277 const uint8_t* alignedEnd
= alignToMachineWord(end
);
278 LChar
* destination
= buffer
.characters();
281 if (m_partialSequenceSize
) {
282 // Explicitly copy destination and source pointers to avoid taking pointers to the
283 // local variables, which may harm code generation by disabling some optimizations
284 // in some compilers.
285 LChar
* destinationForHandlePartialSequence
= destination
;
286 const uint8_t* sourceForHandlePartialSequence
= source
;
287 if (handlePartialSequence(destinationForHandlePartialSequence
, sourceForHandlePartialSequence
, end
, flush
, stopOnError
, sawError
)) {
288 source
= sourceForHandlePartialSequence
;
289 goto upConvertTo16Bit
;
291 destination
= destinationForHandlePartialSequence
;
292 source
= sourceForHandlePartialSequence
;
293 if (m_partialSequenceSize
)
297 while (source
< end
) {
298 if (isASCII(*source
)) {
299 // Fast path for ASCII. Most UTF-8 text will be ASCII.
300 if (isAlignedToMachineWord(source
)) {
301 while (source
< alignedEnd
) {
302 MachineWord chunk
= *reinterpret_cast_ptr
<const MachineWord
*>(source
);
303 if (!isAllASCII
<LChar
>(chunk
))
305 copyASCIIMachineWord(destination
, source
);
306 source
+= sizeof(MachineWord
);
307 destination
+= sizeof(MachineWord
);
311 if (!isASCII(*source
))
314 *destination
++ = *source
++;
317 int count
= nonASCIISequenceLength(*source
);
320 character
= nonCharacter
;
322 if (count
> end
- source
) {
323 ASSERT_WITH_SECURITY_IMPLICATION(end
- source
< static_cast<ptrdiff_t>(sizeof(m_partialSequence
)));
324 ASSERT(!m_partialSequenceSize
);
325 m_partialSequenceSize
= end
- source
;
326 memcpy(m_partialSequence
, source
, m_partialSequenceSize
);
330 character
= decodeNonASCIISequence(source
, count
);
332 if (character
== nonCharacter
) {
337 goto upConvertTo16Bit
;
339 if (character
> 0xff)
340 goto upConvertTo16Bit
;
343 *destination
++ = static_cast<LChar
>(character
);
345 } while (flush
&& m_partialSequenceSize
);
347 buffer
.shrink(destination
- buffer
.characters());
349 return String::adopt(buffer
);
352 StringBuffer
<UChar
> buffer16(m_partialSequenceSize
+ length
);
354 UChar
* destination16
= buffer16
.characters();
356 // Copy the already converted characters
357 for (LChar
* converted8
= buffer
.characters(); converted8
< destination
;)
358 *destination16
++ = *converted8
++;
361 if (m_partialSequenceSize
) {
362 // Explicitly copy destination and source pointers to avoid taking pointers to the
363 // local variables, which may harm code generation by disabling some optimizations
364 // in some compilers.
365 UChar
* destinationForHandlePartialSequence
= destination16
;
366 const uint8_t* sourceForHandlePartialSequence
= source
;
367 handlePartialSequence(destinationForHandlePartialSequence
, sourceForHandlePartialSequence
, end
, flush
, stopOnError
, sawError
);
368 destination16
= destinationForHandlePartialSequence
;
369 source
= sourceForHandlePartialSequence
;
370 if (m_partialSequenceSize
)
374 while (source
< end
) {
375 if (isASCII(*source
)) {
376 // Fast path for ASCII. Most UTF-8 text will be ASCII.
377 if (isAlignedToMachineWord(source
)) {
378 while (source
< alignedEnd
) {
379 MachineWord chunk
= *reinterpret_cast_ptr
<const MachineWord
*>(source
);
380 if (!isAllASCII
<LChar
>(chunk
))
382 copyASCIIMachineWord(destination16
, source
);
383 source
+= sizeof(MachineWord
);
384 destination16
+= sizeof(MachineWord
);
388 if (!isASCII(*source
))
391 *destination16
++ = *source
++;
394 int count
= nonASCIISequenceLength(*source
);
397 character
= nonCharacter
;
399 if (count
> end
- source
) {
400 ASSERT_WITH_SECURITY_IMPLICATION(end
- source
< static_cast<ptrdiff_t>(sizeof(m_partialSequence
)));
401 ASSERT(!m_partialSequenceSize
);
402 m_partialSequenceSize
= end
- source
;
403 memcpy(m_partialSequence
, source
, m_partialSequenceSize
);
407 character
= decodeNonASCIISequence(source
, count
);
409 if (character
== nonCharacter
) {
413 // Each error generates a replacement character and consumes one byte.
414 *destination16
++ = replacementCharacter
;
419 destination16
= appendCharacter(destination16
, character
);
421 } while (flush
&& m_partialSequenceSize
);
423 buffer16
.shrink(destination16
- buffer16
.characters());
425 return String::adopt(buffer16
);
428 template<typename CharType
>
429 CString
TextCodecUTF8::encodeCommon(const CharType
* characters
, size_t length
)
431 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
432 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
433 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
434 if (length
> numeric_limits
<size_t>::max() / 3)
436 Vector
<uint8_t> bytes(length
* 3);
439 size_t bytesWritten
= 0;
442 U16_NEXT(characters
, i
, length
, character
);
443 // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
444 // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
445 if (0xD800 <= character
&& character
<= 0xDFFF)
446 character
= replacementCharacter
;
447 U8_APPEND_UNSAFE(bytes
.data(), bytesWritten
, character
);
450 return CString(reinterpret_cast<char*>(bytes
.data()), bytesWritten
);
453 CString
TextCodecUTF8::encode(const UChar
* characters
, size_t length
, UnencodableHandling
)
455 return encodeCommon(characters
, length
);
458 CString
TextCodecUTF8::encode(const LChar
* characters
, size_t length
, UnencodableHandling
)
460 return encodeCommon(characters
, length
);