Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / third_party / WebKit / Source / wtf / text / UTF8.cpp
blob79abd8c059b1fff01efdbd5030981cb3bfa04032
1 /*
2 * Copyright (C) 2007 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 #include "config.h"
28 #include "wtf/text/UTF8.h"
30 #include "wtf/ASCIICType.h"
31 #include "wtf/StringHasher.h"
32 #include "wtf/text/CharacterNames.h"
34 namespace WTF {
35 namespace Unicode {
// Returns the total number of bytes (2, 3 or 4) in the UTF-8 sequence that
// starts with lead byte |b0|. Returns 0 when |b0| is not a multi-byte lead
// byte: anything below 0xC0 (ASCII and continuation bytes) and anything from
// 0xF8 upwards.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    const unsigned leadByte = static_cast<unsigned char>(b0);

    if (leadByte < 0xC0)
        return 0; // Top two bits are not both set.
    if (leadByte < 0xE0)
        return 2; // 110xxxxx
    if (leadByte < 0xF0)
        return 3; // 1110xxxx
    if (leadByte < 0xF8)
        return 4; // 11110xxx
    return 0; // 11111xxx never starts a sequence.
}
50 inline int inlineUTF8SequenceLength(char b0)
52 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
// Marker bits OR-ed into the first byte of an encoded sequence once the code
// point has been split into 6-bit groups. The table is indexed by the total
// number of bytes in the sequence, so there is one entry per UTF-8 sequence
// type (one-byte sequence, two-byte sequence, and so on). Remember that
// sequences for *legal* UTF-8 are 4 or fewer bytes long in total.
static const unsigned char firstByteMark[7] = {
    0x00, // index 0
    0x00, // 1-byte sequence: no marker bits
    0xC0, // 2-byte sequence
    0xE0, // 3-byte sequence
    0xF0, // 4-byte sequence
    0xF8, // 5-byte sequence
    0xFC  // 6-byte sequence
};
62 ConversionResult convertLatin1ToUTF8(
63 const LChar** sourceStart, const LChar* sourceEnd,
64 char** targetStart, char* targetEnd)
66 ConversionResult result = conversionOK;
67 const LChar* source = *sourceStart;
68 char* target = *targetStart;
69 while (source < sourceEnd) {
70 UChar32 ch;
71 unsigned short bytesToWrite = 0;
72 const UChar32 byteMask = 0xBF;
73 const UChar32 byteMark = 0x80;
74 const LChar* oldSource = source; // In case we have to back up because of target overflow.
75 ch = static_cast<unsigned short>(*source++);
77 // Figure out how many bytes the result will require
78 if (ch < (UChar32)0x80)
79 bytesToWrite = 1;
80 else
81 bytesToWrite = 2;
83 target += bytesToWrite;
84 if (target > targetEnd) {
85 source = oldSource; // Back up source pointer!
86 target -= bytesToWrite;
87 result = targetExhausted;
88 break;
90 switch (bytesToWrite) { // note: everything falls through.
91 case 2:
92 *--target = (char)((ch | byteMark) & byteMask);
93 ch >>= 6;
94 case 1:
95 *--target = (char)(ch | firstByteMark[bytesToWrite]);
97 target += bytesToWrite;
99 *sourceStart = source;
100 *targetStart = target;
101 return result;
104 ConversionResult convertUTF16ToUTF8(
105 const UChar** sourceStart, const UChar* sourceEnd,
106 char** targetStart, char* targetEnd, bool strict)
108 ConversionResult result = conversionOK;
109 const UChar* source = *sourceStart;
110 char* target = *targetStart;
111 while (source < sourceEnd) {
112 UChar32 ch;
113 unsigned short bytesToWrite = 0;
114 const UChar32 byteMask = 0xBF;
115 const UChar32 byteMark = 0x80;
116 const UChar* oldSource = source; // In case we have to back up because of target overflow.
117 ch = static_cast<unsigned short>(*source++);
118 // If we have a surrogate pair, convert to UChar32 first.
119 if (ch >= 0xD800 && ch <= 0xDBFF) {
120 // If the 16 bits following the high surrogate are in the source buffer...
121 if (source < sourceEnd) {
122 UChar32 ch2 = static_cast<unsigned short>(*source);
123 // If it's a low surrogate, convert to UChar32.
124 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
125 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
126 ++source;
127 } else if (strict) { // it's an unpaired high surrogate
128 --source; // return to the illegal value itself
129 result = sourceIllegal;
130 break;
132 } else { // We don't have the 16 bits following the high surrogate.
133 --source; // return to the high surrogate
134 result = sourceExhausted;
135 break;
137 } else if (strict) {
138 // UTF-16 surrogate values are illegal in UTF-32
139 if (ch >= 0xDC00 && ch <= 0xDFFF) {
140 --source; // return to the illegal value itself
141 result = sourceIllegal;
142 break;
145 // Figure out how many bytes the result will require
146 if (ch < (UChar32)0x80) {
147 bytesToWrite = 1;
148 } else if (ch < (UChar32)0x800) {
149 bytesToWrite = 2;
150 } else if (ch < (UChar32)0x10000) {
151 bytesToWrite = 3;
152 } else if (ch < (UChar32)0x110000) {
153 bytesToWrite = 4;
154 } else {
155 bytesToWrite = 3;
156 ch = replacementCharacter;
159 target += bytesToWrite;
160 if (target > targetEnd) {
161 source = oldSource; // Back up source pointer!
162 target -= bytesToWrite;
163 result = targetExhausted;
164 break;
166 switch (bytesToWrite) { // note: everything falls through.
167 case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
168 case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
169 case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
170 case 1: *--target = (char)(ch | firstByteMark[bytesToWrite]);
172 target += bytesToWrite;
174 *sourceStart = source;
175 *targetStart = target;
176 return result;
// Returns true when the |length| bytes at |source| form a well-formed UTF-8
// sequence. This must be called with the length pre-determined by the first
// byte. Any length outside 1..4 yields false: the Unicode definition of UTF-8
// goes up to 4-byte sequences only.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    const unsigned char lead = source[0];

    // The third and fourth bytes, when present, are plain continuation bytes.
    for (int i = length - 1; i >= 2; --i) {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    if (length >= 2) {
        // The second byte is a continuation byte too, but a few lead bytes
        // narrow its range further. Both bounds are enforced for every lead
        // byte; checking only the upper bound for 0xED / 0xF4 would accept an
        // ASCII second byte such as in ED 41 80.
        unsigned char lowest = 0x80;
        unsigned char highest = 0xBF;
        switch (lead) {
        case 0xE0:
            lowest = 0xA0; // Smaller values are overlong 3-byte forms.
            break;
        case 0xED:
            highest = 0x9F; // Larger values encode UTF-16 surrogates.
            break;
        case 0xF0:
            lowest = 0x90; // Smaller values are overlong 4-byte forms.
            break;
        case 0xF4:
            highest = 0x8F; // Larger values lie beyond U+10FFFF.
            break;
        default:
            break;
        }
        const unsigned char second = source[1];
        if (second < lowest || second > highest)
            return false;
    }

    // 0x80..0xBF are continuation bytes and 0xC0/0xC1 only start overlong
    // forms; none of them may lead a sequence.
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    // Lead bytes above 0xF4 would encode values beyond U+10FFFF.
    if (lead > 0xF4)
        return false;
    return true;
}
209 // Magic values subtracted from a buffer value during UTF8 conversion.
210 // This table contains as many values as there might be trailing bytes
211 // in a UTF-8 sequence.
212 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
214 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
216 UChar32 character = 0;
218 // The cases all fall through.
219 switch (length) {
220 case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;
221 case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;
222 case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;
223 case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;
224 case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;
225 case 1: character += static_cast<unsigned char>(*sequence++);
228 return character - offsetsFromUTF8[length - 1];
231 ConversionResult convertUTF8ToUTF16(
232 const char** sourceStart, const char* sourceEnd,
233 UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
235 ConversionResult result = conversionOK;
236 const char* source = *sourceStart;
237 UChar* target = *targetStart;
238 UChar orAllData = 0;
239 while (source < sourceEnd) {
240 int utf8SequenceLength = inlineUTF8SequenceLength(*source);
241 if (sourceEnd - source < utf8SequenceLength) {
242 result = sourceExhausted;
243 break;
245 // Do this check whether lenient or strict
246 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
247 result = sourceIllegal;
248 break;
251 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
253 if (target >= targetEnd) {
254 source -= utf8SequenceLength; // Back up source pointer!
255 result = targetExhausted;
256 break;
259 if (U_IS_BMP(character)) {
260 // UTF-16 surrogate values are illegal in UTF-32
261 if (U_IS_SURROGATE(character)) {
262 if (strict) {
263 source -= utf8SequenceLength; // return to the illegal value itself
264 result = sourceIllegal;
265 break;
266 } else {
267 *target++ = replacementCharacter;
268 orAllData |= replacementCharacter;
270 } else {
271 *target++ = static_cast<UChar>(character); // normal case
272 orAllData |= character;
274 } else if (U_IS_SUPPLEMENTARY(character)) {
275 // target is a character in range 0xFFFF - 0x10FFFF
276 if (target + 1 >= targetEnd) {
277 source -= utf8SequenceLength; // Back up source pointer!
278 result = targetExhausted;
279 break;
281 *target++ = U16_LEAD(character);
282 *target++ = U16_TRAIL(character);
283 orAllData = 0xffff;
284 } else {
285 if (strict) {
286 source -= utf8SequenceLength; // return to the start
287 result = sourceIllegal;
288 break; // Bail out; shouldn't continue
289 } else {
290 *target++ = replacementCharacter;
291 orAllData |= replacementCharacter;
295 *sourceStart = source;
296 *targetStart = target;
298 if (sourceAllASCII)
299 *sourceAllASCII = !(orAllData & ~0x7f);
301 return result;
304 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
306 if (!data)
307 return 0;
309 StringHasher stringHasher;
310 dataLength = 0;
311 utf16Length = 0;
313 while (data < dataEnd || (!dataEnd && *data)) {
314 if (isASCII(*data)) {
315 stringHasher.addCharacter(*data++);
316 dataLength++;
317 utf16Length++;
318 continue;
321 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
322 dataLength += utf8SequenceLength;
324 if (!dataEnd) {
325 for (int i = 1; i < utf8SequenceLength; ++i) {
326 if (!data[i])
327 return 0;
329 } else if (dataEnd - data < utf8SequenceLength)
330 return 0;
332 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
333 return 0;
335 UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
336 ASSERT(!isASCII(character));
338 if (U_IS_BMP(character)) {
339 // UTF-16 surrogate values are illegal in UTF-32
340 if (U_IS_SURROGATE(character))
341 return 0;
342 stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
343 utf16Length++;
344 } else if (U_IS_SUPPLEMENTARY(character)) {
345 stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
346 static_cast<UChar>(U16_TRAIL(character)));
347 utf16Length += 2;
348 } else
349 return 0;
352 return stringHasher.hashWithTop8BitsMasked();
355 template<typename CharType>
356 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
358 while (b < bEnd) {
359 if (isASCII(*b)) {
360 if (*a++ != *b++)
361 return false;
362 continue;
365 int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
367 if (bEnd - b < utf8SequenceLength)
368 return false;
370 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
371 return 0;
373 UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
374 ASSERT(!isASCII(character));
376 if (U_IS_BMP(character)) {
377 // UTF-16 surrogate values are illegal in UTF-32
378 if (U_IS_SURROGATE(character))
379 return false;
380 if (*a++ != character)
381 return false;
382 } else if (U_IS_SUPPLEMENTARY(character)) {
383 if (*a++ != U16_LEAD(character))
384 return false;
385 if (*a++ != U16_TRAIL(character))
386 return false;
387 } else
388 return false;
391 return a == aEnd;
394 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
396 return equalWithUTF8Internal(a, aEnd, b, bEnd);
399 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)
401 return equalWithUTF8Internal(a, aEnd, b, bEnd);
404 } // namespace Unicode
405 } // namespace WTF