third_party/WebKit/Source/wtf/text/UTF8.cpp

   1 /*
   2  * Copyright (C) 2007 Apple Inc.  All rights reserved.
   3  * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include "config.h"
  28 #include "wtf/text/UTF8.h"
  29
  30 #include "wtf/ASCIICType.h"
  31 #include "wtf/StringHasher.h"
  32 #include "wtf/text/CharacterNames.h"
  33
  34 namespace WTF {
  35 namespace Unicode {
  36
  37 inline int inlineUTF8SequenceLengthNonASCII(char b0)
  38 {
  39     if ((b0 & 0xC0) != 0xC0)
  40         return 0;
  41     if ((b0 & 0xE0) == 0xC0)
  42         return 2;
  43     if ((b0 & 0xF0) == 0xE0)
  44         return 3;
  45     if ((b0 & 0xF8) == 0xF0)
  46         return 4;
  47     return 0;
  48 }
  49
  50 inline int inlineUTF8SequenceLength(char b0)
  51 {
  52     return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
  53 }
  54
  55 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  56 // into the first byte, depending on how many bytes follow.  There are
  57 // as many entries in this table as there are UTF-8 sequence types.
  58 // (I.e., one byte sequence, two byte... etc.). Remember that sequencs
  59 // for *legal* UTF-8 will be 4 or fewer bytes total.
  60 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  61
  62 ConversionResult convertLatin1ToUTF8(
  63                                      const LChar** sourceStart, const LChar* sourceEnd,
  64                                      char** targetStart, char* targetEnd)
  65 {
  66     ConversionResult result = conversionOK;
  67     const LChar* source = *sourceStart;
  68     char* target = *targetStart;
  69     while (source < sourceEnd) {
  70         UChar32 ch;
  71         unsigned short bytesToWrite = 0;
  72         const UChar32 byteMask = 0xBF;
  73         const UChar32 byteMark = 0x80;
  74         const LChar* oldSource = source; // In case we have to back up because of target overflow.
  75         ch = static_cast<unsigned short>(*source++);
  76
  77         // Figure out how many bytes the result will require
  78         if (ch < (UChar32)0x80)
  79             bytesToWrite = 1;
  80         else
  81             bytesToWrite = 2;
  82
  83         target += bytesToWrite;
  84         if (target > targetEnd) {
  85             source = oldSource; // Back up source pointer!
  86             target -= bytesToWrite;
  87             result = targetExhausted;
  88             break;
  89         }
  90         switch (bytesToWrite) { // note: everything falls through.
  91         case 2:
  92             *--target = (char)((ch | byteMark) & byteMask);
  93             ch >>= 6;
  94         case 1:
  95             *--target =  (char)(ch | firstByteMark[bytesToWrite]);
  96         }
  97         target += bytesToWrite;
  98     }
  99     *sourceStart = source;
 100     *targetStart = target;
 101     return result;
 102 }
 103
 104 ConversionResult convertUTF16ToUTF8(
 105     const UChar** sourceStart, const UChar* sourceEnd,
 106     char** targetStart, char* targetEnd, bool strict)
 107 {
 108     ConversionResult result = conversionOK;
 109     const UChar* source = *sourceStart;
 110     char* target = *targetStart;
 111     while (source < sourceEnd) {
 112         UChar32 ch;
 113         unsigned short bytesToWrite = 0;
 114         const UChar32 byteMask = 0xBF;
 115         const UChar32 byteMark = 0x80;
 116         const UChar* oldSource = source; // In case we have to back up because of target overflow.
 117         ch = static_cast<unsigned short>(*source++);
 118         // If we have a surrogate pair, convert to UChar32 first.
 119         if (ch >= 0xD800 && ch <= 0xDBFF) {
 120             // If the 16 bits following the high surrogate are in the source buffer...
 121             if (source < sourceEnd) {
 122                 UChar32 ch2 = static_cast<unsigned short>(*source);
 123                 // If it's a low surrogate, convert to UChar32.
 124                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
 125                     ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
 126                     ++source;
 127                 } else if (strict) { // it's an unpaired high surrogate
 128                     --source; // return to the illegal value itself
 129                     result = sourceIllegal;
 130                     break;
 131                 }
 132             } else { // We don't have the 16 bits following the high surrogate.
 133                 --source; // return to the high surrogate
 134                 result = sourceExhausted;
 135                 break;
 136             }
 137         } else if (strict) {
 138             // UTF-16 surrogate values are illegal in UTF-32
 139             if (ch >= 0xDC00 && ch <= 0xDFFF) {
 140                 --source; // return to the illegal value itself
 141                 result = sourceIllegal;
 142                 break;
 143             }
 144         }
 145         // Figure out how many bytes the result will require
 146         if (ch < (UChar32)0x80) {
 147             bytesToWrite = 1;
 148         } else if (ch < (UChar32)0x800) {
 149             bytesToWrite = 2;
 150         } else if (ch < (UChar32)0x10000) {
 151             bytesToWrite = 3;
 152         } else if (ch < (UChar32)0x110000) {
 153             bytesToWrite = 4;
 154         } else {
 155             bytesToWrite = 3;
 156             ch = replacementCharacter;
 157         }
 158
 159         target += bytesToWrite;
 160         if (target > targetEnd) {
 161             source = oldSource; // Back up source pointer!
 162             target -= bytesToWrite;
 163             result = targetExhausted;
 164             break;
 165         }
 166         switch (bytesToWrite) { // note: everything falls through.
 167             case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
 168             case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
 169             case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
 170             case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);
 171         }
 172         target += bytesToWrite;
 173     }
 174     *sourceStart = source;
 175     *targetStart = target;
 176     return result;
 177 }
 178
 179 // This must be called with the length pre-determined by the first byte.
 180 // If presented with a length > 4, this returns false.  The Unicode
 181 // definition of UTF-8 goes up to 4-byte sequences.
 182 static bool isLegalUTF8(const unsigned char* source, int length)
 183 {
 184     unsigned char a;
 185     const unsigned char* srcptr = source + length;
 186     switch (length) {
 187         default: return false;
 188         // Everything else falls through when "true"...
 189         case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 190         case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 191         case 2: if ((a = (*--srcptr)) > 0xBF) return false;
 192
 193         switch (*source) {
 194             // no fall-through in this inner switch
 195             case 0xE0: if (a < 0xA0) return false; break;
 196             case 0xED: if (a > 0x9F) return false; break;
 197             case 0xF0: if (a < 0x90) return false; break;
 198             case 0xF4: if (a > 0x8F) return false; break;
 199             default:   if (a < 0x80) return false;
 200         }
 201
 202         case 1: if (*source >= 0x80 && *source < 0xC2) return false;
 203     }
 204     if (*source > 0xF4)
 205         return false;
 206     return true;
 207 }
 208
 209 // Magic values subtracted from a buffer value during UTF8 conversion.
 210 // This table contains as many values as there might be trailing bytes
 211 // in a UTF-8 sequence.
 212 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
 213
 214 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
 215 {
 216     UChar32 character = 0;
 217
 218     // The cases all fall through.
 219     switch (length) {
 220         case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;
 221         case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;
 222         case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;
 223         case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;
 224         case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;
 225         case 1: character += static_cast<unsigned char>(*sequence++);
 226     }
 227
 228     return character - offsetsFromUTF8[length - 1];
 229 }
 230
 231 ConversionResult convertUTF8ToUTF16(
 232     const char** sourceStart, const char* sourceEnd,
 233     UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
 234 {
 235     ConversionResult result = conversionOK;
 236     const char* source = *sourceStart;
 237     UChar* target = *targetStart;
 238     UChar orAllData = 0;
 239     while (source < sourceEnd) {
 240         int utf8SequenceLength = inlineUTF8SequenceLength(*source);
 241         if (sourceEnd - source < utf8SequenceLength)  {
 242             result = sourceExhausted;
 243             break;
 244         }
 245         // Do this check whether lenient or strict
 246         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
 247             result = sourceIllegal;
 248             break;
 249         }
 250
 251         UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
 252
 253         if (target >= targetEnd) {
 254             source -= utf8SequenceLength; // Back up source pointer!
 255             result = targetExhausted;
 256             break;
 257         }
 258
 259         if (U_IS_BMP(character)) {
 260             // UTF-16 surrogate values are illegal in UTF-32
 261             if (U_IS_SURROGATE(character)) {
 262                 if (strict) {
 263                     source -= utf8SequenceLength; // return to the illegal value itself
 264                     result = sourceIllegal;
 265                     break;
 266                 } else {
 267                     *target++ = replacementCharacter;
 268                     orAllData |= replacementCharacter;
 269                 }
 270             } else {
 271                 *target++ = static_cast<UChar>(character); // normal case
 272                 orAllData |= character;
 273             }
 274         } else if (U_IS_SUPPLEMENTARY(character)) {
 275             // target is a character in range 0xFFFF - 0x10FFFF
 276             if (target + 1 >= targetEnd) {
 277                 source -= utf8SequenceLength; // Back up source pointer!
 278                 result = targetExhausted;
 279                 break;
 280             }
 281             *target++ = U16_LEAD(character);
 282             *target++ = U16_TRAIL(character);
 283             orAllData = 0xffff;
 284         } else {
 285             if (strict) {
 286                 source -= utf8SequenceLength; // return to the start
 287                 result = sourceIllegal;
 288                 break; // Bail out; shouldn't continue
 289             } else {
 290                 *target++ = replacementCharacter;
 291                 orAllData |= replacementCharacter;
 292             }
 293         }
 294     }
 295     *sourceStart = source;
 296     *targetStart = target;
 297
 298     if (sourceAllASCII)
 299         *sourceAllASCII = !(orAllData & ~0x7f);
 300
 301     return result;
 302 }
 303
 304 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
 305 {
 306     if (!data)
 307         return 0;
 308
 309     StringHasher stringHasher;
 310     dataLength = 0;
 311     utf16Length = 0;
 312
 313     while (data < dataEnd || (!dataEnd && *data)) {
 314         if (isASCII(*data)) {
 315             stringHasher.addCharacter(*data++);
 316             dataLength++;
 317             utf16Length++;
 318             continue;
 319         }
 320
 321         int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
 322         dataLength += utf8SequenceLength;
 323
 324         if (!dataEnd) {
 325             for (int i = 1; i < utf8SequenceLength; ++i) {
 326                 if (!data[i])
 327                     return 0;
 328             }
 329         } else if (dataEnd - data < utf8SequenceLength)
 330             return 0;
 331
 332         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
 333             return 0;
 334
 335         UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
 336         ASSERT(!isASCII(character));
 337
 338         if (U_IS_BMP(character)) {
 339             // UTF-16 surrogate values are illegal in UTF-32
 340             if (U_IS_SURROGATE(character))
 341                 return 0;
 342             stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
 343             utf16Length++;
 344         } else if (U_IS_SUPPLEMENTARY(character)) {
 345             stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
 346                                        static_cast<UChar>(U16_TRAIL(character)));
 347             utf16Length += 2;
 348         } else
 349             return 0;
 350     }
 351
 352     return stringHasher.hashWithTop8BitsMasked();
 353 }
 354
 355 template<typename CharType>
 356 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
 357 {
 358     while (b < bEnd) {
 359         if (isASCII(*b)) {
 360             if (*a++ != *b++)
 361                 return false;
 362             continue;
 363         }
 364
 365         int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
 366
 367         if (bEnd - b < utf8SequenceLength)
 368             return false;
 369
 370         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
 371             return 0;
 372
 373         UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
 374         ASSERT(!isASCII(character));
 375
 376         if (U_IS_BMP(character)) {
 377             // UTF-16 surrogate values are illegal in UTF-32
 378             if (U_IS_SURROGATE(character))
 379                 return false;
 380             if (*a++ != character)
 381                 return false;
 382         } else if (U_IS_SUPPLEMENTARY(character)) {
 383             if (*a++ != U16_LEAD(character))
 384                 return false;
 385             if (*a++ != U16_TRAIL(character))
 386                 return false;
 387         } else
 388             return false;
 389     }
 390
 391     return a == aEnd;
 392 }
 393
 394 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
 395 {
 396     return equalWithUTF8Internal(a, aEnd, b, bEnd);
 397 }
 398
 399 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)
 400 {
 401     return equalWithUTF8Internal(a, aEnd, b, bEnd);
 402 }
 403
 404 } // namespace Unicode
 405 } // namespace WTF