cpp/src/IceUtil/ConvertUTF.cpp

   1 /*
   2  * Copyright 2001-2004 Unicode, Inc.
   3  *
   4  * Disclaimer
   5  *
   6  * This source code is provided as is by Unicode, Inc. No claims are
   7  * made as to fitness for any particular purpose. No warranties of any
   8  * kind are expressed or implied. The recipient agrees to determine
   9  * applicability of information provided. If this file has been
  10  * purchased on magnetic or optical media from Unicode, Inc., the
  11  * sole remedy for any claim will be exchange of defective media
  12  * within 90 days of receipt.
  13  *
  14  * Limitations on Rights to Redistribute This Code
  15  *
  16  * Unicode, Inc. hereby grants the right to freely use the information
  17  * supplied in this file in the creation of products supporting the
  18  * Unicode Standard, and to make copies of this file in any form
  19  * for internal or external distribution as long as this notice
  20  * remains attached.
  21  */
  22
  23 // **********************************************************************
  24 //
  25 // Copyright (c) 2003-2011 ZeroC, Inc. All rights reserved.
  26 //
  27 // This copy of Ice is licensed to you under the terms described in the
  28 // ICE_LICENSE file included in this distribution.
  29 //
  30 // **********************************************************************
  31
  32 /* ---------------------------------------------------------------------
  33
  34     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  35     Author: Mark E. Davis, 1994.
  36     Rev History: Rick McGowan, fixes & updates May 2001.
  37     Sept 2001: fixed const & error conditions per
  38         mods suggested by S. Parent & A. Lillich.
  39     June 2002: Tim Dodd added detection and handling of incomplete
  40         source sequences, enhanced error detection, added casts
  41         to eliminate compiler warnings.
  42     July 2003: slight mods to back out aggressive FFFE detection.
  43     Jan 2004: updated switches in from-UTF8 conversions.
  44     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  45
  46     See the header file "ConvertUTF.h" for complete documentation.
  47
  48 ------------------------------------------------------------------------ */
  49
  50
  51 #include <IceUtil/ConvertUTF.h>
  52
  53 #ifdef CVTUTF_DEBUG
  54 #include <stdio.h>
  55 #endif
  56
  57 using namespace IceUtil;
  58
  59 namespace IceUtilInternal
  60 {
  61
  62 static const int halfShift  = 10; /* used for shifting by 10 bits */
  63
  64 static const UTF32 halfBase = 0x0010000UL;
  65 static const UTF32 halfMask = 0x3FFUL;
  66
  67 #define UNI_SUR_HIGH_START  (UTF32)0xD800
  68 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
  69 #define UNI_SUR_LOW_START   (UTF32)0xDC00
  70 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
  71 // #define false           0
  72 // #define true     1
  73
  74
  75 /* --------------------------------------------------------------------- */
  76
  77 /*
  78  * Index into the table below with the first byte of a UTF-8 sequence to
  79  * get the number of trailing bytes that are supposed to follow it.
  80  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
  81  * left as-is for anyone who may want to do such conversion, which was
  82  * allowed in earlier algorithms.
  83  */
  84 static const char trailingBytesForUTF8[256] = {
  85     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  86     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  87     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  88     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  89     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  90     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  91     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  92     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  93 };
  94
  95 /*
  96  * Magic values subtracted from a buffer value during UTF8 conversion.
  97  * This table contains as many values as there might be trailing bytes
  98  * in a UTF-8 sequence.
  99  */
 100 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
 101                      0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 102
 103 /*
 104  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 105  * into the first byte, depending on how many bytes follow.  There are
 106  * as many entries in this table as there are UTF-8 sequence types.
 107  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
 108  * for *legal* UTF-8 will be 4 or fewer bytes total.
 109  */
 110 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 111
 112 /* --------------------------------------------------------------------- */
 113
 114 /* The interface converts a whole buffer to avoid function-call overhead.
 115  * Constants have been gathered. Loops & conditionals have been removed as
 116  * much as possible for efficiency, in favor of drop-through switches.
 117  * (See "Note A" at the bottom of the file for equivalent code.)
 118  * If your compiler supports it, the "isLegalUTF8" call can be turned
 119  * into an inline function.
 120  */
 121
 122 /* --------------------------------------------------------------------- */
 123
 124 ConversionResult ConvertUTF16toUTF8 (
 125         const UTF16** sourceStart, const UTF16* sourceEnd,
 126         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
 127     ConversionResult result = conversionOK;
 128     const UTF16* source = *sourceStart;
 129     UTF8* target = *targetStart;
 130     while (source < sourceEnd) {
 131         UTF32 ch;
 132         unsigned short bytesToWrite = 0;
 133         const UTF32 byteMask = 0xBF;
 134         const UTF32 byteMark = 0x80;
 135         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
 136         ch = *source++;
 137         /* If we have a surrogate pair, convert to UTF32 first. */
 138         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
 139             /* If the 16 bits following the high surrogate are in the source buffer... */
 140             if (source < sourceEnd) {
 141                 UTF32 ch2 = *source;
 142                 /* If it's a low surrogate, convert to UTF32. */
 143                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
 144                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 145                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
 146                     ++source;
 147                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
 148                     --source; /* return to the illegal value itself */
 149                     result = sourceIllegal;
 150                     break;
 151                 }
 152             } else { /* We don't have the 16 bits following the high surrogate. */
 153                 --source; /* return to the high surrogate */
 154                 result = sourceExhausted;
 155                 break;
 156             }
 157         } else if (flags == strictConversion) {
 158             /* UTF-16 surrogate values are illegal in UTF-32 */
 159             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
 160                 --source; /* return to the illegal value itself */
 161                 result = sourceIllegal;
 162                 break;
 163             }
 164         }
 165         /* Figure out how many bytes the result will require */
 166         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
 167         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
 168         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
 169         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
 170         } else {                            bytesToWrite = 3;
 171                                             ch = UNI_REPLACEMENT_CHAR;
 172         }
 173
 174         target += bytesToWrite;
 175         if (target > targetEnd) {
 176             source = oldSource; /* Back up source pointer! */
 177             target -= bytesToWrite; result = targetExhausted; break;
 178         }
 179         switch (bytesToWrite) { /* note: everything falls through. */
 180             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 181             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 182             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 183             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
 184         }
 185         target += bytesToWrite;
 186     }
 187     *sourceStart = source;
 188     *targetStart = target;
 189     return result;
 190 }
 191
 192 /* --------------------------------------------------------------------- */
 193
 194 /*
 195  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 196  * This must be called with the length pre-determined by the first byte.
 197  * If not calling this from ConvertUTF8to*, then the length can be set by:
 198  *  length = trailingBytesForUTF8[*source]+1;
 199  * and the sequence is illegal right away if there aren't that many bytes
 200  * available.
 201  * If presented with a length > 4, this returns false.  The Unicode
 202  * definition of UTF-8 goes up to 4-byte sequences.
 203  */
 204
 205 static Boolean isLegalUTF8(const UTF8 *source, int length) {
 206     UTF8 a;
 207     const UTF8 *srcptr = source+length;
 208     switch (length) {
 209     default: return false;
 210         /* Everything else falls through when "true"... */
 211     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 212     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
 213     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
 214
 215         switch (*source) {
 216             /* no fall-through in this inner switch */
 217             case 0xE0: if (a < 0xA0) return false; break;
 218             case 0xED: if (a > 0x9F) return false; break;
 219             case 0xF0: if (a < 0x90) return false; break;
 220             case 0xF4: if (a > 0x8F) return false; break;
 221             default:   if (a < 0x80) return false;
 222         }
 223
 224     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
 225     }
 226     if (*source > 0xF4) return false;
 227     return true;
 228 }
 229
 230 /* --------------------------------------------------------------------- */
 231
 232 ConversionResult ConvertUTF8toUTF16 (
 233         const UTF8** sourceStart, const UTF8* sourceEnd,
 234         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
 235     ConversionResult result = conversionOK;
 236     const UTF8* source = *sourceStart;
 237     UTF16* target = *targetStart;
 238     while (source < sourceEnd) {
 239         UTF32 ch = 0;
 240         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 241         if (source + extraBytesToRead >= sourceEnd) {
 242             result = sourceExhausted; break;
 243         }
 244         /* Do this check whether lenient or strict */
 245         if (! isLegalUTF8(source, extraBytesToRead+1)) {
 246             result = sourceIllegal;
 247             break;
 248         }
 249         /*
 250          * The cases all fall through. See "Note A" below.
 251          */
 252         switch (extraBytesToRead) {
 253             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
 254             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
 255             case 3: ch += *source++; ch <<= 6;
 256             case 2: ch += *source++; ch <<= 6;
 257             case 1: ch += *source++; ch <<= 6;
 258             case 0: ch += *source++;
 259         }
 260         ch -= offsetsFromUTF8[extraBytesToRead];
 261
 262         if (target >= targetEnd) {
 263             source -= (extraBytesToRead+1); /* Back up source pointer! */
 264             result = targetExhausted; break;
 265         }
 266         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
 267             /* UTF-16 surrogate values are illegal in UTF-32 */
 268             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 269                 if (flags == strictConversion) {
 270                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
 271                     result = sourceIllegal;
 272                     break;
 273                 } else {
 274                     *target++ = UNI_REPLACEMENT_CHAR;
 275                 }
 276             } else {
 277                 *target++ = (UTF16)ch; /* normal case */
 278             }
 279         } else if (ch > UNI_MAX_UTF16) {
 280             if (flags == strictConversion) {
 281                 result = sourceIllegal;
 282                 source -= (extraBytesToRead+1); /* return to the start */
 283                 break; /* Bail out; shouldn't continue */
 284             } else {
 285                 *target++ = UNI_REPLACEMENT_CHAR;
 286             }
 287         } else {
 288             /* target is a character in range 0xFFFF - 0x10FFFF. */
 289             if (target + 1 >= targetEnd) {
 290                 source -= (extraBytesToRead+1); /* Back up source pointer! */
 291                 result = targetExhausted; break;
 292             }
 293             ch -= halfBase;
 294             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
 295             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
 296         }
 297     }
 298     *sourceStart = source;
 299     *targetStart = target;
 300     return result;
 301 }
 302
 303 /* --------------------------------------------------------------------- */
 304
 305 ConversionResult ConvertUTF32toUTF8 (
 306         const UTF32** sourceStart, const UTF32* sourceEnd,
 307         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
 308     ConversionResult result = conversionOK;
 309     const UTF32* source = *sourceStart;
 310     UTF8* target = *targetStart;
 311     while (source < sourceEnd) {
 312         UTF32 ch;
 313         unsigned short bytesToWrite = 0;
 314         const UTF32 byteMask = 0xBF;
 315         const UTF32 byteMark = 0x80;
 316         ch = *source++;
 317         if (flags == strictConversion ) {
 318             /* UTF-16 surrogate values are illegal in UTF-32 */
 319             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 320                 --source; /* return to the illegal value itself */
 321                 result = sourceIllegal;
 322                 break;
 323             }
 324         }
 325         /*
 326          * Figure out how many bytes the result will require. Turn any
 327          * illegally large UTF32 things (> Plane 17) into replacement chars.
 328          */
 329         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
 330         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
 331         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
 332         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
 333         } else {                            bytesToWrite = 3;
 334                                             ch = UNI_REPLACEMENT_CHAR;
 335                                             result = sourceIllegal;
 336         }
 337
 338         target += bytesToWrite;
 339         if (target > targetEnd) {
 340             --source; /* Back up source pointer! */
 341             target -= bytesToWrite; result = targetExhausted; break;
 342         }
 343         switch (bytesToWrite) { /* note: everything falls through. */
 344             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 345             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 346             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
 347             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 348         }
 349         target += bytesToWrite;
 350     }
 351     *sourceStart = source;
 352     *targetStart = target;
 353     return result;
 354 }
 355
 356 /* --------------------------------------------------------------------- */
 357
 358 ConversionResult ConvertUTF8toUTF32 (
 359         const UTF8** sourceStart, const UTF8* sourceEnd,
 360         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
 361     ConversionResult result = conversionOK;
 362     const UTF8* source = *sourceStart;
 363     UTF32* target = *targetStart;
 364     while (source < sourceEnd) {
 365         UTF32 ch = 0;
 366         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 367         if (source + extraBytesToRead >= sourceEnd) {
 368             result = sourceExhausted; break;
 369         }
 370         /* Do this check whether lenient or strict */
 371         if (! isLegalUTF8(source, extraBytesToRead+1)) {
 372             result = sourceIllegal;
 373             break;
 374         }
 375         /*
 376          * The cases all fall through. See "Note A" below.
 377          */
 378         switch (extraBytesToRead) {
 379             case 5: ch += *source++; ch <<= 6;
 380             case 4: ch += *source++; ch <<= 6;
 381             case 3: ch += *source++; ch <<= 6;
 382             case 2: ch += *source++; ch <<= 6;
 383             case 1: ch += *source++; ch <<= 6;
 384             case 0: ch += *source++;
 385         }
 386         ch -= offsetsFromUTF8[extraBytesToRead];
 387
 388         if (target >= targetEnd) {
 389             source -= (extraBytesToRead+1); /* Back up the source pointer! */
 390             result = targetExhausted; break;
 391         }
 392         if (ch <= UNI_MAX_LEGAL_UTF32) {
 393             /*
 394              * UTF-16 surrogate values are illegal in UTF-32, and anything
 395              * over Plane 17 (> 0x10FFFF) is illegal.
 396              */
 397             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
 398                 if (flags == strictConversion) {
 399                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
 400                     result = sourceIllegal;
 401                     break;
 402                 } else {
 403                     *target++ = UNI_REPLACEMENT_CHAR;
 404                 }
 405             } else {
 406                 *target++ = ch;
 407             }
 408         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
 409             result = sourceIllegal;
 410             *target++ = UNI_REPLACEMENT_CHAR;
 411         }
 412     }
 413     *sourceStart = source;
 414     *targetStart = target;
 415     return result;
 416 }
 417
 418 /* ---------------------------------------------------------------------
 419
 420     Note A.
 421     The fall-through switches in UTF-8 reading code save a
 422     temp variable, some decrements & conditionals.  The switches
 423     are equivalent to the following loop:
 424         {
 425             int tmpBytesToRead = extraBytesToRead+1;
 426             do {
 427                 ch += *source++;
 428                 --tmpBytesToRead;
 429                 if (tmpBytesToRead) ch <<= 6;
 430             } while (tmpBytesToRead > 0);
 431         }
 432     In UTF-8 writing code, the switches on "bytesToWrite" are
 433     similarly unrolled loops.
 434
 435    --------------------------------------------------------------------- */
 436 }
 437
 438 namespace IceUtil
 439 {
 440
 441 using namespace IceUtilInternal;
 442
 443 /* --------------------------------------------------------------------- */
 444
 445 /*
 446  * Exported function to return whether a UTF-8 sequence is legal or not.
 447  * This is not used here; it's just exported.
 448  */
 449 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
 450     if(source == sourceEnd) {
 451         return true;
 452     }
 453     while(true) {
 454         int length = trailingBytesForUTF8[*source]+1;
 455         // Is buffer big enough to contain character?
 456         if (source+length > sourceEnd) {
 457             return false;
 458         }
 459         // Is character legal UTF8?
 460         if(!isLegalUTF8(source, length)) {
 461             return false;
 462         }
 463         // Are we at end of buffer?
 464         source += length;
 465         if(source == sourceEnd) {
 466             return true;
 467         }
 468     }
 469 }
 470
 471 }