ACE/ace/UTF16_Encoding_Converter.cpp

   1 // ======================================================================
   2 //
   3 // The actual conversion methods are covered by the copyright information
   4 // below.  It is not the actual code provided by Unicode, Inc. but is an
   5 // ACE-ified and only slightly modified version.
   6 // Chad Elliott 4/28/2005
   7 //
   8 // Copyright 2001-2004 Unicode, Inc.
   9 //
  10 // Limitations on Rights to Redistribute This Code
  11 //
  12 // Unicode, Inc. hereby grants the right to freely use the information
  13 // supplied in this file in the creation of products supporting the
  14 // Unicode Standard, and to make copies of this file in any form
  15 // for internal or external distribution as long as this notice
  16 // remains attached.
  17 //
  18 // ======================================================================
  19
  20 #include "ace/UTF16_Encoding_Converter.h"
  21
  22 #if defined (ACE_USES_WCHAR)
  23 #include "ace/OS_NS_stdio.h"
  24 #include "ace/OS_Memory.h"
  25 #include "ace/Min_Max.h"
  26
  27 #if !defined (__ACE_INLINE__)
  28 #include "ace/UTF16_Encoding_Converter.inl"
  29 #endif /* __ACE_INLINE__ */
  30
  31 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
  32
  33 static constexpr ACE_UINT32 halfShift = 10;
  34 static constexpr ACE_UINT32 halfBase  = 0x00010000;
  35 static constexpr ACE_UINT32 halfMask  = 0x000003FF;
  36
  37 static constexpr ACE_UINT32 UNI_SUR_HIGH_START   = 0x0000D800;
  38 static constexpr ACE_UINT32 UNI_SUR_HIGH_END     = 0x0000DBFF;
  39 static constexpr ACE_UINT32 UNI_SUR_LOW_START    = 0x0000DC00;
  40 static constexpr ACE_UINT32 UNI_SUR_LOW_END      = 0x0000DFFF;
  41 static constexpr ACE_UINT32 UNI_REPLACEMENT_CHAR = 0x0000FFFD;
  42 static constexpr ACE_UINT32 UNI_MAX_BMP          = 0x0000FFFF;
  43 static constexpr ACE_UINT32 UNI_MAX_UTF16        = 0x0010FFFF;
  44
  45 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
  46 // into the first byte, depending on how many bytes follow.  There are
  47 // as many entries in this table as there are UTF-8 sequence types.
  48 // (I.e., one byte sequence, two byte... etc.). Remember that sequencs
  49 // for *legal* UTF-8 will be 4 or fewer bytes total.
  50 static const ACE_Byte firstByteMark[7] = { 0x00, 0x00, 0xC0,
  51                                            0xE0, 0xF0, 0xF8, 0xFC };
  52
  53 // Index into the table below with the first byte of a UTF-8 sequence to
  54 // get the number of trailing bytes that are supposed to follow it.
  55 // Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
  56 // left as-is for anyone who may want to do such conversion, which was
  57 // allowed in earlier algorithms.
  58 static const ACE_Byte trailingBytesForUTF8[256] = {
  59     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  60     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  61     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  62     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  63     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  64     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  65     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  66     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  67 };
  68
  69 // Magic values subtracted from a buffer value during UTF8 conversion.
  70 // This table contains as many values as there might be trailing bytes
  71 // in a UTF-8 sequence.
  72 static const ACE_UINT32 offsetsFromUTF8[6] = { 0x00000000, 0x00003080,
  73                                                0x000E2080, 0x03C82080,
  74                                                0xFA082080, 0x82082080 };
  75
  76
  77 ACE_UTF16_Encoding_Converter::ACE_UTF16_Encoding_Converter (bool swap)
  78  : swap_ (swap)
  79 {
  80 }
  81
  82 ACE_UTF16_Encoding_Converter::~ACE_UTF16_Encoding_Converter ()
  83 {
  84 }
  85
  86 ACE_UTF16_Encoding_Converter::Result
  87 ACE_UTF16_Encoding_Converter::to_utf8 (const void* source,
  88                                        size_t source_size,
  89                                        ACE_Byte* target,
  90                                        size_t target_size,
  91                                        bool strict)
  92 {
  93   static const ACE_UINT32 byteMask = 0xBF;
  94   static const ACE_UINT32 byteMark = 0x80;
  95   Result result = CONVERSION_OK;
  96
  97   ACE_Byte* targetEnd = target + target_size;
  98   const ACE_UINT16* sourceStart = static_cast<const ACE_UINT16*> (source);
  99   const ACE_UINT16* sourceEnd   = sourceStart +
 100                                   (source_size / sizeof (ACE_UINT16));
 101
 102   while (sourceStart < sourceEnd)
 103     {
 104       ACE_UINT16 nw = *sourceStart++;
 105       ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_WORD (nw) : nw);
 106
 107       // If we have a surrogate pair, convert to ACE_UINT32 first.
 108       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
 109         {
 110           // If the 16 bits following the high surrogate are in the
 111           // sourceStart buffer...
 112           if (sourceStart < sourceEnd)
 113             {
 114               ACE_UINT32 ch2 = (this->swap_ ? ACE_SWAP_WORD (*sourceStart) :
 115                                               *sourceStart);
 116               // If it's a low surrogate, convert to ACE_UINT32.
 117               if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
 118                 {
 119                   ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 120                     + (ch2 - UNI_SUR_LOW_START) + halfBase;
 121                   ++sourceStart;
 122                 }
 123               else if (strict)
 124                 {
 125                   // it's an unpaired high surrogate
 126                   result = SOURCE_ILLEGAL;
 127                   break;
 128                 }
 129             }
 130           else
 131             {
 132               // We don't have the 16 bits following the high surrogate.
 133               result = SOURCE_EXHAUSTED;
 134               break;
 135             }
 136         }
 137       else if (strict)
 138         {
 139           // UTF-16 surrogate values are illegal in UTF-32
 140           if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 141             {
 142               result = SOURCE_ILLEGAL;
 143               break;
 144             }
 145         }
 146
 147       // Figure out how many bytes the result will require
 148       unsigned short bytesToWrite = 0;
 149       if (ch < 0x80)
 150         bytesToWrite = 1;
 151       else if (ch < 0x800)
 152         bytesToWrite = 2;
 153       else if (ch < 0x10000)
 154         bytesToWrite = 3;
 155       else if (ch < 0x110000)
 156         bytesToWrite = 4;
 157       else
 158         {
 159           bytesToWrite = 3;
 160           ch = UNI_REPLACEMENT_CHAR;
 161         }
 162
 163       target += bytesToWrite;
 164       if (target > targetEnd)
 165         {
 166           result = TARGET_EXHAUSTED;
 167           break;
 168         }
 169
 170       // NOTE: Everything falls through for efficiency purposes.
 171       switch (bytesToWrite)
 172         {
 173         case 4:
 174           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
 175           ch >>= 6;
 176         case 3:
 177           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
 178           ch >>= 6;
 179         case 2:
 180           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
 181           ch >>= 6;
 182         case 1:
 183           *--target = (ACE_Byte)(ch | firstByteMark[bytesToWrite]);
 184         }
 185       target += bytesToWrite;
 186     }
 187
 188   return result;
 189 }
 190
 191 ACE_UTF16_Encoding_Converter::Result
 192 ACE_UTF16_Encoding_Converter::from_utf8 (const ACE_Byte* source,
 193                                          size_t source_size,
 194                                          void* target,
 195                                          size_t target_size,
 196                                          bool strict)
 197 {
 198   Result result = CONVERSION_OK;
 199   const ACE_Byte* sourceEnd = source + source_size;
 200   ACE_UINT16* targetStart   = static_cast<ACE_UINT16*> (target);
 201   ACE_UINT16* targetEnd     = targetStart + target_size;
 202
 203   while (source < sourceEnd)
 204     {
 205       ACE_UINT32 ch = 0;
 206       unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 207       if (source + extraBytesToRead >= sourceEnd)
 208         {
 209           result = SOURCE_EXHAUSTED;
 210           break;
 211         }
 212
 213       // Do this check whether lenient or strict
 214       if (!this->is_legal_utf8 (source, extraBytesToRead + 1))
 215         {
 216           result = SOURCE_ILLEGAL;
 217           break;
 218         }
 219
 220       // The cases all fall through. See "Note A" below.
 221       switch (extraBytesToRead)
 222         {
 223         case 5: // remember, illegal UTF-8
 224           ch += *source++;
 225           ch <<= 6;
 226         case 4: // remember, illegal UTF-8
 227           ch += *source++;
 228           ch <<= 6;
 229         case 3:
 230           ch += *source++;
 231           ch <<= 6;
 232         case 2:
 233           ch += *source++;
 234           ch <<= 6;
 235         case 1:
 236           ch += *source++;
 237           ch <<= 6;
 238         case 0:
 239           ch += *source++;
 240       }
 241       ch -= offsetsFromUTF8[extraBytesToRead];
 242
 243       if (targetStart >= targetEnd)
 244         {
 245           result = TARGET_EXHAUSTED;
 246           break;
 247         }
 248
 249       if (ch <= UNI_MAX_BMP) // Target is a character <= 0xFFFF
 250         {
 251           // UTF-16 surrogate values are illegal in UTF-32
 252           if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 253             {
 254               if (strict)
 255                 {
 256                   result = SOURCE_ILLEGAL;
 257                   break;
 258                 }
 259               else
 260                 {
 261                   *targetStart++ = UNI_REPLACEMENT_CHAR;
 262                 }
 263             }
 264           else
 265             {
 266               *targetStart++ = (ACE_UINT16)ch;
 267             }
 268         }
 269       else if (ch > UNI_MAX_UTF16)
 270         {
 271           if (strict)
 272             {
 273               result = SOURCE_ILLEGAL;
 274               break;
 275             }
 276           else
 277             {
 278               *targetStart++ = UNI_REPLACEMENT_CHAR;
 279             }
 280         }
 281       else
 282         {
 283           // targetStart is a character in range 0xFFFF - 0x10FFFF.
 284           if (targetStart + 1 >= targetEnd)
 285             {
 286               result = TARGET_EXHAUSTED;
 287               break;
 288             }
 289           ch -= halfBase;
 290           *targetStart++ = (ACE_UINT16)((ch >> halfShift) + UNI_SUR_HIGH_START);
 291           *targetStart++ = (ACE_UINT16)((ch & halfMask) + UNI_SUR_LOW_START);
 292         }
 293     }
 294
 295   return result;
 296 }
 297
 298 ACE_UTF16_Encoding_Converter*
 299 ACE_UTF16_Encoding_Converter::encoded (const ACE_Byte* source,
 300                                        size_t source_size)
 301 {
 302   static const size_t begin = 16;
 303   static const size_t converted = begin * 4;
 304
 305   ACE_Byte target[converted];
 306   ACE_UTF16_Encoding_Converter* converter = 0;
 307   ACE_NEW_RETURN (converter,
 308                   ACE_UTF16_Encoding_Converter (false),
 309                   0);
 310   if (converter->to_utf8 (source,
 311                           ACE_MIN (begin, source_size),
 312                           target,
 313                           converted) == CONVERSION_OK)
 314     {
 315       return converter;
 316     }
 317   else
 318     {
 319       delete converter;
 320     }
 321
 322   return 0;
 323 }
 324
 325 ACE_UINT32
 326 ACE_UTF16_Encoding_Converter::get_UNI_SUR_HIGH_START ()
 327 {
 328   return UNI_SUR_HIGH_START;
 329 }
 330
 331 ACE_UINT32
 332 ACE_UTF16_Encoding_Converter::get_UNI_SUR_LOW_END ()
 333 {
 334   return UNI_SUR_LOW_END;
 335 }
 336
 337 ACE_UINT32
 338 ACE_UTF16_Encoding_Converter::get_UNI_REPLACEMENT_CHAR ()
 339 {
 340   return UNI_REPLACEMENT_CHAR;
 341 }
 342
 343 const ACE_Byte*
 344 ACE_UTF16_Encoding_Converter::get_first_byte_mark ()
 345 {
 346   return firstByteMark;
 347 }
 348
 349 const ACE_Byte*
 350 ACE_UTF16_Encoding_Converter::get_trailing_bytes_for_utf8 ()
 351 {
 352   return trailingBytesForUTF8;
 353 }
 354
 355 const ACE_UINT32*
 356 ACE_UTF16_Encoding_Converter::get_offsets_from_utf8 ()
 357 {
 358   return offsetsFromUTF8;
 359 }
 360
 361 ACE_END_VERSIONED_NAMESPACE_DECL
 362 #endif /* ACE_USES_WCHAR */