TAO/tests/CodeSets/libs/UCS4_UTF16/WUCS4_UTF16.cpp

   1 // -*- C++ -*-
   2
   3 //=============================================================================
   4 /**
   5  *  @file    WUCS4_UTF16.cpp
   6  *
   7  *  Defines the arrays required to convert between UCS-4, a 4-byte wide-char
   8  *  codeset, and UTF-16, the Unicode encoding built from 2-byte code units.
   9  *
  10  *  @author Phil Mesnier <mesnier_p@ociweb.com>
  11  */
  12 //=============================================================================
  13
  14
  15 #include "WUCS4_UTF16.h"
  16 #include "ace/OS_Memory.h"
  17
  18 // ****************************************************************
  19
  20
  21 // @@ TODO: Find a better home for these definition
  22 // Note: unlike the UNICODE standard we define these as
  23 // half-closed ranges i.e.
  24 //    *BEGIN is the first value in the range
  25 //    *END is the first value beyond the range (END is not included
  26 //         in the range)
  27 // Note the use of unsigned short for UTF-16 codepoints.  wchar_t may
  28 // by four bytes
  29 typedef ACE_CDR::UShort ACE_UTF16_T;
  30 static const size_t ACE_UTF16_CODEPOINT_SIZE = sizeof(ACE_UTF16_T);
  31
  32 // surrogate high           1101.10HH.HHHH.HHHH
  33 // surrogate low            1101.11LL.LLLL.LLLL
  34 // 4 byte result:           0000.0000.0000.HHHH.HHHH.HHLL.LLLL.LLLL
  35 // add offset               0000.0000.0000.0000.0001.0000.0000.0000
  36
  37 // range of surrogate values for high-order bits
  38 static const unsigned short ACE_UTF16_SURROGATE_HIGH_BEGIN = 0xD800U;
  39 static const unsigned short ACE_UTF16_SURROGATE_HIGH_END = 0xDC00U;
  40
  41 static const unsigned short ACE_UTF16_SURROGATE_LOW_BEGIN = 0xDC00U;
  42 static const unsigned short ACE_UTF16_SURROGATE_LOW_END = 0xE000U;
  43
  44 // offset to UTF16 values encoded with surrogates start at 2^16
  45 static const unsigned long ACE_UTF16_SURROGATE_OFFSET = 0x000010000UL;
  46
  47 // shift high order bits from surrogate into correct postion
  48 static const int ACE_UTF16_SURROGATE_HIGH_SHIFT = 10;
  49 static const unsigned short ACE_UTF16_SURROGATE_LOW_MASK = 0x3FF;
  50
  51 // largest value that can be represented in UTF16 without using surrogates + 1
  52 static const unsigned long ACE_UTF16_RAW_END = 0x00010000LU;
  53
  54 // largest value that can be represented in UTF16 + 1
  55 static const unsigned long ACE_UTF16_END = 0x00110000LU;
  56
  57 static const unsigned short ACE_UNICODE_SUBSTITUTE_CHARACTER = 0xFFFDU;
  58 static const unsigned short ACE_UNICODE_BOM_CORRECT = 0xFEFFU;
  59 static const unsigned short ACE_UNICODE_BOM_SWAPPED = 0xFFFEU;
  60
  61 /////////////////////////////////////////////////////
  62 // Static inline routines to simplify conversion code
  63 // @@ should be in anonymous namespace when ACE allows it
  64 //    or better yet, there should be a UTF-16 support thingie(technical term)
  65 //    that provides these methods.
  66 // Performance: depends on compiler inlining + optimization for performance
  67
  68 /// load next two bytes from buffer into a short. Byte swapping as necessary
  69 static
  70 //ACE_INLINE
  71 ACE_UTF16_T
  72 load_raw_wchar (const char * buffer, size_t & pos, int do_byte_swap)
  73 {
  74   // need a two byte object to load the UTF16 2 byte codepoint
  75   ACE_UTF16_T utf16_char = * reinterpret_cast<ACE_UTF16_T const *> (&buffer[pos*ACE_UTF16_CODEPOINT_SIZE]);
  76 #if ! defined (ACE_DISABLE_SWAP_ON_READ)
  77   if (do_byte_swap)
  78     {
  79       ACE_CDR::swap_2 (
  80         &buffer[pos*ACE_UTF16_CODEPOINT_SIZE],
  81         reinterpret_cast<char *> (&utf16_char));
  82     }
  83 #endif
  84   pos ++;
  85   return utf16_char;
  86 }
  87
  88 /// convert UTF-16 surrogate pair to wchar_t
  89 static
  90 //ACE_INLINE
  91 ACE_CDR::WChar
  92 convert_surrogate_pair (ACE_UTF16_T high, ACE_UTF16_T low)
  93 {
  94   return static_cast<ACE_CDR::WChar> (((high - ACE_UTF16_SURROGATE_HIGH_BEGIN) << ACE_UTF16_SURROGATE_HIGH_SHIFT)
  95     + (low - ACE_UTF16_SURROGATE_LOW_BEGIN)
  96     + ACE_UTF16_SURROGATE_OFFSET);
  97 }
  98
  99 /// load wchar from utf16 buffer
 100 /// converts surrogate pairs
 101 /// substitutes SUBSTITUTE_CHAR for bad encoding
 102 static
 103 //ACE_INLINE
 104 ACE_CDR::WChar
 105 load_wchar (const char * buffer, size_t & pos, size_t length, int do_byte_swap)
 106 {
 107   ACE_CDR::WChar rc = ACE_UNICODE_SUBSTITUTE_CHARACTER;
 108   if (pos < length)
 109     {
 110       rc = static_cast<ACE_CDR::WChar> (load_raw_wchar (buffer, pos, do_byte_swap));
 111       // Is this a UTF16 surrogate?
 112       // note assumpton that SURROGATE_HIGH_END == SURROGATE_LOW_BEGIN
 113       if (rc >= ACE_UTF16_SURROGATE_HIGH_BEGIN && rc < ACE_UTF16_SURROGATE_LOW_END)
 114         {
 115           // if we still have two bytes available
 116           if (pos < length)
 117             {
 118               // expecting high surrogate
 119               if (rc < ACE_UTF16_SURROGATE_HIGH_END)
 120               {
 121                 ACE_UTF16_T low = load_raw_wchar (buffer, pos, do_byte_swap);
 122                 if (low >= ACE_UTF16_SURROGATE_LOW_BEGIN
 123                   && low < ACE_UTF16_SURROGATE_LOW_END)
 124                   {
 125                     rc = convert_surrogate_pair (
 126                       static_cast<ACE_UTF16_T> (rc), low);
 127                   }
 128                 else
 129                   {
 130                     rc = ACE_UNICODE_SUBSTITUTE_CHARACTER;
 131                   }
 132               }
 133               else
 134               {
 135                 rc = ACE_UNICODE_SUBSTITUTE_CHARACTER;
 136               }
 137             }
 138           else
 139             {
 140               rc = ACE_UNICODE_SUBSTITUTE_CHARACTER;
 141             }
 142         }
 143     }
 144   return rc;
 145 }
 146
 147 static
 148 //ACE_INLINE
 149 size_t encode_utf16 (ACE_UTF16_T * buffer, ACE_CDR::WChar value)
 150 {
 151   buffer[0] = static_cast<ACE_UTF16_T> (value);
 152   size_t length = 1;
 153
 154   // On platforms where sizeof(ACE_CDR::WChar) == 2, the test using
 155   // ul_value will always be false, since we are improperly using
 156   // a 4-byte native wchar codeset. But since this is for a simple
 157   // test that has to run on machines with 4 byte wchars, this cast
 158   // avoids compile time issues of comparing a value that starts out
 159   // as a short with a constant that is too big for a short.
 160   unsigned long ul_value = static_cast<unsigned long>(value);
 161   if (value >= ACE_UTF16_SURROGATE_HIGH_BEGIN)
 162     {
 163       if (value < ACE_UTF16_SURROGATE_LOW_END)
 164         {
 165           buffer[0] = ACE_UNICODE_SUBSTITUTE_CHARACTER;
 166         }
 167       else if (ul_value >= ACE_UTF16_RAW_END)
 168         {
 169           if (ul_value >= ACE_UTF16_END)
 170             {
 171               buffer[0] = ACE_UNICODE_SUBSTITUTE_CHARACTER;
 172             }
 173           else
 174             {
 175               ACE_CDR::WChar offset = static_cast<ACE_CDR::WChar> (value - ACE_UTF16_SURROGATE_OFFSET);
 176               buffer[0] = (offset >> ACE_UTF16_SURROGATE_HIGH_SHIFT)
 177                 + ACE_UTF16_SURROGATE_HIGH_BEGIN;
 178               buffer[1] = (offset & ACE_UTF16_SURROGATE_LOW_MASK)
 179                 + ACE_UTF16_SURROGATE_LOW_BEGIN;
 180               length = 2;
 181             }
 182         }
 183     }
 184   return length;
 185 }
 186
 187 /// count number of characters in native WString that will be converted
 188 /// to UTF-16 surrogate pairs
 189 static
 190 size_t count_potential_surrogates (
 191     const ACE_CDR::WChar *buffer,
 192     ACE_CDR::ULong len)
 193 {
 194   size_t count = 0;
 195   for (size_t i = 0; i < len; ++i)
 196     {
 197       // see comments above in encode_utf16().
 198       unsigned long ul_value = static_cast<unsigned long>(buffer[i]);
 199       if (ul_value >= ACE_UTF16_RAW_END &&
 200           ul_value < ACE_UTF16_END)
 201         {
 202           count += 1;
 203         }
 204     }
 205   return count;
 206 }
 207
 208
 209 /////////////////////////////
 210 // WUCS4_UTF16 implementation
 211
 212 WUCS4_UTF16::WUCS4_UTF16 ()
 213 {
 214 }
 215
 216 WUCS4_UTF16::~WUCS4_UTF16 ()
 217 {
 218 }
 219
 220 // = Documented in $ACE_ROOT/ace/CDR_Stream.h
 221 ACE_CDR::Boolean
 222 WUCS4_UTF16::read_wchar (ACE_InputCDR &cdr, ACE_CDR::WChar &x)
 223 {
 224   if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1
 225       && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1)
 226     {
 227       ACE_CDR::Octet len;
 228       if (! this->read_1 (cdr, &len))
 229         {
 230           return 0;
 231         }
 232
 233       int old_bo = cdr.byte_order();
 234
 235       ACE_UTF16_T sx = 0;
 236       if (! this->read_2 (cdr,&sx))
 237         {
 238           return 0;
 239         }
 240
 241       // Check for byte order mark, if found, consume and honor it.
 242       if (sx == ACE_UNICODE_BOM_CORRECT || sx == ACE_UNICODE_BOM_SWAPPED)
 243         {
 244           // if we found it, but it came in in the wrong order
 245           // invert the byte order flag for the duration of this method
 246           if (sx == ACE_UNICODE_BOM_SWAPPED)
 247             {
 248               cdr.reset_byte_order (! old_bo);
 249             }
 250           this->read_2 (cdr,&sx);
 251         }
 252
 253       // check for UTF-16 surrogate pair, and if found interpret it
 254       if (sx >= ACE_UTF16_SURROGATE_HIGH_BEGIN
 255         && sx < ACE_UTF16_SURROGATE_LOW_END)
 256         {
 257           if (sx >= ACE_UTF16_SURROGATE_HIGH_END)
 258             {
 259               cdr.reset_byte_order (old_bo);
 260               return 0;
 261             }
 262
 263             ACE_UTF16_T low;
 264             if (! this->read_2 (cdr, &low))
 265               {
 266                 cdr.reset_byte_order (old_bo);
 267                 return 0;
 268               }
 269             if (low < ACE_UTF16_SURROGATE_LOW_BEGIN
 270               || low >= ACE_UTF16_SURROGATE_LOW_END)
 271               {
 272                 cdr.reset_byte_order (old_bo);
 273                 return 0;
 274               }
 275             x = convert_surrogate_pair (sx, low);
 276         }
 277       else
 278         {
 279           x = static_cast<ACE_CDR::WChar> (sx);
 280         }
 281
 282         cdr.reset_byte_order (old_bo);
 283     }
 284   else
 285     {
 286       ACE_UTF16_T sx = 0;
 287       if (!this->read_2 (cdr, &sx))
 288         {
 289           return 0;
 290         }
 291       x = static_cast<ACE_CDR::WChar> (sx);
 292     }
 293   return 1;
 294 }
 295
 296 ACE_CDR::Boolean
 297 WUCS4_UTF16::read_wstring (ACE_InputCDR &cdr,
 298                            ACE_CDR::WChar *&x)
 299 {
 300   ACE_CDR::ULong len;
 301   if (!this->read_4 (cdr, &len))
 302     return 0;
 303
 304   // A check for the length being too great is done later in the
 305   // call to read_char_array but we want to have it done before
 306   // the memory is allocated.
 307   if (len > 0 && len <= cdr.length())
 308     {
 309       if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1
 310           && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1)
 311         {
 312           len /= ACE_UTF16_CODEPOINT_SIZE;
 313
 314           //allocating one extra for the null character needed by applications
 315           ACE_NEW_RETURN (x,
 316                           ACE_CDR::WChar [len + 1],
 317                           0);
 318           x[len] = L'\x00';
 319           if (this->read_wchar_array_i (cdr, x, len,1))
 320             {
 321               // Since reading the array may have adjusted the length,
 322               // we simply rewrite the null terminator
 323               x[len] = L'\x00';
 324               return 1;
 325             }
 326         }
 327       else
 328         {
 329           ACE_NEW_RETURN (x,
 330                           ACE_CDR::WChar [len],
 331                           0);
 332           if (this->read_wchar_array (cdr, x, len))
 333             return 1;
 334         }
 335       delete [] x;
 336     }
 337   else if (len == 0)
 338     {
 339       // Convert any null strings to empty strings since empty
 340       // strings can cause crashes. (See bug 58.)
 341       ACE_NEW_RETURN (x,
 342                       ACE_CDR::WChar[1],
 343                       0);
 344       x[0] = '\x00';
 345       return 1;
 346     }
 347   x = 0;
 348   return 0;
 349 }
 350
 351 ACE_CDR::Boolean
 352 WUCS4_UTF16::read_wchar_array_i (ACE_InputCDR & cdr,
 353                                  ACE_CDR::WChar *x,
 354                                  ACE_CDR::ULong &length,
 355                                  int adjust_len)
 356 {
 357   if (length == 0)
 358     return 1;
 359   char* buf;
 360   size_t align = ACE_CDR::SHORT_ALIGN;
 361   if (cdr.adjust (ACE_UTF16_CODEPOINT_SIZE * length, align, buf) == 0)
 362     {
 363       int byte_swap = cdr.do_byte_swap();
 364       size_t pos = 0;
 365
 366       // check for byte order mark.  If found, honor it then discard it
 367       ACE_UTF16_T bom = load_raw_wchar (buf, pos, byte_swap);
 368       if (bom == ACE_UNICODE_BOM_CORRECT || bom == ACE_UNICODE_BOM_SWAPPED)
 369         {
 370           if (bom == ACE_UNICODE_BOM_SWAPPED)
 371             {
 372               byte_swap = !byte_swap;
 373             }
 374           buf += ACE_UTF16_CODEPOINT_SIZE;
 375           if (adjust_len)
 376             length -= 1;
 377         }
 378       size_t bpos = 0;
 379       for (size_t xpos = 0; xpos < length; ++xpos)
 380         {
 381           x[xpos] = load_wchar (buf, bpos, length, byte_swap);
 382         }
 383
 384       return 1;
 385     }
 386   return 0;
 387 }
 388
 389
 390 ACE_CDR::Boolean
 391 WUCS4_UTF16::read_wchar_array (ACE_InputCDR & cdr,
 392                                ACE_CDR::WChar *x,
 393                                ACE_CDR::ULong length)
 394 {
 395   if (length == 0)
 396     return 1;
 397
 398   if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1
 399       && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1)
 400     {
 401       for (size_t i = 0; i < length; i++)
 402         if (!this->read_wchar(cdr,x[i]))
 403           return 0;
 404       return 1;
 405     }
 406   else
 407     return this->read_wchar_array_i(cdr,x,length);
 408 }
 409
 410 ACE_CDR::Boolean
 411 WUCS4_UTF16::write_wchar (ACE_OutputCDR &cdr,
 412                           ACE_CDR::WChar x)
 413 {
 414   int encode_len = 1;
 415   if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 0)
 416     { // wchar is not allowed with GIOP 1.0
 417       errno = EINVAL;
 418       return 0;
 419     }
 420   else if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 1)
 421     encode_len = 0;
 422
 423   return write_wchar_i(cdr,x,1,encode_len);
 424 }
 425
 426 ACE_CDR::Boolean
 427 WUCS4_UTF16::write_wchar_i (ACE_OutputCDR &cdr,
 428                             ACE_CDR::WChar x,
 429                             int use_BOM,
 430                             int encode_len)
 431 {
 432   // If the desired char cannot be translated into a single unicode char,
 433   // we must raise a marshal exception.
 434   //
 435   // see the comment in encode_utf16() regarding the cast.
 436   unsigned long ul_x = static_cast<unsigned long>(x);
 437   if (ul_x >= ACE_UTF16_RAW_END &&
 438       ul_x < ACE_UTF16_END)
 439     return 0;
 440
 441   int len = 0;
 442   ACE_CDR::UShort buffer[2];
 443   if (use_BOM)
 444     {
 445       len = 2;
 446       buffer[0] = ACE_UNICODE_BOM_CORRECT;
 447       buffer[1] = static_cast<ACE_CDR::Short> (x);
 448     }
 449   else
 450     {
 451       len = 1;
 452       if (cdr.byte_order())
 453         ACE_CDR::swap_2 (reinterpret_cast<const char *> (&x),
 454                          reinterpret_cast<char *> (buffer));
 455       else
 456         buffer[0] = static_cast<ACE_CDR::Short> (x);
 457     }
 458
 459   if (encode_len)
 460     {
 461       unsigned char tcsize = static_cast<unsigned char> (len * ACE_UTF16_CODEPOINT_SIZE);
 462       if (this->write_1 (cdr, &tcsize))
 463         return this->write_array(cdr, &buffer, tcsize, 1, 1);
 464       else
 465         return 0;
 466     }
 467   if (this->write_2 (cdr, buffer) == 0)
 468     return 0;
 469   if (len == 2)
 470     return this->write_2 (cdr,buffer+1);
 471   return 1;
 472 }
 473
 474 ACE_CDR::Boolean
 475 WUCS4_UTF16::write_wstring (ACE_OutputCDR & cdr,
 476                 ACE_CDR::ULong len,
 477                 const ACE_CDR::WChar *x)
 478 {
 479   if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1
 480       && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1)
 481     {
 482       // count characters that will require surrogates to
 483       // determine transmission length
 484       len++; // make room for BOM
 485       ACE_UTF16_T bom = ACE_UNICODE_BOM_CORRECT;
 486       ACE_CDR::ULong length = len + count_potential_surrogates (x, len);
 487       ACE_CDR::ULong l = length * ACE_UTF16_CODEPOINT_SIZE;
 488
 489       if (this->write_4 (cdr, &l) && x != 0)
 490         {
 491           this->write_2 (cdr, &bom);
 492           return this->write_measured_wchar_array (cdr, x, len, length);
 493         }
 494     }
 495   else
 496     {
 497       ACE_CDR::ULong l = len + 1;
 498
 499       if (this->write_4 (cdr, &l))
 500         {
 501           if (x != 0)
 502             {
 503               return this->write_wchar_array (cdr, x, len + 1);
 504             }
 505           else
 506             {
 507               ACE_UTF16_T s = 0;
 508               return this->write_2 (cdr, &s);
 509             }
 510         }
 511     }
 512
 513   return 0;
 514 }
 515
 516 ACE_CDR::Boolean
 517 WUCS4_UTF16::write_wchar_array (ACE_OutputCDR & cdr,
 518                 const ACE_CDR::WChar *x,
 519                 ACE_CDR::ULong length)
 520 {
 521 #if 0
 522   // I do not believe this is correct, because this could yield an array
 523   // with an incorrect number of elements for the space allotted.
 524   return this->write_measured_wchar_array (
 525     cdr,
 526     x,
 527     length,
 528     length + count_potential_surrogates (x, length));
 529 #endif
 530
 531   int encode_len = 1;
 532   if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 0)
 533     { // wchar is not allowed with GIOP 1.0
 534       errno = EINVAL;
 535       return 0;
 536     }
 537   else if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 1)
 538     encode_len = 0;
 539
 540   for (size_t i = 0; i < length; i++)
 541     if (this->write_wchar_i (cdr,x[i],0,encode_len) == 0)
 542       return 0;
 543   return 1;
 544 }
 545
 546 ACE_CDR::Boolean
 547 WUCS4_UTF16::write_measured_wchar_array (ACE_OutputCDR & cdr,
 548                                          const ACE_CDR::WChar *x,
 549                                          ACE_CDR::ULong length,
 550                                          ACE_CDR::ULong transmission_length)
 551 {
 552   if (length == 0)
 553     return 1;
 554   char* buf;
 555   size_t align = ACE_CDR::SHORT_ALIGN;
 556   if (cdr.adjust (ACE_UTF16_CODEPOINT_SIZE * transmission_length, align, buf)
 557       != 0)
 558     {
 559       return 0;
 560     }
 561
 562   ACE_UTF16_T *sb = reinterpret_cast<ACE_UTF16_T *> (buf);
 563   size_t sbpos = 0;
 564
 565   for (size_t i = 0; i < length; i++)
 566     {
 567       sbpos += encode_utf16 (& sb[sbpos], x[i]);
 568     }
 569 #if defined (ACE_ENABLE_SWAP_ON_WRITE)
 570   // @note this will rarely be enabled.
 571   if (cdr.do_byte_swap())
 572     {
 573       // note can't use swap_2_array because in-place swaps are not safe :-<
 574       // and we don't want to allocate a new array
 575       for (size_t i = 0; i < sbpos; i++)
 576         {
 577           char * pchar = reinterpret_cast<char *> (&sb[i]);
 578           // ACE_CDR::swap_2 (pchar, pchar);
 579           // can't use swap_2 because inplace swaps are not safe
 580           // and work-arounds like copying to another buffer lose
 581           // any performance improvement from
 582           // that fancy asm code, so we might as well just:
 583           char temp = pchar[0];
 584           pchar[0] = pchar[1];
 585           pchar[1] = temp;
 586           //@@TODO write swap_2(char * inplace_buffer);
 587         }
 588     }
 589 #endif /* ACE_ENABLE_SWAP_ON_WRITE */
 590   return 1;
 591 }