dep/ACE_wrappers/ace/UTF32_Encoding_Converter.cpp

   1 // $Id: UTF32_Encoding_Converter.cpp 80826 2008-03-04 14:51:23Z wotte $
   2
   3 // ======================================================================
   4 //
   5 // The actual conversion methods are covered by the copyright information
   6 // below.  It is not the actual code provided by Unicode, Inc. but is an
   7 // ACE-ified and only slightly modified version.
   8 //
   9 // Chad Elliott 4/28/2005
  10 //
  11 // Copyright 2001-2004 Unicode, Inc.
  12 //
  13 // Limitations on Rights to Redistribute This Code
  14 //
  15 // Unicode, Inc. hereby grants the right to freely use the information
  16 // supplied in this file in the creation of products supporting the
  17 // Unicode Standard, and to make copies of this file in any form
  18 // for internal or external distribution as long as this notice
  19 // remains attached.
  20 //
  21 // ======================================================================
  22
  23 #include "ace/UTF32_Encoding_Converter.h"
  24
  25 #if defined (ACE_USES_WCHAR)
  26 #include "ace/OS_NS_stdio.h"
  27 #include "ace/OS_Memory.h"
  28 #include "ace/Min_Max.h"
  29
  30 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
  31
  32 static const ACE_UINT32 UNI_MAX_LEGAL_UTF32 = 0x0010FFFF;
  33
  34 ACE_UTF32_Encoding_Converter::ACE_UTF32_Encoding_Converter (bool swap)
  35  : ACE_UTF16_Encoding_Converter (swap)
  36 {
  37 }
  38
  39 ACE_UTF32_Encoding_Converter::~ACE_UTF32_Encoding_Converter (void)
  40 {
  41 }
  42
  43 ACE_UTF32_Encoding_Converter::Result
  44 ACE_UTF32_Encoding_Converter::to_utf8 (const void* source,
  45                                        size_t source_size,
  46                                        ACE_Byte* target,
  47                                        size_t target_size,
  48                                        bool strict)
  49 {
  50   static const ACE_UINT32 byteMask = 0xBF;
  51   static const ACE_UINT32 byteMark = 0x80;
  52   static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START ();
  53   static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END ();
  54   static const ACE_Byte* firstByteMark = get_first_byte_mark ();
  55
  56   Result result = CONVERSION_OK;
  57   ACE_Byte* targetEnd = target + target_size;
  58   const ACE_UINT32* sourceStart = static_cast<const ACE_UINT32*> (source);
  59   const ACE_UINT32* sourceEnd = sourceStart + (source_size / sizeof (ACE_UINT32));
  60
  61   while (sourceStart < sourceEnd)
  62     {
  63       ACE_UINT32 nw = *sourceStart++;
  64       ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_LONG (nw) : nw);
  65       unsigned short bytesToWrite = 0;
  66
  67       if (strict)
  68         {
  69           // UTF-16 surrogate values are illegal in UTF-32
  70           if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  71             {
  72               result = SOURCE_ILLEGAL;
  73               break;
  74             }
  75         }
  76
  77       // Figure out how many bytes the result will require. Turn any
  78       // illegally large ACE_UINT32 things (> Plane 17) into replacement
  79       // chars.
  80       if (ch < 0x80)
  81         {
  82           bytesToWrite = 1;
  83         }
  84       else if (ch < 0x800)
  85         {
  86           bytesToWrite = 2;
  87         }
  88       else if (ch < 0x10000)
  89         {
  90           bytesToWrite = 3;
  91         }
  92       else if (ch <= UNI_MAX_LEGAL_UTF32)
  93         {
  94           bytesToWrite = 4;
  95         }
  96       else
  97         {
  98           result = SOURCE_ILLEGAL;
  99           break;
 100         }
 101
 102       target += bytesToWrite;
 103       if (target > targetEnd)
 104         {
 105           result = TARGET_EXHAUSTED;
 106           break;
 107         }
 108
 109       // NOTE: everything falls through.
 110       switch (bytesToWrite)
 111         {
 112         case 4:
 113           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
 114           ch >>= 6;
 115         case 3:
 116           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
 117           ch >>= 6;
 118         case 2:
 119           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
 120           ch >>= 6;
 121         case 1:
 122           *--target = (ACE_Byte) (ch | firstByteMark[bytesToWrite]);
 123       }
 124       target += bytesToWrite;
 125     }
 126
 127   return result;
 128 }
 129
 130 ACE_UTF32_Encoding_Converter::Result
 131 ACE_UTF32_Encoding_Converter::from_utf8 (const ACE_Byte* source,
 132                                          size_t source_size,
 133                                          void* target,
 134                                          size_t target_size,
 135                                          bool strict)
 136 {
 137   static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START ();
 138   static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END ();
 139   static const ACE_UINT32 UNI_REPLACEMENT_CHAR = get_UNI_REPLACEMENT_CHAR ();
 140   static const ACE_Byte* trailingBytesForUTF8 = get_trailing_bytes_for_utf8 ();
 141   static const ACE_UINT32* offsetsFromUTF8 = get_offsets_from_utf8 ();
 142
 143   Result result = CONVERSION_OK;
 144   const ACE_Byte* sourceEnd = source + source_size;
 145   ACE_UINT32* targetStart = static_cast<ACE_UINT32*> (target);
 146   ACE_UINT32* targetEnd   = targetStart + target_size;
 147
 148   while (source < sourceEnd)
 149     {
 150       ACE_UINT32 ch = 0;
 151       unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 152       if (source + extraBytesToRead >= sourceEnd)
 153         {
 154           result = SOURCE_EXHAUSTED;
 155           break;
 156         }
 157
 158       // Do this check whether lenient or strict
 159       if (!this->is_legal_utf8 (source, extraBytesToRead + 1))
 160         {
 161           result = SOURCE_ILLEGAL;
 162           break;
 163         }
 164
 165       // The cases all fall through. See "Note A" below.
 166       switch (extraBytesToRead)
 167         {
 168         case 5:
 169           ch += *source++;
 170           ch <<= 6;
 171         case 4:
 172           ch += *source++;
 173           ch <<= 6;
 174         case 3:
 175           ch += *source++;
 176           ch <<= 6;
 177         case 2:
 178           ch += *source++;
 179           ch <<= 6;
 180         case 1:
 181           ch += *source++;
 182           ch <<= 6;
 183         case 0:
 184           ch += *source++;
 185       }
 186       ch -= offsetsFromUTF8[extraBytesToRead];
 187
 188       if (targetStart >= targetEnd)
 189         {
 190           result = TARGET_EXHAUSTED;
 191           break;
 192         }
 193
 194       if (ch <= UNI_MAX_LEGAL_UTF32)
 195         {
 196           // UTF-16 surrogate values are illegal in UTF-32, and anything
 197           // over Plane 17 (> 0x10FFFF) is illegal.
 198           if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 199             {
 200               if (strict)
 201                 {
 202                   result = SOURCE_ILLEGAL;
 203                   break;
 204                 }
 205               else
 206                 {
 207                   *targetStart++ = UNI_REPLACEMENT_CHAR;
 208                 }
 209             }
 210           else
 211             {
 212               *targetStart++ = ch;
 213             }
 214         }
 215       else
 216         {
 217           result = SOURCE_ILLEGAL;
 218           break;
 219         }
 220     }
 221
 222   return result;
 223 }
 224
 225 ACE_UTF32_Encoding_Converter*
 226 ACE_UTF32_Encoding_Converter::encoded (const ACE_Byte* source,
 227                                        size_t source_size)
 228 {
 229   static const size_t begin = 16;
 230   static const size_t converted = begin * 4;
 231
 232   ACE_Byte target[converted];
 233   ACE_UTF32_Encoding_Converter* converter = 0;
 234   ACE_NEW_RETURN (converter,
 235                   ACE_UTF32_Encoding_Converter (false),
 236                   0);
 237
 238   if (converter->to_utf8 (source,
 239                           ACE_MIN (begin, source_size),
 240                           target,
 241                           converted) == CONVERSION_OK)
 242     {
 243       return converter;
 244     }
 245   else
 246     {
 247       delete converter;
 248     }
 249
 250   return 0;
 251 }
 252
 253 ACE_END_VERSIONED_NAMESPACE_DECL
 254 #endif /* ACE_USES_WCHAR */