ACE/ace/UTF32_Encoding_Converter.cpp

   1 // ======================================================================
   2 //
   3 // The actual conversion methods are covered by the copyright information
   4 // below.  It is not the actual code provided by Unicode, Inc. but is an
   5 // ACE-ified and only slightly modified version.
   6 //
   7 // Chad Elliott 4/28/2005
   8 //
   9 // Copyright 2001-2004 Unicode, Inc.
  10 //
  11 // Limitations on Rights to Redistribute This Code
  12 //
  13 // Unicode, Inc. hereby grants the right to freely use the information
  14 // supplied in this file in the creation of products supporting the
  15 // Unicode Standard, and to make copies of this file in any form
  16 // for internal or external distribution as long as this notice
  17 // remains attached.
  18 //
  19 // ======================================================================
  20
  21 #include "ace/UTF32_Encoding_Converter.h"
  22
  23 #if defined (ACE_USES_WCHAR)
  24 #include "ace/OS_NS_stdio.h"
  25 #include "ace/OS_Memory.h"
  26 #include "ace/Min_Max.h"
  27
  28 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
  29
// Largest value accepted as a legal UTF-32 code point.  Both to_utf8()
// and from_utf8() in this file report SOURCE_ILLEGAL for anything above it.
static const ACE_UINT32 UNI_MAX_LEGAL_UTF32 = 0x0010FFFF;
  31
  32 ACE_UTF32_Encoding_Converter::ACE_UTF32_Encoding_Converter (bool swap)
  33  : ACE_UTF16_Encoding_Converter (swap)
  34 {
  35 }
  36
  37 ACE_UTF32_Encoding_Converter::~ACE_UTF32_Encoding_Converter ()
  38 {
  39 }
  40
  41 ACE_UTF32_Encoding_Converter::Result
  42 ACE_UTF32_Encoding_Converter::to_utf8 (const void* source,
  43                                        size_t source_size,
  44                                        ACE_Byte* target,
  45                                        size_t target_size,
  46                                        bool strict)
  47 {
  48   static const ACE_UINT32 byteMask = 0xBF;
  49   static const ACE_UINT32 byteMark = 0x80;
  50   static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START ();
  51   static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END ();
  52   static const ACE_Byte* firstByteMark = get_first_byte_mark ();
  53
  54   Result result = CONVERSION_OK;
  55   ACE_Byte* targetEnd = target + target_size;
  56   const ACE_UINT32* sourceStart = static_cast<const ACE_UINT32*> (source);
  57   const ACE_UINT32* sourceEnd = sourceStart + (source_size / sizeof (ACE_UINT32));
  58
  59   while (sourceStart < sourceEnd)
  60     {
  61       ACE_UINT32 nw = *sourceStart++;
  62       ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_LONG (nw) : nw);
  63       unsigned short bytesToWrite = 0;
  64
  65       if (strict)
  66         {
  67           // UTF-16 surrogate values are illegal in UTF-32
  68           if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  69             {
  70               result = SOURCE_ILLEGAL;
  71               break;
  72             }
  73         }
  74
  75       // Figure out how many bytes the result will require. Turn any
  76       // illegally large ACE_UINT32 things (> Plane 17) into replacement
  77       // chars.
  78       if (ch < 0x80)
  79         {
  80           bytesToWrite = 1;
  81         }
  82       else if (ch < 0x800)
  83         {
  84           bytesToWrite = 2;
  85         }
  86       else if (ch < 0x10000)
  87         {
  88           bytesToWrite = 3;
  89         }
  90       else if (ch <= UNI_MAX_LEGAL_UTF32)
  91         {
  92           bytesToWrite = 4;
  93         }
  94       else
  95         {
  96           result = SOURCE_ILLEGAL;
  97           break;
  98         }
  99
 100       target += bytesToWrite;
 101       if (target > targetEnd)
 102         {
 103           result = TARGET_EXHAUSTED;
 104           break;
 105         }
 106
 107       // NOTE: everything falls through.
 108       switch (bytesToWrite)
 109         {
 110         case 4:
 111           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
 112           ch >>= 6;
 113         case 3:
 114           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
 115           ch >>= 6;
 116         case 2:
 117           *--target = (ACE_Byte)((ch | byteMark) & byteMask);
 118           ch >>= 6;
 119         case 1:
 120           *--target = (ACE_Byte) (ch | firstByteMark[bytesToWrite]);
 121       }
 122       target += bytesToWrite;
 123     }
 124
 125   return result;
 126 }
 127
 128 ACE_UTF32_Encoding_Converter::Result
 129 ACE_UTF32_Encoding_Converter::from_utf8 (const ACE_Byte* source,
 130                                          size_t source_size,
 131                                          void* target,
 132                                          size_t target_size,
 133                                          bool strict)
 134 {
 135   static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START ();
 136   static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END ();
 137   static const ACE_UINT32 UNI_REPLACEMENT_CHAR = get_UNI_REPLACEMENT_CHAR ();
 138   static const ACE_Byte* trailingBytesForUTF8 = get_trailing_bytes_for_utf8 ();
 139   static const ACE_UINT32* offsetsFromUTF8 = get_offsets_from_utf8 ();
 140
 141   Result result = CONVERSION_OK;
 142   const ACE_Byte* sourceEnd = source + source_size;
 143   ACE_UINT32* targetStart = static_cast<ACE_UINT32*> (target);
 144   ACE_UINT32* targetEnd   = targetStart + target_size;
 145
 146   while (source < sourceEnd)
 147     {
 148       ACE_UINT32 ch = 0;
 149       unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 150       if (source + extraBytesToRead >= sourceEnd)
 151         {
 152           result = SOURCE_EXHAUSTED;
 153           break;
 154         }
 155
 156       // Do this check whether lenient or strict
 157       if (!this->is_legal_utf8 (source, extraBytesToRead + 1))
 158         {
 159           result = SOURCE_ILLEGAL;
 160           break;
 161         }
 162
 163       // The cases all fall through. See "Note A" below.
 164       switch (extraBytesToRead)
 165         {
 166         case 5:
 167           ch += *source++;
 168           ch <<= 6;
 169         case 4:
 170           ch += *source++;
 171           ch <<= 6;
 172         case 3:
 173           ch += *source++;
 174           ch <<= 6;
 175         case 2:
 176           ch += *source++;
 177           ch <<= 6;
 178         case 1:
 179           ch += *source++;
 180           ch <<= 6;
 181         case 0:
 182           ch += *source++;
 183       }
 184       ch -= offsetsFromUTF8[extraBytesToRead];
 185
 186       if (targetStart >= targetEnd)
 187         {
 188           result = TARGET_EXHAUSTED;
 189           break;
 190         }
 191
 192       if (ch <= UNI_MAX_LEGAL_UTF32)
 193         {
 194           // UTF-16 surrogate values are illegal in UTF-32, and anything
 195           // over Plane 17 (> 0x10FFFF) is illegal.
 196           if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 197             {
 198               if (strict)
 199                 {
 200                   result = SOURCE_ILLEGAL;
 201                   break;
 202                 }
 203               else
 204                 {
 205                   *targetStart++ = UNI_REPLACEMENT_CHAR;
 206                 }
 207             }
 208           else
 209             {
 210               *targetStart++ = ch;
 211             }
 212         }
 213       else
 214         {
 215           result = SOURCE_ILLEGAL;
 216           break;
 217         }
 218     }
 219
 220   return result;
 221 }
 222
 223 ACE_UTF32_Encoding_Converter*
 224 ACE_UTF32_Encoding_Converter::encoded (const ACE_Byte* source,
 225                                        size_t source_size)
 226 {
 227   static const size_t begin = 16;
 228   static const size_t converted = begin * 4;
 229
 230   ACE_Byte target[converted];
 231   ACE_UTF32_Encoding_Converter* converter = 0;
 232   ACE_NEW_RETURN (converter,
 233                   ACE_UTF32_Encoding_Converter (false),
 234                   0);
 235
 236   if (converter->to_utf8 (source,
 237                           ACE_MIN (begin, source_size),
 238                           target,
 239                           converted) == CONVERSION_OK)
 240     {
 241       return converter;
 242     }
 243   else
 244     {
 245       delete converter;
 246     }
 247
 248   return 0;
 249 }
 250
 251 ACE_END_VERSIONED_NAMESPACE_DECL
 252 #endif /* ACE_USES_WCHAR */