Cleanup ACE_HAS_PTHREAD_SIGMASK_PROTOTYPE, all platforms support it so far as I can...
[ACE_TAO.git] / ACE / ace / UTF16_Encoding_Converter.cpp
blob3934167a9de8335358010795d7e6c9fcebfdf382
1 // ======================================================================
2 //
3 // The actual conversion methods are covered by the copyright information
4 // below. It is not the actual code provided by Unicode, Inc. but is an
5 // ACE-ified and only slightly modified version.
6 // Chad Elliott 4/28/2005
7 //
8 // Copyright 2001-2004 Unicode, Inc.
9 //
10 // Limitations on Rights to Redistribute This Code
12 // Unicode, Inc. hereby grants the right to freely use the information
13 // supplied in this file in the creation of products supporting the
14 // Unicode Standard, and to make copies of this file in any form
15 // for internal or external distribution as long as this notice
16 // remains attached.
18 // ======================================================================
20 #include "ace/UTF16_Encoding_Converter.h"
22 #if defined (ACE_USES_WCHAR)
23 #include "ace/OS_NS_stdio.h"
24 #include "ace/OS_Memory.h"
25 #include "ace/Min_Max.h"
27 #if !defined (__ACE_INLINE__)
28 #include "ace/UTF16_Encoding_Converter.inl"
29 #endif /* __ACE_INLINE__ */
31 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
33 static constexpr ACE_UINT32 halfShift = 10;
34 static constexpr ACE_UINT32 halfBase = 0x00010000;
35 static constexpr ACE_UINT32 halfMask = 0x000003FF;
37 static constexpr ACE_UINT32 UNI_SUR_HIGH_START = 0x0000D800;
38 static constexpr ACE_UINT32 UNI_SUR_HIGH_END = 0x0000DBFF;
39 static constexpr ACE_UINT32 UNI_SUR_LOW_START = 0x0000DC00;
40 static constexpr ACE_UINT32 UNI_SUR_LOW_END = 0x0000DFFF;
41 static constexpr ACE_UINT32 UNI_REPLACEMENT_CHAR = 0x0000FFFD;
42 static constexpr ACE_UINT32 UNI_MAX_BMP = 0x0000FFFF;
43 static constexpr ACE_UINT32 UNI_MAX_UTF16 = 0x0010FFFF;
45 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
46 // into the first byte, depending on how many bytes follow. There are
47 // as many entries in this table as there are UTF-8 sequence types.
48 // (I.e., one byte sequence, two byte... etc.). Remember that sequencs
49 // for *legal* UTF-8 will be 4 or fewer bytes total.
50 static const ACE_Byte firstByteMark[7] = { 0x00, 0x00, 0xC0,
51 0xE0, 0xF0, 0xF8, 0xFC };
53 // Index into the table below with the first byte of a UTF-8 sequence to
54 // get the number of trailing bytes that are supposed to follow it.
55 // Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
56 // left as-is for anyone who may want to do such conversion, which was
57 // allowed in earlier algorithms.
58 static const ACE_Byte trailingBytesForUTF8[256] = {
59 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
60 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
61 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
62 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
63 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
64 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
65 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
66 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
69 // Magic values subtracted from a buffer value during UTF8 conversion.
70 // This table contains as many values as there might be trailing bytes
71 // in a UTF-8 sequence.
72 static const ACE_UINT32 offsetsFromUTF8[6] = { 0x00000000, 0x00003080,
73 0x000E2080, 0x03C82080,
74 0xFA082080, 0x82082080 };
77 ACE_UTF16_Encoding_Converter::ACE_UTF16_Encoding_Converter (bool swap)
78 : swap_ (swap)
82 ACE_UTF16_Encoding_Converter::~ACE_UTF16_Encoding_Converter ()
86 ACE_UTF16_Encoding_Converter::Result
87 ACE_UTF16_Encoding_Converter::to_utf8 (const void* source,
88 size_t source_size,
89 ACE_Byte* target,
90 size_t target_size,
91 bool strict)
93 static const ACE_UINT32 byteMask = 0xBF;
94 static const ACE_UINT32 byteMark = 0x80;
95 Result result = CONVERSION_OK;
97 ACE_Byte* targetEnd = target + target_size;
98 const ACE_UINT16* sourceStart = static_cast<const ACE_UINT16*> (source);
99 const ACE_UINT16* sourceEnd = sourceStart +
100 (source_size / sizeof (ACE_UINT16));
102 while (sourceStart < sourceEnd)
104 ACE_UINT16 nw = *sourceStart++;
105 ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_WORD (nw) : nw);
107 // If we have a surrogate pair, convert to ACE_UINT32 first.
108 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
110 // If the 16 bits following the high surrogate are in the
111 // sourceStart buffer...
112 if (sourceStart < sourceEnd)
114 ACE_UINT32 ch2 = (this->swap_ ? ACE_SWAP_WORD (*sourceStart) :
115 *sourceStart);
116 // If it's a low surrogate, convert to ACE_UINT32.
117 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
119 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
120 + (ch2 - UNI_SUR_LOW_START) + halfBase;
121 ++sourceStart;
123 else if (strict)
125 // it's an unpaired high surrogate
126 result = SOURCE_ILLEGAL;
127 break;
130 else
132 // We don't have the 16 bits following the high surrogate.
133 result = SOURCE_EXHAUSTED;
134 break;
137 else if (strict)
139 // UTF-16 surrogate values are illegal in UTF-32
140 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
142 result = SOURCE_ILLEGAL;
143 break;
147 // Figure out how many bytes the result will require
148 unsigned short bytesToWrite = 0;
149 if (ch < 0x80)
150 bytesToWrite = 1;
151 else if (ch < 0x800)
152 bytesToWrite = 2;
153 else if (ch < 0x10000)
154 bytesToWrite = 3;
155 else if (ch < 0x110000)
156 bytesToWrite = 4;
157 else
159 bytesToWrite = 3;
160 ch = UNI_REPLACEMENT_CHAR;
163 target += bytesToWrite;
164 if (target > targetEnd)
166 result = TARGET_EXHAUSTED;
167 break;
170 // NOTE: Everything falls through for efficiency purposes.
171 switch (bytesToWrite)
173 case 4:
174 *--target = (ACE_Byte)((ch | byteMark) & byteMask);
175 ch >>= 6;
176 case 3:
177 *--target = (ACE_Byte)((ch | byteMark) & byteMask);
178 ch >>= 6;
179 case 2:
180 *--target = (ACE_Byte)((ch | byteMark) & byteMask);
181 ch >>= 6;
182 case 1:
183 *--target = (ACE_Byte)(ch | firstByteMark[bytesToWrite]);
185 target += bytesToWrite;
188 return result;
191 ACE_UTF16_Encoding_Converter::Result
192 ACE_UTF16_Encoding_Converter::from_utf8 (const ACE_Byte* source,
193 size_t source_size,
194 void* target,
195 size_t target_size,
196 bool strict)
198 Result result = CONVERSION_OK;
199 const ACE_Byte* sourceEnd = source + source_size;
200 ACE_UINT16* targetStart = static_cast<ACE_UINT16*> (target);
201 ACE_UINT16* targetEnd = targetStart + target_size;
203 while (source < sourceEnd)
205 ACE_UINT32 ch = 0;
206 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
207 if (source + extraBytesToRead >= sourceEnd)
209 result = SOURCE_EXHAUSTED;
210 break;
213 // Do this check whether lenient or strict
214 if (!this->is_legal_utf8 (source, extraBytesToRead + 1))
216 result = SOURCE_ILLEGAL;
217 break;
220 // The cases all fall through. See "Note A" below.
221 switch (extraBytesToRead)
223 case 5: // remember, illegal UTF-8
224 ch += *source++;
225 ch <<= 6;
226 case 4: // remember, illegal UTF-8
227 ch += *source++;
228 ch <<= 6;
229 case 3:
230 ch += *source++;
231 ch <<= 6;
232 case 2:
233 ch += *source++;
234 ch <<= 6;
235 case 1:
236 ch += *source++;
237 ch <<= 6;
238 case 0:
239 ch += *source++;
241 ch -= offsetsFromUTF8[extraBytesToRead];
243 if (targetStart >= targetEnd)
245 result = TARGET_EXHAUSTED;
246 break;
249 if (ch <= UNI_MAX_BMP) // Target is a character <= 0xFFFF
251 // UTF-16 surrogate values are illegal in UTF-32
252 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
254 if (strict)
256 result = SOURCE_ILLEGAL;
257 break;
259 else
261 *targetStart++ = UNI_REPLACEMENT_CHAR;
264 else
266 *targetStart++ = (ACE_UINT16)ch;
269 else if (ch > UNI_MAX_UTF16)
271 if (strict)
273 result = SOURCE_ILLEGAL;
274 break;
276 else
278 *targetStart++ = UNI_REPLACEMENT_CHAR;
281 else
283 // targetStart is a character in range 0xFFFF - 0x10FFFF.
284 if (targetStart + 1 >= targetEnd)
286 result = TARGET_EXHAUSTED;
287 break;
289 ch -= halfBase;
290 *targetStart++ = (ACE_UINT16)((ch >> halfShift) + UNI_SUR_HIGH_START);
291 *targetStart++ = (ACE_UINT16)((ch & halfMask) + UNI_SUR_LOW_START);
295 return result;
298 ACE_UTF16_Encoding_Converter*
299 ACE_UTF16_Encoding_Converter::encoded (const ACE_Byte* source,
300 size_t source_size)
302 static const size_t begin = 16;
303 static const size_t converted = begin * 4;
305 ACE_Byte target[converted];
306 ACE_UTF16_Encoding_Converter* converter = 0;
307 ACE_NEW_RETURN (converter,
308 ACE_UTF16_Encoding_Converter (false),
310 if (converter->to_utf8 (source,
311 ACE_MIN (begin, source_size),
312 target,
313 converted) == CONVERSION_OK)
315 return converter;
317 else
319 delete converter;
322 return 0;
325 ACE_UINT32
326 ACE_UTF16_Encoding_Converter::get_UNI_SUR_HIGH_START ()
328 return UNI_SUR_HIGH_START;
331 ACE_UINT32
332 ACE_UTF16_Encoding_Converter::get_UNI_SUR_LOW_END ()
334 return UNI_SUR_LOW_END;
337 ACE_UINT32
338 ACE_UTF16_Encoding_Converter::get_UNI_REPLACEMENT_CHAR ()
340 return UNI_REPLACEMENT_CHAR;
343 const ACE_Byte*
344 ACE_UTF16_Encoding_Converter::get_first_byte_mark ()
346 return firstByteMark;
349 const ACE_Byte*
350 ACE_UTF16_Encoding_Converter::get_trailing_bytes_for_utf8 ()
352 return trailingBytesForUTF8;
355 const ACE_UINT32*
356 ACE_UTF16_Encoding_Converter::get_offsets_from_utf8 ()
358 return offsetsFromUTF8;
361 ACE_END_VERSIONED_NAMESPACE_DECL
362 #endif /* ACE_USES_WCHAR */