Revert "Use a variable on the stack to not have a temporary in the call"
[ACE_TAO.git] / TAO / tests / CodeSets / libs / UCS4_UTF16 / WUCS4_UTF16.cpp
bloba8f749ecfdc0ac83fbebd48d72b51c71b4887825
1 // -*- C++ -*-
3 //=============================================================================
4 /**
5 * @file WUCS4_UTF16.cpp
7 * Defines the arrays required to convert between UCS-4, a 4-byte wide char
8 * codeset, and UTF-16, aka Unicode, a 2-byte codeset.
10 * @author Phil Mesnier <mesnier_p@ociweb.com>
12 //=============================================================================
15 #include "WUCS4_UTF16.h"
16 #include "ace/OS_Memory.h"
18 // ****************************************************************
21 // @@ TODO: Find a better home for these definition
22 // Note: unlike the UNICODE standard we define these as
23 // half-closed ranges i.e.
24 // *BEGIN is the first value in the range
25 // *END is the first value beyond the range (END is not included
26 // in the range)
27 // Note the use of unsigned short for UTF-16 codepoints. wchar_t may
28 // by four bytes
29 typedef ACE_CDR::UShort ACE_UTF16_T;
30 static const size_t ACE_UTF16_CODEPOINT_SIZE = sizeof(ACE_UTF16_T);
32 // surrogate high 1101.10HH.HHHH.HHHH
33 // surrogate low 1101.11LL.LLLL.LLLL
34 // 4 byte result: 0000.0000.0000.HHHH.HHHH.HHLL.LLLL.LLLL
35 // add offset 0000.0000.0000.0000.0001.0000.0000.0000
37 // range of surrogate values for high-order bits
38 static const unsigned short ACE_UTF16_SURROGATE_HIGH_BEGIN = 0xD800U;
39 static const unsigned short ACE_UTF16_SURROGATE_HIGH_END = 0xDC00U;
41 static const unsigned short ACE_UTF16_SURROGATE_LOW_BEGIN = 0xDC00U;
42 static const unsigned short ACE_UTF16_SURROGATE_LOW_END = 0xE000U;
44 // offset to UTF16 values encoded with surrogates start at 2^16
45 static const unsigned long ACE_UTF16_SURROGATE_OFFSET = 0x000010000UL;
47 // shift high order bits from surrogate into correct postion
48 static const int ACE_UTF16_SURROGATE_HIGH_SHIFT = 10;
49 static const unsigned short ACE_UTF16_SURROGATE_LOW_MASK = 0x3FF;
51 // largest value that can be represented in UTF16 without using surrogates + 1
52 static const unsigned long ACE_UTF16_RAW_END = 0x00010000LU;
54 // largest value that can be represented in UTF16 + 1
55 static const unsigned long ACE_UTF16_END = 0x00110000LU;
57 static const unsigned short ACE_UNICODE_SUBSTITUTE_CHARACTER = 0xFFFDU;
58 static const unsigned short ACE_UNICODE_BOM_CORRECT = 0xFEFFU;
59 static const unsigned short ACE_UNICODE_BOM_SWAPPED = 0xFFFEU;
61 /////////////////////////////////////////////////////
62 // Static inline routines to simplify conversion code
63 // @@ should be in anonymous namespace when ACE allows it
64 // or better yet, there should be a UTF-16 support thingie(technical term)
65 // that provides these methods.
66 // Performance: depends on compiler inlining + optimization for performance
68 /// load next two bytes from buffer into a short. Byte swapping as necessary
69 static
70 //ACE_INLINE
71 ACE_UTF16_T
72 load_raw_wchar (const char * buffer, size_t & pos, int do_byte_swap)
74 // need a two byte object to load the UTF16 2 byte codepoint
75 ACE_UTF16_T utf16_char = * reinterpret_cast<ACE_UTF16_T const *> (&buffer[pos*ACE_UTF16_CODEPOINT_SIZE]);
76 #if ! defined (ACE_DISABLE_SWAP_ON_READ)
77 if (do_byte_swap)
79 ACE_CDR::swap_2 (
80 &buffer[pos*ACE_UTF16_CODEPOINT_SIZE],
81 reinterpret_cast<char *> (&utf16_char));
83 #endif
84 pos ++;
85 return utf16_char;
88 /// convert UTF-16 surrogate pair to wchar_t
89 static
90 //ACE_INLINE
91 ACE_CDR::WChar
92 convert_surrogate_pair (ACE_UTF16_T high, ACE_UTF16_T low)
94 return static_cast<ACE_CDR::WChar> (((high - ACE_UTF16_SURROGATE_HIGH_BEGIN) << ACE_UTF16_SURROGATE_HIGH_SHIFT)
95 + (low - ACE_UTF16_SURROGATE_LOW_BEGIN)
96 + ACE_UTF16_SURROGATE_OFFSET);
99 /// load wchar from utf16 buffer
100 /// converts surrogate pairs
101 /// substitutes SUBSTITUTE_CHAR for bad encoding
102 static
103 //ACE_INLINE
104 ACE_CDR::WChar
105 load_wchar (const char * buffer, size_t & pos, size_t length, int do_byte_swap)
107 ACE_CDR::WChar rc = ACE_UNICODE_SUBSTITUTE_CHARACTER;
108 if (pos < length)
110 rc = static_cast<ACE_CDR::WChar> (load_raw_wchar (buffer, pos, do_byte_swap));
111 // Is this a UTF16 surrogate?
112 // note assumpton that SURROGATE_HIGH_END == SURROGATE_LOW_BEGIN
113 if (rc >= ACE_UTF16_SURROGATE_HIGH_BEGIN && rc < ACE_UTF16_SURROGATE_LOW_END)
115 // if we still have two bytes available
116 if (pos < length)
118 // expecting high surrogate
119 if (rc < ACE_UTF16_SURROGATE_HIGH_END)
121 ACE_UTF16_T low = load_raw_wchar (buffer, pos, do_byte_swap);
122 if (low >= ACE_UTF16_SURROGATE_LOW_BEGIN
123 && low < ACE_UTF16_SURROGATE_LOW_END)
125 rc = convert_surrogate_pair (
126 static_cast<ACE_UTF16_T> (rc), low);
128 else
130 rc = ACE_UNICODE_SUBSTITUTE_CHARACTER;
133 else
135 rc = ACE_UNICODE_SUBSTITUTE_CHARACTER;
138 else
140 rc = ACE_UNICODE_SUBSTITUTE_CHARACTER;
144 return rc;
147 static
148 //ACE_INLINE
149 size_t encode_utf16 (ACE_UTF16_T * buffer, ACE_CDR::WChar value)
151 buffer[0] = static_cast<ACE_UTF16_T> (value);
152 size_t length = 1;
154 // On platforms where sizeof(ACE_CDR::WChar) == 2, the test using
155 // ul_value will always be false, since we are improperly using
156 // a 4-byte native wchar codeset. But since this is for a simple
157 // test that has to run on machines with 4 byte wchars, this cast
158 // avoids compile time issues of comparing a value that starts out
159 // as a short with a constant that is too big for a short.
160 unsigned long ul_value = static_cast<unsigned long>(value);
161 if (value >= ACE_UTF16_SURROGATE_HIGH_BEGIN)
163 if (value < ACE_UTF16_SURROGATE_LOW_END)
165 buffer[0] = ACE_UNICODE_SUBSTITUTE_CHARACTER;
167 else if (ul_value >= ACE_UTF16_RAW_END)
169 if (ul_value >= ACE_UTF16_END)
171 buffer[0] = ACE_UNICODE_SUBSTITUTE_CHARACTER;
173 else
175 ACE_CDR::WChar offset = static_cast<ACE_CDR::WChar> (value - ACE_UTF16_SURROGATE_OFFSET);
176 buffer[0] = (offset >> ACE_UTF16_SURROGATE_HIGH_SHIFT)
177 + ACE_UTF16_SURROGATE_HIGH_BEGIN;
178 buffer[1] = (offset & ACE_UTF16_SURROGATE_LOW_MASK)
179 + ACE_UTF16_SURROGATE_LOW_BEGIN;
180 length = 2;
184 return length;
187 /// count number of characters in native WString that will be converted
188 /// to UTF-16 surrogate pairs
189 static
190 size_t count_potential_surrogates (
191 const ACE_CDR::WChar *buffer,
192 ACE_CDR::ULong len)
194 size_t count = 0;
195 for (size_t i = 0; i < len; ++i)
197 // see comments above in encode_utf16().
198 unsigned long ul_value = static_cast<unsigned long>(buffer[i]);
199 if (ul_value >= ACE_UTF16_RAW_END &&
200 ul_value < ACE_UTF16_END)
202 count += 1;
205 return count;
209 /////////////////////////////
210 // WUCS4_UTF16 implementation
212 WUCS4_UTF16::WUCS4_UTF16 ()
216 WUCS4_UTF16::~WUCS4_UTF16 ()
220 // = Documented in $ACE_ROOT/ace/CDR_Stream.h
221 ACE_CDR::Boolean
222 WUCS4_UTF16::read_wchar (ACE_InputCDR &cdr, ACE_CDR::WChar &x)
224 if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1
225 && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1)
227 ACE_CDR::Octet len;
228 if (! this->read_1 (cdr, &len))
230 return 0;
233 int old_bo = cdr.byte_order();
235 ACE_UTF16_T sx = 0;
236 if (! this->read_2 (cdr,&sx))
238 return 0;
241 // Check for byte order mark, if found, consume and honor it.
242 if (sx == ACE_UNICODE_BOM_CORRECT || sx == ACE_UNICODE_BOM_SWAPPED)
244 // if we found it, but it came in in the wrong order
245 // invert the byte order flag for the duration of this method
246 if (sx == ACE_UNICODE_BOM_SWAPPED)
248 cdr.reset_byte_order (! old_bo);
250 this->read_2 (cdr,&sx);
253 // check for UTF-16 surrogate pair, and if found interpret it
254 if (sx >= ACE_UTF16_SURROGATE_HIGH_BEGIN
255 && sx < ACE_UTF16_SURROGATE_LOW_END)
257 if (sx >= ACE_UTF16_SURROGATE_HIGH_END)
259 cdr.reset_byte_order (old_bo);
260 return 0;
263 ACE_UTF16_T low;
264 if (! this->read_2 (cdr, &low))
266 cdr.reset_byte_order (old_bo);
267 return 0;
269 if (low < ACE_UTF16_SURROGATE_LOW_BEGIN
270 || low >= ACE_UTF16_SURROGATE_LOW_END)
272 cdr.reset_byte_order (old_bo);
273 return 0;
275 x = convert_surrogate_pair (sx, low);
277 else
279 x = static_cast<ACE_CDR::WChar> (sx);
282 cdr.reset_byte_order (old_bo);
284 else
286 ACE_UTF16_T sx = 0;
287 if (!this->read_2 (cdr, &sx))
289 return 0;
291 x = static_cast<ACE_CDR::WChar> (sx);
293 return 1;
296 ACE_CDR::Boolean
297 WUCS4_UTF16::read_wstring (ACE_InputCDR &cdr,
298 ACE_CDR::WChar *&x)
300 ACE_CDR::ULong len;
301 if (!this->read_4 (cdr, &len))
302 return 0;
304 // A check for the length being too great is done later in the
305 // call to read_char_array but we want to have it done before
306 // the memory is allocated.
307 if (len > 0 && len <= cdr.length())
309 if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1
310 && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1)
312 len /= ACE_UTF16_CODEPOINT_SIZE;
314 //allocating one extra for the null character needed by applications
315 ACE_NEW_RETURN (x,
316 ACE_CDR::WChar [len + 1],
318 x[len] = L'\x00';
319 if (this->read_wchar_array_i (cdr, x, len,1))
321 // Since reading the array may have adjusted the length,
322 // we simply rewrite the null terminator
323 x[len] = L'\x00';
324 return 1;
327 else
329 ACE_NEW_RETURN (x,
330 ACE_CDR::WChar [len],
332 if (this->read_wchar_array (cdr, x, len))
333 return 1;
335 delete [] x;
337 else if (len == 0)
339 // Convert any null strings to empty strings since empty
340 // strings can cause crashes. (See bug 58.)
341 ACE_NEW_RETURN (x,
342 ACE_CDR::WChar[1],
344 x[0] = '\x00';
345 return 1;
347 x = 0;
348 return 0;
351 ACE_CDR::Boolean
352 WUCS4_UTF16::read_wchar_array_i (ACE_InputCDR & cdr,
353 ACE_CDR::WChar *x,
354 ACE_CDR::ULong &length,
355 int adjust_len)
357 if (length == 0)
358 return 1;
359 char* buf;
360 size_t align = ACE_CDR::SHORT_ALIGN;
361 if (cdr.adjust (ACE_UTF16_CODEPOINT_SIZE * length, align, buf) == 0)
363 int byte_swap = cdr.do_byte_swap();
364 size_t pos = 0;
366 // check for byte order mark. If found, honor it then discard it
367 ACE_UTF16_T bom = load_raw_wchar (buf, pos, byte_swap);
368 if (bom == ACE_UNICODE_BOM_CORRECT || bom == ACE_UNICODE_BOM_SWAPPED)
370 if (bom == ACE_UNICODE_BOM_SWAPPED)
372 byte_swap = !byte_swap;
374 buf += ACE_UTF16_CODEPOINT_SIZE;
375 if (adjust_len)
376 length -= 1;
378 size_t bpos = 0;
379 for (size_t xpos = 0; xpos < length; ++xpos)
381 x[xpos] = load_wchar (buf, bpos, length, byte_swap);
384 return 1;
386 return 0;
390 ACE_CDR::Boolean
391 WUCS4_UTF16::read_wchar_array (ACE_InputCDR & cdr,
392 ACE_CDR::WChar *x,
393 ACE_CDR::ULong length)
395 if (length == 0)
396 return 1;
398 if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1
399 && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1)
401 for (size_t i = 0; i < length; i++)
402 if (!this->read_wchar(cdr,x[i]))
403 return 0;
404 return 1;
406 else
407 return this->read_wchar_array_i(cdr,x,length);
410 ACE_CDR::Boolean
411 WUCS4_UTF16::write_wchar (ACE_OutputCDR &cdr,
412 ACE_CDR::WChar x)
414 int encode_len = 1;
415 if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 0)
416 { // wchar is not allowed with GIOP 1.0
417 errno = EINVAL;
418 return 0;
420 else if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 1)
421 encode_len = 0;
423 return write_wchar_i(cdr,x,1,encode_len);
426 ACE_CDR::Boolean
427 WUCS4_UTF16::write_wchar_i (ACE_OutputCDR &cdr,
428 ACE_CDR::WChar x,
429 int use_BOM,
430 int encode_len)
432 // If the desired char cannot be translated into a single unicode char,
433 // we must raise a marshal exception.
435 // see the comment in encode_utf16() regarding the cast.
436 unsigned long ul_x = static_cast<unsigned long>(x);
437 if (ul_x >= ACE_UTF16_RAW_END &&
438 ul_x < ACE_UTF16_END)
439 return 0;
441 int len = 0;
442 ACE_CDR::UShort buffer[2];
443 if (use_BOM)
445 len = 2;
446 buffer[0] = ACE_UNICODE_BOM_CORRECT;
447 buffer[1] = static_cast<ACE_CDR::Short> (x);
449 else
451 len = 1;
452 if (cdr.byte_order())
453 ACE_CDR::swap_2 (reinterpret_cast<const char *> (&x),
454 reinterpret_cast<char *> (buffer));
455 else
456 buffer[0] = static_cast<ACE_CDR::Short> (x);
459 if (encode_len)
461 unsigned char tcsize = static_cast<unsigned char> (len * ACE_UTF16_CODEPOINT_SIZE);
462 if (this->write_1 (cdr, &tcsize))
463 return this->write_array(cdr, &buffer, tcsize, 1, 1);
464 else
465 return 0;
467 if (this->write_2 (cdr, buffer) == 0)
468 return 0;
469 if (len == 2)
470 return this->write_2 (cdr,buffer+1);
471 return 1;
474 ACE_CDR::Boolean
475 WUCS4_UTF16::write_wstring (ACE_OutputCDR & cdr,
476 ACE_CDR::ULong len,
477 const ACE_CDR::WChar *x)
479 if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1
480 && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1)
482 // count characters that will require surrogates to
483 // determine transmission length
484 len++; // make room for BOM
485 ACE_UTF16_T bom = ACE_UNICODE_BOM_CORRECT;
486 ACE_CDR::ULong length = len + count_potential_surrogates (x, len);
487 ACE_CDR::ULong l = length * ACE_UTF16_CODEPOINT_SIZE;
489 if (this->write_4 (cdr, &l) && x != 0)
491 this->write_2 (cdr, &bom);
492 return this->write_measured_wchar_array (cdr, x, len, length);
495 else
497 ACE_CDR::ULong l = len + 1;
499 if (this->write_4 (cdr, &l))
501 if (x != 0)
503 return this->write_wchar_array (cdr, x, len + 1);
505 else
507 ACE_UTF16_T s = 0;
508 return this->write_2 (cdr, &s);
513 return 0;
516 ACE_CDR::Boolean
517 WUCS4_UTF16::write_wchar_array (ACE_OutputCDR & cdr,
518 const ACE_CDR::WChar *x,
519 ACE_CDR::ULong length)
521 #if 0
522 // I do not believe this is correct, because this could yield an array
523 // with an incorrect number of elements for the space allotted.
524 return this->write_measured_wchar_array (
525 cdr,
527 length,
528 length + count_potential_surrogates (x, length));
529 #endif
531 int encode_len = 1;
532 if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 0)
533 { // wchar is not allowed with GIOP 1.0
534 errno = EINVAL;
535 return 0;
537 else if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 1)
538 encode_len = 0;
540 for (size_t i = 0; i < length; i++)
541 if (this->write_wchar_i (cdr,x[i],0,encode_len) == 0)
542 return 0;
543 return 1;
546 ACE_CDR::Boolean
547 WUCS4_UTF16::write_measured_wchar_array (ACE_OutputCDR & cdr,
548 const ACE_CDR::WChar *x,
549 ACE_CDR::ULong length,
550 ACE_CDR::ULong transmission_length)
552 if (length == 0)
553 return 1;
554 char* buf;
555 size_t align = ACE_CDR::SHORT_ALIGN;
556 if (cdr.adjust (ACE_UTF16_CODEPOINT_SIZE * transmission_length, align, buf)
557 != 0)
559 return 0;
562 ACE_UTF16_T *sb = reinterpret_cast<ACE_UTF16_T *> (buf);
563 size_t sbpos = 0;
565 for (size_t i = 0; i < length; i++)
567 sbpos += encode_utf16 (& sb[sbpos], x[i]);
569 #if defined (ACE_ENABLE_SWAP_ON_WRITE)
570 // @note this will rarely be enabled.
571 if (cdr.do_byte_swap())
573 // note can't use swap_2_array because in-place swaps are not safe :-<
574 // and we don't want to allocate a new array
575 for (size_t i = 0; i < sbpos; i++)
577 char * pchar = reinterpret_cast<char *> (&sb[i]);
578 // ACE_CDR::swap_2 (pchar, pchar);
579 // can't use swap_2 because inplace swaps are not safe
580 // and work-arounds like copying to another buffer lose
581 // any performance improvement from
582 // that fancy asm code, so we might as well just:
583 char temp = pchar[0];
584 pchar[0] = pchar[1];
585 pchar[1] = temp;
586 //@@TODO write swap_2(char * inplace_buffer);
589 #endif /* ACE_ENABLE_SWAP_ON_WRITE */
590 return 1;