3 //=============================================================================
5 * @file WUCS4_UTF16.cpp
7 * Defines the arrays required to convert between UCS-4, a 4-byte wide char
8 * codeset, and UTF-16 (aka Unicode), a codeset built from 2-byte code units.
10 * @author Phil Mesnier <mesnier_p@ociweb.com>
12 //=============================================================================
15 #include "WUCS4_UTF16.h"
16 #include "ace/OS_Memory.h"
18 // ****************************************************************
21 // @@ TODO: Find a better home for these definitions
22 // Note: unlike the UNICODE standard we define these as
23 // half-closed ranges, i.e.
24 // *BEGIN is the first value in the range
25 // *END is the first value beyond the range (END is not included)
27 // Note the use of unsigned short for UTF-16 codepoints. wchar_t may
29 typedef ACE_CDR::UShort ACE_UTF16_T
;
30 static const size_t ACE_UTF16_CODEPOINT_SIZE
= sizeof(ACE_UTF16_T
);
// UTF-16 surrogate pair layout:
//   surrogate high  1101.10HH.HHHH.HHHH
//   surrogate low   1101.11LL.LLLL.LLLL
//   4 byte result:  0000.0000.0000.HHHH.HHHH.HHLL.LLLL.LLLL
//   add offset      0000.0000.0000.0001.0000.0000.0000.0000  (0x10000)

// range of surrogate values for high-order bits: [BEGIN, END)
static const unsigned short ACE_UTF16_SURROGATE_HIGH_BEGIN = 0xD800U;
static const unsigned short ACE_UTF16_SURROGATE_HIGH_END = 0xDC00U;

// range of surrogate values for low-order bits: [BEGIN, END)
static const unsigned short ACE_UTF16_SURROGATE_LOW_BEGIN = 0xDC00U;
static const unsigned short ACE_UTF16_SURROGATE_LOW_END = 0xE000U;

// offset to UTF16 values encoded with surrogates start at 2^16
static const unsigned long ACE_UTF16_SURROGATE_OFFSET = 0x00010000UL;

// shift high order bits from surrogate into correct position
static const int ACE_UTF16_SURROGATE_HIGH_SHIFT = 10;
// mask selecting the 10 payload bits carried by the low surrogate
static const unsigned short ACE_UTF16_SURROGATE_LOW_MASK = 0x3FF;

// largest value that can be represented in UTF16 without using surrogates + 1
static const unsigned long ACE_UTF16_RAW_END = 0x00010000UL;

// largest value that can be represented in UTF16 + 1
static const unsigned long ACE_UTF16_END = 0x00110000UL;

// character substituted for values that cannot be encoded or decoded
static const unsigned short ACE_UNICODE_SUBSTITUTE_CHARACTER = 0xFFFDU;
// byte order mark as read in the expected byte order ...
static const unsigned short ACE_UNICODE_BOM_CORRECT = 0xFEFFU;
// ... and the same mark as it appears when the two bytes are swapped
static const unsigned short ACE_UNICODE_BOM_SWAPPED = 0xFFFEU;
61 /////////////////////////////////////////////////////
62 // Static inline routines to simplify conversion code
63 // @@ should be in anonymous namespace when ACE allows it
64 // or better yet, there should be a UTF-16 support thingie(technical term)
65 // that provides these methods.
66 // Performance: depends on compiler inlining + optimization for performance
68 /// load next two bytes from buffer into a short. Byte swapping as necessary
///
/// @param buffer        raw bytes holding UTF-16 code units.
/// @param pos           index of the code unit to load, counted in 2-byte
///                      units: the byte offset used is
///                      pos * ACE_UTF16_CODEPOINT_SIZE.
/// @param do_byte_swap  byte-swap request flag.  NOTE(review): the statement
///                      that tests it is not visible in this listing.
///
/// NOTE(review): the return type, the braces, the head of the swap call and
/// the return statement do not appear in this listing; whether @a pos is
/// advanced here cannot be told from the visible lines -- confirm against the
/// full source.
72 load_raw_wchar (const char * buffer
, size_t & pos
, int do_byte_swap
)
74 // need a two byte object to load the UTF16 2 byte codepoint
// The code unit is read in place by reinterpreting the byte address
// &buffer[pos * ACE_UTF16_CODEPOINT_SIZE] as a pointer to ACE_UTF16_T.
75 ACE_UTF16_T utf16_char
= * reinterpret_cast<ACE_UTF16_T
const *> (&buffer
[pos
*ACE_UTF16_CODEPOINT_SIZE
]);
76 #if ! defined (ACE_DISABLE_SWAP_ON_READ)
// Only compiled when swap-on-read has not been disabled.  The two arguments
// below are the same buffer address and utf16_char viewed as char*;
// presumably the source and destination of a 2-byte swap (callee not visible).
80 &buffer
[pos
*ACE_UTF16_CODEPOINT_SIZE
],
81 reinterpret_cast<char *> (&utf16_char
));
88 /// convert UTF-16 surrogate pair to wchar_t
///
/// @param high  first code unit of the pair; ACE_UTF16_SURROGATE_HIGH_BEGIN is
///              subtracted from it and the result is shifted left by
///              ACE_UTF16_SURROGATE_HIGH_SHIFT bits.
/// @param low   second code unit of the pair; ACE_UTF16_SURROGATE_LOW_BEGIN is
///              subtracted from it.
/// @return  the sum of the two terms above plus ACE_UTF16_SURROGATE_OFFSET,
///          cast to ACE_CDR::WChar.
///
/// No range check of @a high or @a low is visible here; the visible callers
/// validate both values before calling.
92 convert_surrogate_pair (ACE_UTF16_T high
, ACE_UTF16_T low
)
94 return static_cast<ACE_CDR::WChar
> (((high
- ACE_UTF16_SURROGATE_HIGH_BEGIN
) << ACE_UTF16_SURROGATE_HIGH_SHIFT
)
95 + (low
- ACE_UTF16_SURROGATE_LOW_BEGIN
)
96 + ACE_UTF16_SURROGATE_OFFSET
);
99 /// load wchar from utf16 buffer
100 /// converts surrogate pairs
101 /// substitutes SUBSTITUTE_CHAR for bad encoding
///
/// @param buffer        raw bytes holding UTF-16 code units.
/// @param pos           code-unit index, passed by reference to
///                      load_raw_wchar() for every unit read.
/// @param length        NOTE(review): the bounds test that uses this value is
///                      not visible in this listing.
/// @param do_byte_swap  forwarded unchanged to load_raw_wchar().
///
/// NOTE(review): the return type, braces, else keywords and the return
/// statement are absent from this listing.
105 load_wchar (const char * buffer
, size_t & pos
, size_t length
, int do_byte_swap
)
// Start from the substitute character.
107 ACE_CDR::WChar rc
= ACE_UNICODE_SUBSTITUTE_CHARACTER
;
110 rc
= static_cast<ACE_CDR::WChar
> (load_raw_wchar (buffer
, pos
, do_byte_swap
));
111 // Is this a UTF16 surrogate?
112 // note assumption that SURROGATE_HIGH_END == SURROGATE_LOW_BEGIN
113 if (rc
>= ACE_UTF16_SURROGATE_HIGH_BEGIN
&& rc
< ACE_UTF16_SURROGATE_LOW_END
)
115 // if we still have two bytes available
118 // expecting high surrogate
119 if (rc
< ACE_UTF16_SURROGATE_HIGH_END
)
// Read the following code unit and accept it only if it lies in the
// low-surrogate range.
121 ACE_UTF16_T low
= load_raw_wchar (buffer
, pos
, do_byte_swap
);
122 if (low
>= ACE_UTF16_SURROGATE_LOW_BEGIN
123 && low
< ACE_UTF16_SURROGATE_LOW_END
)
125 rc
= convert_surrogate_pair (
126 static_cast<ACE_UTF16_T
> (rc
), low
);
// NOTE(review): the three assignments below presumably sit in the else
// branches of the tests above (invalid low surrogate, unexpected low
// surrogate, not enough input left); the branch structure itself is not
// visible in this listing.
130 rc
= ACE_UNICODE_SUBSTITUTE_CHARACTER
;
135 rc
= ACE_UNICODE_SUBSTITUTE_CHARACTER
;
140 rc
= ACE_UNICODE_SUBSTITUTE_CHARACTER
;
/// Encode one wide character as UTF-16 code units.
///
/// @param buffer  destination; buffer[0] is always written and buffer[1] is
///                written when a surrogate pair is produced, so it must have
///                room for two ACE_UTF16_T values.
/// @param value   character to encode.
/// @return  size_t.  NOTE(review): the return statements are not visible in
///          this listing; presumably the number of code units written.
///
/// Visible behaviour: values in the surrogate range
/// [ACE_UTF16_SURROGATE_HIGH_BEGIN, ACE_UTF16_SURROGATE_LOW_END) and values
/// >= ACE_UTF16_END are replaced by ACE_UNICODE_SUBSTITUTE_CHARACTER; values in
/// [ACE_UTF16_RAW_END, ACE_UTF16_END) are split into a high/low surrogate pair.
149 size_t encode_utf16 (ACE_UTF16_T
* buffer
, ACE_CDR::WChar value
)
// Default: store the value truncated to a single code unit.
151 buffer
[0] = static_cast<ACE_UTF16_T
> (value
);
154 // On platforms where sizeof(ACE_CDR::WChar) == 2, the test using
155 // ul_value will always be false, since we are improperly using
156 // a 4-byte native wchar codeset. But since this is for a simple
157 // test that has to run on machines with 4 byte wchars, this cast
158 // avoids compile time issues of comparing a value that starts out
159 // as a short with a constant that is too big for a short.
160 unsigned long ul_value
= static_cast<unsigned long>(value
);
161 if (value
>= ACE_UTF16_SURROGATE_HIGH_BEGIN
)
// A raw surrogate value is not a valid character on its own.
163 if (value
< ACE_UTF16_SURROGATE_LOW_END
)
165 buffer
[0] = ACE_UNICODE_SUBSTITUTE_CHARACTER
;
167 else if (ul_value
>= ACE_UTF16_RAW_END
)
// Beyond the last value UTF-16 can represent.
169 if (ul_value
>= ACE_UTF16_END
)
171 buffer
[0] = ACE_UNICODE_SUBSTITUTE_CHARACTER
;
// Remove the 2^16 offset, then put the bits above the shift into the high
// surrogate and the masked low bits into the low surrogate.
175 ACE_CDR::WChar offset
= static_cast<ACE_CDR::WChar
> (value
- ACE_UTF16_SURROGATE_OFFSET
);
176 buffer
[0] = (offset
>> ACE_UTF16_SURROGATE_HIGH_SHIFT
)
177 + ACE_UTF16_SURROGATE_HIGH_BEGIN
;
178 buffer
[1] = (offset
& ACE_UTF16_SURROGATE_LOW_MASK
)
179 + ACE_UTF16_SURROGATE_LOW_BEGIN
;
187 /// count number of characters in native WString that will be converted
188 /// to UTF-16 surrogate pairs
///
/// @param buffer  wide characters to inspect; elements 0 .. len-1 are read.
/// @return  size_t.  NOTE(review): the counter and the return statement are
///          not visible in this listing; presumably the number of elements
///          that satisfy the range test below.
///
/// NOTE(review): the declaration of the second parameter (used below as
/// `len`) is absent from this listing.
190 size_t count_potential_surrogates (
191 const ACE_CDR::WChar
*buffer
,
195 for (size_t i
= 0; i
< len
; ++i
)
197 // see comments above in encode_utf16().
198 unsigned long ul_value
= static_cast<unsigned long>(buffer
[i
]);
// Same half-open range that encode_utf16() turns into a surrogate pair.
199 if (ul_value
>= ACE_UTF16_RAW_END
&&
200 ul_value
< ACE_UTF16_END
)
209 /////////////////////////////
210 // WUCS4_UTF16 implementation
// Constructor and destructor.  NOTE(review): neither body is visible in this
// listing.
212 WUCS4_UTF16::WUCS4_UTF16 ()
216 WUCS4_UTF16::~WUCS4_UTF16 ()
220 // = Documented in $ACE_ROOT/ace/CDR_Stream.h
//
// Read one wide character from @a cdr into @a x.
//
// When major_version(cdr) == 1 and minor_version(cdr) > 1, a one-byte value
// is read first via read_1(), then a 2-byte code unit.  A byte order mark is
// consumed if present, and a swapped mark flips the stream byte order until
// the saved order is restored.  A high surrogate must be followed by a valid
// low surrogate, and the pair is combined with convert_surrogate_pair().
// For other versions a single 2-byte value is read and widened.
//
// NOTE(review): the return type, the declarations of len, sx and low, the
// braces/else keywords and every return statement are absent from this
// listing, so the exact success/failure values are not visible here.
222 WUCS4_UTF16::read_wchar (ACE_InputCDR
&cdr
, ACE_CDR::WChar
&x
)
224 if (static_cast<ACE_CDR::Short
> (this->major_version(cdr
)) == 1
225 && static_cast<ACE_CDR::Short
> (this->minor_version(cdr
)) > 1)
228 if (! this->read_1 (cdr
, &len
))
// Remember the stream byte order so it can be restored before leaving.
233 int old_bo
= cdr
.byte_order();
236 if (! this->read_2 (cdr
,&sx
))
241 // Check for byte order mark, if found, consume and honor it.
242 if (sx
== ACE_UNICODE_BOM_CORRECT
|| sx
== ACE_UNICODE_BOM_SWAPPED
)
244 // if we found it, but it came in in the wrong order
245 // invert the byte order flag for the duration of this method
246 if (sx
== ACE_UNICODE_BOM_SWAPPED
)
248 cdr
.reset_byte_order (! old_bo
);
// The mark is not data: read the code unit that follows it.  The result of
// this read is not checked in the visible code.
250 this->read_2 (cdr
,&sx
);
253 // check for UTF-16 surrogate pair, and if found interpret it
254 if (sx
>= ACE_UTF16_SURROGATE_HIGH_BEGIN
255 && sx
< ACE_UTF16_SURROGATE_LOW_END
)
// A low surrogate with no preceding high surrogate: the byte order is
// restored; presumably the read then fails (return not visible).
257 if (sx
>= ACE_UTF16_SURROGATE_HIGH_END
)
259 cdr
.reset_byte_order (old_bo
);
264 if (! this->read_2 (cdr
, &low
))
266 cdr
.reset_byte_order (old_bo
);
// The second unit must lie in the low-surrogate range.
269 if (low
< ACE_UTF16_SURROGATE_LOW_BEGIN
270 || low
>= ACE_UTF16_SURROGATE_LOW_END
)
272 cdr
.reset_byte_order (old_bo
);
275 x
= convert_surrogate_pair (sx
, low
);
// Not a surrogate: the single code unit is the character.
279 x
= static_cast<ACE_CDR::WChar
> (sx
);
282 cdr
.reset_byte_order (old_bo
);
// Other protocol versions: one 2-byte value, no length byte or mark handling
// visible.
287 if (!this->read_2 (cdr
, &sx
))
291 x
= static_cast<ACE_CDR::WChar
> (sx
);
// Read a wide string from @a cdr.
//
// A 4-byte length is read first.  If it is non-zero and does not exceed
// cdr.length(), storage is allocated and filled: for major version 1 with
// minor version > 1 the length is divided by ACE_UTF16_CODEPOINT_SIZE, one
// extra element is allocated, and the data is read with
// read_wchar_array_i(); otherwise `len` elements are allocated and read with
// read_wchar_array().
//
// NOTE(review): the return type, the remaining parameter(s), the declaration
// of len, the allocation macro head, the braces and all return statements are
// absent from this listing.
297 WUCS4_UTF16::read_wstring (ACE_InputCDR
&cdr
,
301 if (!this->read_4 (cdr
, &len
))
304 // A check for the length being too great is done later in the
305 // call to read_char_array but we want to have it done before
306 // the memory is allocated.
307 if (len
> 0 && len
<= cdr
.length())
309 if (static_cast<ACE_CDR::Short
> (this->major_version(cdr
)) == 1
310 && static_cast<ACE_CDR::Short
> (this->minor_version(cdr
)) > 1)
// Convert the length just read into a count of 2-byte code units.
312 len
/= ACE_UTF16_CODEPOINT_SIZE
;
314 //allocating one extra for the null character needed by applications
316 ACE_CDR::WChar
[len
+ 1],
// The trailing 1 is the last argument of read_wchar_array_i(); presumably it
// tells it to look for a byte order mark -- confirm against its signature.
319 if (this->read_wchar_array_i (cdr
, x
, len
,1))
321 // Since reading the array may have adjusted the length,
322 // we simply rewrite the null terminator
330 ACE_CDR::WChar
[len
],
332 if (this->read_wchar_array (cdr
, x
, len
))
339 // Convert any null strings to empty strings since null
340 // strings can cause crashes. (See bug 58.)
// Read @a length UTF-16 code units from @a cdr and decode them into x.
//
// The stream is asked (cdr.adjust) for ACE_UTF16_CODEPOINT_SIZE * length
// bytes at SHORT alignment; the characters are then decoded from that buffer
// with load_wchar().  A leading byte order mark is honoured and skipped.
// @a length is taken by reference.
//
// NOTE(review): the return type, the other parameters (the destination x and
// presumably a flag that enables the mark check), the declarations of buf,
// pos and bpos, the braces and the return statements are absent from this
// listing.
352 WUCS4_UTF16::read_wchar_array_i (ACE_InputCDR
& cdr
,
354 ACE_CDR::ULong
&length
,
360 size_t align
= ACE_CDR::SHORT_ALIGN
;
361 if (cdr
.adjust (ACE_UTF16_CODEPOINT_SIZE
* length
, align
, buf
) == 0)
363 int byte_swap
= cdr
.do_byte_swap();
366 // check for byte order mark. If found, honor it then discard it
367 ACE_UTF16_T bom
= load_raw_wchar (buf
, pos
, byte_swap
);
368 if (bom
== ACE_UNICODE_BOM_CORRECT
|| bom
== ACE_UNICODE_BOM_SWAPPED
)
// A swapped mark means the data is in the opposite byte order to the one
// assumed so far.
370 if (bom
== ACE_UNICODE_BOM_SWAPPED
)
372 byte_swap
= !byte_swap
;
// Step past the mark so decoding starts at the first real code unit.
374 buf
+= ACE_UTF16_CODEPOINT_SIZE
;
// Decode one character per output slot; load_wchar() receives bpos by
// reference.
379 for (size_t xpos
= 0; xpos
< length
; ++xpos
)
381 x
[xpos
] = load_wchar (buf
, bpos
, length
, byte_swap
);
// Read @a length wide characters from @a cdr into x.
//
// For major version 1 with minor version > 1 each element is read
// individually with read_wchar(); otherwise the whole array is delegated to
// read_wchar_array_i().
//
// NOTE(review): the return type, the declaration of the destination
// parameter x, the braces and the return statements of the loop branch are
// absent from this listing.
391 WUCS4_UTF16::read_wchar_array (ACE_InputCDR
& cdr
,
393 ACE_CDR::ULong length
)
398 if (static_cast<ACE_CDR::Short
> (this->major_version(cdr
)) == 1
399 && static_cast<ACE_CDR::Short
> (this->minor_version(cdr
)) > 1)
// Element-by-element read; the action taken when read_wchar() fails is not
// visible here.
401 for (size_t i
= 0; i
< length
; i
++)
402 if (!this->read_wchar(cdr
,x
[i
]))
407 return this->read_wchar_array_i(cdr
,x
,length
);
// Write one wide character x to @a cdr.
//
// The minor version is tested for 0 (wchar not allowed with GIOP 1.0) and for
// 1; the visible return path delegates to write_wchar_i() with a third
// argument of 1 and encode_len as the fourth.
//
// NOTE(review): the return type, the declaration of x, the computation of
// encode_len, the bodies of both version branches and the braces are absent
// from this listing.
411 WUCS4_UTF16::write_wchar (ACE_OutputCDR
&cdr
,
415 if (static_cast<ACE_CDR::Short
> (this->minor_version(cdr
)) == 0)
416 { // wchar is not allowed with GIOP 1.0
420 else if (static_cast<ACE_CDR::Short
> (this->minor_version(cdr
)) == 1)
423 return write_wchar_i(cdr
,x
,1,encode_len
);
// Write one wide character x to @a cdr as one or two 2-byte values.
//
// Visible behaviour: a value in [ACE_UTF16_RAW_END, ACE_UTF16_END) cannot be
// sent as a single code unit and is rejected per the comment below (the
// rejection statement itself is not visible).  A two-element buffer is
// filled either with a byte order mark followed by the character, or with
// the character alone.  It is then written either as a one-byte size followed
// by the raw array, or as individual 2-byte writes.
//
// NOTE(review): the return type, the remaining parameters (x and presumably a
// mark flag and an encode-length flag), the declaration of len, the
// conditions selecting between the paths, the braces and several return
// statements are absent from this listing.
427 WUCS4_UTF16::write_wchar_i (ACE_OutputCDR
&cdr
,
432 // If the desired char cannot be translated into a single unicode char,
433 // we must raise a marshal exception.
435 // see the comment in encode_utf16() regarding the cast.
436 unsigned long ul_x
= static_cast<unsigned long>(x
);
437 if (ul_x
>= ACE_UTF16_RAW_END
&&
438 ul_x
< ACE_UTF16_END
)
442 ACE_CDR::UShort buffer
[2];
// Mark first, then the character truncated to 16 bits.
446 buffer
[0] = ACE_UNICODE_BOM_CORRECT
;
447 buffer
[1] = static_cast<ACE_CDR::Short
> (x
);
// When cdr.byte_order() is non-zero, two bytes taken from the address of x
// are swapped into buffer[0]; the matching else is not visible here.
452 if (cdr
.byte_order())
453 ACE_CDR::swap_2 (reinterpret_cast<const char *> (&x
),
454 reinterpret_cast<char *> (buffer
));
456 buffer
[0] = static_cast<ACE_CDR::Short
> (x
);
// Size prefix in bytes: number of code units times the code unit size,
// narrowed to one byte.
461 unsigned char tcsize
= static_cast<unsigned char> (len
* ACE_UTF16_CODEPOINT_SIZE
);
462 if (this->write_1 (cdr
, &tcsize
))
463 return this->write_array(cdr
, &buffer
, tcsize
, 1, 1);
// Path without a size prefix: write the two buffer elements one after the
// other.
467 if (this->write_2 (cdr
, buffer
) == 0)
470 return this->write_2 (cdr
,buffer
+1);
// Write the wide string @a x to @a cdr.
//
// For major version 1 with minor version > 1: the length is increased by one
// for the byte order mark, the number of characters needing surrogate pairs
// is added, the resulting byte count is written with write_4(), the mark is
// written with write_2(), and the characters are written with
// write_measured_wchar_array().
// Otherwise: len + 1 is written with write_4() and len + 1 characters are
// written with write_wchar_array().
//
// NOTE(review): the return type, the declaration of the length parameter
// (used below as `len`), the declaration of `s`, the braces/else keywords and
// some return statements are absent from this listing.
// NOTE(review): in the first branch count_potential_surrogates (x, len) reads
// x before the `x != 0` test below, and it is called after len was
// incremented -- confirm this is intended.
475 WUCS4_UTF16::write_wstring (ACE_OutputCDR
& cdr
,
477 const ACE_CDR::WChar
*x
)
479 if (static_cast<ACE_CDR::Short
> (this->major_version(cdr
)) == 1
480 && static_cast<ACE_CDR::Short
> (this->minor_version(cdr
)) > 1)
482 // count characters that will require surrogates to
483 // determine transmission length
484 len
++; // make room for BOM
485 ACE_UTF16_T bom
= ACE_UNICODE_BOM_CORRECT
;
486 ACE_CDR::ULong length
= len
+ count_potential_surrogates (x
, len
);
// Length on the wire is expressed in bytes.
487 ACE_CDR::ULong l
= length
* ACE_UTF16_CODEPOINT_SIZE
;
489 if (this->write_4 (cdr
, &l
) && x
!= 0)
// The result of writing the mark is not checked in the visible code.
491 this->write_2 (cdr
, &bom
);
492 return this->write_measured_wchar_array (cdr
, x
, len
, length
);
// Other versions: the length written is len + 1, in characters.
497 ACE_CDR::ULong l
= len
+ 1;
499 if (this->write_4 (cdr
, &l
))
503 return this->write_wchar_array (cdr
, x
, len
+ 1);
508 return this->write_2 (cdr
, &s
);
// Write @a length wide characters from @a x to @a cdr.
//
// Two implementations are visible.  The first forwards to
// write_measured_wchar_array() with a transmission length of
// length + count_potential_surrogates (x, length); its own comment doubts
// that this is correct.  The second tests the minor version for 0 and for 1
// and writes the characters one at a time with write_wchar_i(), passing 0 as
// the third argument.
//
// NOTE(review): the return type, the braces, any preprocessor lines that
// choose between the two implementations, the computation of encode_len and
// the return statements of the second implementation are absent from this
// listing.
517 WUCS4_UTF16::write_wchar_array (ACE_OutputCDR
& cdr
,
518 const ACE_CDR::WChar
*x
,
519 ACE_CDR::ULong length
)
522 // I do not believe this is correct, because this could yield an array
523 // with an incorrect number of elements for the space allotted.
524 return this->write_measured_wchar_array (
528 length
+ count_potential_surrogates (x
, length
));
532 if (static_cast<ACE_CDR::Short
> (this->minor_version(cdr
)) == 0)
533 { // wchar is not allowed with GIOP 1.0
537 else if (static_cast<ACE_CDR::Short
> (this->minor_version(cdr
)) == 1)
// Per-character write; the action taken when write_wchar_i() returns 0 is
// not visible here.
540 for (size_t i
= 0; i
< length
; i
++)
541 if (this->write_wchar_i (cdr
,x
[i
],0,encode_len
) == 0)
// Encode @a length wide characters from @a x as UTF-16 directly into the
// output stream's buffer.
//
// @param cdr                  output stream; cdr.adjust() is asked for
//                             ACE_UTF16_CODEPOINT_SIZE * transmission_length
//                             bytes at SHORT alignment.
// @param x                    characters to encode; elements 0 .. length-1
//                             are read.
// @param length               number of source characters.
// @param transmission_length  number of 2-byte code units requested from the
//                             stream.
//
// NOTE(review): the return type, the declarations of buf and sbpos, the
// braces, the rest of the byte-swap loop body and the return statements are
// absent from this listing; the definition may also continue past the last
// visible line.
547 WUCS4_UTF16::write_measured_wchar_array (ACE_OutputCDR
& cdr
,
548 const ACE_CDR::WChar
*x
,
549 ACE_CDR::ULong length
,
550 ACE_CDR::ULong transmission_length
)
555 size_t align
= ACE_CDR::SHORT_ALIGN
;
556 if (cdr
.adjust (ACE_UTF16_CODEPOINT_SIZE
* transmission_length
, align
, buf
)
// View the reserved bytes as an array of code units.
562 ACE_UTF16_T
*sb
= reinterpret_cast<ACE_UTF16_T
*> (buf
);
// sbpos advances by whatever encode_utf16() returns for each character.
565 for (size_t i
= 0; i
< length
; i
++)
567 sbpos
+= encode_utf16 (& sb
[sbpos
], x
[i
]);
569 #if defined (ACE_ENABLE_SWAP_ON_WRITE)
570 // @note this will rarely be enabled.
571 if (cdr
.do_byte_swap())
573 // note can't use swap_2_array because in-place swaps are not safe :-<
574 // and we don't want to allocate a new array
575 for (size_t i
= 0; i
< sbpos
; i
++)
577 char * pchar
= reinterpret_cast<char *> (&sb
[i
]);
578 // ACE_CDR::swap_2 (pchar, pchar);
579 // can't use swap_2 because inplace swaps are not safe
580 // and work-arounds like copying to another buffer lose
581 // any performance improvement from
582 // that fancy asm code, so we might as well just:
583 char temp
= pchar
[0];
586 //@@TODO write swap_2(char * inplace_buffer);
589 #endif /* ACE_ENABLE_SWAP_ON_WRITE */