1 // ======================================================================
3 // The actual conversion methods are covered by the copyright information
4 // below. It is not the actual code provided by Unicode, Inc. but is an
5 // ACE-ified and only slightly modified version.
6 // Chad Elliott 4/28/2005
8 // Copyright 2001-2004 Unicode, Inc.
10 // Limitations on Rights to Redistribute This Code
12 // Unicode, Inc. hereby grants the right to freely use the information
13 // supplied in this file in the creation of products supporting the
14 // Unicode Standard, and to make copies of this file in any form
15 // for internal or external distribution as long as this notice
18 // ======================================================================
20 #include "ace/UTF16_Encoding_Converter.h"
22 #if defined (ACE_USES_WCHAR)
23 #include "ace/OS_NS_stdio.h"
24 #include "ace/OS_Memory.h"
25 #include "ace/Min_Max.h"
27 #if !defined (__ACE_INLINE__)
28 #include "ace/UTF16_Encoding_Converter.inl"
29 #endif /* __ACE_INLINE__ */
31 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
// File-local constants shared by the UTF-16 <-> UTF-8 conversion routines
// in this file.
//
// halfShift / halfBase / halfMask: used when joining or splitting a
// surrogate pair.  to_utf8 computes
//   ((high - UNI_SUR_HIGH_START) << halfShift)
//     + (low - UNI_SUR_LOW_START) + halfBase
// and from_utf8 emits (ch >> halfShift) + UNI_SUR_HIGH_START followed by
// (ch & halfMask) + UNI_SUR_LOW_START.
33 static constexpr ACE_UINT32 halfShift
= 10;
34 static constexpr ACE_UINT32 halfBase
= 0x00010000;
35 static constexpr ACE_UINT32 halfMask
= 0x000003FF;
// Inclusive bounds of the high (leading) and low (trailing) surrogate
// ranges that the converters test 16-bit units and code points against.
37 static constexpr ACE_UINT32 UNI_SUR_HIGH_START
= 0x0000D800;
38 static constexpr ACE_UINT32 UNI_SUR_HIGH_END
= 0x0000DBFF;
39 static constexpr ACE_UINT32 UNI_SUR_LOW_START
= 0x0000DC00;
40 static constexpr ACE_UINT32 UNI_SUR_LOW_END
= 0x0000DFFF;
// Value substituted for code points that cannot be represented
// (assigned to ch in to_utf8, written to the target in from_utf8).
41 static constexpr ACE_UINT32 UNI_REPLACEMENT_CHAR
= 0x0000FFFD;
// Largest code point that fits in a single UTF-16 unit.
42 static constexpr ACE_UINT32 UNI_MAX_BMP
= 0x0000FFFF;
// Largest code point from_utf8 accepts; larger values take its
// "ch > UNI_MAX_UTF16" branch.
43 static constexpr ACE_UINT32 UNI_MAX_UTF16
= 0x0010FFFF;
45 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
46 // into the first byte, depending on how many bytes follow. There are
47 // as many entries in this table as there are UTF-8 sequence types.
48 // (I.e., one byte sequence, two byte... etc.). Remember that sequences
49 // for *legal* UTF-8 will be 4 or fewer bytes total.
// The table is indexed by the total number of bytes in the sequence:
// to_utf8 ORs firstByteMark[bytesToWrite] into the lead byte.
50 static const ACE_Byte firstByteMark
[7] = { 0x00, 0x00, 0xC0,
51 0xE0, 0xF0, 0xF8, 0xFC };
53 // Index into the table below with the first byte of a UTF-8 sequence to
54 // get the number of trailing bytes that are supposed to follow it.
55 // Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
56 // left as-is for anyone who may want to do such conversion, which was
57 // allowed in earlier algorithms.
// Layout of the 256 entries: 0x00-0xBF -> 0, 0xC0-0xDF -> 1,
// 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
58 static const ACE_Byte trailingBytesForUTF8
[256] = {
59 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
60 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
61 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
62 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
63 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
64 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
65 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
66 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
69 // Magic values subtracted from a buffer value during UTF8 conversion.
70 // This table contains as many values as there might be trailing bytes
71 // in a UTF-8 sequence.
// The table is indexed by the number of trailing bytes: from_utf8
// executes ch -= offsetsFromUTF8[extraBytesToRead].
72 static const ACE_UINT32 offsetsFromUTF8
[6] = { 0x00000000, 0x00003080,
73 0x000E2080, 0x03C82080,
74 0xFA082080, 0x82082080 };
// Constructor taking a single bool, `swap`.
// NOTE(review): the initialiser list and body are not visible here.
// Presumably `swap` is stored in swap_, which to_utf8 consults before
// decoding each 16-bit unit -- confirm against the full source.
77 ACE_UTF16_Encoding_Converter::ACE_UTF16_Encoding_Converter (bool swap
)
82 ACE_UTF16_Encoding_Converter::~ACE_UTF16_Encoding_Converter ()
// Convert the UTF-16 code units in `source` to UTF-8 bytes stored in
// `target`.
//
// source_size is a byte count (it is divided by sizeof (ACE_UINT16) to
// find the end of the input).  target_size is added to an ACE_Byte
// pointer, so it is a byte count as well.
//
// Result values assigned in the visible code:
//   CONVERSION_OK    - initial value of the result
//   SOURCE_ILLEGAL   - a high surrogate not followed by a low surrogate,
//                      or a low-surrogate value in ch
//   SOURCE_EXHAUSTED - the input ends right after a high surrogate
//   TARGET_EXHAUSTED - the encoded bytes would run past targetEnd
//
// NOTE(review): parts of this function (remaining parameters, braces,
// else/break statements and any guarding conditions such as a strictness
// flag) are not visible in this excerpt; confirm details against the
// full source.
86 ACE_UTF16_Encoding_Converter::Result
87 ACE_UTF16_Encoding_Converter::to_utf8 (const void* source
,
// (ch | byteMark) & byteMask sets bit 7 and clears bit 6, i.e. it yields a
// byte of the form 10xxxxxx.
93 static const ACE_UINT32 byteMask
= 0xBF;
94 static const ACE_UINT32 byteMark
= 0x80;
95 Result result
= CONVERSION_OK
;
97 ACE_Byte
* targetEnd
= target
+ target_size
;
98 const ACE_UINT16
* sourceStart
= static_cast<const ACE_UINT16
*> (source
);
99 const ACE_UINT16
* sourceEnd
= sourceStart
+
100 (source_size
/ sizeof (ACE_UINT16
));
102 while (sourceStart
< sourceEnd
)
// Fetch the next unit, byte-swapping it when swap_ is set.
104 ACE_UINT16 nw
= *sourceStart
++;
105 ACE_UINT32 ch
= (this->swap_
? ACE_SWAP_WORD (nw
) : nw
);
107 // If we have a surrogate pair, convert to ACE_UINT32 first.
108 if (ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_HIGH_END
)
110 // If the 16 bits following the high surrogate are in the
111 // sourceStart buffer...
112 if (sourceStart
< sourceEnd
)
114 ACE_UINT32 ch2
= (this->swap_
? ACE_SWAP_WORD (*sourceStart
) :
116 // If it's a low surrogate, convert to ACE_UINT32.
117 if (ch2
>= UNI_SUR_LOW_START
&& ch2
<= UNI_SUR_LOW_END
)
119 ch
= ((ch
- UNI_SUR_HIGH_START
) << halfShift
)
120 + (ch2
- UNI_SUR_LOW_START
) + halfBase
;
125 // it's an unpaired high surrogate
126 result
= SOURCE_ILLEGAL
;
132 // We don't have the 16 bits following the high surrogate.
133 result
= SOURCE_EXHAUSTED
;
139 // UTF-16 surrogate values are illegal in UTF-32
140 if (ch
>= UNI_SUR_LOW_START
&& ch
<= UNI_SUR_LOW_END
)
142 result
= SOURCE_ILLEGAL
;
147 // Figure out how many bytes the result will require
148 unsigned short bytesToWrite
= 0;
153 else if (ch
< 0x10000)
155 else if (ch
< 0x110000)
// Anything that falls through the range tests is replaced.
160 ch
= UNI_REPLACEMENT_CHAR
;
// Move target to the end of the slot first; the bytes are then stored
// back-to-front with *--target.
163 target
+= bytesToWrite
;
164 if (target
> targetEnd
)
166 result
= TARGET_EXHAUSTED
;
170 // NOTE: Everything falls through for efficiency purposes.
171 switch (bytesToWrite
)
174 *--target
= (ACE_Byte
)((ch
| byteMark
) & byteMask
);
177 *--target
= (ACE_Byte
)((ch
| byteMark
) & byteMask
);
180 *--target
= (ACE_Byte
)((ch
| byteMark
) & byteMask
);
// Lead byte: remaining bits of ch OR-ed with the length marker.
183 *--target
= (ACE_Byte
)(ch
| firstByteMark
[bytesToWrite
]);
// Step past the bytes just written.
185 target
+= bytesToWrite
;
// Convert the UTF-8 bytes in `source` to UTF-16 code units stored in
// `target`.
//
// source_size is a byte count (sourceEnd = source + source_size).
// NOTE(review): target_size is added to an ACE_UINT16 pointer, so as
// written it is interpreted as a number of 16-bit units, not bytes --
// confirm that callers pass it in those units.
//
// Result values assigned in the visible code:
//   CONVERSION_OK    - initial value of the result
//   SOURCE_EXHAUSTED - the sequence announced by the lead byte extends
//                      past sourceEnd
//   SOURCE_ILLEGAL   - is_legal_utf8 rejects the sequence, the decoded
//                      value is a surrogate, or it exceeds UNI_MAX_UTF16
//   TARGET_EXHAUSTED - no room left for one unit (or for the two units of
//                      a surrogate pair)
//
// NOTE(review): parts of this function (remaining parameters, the
// declaration of ch, case bodies, braces, else/break statements and the
// conditions choosing between SOURCE_ILLEGAL and the replacement
// character) are not visible in this excerpt.
191 ACE_UTF16_Encoding_Converter::Result
192 ACE_UTF16_Encoding_Converter::from_utf8 (const ACE_Byte
* source
,
198 Result result
= CONVERSION_OK
;
199 const ACE_Byte
* sourceEnd
= source
+ source_size
;
200 ACE_UINT16
* targetStart
= static_cast<ACE_UINT16
*> (target
);
201 ACE_UINT16
* targetEnd
= targetStart
+ target_size
;
203 while (source
< sourceEnd
)
// The lead byte determines how many trailing bytes must follow.
206 unsigned short extraBytesToRead
= trailingBytesForUTF8
[*source
];
// source + extraBytesToRead addresses the last byte of the sequence; it
// must lie before sourceEnd.
207 if (source
+ extraBytesToRead
>= sourceEnd
)
209 result
= SOURCE_EXHAUSTED
;
213 // Do this check whether lenient or strict
214 if (!this->is_legal_utf8 (source
, extraBytesToRead
+ 1))
216 result
= SOURCE_ILLEGAL
;
220 // The cases all fall through. See "Note A" below.
221 switch (extraBytesToRead
)
223 case 5: // remember, illegal UTF-8
226 case 4: // remember, illegal UTF-8
241 ch
-= offsetsFromUTF8
[extraBytesToRead
];
243 if (targetStart
>= targetEnd
)
245 result
= TARGET_EXHAUSTED
;
249 if (ch
<= UNI_MAX_BMP
) // Target is a character <= 0xFFFF
251 // UTF-16 surrogate values are illegal in UTF-32
252 if (ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_LOW_END
)
256 result
= SOURCE_ILLEGAL
;
261 *targetStart
++ = UNI_REPLACEMENT_CHAR
;
266 *targetStart
++ = (ACE_UINT16
)ch
;
269 else if (ch
> UNI_MAX_UTF16
)
273 result
= SOURCE_ILLEGAL
;
278 *targetStart
++ = UNI_REPLACEMENT_CHAR
;
283 // ch is a character above 0xFFFF (up to 0x10FFFF); it needs two UTF-16 units.
284 if (targetStart
+ 1 >= targetEnd
)
286 result
= TARGET_EXHAUSTED
;
// Emit the high surrogate, then the low surrogate.
290 *targetStart
++ = (ACE_UINT16
)((ch
>> halfShift
) + UNI_SUR_HIGH_START
);
291 *targetStart
++ = (ACE_UINT16
)((ch
& halfMask
) + UNI_SUR_LOW_START
);
// Probe whether `source` can be handled by this converter.
//
// The visible code allocates a non-swapping converter
// (ACE_UTF16_Encoding_Converter (false)) with ACE_NEW_RETURN and runs
// to_utf8 over at most the first `begin` (16) bytes of `source`
// (ACE_MIN (begin, source_size)), writing into a local scratch buffer of
// `converted` (begin * 4 = 64) bytes, then tests for CONVERSION_OK.
//
// NOTE(review): the remaining parameters, the value returned on
// allocation failure, and what happens after the CONVERSION_OK test are
// not visible in this excerpt.
298 ACE_UTF16_Encoding_Converter
*
299 ACE_UTF16_Encoding_Converter::encoded (const ACE_Byte
* source
,
302 static const size_t begin
= 16;
303 static const size_t converted
= begin
* 4;
305 ACE_Byte target
[converted
];
306 ACE_UTF16_Encoding_Converter
* converter
= 0;
307 ACE_NEW_RETURN (converter
,
308 ACE_UTF16_Encoding_Converter (false),
310 if (converter
->to_utf8 (source
,
311 ACE_MIN (begin
, source_size
),
313 converted
) == CONVERSION_OK
)
// Accessor: returns the file-local UNI_SUR_HIGH_START constant (0xD800).
326 ACE_UTF16_Encoding_Converter::get_UNI_SUR_HIGH_START ()
328 return UNI_SUR_HIGH_START
;
// Accessor: returns the file-local UNI_SUR_LOW_END constant (0xDFFF).
332 ACE_UTF16_Encoding_Converter::get_UNI_SUR_LOW_END ()
334 return UNI_SUR_LOW_END
;
// Accessor: returns the file-local UNI_REPLACEMENT_CHAR constant (0xFFFD).
338 ACE_UTF16_Encoding_Converter::get_UNI_REPLACEMENT_CHAR ()
340 return UNI_REPLACEMENT_CHAR
;
// Accessor: returns the file-local firstByteMark table (7 entries).
344 ACE_UTF16_Encoding_Converter::get_first_byte_mark ()
346 return firstByteMark
;
// Accessor: returns the file-local trailingBytesForUTF8 table (256 entries,
// indexed by the first byte of a UTF-8 sequence).
350 ACE_UTF16_Encoding_Converter::get_trailing_bytes_for_utf8 ()
352 return trailingBytesForUTF8
;
// Accessor: returns the file-local offsetsFromUTF8 table (6 entries).
356 ACE_UTF16_Encoding_Converter::get_offsets_from_utf8 ()
358 return offsetsFromUTF8
;
361 ACE_END_VERSIONED_NAMESPACE_DECL
362 #endif /* ACE_USES_WCHAR */