1 // ======================================================================
3 // The actual conversion methods are covered by the copyright information
4 // below. It is not the actual code provided by Unicode, Inc. but is an
5 // ACE-ified and only slightly modified version.
7 // Chad Elliott 4/28/2005
9 // Copyright 2001-2004 Unicode, Inc.
11 // Limitations on Rights to Redistribute This Code
13 // Unicode, Inc. hereby grants the right to freely use the information
14 // supplied in this file in the creation of products supporting the
15 // Unicode Standard, and to make copies of this file in any form
16 // for internal or external distribution as long as this notice
19 // ======================================================================
21 #include "ace/UTF32_Encoding_Converter.h"
23 #if defined (ACE_USES_WCHAR)
24 #include "ace/OS_NS_stdio.h"
25 #include "ace/OS_Memory.h"
26 #include "ace/Min_Max.h"
28 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
30 static const ACE_UINT32 UNI_MAX_LEGAL_UTF32
= 0x0010FFFF;
// Constructor: takes a single bool `swap` and passes it unchanged to the
// ACE_UTF16_Encoding_Converter base-class initializer.
// NOTE(review): the constructor body is not visible in this excerpt.
32 ACE_UTF32_Encoding_Converter::ACE_UTF32_Encoding_Converter (bool swap
)
33 : ACE_UTF16_Encoding_Converter (swap
)
// Destructor.
// NOTE(review): only the declarator is visible in this excerpt; the body is
// not shown.
37 ACE_UTF32_Encoding_Converter::~ACE_UTF32_Encoding_Converter ()
// Converts the buffer at `source`, read as a sequence of ACE_UINT32 values,
// into bytes stored through `target`, and reports the outcome as a Result.
//
// Result values assigned in the visible code:
//   CONVERSION_OK     - initial value.
//   SOURCE_ILLEGAL    - value inside [UNI_SUR_HIGH_START, UNI_SUR_LOW_END],
//                       and (apparently) values above UNI_MAX_LEGAL_UTF32.
//   TARGET_EXHAUSTED  - the write position moved past targetEnd.
//
// NOTE(review): only the first parameter is visible in this excerpt.
// source_size, target and target_size are used below but their declarations,
// several branch bodies, the switch case labels and the return statement are
// not shown - confirm against the full file.
41 ACE_UTF32_Encoding_Converter::Result
42 ACE_UTF32_Encoding_Converter::to_utf8 (const void* source
,
// byteMark (0x80) is OR-ed in and byteMask (0xBF) is AND-ed in for every
// non-leading output byte, which forces that byte's top two bits to 1 and 0.
48 static const ACE_UINT32 byteMask
= 0xBF;
49 static const ACE_UINT32 byteMark
= 0x80;
// Surrogate-range bounds and the leading-byte mark table are obtained once
// from accessor functions and cached in function-local statics.
50 static const ACE_UINT32 UNI_SUR_HIGH_START
= get_UNI_SUR_HIGH_START ();
51 static const ACE_UINT32 UNI_SUR_LOW_END
= get_UNI_SUR_LOW_END ();
52 static const ACE_Byte
* firstByteMark
= get_first_byte_mark ();
54 Result result
= CONVERSION_OK
;
// One past the last writable byte of the output buffer.
55 ACE_Byte
* targetEnd
= target
+ target_size
;
// View the input as 32-bit units. source_size is divided by
// sizeof (ACE_UINT32), so it is a byte count and a trailing partial unit is
// never read.
56 const ACE_UINT32
* sourceStart
= static_cast<const ACE_UINT32
*> (source
);
57 const ACE_UINT32
* sourceEnd
= sourceStart
+ (source_size
/ sizeof (ACE_UINT32
));
59 while (sourceStart
< sourceEnd
)
// Fetch the next unit and byte-swap it with ACE_SWAP_LONG when swap_ is set.
61 ACE_UINT32 nw
= *sourceStart
++;
62 ACE_UINT32 ch
= (this->swap_
? ACE_SWAP_LONG (nw
) : nw
);
63 unsigned short bytesToWrite
= 0;
67 // UTF-16 surrogate values are illegal in UTF-32
68 if (ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_LOW_END
)
70 result
= SOURCE_ILLEGAL
;
75 // Figure out how many bytes the result will require. Turn any
76 // illegally large ACE_UINT32 things (> Plane 17) into replacement
// NOTE(review): the assignments to bytesToWrite for each range are not
// visible in this excerpt.
86 else if (ch
< 0x10000)
90 else if (ch
<= UNI_MAX_LEGAL_UTF32
)
96 result
= SOURCE_ILLEGAL
;
// Advance target by the number of bytes this value needs, then check for
// overrun; the bytes themselves are stored backwards with *--target below.
100 target
+= bytesToWrite
;
101 if (target
> targetEnd
)
103 result
= TARGET_EXHAUSTED
;
107 // NOTE: everything falls through.
108 switch (bytesToWrite
)
// Non-leading bytes, written from the end of the sequence towards its start.
111 *--target
= (ACE_Byte
)((ch
| byteMark
) & byteMask
);
114 *--target
= (ACE_Byte
)((ch
| byteMark
) & byteMask
);
117 *--target
= (ACE_Byte
)((ch
| byteMark
) & byteMask
);
// Leading byte: ch combined with the firstByteMark entry for this length.
120 *--target
= (ACE_Byte
) (ch
| firstByteMark
[bytesToWrite
]);
// Re-advance target by bytesToWrite after the backwards writes above.
122 target
+= bytesToWrite
;
// Decodes the bytes starting at `source` into ACE_UINT32 values stored
// through targetStart (the `target` argument cast to ACE_UINT32*), and
// reports the outcome as a Result.
//
// Result values assigned in the visible code:
//   CONVERSION_OK     - initial value.
//   SOURCE_EXHAUSTED  - source + extraBytesToRead reaches or passes sourceEnd.
//   SOURCE_ILLEGAL    - is_legal_utf8 rejects the sequence, or a range check
//                       on the decoded value fails.
//   TARGET_EXHAUSTED  - targetStart has reached targetEnd.
//
// NOTE(review): only the first parameter is visible in this excerpt.
// source_size, target and target_size are used below but their declarations,
// the declaration of ch, the switch body, several branch bodies and the
// return statement are not shown - confirm against the full file.
128 ACE_UTF32_Encoding_Converter::Result
129 ACE_UTF32_Encoding_Converter::from_utf8 (const ACE_Byte
* source
,
// Range bounds, replacement character and lookup tables are obtained once
// from accessor functions and cached in function-local statics.
135 static const ACE_UINT32 UNI_SUR_HIGH_START
= get_UNI_SUR_HIGH_START ();
136 static const ACE_UINT32 UNI_SUR_LOW_END
= get_UNI_SUR_LOW_END ();
137 static const ACE_UINT32 UNI_REPLACEMENT_CHAR
= get_UNI_REPLACEMENT_CHAR ();
138 static const ACE_Byte
* trailingBytesForUTF8
= get_trailing_bytes_for_utf8 ();
139 static const ACE_UINT32
* offsetsFromUTF8
= get_offsets_from_utf8 ();
141 Result result
= CONVERSION_OK
;
// One past the last input byte.
142 const ACE_Byte
* sourceEnd
= source
+ source_size
;
// The output is addressed as ACE_UINT32 elements.
// NOTE(review): targetEnd is targetStart + target_size on an ACE_UINT32*, so
// target_size is treated here as an element count, not a byte count - confirm
// that callers pass it that way.
143 ACE_UINT32
* targetStart
= static_cast<ACE_UINT32
*> (target
);
144 ACE_UINT32
* targetEnd
= targetStart
+ target_size
;
146 while (source
< sourceEnd
)
// Number of bytes that follow the current byte, looked up by its value.
149 unsigned short extraBytesToRead
= trailingBytesForUTF8
[*source
];
// Stop if the input does not extend beyond the end of this sequence.
150 if (source
+ extraBytesToRead
>= sourceEnd
)
152 result
= SOURCE_EXHAUSTED
;
156 // Do this check whether lenient or strict
157 if (!this->is_legal_utf8 (source
, extraBytesToRead
+ 1))
159 result
= SOURCE_ILLEGAL
;
163 // The cases all fall through. See "Note A" below.
164 switch (extraBytesToRead
)
// Subtract the table offset for this sequence length from ch.
184 ch
-= offsetsFromUTF8
[extraBytesToRead
];
// No room left for another output element.
186 if (targetStart
>= targetEnd
)
188 result
= TARGET_EXHAUSTED
;
// Range checks on the decoded value: values up to UNI_MAX_LEGAL_UTF32 are
// additionally tested against the surrogate range.
192 if (ch
<= UNI_MAX_LEGAL_UTF32
)
194 // UTF-16 surrogate values are illegal in UTF-32, and anything
195 // over Plane 17 (> 0x10FFFF) is illegal.
196 if (ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_LOW_END
)
200 result
= SOURCE_ILLEGAL
;
// Stores the replacement character as the next output element.
// NOTE(review): the conditions guarding this store and the SOURCE_ILLEGAL
// assignment that follows are not visible in this excerpt.
205 *targetStart
++ = UNI_REPLACEMENT_CHAR
;
215 result
= SOURCE_ILLEGAL
;
// Probes the start of `source`: allocates an ACE_UTF32_Encoding_Converter
// constructed with `false`, runs its to_utf8 on the buffer with a size of
// ACE_MIN (begin, source_size), and tests whether the result is
// CONVERSION_OK. The declared return type is ACE_UTF32_Encoding_Converter*.
//
// NOTE(review): the parameters after `source` (source_size is used below),
// the value returned when ACE_NEW_RETURN fails, the remaining to_utf8
// arguments and everything after the CONVERSION_OK test are not visible in
// this excerpt - confirm against the full file.
223 ACE_UTF32_Encoding_Converter
*
224 ACE_UTF32_Encoding_Converter::encoded (const ACE_Byte
* source
,
// begin is the size limit applied to the probe (16); the scratch buffer is
// four times that (64 bytes).
227 static const size_t begin
= 16;
228 static const size_t converted
= begin
* 4;
// Local scratch buffer of `converted` bytes.
230 ACE_Byte target
[converted
];
231 ACE_UTF32_Encoding_Converter
* converter
= 0;
232 ACE_NEW_RETURN (converter
,
233 ACE_UTF32_Encoding_Converter (false),
// Trial conversion limited to the smaller of begin and source_size.
236 if (converter
->to_utf8 (source
,
237 ACE_MIN (begin
, source_size
),
239 converted
) == CONVERSION_OK
)
251 ACE_END_VERSIONED_NAMESPACE_DECL
252 #endif /* ACE_USES_WCHAR */