1 // $Id: UTF32_Encoding_Converter.cpp 80826 2008-03-04 14:51:23Z wotte $
3 // ======================================================================
5 // The actual conversion methods are covered by the copyright information
6 // below. It is not the actual code provided by Unicode, Inc. but is an
7 // ACE-ified and only slightly modified version.
9 // Chad Elliott 4/28/2005
11 // Copyright 2001-2004 Unicode, Inc.
13 // Limitations on Rights to Redistribute This Code
15 // Unicode, Inc. hereby grants the right to freely use the information
16 // supplied in this file in the creation of products supporting the
17 // Unicode Standard, and to make copies of this file in any form
18 // for internal or external distribution as long as this notice remains attached.
21 // ======================================================================
23 #include "ace/UTF32_Encoding_Converter.h"
25 #if defined (ACE_USES_WCHAR)
26 #include "ace/OS_NS_stdio.h"
27 #include "ace/OS_Memory.h"
28 #include "ace/Min_Max.h"
30 ACE_BEGIN_VERSIONED_NAMESPACE_DECL
// Largest value accepted as a UTF-32 code point by the converters in this
// file (0x0010FFFF); both to_utf8 and from_utf8 compare `ch` against it
// with `<=`.
32 static const ACE_UINT32 UNI_MAX_LEGAL_UTF32
= 0x0010FFFF;
// Constructor. The `swap` flag is handed unchanged to
// ACE_UTF16_Encoding_Converter in the initializer list; to_utf8 below
// reads it back through `this->swap_` to decide whether each 32-bit
// source word is passed through ACE_SWAP_LONG.
34 ACE_UTF32_Encoding_Converter::ACE_UTF32_Encoding_Converter (bool swap
)
35 : ACE_UTF16_Encoding_Converter (swap
)
// Destructor.
39 ACE_UTF32_Encoding_Converter::~ACE_UTF32_Encoding_Converter (void)
// Convert a buffer of UTF-32 code units into UTF-8 bytes.
//
// Inputs as used by the code below:
//   source      - reinterpreted as an array of const ACE_UINT32.
//   source_size - size of `source` in bytes (it is divided by
//                 sizeof (ACE_UINT32) to obtain the element count).
//   target      - ACE_Byte output cursor; advanced as bytes are written.
//   target_size - added to `target` to form the end-of-output bound.
//
// Returns a Result; the values assigned here are CONVERSION_OK (initial
// value), SOURCE_ILLEGAL and TARGET_EXHAUSTED.
43 ACE_UTF32_Encoding_Converter::Result
44 ACE_UTF32_Encoding_Converter::to_utf8 (const void* source
,
// (ch | byteMark) & byteMask forces the top two bits of a byte to "10",
// i.e. the UTF-8 continuation-byte pattern.
50 static const ACE_UINT32 byteMask
= 0xBF;
51 static const ACE_UINT32 byteMark
= 0x80;
// Surrogate range bounds and the lead-byte marker table are obtained
// from helper accessors (not defined in this file).
52 static const ACE_UINT32 UNI_SUR_HIGH_START
= get_UNI_SUR_HIGH_START ();
53 static const ACE_UINT32 UNI_SUR_LOW_END
= get_UNI_SUR_LOW_END ();
54 static const ACE_Byte
* firstByteMark
= get_first_byte_mark ();
56 Result result
= CONVERSION_OK
;
57 ACE_Byte
* targetEnd
= target
+ target_size
;
58 const ACE_UINT32
* sourceStart
= static_cast<const ACE_UINT32
*> (source
);
// source_size is a byte count; any trailing bytes that do not make up a
// whole ACE_UINT32 are dropped by the integer division.
59 const ACE_UINT32
* sourceEnd
= sourceStart
+ (source_size
/ sizeof (ACE_UINT32
));
// Process one 32-bit code unit per iteration.
61 while (sourceStart
< sourceEnd
)
63 ACE_UINT32 nw
= *sourceStart
++;
// Byte-swap the raw word when the converter was constructed with swap set.
64 ACE_UINT32 ch
= (this->swap_
? ACE_SWAP_LONG (nw
) : nw
);
65 unsigned short bytesToWrite
= 0;
69 // UTF-16 surrogate values are illegal in UTF-32
70 if (ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_LOW_END
)
72 result
= SOURCE_ILLEGAL
;
77 // Figure out how many bytes the result will require. Turn any
78 // illegally large ACE_UINT32 things (> Plane 17) into replacement
// NOTE(review): the only assignment visible after this size ladder is
// `result = SOURCE_ILLEGAL`; no replacement character is substituted in
// the code shown here -- confirm whether the comment above is still
// accurate.
88 else if (ch
< 0x10000)
92 else if (ch
<= UNI_MAX_LEGAL_UTF32
)
98 result
= SOURCE_ILLEGAL
;
// Move the cursor to the end of the slot for this character first; the
// switch below then fills the slot back-to-front with `*--target`.
102 target
+= bytesToWrite
;
// The bound is checked after the advance, so an encoding that ends
// exactly at targetEnd is still accepted.
103 if (target
> targetEnd
)
105 result
= TARGET_EXHAUSTED
;
109 // NOTE: everything falls through.
110 switch (bytesToWrite
)
// Trailing bytes, written last-to-first.
113 *--target
= (ACE_Byte
)((ch
| byteMark
) & byteMask
);
116 *--target
= (ACE_Byte
)((ch
| byteMark
) & byteMask
);
119 *--target
= (ACE_Byte
)((ch
| byteMark
) & byteMask
);
// Lead byte: remaining bits of ch OR-ed with the marker selected by the
// encoded length.
122 *--target
= (ACE_Byte
) (ch
| firstByteMark
[bytesToWrite
]);
// The backwards writes left the cursor at the start of the slot; step
// over the bytes just written so the next character follows them.
124 target
+= bytesToWrite
;
// Convert a buffer of UTF-8 bytes into UTF-32 code units.
//
// Inputs as used by the code below:
//   source      - ACE_Byte input cursor.
//   source_size - added to `source` to form the end-of-input bound.
//   target      - reinterpreted as an array of ACE_UINT32 for output.
//   target_size - added to the ACE_UINT32 output pointer to form the
//                 end-of-output bound (see the review note below).
//
// Returns a Result; the values assigned here are CONVERSION_OK (initial
// value), SOURCE_EXHAUSTED, SOURCE_ILLEGAL and TARGET_EXHAUSTED.
130 ACE_UTF32_Encoding_Converter::Result
131 ACE_UTF32_Encoding_Converter::from_utf8 (const ACE_Byte
* source
,
// Range bounds, the replacement character and the UTF-8 decoding tables
// are obtained from helper accessors (not defined in this file).
137 static const ACE_UINT32 UNI_SUR_HIGH_START
= get_UNI_SUR_HIGH_START ();
138 static const ACE_UINT32 UNI_SUR_LOW_END
= get_UNI_SUR_LOW_END ();
139 static const ACE_UINT32 UNI_REPLACEMENT_CHAR
= get_UNI_REPLACEMENT_CHAR ();
140 static const ACE_Byte
* trailingBytesForUTF8
= get_trailing_bytes_for_utf8 ();
141 static const ACE_UINT32
* offsetsFromUTF8
= get_offsets_from_utf8 ();
143 Result result
= CONVERSION_OK
;
144 const ACE_Byte
* sourceEnd
= source
+ source_size
;
145 ACE_UINT32
* targetStart
= static_cast<ACE_UINT32
*> (target
);
// NOTE(review): target_size is added to an ACE_UINT32 pointer without
// scaling, so it is treated as a count of ACE_UINT32 elements here,
// whereas to_utf8 divides its source_size by sizeof (ACE_UINT32). If
// callers pass a byte count, targetEnd lies beyond the real end of the
// buffer -- confirm the intended unit.
146 ACE_UINT32
* targetEnd
= targetStart
+ target_size
;
// Decode one UTF-8 sequence per iteration.
148 while (source
< sourceEnd
)
// The lead byte indexes a table giving the number of bytes that follow it.
151 unsigned short extraBytesToRead
= trailingBytesForUTF8
[*source
];
// The whole sequence (lead byte plus extraBytesToRead) must lie before
// sourceEnd.
152 if (source
+ extraBytesToRead
>= sourceEnd
)
154 result
= SOURCE_EXHAUSTED
;
158 // Do this check whether lenient or strict
// The length passed is the full sequence length: lead byte plus trailers.
159 if (!this->is_legal_utf8 (source
, extraBytesToRead
+ 1))
161 result
= SOURCE_ILLEGAL
;
165 // The cases all fall through. See "Note A" below.
166 switch (extraBytesToRead
)
// Subtract the per-length offset from the accumulated value to obtain the
// code point.
186 ch
-= offsetsFromUTF8
[extraBytesToRead
];
// Output space is checked before anything is stored.
188 if (targetStart
>= targetEnd
)
190 result
= TARGET_EXHAUSTED
;
194 if (ch
<= UNI_MAX_LEGAL_UTF32
)
196 // UTF-16 surrogate values are illegal in UTF-32, and anything
197 // over Plane 17 (> 0x10FFFF) is illegal.
198 if (ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_LOW_END
)
// NOTE(review): the visible statements below either flag SOURCE_ILLEGAL
// or store UNI_REPLACEMENT_CHAR; the condition choosing between them is
// not visible here -- verify against the full function.
202 result
= SOURCE_ILLEGAL
;
207 *targetStart
++ = UNI_REPLACEMENT_CHAR
;
217 result
= SOURCE_ILLEGAL
;
// Probe `source` by trial-converting its beginning to UTF-8 with a newly
// allocated, non-swapping ACE_UTF32_Encoding_Converter, and test whether
// that conversion reports CONVERSION_OK. The return type is a pointer to
// ACE_UTF32_Encoding_Converter.
225 ACE_UTF32_Encoding_Converter
*
226 ACE_UTF32_Encoding_Converter::encoded (const ACE_Byte
* source
,
// `begin` caps how many source bytes are examined; the scratch output
// buffer is four times that size.
229 static const size_t begin
= 16;
230 static const size_t converted
= begin
* 4;
// Local scratch buffer of `converted` bytes.
232 ACE_Byte target
[converted
];
233 ACE_UTF32_Encoding_Converter
* converter
= 0;
// Allocate the converter with swap == false.
234 ACE_NEW_RETURN (converter
,
235 ACE_UTF32_Encoding_Converter (false),
// Convert at most `begin` bytes of input (fewer when source_size is
// smaller), passing `converted` as the output capacity.
238 if (converter
->to_utf8 (source
,
239 ACE_MIN (begin
, source_size
),
241 converted
) == CONVERSION_OK
)
253 ACE_END_VERSIONED_NAMESPACE_DECL
254 #endif /* ACE_USES_WCHAR */