2 * Copyright (c) 2003-2004, Artem B. Bityuckiy
3 * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #if defined (ICONV_TO_UCS_CES_UTF_8) \
29 || defined (ICONV_FROM_UCS_CES_UTF_8)
33 #include <sys/types.h>
34 #include "../lib/local.h"
35 #include "../lib/ucsconv.h"
37 #define UTF8_MB_CUR_MAX 6
40 * UTF-8 CES converter doesn't interpret BOM. Reject overlong sequences,
41 * U'FFFF, U'FFFE codes, UTF-16 surrogate codes and all codes > 0x7FFFFFFF.
44 #if defined (ICONV_FROM_UCS_CES_UTF_8)
46 convert_from_ucs (void *data
,
48 unsigned char **outbuf
,
51 register unsigned char *cp
;
52 register size_t bytes
;
54 if ((in
>= 0x0000D800 && in
<= 0x0000DFFF)
55 || in
> 0x7FFFFFFF || in
== 0x0000FFFF || in
== 0x0000FFFE)
56 return (size_t)ICONV_CES_INVALID_CHARACTER
;
62 else if (in
< 0x10000)
64 else if (in
< 0x200000)
66 else if (in
< 0x4000000)
71 if (*outbytesleft
< bytes
)
72 return (size_t)ICONV_CES_NOSPACE
;
79 *cp
= (unsigned char)in
;
83 *cp
++ = (unsigned char)((in
>> 6) | 0x000000C0);
84 *cp
++ = (unsigned char)((in
& 0x0000003F) | 0x00000080);
88 *cp
++ = (unsigned char)((in
>> 12) | 0x000000E0);
89 *cp
++ = (unsigned char)(((in
>> 6) & 0x0000003F) | 0x00000080);
90 *cp
++ = (unsigned char)((in
& 0x0000003F) | 0x00000080);
94 *cp
++ = (unsigned char)((in
>> 18) | 0x000000F0);
95 *cp
++ = (unsigned char)(((in
>> 12) & 0x0000003F) | 0x00000080);
96 *cp
++ = (unsigned char)(((in
>> 6) & 0x0000003F) | 0x00000080);
97 *cp
++ = (unsigned char)((in
& 0x0000003F) | 0x00000080);
101 *cp
++ = (unsigned char)((in
>> 24) | 0x000000F8);
102 *cp
++ = (unsigned char)(((in
>> 18) & 0x0000003F) | 0x00000080);
103 *cp
++ = (unsigned char)(((in
>> 12) & 0x0000003F) | 0x00000080);
104 *cp
++ = (unsigned char)(((in
>> 6) & 0x0000003F) | 0x00000080);
105 *cp
++ = (unsigned char)((in
& 0x0000003F) | 0x00000080);
109 *cp
++ = (unsigned char)((in
>> 30) | 0x000000FC);
110 *cp
++ = (unsigned char)(((in
>> 24) & 0x0000003F) | 0x00000080);
111 *cp
++ = (unsigned char)(((in
>> 18) & 0x0000003F) | 0x00000080);
112 *cp
++ = (unsigned char)(((in
>> 12) & 0x0000003F) | 0x00000080);
113 *cp
++ = (unsigned char)(((in
>> 6) & 0x0000003F) | 0x00000080);
114 *cp
++ = (unsigned char)((in
& 0x0000003F) | 0x00000080);
118 *outbytesleft
-= bytes
;
123 #endif /* ICONV_FROM_UCS_CES_UTF_8 */
125 #if defined (ICONV_TO_UCS_CES_UTF_8)
127 convert_to_ucs (void *data
,
128 const unsigned char **inbuf
,
131 register const unsigned char *in
= *inbuf
;
132 register size_t bytes
;
139 if (*inbytesleft
< (bytes
= 2))
140 return (ucs4_t
)ICONV_CES_BAD_SEQUENCE
;
142 if ( ((in
[0] & ~0x1F) == 0xC0)
143 && ((in
[1] & 0xC0) == 0x80))
144 res
= ((ucs4_t
)(in
[0] & 0x1F) << 6)
145 | ((ucs4_t
)(in
[1] & 0x3F));
147 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
149 if (res
< 0x00000080) /* Overlong sequence */
150 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
153 else if (in
[0] < 0xF0)
155 if (*inbytesleft
< (bytes
= 3))
156 return (ucs4_t
)ICONV_CES_BAD_SEQUENCE
;
158 if ( ((in
[0] & ~0x0F) == 0xE0)
159 && ((in
[1] & 0xC0) == 0x80)
160 && ((in
[2] & 0xC0) == 0x80))
161 res
= ((ucs4_t
)(in
[0] & 0x0F) << 12)
162 | ((ucs4_t
)(in
[1] & 0x3F) << 6)
163 | ((ucs4_t
)(in
[2] & 0x3F));
165 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
167 if (res
< 0x00000800) /* Overlong sequence */
168 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
171 else if (in
[0] < 0xF8)
173 if (*inbytesleft
< (bytes
= 4))
174 return (ucs4_t
)ICONV_CES_BAD_SEQUENCE
;
176 if ( ((in
[0] & ~0x07) == 0xF0)
177 && ((in
[1] & 0xC0) == 0x80)
178 && ((in
[2] & 0xC0) == 0x80)
179 && ((in
[3] & 0xC0) == 0x80))
180 res
= ((ucs4_t
)(in
[0] & 0x07) << 18)
181 | ((ucs4_t
)(in
[1] & 0x3F) << 12)
182 | ((ucs4_t
)(in
[2] & 0x3F) << 6)
183 | ((ucs4_t
)(in
[3] & 0x3F));
185 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
187 if (res
< 0x00010000) /* Overlong sequence */
188 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
191 else if (in
[0] < 0xFC)
193 if (*inbytesleft
< (bytes
= 5))
194 return (ucs4_t
)ICONV_CES_BAD_SEQUENCE
;
196 if ( ((in
[0] & ~0x03) == 0xF8)
197 && ((in
[1] & 0xC0) == 0x80)
198 && ((in
[2] & 0xC0) == 0x80)
199 && ((in
[3] & 0xC0) == 0x80)
200 && ((in
[4] & 0xC0) == 0x80))
201 res
= ((ucs4_t
)(in
[0] & 0x03) << 24)
202 | ((ucs4_t
)(in
[1] & 0x3F) << 18)
203 | ((ucs4_t
)(in
[2] & 0x3F) << 12)
204 | ((ucs4_t
)(in
[3] & 0x3F) << 6)
205 | ((ucs4_t
)(in
[4] & 0x3F));
207 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
209 if (res
< 0x00200000) /* Overlong sequence */
210 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
213 else if (in
[0] <= 0xFD)
215 if (*inbytesleft
< (bytes
= 6))
216 return (ucs4_t
)ICONV_CES_BAD_SEQUENCE
;
218 if ( ((in
[0] & ~0x01) == 0xFC)
219 && ((in
[1] & 0xC0) == 0x80)
220 && ((in
[2] & 0xC0) == 0x80)
221 && ((in
[3] & 0xC0) == 0x80)
222 && ((in
[4] & 0xC0) == 0x80)
223 && ((in
[5] & 0xC0) == 0x80))
224 res
= ((ucs4_t
)(in
[0] & 0x1) << 30)
225 | ((ucs4_t
)(in
[1] & 0x3F) << 24)
226 | ((ucs4_t
)(in
[2] & 0x3F) << 18)
227 | ((ucs4_t
)(in
[3] & 0x3F) << 12)
228 | ((ucs4_t
)(in
[4] & 0x3F) << 6)
229 | ((ucs4_t
)(in
[5] & 0x3F));
231 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
233 if (res
< 0x04000000) /* Overlong sequence */
234 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
238 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
240 else if (in
[0] & 0x80)
241 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
248 if ( (res
>= 0x0000D800 && res
<= 0x0000DFFF)
249 || res
> 0x7FFFFFFF || res
== 0x0000FFFF || res
== 0x0000FFFE)
250 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
252 *inbytesleft
-= bytes
;
257 #endif /* ICONV_TO_UCS_CES_UTF_8 */
260 get_mb_cur_max (void *data
)
262 return UTF8_MB_CUR_MAX
;
265 #if defined (ICONV_TO_UCS_CES_UTF_8)
266 const iconv_to_ucs_ces_handlers_t
267 _iconv_to_ucs_ces_handlers_utf_8
=
279 #if defined (ICONV_FROM_UCS_CES_UTF_8)
280 const iconv_from_ucs_ces_handlers_t
281 _iconv_from_ucs_ces_handlers_utf_8
=
293 #endif /* ICONV_TO_UCS_CES_UTF_8 || ICONV_FROM_UCS_CES_UTF_8 */