2 * Copyright (c) 2003-2004, Artem B. Bityuckiy
3 * Copyright (c) 1999,2000, Konstantin Chuguev. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #if defined (ICONV_TO_UCS_CES_UTF_16) \
29 || defined (ICONV_FROM_UCS_CES_UTF_16)
33 #include <sys/types.h>
37 #include "../lib/local.h"
38 #include "../lib/ucsconv.h"
39 #include "../lib/endian.h"
42 * On input, the UTF-16 converter interprets the BOM and assumes big-endian
43 * byte order if the BOM is absent. On output, it uses the system byte order and
44 * writes the corresponding BOM as the first code unit. The UTF-16LE and UTF-16BE converters ignore the BOM on input and
/*
 * Converter state values.  Each init routine stores one of these in an int
 * obtained from _malloc_r(); the from-UCS converter later ORs
 * UTF16_BOM_WRITTEN into a UTF16_SYSTEM_ENDIAN state, and the to-UCS
 * converter replaces UTF16_UNDEFINED with a concrete byte order once it has
 * looked at the first code unit.
 */
48 #define UTF16_UNDEFINED 0x00
49 #define UTF16_BIG_ENDIAN 0x01
50 #define UTF16_LITTLE_ENDIAN 0x02
51 #define UTF16_SYSTEM_ENDIAN 0x04
52 #define UTF16_BOM_WRITTEN 0x08
/* Byte order mark; input is compared against both byte-swapped forms of it. */
54 #define UTF16_BOM 0xFEFF
/*
 * Encoding names.  The init routines strcmp() the requested name against
 * the BE and LE spellings.
 * NOTE(review): any other name presumably takes the init routine's
 * fallback state -- confirm.
 */
56 #define UTF_16 "utf_16"
57 #define UTF_16BE "utf_16be"
58 #define UTF_16LE "utf_16le"
/*
 * utf_16_close - first parameter is the reentrancy structure pointer.
 * NOTE(review): the remaining parameters and the body are not shown here;
 * presumably this releases the state int allocated by the init routines --
 * confirm.
 */
61 utf_16_close (struct _reent
*rptr
,
68 #if defined (ICONV_FROM_UCS_CES_UTF_16)
/*
 * utf_16_init_from_ucs - set up converter state for the UCS -> UTF-16
 * direction: allocate one int and pick the byte-order mode from the
 * encoding name.
 *
 * rptr: reentrancy structure, passed through to _malloc_r().
 * NOTE(review): the second parameter and the return statements are not
 * shown here; `encoding` is presumably the requested encoding name and the
 * allocated int presumably the return value -- confirm.
 */
70 utf_16_init_from_ucs (struct _reent
*rptr
,
/* One int of state; the NULL comparison is the allocation-failure check. */
75 if ((data
= (int *)_malloc_r (rptr
, sizeof (int))) == NULL
)
/* "utf_16le": fixed little-endian output. */
78 if (strcmp (encoding
, UTF_16LE
) == 0)
79 *data
= UTF16_LITTLE_ENDIAN
;
/* "utf_16be": fixed big-endian output. */
80 else if (strcmp (encoding
, UTF_16BE
) == 0)
81 *data
= UTF16_BIG_ENDIAN
;
/* NOTE(review): presumably the fallback for any other name -- confirm. */
83 *data
= UTF16_SYSTEM_ENDIAN
;
/*
 * utf_16_convert_from_ucs - encode the code point `in` as UTF-16 code
 * units stored through *outbuf, using the byte-order mode held in the
 * converter state.
 *
 * data:         opaque converter state.
 * outbuf:       address of the output cursor.
 * outbytesleft: remaining output space; reduced by the bytes produced.
 *
 * Returns (size_t)ICONV_CES_INVALID_CHARACTER for an unencodable code point
 * and (size_t)ICONV_CES_NOSPACE when the output space is too small.
 * NOTE(review): the success return and the *outbuf update are not shown
 * here -- confirm.
 */
89 utf_16_convert_from_ucs (void *data
,
91 unsigned char **outbuf
,
95 register size_t bytes
;
/*
 * Reject values above U+10FFFF, the surrogate range U+D800..U+DFFF, and the
 * values 0xFFFF and 0xFFFE.
 */
98 if (in
> 0x0010FFFF || (in
>= 0x0000D800 && in
<= 0x0000DFFF)
99 || in
== 0x0000FFFF || in
== 0x0000FFFE)
100 return (size_t)ICONV_CES_INVALID_CHARACTER
;
/*
 * A state of exactly UTF16_SYSTEM_ENDIAN (no UTF16_BOM_WRITTEN bit yet)
 * budgets two code units.
 * NOTE(review): the extra unit is presumably for the BOM -- confirm.
 */
103 bytes
= (*state
== UTF16_SYSTEM_ENDIAN
) ? sizeof (ucs2_t
) * 2
/*
 * NOTE(review): presumably guarded by a test for code points that need a
 * surrogate pair -- confirm.
 */
107 bytes
+= sizeof (ucs2_t
);
/* Check the whole requirement up front so nothing is partially written. */
109 if (*outbytesleft
< bytes
)
110 return (size_t)ICONV_CES_NOSPACE
;
/*
 * NOTE(review): output is stored through a ucs2_t pointer; this assumes
 * *outbuf is suitably aligned for ucs2_t -- confirm.
 */
112 cp
= (ucs2_t
*)*outbuf
;
/*
 * True only on the first call in system-endian mode: once the flag below is
 * set the state equals UTF16_SYSTEM_ENDIAN | UTF16_BOM_WRITTEN.
 */
114 if (*state
== UTF16_SYSTEM_ENDIAN
)
117 *state
|= UTF16_BOM_WRITTEN
;
/* Single code unit, stored in the byte order selected by the state. */
124 case UTF16_LITTLE_ENDIAN
:
125 *cp
= ICONV_HTOLES ((ucs2_t
)in
);
127 case UTF16_BIG_ENDIAN
:
128 *cp
= ICONV_HTOBES ((ucs2_t
)in
);
130 case (UTF16_SYSTEM_ENDIAN
| UTF16_BOM_WRITTEN
):
139 /* Process surrogate pair */
/*
 * w1 carries the upper ten bits tagged 0xD800, w2 the lower ten bits tagged
 * 0xDC00.
 * NOTE(review): this split is only correct if 0x10000 has already been
 * subtracted from `in`; that subtraction is not shown here -- confirm.
 */
141 w1
= ((ucs2_t
)((in
>> 10)) & 0x03FF) | 0xD800;
142 w2
= (ucs2_t
)(in
& 0x000003FF) | 0xDC00;
/* Two code units, stored in the byte order selected by the state. */
146 case UTF16_LITTLE_ENDIAN
:
147 *cp
++ = ICONV_HTOLES (w1
);
148 *cp
= ICONV_HTOLES (w2
);
150 case UTF16_BIG_ENDIAN
:
151 *cp
++ = ICONV_HTOBES (w1
);
152 *cp
= ICONV_HTOBES (w2
);
154 case (UTF16_SYSTEM_ENDIAN
| UTF16_BOM_WRITTEN
):
/* Account for everything budgeted in `bytes` above. */
162 *outbytesleft
-= bytes
;
166 #endif /* ICONV_FROM_UCS_CES_UTF_16 */
168 #if defined (ICONV_TO_UCS_CES_UTF_16)
/*
 * utf_16_init_to_ucs - set up converter state for the UTF-16 -> UCS
 * direction: allocate one int and pick the byte order from the encoding
 * name.
 *
 * rptr:     reentrancy structure, passed through to _malloc_r().
 * encoding: encoding name, compared against UTF_16BE and UTF_16LE.
 * NOTE(review): the return statements are not shown here; the allocated
 * int is presumably the return value -- confirm.
 */
170 utf_16_init_to_ucs (struct _reent
*rptr
,
171 const char *encoding
)
/* One int of state; the NULL comparison is the allocation-failure check. */
175 if ((data
= (int *)_malloc_r (rptr
, sizeof (int))) == NULL
)
/* "utf_16be": input is always read as big endian. */
178 if (strcmp (encoding
, UTF_16BE
) == 0)
179 *data
= UTF16_BIG_ENDIAN
;
/* "utf_16le": input is always read as little endian. */
180 else if (strcmp (encoding
, UTF_16LE
) == 0)
181 *data
= UTF16_LITTLE_ENDIAN
;
/*
 * NOTE(review): presumably the fallback for any other name; the converter
 * resolves UTF16_UNDEFINED from the first code unit -- confirm.
 */
183 *data
= UTF16_UNDEFINED
;
/*
 * utf_16_convert_to_ucs - decode one code point from the UTF-16 data at
 * *inbuf, resolving the byte order on first use when the state is
 * UTF16_UNDEFINED.
 *
 * data:  opaque converter state.
 * inbuf: address of the input cursor.
 *
 * Returns (ucs4_t)ICONV_CES_BAD_SEQUENCE when fewer input bytes remain than
 * the sequence needs, and (ucs4_t)ICONV_CES_INVALID_CHARACTER for 0xFFFF,
 * 0xFFFE or a malformed surrogate pair.  *inbytesleft is reduced by the
 * bytes consumed.
 * NOTE(review): the success return and the *inbuf update are not shown
 * here -- confirm.
 */
189 utf_16_convert_to_ucs (void *data
,
190 const unsigned char **inbuf
,
/*
 * Running count of input bytes this call needs; starts at one code unit.
 * NOTE(review): this is a signed int compared against *inbytesleft below;
 * check for a signed/unsigned mismatch -- confirm.
 */
198 int bytes
= sizeof (ucs2_t
);
200 if (*inbytesleft
< bytes
)
201 return (ucs4_t
)ICONV_CES_BAD_SEQUENCE
;
/*
 * NOTE(review): input is read through a ucs2_t pointer; this assumes *inbuf
 * is suitably aligned for ucs2_t -- confirm.
 */
204 cp
= ((ucs2_t
*)*inbuf
);
/* Byte order not fixed yet: decide it from the first code unit. */
206 if (*state
== UTF16_UNDEFINED
)
/* First unit equals the byte-swapped-to-little-endian BOM: little endian. */
208 if (*cp
== ICONV_HTOLES(UTF16_BOM
))
209 *state
= UTF16_LITTLE_ENDIAN
;
/* NOTE(review): presumably the else branch (big-endian default) -- confirm. */
211 *state
= UTF16_BIG_ENDIAN
;
/* A BOM in either byte order occupies one code unit of input. */
213 if ( *cp
== ICONV_HTOBES (UTF16_BOM
)
214 || *cp
== ICONV_HTOLES (UTF16_BOM
))
/* `bytes` grows by one code unit; a lone BOM is reported as incomplete. */
216 if (*inbytesleft
< (bytes
+= sizeof (ucs2_t
)))
217 return (ucs4_t
)ICONV_CES_BAD_SEQUENCE
;
/* Load the first code unit in host order according to the state. */
222 if (*state
== UTF16_LITTLE_ENDIAN
)
223 w1
= ICONV_LETOHS (*cp
);
225 w1
= ICONV_BETOHS (*cp
);
/* Outside the surrogate range: a single code unit is the whole character. */
227 if (w1
< 0xD800 || w1
> 0xDFFF)
229 if (w1
== 0xFFFF || w1
== 0xFFFE)
230 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
235 /* Process surrogate pair */
/* A pair needs two more bytes of input. */
236 if (*inbytesleft
< (bytes
+= 2))
237 return (ucs4_t
)ICONV_CES_BAD_SEQUENCE
;
/* NOTE(review): the condition guarding this return is not shown here. */
240 /* Broken surrogate character */
241 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
/* Load the second code unit in host order according to the state. */
245 if (*state
== UTF16_LITTLE_ENDIAN
)
246 w2
= ICONV_LETOHS (*cp
);
248 w2
= ICONV_BETOHS (*cp
);
/* The second unit must be in 0xDC00..0xDFFF. */
250 if (w2
< 0xDC00 || w2
> 0xDFFF)
251 /* Broken surrogate character */
252 return (ucs4_t
)ICONV_CES_INVALID_CHARACTER
;
/*
 * Recombine the two ten-bit halves.
 * NOTE(review): a valid decode also requires adding 0x10000 to this value;
 * that addition is not shown here -- confirm.
 */
254 res
= (ucs4_t
)(w2
& 0x03FF) | ((ucs4_t
)(w1
& 0x03FF) << 10);
/* Account for every byte counted in `bytes`, including a skipped BOM. */
259 *inbytesleft
-= bytes
;
263 #endif /* ICONV_TO_UCS_CES_UTF_16 */
/*
 * utf_16_get_mb_cur_max - takes the opaque converter state; installed in
 * both handler tables of this file.
 * NOTE(review): the body is not shown here; presumably it returns the
 * maximum number of bytes one character can occupy in UTF-16 -- confirm.
 */
266 utf_16_get_mb_cur_max (void *data
)
271 #if defined (ICONV_TO_UCS_CES_UTF_16)
/*
 * Handler table for the UTF-16 -> UCS direction.
 * NOTE(review): only some initializers are shown here; the slot order is
 * presumably fixed by the iconv_to_ucs_ces_handlers_t declaration --
 * confirm.
 */
272 const iconv_to_ucs_ces_handlers_t
273 _iconv_to_ucs_ces_handlers_utf_16
=
277 utf_16_get_mb_cur_max
,
281 utf_16_convert_to_ucs
285 #if defined (ICONV_FROM_UCS_CES_UTF_16)
/*
 * Handler table for the UCS -> UTF-16 direction.
 * NOTE(review): only some initializers are shown here; the slot order is
 * presumably fixed by the iconv_from_ucs_ces_handlers_t declaration --
 * confirm.
 */
286 const iconv_from_ucs_ces_handlers_t
287 _iconv_from_ucs_ces_handlers_utf_16
=
289 utf_16_init_from_ucs
,
291 utf_16_get_mb_cur_max
,
295 utf_16_convert_from_ucs
299 #endif /* ICONV_TO_UCS_CES_UTF_16 || ICONV_FROM_UCS_CES_UTF_16 */