/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
26 #pragma ident "%Z%%M% %I% %E% SMI"
/*
 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
 * the section 3C man pages.
 * Interface stability: Committed
 */
37 #include <sys/types.h>
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/systm.h>
42 #include <sys/debug.h>
44 #include <sys/sunddi.h>
46 #include <sys/u8_textprep.h>
48 #include <sys/byteorder.h>
49 #include <sys/errno.h>
/*
 * The max and min values of high and low surrogate pairs of UTF-16,
 * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* The maximum value of Unicode coding space and ASCII coding space. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/* The mask values for input and output endians. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros.  Exactly one of the two groups
 * below may be active; which one depends on the byte order of the
 * machine the file is compiled for.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/* UTF-32 boundaries based on UTF-8 character byte lengths. */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/* The common minimum and maximum values at the UTF-8 character bytes. */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 * UTF-8 character bytes.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
108 * The following vector shows remaining bytes in a UTF-8 character.
109 * Index will be the first byte of the character.
111 static const uchar_t remaining_bytes_tbl
[0x100] = {
112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
125 /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
126 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
128 /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
131 /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
132 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
134 /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
135 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
139 * The following is a vector of bit-masks to get used bits in
140 * the first byte of a UTF-8 character. Index is remaining bytes at above of
144 const uchar_t u8_masks_tbl
[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
146 static const uchar_t u8_masks_tbl
[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
150 * The following two vectors are to provide valid minimum and
151 * maximum values for the 2'nd byte of a multibyte UTF-8 character for
152 * better illegal sequence checking. The index value must be the value of
153 * the first byte of the UTF-8 character.
155 static const uchar_t valid_min_2nd_byte
[0x100] = {
156 0, 0, 0, 0, 0, 0, 0, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, 0, 0, 0, 0,
178 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0,
181 /* C0 C1 C2 C3 C4 C5 C6 C7 */
182 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
184 /* C8 C9 CA CB CC CD CE CF */
185 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
187 /* D0 D1 D2 D3 D4 D5 D6 D7 */
188 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
190 /* D8 D9 DA DB DC DD DE DF */
191 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
193 /* E0 E1 E2 E3 E4 E5 E6 E7 */
194 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
196 /* E8 E9 EA EB EC ED EE EF */
197 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
199 /* F0 F1 F2 F3 F4 F5 F6 F7 */
200 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0
205 static const uchar_t valid_max_2nd_byte
[0x100] = {
206 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0,
208 0, 0, 0, 0, 0, 0, 0, 0,
209 0, 0, 0, 0, 0, 0, 0, 0,
210 0, 0, 0, 0, 0, 0, 0, 0,
211 0, 0, 0, 0, 0, 0, 0, 0,
212 0, 0, 0, 0, 0, 0, 0, 0,
213 0, 0, 0, 0, 0, 0, 0, 0,
214 0, 0, 0, 0, 0, 0, 0, 0,
215 0, 0, 0, 0, 0, 0, 0, 0,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0,
218 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0,
225 0, 0, 0, 0, 0, 0, 0, 0,
226 0, 0, 0, 0, 0, 0, 0, 0,
227 0, 0, 0, 0, 0, 0, 0, 0,
228 0, 0, 0, 0, 0, 0, 0, 0,
229 0, 0, 0, 0, 0, 0, 0, 0,
231 /* C0 C1 C2 C3 C4 C5 C6 C7 */
232 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
234 /* C8 C9 CA CB CC CD CE CF */
235 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
237 /* D0 D1 D2 D3 D4 D5 D6 D7 */
238 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
240 /* D8 D9 DA DB DC DD DE DF */
241 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
243 /* E0 E1 E2 E3 E4 E5 E6 E7 */
244 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
246 /* E8 E9 EA EB EC ED EE EF */
247 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
249 /* F0 F1 F2 F3 F4 F5 F6 F7 */
250 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
252 0, 0, 0, 0, 0, 0, 0, 0
257 check_endian(int flag
, int *in
, int *out
)
259 *in
= flag
& UCONV_IN_ENDIAN_MASKS
;
261 /* You cannot have both. */
262 if (*in
== UCONV_IN_ENDIAN_MASKS
)
266 *in
= UCONV_IN_NAT_ENDIAN
;
268 *out
= flag
& UCONV_OUT_ENDIAN_MASKS
;
270 /* You cannot have both. */
271 if (*out
== UCONV_OUT_ENDIAN_MASKS
)
275 *out
= UCONV_OUT_NAT_ENDIAN
;
281 check_bom16(const uint16_t *u16s
, size_t u16l
, int *in
)
284 if (*u16s
== UCONV_BOM_NORMAL
) {
285 *in
= UCONV_IN_NAT_ENDIAN
;
288 if (*u16s
== UCONV_BOM_SWAPPED
) {
289 *in
= UCONV_IN_REV_ENDIAN
;
298 check_bom32(const uint32_t *u32s
, size_t u32l
, int *in
)
301 if (*u32s
== UCONV_BOM_NORMAL
) {
302 *in
= UCONV_IN_NAT_ENDIAN
;
305 if (*u32s
== UCONV_BOM_SWAPPED_32
) {
306 *in
= UCONV_IN_REV_ENDIAN
;
315 uconv_u16tou32(const uint16_t *u16s
, size_t *utf16len
,
316 uint32_t *u32s
, size_t *utf32len
, int flag
)
324 boolean_t do_not_ignore_null
;
327 * Do preliminary validity checks on parameters and collect info on
330 if (u16s
== NULL
|| utf16len
== NULL
)
333 if (u32s
== NULL
|| utf32len
== NULL
)
336 if (check_endian(flag
, &inendian
, &outendian
) != 0)
340 * Initialize input and output parameter buffer indices and
341 * temporary variables.
345 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
348 * Check on the BOM at the beginning of the input buffer if required
349 * and if there is indeed one, process it.
351 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
352 check_bom16(u16s
, *utf16len
, &inendian
))
356 * Reset inendian and outendian so that after this point, those can be
357 * used as condition values.
359 inendian
&= UCONV_IN_NAT_ENDIAN
;
360 outendian
&= UCONV_OUT_NAT_ENDIAN
;
363 * If there is something in the input buffer and if necessary and
364 * requested, save the BOM at the output buffer.
366 if (*utf16len
> 0 && *utf32len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
367 u32s
[u32l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
368 UCONV_BOM_SWAPPED_32
;
371 * Do conversion; if encounter a surrogate pair, assemble high and
372 * low pair values to form a UTF-32 character. If a half of a pair
373 * exists alone, then, either it is an illegal (EILSEQ) or
374 * invalid (EINVAL) value.
376 for (; u16l
< *utf16len
; u16l
++) {
377 if (u16s
[u16l
] == 0 && do_not_ignore_null
)
380 lo
= (uint32_t)((inendian
) ? u16s
[u16l
] : BSWAP_16(u16s
[u16l
]));
382 if (lo
>= UCONV_U16_HI_MIN
&& lo
<= UCONV_U16_HI_MAX
) {
387 } else if (lo
>= UCONV_U16_LO_MIN
&& lo
<= UCONV_U16_LO_MAX
) {
390 lo
= (((hi
- UCONV_U16_HI_MIN
) * UCONV_U16_BIT_SHIFT
+
391 lo
- UCONV_U16_LO_MIN
) & UCONV_U16_BIT_MASK
)
398 if (u32l
>= *utf32len
)
401 u32s
[u32l
++] = (outendian
) ? lo
: BSWAP_32(lo
);
405 * If high half didn't see low half, then, it's most likely the input
406 * parameter is incomplete.
412 * Save the number of consumed and saved characters. They do not
413 * include terminating NULL character (U+0000) at the end of
414 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
415 * the input buffer length is big enough to include the terminating
425 uconv_u16tou8(const uint16_t *u16s
, size_t *utf16len
,
426 uchar_t
*u8s
, size_t *utf8len
, int flag
)
434 boolean_t do_not_ignore_null
;
436 if (u16s
== NULL
|| utf16len
== NULL
)
439 if (u8s
== NULL
|| utf8len
== NULL
)
442 if (check_endian(flag
, &inendian
, &outendian
) != 0)
447 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
449 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
450 check_bom16(u16s
, *utf16len
, &inendian
))
453 inendian
&= UCONV_IN_NAT_ENDIAN
;
455 for (; u16l
< *utf16len
; u16l
++) {
456 if (u16s
[u16l
] == 0 && do_not_ignore_null
)
459 lo
= (uint32_t)((inendian
) ? u16s
[u16l
] : BSWAP_16(u16s
[u16l
]));
461 if (lo
>= UCONV_U16_HI_MIN
&& lo
<= UCONV_U16_HI_MAX
) {
466 } else if (lo
>= UCONV_U16_LO_MIN
&& lo
<= UCONV_U16_LO_MAX
) {
469 lo
= (((hi
- UCONV_U16_HI_MIN
) * UCONV_U16_BIT_SHIFT
+
470 lo
- UCONV_U16_LO_MIN
) & UCONV_U16_BIT_MASK
)
478 * Now we convert a UTF-32 character into a UTF-8 character.
479 * Unicode coding space is between U+0000 and U+10FFFF;
480 * anything bigger is an illegal character.
482 if (lo
<= UCONV_U8_ONE_BYTE
) {
485 u8s
[u8l
++] = (uchar_t
)lo
;
486 } else if (lo
<= UCONV_U8_TWO_BYTES
) {
487 if ((u8l
+ 1) >= *utf8len
)
489 u8s
[u8l
++] = (uchar_t
)(0xc0 | ((lo
& 0x07c0) >> 6));
490 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x003f));
491 } else if (lo
<= UCONV_U8_THREE_BYTES
) {
492 if ((u8l
+ 2) >= *utf8len
)
494 u8s
[u8l
++] = (uchar_t
)(0xe0 | ((lo
& 0x0f000) >> 12));
495 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x00fc0) >> 6));
496 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x0003f));
497 } else if (lo
<= UCONV_U8_FOUR_BYTES
) {
498 if ((u8l
+ 3) >= *utf8len
)
500 u8s
[u8l
++] = (uchar_t
)(0xf0 | ((lo
& 0x01c0000) >> 18));
501 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x003f000) >> 12));
502 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x0000fc0) >> 6));
503 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x000003f));
519 uconv_u32tou16(const uint32_t *u32s
, size_t *utf32len
,
520 uint16_t *u16s
, size_t *utf16len
, int flag
)
528 boolean_t do_not_ignore_null
;
530 if (u32s
== NULL
|| utf32len
== NULL
)
533 if (u16s
== NULL
|| utf16len
== NULL
)
536 if (check_endian(flag
, &inendian
, &outendian
) != 0)
540 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
542 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
543 check_bom32(u32s
, *utf32len
, &inendian
))
546 inendian
&= UCONV_IN_NAT_ENDIAN
;
547 outendian
&= UCONV_OUT_NAT_ENDIAN
;
549 if (*utf32len
> 0 && *utf16len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
550 u16s
[u16l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
553 for (; u32l
< *utf32len
; u32l
++) {
554 if (u32s
[u32l
] == 0 && do_not_ignore_null
)
557 hi
= (inendian
) ? u32s
[u32l
] : BSWAP_32(u32s
[u32l
]);
560 * Anything bigger than the Unicode coding space, i.e.,
561 * Unicode scalar value bigger than U+10FFFF, is an illegal
564 if (hi
> UCONV_UNICODE_MAX
)
568 * Anything bigger than U+FFFF must be converted into
569 * a surrogate pair in UTF-16.
571 if (hi
>= UCONV_U16_START
) {
572 lo
= ((hi
- UCONV_U16_START
) % UCONV_U16_BIT_SHIFT
) +
574 hi
= ((hi
- UCONV_U16_START
) / UCONV_U16_BIT_SHIFT
) +
577 if ((u16l
+ 1) >= *utf16len
)
581 u16s
[u16l
++] = (uint16_t)hi
;
582 u16s
[u16l
++] = (uint16_t)lo
;
584 u16s
[u16l
++] = BSWAP_16(((uint16_t)hi
));
585 u16s
[u16l
++] = BSWAP_16(((uint16_t)lo
));
588 if (u16l
>= *utf16len
)
590 u16s
[u16l
++] = (outendian
) ? (uint16_t)hi
:
591 BSWAP_16(((uint16_t)hi
));
602 uconv_u32tou8(const uint32_t *u32s
, size_t *utf32len
,
603 uchar_t
*u8s
, size_t *utf8len
, int flag
)
610 boolean_t do_not_ignore_null
;
612 if (u32s
== NULL
|| utf32len
== NULL
)
615 if (u8s
== NULL
|| utf8len
== NULL
)
618 if (check_endian(flag
, &inendian
, &outendian
) != 0)
622 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
624 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
625 check_bom32(u32s
, *utf32len
, &inendian
))
628 inendian
&= UCONV_IN_NAT_ENDIAN
;
630 for (; u32l
< *utf32len
; u32l
++) {
631 if (u32s
[u32l
] == 0 && do_not_ignore_null
)
634 lo
= (inendian
) ? u32s
[u32l
] : BSWAP_32(u32s
[u32l
]);
636 if (lo
<= UCONV_U8_ONE_BYTE
) {
639 u8s
[u8l
++] = (uchar_t
)lo
;
640 } else if (lo
<= UCONV_U8_TWO_BYTES
) {
641 if ((u8l
+ 1) >= *utf8len
)
643 u8s
[u8l
++] = (uchar_t
)(0xc0 | ((lo
& 0x07c0) >> 6));
644 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x003f));
645 } else if (lo
<= UCONV_U8_THREE_BYTES
) {
646 if ((u8l
+ 2) >= *utf8len
)
648 u8s
[u8l
++] = (uchar_t
)(0xe0 | ((lo
& 0x0f000) >> 12));
649 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x00fc0) >> 6));
650 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x0003f));
651 } else if (lo
<= UCONV_U8_FOUR_BYTES
) {
652 if ((u8l
+ 3) >= *utf8len
)
654 u8s
[u8l
++] = (uchar_t
)(0xf0 | ((lo
& 0x01c0000) >> 18));
655 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x003f000) >> 12));
656 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x0000fc0) >> 6));
657 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x000003f));
670 uconv_u8tou16(const uchar_t
*u8s
, size_t *utf8len
,
671 uint16_t *u16s
, size_t *utf16len
, int flag
)
681 boolean_t do_not_ignore_null
;
683 if (u8s
== NULL
|| utf8len
== NULL
)
686 if (u16s
== NULL
|| utf16len
== NULL
)
689 if (check_endian(flag
, &inendian
, &outendian
) != 0)
693 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
695 outendian
&= UCONV_OUT_NAT_ENDIAN
;
697 if (*utf8len
> 0 && *utf16len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
698 u16s
[u16l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
701 for (; u8l
< *utf8len
; ) {
702 if (u8s
[u8l
] == 0 && do_not_ignore_null
)
706 * Collect a UTF-8 character and convert it to a UTF-32
707 * character. In doing so, we screen out illegally formed
708 * UTF-8 characters and treat such as illegal characters.
709 * The algorithm at below also screens out anything bigger
712 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
713 * more details on the illegal values of UTF-8 character
716 hi
= (uint32_t)u8s
[u8l
++];
718 if (hi
> UCONV_ASCII_MAX
) {
719 if ((remaining_bytes
= remaining_bytes_tbl
[hi
]) == 0)
723 hi
= hi
& u8_masks_tbl
[remaining_bytes
];
725 for (; remaining_bytes
> 0; remaining_bytes
--) {
727 * If we have no more bytes, the current
728 * UTF-8 character is incomplete.
733 lo
= (uint32_t)u8s
[u8l
++];
736 if (lo
< valid_min_2nd_byte
[first_b
] ||
737 lo
> valid_max_2nd_byte
[first_b
])
740 } else if (lo
< UCONV_U8_BYTE_MIN
||
741 lo
> UCONV_U8_BYTE_MAX
) {
744 hi
= (hi
<< UCONV_U8_BIT_SHIFT
) |
745 (lo
& UCONV_U8_BIT_MASK
);
749 if (hi
>= UCONV_U16_START
) {
750 lo
= ((hi
- UCONV_U16_START
) % UCONV_U16_BIT_SHIFT
) +
752 hi
= ((hi
- UCONV_U16_START
) / UCONV_U16_BIT_SHIFT
) +
755 if ((u16l
+ 1) >= *utf16len
)
759 u16s
[u16l
++] = (uint16_t)hi
;
760 u16s
[u16l
++] = (uint16_t)lo
;
762 u16s
[u16l
++] = BSWAP_16(((uint16_t)hi
));
763 u16s
[u16l
++] = BSWAP_16(((uint16_t)lo
));
766 if (u16l
>= *utf16len
)
769 u16s
[u16l
++] = (outendian
) ? (uint16_t)hi
:
770 BSWAP_16(((uint16_t)hi
));
781 uconv_u8tou32(const uchar_t
*u8s
, size_t *utf8len
,
782 uint32_t *u32s
, size_t *utf32len
, int flag
)
792 boolean_t do_not_ignore_null
;
794 if (u8s
== NULL
|| utf8len
== NULL
)
797 if (u32s
== NULL
|| utf32len
== NULL
)
800 if (check_endian(flag
, &inendian
, &outendian
) != 0)
804 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
806 outendian
&= UCONV_OUT_NAT_ENDIAN
;
808 if (*utf8len
> 0 && *utf32len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
809 u32s
[u32l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
810 UCONV_BOM_SWAPPED_32
;
812 for (; u8l
< *utf8len
; ) {
813 if (u8s
[u8l
] == 0 && do_not_ignore_null
)
816 hi
= (uint32_t)u8s
[u8l
++];
818 if (hi
> UCONV_ASCII_MAX
) {
819 if ((remaining_bytes
= remaining_bytes_tbl
[hi
]) == 0)
823 hi
= hi
& u8_masks_tbl
[remaining_bytes
];
825 for (; remaining_bytes
> 0; remaining_bytes
--) {
829 c
= (uint32_t)u8s
[u8l
++];
832 if (c
< valid_min_2nd_byte
[first_b
] ||
833 c
> valid_max_2nd_byte
[first_b
])
836 } else if (c
< UCONV_U8_BYTE_MIN
||
837 c
> UCONV_U8_BYTE_MAX
) {
840 hi
= (hi
<< UCONV_U8_BIT_SHIFT
) |
841 (c
& UCONV_U8_BIT_MASK
);
845 if (u32l
>= *utf32len
)
848 u32s
[u32l
++] = (outendian
) ? hi
: BSWAP_32(hi
);