4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
29 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33 * the section 3C man pages.
34 * Interface stability: Committed
37 #include <sys/types.h>
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/debug.h>
43 #include <sys/sunddi.h>
45 #include <sys/u8_textprep.h>
47 #include <sys/byteorder.h>
48 #include <sys/errno.h>
52 * The max and min values of high and low surrogate pairs of UTF-16,
53 * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
/*
 * UTF-16 surrogate ranges: a high (leading) half lies in 0xd800-0xdbff and
 * a low (trailing) half lies in 0xdc00-0xdfff.
 */
55 #define UCONV_U16_HI_MIN (0xd800U)
56 #define UCONV_U16_HI_MAX (0xdbffU)
57 #define UCONV_U16_LO_MIN (0xdc00U)
58 #define UCONV_U16_LO_MAX (0xdfffU)
/*
 * Despite its name, UCONV_U16_BIT_SHIFT is used as a multiplier, divisor and
 * modulus (0x0400 == 1 << 10) when joining or splitting surrogate halves; it
 * is never used as a shift count. UCONV_U16_BIT_MASK keeps the low 20 bits
 * of a joined pair.
 */
59 #define UCONV_U16_BIT_SHIFT (0x0400U)
60 #define UCONV_U16_BIT_MASK (0x0fffffU)
/* First code point above U+FFFF, i.e. the first one that needs a pair. */
61 #define UCONV_U16_START (0x010000U)
63 /* The maximum value of Unicode coding space and ASCII coding space. */
64 #define UCONV_UNICODE_MAX (0x10ffffU)
65 #define UCONV_ASCII_MAX (0x7fU)
67 /* The mask values for input and output endians. */
68 #define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
69 #define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
71 /* Native and reversed endian macros. */
72 #ifdef _ZFS_BIG_ENDIAN
/* Big-endian build: the native byte order is big endian. */
73 #define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN
74 #define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN
75 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN
76 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN
/*
 * Little-endian definitions.
 * NOTE(review): no `#else` line is visible between the two groups in this
 * listing -- confirm against the full file.
 */
78 #define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN
79 #define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN
80 #define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN
81 #define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN
82 #endif /* _ZFS_BIG_ENDIAN */
84 /* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
/*
 * UCONV_BOM_SWAPPED is 0xfeff byte-swapped as a 16-bit unit;
 * UCONV_BOM_SWAPPED_32 is 0x0000feff byte-swapped as a 32-bit unit.
 */
85 #define UCONV_BOM_NORMAL (0xfeffU)
86 #define UCONV_BOM_SWAPPED (0xfffeU)
87 #define UCONV_BOM_SWAPPED_32 (0xfffe0000U)
89 /* UTF-32 boundaries based on UTF-8 character byte lengths. */
/* Each value is the largest code point encodable in that many UTF-8 bytes. */
90 #define UCONV_U8_ONE_BYTE (0x7fU)
91 #define UCONV_U8_TWO_BYTES (0x7ffU)
92 #define UCONV_U8_THREE_BYTES (0xffffU)
93 #define UCONV_U8_FOUR_BYTES (0x10ffffU)
95 /* The common minimum and maximum values at the UTF-8 character bytes. */
/* These bound the generic continuation-byte test (third and fourth bytes). */
96 #define UCONV_U8_BYTE_MIN (0x80U)
97 #define UCONV_U8_BYTE_MAX (0xbfU)
100 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
101 * UTF-8 character bytes.
/* Each continuation byte contributes its low 6 bits to the code point. */
103 #define UCONV_U8_BIT_SHIFT 6
104 #define UCONV_U8_BIT_MASK 0x3f
107 * The following vector gives the number of remaining bytes in a UTF-8
108 * character. The index is the first byte of the character.
/*
 * remaining_bytes_tbl[b] is the number of bytes that follow first byte b.
 * Only 0xC2-0xF4 are non-zero: 1 for 0xC2-0xDF, 2 for 0xE0-0xEF and 3 for
 * 0xF0-0xF4. The UTF-8 decoders in this file test for a 0 entry after
 * reading a non-ASCII first byte.
 */
110 static const uchar_t remaining_bytes_tbl
[0x100] = {
/* 0x00 - 0xBF (twelve rows of sixteen entries): nothing follows. */
111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xC0 and 0xC1 stay 0; 0xC2-0xDF start two-byte characters. */
124 /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
125 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
127 /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
128 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0xE0-0xEF start three-byte characters. */
130 /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
131 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* 0xF0-0xF4 start four-byte characters; 0xF5-0xFF stay 0. */
133 /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
134 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
138 * The following is a vector of bit-masks for extracting the used bits of
139 * the first byte of a UTF-8 character. The index is the number of remaining bytes of the character.
/*
 * u8_masks_tbl[n] keeps the payload bits of a first byte that is followed
 * by n more bytes: 0x1f for n == 1, 0x0f for n == 2, 0x07 for n == 3.
 * Entries 4 and 5 are never selected by remaining_bytes_tbl, whose largest
 * value is 3.
 */
142 static const uchar_t u8_masks_tbl
[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145 * The following two vectors provide the valid minimum and
146 * maximum values for the 2nd byte of a multibyte UTF-8 character, for
147 * better illegal sequence checking. The index value must be the value of
148 * the first byte of the UTF-8 character.
/*
 * valid_min_2nd_byte[b] is the smallest second byte accepted after first
 * byte b. It is 0x80 for every usable first byte except 0xE0 (0xa0) and
 * 0xF0 (0x90); the raised minimums exclude encodings of code points that
 * fit in a shorter sequence. First bytes that never start a multibyte
 * character hold 0.
 */
150 static const uchar_t valid_min_2nd_byte
[0x100] = {
/* 0x00 - 0xBF (twenty-four rows of eight entries): not multibyte starts. */
151 0, 0, 0, 0, 0, 0, 0, 0,
152 0, 0, 0, 0, 0, 0, 0, 0,
153 0, 0, 0, 0, 0, 0, 0, 0,
154 0, 0, 0, 0, 0, 0, 0, 0,
155 0, 0, 0, 0, 0, 0, 0, 0,
156 0, 0, 0, 0, 0, 0, 0, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0,
172 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0,
176 /* C0 C1 C2 C3 C4 C5 C6 C7 */
177 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
179 /* C8 C9 CA CB CC CD CE CF */
180 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
182 /* D0 D1 D2 D3 D4 D5 D6 D7 */
183 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
185 /* D8 D9 DA DB DC DD DE DF */
186 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* 0xE0 needs a second byte of at least 0xa0. */
188 /* E0 E1 E2 E3 E4 E5 E6 E7 */
189 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
191 /* E8 E9 EA EB EC ED EE EF */
192 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
/* 0xF0 needs a second byte of at least 0x90. */
194 /* F0 F1 F2 F3 F4 F5 F6 F7 */
195 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
/* 0xF8 - 0xFF. */
197 0, 0, 0, 0, 0, 0, 0, 0
/*
 * valid_max_2nd_byte[b] is the largest second byte accepted after first
 * byte b. It is 0xbf for every usable first byte except 0xED (0x9f), which
 * keeps the result below the surrogate range starting at U+D800, and 0xF4
 * (0x8f), which keeps the result at or below U+10FFFF. First bytes that
 * never start a multibyte character hold 0.
 */
200 static const uchar_t valid_max_2nd_byte
[0x100] = {
/* 0x00 - 0xBF (twenty-four rows of eight entries): not multibyte starts. */
201 0, 0, 0, 0, 0, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0,
203 0, 0, 0, 0, 0, 0, 0, 0,
204 0, 0, 0, 0, 0, 0, 0, 0,
205 0, 0, 0, 0, 0, 0, 0, 0,
206 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0,
208 0, 0, 0, 0, 0, 0, 0, 0,
209 0, 0, 0, 0, 0, 0, 0, 0,
210 0, 0, 0, 0, 0, 0, 0, 0,
211 0, 0, 0, 0, 0, 0, 0, 0,
212 0, 0, 0, 0, 0, 0, 0, 0,
213 0, 0, 0, 0, 0, 0, 0, 0,
214 0, 0, 0, 0, 0, 0, 0, 0,
215 0, 0, 0, 0, 0, 0, 0, 0,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0,
218 0, 0, 0, 0, 0, 0, 0, 0,
219 0, 0, 0, 0, 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0,
222 0, 0, 0, 0, 0, 0, 0, 0,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0,
226 /* C0 C1 C2 C3 C4 C5 C6 C7 */
227 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
229 /* C8 C9 CA CB CC CD CE CF */
230 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
232 /* D0 D1 D2 D3 D4 D5 D6 D7 */
233 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
235 /* D8 D9 DA DB DC DD DE DF */
236 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
238 /* E0 E1 E2 E3 E4 E5 E6 E7 */
239 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
/* 0xED is capped at 0x9f. */
241 /* E8 E9 EA EB EC ED EE EF */
242 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
/* 0xF4 is capped at 0x8f. */
244 /* F0 F1 F2 F3 F4 F5 F6 F7 */
245 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
/* 0xF8 - 0xFF. */
247 0, 0, 0, 0, 0, 0, 0, 0
/*
 * check_endian: split the endian selections out of flag.
 *
 * flag - caller-supplied conversion flags.
 * in   - receives the input-endian selection.
 * out  - receives the output-endian selection.
 *
 * Callers in this file compare the result against 0.
 * NOTE(review): the return statements and the tests guarding the
 * native-endian defaults are not visible in this listing; the comments
 * below describe only the lines shown.
 */
252 check_endian(int flag
, int *in
, int *out
)
/* Keep only the input-endian bits of flag. */
254 *in
= flag
& UCONV_IN_ENDIAN_MASKS
;
256 /* You cannot have both. */
257 if (*in
== UCONV_IN_ENDIAN_MASKS
)
/* Fall back to the native input byte order. */
261 *in
= UCONV_IN_NAT_ENDIAN
;
/* Same treatment for the output-endian bits. */
263 *out
= flag
& UCONV_OUT_ENDIAN_MASKS
;
265 /* You cannot have both. */
266 if (*out
== UCONV_OUT_ENDIAN_MASKS
)
/* Fall back to the native output byte order. */
270 *out
= UCONV_OUT_NAT_ENDIAN
;
/*
 * check_bom16: look for a Byte Order Mark in the first UTF-16 unit.
 *
 * u16s - input buffer; only its first unit is examined.
 * u16l - input length in 16-bit units.
 * in   - set to the native or reversed input-endian value on a match.
 *
 * Callers use the result as a truth value.
 * NOTE(review): u16l is not referenced on any visible line -- confirm that
 * it guards the dereference of u16s for an empty buffer.
 */
276 check_bom16(const uint16_t *u16s
, size_t u16l
, int *in
)
/* BOM reads correctly as-is: the input is in native byte order. */
279 if (*u16s
== UCONV_BOM_NORMAL
) {
280 *in
= UCONV_IN_NAT_ENDIAN
;
/* BOM reads byte-swapped: the input is in the reversed byte order. */
283 if (*u16s
== UCONV_BOM_SWAPPED
) {
284 *in
= UCONV_IN_REV_ENDIAN
;
/*
 * check_bom32: look for a Byte Order Mark in the first UTF-32 unit.
 *
 * u32s - input buffer; only its first unit is examined.
 * u32l - input length in 32-bit units.
 * in   - set to the native or reversed input-endian value on a match.
 *
 * Callers use the result as a truth value.
 * NOTE(review): u32l is not referenced on any visible line -- confirm that
 * it guards the dereference of u32s for an empty buffer.
 */
293 check_bom32(const uint32_t *u32s
, size_t u32l
, int *in
)
/* BOM reads correctly as-is: the input is in native byte order. */
296 if (*u32s
== UCONV_BOM_NORMAL
) {
297 *in
= UCONV_IN_NAT_ENDIAN
;
/* The 32-bit swapped form (0xfffe0000) means reversed byte order. */
300 if (*u32s
== UCONV_BOM_SWAPPED_32
) {
301 *in
= UCONV_IN_REV_ENDIAN
;
/*
 * uconv_u16tou32: convert UTF-16 units in u16s into UTF-32 units in u32s.
 *
 * u16s, utf16len - input buffer and a pointer to its length in 16-bit
 *     units; *utf16len bounds the conversion loop.
 * u32s, utf32len - output buffer and a pointer to its capacity in 32-bit
 *     units; the write index is tested against *utf32len before a store.
 * flag - bit flags. The visible code consults the endian bits (through
 *     check_endian()), UCONV_IGNORE_NULL, UCONV_IN_ACCEPT_BOM and
 *     UCONV_OUT_EMIT_BOM.
 *
 * NOTE(review): local declarations, return statements and several branch
 * bodies are not visible in this listing; the comments below describe only
 * the lines shown.
 */
310 uconv_u16tou32(const uint16_t *u16s
, size_t *utf16len
,
311 uint32_t *u32s
, size_t *utf32len
, int flag
)
319 boolean_t do_not_ignore_null
;
322 * Do preliminary validity checks on parameters and collect info on
/* Both the buffer pointers and the length pointers are tested for NULL. */
325 if (u16s
== NULL
|| utf16len
== NULL
)
328 if (u32s
== NULL
|| utf32len
== NULL
)
331 if (check_endian(flag
, &inendian
, &outendian
) != 0)
335 * Initialize input and output parameter buffer indices and
336 * temporary variables.
/* True unless the caller set UCONV_IGNORE_NULL. */
340 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
343 * Check on the BOM at the beginning of the input buffer if required
344 * and if there is indeed one, process it.
346 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
347 check_bom16(u16s
, *utf16len
, &inendian
))
351 * Reset inendian and outendian so that after this point, those can be
352 * used as condition values.
/* After masking, each is non-zero exactly when native order was chosen. */
354 inendian
&= UCONV_IN_NAT_ENDIAN
;
355 outendian
&= UCONV_OUT_NAT_ENDIAN
;
358 * If there is something in the input buffer and if necessary and
359 * requested, save the BOM at the output buffer.
/* The BOM value is picked so that it reads as U+FEFF in the output order. */
361 if (*utf16len
> 0 && *utf32len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
362 u32s
[u32l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
363 UCONV_BOM_SWAPPED_32
;
366 * Do conversion; if encounter a surrogate pair, assemble high and
367 * low pair values to form a UTF-32 character. If a half of a pair
368 * exists alone, then, either it is an illegal (EILSEQ) or
369 * invalid (EINVAL) value.
371 for (; u16l
< *utf16len
; u16l
++) {
/* A zero unit is significant only when UCONV_IGNORE_NULL is not set. */
372 if (u16s
[u16l
] == 0 && do_not_ignore_null
)
/* Fetch the unit, byte-swapping it when the input is not native order. */
375 lo
= (uint32_t)((inendian
) ? u16s
[u16l
] : BSWAP_16(u16s
[u16l
]));
/* High (leading) surrogate range. */
377 if (lo
>= UCONV_U16_HI_MIN
&& lo
<= UCONV_U16_HI_MAX
) {
/* Low (trailing) surrogate range: combine it with hi. */
382 } else if (lo
>= UCONV_U16_LO_MIN
&& lo
<= UCONV_U16_LO_MAX
) {
/*
 * Ten bits come from each half and the sum is masked to 20 bits. The
 * remainder of this expression is not visible in this listing.
 */
385 lo
= (((hi
- UCONV_U16_HI_MIN
) * UCONV_U16_BIT_SHIFT
+
386 lo
- UCONV_U16_LO_MIN
) & UCONV_U16_BIT_MASK
)
/* Output capacity test precedes the store. */
393 if (u32l
>= *utf32len
)
/* Store in the requested output byte order. */
396 u32s
[u32l
++] = (outendian
) ? lo
: BSWAP_32(lo
);
400 * If high half didn't see low half, then, it's most likely the input
401 * parameter is incomplete.
407 * Save the number of consumed and saved characters. They do not
408 * include terminating NULL character (U+0000) at the end of
409 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
410 * the input buffer length is big enough to include the terminating
/*
 * uconv_u16tou8: convert UTF-16 units in u16s into UTF-8 bytes in u8s.
 *
 * u16s, utf16len - input buffer and a pointer to its length in 16-bit
 *     units; *utf16len bounds the conversion loop.
 * u8s, utf8len - output buffer and a pointer to its capacity in bytes; the
 *     write index is tested against *utf8len before each multibyte store.
 * flag - bit flags. The visible code consults the endian bits (through
 *     check_endian()), UCONV_IGNORE_NULL and UCONV_IN_ACCEPT_BOM.
 *
 * NOTE(review): local declarations, return statements and several branch
 * bodies are not visible in this listing; the comments below describe only
 * the lines shown.
 */
420 uconv_u16tou8(const uint16_t *u16s
, size_t *utf16len
,
421 uchar_t
*u8s
, size_t *utf8len
, int flag
)
429 boolean_t do_not_ignore_null
;
431 if (u16s
== NULL
|| utf16len
== NULL
)
434 if (u8s
== NULL
|| utf8len
== NULL
)
437 if (check_endian(flag
, &inendian
, &outendian
) != 0)
/* True unless the caller set UCONV_IGNORE_NULL. */
442 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
444 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
445 check_bom16(u16s
, *utf16len
, &inendian
))
/* Non-zero afterwards exactly when the input is in native byte order. */
448 inendian
&= UCONV_IN_NAT_ENDIAN
;
450 for (; u16l
< *utf16len
; u16l
++) {
/* A zero unit is significant only when UCONV_IGNORE_NULL is not set. */
451 if (u16s
[u16l
] == 0 && do_not_ignore_null
)
/* Fetch the unit, byte-swapping it when the input is not native order. */
454 lo
= (uint32_t)((inendian
) ? u16s
[u16l
] : BSWAP_16(u16s
[u16l
]));
/* High (leading) surrogate range. */
456 if (lo
>= UCONV_U16_HI_MIN
&& lo
<= UCONV_U16_HI_MAX
) {
/* Low (trailing) surrogate range: combine it with hi. */
461 } else if (lo
>= UCONV_U16_LO_MIN
&& lo
<= UCONV_U16_LO_MAX
) {
/*
 * Ten bits come from each half and the sum is masked to 20 bits. The
 * remainder of this expression is not visible in this listing.
 */
464 lo
= (((hi
- UCONV_U16_HI_MIN
) * UCONV_U16_BIT_SHIFT
+
465 lo
- UCONV_U16_LO_MIN
) & UCONV_U16_BIT_MASK
)
473 * Now we convert a UTF-32 character into a UTF-8 character.
474 * Unicode coding space is between U+0000 and U+10FFFF;
475 * anything bigger is an illegal character.
/* One byte: the value is stored unchanged. */
477 if (lo
<= UCONV_U8_ONE_BYTE
) {
480 u8s
[u8l
++] = (uchar_t
)lo
;
/* Two bytes: 110xxxxx 10xxxxxx. */
481 } else if (lo
<= UCONV_U8_TWO_BYTES
) {
/* Capacity test: index u8l + 1 must still be inside the buffer. */
482 if ((u8l
+ 1) >= *utf8len
)
484 u8s
[u8l
++] = (uchar_t
)(0xc0 | ((lo
& 0x07c0) >> 6));
485 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x003f));
/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. */
486 } else if (lo
<= UCONV_U8_THREE_BYTES
) {
/* Capacity test: index u8l + 2 must still be inside the buffer. */
487 if ((u8l
+ 2) >= *utf8len
)
489 u8s
[u8l
++] = (uchar_t
)(0xe0 | ((lo
& 0x0f000) >> 12));
490 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x00fc0) >> 6));
491 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x0003f));
/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
492 } else if (lo
<= UCONV_U8_FOUR_BYTES
) {
/* Capacity test: index u8l + 3 must still be inside the buffer. */
493 if ((u8l
+ 3) >= *utf8len
)
495 u8s
[u8l
++] = (uchar_t
)(0xf0 | ((lo
& 0x01c0000) >> 18));
496 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x003f000) >> 12));
497 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x0000fc0) >> 6));
498 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x000003f));
/*
 * uconv_u32tou16: convert UTF-32 units in u32s into UTF-16 units in u16s.
 *
 * u32s, utf32len - input buffer and a pointer to its length in 32-bit
 *     units; *utf32len bounds the conversion loop.
 * u16s, utf16len - output buffer and a pointer to its capacity in 16-bit
 *     units; the write index is tested against *utf16len before a store.
 * flag - bit flags. The visible code consults the endian bits (through
 *     check_endian()), UCONV_IGNORE_NULL, UCONV_IN_ACCEPT_BOM and
 *     UCONV_OUT_EMIT_BOM.
 *
 * NOTE(review): local declarations, return statements and several branch
 * bodies are not visible in this listing; the comments below describe only
 * the lines shown.
 */
514 uconv_u32tou16(const uint32_t *u32s
, size_t *utf32len
,
515 uint16_t *u16s
, size_t *utf16len
, int flag
)
523 boolean_t do_not_ignore_null
;
525 if (u32s
== NULL
|| utf32len
== NULL
)
528 if (u16s
== NULL
|| utf16len
== NULL
)
531 if (check_endian(flag
, &inendian
, &outendian
) != 0)
/* True unless the caller set UCONV_IGNORE_NULL. */
535 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
537 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
538 check_bom32(u32s
, *utf32len
, &inendian
))
/* After masking, each is non-zero exactly when native order was chosen. */
541 inendian
&= UCONV_IN_NAT_ENDIAN
;
542 outendian
&= UCONV_OUT_NAT_ENDIAN
;
/*
 * Emit a BOM when both buffers are non-empty and the caller asked for one.
 * The swapped alternative of this expression is not visible in this listing.
 */
544 if (*utf32len
> 0 && *utf16len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
545 u16s
[u16l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
548 for (; u32l
< *utf32len
; u32l
++) {
/* A zero unit is significant only when UCONV_IGNORE_NULL is not set. */
549 if (u32s
[u32l
] == 0 && do_not_ignore_null
)
/* Fetch the unit, byte-swapping it when the input is not native order. */
552 hi
= (inendian
) ? u32s
[u32l
] : BSWAP_32(u32s
[u32l
]);
555 * Anything bigger than the Unicode coding space, i.e.,
556 * Unicode scalar value bigger than U+10FFFF, is an illegal
559 if (hi
> UCONV_UNICODE_MAX
)
563 * Anything bigger than U+FFFF must be converted into
564 * a surrogate pair in UTF-16.
566 if (hi
>= UCONV_U16_START
) {
/*
 * The remainder feeds the low half and the quotient feeds the high half.
 * The second operand of each addition is not visible in this listing.
 */
567 lo
= ((hi
- UCONV_U16_START
) % UCONV_U16_BIT_SHIFT
) +
569 hi
= ((hi
- UCONV_U16_START
) / UCONV_U16_BIT_SHIFT
) +
/* A pair needs two units: index u16l + 1 must be inside the buffer. */
572 if ((u16l
+ 1) >= *utf16len
)
/* Unswapped stores of the pair, high half first. */
576 u16s
[u16l
++] = (uint16_t)hi
;
577 u16s
[u16l
++] = (uint16_t)lo
;
/* Byte-swapped stores of the pair, high half first. */
579 u16s
[u16l
++] = BSWAP_16(((uint16_t)hi
));
580 u16s
[u16l
++] = BSWAP_16(((uint16_t)lo
));
/* Single-unit case: capacity test, then store in the output order. */
583 if (u16l
>= *utf16len
)
585 u16s
[u16l
++] = (outendian
) ? (uint16_t)hi
:
586 BSWAP_16(((uint16_t)hi
));
/*
 * uconv_u32tou8: convert UTF-32 units in u32s into UTF-8 bytes in u8s.
 *
 * u32s, utf32len - input buffer and a pointer to its length in 32-bit
 *     units; *utf32len bounds the conversion loop.
 * u8s, utf8len - output buffer and a pointer to its capacity in bytes; the
 *     write index is tested against *utf8len before each multibyte store.
 * flag - bit flags. The visible code consults the endian bits (through
 *     check_endian()), UCONV_IGNORE_NULL and UCONV_IN_ACCEPT_BOM.
 *
 * NOTE(review): local declarations, return statements and several branch
 * bodies are not visible in this listing; the comments below describe only
 * the lines shown.
 */
597 uconv_u32tou8(const uint32_t *u32s
, size_t *utf32len
,
598 uchar_t
*u8s
, size_t *utf8len
, int flag
)
605 boolean_t do_not_ignore_null
;
607 if (u32s
== NULL
|| utf32len
== NULL
)
610 if (u8s
== NULL
|| utf8len
== NULL
)
613 if (check_endian(flag
, &inendian
, &outendian
) != 0)
/* True unless the caller set UCONV_IGNORE_NULL. */
617 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
619 if ((flag
& UCONV_IN_ACCEPT_BOM
) &&
620 check_bom32(u32s
, *utf32len
, &inendian
))
/* Non-zero afterwards exactly when the input is in native byte order. */
623 inendian
&= UCONV_IN_NAT_ENDIAN
;
625 for (; u32l
< *utf32len
; u32l
++) {
/* A zero unit is significant only when UCONV_IGNORE_NULL is not set. */
626 if (u32s
[u32l
] == 0 && do_not_ignore_null
)
/* Fetch the unit, byte-swapping it when the input is not native order. */
629 lo
= (inendian
) ? u32s
[u32l
] : BSWAP_32(u32s
[u32l
]);
/* One byte: the value is stored unchanged. */
631 if (lo
<= UCONV_U8_ONE_BYTE
) {
634 u8s
[u8l
++] = (uchar_t
)lo
;
/* Two bytes: 110xxxxx 10xxxxxx. */
635 } else if (lo
<= UCONV_U8_TWO_BYTES
) {
/* Capacity test: index u8l + 1 must still be inside the buffer. */
636 if ((u8l
+ 1) >= *utf8len
)
638 u8s
[u8l
++] = (uchar_t
)(0xc0 | ((lo
& 0x07c0) >> 6));
639 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x003f));
/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. */
640 } else if (lo
<= UCONV_U8_THREE_BYTES
) {
/* Capacity test: index u8l + 2 must still be inside the buffer. */
641 if ((u8l
+ 2) >= *utf8len
)
643 u8s
[u8l
++] = (uchar_t
)(0xe0 | ((lo
& 0x0f000) >> 12));
644 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x00fc0) >> 6));
645 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x0003f));
/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
646 } else if (lo
<= UCONV_U8_FOUR_BYTES
) {
/* Capacity test: index u8l + 3 must still be inside the buffer. */
647 if ((u8l
+ 3) >= *utf8len
)
649 u8s
[u8l
++] = (uchar_t
)(0xf0 | ((lo
& 0x01c0000) >> 18));
650 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x003f000) >> 12));
651 u8s
[u8l
++] = (uchar_t
)(0x80 | ((lo
& 0x0000fc0) >> 6));
652 u8s
[u8l
++] = (uchar_t
)(0x80 | (lo
& 0x000003f));
/*
 * uconv_u8tou16: convert UTF-8 bytes in u8s into UTF-16 units in u16s.
 *
 * u8s, utf8len - input buffer and a pointer to its length in bytes;
 *     *utf8len bounds the conversion loop.
 * u16s, utf16len - output buffer and a pointer to its capacity in 16-bit
 *     units; the write index is tested against *utf16len before a store.
 * flag - bit flags. The visible code consults the endian bits (through
 *     check_endian()), UCONV_IGNORE_NULL and UCONV_OUT_EMIT_BOM.
 *
 * NOTE(review): local declarations, return statements and several branch
 * bodies are not visible in this listing; the comments below describe only
 * the lines shown.
 */
665 uconv_u8tou16(const uchar_t
*u8s
, size_t *utf8len
,
666 uint16_t *u16s
, size_t *utf16len
, int flag
)
676 boolean_t do_not_ignore_null
;
678 if (u8s
== NULL
|| utf8len
== NULL
)
681 if (u16s
== NULL
|| utf16len
== NULL
)
684 if (check_endian(flag
, &inendian
, &outendian
) != 0)
/* True unless the caller set UCONV_IGNORE_NULL. */
688 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
/* Non-zero afterwards exactly when native output order was chosen. */
690 outendian
&= UCONV_OUT_NAT_ENDIAN
;
/*
 * Emit a BOM when both buffers are non-empty and the caller asked for one.
 * The swapped alternative of this expression is not visible in this listing.
 */
692 if (*utf8len
> 0 && *utf16len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
693 u16s
[u16l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
/* The input index is advanced inside the body, one byte at a time. */
696 for (; u8l
< *utf8len
; ) {
/* A zero byte is significant only when UCONV_IGNORE_NULL is not set. */
697 if (u8s
[u8l
] == 0 && do_not_ignore_null
)
701 * Collect a UTF-8 character and convert it to a UTF-32
702 * character. In doing so, we screen out illegally formed
703 * UTF-8 characters and treat such as illegal characters.
704 * The algorithm at below also screens out anything bigger
707 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
708 * more details on the illegal values of UTF-8 character
711 hi
= (uint32_t)u8s
[u8l
++];
/* Non-ASCII first byte: look up how many bytes must follow it. */
713 if (hi
> UCONV_ASCII_MAX
) {
714 if ((remaining_bytes
= remaining_bytes_tbl
[hi
]) == 0)
/* Keep only the payload bits of the first byte. */
718 hi
= hi
& u8_masks_tbl
[remaining_bytes
];
720 for (; remaining_bytes
> 0; remaining_bytes
--) {
722 * If we have no more bytes, the current
723 * UTF-8 character is incomplete.
728 lo
= (uint32_t)u8s
[u8l
++];
/* Range test using the per-first-byte limits for the second byte. */
731 if (lo
< valid_min_2nd_byte
[first_b
] ||
732 lo
> valid_max_2nd_byte
[first_b
])
/* Range test using the generic 0x80-0xbf continuation limits. */
735 } else if (lo
< UCONV_U8_BYTE_MIN
||
736 lo
> UCONV_U8_BYTE_MAX
) {
/* Append the low six bits of this byte to the code point. */
739 hi
= (hi
<< UCONV_U8_BIT_SHIFT
) |
740 (lo
& UCONV_U8_BIT_MASK
);
/* Code points from U+10000 up are written as a surrogate pair. */
744 if (hi
>= UCONV_U16_START
) {
/*
 * The remainder feeds the low half and the quotient feeds the high half.
 * The second operand of each addition is not visible in this listing.
 */
745 lo
= ((hi
- UCONV_U16_START
) % UCONV_U16_BIT_SHIFT
) +
747 hi
= ((hi
- UCONV_U16_START
) / UCONV_U16_BIT_SHIFT
) +
/* A pair needs two units: index u16l + 1 must be inside the buffer. */
750 if ((u16l
+ 1) >= *utf16len
)
/* Unswapped stores of the pair, high half first. */
754 u16s
[u16l
++] = (uint16_t)hi
;
755 u16s
[u16l
++] = (uint16_t)lo
;
/* Byte-swapped stores of the pair, high half first. */
757 u16s
[u16l
++] = BSWAP_16(((uint16_t)hi
));
758 u16s
[u16l
++] = BSWAP_16(((uint16_t)lo
));
/* Single-unit case: capacity test, then store in the output order. */
761 if (u16l
>= *utf16len
)
764 u16s
[u16l
++] = (outendian
) ? (uint16_t)hi
:
765 BSWAP_16(((uint16_t)hi
));
/*
 * uconv_u8tou32: convert UTF-8 bytes in u8s into UTF-32 units in u32s.
 *
 * u8s, utf8len - input buffer and a pointer to its length in bytes;
 *     *utf8len bounds the conversion loop.
 * u32s, utf32len - output buffer and a pointer to its capacity in 32-bit
 *     units; the write index is tested against *utf32len before a store.
 * flag - bit flags. The visible code consults the endian bits (through
 *     check_endian()), UCONV_IGNORE_NULL and UCONV_OUT_EMIT_BOM.
 *
 * NOTE(review): local declarations, return statements and several branch
 * bodies are not visible in this listing; the comments below describe only
 * the lines shown.
 */
776 uconv_u8tou32(const uchar_t
*u8s
, size_t *utf8len
,
777 uint32_t *u32s
, size_t *utf32len
, int flag
)
787 boolean_t do_not_ignore_null
;
789 if (u8s
== NULL
|| utf8len
== NULL
)
792 if (u32s
== NULL
|| utf32len
== NULL
)
795 if (check_endian(flag
, &inendian
, &outendian
) != 0)
/* True unless the caller set UCONV_IGNORE_NULL. */
799 do_not_ignore_null
= ((flag
& UCONV_IGNORE_NULL
) == 0);
/* Non-zero afterwards exactly when native output order was chosen. */
801 outendian
&= UCONV_OUT_NAT_ENDIAN
;
/* The BOM value is picked so that it reads as U+FEFF in the output order. */
803 if (*utf8len
> 0 && *utf32len
> 0 && (flag
& UCONV_OUT_EMIT_BOM
))
804 u32s
[u32l
++] = (outendian
) ? UCONV_BOM_NORMAL
:
805 UCONV_BOM_SWAPPED_32
;
/* The input index is advanced inside the body, one byte at a time. */
807 for (; u8l
< *utf8len
; ) {
/* A zero byte is significant only when UCONV_IGNORE_NULL is not set. */
808 if (u8s
[u8l
] == 0 && do_not_ignore_null
)
811 hi
= (uint32_t)u8s
[u8l
++];
/* Non-ASCII first byte: look up how many bytes must follow it. */
813 if (hi
> UCONV_ASCII_MAX
) {
814 if ((remaining_bytes
= remaining_bytes_tbl
[hi
]) == 0)
/* Keep only the payload bits of the first byte. */
818 hi
= hi
& u8_masks_tbl
[remaining_bytes
];
820 for (; remaining_bytes
> 0; remaining_bytes
--) {
824 c
= (uint32_t)u8s
[u8l
++];
/* Range test using the per-first-byte limits for the second byte. */
827 if (c
< valid_min_2nd_byte
[first_b
] ||
828 c
> valid_max_2nd_byte
[first_b
])
/* Range test using the generic 0x80-0xbf continuation limits. */
831 } else if (c
< UCONV_U8_BYTE_MIN
||
832 c
> UCONV_U8_BYTE_MAX
) {
/* Append the low six bits of this byte to the code point. */
835 hi
= (hi
<< UCONV_U8_BIT_SHIFT
) |
836 (c
& UCONV_U8_BIT_MASK
);
/* Output capacity test precedes the store. */
840 if (u32l
>= *utf32len
)
/* Store in the requested output byte order. */
843 u32s
[u32l
++] = (outendian
) ? hi
: BSWAP_32(hi
);
/*
 * Export the six conversion entry points. EXPORT_SYMBOL is not defined in
 * this file.
 */
853 EXPORT_SYMBOL(uconv_u16tou32
);
854 EXPORT_SYMBOL(uconv_u16tou8
);
855 EXPORT_SYMBOL(uconv_u32tou16
);
856 EXPORT_SYMBOL(uconv_u32tou8
);
857 EXPORT_SYMBOL(uconv_u8tou16
);
858 EXPORT_SYMBOL(uconv_u8tou32
);