4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/debug.h>
34 #include <sys/sunddi.h>
35 #include <sys/byteorder.h>
36 #include <sys/errno.h>
37 #include <sys/u8_textprep.h>
38 #include <sys/kiconv.h>
39 #include <sys/kiconv_cck_common.h>
44 * Common kiconv_open method for UTF-8 -> CCK conversion.
51 st
= (kiconv_state_t
)kmem_alloc(sizeof (kiconv_state_data_t
), KM_SLEEP
);
53 st
->bom_processed
= 0;
59 * Common kiconv_close method for UTF-8 -> CCK conversion.
62 kiconv_close_to_cck(void *kcd
)
64 if (! kcd
|| kcd
== (void *)-1)
67 kmem_free(kcd
, sizeof (kiconv_state_data_t
));
73 * Common routine to convert UTF-8 sequence to CCK legal character sequence.
76 kiconv_utf8_to_cck(void *kcd
, char **inbuf
, size_t *inbytesleft
,
77 char **outbuf
, size_t *outbytesleft
, int *errno
,
78 kiconv_utf8tocck_t ptr_utf8tocck
)
86 size_t i
; /* temp variable in for loop */
90 /* Check on the kiconv code conversion descriptor. */
91 if (! kcd
|| kcd
== (void *)-1) {
96 /* If this is a state reset request, process and return. */
97 if (! inbuf
|| !(*inbuf
)) {
98 ((kiconv_state_t
)kcd
)->bom_processed
= 0;
103 ib
= (uchar_t
*)*inbuf
;
104 ob
= (uchar_t
*)*outbuf
;
105 ibtail
= ib
+ *inbytesleft
;
106 obtail
= ob
+ *outbytesleft
;
108 KICONV_CHECK_UTF8_BOM(ib
, ibtail
);
110 while (ib
< ibtail
) {
111 sz
= u8_number_of_bytes
[*ib
];
114 * If it is a 7-bit ASCII character, we don't need to
115 * process further and we just copy the character over.
117 * If not, we connect the character bytes up to four bytes,
118 * validate the bytes, and binary search for the corresponding
119 * table. If we find it from the mapping table, we put that
120 * into the output buffer; otherwise, we put a replacement
121 * character instead as a non-identical conversion.
125 KICONV_SET_ERRNO_AND_BREAK(E2BIG
);
133 * Issue EILSEQ error if the first byte is an
134 * invalid UTF-8 character leading byte.
137 KICONV_SET_ERRNO_AND_BREAK(EILSEQ
);
141 * Issue EINVAL error if input buffer has an incomplete
142 * character at the end of the buffer.
144 if (ibtail
- ib
< sz
) {
145 KICONV_SET_ERRNO_AND_BREAK(EINVAL
);
149 * We collect UTF-8 character bytes and also check if this
150 * is a valid UTF-8 character without any bogus bytes based
151 * on the latest UTF-8 binary representation.
156 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib
, u8
))
157 goto ILLEGAL_CHAR_PROCESS
;
158 u8
= (u8
<< 8) | *ib
++;
160 for (i
= 2; i
< sz
; i
++) {
161 if (*ib
< 0x80 || *ib
> 0xbf) {
162 ILLEGAL_CHAR_PROCESS
:
164 ret_val
= (size_t)-1;
166 goto ILLEGAL_CHAR_ERR
;
169 u8
= (u8
<< 8) | *ib
++;
172 /* Now we have a valid UTF-8 character. */
173 sz
= ptr_utf8tocck(u8
, &ib
, ibtail
, ob
, obtail
, &ret_val
);
176 KICONV_SET_ERRNO_AND_BREAK(E2BIG
);
184 *inbytesleft
= ibtail
- ib
;
185 *outbuf
= (char *)ob
;
186 *outbytesleft
= obtail
- ob
;
192 kiconvstr_utf8_to_cck(uchar_t
*ib
, size_t *inlen
, uchar_t
*ob
, size_t *outlen
,
193 int flag
, int *errno
, kiconv_utf8tocck_t ptr_utf8tocck
)
199 size_t i
; /* temp variable in for loop */
202 boolean_t do_not_ignore_null
;
205 ibtail
= ib
+ *inlen
;
206 obtail
= ob
+ *outlen
;
207 do_not_ignore_null
= ((flag
& KICONV_IGNORE_NULL
) == 0);
209 KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib
, ibtail
);
211 while (ib
< ibtail
) {
212 if (*ib
== '\0' && do_not_ignore_null
)
215 sz
= u8_number_of_bytes
[*ib
];
219 KICONV_SET_ERRNO_AND_BREAK(E2BIG
);
229 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ
);
232 if (ibtail
- ib
< sz
) {
233 if (flag
& KICONV_REPLACE_INVALID
) {
235 goto REPLACE_INVALID
;
238 KICONV_SET_ERRNO_AND_BREAK(EINVAL
);
243 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib
, u8
))
244 goto ILLEGAL_CHAR_PROCESS
;
245 u8
= (u8
<< 8) | *ib
++;
247 for (i
= 2; i
< sz
; i
++) {
248 if (*ib
< 0x80 || *ib
> 0xbf) {
249 ILLEGAL_CHAR_PROCESS
:
250 if (flag
& KICONV_REPLACE_INVALID
) {
252 goto REPLACE_INVALID
;
256 ret_val
= (size_t)-1;
258 goto ILLEGAL_CHAR_ERR
;
261 u8
= (u8
<< 8) | *ib
++;
264 /* Now we get a valid character encoded in UTF-8. */
265 sz
= ptr_utf8tocck(u8
, &ib
, ibtail
, ob
, obtail
, &ret_val
);
268 KICONV_SET_ERRNO_AND_BREAK(E2BIG
);
277 KICONV_SET_ERRNO_AND_BREAK(E2BIG
);
280 *ob
++ = KICONV_ASCII_REPLACEMENT_CHAR
;
285 *inlen
= ibtail
- ib
;
286 *outlen
= obtail
- ob
;
292 * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1]. Return 0 if not found.
293 * tbl[0] is a special element for non-identical conversion.
296 kiconv_binsearch(uint32_t key
, void *tbl
, size_t nitems
)
298 size_t low
, high
, mid
;
299 kiconv_table_t
*table
;
303 table
= (kiconv_table_t
*)tbl
;
305 while (low
<= high
) {
306 mid
= (low
+ high
) / 2;
308 if (key
< table
[mid
].key
)
310 else if (key
> table
[mid
].key
)