Merge remote-tracking branch 'origin/master'
[unleashed/lotheac.git] / usr / src / uts / common / kiconv / kiconv_tc / kiconv_tc.c
blob1c5e6e0db05c3939eac69be85d5787a5150cd792
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/sunddi.h>
35 #include <sys/byteorder.h>
36 #include <sys/errno.h>
37 #include <sys/modctl.h>
38 #include <sys/u8_textprep.h>
39 #include <sys/kiconv.h>
40 #include <sys/kiconv_cck_common.h>
41 #include <sys/kiconv_tc.h>
42 #include <sys/kiconv_big5_utf8.h>
43 #include <sys/kiconv_euctw_utf8.h>
44 #include <sys/kiconv_hkscs_utf8.h>
45 #include <sys/kiconv_cp950hkscs_utf8.h>
46 #include <sys/kiconv_utf8_big5.h>
47 #include <sys/kiconv_utf8_euctw.h>
48 #include <sys/kiconv_utf8_cp950hkscs.h>
49 #include <sys/kiconv_utf8_hkscs.h>
51 /* 4 HKSCS-2004 code points map to 2 Unicode code points separately. */
52 static uchar_t hkscs_special_sequence[][4] = {
53 { 0xc3, 0x8a, 0xcc, 0x84 }, /* 0x8862 */
54 { 0xc3, 0x8a, 0xcc, 0x8c }, /* 0x8864 */
55 { 0xc3, 0xaa, 0xcc, 0x84 }, /* 0x88a3 */
56 { 0xc3, 0xaa, 0xcc, 0x8c } /* 0x88a5 */
/*
 * HKSCS-2004 code points for U+00ca and U+00ea, alone and followed by a
 * combining mark.  The table holds three entries per base letter, in the
 * order: base letter alone, base + U+0304, base + U+030c.
 * utf8_to_big5hkscs() computes the index as (group * 3) + offset, so the
 * order of the entries must not change.  The table is only ever read,
 * hence const.
 */
static const uint32_t ucs_special_sequence[] = {
	0x8866,		/* U+00ca */
	0x8862,		/* U+00ca U+0304 */
	0x8864,		/* U+00ca U+030c */
	0x88a7,		/* U+00ea */
	0x88a3,		/* U+00ea U+0304 */
	0x88a5		/* U+00ea U+030c */
};
69 typedef int8_t (*kiconv_big5toutf8_t)(uint32_t value, uchar_t *ob,
70 uchar_t *obtail, size_t *ret_val);
72 static int8_t utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
73 uchar_t *ob, uchar_t *obtail, size_t *ret_val);
74 static int8_t utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
75 uchar_t *ob, uchar_t *obtail, size_t *ret_val);
76 static int8_t utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf,
77 uchar_t *ibtail, uchar_t *ob, uchar_t *obtail, size_t *ret_val);
78 static int8_t utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
79 uchar_t *ob, uchar_t *obtail, size_t *ret_val);
80 static int8_t big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
81 size_t *ret_val);
82 static int8_t big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
83 uchar_t *obtail, size_t *ret_val);
84 static int8_t cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
85 uchar_t *obtail, size_t *ret_val);
86 static int8_t euctw_to_utf8(size_t plane_no, uint32_t euctw_val,
87 uchar_t *ob, uchar_t *obtail, size_t *ret_val);
88 static uint32_t get_unicode_from_UDA(size_t plane_no, uchar_t byte1,
89 uchar_t byte2);
/*
 * Magic identifiers returned by the open routines below.  The value is
 * stored directly in the descriptor pointer; no memory is allocated.
 */
#define	KICONV_TC_BIG5			(0x01)
#define	KICONV_TC_BIG5HKSCS		(0x02)
#define	KICONV_TC_CP950HKSCS		(0x03)
#define	KICONV_TC_EUCTW			(0x04)
#define	KICONV_TC_MAX_MAGIC_ID		(0x04)

/*
 * Open routine for BIG5 to UTF-8: returns the BIG5 magic id as descriptor.
 */
static void *
open_fr_big5(void)
{
	return ((void *)(uintptr_t)KICONV_TC_BIG5);
}

/*
 * Open routine for BIG5-HKSCS to UTF-8: returns the BIG5-HKSCS magic id.
 */
static void *
open_fr_big5hkscs(void)
{
	return ((void *)(uintptr_t)KICONV_TC_BIG5HKSCS);
}

/*
 * Open routine for CP950-HKSCS to UTF-8: returns the CP950-HKSCS magic id.
 */
static void *
open_fr_cp950hkscs(void)
{
	return ((void *)(uintptr_t)KICONV_TC_CP950HKSCS);
}

/*
 * Open routine for EUC-TW to UTF-8: returns the EUC-TW magic id.
 */
static void *
open_fr_euctw(void)
{
	return ((void *)(uintptr_t)KICONV_TC_EUCTW);
}

/*
 * Close routine shared by all "from" conversions.
 * Return: 0     - descriptor value is not above the largest magic id
 *         EBADF - descriptor value is above the largest magic id
 */
static int
close_fr_tc(void *s)
{
	if ((uintptr_t)s > KICONV_TC_MAX_MAGIC_ID)
		return (EBADF);

	return (0);
}
131 * Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS) to UTF-8.
133 static size_t
134 kiconv_fr_big5_common(void *kcd, char **inbuf, size_t *inbytesleft,
135 char **outbuf, size_t *outbytesleft, int *errno,
136 kiconv_big5toutf8_t ptr_big5touf8)
138 uchar_t *ib;
139 uchar_t *ob;
140 uchar_t *ibtail;
141 uchar_t *obtail;
142 size_t ret_val;
143 int8_t sz;
144 uint32_t big5_val;
146 /* Check on the kiconv code conversion descriptor. */
147 if (kcd == NULL || kcd == (void *)-1) {
148 *errno = EBADF;
149 return ((size_t)-1);
152 /* If this is a state reset request, process and return. */
153 if (inbuf == NULL || *inbuf == NULL) {
154 return (0);
157 ret_val = 0;
158 ib = (uchar_t *)*inbuf;
159 ob = (uchar_t *)*outbuf;
160 ibtail = ib + *inbytesleft;
161 obtail = ob + *outbytesleft;
163 while (ib < ibtail) {
164 if (KICONV_IS_ASCII(*ib)) {
165 if (ob >= obtail) {
166 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
169 *ob++ = *ib++;
170 continue;
174 * Issue EILSEQ error if the first byte is not a
175 * valid BIG5/HKSCS leading byte.
177 if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
178 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
182 * Issue EINVAL error if input buffer has an incomplete
183 * character at the end of the buffer.
185 if (ibtail - ib < 2) {
186 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
190 * Issue EILSEQ error if the remaining bytes is not
191 * a valid BIG5/HKSCS byte.
193 if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
194 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
197 /* Now we have a valid BIG5/HKSCS character. */
198 big5_val = (uint32_t)(*ib) << 8 | *(ib + 1);
199 sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);
201 if (sz < 0) {
202 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
205 ib += 2;
206 ob += sz;
209 *inbuf = (char *)ib;
210 *inbytesleft = ibtail - ib;
211 *outbuf = (char *)ob;
212 *outbytesleft = obtail - ob;
214 return (ret_val);
218 * String based Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS)
219 * to UTF-8.
221 static size_t
222 kiconvstr_fr_big5_common(uchar_t *ib, size_t *inlen, uchar_t *ob,
223 size_t *outlen, int flag, int *errno,
224 kiconv_big5toutf8_t ptr_big5touf8)
226 uchar_t *oldib;
227 uchar_t *ibtail;
228 uchar_t *obtail;
229 size_t ret_val;
230 int8_t sz;
231 uint32_t big5_val;
232 boolean_t do_not_ignore_null;
234 ret_val = 0;
235 ibtail = ib + *inlen;
236 obtail = ob + *outlen;
237 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
239 while (ib < ibtail) {
240 if (*ib == '\0' && do_not_ignore_null)
241 break;
243 if (KICONV_IS_ASCII(*ib)) {
244 if (ob >= obtail) {
245 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
248 *ob++ = *ib++;
249 continue;
252 oldib = ib;
254 if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
255 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
258 if (ibtail - ib < 2) {
259 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
262 if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
263 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
266 big5_val = *ib++;
267 big5_val = (big5_val << 8) | *ib++;
268 sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);
270 if (sz < 0) {
271 ib = oldib;
272 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
275 ob += sz;
276 continue;
278 REPLACE_INVALID:
279 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
280 ib = oldib;
281 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
284 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
285 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
286 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
287 ret_val++;
290 *inlen = ibtail - ib;
291 *outlen = obtail - ob;
293 return (ret_val);
297 * Encoding convertor from BIG5 to UTF-8.
299 static size_t
300 kiconv_fr_big5(void *kcd, char **inbuf, size_t *inbytesleft, char **outbuf,
301 size_t *outbytesleft, int *errno)
303 return (kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
304 outbytesleft, errno, big5_to_utf8));
308 * String based encoding convertor from BIG5 to UTF-8.
310 static size_t
311 kiconvstr_fr_big5(char *inarray, size_t *inlen, char *outarray,
312 size_t *outlen, int flag, int *errno)
314 return (kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
315 (uchar_t *)outarray, outlen, flag, errno,
316 big5_to_utf8));
320 * Encoding convertor from BIG5-HKSCS to UTF-8.
322 static size_t
323 kiconv_fr_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
324 char **outbuf, size_t *outbytesleft, int *errno)
326 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
327 outbytesleft, errno, big5hkscs_to_utf8);
331 * String based encoding convertor from BIG5-HKSCS to UTF-8.
333 static size_t
334 kiconvstr_fr_big5hkscs(char *inarray, size_t *inlen, char *outarray,
335 size_t *outlen, int flag, int *errno)
337 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
338 (uchar_t *)outarray, outlen, flag, errno, big5hkscs_to_utf8);
342 * Encoding convertor from CP950-HKSCS to UTF-8.
344 static size_t
345 kiconv_fr_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
346 char **outbuf, size_t *outbytesleft, int *errno)
348 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
349 outbytesleft, errno, cp950hkscs_to_utf8);
353 * String based encoding convertor from CP950-HKSCS to UTF-8.
355 static size_t
356 kiconvstr_fr_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
357 size_t *outlen, int flag, int *errno)
359 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
360 (uchar_t *)outarray, outlen, flag, errno, cp950hkscs_to_utf8);
364 * Encoding convertor from EUC-TW to UTF-8.
366 static size_t
367 kiconv_fr_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
368 char **outbuf, size_t *outbytesleft, int *errno)
370 uchar_t *ib;
371 uchar_t *ob;
372 uchar_t *ibtail;
373 uchar_t *obtail;
374 uchar_t *oldib;
375 size_t ret_val;
376 size_t plane_no;
377 int8_t sz;
378 uint32_t euctw_val;
379 boolean_t isplane1;
381 /* Check on the kiconv code conversion descriptor. */
382 if (kcd == NULL || kcd == (void *)-1) {
383 *errno = EBADF;
384 return ((size_t)-1);
387 /* If this is a state reset request, process and return. */
388 if (inbuf == NULL || *inbuf == NULL) {
389 return (0);
392 ret_val = 0;
393 ib = (uchar_t *)*inbuf;
394 ob = (uchar_t *)*outbuf;
395 ibtail = ib + *inbytesleft;
396 obtail = ob + *outbytesleft;
398 while (ib < ibtail) {
399 if (KICONV_IS_ASCII(*ib)) {
400 if (ob >= obtail) {
401 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
404 *ob++ = *ib++;
405 continue;
409 * Issue EILSEQ error if the first byte is not a
410 * valid EUC-TW leading byte.
412 if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
413 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
416 isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
417 B_FALSE : B_TRUE;
420 * Issue EINVAL error if input buffer has an incomplete
421 * character at the end of the buffer.
423 if (ibtail - ib < (isplane1 ? 2 : 4)) {
424 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
427 oldib = ib;
428 plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;
431 * Issue EILSEQ error if the remaining bytes are not
432 * valid EUC-TW bytes.
434 if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
435 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
438 if (! isplane1)
439 ib += 2;
441 /* Now we have a valid EUC-TW character. */
442 euctw_val = *ib++;
443 euctw_val = (euctw_val << 8) | *ib++;
444 sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);
446 if (sz < 0) {
447 ib = oldib;
448 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
451 ob += sz;
454 *inbuf = (char *)ib;
455 *inbytesleft = ibtail - ib;
456 *outbuf = (char *)ob;
457 *outbytesleft = obtail - ob;
459 return (ret_val);
463 * String based encoding convertor from EUC-TW to UTF-8.
465 static size_t
466 kiconvstr_fr_euctw(char *inarray, size_t *inlen, char *outarray,
467 size_t *outlen, int flag, int *errno)
469 uchar_t *ib;
470 uchar_t *ob;
471 uchar_t *ibtail;
472 uchar_t *obtail;
473 uchar_t *oldib;
474 size_t ret_val;
475 size_t plane_no;
476 int8_t sz;
477 uint32_t euctw_val;
478 boolean_t isplane1;
479 boolean_t do_not_ignore_null;
481 ret_val = 0;
482 ib = (uchar_t *)inarray;
483 ob = (uchar_t *)outarray;
484 ibtail = ib + *inlen;
485 obtail = ob + *outlen;
486 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
488 while (ib < ibtail) {
489 if (*ib == '\0' && do_not_ignore_null)
490 break;
492 if (KICONV_IS_ASCII(*ib)) {
493 if (ob >= obtail) {
494 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
497 *ob++ = *ib++;
498 continue;
501 oldib = ib;
503 if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
504 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
507 isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
508 B_FALSE : B_TRUE;
510 if (ibtail - ib < (isplane1 ? 2 : 4)) {
511 if (flag & KICONV_REPLACE_INVALID) {
512 ib = ibtail;
513 goto REPLACE_INVALID;
516 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
519 plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;
521 if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
522 KICONV_SET_ERRNO_WITH_FLAG(isplane1 ? 2 : 4, EILSEQ);
525 if (! isplane1)
526 ib += 2;
528 euctw_val = *ib++;
529 euctw_val = (euctw_val << 8) | *ib++;
530 sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);
532 if (sz < 0) {
533 ib = oldib;
534 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
537 ob += sz;
538 continue;
540 REPLACE_INVALID:
541 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
542 ib = oldib;
543 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
546 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
547 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
548 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
549 ret_val++;
552 *inlen = ibtail - ib;
553 *outlen = obtail - ob;
555 return (ret_val);
559 * Encoding convertor from UTF-8 to BIG5.
561 static size_t
562 kiconv_to_big5(void *kcd, char **inbuf, size_t *inbytesleft,
563 char **outbuf, size_t *outbytesleft, int *errno)
565 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
566 outbytesleft, errno, utf8_to_big5);
570 * String based encoding convertor from UTF-8 to BIG5.
572 static size_t
573 kiconvstr_to_big5(char *inarray, size_t *inlen, char *outarray,
574 size_t *outlen, int flag, int *errno)
576 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
577 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5);
581 * Encoding convertor from UTF-8 to EUC-TW.
583 static size_t
584 kiconv_to_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
585 char **outbuf, size_t *outbytesleft, int *errno)
587 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
588 outbytesleft, errno, utf8_to_euctw);
592 * String based encoding convertor from UTF-8 to EUC-TW.
594 static size_t
595 kiconvstr_to_euctw(char *inarray, size_t *inlen, char *outarray,
596 size_t *outlen, int flag, int *errno)
598 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
599 (uchar_t *)outarray, outlen, flag, errno, utf8_to_euctw);
603 * Encoding convertor from UTF-8 to CP950HKSCS.
605 static size_t
606 kiconv_to_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
607 char **outbuf, size_t *outbytesleft, int *errno)
609 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
610 outbytesleft, errno, utf8_to_cp950hkscs);
614 * String based encoding convertor from UTF-8 to CP950HKSCS.
616 static size_t
617 kiconvstr_to_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
618 size_t *outlen, int flag, int *errno)
620 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
621 (uchar_t *)outarray, outlen, flag, errno, utf8_to_cp950hkscs);
625 * Encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
627 static size_t
628 kiconv_to_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
629 char **outbuf, size_t *outbytesleft, int *errno)
631 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
632 outbytesleft, errno, utf8_to_big5hkscs);
636 * String based encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
638 static size_t
639 kiconvstr_to_big5hkscs(char *inarray, size_t *inlen, char *outarray,
640 size_t *outlen, int flag, int *errno)
642 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
643 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5hkscs);
647 * Common convertor from single BIG5/CP950-HKSCS character to UTF-8.
648 * Return: > 0 - Converted successfully
649 * = -1 - E2BIG
651 static int8_t
652 big5_to_utf8_common(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
653 size_t *ret_val, kiconv_table_array_t *table, size_t nitems)
655 size_t index;
656 int8_t sz;
657 uchar_t *u8;
659 index = kiconv_binsearch(big5_val, table, nitems);
660 u8 = table[index].u8;
661 sz = u8_number_of_bytes[u8[0]];
663 if (obtail - ob < sz) {
664 *ret_val = (size_t)-1;
665 return (-1);
668 if (index == 0)
669 (*ret_val)++; /* Non-identical conversion */
671 for (index = 0; index < sz; index++)
672 *ob++ = u8[index];
674 return (sz);
678 * Convert single BIG5 character to UTF-8.
680 static int8_t
681 big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val)
683 return (big5_to_utf8_common(big5_val, ob, obtail, ret_val,
684 kiconv_big5_utf8, KICONV_BIG5_UTF8_MAX));
688 * Convert single CP950-HKSCS character to UTF-8.
690 static int8_t
691 cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
692 size_t *ret_val)
694 return (big5_to_utf8_common(hkscs_val, ob, obtail, ret_val,
695 kiconv_cp950hkscs_utf8, KICONV_CP950HKSCS_UTF8_MAX));
699 * Calculate unicode value for some CNS planes which fall in Unicode
700 * UDA range.
702 static uint32_t
703 get_unicode_from_UDA(size_t plane_no, uchar_t b1, uchar_t b2)
706 * CNS Plane 15 is pre-allocated, so need move Plane 16 to back 15
707 * to compute the Unicode value.
709 if (plane_no == 16)
710 --plane_no;
712 /* 0xF0000 + (plane_no - 12) * 8836 + (b1 - 0xA1) * 94 + (b2 - 0xA1) */
713 return (8836 * plane_no + 94 * b1 + b2 + 0xD2611);
717 * Convert single EUC-TW character to UTF-8.
718 * Return: > 0 - Converted successfully
719 * = -1 - E2BIG
721 static int8_t
722 euctw_to_utf8(size_t plane_no, uint32_t euctw_val, uchar_t *ob,
723 uchar_t *obtail, size_t *ret_val)
725 uint32_t u32;
726 size_t index;
727 int8_t sz;
728 uchar_t udc[4];
729 uchar_t *u8;
731 switch (plane_no) {
732 case 1:
733 index = kiconv_binsearch(euctw_val, kiconv_cns1_utf8,
734 KICONV_CNS1_UTF8_MAX);
735 u8 = kiconv_cns1_utf8[index].u8;
736 break;
737 case 2:
738 index = kiconv_binsearch(euctw_val, kiconv_cns2_utf8,
739 KICONV_CNS2_UTF8_MAX);
740 u8 = kiconv_cns2_utf8[index].u8;
741 break;
742 case 3:
743 index = kiconv_binsearch(euctw_val, kiconv_cns3_utf8,
744 KICONV_CNS3_UTF8_MAX);
745 u8 = kiconv_cns3_utf8[index].u8;
746 break;
747 case 4:
748 index = kiconv_binsearch(euctw_val, kiconv_cns4_utf8,
749 KICONV_CNS4_UTF8_MAX);
750 u8 = kiconv_cns4_utf8[index].u8;
751 break;
752 case 5:
753 index = kiconv_binsearch(euctw_val, kiconv_cns5_utf8,
754 KICONV_CNS5_UTF8_MAX);
755 u8 = kiconv_cns5_utf8[index].u8;
756 break;
757 case 6:
758 index = kiconv_binsearch(euctw_val, kiconv_cns6_utf8,
759 KICONV_CNS6_UTF8_MAX);
760 u8 = kiconv_cns6_utf8[index].u8;
761 break;
762 case 7:
763 index = kiconv_binsearch(euctw_val, kiconv_cns7_utf8,
764 KICONV_CNS7_UTF8_MAX);
765 u8 = kiconv_cns7_utf8[index].u8;
766 break;
767 case 12:
768 case 13:
769 case 14:
770 case 16:
771 u32 = get_unicode_from_UDA(plane_no,
772 (euctw_val & 0xFF00) >> 8, euctw_val & 0xFF);
774 * As U+F0000 <= u32 <= U+F8A0F, so its UTF-8 sequence
775 * will occupy 4 bytes.
777 udc[0] = 0xF3;
778 udc[1] = (uchar_t)(0x80 | (u32 & 0x03F000) >> 12);
779 udc[2] = (uchar_t)(0x80 | (u32 & 0x000FC0) >> 6);
780 udc[3] = (uchar_t)(0x80 | (u32 & 0x00003F));
781 u8 = udc;
782 index = 1;
783 break;
784 case 15:
785 index = kiconv_binsearch(euctw_val, kiconv_cns15_utf8,
786 KICONV_CNS15_UTF8_MAX);
787 u8 = kiconv_cns15_utf8[index].u8;
788 break;
789 default:
790 index = 0;
791 u8 = kiconv_cns1_utf8[index].u8;
794 sz = u8_number_of_bytes[u8[0]];
795 if (obtail - ob < sz) {
796 *ret_val = (size_t)-1;
797 return (-1);
800 if (index == 0)
801 (*ret_val)++;
803 for (index = 0; index < sz; index++)
804 *ob++ = u8[index];
806 return (sz);
810 * Convert single HKSCS character to UTF-8.
811 * Return: > 0 - Converted successfully
812 * = -1 - E2BIG
814 static int8_t
815 big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
816 size_t *ret_val)
818 size_t index;
819 int8_t sz;
820 uchar_t *u8;
822 index = kiconv_binsearch(hkscs_val, kiconv_hkscs_utf8,
823 KICONV_HKSCS_UTF8_MAX);
824 u8 = kiconv_hkscs_utf8[index].u8;
827 * Single HKSCS-2004 character may map to 2 Unicode
828 * code points.
830 if (u8[0] == 0xFF) {
831 u8 = hkscs_special_sequence[u8[1]];
832 sz = 4;
833 } else {
834 sz = u8_number_of_bytes[u8[0]];
837 if (obtail - ob < sz) {
838 *ret_val = (size_t)-1;
839 return (-1);
842 if (index == 0)
843 (*ret_val)++; /* Non-identical conversion. */
845 for (index = 0; index < sz; index++)
846 *ob++ = u8[index];
848 return (sz);
852 * Convert single UTF-8 character to EUC-TW.
853 * Return: > 0 - Converted successfully
854 * = -1 - E2BIG
856 /* ARGSUSED */
857 static int8_t
858 utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
859 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
861 size_t index;
862 size_t plane_no;
863 uchar_t byte1;
864 uchar_t byte2;
866 if (utf8 >= KICONV_TC_UDA_UTF8_START &&
867 utf8 <= KICONV_TC_UDA_UTF8_END) {
869 * Calculate EUC-TW code if utf8 is in Unicode
870 * Private Plane 15.
872 index = (((utf8 & 0x7000000) >> 6) | ((utf8 & 0x3F0000) >> 4) |
873 ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) -
874 KICONV_TC_UDA_UCS4_START;
875 plane_no = 12 + index / 8836;
876 byte1 = 0xA1 + (index % 8836) / 94;
877 byte2 = 0xA1 + index % 94;
879 /* CNS Plane 15 is pre-allocated, so place it into Plane 16. */
880 if (plane_no == 15)
881 plane_no = 16;
882 } else {
883 uint32_t euctw_val;
885 index = kiconv_binsearch(utf8, kiconv_utf8_euctw,
886 KICONV_UTF8_EUCTW_MAX);
888 if (index == 0) {
889 if (ob >= obtail) {
890 *ret_val = (size_t)-1;
891 return (-1);
894 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
895 (*ret_val)++;
897 return (1);
900 euctw_val = kiconv_utf8_euctw[index].value;
901 byte1 = (euctw_val & 0xFF00) >> 8;
902 byte2 = euctw_val & 0xFF;
903 plane_no = euctw_val >> 16;
906 if (obtail - ob < (plane_no == 1 ? 2 : 4)) {
907 *ret_val = (size_t)-1;
908 return (-1);
911 if (plane_no != 1) {
912 *ob++ = KICONV_TC_EUCTW_MBYTE;
913 *ob++ = KICONV_TC_EUCTW_PMASK + plane_no;
916 *ob++ = byte1;
917 *ob = byte2;
919 return (plane_no == 1 ? 2 : 4);
923 * Convert single UTF-8 character to BIG5-HKSCS
924 * Return: > 0 - Converted successfully
925 * = -1 - E2BIG
927 static int8_t
928 utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
929 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
931 size_t index;
932 int8_t hkscslen;
933 uint32_t hkscscode;
934 boolean_t special_sequence = B_FALSE;
936 index = kiconv_binsearch(utf8, kiconv_utf8_hkscs,
937 KICONV_UTF8_HKSCS_MAX);
938 hkscscode = kiconv_utf8_hkscs[index].value;
941 * There are 4 special code points in HKSCS-2004 which mapped
942 * to 2 UNICODE code points.
944 if ((int32_t)hkscscode < 0) {
945 size_t special_index = (-(int32_t)hkscscode - 1) * 3;
947 /* Check the following 2 bytes. */
948 if (ibtail - *inbuf >= 2 && **inbuf == 0xcc &&
949 (*(*inbuf + 1) == 0x84 || *(*inbuf + 1) == 0x8c)) {
950 special_index += (*(*inbuf + 1) == 0x84 ? 1 : 2);
951 special_sequence = B_TRUE;
954 hkscscode = ucs_special_sequence[special_index];
957 hkscslen = (hkscscode <= 0xFF) ? 1 : 2;
958 if (obtail - ob < hkscslen) {
959 *ret_val = (size_t)-1;
960 return (-1);
963 if (index == 0)
964 (*ret_val)++;
966 if (hkscslen > 1)
967 *ob++ = (uchar_t)(hkscscode >> 8);
968 *ob = (uchar_t)(hkscscode & 0xFF);
970 if (special_sequence) { /* Advance for special sequence */
971 (*inbuf) += 2;
974 return (hkscslen);
978 * Common convertor for UTF-8 to BIG5/CP950-HKSCS.
979 * Return: > 0 - Converted successfully
980 * = -1 - E2BIG
982 static int8_t
983 utf8_to_big5_common(uint32_t utf8, uchar_t *ob, uchar_t *obtail,
984 size_t *ret_val, kiconv_table_t *table, size_t nitems)
986 size_t index;
987 int8_t big5len;
988 uint32_t big5code;
990 index = kiconv_binsearch(utf8, table, nitems);
991 big5code = table[index].value;
992 big5len = (big5code <= 0xFF) ? 1 : 2;
994 if (obtail - ob < big5len) {
995 *ret_val = (size_t)-1;
996 return (-1);
999 if (index == 0)
1000 (*ret_val)++;
1002 if (big5len > 1)
1003 *ob++ = (uchar_t)(big5code >> 8);
1004 *ob = (uchar_t)(big5code & 0xFF);
1006 return (big5len);
1010 * Convert single UTF-8 character to BIG5.
1012 /* ARGSUSED */
1013 static int8_t
1014 utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1015 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1017 return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1018 kiconv_utf8_big5, KICONV_UTF8_BIG5_MAX));
1022 * Convert single UTF-8 character to CP950-HKSCS for Windows compatibility.
1024 /* ARGSUSED */
1025 static int8_t
1026 utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1027 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1029 return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1030 kiconv_utf8_cp950hkscs, KICONV_UTF8_CP950HKSCS));
1033 static kiconv_ops_t kiconv_tc_ops_tbl[] = {
1035 "big5", "utf-8", kiconv_open_to_cck, kiconv_to_big5,
1036 kiconv_close_to_cck, kiconvstr_to_big5
1039 "utf-8", "big5", open_fr_big5, kiconv_fr_big5,
1040 close_fr_tc, kiconvstr_fr_big5
1044 "big5-hkscs", "utf-8", kiconv_open_to_cck, kiconv_to_big5hkscs,
1045 kiconv_close_to_cck, kiconvstr_to_big5hkscs
1048 "utf-8", "big5-hkscs", open_fr_big5hkscs, kiconv_fr_big5hkscs,
1049 close_fr_tc, kiconvstr_fr_big5hkscs
1053 "euc-tw", "utf-8", kiconv_open_to_cck, kiconv_to_euctw,
1054 kiconv_close_to_cck, kiconvstr_to_euctw
1057 "utf-8", "euc-tw", open_fr_euctw, kiconv_fr_euctw,
1058 close_fr_tc, kiconvstr_fr_euctw
1062 "cp950-hkscs", "utf-8", kiconv_open_to_cck,
1063 kiconv_to_cp950hkscs, kiconv_close_to_cck,
1064 kiconvstr_to_cp950hkscs
1067 "utf-8", "cp950-hkscs", open_fr_cp950hkscs,
1068 kiconv_fr_cp950hkscs, close_fr_tc, kiconvstr_fr_cp950hkscs
1072 static kiconv_module_info_t kiconv_tc_info = {
1073 "kiconv_tc", /* module name */
1074 sizeof (kiconv_tc_ops_tbl) / sizeof (kiconv_tc_ops_tbl[0]),
1075 kiconv_tc_ops_tbl,
1077 NULL,
1078 NULL,
1082 static struct modlkiconv modlkiconv_tc = {
1083 &mod_kiconvops,
1084 "kiconv Traditional Chinese module 1.0",
1085 &kiconv_tc_info
1088 static struct modlinkage modlinkage = {
1089 MODREV_1,
1090 (void *)&modlkiconv_tc,
1091 NULL
1095 _init(void)
1097 int err;
1099 err = mod_install(&modlinkage);
1100 if (err)
1101 cmn_err(CE_WARN, "kiconv_tc: failed to load kernel module");
1103 return (err);
1107 _fini(void)
1109 int err;
1112 * If this module is being used, then, we cannot remove the module.
1113 * The following checking will catch pretty much all usual cases.
1115 * Any remaining will be catached by the kiconv_unregister_module()
1116 * during mod_remove() at below.
1118 if (kiconv_module_ref_count(KICONV_MODULE_ID_TC))
1119 return (EBUSY);
1121 err = mod_remove(&modlinkage);
1122 if (err)
1123 cmn_err(CE_WARN, "kiconv_tc: failed to remove kernel module");
1125 return (err);
1129 _info(struct modinfo *modinfop)
1131 return (mod_info(&modlinkage, modinfop));