1 /* $NetBSD: unicode.c,v 1.1.1.1 2007/03/06 00:10:39 dillo Exp $ */
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD$");
/*
 * Convert src_len bytes of UTF-8 at src into UTF-16 code units.
 *
 * dst, dst_len	output buffer and its capacity in code units.  dst may be
 *		NULL; code units that do not fit are counted, not stored.
 * src, src_len	input bytes; no terminator is required and no byte at or
 *		beyond src[src_len] is ever read.
 * flags	with UNICODE_UTF8_LATIN1_FALLBACK set, a byte >= 0xa0 that
 *		is not followed by a continuation byte is taken as an
 *		ISO 8859-1 character instead of being rejected.
 * errp		if non-NULL, receives the number of malformed sequences
 *		that were skipped.
 *
 * Returns the number of code units the complete input converts to, which
 * may be larger than dst_len.
 *
 * NOTE(review): the return type, the flags/errp parameters, the last clause
 * of the Latin-1 fallback test and the error-path statements were not
 * legible in the reviewed copy and are restored by inference -- confirm
 * against the declaration in the project header.
 */
size_t
utf8_to_utf16(uint16_t *dst, size_t dst_len,
    const char *src, size_t src_len,
    int flags, int *errp)
{
	const unsigned char *s;
	size_t spos, dpos, left;
	int error;
	uint16_t c;

#define IS_CONT(c)	(((c)&0xc0) == 0x80)

	error = 0;
	s = (const unsigned char *)src;
	spos = dpos = 0;
	while (spos < src_len) {
		/*
		 * Bytes available starting at the lead byte (always >= 1).
		 * Every length test below is phrased in terms of `left' so
		 * that no unsigned subtraction can wrap and no byte past
		 * the end of the input is inspected.
		 */
		left = src_len - spos;

		if (s[spos] < 0x80) {
			c = s[spos++];
		}
		else if ((flags & UNICODE_UTF8_LATIN1_FALLBACK)
		    && (left < 2 || !IS_CONT(s[spos+1]))
		    && s[spos] >= 0xa0) {
			/* not valid UTF-8, assume ISO 8859-1 */
			c = s[spos++];
		}
		else if (s[spos] < 0xc0 || s[spos] >= 0xf5) {
			/* continuation byte without lead byte
			   or lead byte for codepoint above 0x10ffff */
			error++;
			spos++;
			continue;
		}
		else if (s[spos] < 0xe0) {
			/* two byte sequence */
			if (left < 2 || !IS_CONT(s[spos+1])) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x1f) << 6) | (s[spos+1] & 0x3f);
			spos += 2;
			if (c < 0x80) {
				/* overlong encoding */
				error++;
				continue;
			}
		}
		else if (s[spos] < 0xf0) {
			/* three byte sequence */
			if (left < 3
			    || !IS_CONT(s[spos+1]) || !IS_CONT(s[spos+2])) {
				spos++;
				error++;
				continue;
			}
			c = ((s[spos] & 0x0f) << 12) | ((s[spos+1] & 0x3f) << 6)
			    | (s[spos+2] & 0x3f);
			spos += 3;
			/* 0xf800 selects exactly the range 0xd800..0xdfff */
			if (c < 0x800 || (c & 0xf800) == 0xd800) {
				/* overlong encoding or encoded surrogate */
				error++;
				continue;
			}
		}
		else {
			uint32_t cc;
			/* four byte sequence, becomes a UTF-16 surrogate pair */

			if (left < 4 || !IS_CONT(s[spos+1])
			    || !IS_CONT(s[spos+2]) || !IS_CONT(s[spos+3])) {
				spos++;
				error++;
				continue;
			}
			/* a four byte lead carries three payload bits */
			cc = ((uint32_t)(s[spos] & 0x07) << 18)
			    | ((uint32_t)(s[spos+1] & 0x3f) << 12)
			    | ((uint32_t)(s[spos+2] & 0x3f) << 6)
			    | (uint32_t)(s[spos+3] & 0x3f);
			spos += 4;
			if (cc < 0x10000 || cc > 0x10ffff) {
				/* overlong encoding or beyond Unicode range */
				error++;
				continue;
			}
			cc -= 0x10000;
			if (dst && dpos < dst_len)
				dst[dpos] = (uint16_t)(0xd800 | (cc >> 10));
			dpos++;
			/* each surrogate carries ten bits */
			c = (uint16_t)(0xdc00 | (cc & 0x3ff));
		}

		if (dst && dpos < dst_len)
			dst[dpos] = c;
		dpos++;
	}

	if (errp)
		*errp = error;

	return dpos;
}
/*
 * Convert src_len UTF-16 code units at src into UTF-8.
 *
 * dst, dst_len	output buffer and its capacity in bytes.  dst may be NULL.
 *		Once an encoded character would not fit completely, nothing
 *		further is stored, but counting continues.
 * src, src_len	input code units.
 * flags	accepted for interface compatibility; not consulted by the
 *		conversion below.
 * errp		if non-NULL, receives the number of unpaired surrogates
 *		that were skipped.
 *
 * Returns the number of bytes the complete input converts to, which may be
 * larger than dst_len.
 *
 * NOTE(review): the return type and the statements on the error paths were
 * not legible in the reviewed copy and are restored by inference -- confirm
 * against the declaration in the project header.
 */
size_t
utf16_to_utf8(char *dst, size_t dst_len,
    const uint16_t *src, size_t src_len,
    int flags, int *errp)
{
	size_t spos, dpos;
	int error;

/*
 * Stop storing as soon as the next `l' bytes do not fit.  The explicit
 * dst_len < l test keeps the unsigned subtraction from wrapping when the
 * buffer is shorter than one encoded character.
 */
#define CHECK_LENGTH(l)							\
	do {								\
		if (dst_len < (size_t)(l) || dpos > dst_len - (size_t)(l)) \
			dst = NULL;					\
	} while (0)
/* Store one byte if storing is still enabled; always count it. */
#define ADD_BYTE(b)							\
	((dst != NULL) ? (void)(dst[dpos] = (char)(b)) : (void)0, dpos++)

	(void)flags;

	error = 0;
	dpos = 0;
	for (spos = 0; spos < src_len; spos++) {
		const uint16_t u = src[spos];

		if (u < 0x80) {
			CHECK_LENGTH(1);
			ADD_BYTE(u);
		}
		else if (u < 0x800) {
			CHECK_LENGTH(2);
			ADD_BYTE(0xc0 | (u >> 6));
			ADD_BYTE(0x80 | (u & 0x3f));
		}
		else if ((u & 0xfc00) == 0xd800) {
			uint32_t c;
			/* first surrogate */
			if (spos == src_len - 1
			    || (src[spos+1] & 0xfc00) != 0xdc00) {
				/* no second surrogate present */
				error++;
				continue;
			}
			CHECK_LENGTH(4);
			c = ((((uint32_t)u & 0x3ff) << 10)
			    | ((uint32_t)src[spos+1] & 0x3ff)) + 0x10000;
			/* the second surrogate has been consumed as well */
			spos++;
			ADD_BYTE(0xf0 | (c >> 18));
			ADD_BYTE(0x80 | ((c >> 12) & 0x3f));
			ADD_BYTE(0x80 | ((c >> 6) & 0x3f));
			ADD_BYTE(0x80 | (c & 0x3f));
		}
		else if ((u & 0xfc00) == 0xdc00) {
			/* second surrogate without preceding first surrogate */
			error++;
		}
		else {
			CHECK_LENGTH(3);
			ADD_BYTE(0xe0 | (u >> 12));
			ADD_BYTE(0x80 | ((u >> 6) & 0x3f));
			ADD_BYTE(0x80 | (u & 0x3f));
		}
	}

	if (errp)
		*errp = error;

	return dpos;
}