3 * Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 #include <sys/types.h>
20 #include <arpa/inet.h> /* for htonl() */
33 static int __wchar_forbitten(wchar_t sym
);
34 static int __utf8_forbitten(u_char octet
);
37 __wchar_forbitten(wchar_t sym
)
41 if (sym
>= 0xd800 && sym
<= 0xdfff)
48 __utf8_forbitten(u_char octet
)
64 * This function translates UTF-8 string into UCS-4 string (all symbols
65 * will be in local machine byte order).
67 * It takes the following arguments:
68 * in - input UTF-8 string. It can be null-terminated.
69 * insize - size of input string in bytes.
70 * out - result buffer for UCS-4 string. If out is NULL,
71 * function returns size of result buffer.
72 * outsize - size of out buffer in wide characters.
75 * The function returns size of result buffer (in wide characters).
76 * Zero is returned in case of error.
79 * 1. If UTF-8 string contains zero symbols, they will be translated
81 * 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
82 * when `out' is NULL and not NULL. It's because of special UTF-8
83 * sequences which may result in forbidden (by RFC3629) UNICODE
84 * characters. So, the caller must check return value every time and
85 * not prepare buffer in advance (\0 terminate) but after calling this
89 utf8_to_wchar(const char *in
, size_t insize
, wchar_t *out
, size_t outsize
,
94 size_t n
, total
, i
, n_bits
;
96 if (in
== NULL
|| insize
== 0 || (outsize
== 0 && out
!= NULL
))
102 wlim
= out
+ outsize
;
104 for (; p
< lim
; p
+= n
) {
105 if (__utf8_forbitten(*p
) != 0 &&
106 (flags
& UTF8_IGNORE_ERROR
) == 0)
110 * Get number of bytes for one wide character.
112 n
= 1; /* default: 1 byte. Used when skipping bytes. */
113 if ((*p
& 0x80) == 0)
115 else if ((*p
& 0xe0) == _SEQ2
) {
117 high
= (wchar_t)(*p
& 0x1f);
118 } else if ((*p
& 0xf0) == _SEQ3
) {
120 high
= (wchar_t)(*p
& 0x0f);
121 } else if ((*p
& 0xf8) == _SEQ4
) {
123 high
= (wchar_t)(*p
& 0x07);
124 } else if ((*p
& 0xfc) == _SEQ5
) {
126 high
= (wchar_t)(*p
& 0x03);
127 } else if ((*p
& 0xfe) == _SEQ6
) {
129 high
= (wchar_t)(*p
& 0x01);
131 if ((flags
& UTF8_IGNORE_ERROR
) == 0)
136 /* does the sequence header tell us truth about length? */
137 if (lim
- p
<= n
- 1) {
138 if ((flags
& UTF8_IGNORE_ERROR
) == 0)
146 * All symbols must have higher bits set to 10xxxxxx
149 for (i
= 1; i
< n
; i
++) {
150 if ((p
[i
] & 0xc0) != _NXT
)
154 if ((flags
& UTF8_IGNORE_ERROR
) == 0)
167 return (0); /* no space left */
171 for (i
= 1; i
< n
; i
++) {
172 *out
|= (wchar_t)(p
[n
- i
] & 0x3f) << n_bits
;
173 n_bits
+= 6; /* 6 low bits in every byte */
175 *out
|= high
<< n_bits
;
177 if (*out
== 0) /* return at end of string */
180 if (__wchar_forbitten(*out
) != 0) {
181 if ((flags
& UTF8_IGNORE_ERROR
) == 0)
182 return (0); /* forbitten character */
187 } else if (*out
== _BOM
&& (flags
& UTF8_SKIP_BOM
) != 0) {
200 * This function translates UCS-4 symbols (given in local machine
201 * byte order) into UTF-8 string.
203 * It takes the following arguments:
204 * in - input unicode string. It can be null-terminated.
205 * insize - size of input string in wide characters.
206 * out - result buffer for utf8 string. If out is NULL,
207 * function returns size of result buffer.
208 * outsize - size of result buffer.
211 * The function returns size of result buffer (in bytes). Zero is returned
215 * If UCS-4 string contains zero symbols, they will be translated
216 * as regular symbols.
219 wchar_to_utf8(const wchar_t *in
, size_t insize
, char *out
, size_t outsize
,
222 wchar_t *w
, *wlim
, ch
;
223 u_char
*p
, *lim
, *oc
;
226 if (in
== NULL
|| insize
== 0 || (outsize
== 0 && out
!= NULL
))
234 for (; w
< wlim
; w
++) {
235 if (__wchar_forbitten(*w
) != 0) {
236 if ((flags
& UTF8_IGNORE_ERROR
) == 0)
242 if (*w
== _BOM
&& (flags
& UTF8_SKIP_BOM
) != 0)
246 if ((flags
& UTF8_IGNORE_ERROR
) == 0)
249 } else if (*w
<= 0x0000007f)
251 else if (*w
<= 0x000007ff)
253 else if (*w
<= 0x0000ffff)
255 else if (*w
<= 0x001fffff)
257 else if (*w
<= 0x03ffffff)
259 else /* if (*w <= 0x7fffffff) */
267 if (lim
- p
<= n
- 1)
268 return (0); /* no space left */
270 /* make it work under different endians */
279 p
[1] = _NXT
| (oc
[3] & 0x3f);
280 p
[0] = _SEQ2
| (oc
[3] >> 6) | ((oc
[2] & 0x07) << 2);
284 p
[2] = _NXT
| (oc
[3] & 0x3f);
285 p
[1] = _NXT
| (oc
[3] >> 6) | ((oc
[2] & 0x0f) << 2);
286 p
[0] = _SEQ3
| ((oc
[2] & 0xf0) >> 4);
290 p
[3] = _NXT
| (oc
[3] & 0x3f);
291 p
[2] = _NXT
| (oc
[3] >> 6) | ((oc
[2] & 0x0f) << 2);
292 p
[1] = _NXT
| ((oc
[2] & 0xf0) >> 4) |
293 ((oc
[1] & 0x03) << 4);
294 p
[0] = _SEQ4
| ((oc
[1] & 0x1f) >> 2);
298 p
[4] = _NXT
| (oc
[3] & 0x3f);
299 p
[3] = _NXT
| (oc
[3] >> 6) | ((oc
[2] & 0x0f) << 2);
300 p
[2] = _NXT
| ((oc
[2] & 0xf0) >> 4) |
301 ((oc
[1] & 0x03) << 4);
302 p
[1] = _NXT
| (oc
[1] >> 2);
303 p
[0] = _SEQ5
| (oc
[0] & 0x03);
307 p
[5] = _NXT
| (oc
[3] & 0x3f);
308 p
[4] = _NXT
| (oc
[3] >> 6) | ((oc
[2] & 0x0f) << 2);
309 p
[3] = _NXT
| (oc
[2] >> 4) | ((oc
[1] & 0x03) << 4);
310 p
[2] = _NXT
| (oc
[1] >> 2);
311 p
[1] = _NXT
| (oc
[0] & 0x3f);
312 p
[0] = _SEQ6
| ((oc
[0] & 0x40) >> 6);
317 * NOTE: do not check here for forbidden UTF-8 characters.
318 * They cannot appear here because we do proper conversion.
326 #endif /* TLF_FONTS */