r6831@lvps87-230-33-50: verhaegs | 2008-02-03 14:08:57 +0100
[tangerine.git] / compiler / clib / locale / utf8.c
blob9a9ffa92ac6d80b85d1464f7761798a6ca927222
/*-
 * Copyright (c) 2002-2004 Tim J. Robbins
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
27 #include <sys/param.h>
28 __FBSDID("$FreeBSD: src/lib/libc/locale/utf8.c,v 1.16 2007/10/15 09:51:30 ache Exp $");
30 #include <errno.h>
31 #include <limits.h>
32 #include <runetype.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <wchar.h>
36 #include "mblocal.h"
/*
 * Limit applied to the single-byte ctype range; _UTF8_init() sets it.
 * The object itself is defined outside this file.
 */
extern int __mb_sb_limit;

/*
 * File-local UTF-8 conversion methods.  _UTF8_init() installs them in the
 * locale's method pointers.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static int	_UTF8_mbsinit(const mbstate_t *ps);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
/*
 * Conversion state carried between calls; the conversion functions access
 * it by casting the caller's mbstate_t pointer to this type.
 */
typedef struct {
	wchar_t	ch;		/* Bits of the character decoded so far. */
	int	want;		/* Octets still expected; 0 = initial state. */
	wchar_t	lbound;		/* Smallest value the sequence may encode. */
} _UTF8State;
57 int
58 _UTF8_init(_RuneLocale *rl)
61 __mbrtowc = _UTF8_mbrtowc;
62 __wcrtomb = _UTF8_wcrtomb;
63 __mbsinit = _UTF8_mbsinit;
64 __mbsnrtowcs = _UTF8_mbsnrtowcs;
65 __wcsnrtombs = _UTF8_wcsnrtombs;
66 _CurrentRuneLocale = rl;
67 __mb_cur_max = 6;
69 * UCS-4 encoding used as the internal representation, so
70 * slots 0x0080-0x00FF are occuped and must be excluded
71 * from the single byte ctype by setting the limit.
73 __mb_sb_limit = 128;
75 return (0);
78 static int
79 _UTF8_mbsinit(const mbstate_t *ps)
82 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
85 static size_t
86 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
87 mbstate_t * __restrict ps)
89 _UTF8State *us;
90 int ch, i, mask, want;
91 wchar_t lbound, wch;
93 us = (_UTF8State *)ps;
95 if (us->want < 0 || us->want > 6) {
96 errno = EINVAL;
97 return ((size_t)-1);
100 if (s == NULL) {
101 s = "";
102 n = 1;
103 pwc = NULL;
106 if (n == 0)
107 /* Incomplete multibyte sequence */
108 return ((size_t)-2);
110 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
111 /* Fast path for plain ASCII characters. */
112 if (pwc != NULL)
113 *pwc = ch;
114 return (ch != '\0' ? 1 : 0);
117 if (us->want == 0) {
119 * Determine the number of octets that make up this character
120 * from the first octet, and a mask that extracts the
121 * interesting bits of the first octet. We already know
122 * the character is at least two bytes long.
124 * We also specify a lower bound for the character code to
125 * detect redundant, non-"shortest form" encodings. For
126 * example, the sequence C0 80 is _not_ a legal representation
127 * of the null character. This enforces a 1-to-1 mapping
128 * between character codes and their multibyte representations.
130 ch = (unsigned char)*s;
131 if ((ch & 0x80) == 0) {
132 mask = 0x7f;
133 want = 1;
134 lbound = 0;
135 } else if ((ch & 0xe0) == 0xc0) {
136 mask = 0x1f;
137 want = 2;
138 lbound = 0x80;
139 } else if ((ch & 0xf0) == 0xe0) {
140 mask = 0x0f;
141 want = 3;
142 lbound = 0x800;
143 } else if ((ch & 0xf8) == 0xf0) {
144 mask = 0x07;
145 want = 4;
146 lbound = 0x10000;
147 } else if ((ch & 0xfc) == 0xf8) {
148 mask = 0x03;
149 want = 5;
150 lbound = 0x200000;
151 } else if ((ch & 0xfe) == 0xfc) {
152 mask = 0x01;
153 want = 6;
154 lbound = 0x4000000;
155 } else {
157 * Malformed input; input is not UTF-8.
159 errno = EILSEQ;
160 return ((size_t)-1);
162 } else {
163 want = us->want;
164 lbound = us->lbound;
168 * Decode the octet sequence representing the character in chunks
169 * of 6 bits, most significant first.
171 if (us->want == 0)
172 wch = (unsigned char)*s++ & mask;
173 else
174 wch = us->ch;
175 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
176 if ((*s & 0xc0) != 0x80) {
178 * Malformed input; bad characters in the middle
179 * of a character.
181 errno = EILSEQ;
182 return ((size_t)-1);
184 wch <<= 6;
185 wch |= *s++ & 0x3f;
187 if (i < want) {
188 /* Incomplete multibyte sequence. */
189 us->want = want - i;
190 us->lbound = lbound;
191 us->ch = wch;
192 return ((size_t)-2);
194 if (wch < lbound) {
196 * Malformed input; redundant encoding.
198 errno = EILSEQ;
199 return ((size_t)-1);
201 if (pwc != NULL)
202 *pwc = wch;
203 us->want = 0;
204 return (wch == L'\0' ? 0 : want);
207 static size_t
208 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
209 size_t nms, size_t len, mbstate_t * __restrict ps)
211 _UTF8State *us;
212 const char *s;
213 size_t nchr;
214 wchar_t wc;
215 size_t nb;
217 us = (_UTF8State *)ps;
219 s = *src;
220 nchr = 0;
222 if (dst == NULL) {
224 * The fast path in the loop below is not safe if an ASCII
225 * character appears as anything but the first byte of a
226 * multibyte sequence. Check now to avoid doing it in the loop.
228 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
229 errno = EILSEQ;
230 return ((size_t)-1);
232 for (;;) {
233 if (nms > 0 && (signed char)*s > 0)
235 * Fast path for plain ASCII characters
236 * excluding NUL.
238 nb = 1;
239 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
240 (size_t)-1)
241 /* Invalid sequence - mbrtowc() sets errno. */
242 return ((size_t)-1);
243 else if (nb == 0 || nb == (size_t)-2)
244 return (nchr);
245 s += nb;
246 nms -= nb;
247 nchr++;
249 /*NOTREACHED*/
253 * The fast path in the loop below is not safe if an ASCII
254 * character appears as anything but the first byte of a
255 * multibyte sequence. Check now to avoid doing it in the loop.
257 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
258 errno = EILSEQ;
259 return ((size_t)-1);
261 while (len-- > 0) {
262 if (nms > 0 && (signed char)*s > 0) {
264 * Fast path for plain ASCII characters
265 * excluding NUL.
267 *dst = (wchar_t)*s;
268 nb = 1;
269 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
270 (size_t)-1) {
271 *src = s;
272 return ((size_t)-1);
273 } else if (nb == (size_t)-2) {
274 *src = s + nms;
275 return (nchr);
276 } else if (nb == 0) {
277 *src = NULL;
278 return (nchr);
280 s += nb;
281 nms -= nb;
282 nchr++;
283 dst++;
285 *src = s;
286 return (nchr);
289 static size_t
290 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
292 _UTF8State *us;
293 unsigned char lead;
294 int i, len;
296 us = (_UTF8State *)ps;
298 if (us->want != 0) {
299 errno = EINVAL;
300 return ((size_t)-1);
303 if (s == NULL)
304 /* Reset to initial shift state (no-op) */
305 return (1);
307 if ((wc & ~0x7f) == 0) {
308 /* Fast path for plain ASCII characters. */
309 *s = (char)wc;
310 return (1);
314 * Determine the number of octets needed to represent this character.
315 * We always output the shortest sequence possible. Also specify the
316 * first few bits of the first octet, which contains the information
317 * about the sequence length.
319 if ((wc & ~0x7f) == 0) {
320 lead = 0;
321 len = 1;
322 } else if ((wc & ~0x7ff) == 0) {
323 lead = 0xc0;
324 len = 2;
325 } else if ((wc & ~0xffff) == 0) {
326 lead = 0xe0;
327 len = 3;
328 } else if ((wc & ~0x1fffff) == 0) {
329 lead = 0xf0;
330 len = 4;
331 } else if ((wc & ~0x3ffffff) == 0) {
332 lead = 0xf8;
333 len = 5;
334 } else if ((wc & ~0x7fffffff) == 0) {
335 lead = 0xfc;
336 len = 6;
337 } else {
338 errno = EILSEQ;
339 return ((size_t)-1);
343 * Output the octets representing the character in chunks
344 * of 6 bits, least significant last. The first octet is
345 * a special case because it contains the sequence length
346 * information.
348 for (i = len - 1; i > 0; i--) {
349 s[i] = (wc & 0x3f) | 0x80;
350 wc >>= 6;
352 *s = (wc & 0xff) | lead;
354 return (len);
357 static size_t
358 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
359 size_t nwc, size_t len, mbstate_t * __restrict ps)
361 _UTF8State *us;
362 char buf[MB_LEN_MAX];
363 const wchar_t *s;
364 size_t nbytes;
365 size_t nb;
367 us = (_UTF8State *)ps;
369 if (us->want != 0) {
370 errno = EINVAL;
371 return ((size_t)-1);
374 s = *src;
375 nbytes = 0;
377 if (dst == NULL) {
378 while (nwc-- > 0) {
379 if (0 <= *s && *s < 0x80)
380 /* Fast path for plain ASCII characters. */
381 nb = 1;
382 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
383 (size_t)-1)
384 /* Invalid character - wcrtomb() sets errno. */
385 return ((size_t)-1);
386 if (*s == L'\0')
387 return (nbytes + nb - 1);
388 s++;
389 nbytes += nb;
391 return (nbytes);
394 while (len > 0 && nwc-- > 0) {
395 if (0 <= *s && *s < 0x80) {
396 /* Fast path for plain ASCII characters. */
397 nb = 1;
398 *dst = *s;
399 } else if (len > (size_t)MB_CUR_MAX) {
400 /* Enough space to translate in-place. */
401 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
402 *src = s;
403 return ((size_t)-1);
405 } else {
407 * May not be enough space; use temp. buffer.
409 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
410 *src = s;
411 return ((size_t)-1);
413 if (nb > (int)len)
414 /* MB sequence for character won't fit. */
415 break;
416 memcpy(dst, buf, nb);
418 if (*s == L'\0') {
419 *src = NULL;
420 return (nbytes + nb - 1);
422 s++;
423 dst += nb;
424 len -= nb;
425 nbytes += nb;
427 *src = s;
428 return (nbytes);