/* Conversion UTF-8 to UCS-4.
   Copyright (C) 2001-2002 Free Software Foundation, Inc.
   Written by Bruno Haible <haible@clisp.cons.org>, 2001.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
/* Return the length (number of units) of the first character in S, putting
   its 'ucs4_t' representation in *PUC.

   PUC  receives the decoded code point, or 0xfffd when the input is not
        a well-formed, complete UTF-8 sequence.
   S    points to the bytes to decode; S[0] is read unconditionally.
   N    is the number of bytes available at S.

   Return value:
     2, 3 or 4  for a well-formed multibyte sequence of that length;
     N          when the lead byte announces more bytes than are available
                (incomplete multibyte character, *PUC = 0xfffd);
     1          for any other input (invalid multibyte character,
                *PUC = 0xfffd).

   Only lead bytes >= 0xc2 start a sequence here; anything below that
   (ASCII, stray continuation bytes, the overlong leads 0xc0/0xc1) takes
   the "invalid" exit.

   NOTE(review): this function was recovered from a listing with dropped
   lines.  The branch bounds, the N checks, the return values and the
   'static' linkage were restored from the visible structure; confirm them
   against the original file.  */
static int
u8_mbtouc_aux (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char c = *s;

  if (c >= 0xc2)
    {
      if (c < 0xe0)
        {
          /* Two-byte form, U+0080..U+07FF.  */
          if (n < 2)
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return (int) n;
            }
          if ((s[1] ^ 0x80) < 0x40)
            {
              *puc = ((unsigned int) (c & 0x1f) << 6)
                     | (unsigned int) (s[1] ^ 0x80);
              return 2;
            }
          /* invalid multibyte character */
        }
      else if (c < 0xf0)
        {
          /* Three-byte form, U+0800..U+FFFF.  */
          if (n < 3)
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return (int) n;
            }
          if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
              /* Reject overlong encodings of values below U+0800.  */
              && (c >= 0xe1 || s[1] >= 0xa0)
              /* Reject the surrogate range U+D800..U+DFFF, which must
                 never appear in UTF-8.  */
              && (c != 0xed || s[1] < 0xa0))
            {
              *puc = ((unsigned int) (c & 0x0f) << 12)
                     | ((unsigned int) (s[1] ^ 0x80) << 6)
                     | (unsigned int) (s[2] ^ 0x80);
              return 3;
            }
          /* invalid multibyte character */
        }
      else if (c < 0xf8)
        {
          /* Four-byte form, U+10000..U+10FFFF.  */
          if (n < 4)
            {
              /* incomplete multibyte character */
              *puc = 0xfffd;
              return (int) n;
            }
          if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
              && (s[3] ^ 0x80) < 0x40
              /* Reject overlong encodings of values below U+10000.  */
              && (c >= 0xf1 || s[1] >= 0x90)
              /* Reject values above U+10FFFF.  */
              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
            {
              *puc = ((unsigned int) (c & 0x07) << 18)
                     | ((unsigned int) (s[1] ^ 0x80) << 12)
                     | ((unsigned int) (s[2] ^ 0x80) << 6)
                     | (unsigned int) (s[3] ^ 0x80);
              return 4;
            }
          /* invalid multibyte character */
        }
      /* Lead bytes 0xf8..0xff would start the obsolete 5- and 6-byte
         forms.  They can only encode values above U+10FFFF, which the
         four-byte branch already refuses, so they are treated as
         invalid.  */
    }
  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}
/* Decode the first character of the UTF-8 string S, of which N bytes are
   available, and store its code point in *PUC.  Return the number of
   bytes consumed.

   A lead byte below 0x80 is an ASCII character and is handled here
   directly; every other lead byte is handed to u8_mbtouc_aux, whose
   return value is passed through unchanged.

   S[0] is read unconditionally, so the caller must make at least one
   byte available.

   NOTE(review): this function was recovered from a listing with dropped
   lines.  Only the signature, the read of *S and the call to
   u8_mbtouc_aux were visible; the ASCII branch and the 'static inline'
   linkage were restored and should be confirmed against the original
   file.  */
static inline int
u8_mbtouc (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char c = *s;

  if (c < 0x80)
    {
      /* ASCII: the byte is its own code point.  */
      *puc = c;
      return 1;
    }

  return u8_mbtouc_aux (puc, s, n);
}