1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2009 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #if GNULIB_defined_mbstate_t
24 /* Implement mbrtowc() on top of mbtowc(). */
29 # include "localcharset.h"
/* mbrtowc below views an mbstate_t as a char array (pstate = (char *)ps) and
   reads pstate[0], and the fallback buffer is 4 chars, so an mbstate_t must
   hold at least 4 bytes.  verify is presumably a compile-time assertion --
   confirm against its definition.  */
34 verify (sizeof (mbstate_t) >= 4);
/* File-level 4-byte state buffer; mbrtowc assigns it to pstate in place of the
   caller-supplied state (the guarding condition is not visible in this view).  */
36 static char internal_state
[4];
/* Replacement mbrtowc built on top of mbtowc.
   Parameters as declared: pwc receives the wide character, s and n describe
   the input bytes, ps is the conversion state.
   NOTE(review): this view of the function is lossy -- braces, return
   statements and several conditions are not visible, so the notes below
   describe only the statements that are shown.  */
39 mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
/* Treat the caller-supplied state as raw bytes.  */
41 char *pstate
= (char *)ps
;
/* Switch to the file-level internal_state buffer; presumably only when ps is
   NULL -- the condition is not visible, confirm.  */
44 pstate
= internal_state
;
/* Byte 0 of the state is read as nstate; presumably the number of bytes
   carried over from earlier calls -- confirm.  */
58 size_t nstate
= pstate
[0];
95 /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
96 mbtowc (NULL
, NULL
, 0);
/* Decode the m bytes at p with mbtowc; res is its return value.  The
   definitions of p and m are not visible in this view.  */
99 int res
= mbtowc (pwc
, p
, m
);
/* True when pwc is non-NULL and "*pwc is the null wide character" disagrees
   with "res is 0".  */
103 if (pwc
!= NULL
&& ((*pwc
== 0) != (res
== 0)))
/* Compares nstate against the consumed byte count, where a result of 0 or
   less is counted as 1 byte.  */
105 if (nstate
>= (res
> 0 ? res
: 1))
112 /* mbtowc does not distinguish between invalid and incomplete multibyte
113 sequences. But mbrtowc needs to make this distinction.
114 There are two possible approaches:
115 - Use iconv() and its return value.
116 - Use built-in knowledge about the possible encodings.
117 Given the low quality of implementation of iconv() on the systems that
118 lack mbrtowc(), we use the second approach.
119 The possible encodings are:
121 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, UTF-8.
123 Use specialized code for each. */
/* Taken when m is at least 4 or at least MB_CUR_MAX; as the next comment
   states, past this test 0 < m < 4 and MB_CUR_MAX > 1.  */
124 if (m
>= 4 || m
>= MB_CUR_MAX
)
126 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
/* Encoding name as returned by locale_charset (); each STREQ call below
   passes the name, a literal, and that literal spelled out character by
   character and padded with 0 (presumably a string equality test --
   confirm against the macro definition).  */
128 const char *encoding
= locale_charset ();
130 if (STREQ (encoding
, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
132 /* Cf. unistr/u8-mblen.c. */
/* c is the first byte of the pending sequence.  */
133 unsigned char c
= (unsigned char) p
[0];
148 unsigned char c2
= (unsigned char) p
[1];
/* (c2 ^ 0x80) < 0x40 holds exactly when c2 is in 0x80..0xBF.  In addition,
   a first byte of 0xE0 requires c2 >= 0xA0 and a first byte of 0xED requires
   c2 < 0xA0 (in UTF-8 these exclude overlong forms and the surrogate range).  */
150 if ((c2
^ 0x80) < 0x40
151 && (c
>= 0xe1 || c2
>= 0xa0)
152 && (c
!= 0xed || c2
< 0xa0))
160 else /* m == 2 || m == 3 */
162 unsigned char c2
= (unsigned char) p
[1];
/* c2 must be in 0x80..0xBF; a first byte of 0xF0 requires c2 >= 0x90; the
   first byte must be below 0xF4, or equal to 0xF4 with c2 < 0x90.  */
164 if ((c2
^ 0x80) < 0x40
165 && (c
>= 0xf1 || c2
>= 0x90)
166 && (c
< 0xf4 || (c
== 0xf4 && c2
< 0x90)))
172 unsigned char c3
= (unsigned char) p
[2];
/* The third byte must also be in 0x80..0xBF.  */
174 if ((c3
^ 0x80) < 0x40)
184 /* As a reference for this code, you can use the GNU libiconv
185 implementation. Look for uses of the RET_TOOFEW macro. */
187 if (STREQ (encoding
, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
191 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0xA1..0xFE, or equal to 0x8E or 0x8F.  */
193 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e || c
== 0x8f)
198 unsigned char c
= (unsigned char) p
[0];
202 unsigned char c2
= (unsigned char) p
[1];
/* Second byte in 0xA1..0xFE.  */
204 if (c2
>= 0xa1 && c2
< 0xff)
/* EUC-KR, GB2312 and BIG5 share one first-byte test.  */
210 if (STREQ (encoding
, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
211 || STREQ (encoding
, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
212 || STREQ (encoding
, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
216 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0xA1..0xFE.  */
218 if (c
>= 0xa1 && c
< 0xff)
223 if (STREQ (encoding
, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
227 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0xA1..0xFE, or equal to 0x8E.  */
229 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e)
232 else /* m == 2 || m == 3 */
234 unsigned char c
= (unsigned char) p
[0];
241 if (STREQ (encoding
, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
245 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0x90..0xE3 or in 0xF8..0xFE.  */
247 if ((c
>= 0x90 && c
<= 0xe3) || (c
>= 0xf8 && c
<= 0xfe))
250 else /* m == 2 || m == 3 */
252 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0x90..0xE3, then second byte in 0x30..0x39, then third byte
   in 0x81..0xFE.  */
254 if (c
>= 0x90 && c
<= 0xe3)
256 unsigned char c2
= (unsigned char) p
[1];
258 if (c2
>= 0x30 && c2
<= 0x39)
264 unsigned char c3
= (unsigned char) p
[2];
266 if (c3
>= 0x81 && c3
<= 0xfe)
274 if (STREQ (encoding
, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
278 unsigned char c
= (unsigned char) p
[0];
/* First byte in 0x81..0x9F, 0xE0..0xEA or 0xF0..0xF9.  */
280 if ((c
>= 0x81 && c
<= 0x9f) || (c
>= 0xe0 && c
<= 0xea)
281 || (c
>= 0xf0 && c
<= 0xf9))
287 /* An unknown multibyte encoding. */
294 /* Here 0 <= k < m < 4. */
310 /* The conversion state is undefined, says POSIX. */
317 /* Override the system's mbrtowc() function. */
/* Wrapper around mbrtowc with the same parameter list; which workaround is
   compiled depends on the MBRTOWC_*_BUG macros tested below.
   NOTE(review): this view of the function is lossy -- braces, the matching
   #endif lines and several statements are not visible.  */
322 rpl_mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
/* Section compiled when either of these two bug macros is nonzero.  */
324 # if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
333 # if MBRTOWC_RETVAL_BUG
/* Function-local static state object; ps is pointed at it below (the
   guarding condition is not visible, presumably ps == NULL -- confirm).  */
335 static mbstate_t internal_state
;
337 /* Override mbrtowc's internal state. We cannot call mbsinit() on the
338 hidden internal state, but we can call it on our variable. */
340 ps
= &internal_state
;
344 /* Parse the rest of the multibyte character byte by byte. */
/* Each iteration advances s by one and decrements n, passing exactly one
   byte (third argument 1) to mbrtowc.  */
346 for (; n
> 0; s
++, n
--)
349 size_t ret
= mbrtowc (&wc
, s
, 1, ps
);
/* (size_t)(-1) is the mbrtowc error return for an invalid sequence.  */
351 if (ret
== (size_t)(-1))
/* (size_t)(-2) is the mbrtowc return for an incomplete sequence; any other
   value reaches the completion case below.  */
354 if (ret
!= (size_t)(-2))
356 /* The multibyte character has been completed. */
/* Yields 0 for the null wide character and count otherwise; the declaration
   and updates of count are not visible in this view.  */
359 return (wc
== 0 ? 0 : count
);
/* Section compiled when MBRTOWC_NUL_RETVAL_BUG is nonzero: convert into the
   local wc with the full length n, then test for a return that is neither
   the error value nor the incomplete value (the body of that test is not
   visible).  */
367 # if MBRTOWC_NUL_RETVAL_BUG
370 size_t ret
= mbrtowc (&wc
, s
, n
, ps
);
372 if (ret
!= (size_t)(-1) && ret
!= (size_t)(-2))
/* Plain delegation to mbrtowc with the arguments unchanged.  */
382 return mbrtowc (pwc
, s
, n
, ps
);