1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2019 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2008.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
23 #if C_LOCALE_MAYBE_EILSEQ
24 # include "hard-locale.h"
28 #if GNULIB_defined_mbstate_t
29 /* Implement mbrtowc() on top of mbtowc(). */
34 # include "localcharset.h"
40 # define FALLTHROUGH ((void) 0)
42 # define FALLTHROUGH __attribute__ ((__fallthrough__))
46 /* Returns a classification of special values of the encoding of the current locale. */
49 enc_other
, /* other */
51 enc_eucjp
, /* EUC-JP */
52 enc_94
, /* EUC-KR, GB2312, BIG5 */
53 enc_euctw
, /* EUC-TW */
54 enc_gb18030
, /* GB18030 */
60 const char *encoding
= locale_charset ();
61 if (STREQ_OPT (encoding
, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
63 if (STREQ_OPT (encoding
, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
65 if (STREQ_OPT (encoding
, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
66 || STREQ_OPT (encoding
, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
67 || STREQ_OPT (encoding
, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
69 if (STREQ_OPT (encoding
, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
71 if (STREQ_OPT (encoding
, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
73 if (STREQ_OPT (encoding
, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
78 #if GNULIB_WCHAR_SINGLE
79 /* When we know that the locale does not change, provide a speedup by
80 caching the value of locale_enc. */
81 static int cached_locale_enc
= -1;
83 locale_enc_cached (void)
85 if (cached_locale_enc
< 0)
86 cached_locale_enc
= locale_enc ();
87 return cached_locale_enc
;
90 /* By default, don't make assumptions, hence no caching. */
91 # define locale_enc_cached locale_enc
94 verify (sizeof (mbstate_t) >= 4);
96 static char internal_state
[4];
99 mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
101 char *pstate
= (char *)ps
;
116 pstate
= internal_state
;
119 size_t nstate
= pstate
[0];
155 # if __GLIBC__ || defined __UCLIBC__
156 /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
157 mbtowc (NULL
, NULL
, 0);
160 int res
= mbtowc (pwc
, p
, m
);
164 if (pwc
!= NULL
&& ((*pwc
== 0) != (res
== 0)))
166 if (nstate
>= (res
> 0 ? res
: 1))
173 /* mbtowc does not distinguish between invalid and incomplete multibyte
174 sequences. But mbrtowc needs to make this distinction.
175 There are two possible approaches:
176 - Use iconv() and its return value.
177 - Use built-in knowledge about the possible encodings.
178 Given the low quality of implementation of iconv() on the systems that
179 lack mbrtowc(), we use the second approach.
180 The possible encodings are:
182 - UTF-8, EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS.
184 Use specialized code for each. */
185 if (m
>= 4 || m
>= MB_CUR_MAX
)
187 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
188 switch (locale_enc_cached ())
190 case enc_utf8
: /* UTF-8 */
192 /* Cf. unistr/u8-mblen.c. */
193 unsigned char c
= (unsigned char) p
[0];
208 unsigned char c2
= (unsigned char) p
[1];
210 if ((c2
^ 0x80) < 0x40
211 && (c
>= 0xe1 || c2
>= 0xa0)
212 && (c
!= 0xed || c2
< 0xa0))
220 else /* m == 2 || m == 3 */
222 unsigned char c2
= (unsigned char) p
[1];
224 if ((c2
^ 0x80) < 0x40
225 && (c
>= 0xf1 || c2
>= 0x90)
226 && (c
< 0xf4 || (c
== 0xf4 && c2
< 0x90)))
232 unsigned char c3
= (unsigned char) p
[2];
234 if ((c3
^ 0x80) < 0x40)
244 /* As a reference for this code, you can use the GNU libiconv
245 implementation. Look for uses of the RET_TOOFEW macro. */
247 case enc_eucjp
: /* EUC-JP */
251 unsigned char c
= (unsigned char) p
[0];
253 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e || c
== 0x8f)
258 unsigned char c
= (unsigned char) p
[0];
262 unsigned char c2
= (unsigned char) p
[1];
264 if (c2
>= 0xa1 && c2
< 0xff)
271 case enc_94
: /* EUC-KR, GB2312, BIG5 */
275 unsigned char c
= (unsigned char) p
[0];
277 if (c
>= 0xa1 && c
< 0xff)
283 case enc_euctw
: /* EUC-TW */
287 unsigned char c
= (unsigned char) p
[0];
289 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e)
292 else /* m == 2 || m == 3 */
294 unsigned char c
= (unsigned char) p
[0];
302 case enc_gb18030
: /* GB18030 */
306 unsigned char c
= (unsigned char) p
[0];
308 if ((c
>= 0x90 && c
<= 0xe3) || (c
>= 0xf8 && c
<= 0xfe))
311 else /* m == 2 || m == 3 */
313 unsigned char c
= (unsigned char) p
[0];
315 if (c
>= 0x90 && c
<= 0xe3)
317 unsigned char c2
= (unsigned char) p
[1];
319 if (c2
>= 0x30 && c2
<= 0x39)
325 unsigned char c3
= (unsigned char) p
[2];
327 if (c3
>= 0x81 && c3
<= 0xfe)
336 case enc_sjis
: /* SJIS */
340 unsigned char c
= (unsigned char) p
[0];
342 if ((c
>= 0x81 && c
<= 0x9f) || (c
>= 0xe0 && c
<= 0xea)
343 || (c
>= 0xf0 && c
<= 0xf9))
350 /* An unknown multibyte encoding. */
357 /* Here 0 <= k < m < 4. */
373 /* The conversion state is undefined, says POSIX. */
380 /* Override the system's mbrtowc() function. */
385 rpl_mbrtowc (wchar_t *pwc
, const char *s
, size_t n
, mbstate_t *ps
)
390 # if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
399 # if MBRTOWC_EMPTY_INPUT_BUG
407 # if MBRTOWC_RETVAL_BUG
409 static mbstate_t internal_state
;
411 /* Override mbrtowc's internal state. We cannot call mbsinit() on the
412 hidden internal state, but we can call it on our variable. */
414 ps
= &internal_state
;
418 /* Parse the rest of the multibyte character byte by byte. */
420 for (; n
> 0; s
++, n
--)
422 ret
= mbrtowc (&wc
, s
, 1, ps
);
424 if (ret
== (size_t)(-1))
427 if (ret
!= (size_t)(-2))
429 /* The multibyte character has been completed. */
431 return (wc
== 0 ? 0 : count
);
439 ret
= mbrtowc (pwc
, s
, n
, ps
);
441 # if MBRTOWC_NUL_RETVAL_BUG
442 if (ret
< (size_t) -2 && !*pwc
)
446 # if C_LOCALE_MAYBE_EILSEQ
447 if ((size_t) -2 <= ret
&& n
!= 0 && ! hard_locale (LC_CTYPE
))
449 unsigned char uc
= *s
;