1 /* Convert multibyte character to wide character.
2 Copyright (C) 1999-2002, 2005-2024 Free Software Foundation, Inc.
4 This file is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as
6 published by the Free Software Foundation; either version 2.1 of the
7 License, or (at your option) any later version.
9 This file is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2008. */
19 /* This file contains the body of the mbrtowc and mbrtoc32 functions,
20 when GNULIB_defined_mbstate_t is defined. */
22 char *pstate
= (char *)ps
;
37 pstate
= internal_state
;
40 size_t nstate
= pstate
[0];
78 enc
= locale_encoding_classification ();
80 if (enc
== enc_utf8
) /* UTF-8 */
82 /* Achieve
83 - multi-thread safety and
84 - the ability to produce wide character values > WCHAR_MAX
85 by not calling mbtowc() at all. */
86 #include "mbrtowc-impl-utf8.h"
90 /* The hidden internal state of mbtowc would make this function not
91 multi-thread safe. Achieve multi-thread safety through a lock. */
93 res
= mbtowc_with_lock (&wc
, p
, m
);
97 if ((wc
== 0) != (res
== 0))
104 /* mbtowc does not distinguish between invalid and incomplete multibyte
105 sequences. But mbrtowc needs to make this distinction.
106 There are two possible approaches:
107 - Use iconv() and its return value.
108 - Use built-in knowledge about the possible encodings.
109 Given the low quality of implementation of iconv() on the systems
110 that lack mbrtowc(), we use the second approach.
111 The possible encodings are:
112 - 8-bit encodings,
113 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
114 - UTF-8 (already handled above).
115 Use specialized code for each. */
116 if (m
>= 4 || m
>= MB_CUR_MAX
)
118 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
121 /* As a reference for this code, you can use the GNU libiconv
122 implementation. Look for uses of the RET_TOOFEW macro. */
124 case enc_eucjp
: /* EUC-JP */
128 unsigned char c
= (unsigned char) p
[0];
130 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e || c
== 0x8f)
135 unsigned char c
= (unsigned char) p
[0];
139 unsigned char c2
= (unsigned char) p
[1];
141 if (c2
>= 0xa1 && c2
< 0xff)
148 case enc_94
: /* EUC-KR, GB2312, BIG5 */
152 unsigned char c
= (unsigned char) p
[0];
154 if (c
>= 0xa1 && c
< 0xff)
160 case enc_euctw
: /* EUC-TW */
164 unsigned char c
= (unsigned char) p
[0];
166 if ((c
>= 0xa1 && c
< 0xff) || c
== 0x8e)
169 else /* m == 2 || m == 3 */
171 unsigned char c
= (unsigned char) p
[0];
179 case enc_gb18030
: /* GB18030 */
183 unsigned char c
= (unsigned char) p
[0];
185 if ((c
>= 0x90 && c
<= 0xe3) || (c
>= 0xf8 && c
<= 0xfe))
188 else /* m == 2 || m == 3 */
190 unsigned char c
= (unsigned char) p
[0];
192 if (c
>= 0x90 && c
<= 0xe3)
194 unsigned char c2
= (unsigned char) p
[1];
196 if (c2
>= 0x30 && c2
<= 0x39)
202 unsigned char c3
= (unsigned char) p
[2];
204 if (c3
>= 0x81 && c3
<= 0xfe)
213 case enc_sjis
: /* SJIS */
217 unsigned char c
= (unsigned char) p
[0];
219 if ((c
>= 0x81 && c
<= 0x9f) || (c
>= 0xe0 && c
<= 0xea)
220 || (c
>= 0xf0 && c
<= 0xf9))
227 /* An unknown multibyte encoding. */
233 /* res >= 0 is the corrected return value of
234 mbtowc_with_lock (&wc, p, m). */
235 if (nstate
>= (res
> 0 ? res
: 1))
244 /* Here 0 <= k < m < 4. */
260 /* The conversion state is undefined, says POSIX. */