Update from gnulib.
[libiconv.git] / libcharset / lib / localcharset.c
blobda3ac451966e4e5aba9872549d8e0441770c324f
/* Determine a canonical name for the current locale's character encoding.

   Copyright (C) 2000-2006, 2008-2019 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU Library General Public License as published
   by the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public License
   along with this program; if not, see <https://www.gnu.org/licenses/>.  */

/* Written by Bruno Haible <bruno@clisp.org>.  */
20 #include <config.h>
22 /* Specification. */
23 #include "localcharset.h"
25 #include <stddef.h>
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdlib.h>
30 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
31 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
32 #endif
34 #if defined _WIN32 && !defined __CYGWIN__
35 # define WINDOWS_NATIVE
36 # include <locale.h>
37 #endif
39 #if defined __EMX__
40 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
41 # ifndef OS2
42 # define OS2
43 # endif
44 #endif
46 #if !defined WINDOWS_NATIVE
47 # if HAVE_LANGINFO_CODESET
48 # include <langinfo.h>
49 # else
50 # if 0 /* see comment regarding use of setlocale(), below */
51 # include <locale.h>
52 # endif
53 # endif
54 # ifdef __CYGWIN__
55 # define WIN32_LEAN_AND_MEAN
56 # include <windows.h>
57 # endif
58 #elif defined WINDOWS_NATIVE
59 # define WIN32_LEAN_AND_MEAN
60 # include <windows.h>
61 #endif
62 #if defined OS2
63 # define INCL_DOS
64 # include <os2.h>
65 #endif
67 /* For MB_CUR_MAX_L */
68 #if defined DARWIN7
69 # include <xlocale.h>
70 #endif
73 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
75 /* On these platforms, we use a mapping from non-canonical encoding name
76 to GNU canonical encoding name. */
78 /* With glibc-2.1 or newer, we don't need any canonicalization,
79 because glibc has iconv and both glibc and libiconv support all
80 GNU canonical names directly. */
81 # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
/* One entry of the alias table.
   alias:     encoding name as reported by the platform (at most 11
              characters plus the terminating NUL).
   canonical: the corresponding GNU canonical encoding name (at most 11
              characters plus the terminating NUL).  */
struct table_entry
{
  const char alias[11+1];
  const char canonical[11+1];
};

/* Table of platform-dependent mappings, sorted in ascending order.
   The order is the one defined by strcmp() on the 'alias' member; the
   lookup in locale_charset() is a binary search and relies on it.
   Entries that are commented out map a name to itself; they are kept
   only as documentation of the names the platform uses.  */
static const struct table_entry alias_table[] =
  {
#  if defined __FreeBSD__ /* FreeBSD */
    /*{ "ARMSCII-8", "ARMSCII-8" },*/
    { "Big5", "BIG5" },
    { "C", "ASCII" },
    /*{ "CP1131", "CP1131" },*/
    /*{ "CP1251", "CP1251" },*/
    /*{ "CP866", "CP866" },*/
    /*{ "GB18030", "GB18030" },*/
    /*{ "GB2312", "GB2312" },*/
    /*{ "GBK", "GBK" },*/
    /*{ "ISCII-DEV", "?" },*/
    { "ISO8859-1", "ISO-8859-1" },
    { "ISO8859-13", "ISO-8859-13" },
    { "ISO8859-15", "ISO-8859-15" },
    { "ISO8859-2", "ISO-8859-2" },
    { "ISO8859-5", "ISO-8859-5" },
    { "ISO8859-7", "ISO-8859-7" },
    { "ISO8859-9", "ISO-8859-9" },
    /*{ "KOI8-R", "KOI8-R" },*/
    /*{ "KOI8-U", "KOI8-U" },*/
    { "SJIS", "SHIFT_JIS" },
    { "US-ASCII", "ASCII" },
    { "eucCN", "GB2312" },
    { "eucJP", "EUC-JP" },
    { "eucKR", "EUC-KR" }
#   define alias_table_defined
#  endif
#  if defined __NetBSD__ /* NetBSD */
    { "646", "ASCII" },
    /*{ "ARMSCII-8", "ARMSCII-8" },*/
    /*{ "BIG5", "BIG5" },*/
    { "Big5-HKSCS", "BIG5-HKSCS" },
    /*{ "CP1251", "CP1251" },*/
    /*{ "CP866", "CP866" },*/
    /*{ "GB18030", "GB18030" },*/
    /*{ "GB2312", "GB2312" },*/
    { "ISO8859-1", "ISO-8859-1" },
    { "ISO8859-13", "ISO-8859-13" },
    { "ISO8859-15", "ISO-8859-15" },
    { "ISO8859-2", "ISO-8859-2" },
    { "ISO8859-4", "ISO-8859-4" },
    { "ISO8859-5", "ISO-8859-5" },
    { "ISO8859-7", "ISO-8859-7" },
    /*{ "KOI8-R", "KOI8-R" },*/
    /*{ "KOI8-U", "KOI8-U" },*/
    /*{ "PT154", "PT154" },*/
    { "SJIS", "SHIFT_JIS" },
    { "eucCN", "GB2312" },
    { "eucJP", "EUC-JP" },
    { "eucKR", "EUC-KR" },
    { "eucTW", "EUC-TW" }
#   define alias_table_defined
#  endif
#  if defined __OpenBSD__ /* OpenBSD */
    { "646", "ASCII" },
    { "ISO8859-1", "ISO-8859-1" },
    { "ISO8859-13", "ISO-8859-13" },
    { "ISO8859-15", "ISO-8859-15" },
    { "ISO8859-2", "ISO-8859-2" },
    { "ISO8859-4", "ISO-8859-4" },
    { "ISO8859-5", "ISO-8859-5" },
    { "ISO8859-7", "ISO-8859-7" }
#   define alias_table_defined
#  endif
#  if defined __APPLE__ && defined __MACH__ /* Mac OS X */
    /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
       useless:
       - It returns the empty string when LANG is set to a locale of the
         form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
         LC_CTYPE file.
       - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
         the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
       - The documentation says:
           "... all code that calls BSD system routines should ensure
            that the const *char parameters of these routines are in UTF-8
            encoding. All BSD system functions expect their string
            parameters to be in UTF-8 encoding and nothing else."
         It also says
           "An additional caveat is that string parameters for files,
            paths, and other file-system entities must be in canonical
            UTF-8. In a canonical UTF-8 Unicode string, all decomposable
            characters are decomposed ..."
         but this is not true: You can pass non-decomposed UTF-8 strings
         to file system functions, and it is the OS which will convert
         them to decomposed UTF-8 before accessing the file system.
       - The Apple Terminal application displays UTF-8 by default.
       - However, other applications are free to use different encodings:
         - xterm uses ISO-8859-1 by default.
         - TextEdit uses MacRoman by default.
       We prefer UTF-8 over decomposed UTF-8-MAC because one should
       minimize the use of decomposed Unicode. Unfortunately, through the
       Darwin file system, decomposed UTF-8 strings are leaked into user
       space nevertheless.
       Then there are also the locales with encodings other than US-ASCII
       and UTF-8. These locales can be occasionally useful to users (e.g.
       when grepping through ISO-8859-1 encoded text files), when all their
       file names are in US-ASCII.
     */
    { "ARMSCII-8", "ARMSCII-8" },
    { "Big5", "BIG5" },
    { "Big5HKSCS", "BIG5-HKSCS" },
    { "CP1131", "CP1131" },
    { "CP1251", "CP1251" },
    { "CP866", "CP866" },
    { "CP949", "CP949" },
    { "GB18030", "GB18030" },
    { "GB2312", "GB2312" },
    { "GBK", "GBK" },
    /*{ "ISCII-DEV", "?" },*/
    { "ISO8859-1", "ISO-8859-1" },
    { "ISO8859-13", "ISO-8859-13" },
    { "ISO8859-15", "ISO-8859-15" },
    { "ISO8859-2", "ISO-8859-2" },
    { "ISO8859-4", "ISO-8859-4" },
    { "ISO8859-5", "ISO-8859-5" },
    { "ISO8859-7", "ISO-8859-7" },
    { "ISO8859-9", "ISO-8859-9" },
    { "KOI8-R", "KOI8-R" },
    { "KOI8-U", "KOI8-U" },
    { "PT154", "PT154" },
    { "SJIS", "SHIFT_JIS" },
    { "eucCN", "GB2312" },
    { "eucJP", "EUC-JP" },
    { "eucKR", "EUC-KR" }
#   define alias_table_defined
#  endif
#  if defined _AIX /* AIX */
    /*{ "GBK", "GBK" },*/
    { "IBM-1046", "CP1046" },
    { "IBM-1124", "CP1124" },
    { "IBM-1129", "CP1129" },
    { "IBM-1252", "CP1252" },
    { "IBM-850", "CP850" },
    { "IBM-856", "CP856" },
    { "IBM-921", "ISO-8859-13" },
    { "IBM-922", "CP922" },
    { "IBM-932", "CP932" },
    { "IBM-943", "CP943" },
    { "IBM-eucCN", "GB2312" },
    { "IBM-eucJP", "EUC-JP" },
    { "IBM-eucKR", "EUC-KR" },
    { "IBM-eucTW", "EUC-TW" },
    { "ISO8859-1", "ISO-8859-1" },
    { "ISO8859-15", "ISO-8859-15" },
    { "ISO8859-2", "ISO-8859-2" },
    { "ISO8859-5", "ISO-8859-5" },
    { "ISO8859-6", "ISO-8859-6" },
    { "ISO8859-7", "ISO-8859-7" },
    { "ISO8859-8", "ISO-8859-8" },
    { "ISO8859-9", "ISO-8859-9" },
    { "TIS-620", "TIS-620" },
    /*{ "UTF-8", "UTF-8" },*/
    { "big5", "BIG5" }
#   define alias_table_defined
#  endif
#  if defined __hpux /* HP-UX */
    { "SJIS", "SHIFT_JIS" },
    { "arabic8", "HP-ARABIC8" },
    { "big5", "BIG5" },
    { "cp1251", "CP1251" },
    { "eucJP", "EUC-JP" },
    { "eucKR", "EUC-KR" },
    { "eucTW", "EUC-TW" },
    { "gb18030", "GB18030" },
    { "greek8", "HP-GREEK8" },
    { "hebrew8", "HP-HEBREW8" },
    { "hkbig5", "BIG5-HKSCS" },
    { "hp15CN", "GB2312" },
    { "iso88591", "ISO-8859-1" },
    { "iso885913", "ISO-8859-13" },
    { "iso885915", "ISO-8859-15" },
    { "iso88592", "ISO-8859-2" },
    { "iso88594", "ISO-8859-4" },
    { "iso88595", "ISO-8859-5" },
    { "iso88596", "ISO-8859-6" },
    { "iso88597", "ISO-8859-7" },
    { "iso88598", "ISO-8859-8" },
    { "iso88599", "ISO-8859-9" },
    { "kana8", "HP-KANA8" },
    { "koi8r", "KOI8-R" },
    { "roman8", "HP-ROMAN8" },
    { "tis620", "TIS-620" },
    { "turkish8", "HP-TURKISH8" },
    { "utf8", "UTF-8" }
#   define alias_table_defined
#  endif
#  if defined __sgi /* IRIX */
    { "ISO8859-1", "ISO-8859-1" },
    { "ISO8859-15", "ISO-8859-15" },
    { "ISO8859-2", "ISO-8859-2" },
    { "ISO8859-5", "ISO-8859-5" },
    { "ISO8859-7", "ISO-8859-7" },
    { "ISO8859-9", "ISO-8859-9" },
    { "eucCN", "GB2312" },
    { "eucJP", "EUC-JP" },
    { "eucKR", "EUC-KR" },
    { "eucTW", "EUC-TW" }
#   define alias_table_defined
#  endif
#  if defined __osf__ /* OSF/1 */
    /*{ "GBK", "GBK" },*/
    { "ISO8859-1", "ISO-8859-1" },
    { "ISO8859-15", "ISO-8859-15" },
    { "ISO8859-2", "ISO-8859-2" },
    { "ISO8859-4", "ISO-8859-4" },
    { "ISO8859-5", "ISO-8859-5" },
    { "ISO8859-7", "ISO-8859-7" },
    { "ISO8859-8", "ISO-8859-8" },
    { "ISO8859-9", "ISO-8859-9" },
    { "KSC5601", "CP949" },
    { "SJIS", "SHIFT_JIS" },
    { "TACTIS", "TIS-620" },
    /*{ "UTF-8", "UTF-8" },*/
    { "big5", "BIG5" },
    { "cp850", "CP850" },
    { "dechanyu", "DEC-HANYU" },
    { "dechanzi", "GB2312" },
    { "deckanji", "DEC-KANJI" },
    { "deckorean", "EUC-KR" },
    { "eucJP", "EUC-JP" },
    { "eucKR", "EUC-KR" },
    { "eucTW", "EUC-TW" },
    { "sdeckanji", "EUC-JP" }
#   define alias_table_defined
#  endif
#  if defined __sun /* Solaris */
    { "5601", "EUC-KR" },
    { "646", "ASCII" },
    /*{ "BIG5", "BIG5" },*/
    { "Big5-HKSCS", "BIG5-HKSCS" },
    { "GB18030", "GB18030" },
    /*{ "GBK", "GBK" },*/
    { "ISO8859-1", "ISO-8859-1" },
    { "ISO8859-11", "TIS-620" },
    { "ISO8859-13", "ISO-8859-13" },
    { "ISO8859-15", "ISO-8859-15" },
    { "ISO8859-2", "ISO-8859-2" },
    { "ISO8859-3", "ISO-8859-3" },
    { "ISO8859-4", "ISO-8859-4" },
    { "ISO8859-5", "ISO-8859-5" },
    { "ISO8859-6", "ISO-8859-6" },
    { "ISO8859-7", "ISO-8859-7" },
    { "ISO8859-8", "ISO-8859-8" },
    { "ISO8859-9", "ISO-8859-9" },
    { "PCK", "SHIFT_JIS" },
    { "TIS620.2533", "TIS-620" },
    /*{ "UTF-8", "UTF-8" },*/
    { "ansi-1251", "CP1251" },
    { "cns11643", "EUC-TW" },
    { "eucJP", "EUC-JP" },
    { "gb2312", "GB2312" },
    { "koi8-r", "KOI8-R" }
#   define alias_table_defined
#  endif
#  if defined __minix /* Minix */
    { "646", "ASCII" }
#   define alias_table_defined
#  endif
#  if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Windows */
    { "CP1361", "JOHAB" },
    { "CP20127", "ASCII" },
    { "CP20866", "KOI8-R" },
    { "CP20936", "GB2312" },
    { "CP21866", "KOI8-RU" },
    { "CP28591", "ISO-8859-1" },
    { "CP28592", "ISO-8859-2" },
    { "CP28593", "ISO-8859-3" },
    { "CP28594", "ISO-8859-4" },
    { "CP28595", "ISO-8859-5" },
    { "CP28596", "ISO-8859-6" },
    { "CP28597", "ISO-8859-7" },
    { "CP28598", "ISO-8859-8" },
    { "CP28599", "ISO-8859-9" },
    { "CP28605", "ISO-8859-15" },
    { "CP38598", "ISO-8859-8" },
    { "CP51932", "EUC-JP" },
    { "CP51936", "GB2312" },
    { "CP51949", "EUC-KR" },
    { "CP51950", "EUC-TW" },
    { "CP54936", "GB18030" },
    { "CP65001", "UTF-8" },
    { "CP936", "GBK" }
#   define alias_table_defined
#  endif
#  if defined OS2 /* OS/2 */
    /* The list of encodings is taken from "List of OS/2 Codepages"
       by Alex Taylor:
       <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
       See also "IBM Globalization - Code page identifiers":
       <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. */
    { "CP1089", "ISO-8859-6" },
    { "CP1208", "UTF-8" },
    { "CP1381", "GB2312" },
    { "CP1386", "GBK" },
    { "CP3372", "EUC-JP" },
    { "CP813", "ISO-8859-7" },
    { "CP819", "ISO-8859-1" },
    { "CP878", "KOI8-R" },
    { "CP912", "ISO-8859-2" },
    { "CP913", "ISO-8859-3" },
    { "CP914", "ISO-8859-4" },
    { "CP915", "ISO-8859-5" },
    { "CP916", "ISO-8859-8" },
    { "CP920", "ISO-8859-9" },
    { "CP921", "ISO-8859-13" },
    { "CP923", "ISO-8859-15" },
    { "CP954", "EUC-JP" },
    { "CP964", "EUC-TW" },
    { "CP970", "EUC-KR" }
#   define alias_table_defined
#  endif
#  if defined VMS /* OpenVMS */
    /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
       "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
       section 10.7 "Handling Different Character Sets". */
    { "DECHANYU", "DEC-HANYU" },
    { "DECHANZI", "GB2312" },
    { "DECKANJI", "DEC-KANJI" },
    { "DECKOREAN", "EUC-KR" },
    { "ISO8859-1", "ISO-8859-1" },
    { "ISO8859-2", "ISO-8859-2" },
    { "ISO8859-5", "ISO-8859-5" },
    { "ISO8859-7", "ISO-8859-7" },
    { "ISO8859-8", "ISO-8859-8" },
    { "ISO8859-9", "ISO-8859-9" },
    { "SDECKANJI", "EUC-JP" },
    { "SJIS", "SHIFT_JIS" },
    { "eucJP", "EUC-JP" },
    { "eucTW", "EUC-TW" }
#   define alias_table_defined
#  endif
#  ifndef alias_table_defined
    /* Just a dummy entry, to avoid a C syntax error. */
    { "", "" }
#  endif
  };
429 # endif
431 #else
433 /* On these platforms, we use a mapping from locale name to GNU canonical
434 encoding name. */
/* One entry of the locale table.
   locale:    locale name as found in the environment (at most 17
              characters plus the terminating NUL).
   canonical: the corresponding GNU canonical encoding name (at most 11
              characters plus the terminating NUL).  */
struct table_entry
{
  const char locale[17+1];
  const char canonical[11+1];
};

/* Table of platform-dependent mappings, sorted in ascending order.
   The order is the one defined by strcmp() on the 'locale' member; the
   lookup in locale_charset() is a binary search and relies on it.  */
static const struct table_entry locale_table[] =
  {
# if defined __FreeBSD__ /* FreeBSD 4.2 */
    { "cs_CZ.ISO_8859-2", "ISO-8859-2" },
    { "da_DK.DIS_8859-15", "ISO-8859-15" },
    { "da_DK.ISO_8859-1", "ISO-8859-1" },
    { "de_AT.DIS_8859-15", "ISO-8859-15" },
    { "de_AT.ISO_8859-1", "ISO-8859-1" },
    { "de_CH.DIS_8859-15", "ISO-8859-15" },
    { "de_CH.ISO_8859-1", "ISO-8859-1" },
    { "de_DE.DIS_8859-15", "ISO-8859-15" },
    { "de_DE.ISO_8859-1", "ISO-8859-1" },
    { "en_AU.DIS_8859-15", "ISO-8859-15" },
    { "en_AU.ISO_8859-1", "ISO-8859-1" },
    { "en_CA.DIS_8859-15", "ISO-8859-15" },
    { "en_CA.ISO_8859-1", "ISO-8859-1" },
    { "en_GB.DIS_8859-15", "ISO-8859-15" },
    { "en_GB.ISO_8859-1", "ISO-8859-1" },
    { "en_US.DIS_8859-15", "ISO-8859-15" },
    { "en_US.ISO_8859-1", "ISO-8859-1" },
    { "es_ES.DIS_8859-15", "ISO-8859-15" },
    { "es_ES.ISO_8859-1", "ISO-8859-1" },
    { "fi_FI.DIS_8859-15", "ISO-8859-15" },
    { "fi_FI.ISO_8859-1", "ISO-8859-1" },
    { "fr_BE.DIS_8859-15", "ISO-8859-15" },
    { "fr_BE.ISO_8859-1", "ISO-8859-1" },
    { "fr_CA.DIS_8859-15", "ISO-8859-15" },
    { "fr_CA.ISO_8859-1", "ISO-8859-1" },
    { "fr_CH.DIS_8859-15", "ISO-8859-15" },
    { "fr_CH.ISO_8859-1", "ISO-8859-1" },
    { "fr_FR.DIS_8859-15", "ISO-8859-15" },
    { "fr_FR.ISO_8859-1", "ISO-8859-1" },
    { "hr_HR.ISO_8859-2", "ISO-8859-2" },
    { "hu_HU.ISO_8859-2", "ISO-8859-2" },
    { "is_IS.DIS_8859-15", "ISO-8859-15" },
    { "is_IS.ISO_8859-1", "ISO-8859-1" },
    { "it_CH.DIS_8859-15", "ISO-8859-15" },
    { "it_CH.ISO_8859-1", "ISO-8859-1" },
    { "it_IT.DIS_8859-15", "ISO-8859-15" },
    { "it_IT.ISO_8859-1", "ISO-8859-1" },
    { "ja_JP.EUC", "EUC-JP" },
    { "ja_JP.SJIS", "SHIFT_JIS" },
    { "ja_JP.Shift_JIS", "SHIFT_JIS" },
    { "ko_KR.EUC", "EUC-KR" },
    { "la_LN.ASCII", "ASCII" },
    { "la_LN.DIS_8859-15", "ISO-8859-15" },
    { "la_LN.ISO_8859-1", "ISO-8859-1" },
    { "la_LN.ISO_8859-2", "ISO-8859-2" },
    { "la_LN.ISO_8859-4", "ISO-8859-4" },
    { "lt_LN.ASCII", "ASCII" },
    { "lt_LN.DIS_8859-15", "ISO-8859-15" },
    { "lt_LN.ISO_8859-1", "ISO-8859-1" },
    { "lt_LN.ISO_8859-2", "ISO-8859-2" },
    { "lt_LT.ISO_8859-4", "ISO-8859-4" },
    { "nl_BE.DIS_8859-15", "ISO-8859-15" },
    { "nl_BE.ISO_8859-1", "ISO-8859-1" },
    { "nl_NL.DIS_8859-15", "ISO-8859-15" },
    { "nl_NL.ISO_8859-1", "ISO-8859-1" },
    { "no_NO.DIS_8859-15", "ISO-8859-15" },
    { "no_NO.ISO_8859-1", "ISO-8859-1" },
    { "pl_PL.ISO_8859-2", "ISO-8859-2" },
    { "pt_PT.DIS_8859-15", "ISO-8859-15" },
    { "pt_PT.ISO_8859-1", "ISO-8859-1" },
    { "ru_RU.CP866", "CP866" },
    { "ru_RU.ISO_8859-5", "ISO-8859-5" },
    { "ru_RU.KOI8-R", "KOI8-R" },
    { "ru_SU.CP866", "CP866" },
    { "ru_SU.ISO_8859-5", "ISO-8859-5" },
    { "ru_SU.KOI8-R", "KOI8-R" },
    { "sl_SI.ISO_8859-2", "ISO-8859-2" },
    { "sv_SE.DIS_8859-15", "ISO-8859-15" },
    { "sv_SE.ISO_8859-1", "ISO-8859-1" },
    { "uk_UA.KOI8-U", "KOI8-U" },
    { "zh_CN.EUC", "GB2312" },
    { "zh_TW.BIG5", "BIG5" },
    { "zh_TW.Big5", "BIG5" }
#  define locale_table_defined
# endif
# if defined __DJGPP__ /* DOS / DJGPP 2.03 */
    /* The encodings given here may not all be correct.
       If you find that the encoding given for your language and
       country is not the one your DOS machine actually uses, just
       correct it in this file, and send a mail to
       Juan Manuel Guerrero <juan.guerrero@gmx.de>
       and <bug-gnulib@gnu.org>. */
    { "C", "ASCII" },
    { "ar", "CP864" },
    { "ar_AE", "CP864" },
    { "ar_DZ", "CP864" },
    { "ar_EG", "CP864" },
    { "ar_IQ", "CP864" },
    { "ar_IR", "CP864" },
    { "ar_JO", "CP864" },
    { "ar_KW", "CP864" },
    { "ar_MA", "CP864" },
    { "ar_OM", "CP864" },
    { "ar_QA", "CP864" },
    { "ar_SA", "CP864" },
    { "ar_SY", "CP864" },
    { "be", "CP866" },
    { "be_BE", "CP866" },
    { "bg", "CP866" }, /* not CP855 ?? */
    { "bg_BG", "CP866" }, /* not CP855 ?? */
    { "ca", "CP850" },
    { "ca_ES", "CP850" },
    { "cs", "CP852" },
    { "cs_CZ", "CP852" },
    { "da", "CP865" }, /* not CP850 ?? */
    { "da_DK", "CP865" }, /* not CP850 ?? */
    { "de", "CP850" },
    { "de_AT", "CP850" },
    { "de_CH", "CP850" },
    { "de_DE", "CP850" },
    { "el", "CP869" },
    { "el_GR", "CP869" },
    { "en", "CP850" },
    { "en_AU", "CP850" }, /* not CP437 ?? */
    { "en_CA", "CP850" },
    { "en_GB", "CP850" },
    { "en_NZ", "CP437" },
    { "en_US", "CP437" },
    { "en_ZA", "CP850" }, /* not CP437 ?? */
    { "eo", "CP850" },
    { "eo_EO", "CP850" },
    { "es", "CP850" },
    { "es_AR", "CP850" },
    { "es_BO", "CP850" },
    { "es_CL", "CP850" },
    { "es_CO", "CP850" },
    { "es_CR", "CP850" },
    { "es_CU", "CP850" },
    { "es_DO", "CP850" },
    { "es_EC", "CP850" },
    { "es_ES", "CP850" },
    { "es_GT", "CP850" },
    { "es_HN", "CP850" },
    { "es_MX", "CP850" },
    { "es_NI", "CP850" },
    { "es_PA", "CP850" },
    { "es_PE", "CP850" },
    { "es_PY", "CP850" },
    { "es_SV", "CP850" },
    { "es_UY", "CP850" },
    { "es_VE", "CP850" },
    { "et", "CP850" },
    { "et_EE", "CP850" },
    { "eu", "CP850" },
    { "eu_ES", "CP850" },
    { "fi", "CP850" },
    { "fi_FI", "CP850" },
    { "fr", "CP850" },
    { "fr_BE", "CP850" },
    { "fr_CA", "CP850" },
    { "fr_CH", "CP850" },
    { "fr_FR", "CP850" },
    { "ga", "CP850" },
    { "ga_IE", "CP850" },
    { "gd", "CP850" },
    { "gd_GB", "CP850" },
    { "gl", "CP850" },
    { "gl_ES", "CP850" },
    { "he", "CP862" },
    { "he_IL", "CP862" },
    { "hr", "CP852" },
    { "hr_HR", "CP852" },
    { "hu", "CP852" },
    { "hu_HU", "CP852" },
    { "id", "CP850" }, /* not CP437 ?? */
    { "id_ID", "CP850" }, /* not CP437 ?? */
    { "is", "CP861" }, /* not CP850 ?? */
    { "is_IS", "CP861" }, /* not CP850 ?? */
    { "it", "CP850" },
    { "it_CH", "CP850" },
    { "it_IT", "CP850" },
    { "ja", "CP932" },
    { "ja_JP", "CP932" },
    { "kr", "CP949" }, /* not CP934 ?? */
    { "kr_KR", "CP949" }, /* not CP934 ?? */
    { "lt", "CP775" },
    { "lt_LT", "CP775" },
    { "lv", "CP775" },
    { "lv_LV", "CP775" },
    { "mk", "CP866" }, /* not CP855 ?? */
    { "mk_MK", "CP866" }, /* not CP855 ?? */
    { "mt", "CP850" },
    { "mt_MT", "CP850" },
    { "nb", "CP865" }, /* not CP850 ?? */
    { "nb_NO", "CP865" }, /* not CP850 ?? */
    { "nl", "CP850" },
    { "nl_BE", "CP850" },
    { "nl_NL", "CP850" },
    { "nn", "CP865" }, /* not CP850 ?? */
    { "nn_NO", "CP865" }, /* not CP850 ?? */
    { "no", "CP865" }, /* not CP850 ?? */
    { "no_NO", "CP865" }, /* not CP850 ?? */
    { "pl", "CP852" },
    { "pl_PL", "CP852" },
    { "pt", "CP850" },
    { "pt_BR", "CP850" },
    { "pt_PT", "CP850" },
    { "ro", "CP852" },
    { "ro_RO", "CP852" },
    { "ru", "CP866" },
    { "ru_RU", "CP866" },
    { "sk", "CP852" },
    { "sk_SK", "CP852" },
    { "sl", "CP852" },
    { "sl_SI", "CP852" },
    { "sq", "CP852" },
    { "sq_AL", "CP852" },
    { "sr", "CP852" }, /* CP852 or CP866 or CP855 ?? */
    { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
    { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
    { "sv", "CP850" },
    { "sv_SE", "CP850" },
    { "th", "CP874" },
    { "th_TH", "CP874" },
    { "tr", "CP857" },
    { "tr_TR", "CP857" },
    { "uk", "CP1125" },
    { "uk_UA", "CP1125" },
    { "zh_CN", "GBK" },
    { "zh_TW", "CP950" } /* not CP938 ?? */
#  define locale_table_defined
# endif
# ifndef locale_table_defined
    /* Just a dummy entry, to avoid a C syntax error. */
    { "", "" }
# endif
  };
674 #endif
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in localcharset.h.
   The result must not be freed.  Depending on the platform and code path it
   is a string literal, an entry of the mapping table, a function-static
   buffer (Cygwin, native Windows, OS/2) that a later call overwrites, or a
   pointer into the environment string returned by getenv().
   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never NULL; the table-lookup paths additionally
   never yield an empty string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;

#if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier >= dot here, so the difference is non-negative.
                 If the encoding does not fit into buf, fall through to the
                 codepage-based name below.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

# elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as
     a number, but the value doesn't change according to what the
     'setlocale' call specified.  So we use it as a last resort, in
     case the string returned by 'setlocale' doesn't specify the
     codepage.  */
  char *current_locale = setlocale (LC_ALL, NULL);
  char *pdot;

  /* If they set different locales for different categories,
     'setlocale' will return a semi-colon separated list of locale
     values.  To make sure we use the correct one, we choose LC_CTYPE.  */
  if (strchr (current_locale, ';'))
    current_locale = setlocale (LC_CTYPE, NULL);

  pdot = strrchr (current_locale, '.');
  /* Use the suffix after the last dot only if "CP" + suffix + NUL fits.  */
  if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
    sprintf (buf, "CP%s", pdot + 1);
  else
    {
      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().
         When the output goes to a console window, it needs to be provided in
         GetOEMCP() encoding if the console is using a raster font, or in
         GetConsoleOutputCP() encoding if it is using a TrueType font.
         But in GUI programs and for output sent to files and pipes, GetACP()
         encoding is the best bet.  */
      sprintf (buf, "CP%u", GetACP ());
    }
  /* For a locale name such as "French_France.65001", in Windows 10,
     setlocale now returns "French_France.utf8" instead.  */
  if (strcmp (buf + 2, "65001") == 0 || strcmp (buf + 2, "utf8") == 0)
    codeset = "UTF-8";
  else
    codeset = buf;

# elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  codeset = NULL;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here, so the difference is non-negative.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* For the POSIX locale, don't use the system's codepage.  */
      if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
        codeset = "";
    }

  if (codeset == NULL)
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it explicitly so that the argument
             type matches the %u conversion and the result is bounded by
             the 10 digits that buf reserves.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

# else

#  error "Add code for other platforms here."

# endif

  /* Resolve alias.  */
  {
# ifdef alias_table_defined
    /* On some platforms, UTF-8 locales are the most frequently used ones.
       Speed up the common case and slow down the less common cases by
       testing for this case first.  */
#  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
    if (strcmp (codeset, "UTF-8") == 0)
      goto done_table_lookup;
    else
#  endif
      {
        const struct table_entry * const table = alias_table;
        size_t const table_size =
          sizeof (alias_table) / sizeof (struct table_entry);
        /* The table is sorted.  Perform a binary search.  */
        size_t hi = table_size;
        size_t lo = 0;
        while (lo < hi)
          {
            /* Invariant:
               for i < lo, strcmp (table[i].alias, codeset) < 0,
               for i >= hi, strcmp (table[i].alias, codeset) > 0.  */
            size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
            int cmp = strcmp (table[mid].alias, codeset);
            if (cmp < 0)
              lo = mid + 1;
            else if (cmp > 0)
              hi = mid;
            else
              {
                /* Found an i with
                   strcmp (table[i].alias, codeset) == 0.  */
                codeset = table[mid].canonical;
                goto done_table_lookup;
              }
          }
      }
    /* The label sits in a never-taken 'if (0)' branch so that a successful
       lookup skips the "not found" handling in the 'else' part below.  */
    if (0)
      done_table_lookup: ;
    else
# endif
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        if (codeset[0] == '\0')
          codeset = "ASCII";
# endif
      }
  }

#else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like DJGPP) have only the C locale.  Therefore we don't use setlocale
     here; it would return "C" when it doesn't support the locale name the
     user has set.  */
# if 0
  locale = setlocale (LC_CTYPE, NULL);
# endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
          /* Guarantee a non-NULL string for the strcmp() calls below.  */
          if (locale == NULL)
            locale = "";
        }
    }

  /* Map locale name to canonical encoding name.  */
  {
# ifdef locale_table_defined
    const struct table_entry * const table = locale_table;
    size_t const table_size =
      sizeof (locale_table) / sizeof (struct table_entry);
    /* The table is sorted.  Perform a binary search.  */
    size_t hi = table_size;
    size_t lo = 0;
    while (lo < hi)
      {
        /* Invariant:
           for i < lo, strcmp (table[i].locale, locale) < 0,
           for i >= hi, strcmp (table[i].locale, locale) > 0.  */
        size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
        int cmp = strcmp (table[mid].locale, locale);
        if (cmp < 0)
          lo = mid + 1;
        else if (cmp > 0)
          hi = mid;
        else
          {
            /* Found an i with
               strcmp (table[i].locale, locale) == 0.  */
            codeset = table[mid].canonical;
            goto done_table_lookup;
          }
      }
    /* The label sits in a never-taken 'if (0)' branch so that a successful
       lookup skips the "not found" handling in the 'else' part below.  */
    if (0)
      done_table_lookup: ;
    else
# endif
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* The canonical name cannot be determined.  */
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        codeset = "ASCII";
# endif
      }
  }

#endif

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}