libcharset/lib/localcharset.c

   1 /* Determine a canonical name for the current locale's character encoding.
   2
   3    Copyright (C) 2000-2006, 2008-2019 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify it
   6    under the terms of the GNU Library General Public License as published
   7    by the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public License
  16    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
  17
  18 /* Written by Bruno Haible <bruno@clisp.org>.  */
  19
  20 #include <config.h>
  21
  22 /* Specification.  */
  23 #include "localcharset.h"
  24
  25 #include <stddef.h>
  26 #include <stdio.h>
  27 #include <string.h>
  28 #include <stdlib.h>
  29
  30 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
  31 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
  32 #endif
  33
  34 #if defined _WIN32 && !defined __CYGWIN__
  35 # define WINDOWS_NATIVE
  36 # include <locale.h>
  37 #endif
  38
  39 #if defined __EMX__
  40 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
  41 # ifndef OS2
  42 #  define OS2
  43 # endif
  44 #endif
  45
  46 #if !defined WINDOWS_NATIVE
  47 # if HAVE_LANGINFO_CODESET
  48 #  include <langinfo.h>
  49 # else
  50 #  if 0 /* see comment regarding use of setlocale(), below */
  51 #   include <locale.h>
  52 #  endif
  53 # endif
  54 # ifdef __CYGWIN__
  55 #  define WIN32_LEAN_AND_MEAN
  56 #  include <windows.h>
  57 # endif
  58 #elif defined WINDOWS_NATIVE
  59 # define WIN32_LEAN_AND_MEAN
  60 # include <windows.h>
  61 #endif
  62 #if defined OS2
  63 # define INCL_DOS
  64 # include <os2.h>
  65 #endif
  66
  67 /* For MB_CUR_MAX_L */
  68 #if defined DARWIN7
  69 # include <xlocale.h>
  70 #endif
  71
  72
  73 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
  74
  75 /* On these platforms, we use a mapping from non-canonical encoding name
  76    to GNU canonical encoding name.  */
  77
  78 /* With glibc-2.1 or newer, we don't need any canonicalization,
  79    because glibc has iconv and both glibc and libiconv support all
  80    GNU canonical names directly.  */
  81 # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
  82
  83 struct table_entry
  84 {
       /* One row of alias_table.  Both strings are stored inline in
          fixed-size arrays, each with room for 11 characters plus the
          terminating NUL.  */
  85   const char alias[11+1];      /* non-canonical (platform-specific) encoding name */
  86   const char canonical[11+1];  /* corresponding GNU canonical encoding name */
  87 };
  88
  89 /* Table of platform-dependent mappings, sorted in ascending order.
        Each entry maps a platform-specific encoding name (alias) to the GNU
        canonical name.  The entries of a section are kept in ascending ASCII
        order of alias (digits, then uppercase, then lowercase letters)
        because locale_charset performs a binary search on this table.
        The platform sections are not separated by commas, so at most one of
        them may be active in a given build.  Every section defines
        alias_table_defined; when no section is active, the dummy entry at the
        end keeps the initializer non-empty.  locale_charset tests the macro
        in its alias-resolution step.  */
  90 static const struct table_entry alias_table[] =
  91   {
  92 #  if defined __FreeBSD__                                   /* FreeBSD */
  93   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
  94     { "Big5",       "BIG5" },
  95     { "C",          "ASCII" },
  96   /*{ "CP1131",     "CP1131" },*/
  97   /*{ "CP1251",     "CP1251" },*/
  98   /*{ "CP866",      "CP866" },*/
  99   /*{ "GB18030",    "GB18030" },*/
 100   /*{ "GB2312",     "GB2312" },*/
 101   /*{ "GBK",        "GBK" },*/
 102   /*{ "ISCII-DEV",  "?" },*/
 103     { "ISO8859-1",  "ISO-8859-1" },
 104     { "ISO8859-13", "ISO-8859-13" },
 105     { "ISO8859-15", "ISO-8859-15" },
 106     { "ISO8859-2",  "ISO-8859-2" },
 107     { "ISO8859-5",  "ISO-8859-5" },
 108     { "ISO8859-7",  "ISO-8859-7" },
 109     { "ISO8859-9",  "ISO-8859-9" },
 110   /*{ "KOI8-R",     "KOI8-R" },*/
 111   /*{ "KOI8-U",     "KOI8-U" },*/
 112     { "SJIS",       "SHIFT_JIS" },
 113     { "US-ASCII",   "ASCII" },
 114     { "eucCN",      "GB2312" },
 115     { "eucJP",      "EUC-JP" },
 116     { "eucKR",      "EUC-KR" }
 117 #   define alias_table_defined
 118 #  endif
 119 #  if defined __NetBSD__                                    /* NetBSD */
 120     { "646",        "ASCII" },
 121   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
 122   /*{ "BIG5",       "BIG5" },*/
 123     { "Big5-HKSCS", "BIG5-HKSCS" },
 124   /*{ "CP1251",     "CP1251" },*/
 125   /*{ "CP866",      "CP866" },*/
 126   /*{ "GB18030",    "GB18030" },*/
 127   /*{ "GB2312",     "GB2312" },*/
 128     { "ISO8859-1",  "ISO-8859-1" },
 129     { "ISO8859-13", "ISO-8859-13" },
 130     { "ISO8859-15", "ISO-8859-15" },
 131     { "ISO8859-2",  "ISO-8859-2" },
 132     { "ISO8859-4",  "ISO-8859-4" },
 133     { "ISO8859-5",  "ISO-8859-5" },
 134     { "ISO8859-7",  "ISO-8859-7" },
 135   /*{ "KOI8-R",     "KOI8-R" },*/
 136   /*{ "KOI8-U",     "KOI8-U" },*/
 137   /*{ "PT154",      "PT154" },*/
 138     { "SJIS",       "SHIFT_JIS" },
 139     { "eucCN",      "GB2312" },
 140     { "eucJP",      "EUC-JP" },
 141     { "eucKR",      "EUC-KR" },
 142     { "eucTW",      "EUC-TW" }
 143 #   define alias_table_defined
 144 #  endif
 145 #  if defined __OpenBSD__                                   /* OpenBSD */
 146     { "646",        "ASCII" },
 147     { "ISO8859-1",  "ISO-8859-1" },
 148     { "ISO8859-13", "ISO-8859-13" },
 149     { "ISO8859-15", "ISO-8859-15" },
 150     { "ISO8859-2",  "ISO-8859-2" },
 151     { "ISO8859-4",  "ISO-8859-4" },
 152     { "ISO8859-5",  "ISO-8859-5" },
 153     { "ISO8859-7",  "ISO-8859-7" }
 154 #   define alias_table_defined
 155 #  endif
 156 #  if defined __APPLE__ && defined __MACH__                 /* Mac OS X */
 157     /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
 158        useless:
 159        - It returns the empty string when LANG is set to a locale of the
 160          form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
 161          LC_CTYPE file.
 162        - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
 163          the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
 164        - The documentation says:
 165            "... all code that calls BSD system routines should ensure
 166             that the const *char parameters of these routines are in UTF-8
 167             encoding. All BSD system functions expect their string
 168             parameters to be in UTF-8 encoding and nothing else."
 169          It also says
 170            "An additional caveat is that string parameters for files,
 171             paths, and other file-system entities must be in canonical
 172             UTF-8. In a canonical UTF-8 Unicode string, all decomposable
 173             characters are decomposed ..."
 174          but this is not true: You can pass non-decomposed UTF-8 strings
 175          to file system functions, and it is the OS which will convert
 176          them to decomposed UTF-8 before accessing the file system.
 177        - The Apple Terminal application displays UTF-8 by default.
 178        - However, other applications are free to use different encodings:
 179          - xterm uses ISO-8859-1 by default.
 180          - TextEdit uses MacRoman by default.
 181        We prefer UTF-8 over decomposed UTF-8-MAC because one should
 182        minimize the use of decomposed Unicode. Unfortunately, through the
 183        Darwin file system, decomposed UTF-8 strings are leaked into user
 184        space nevertheless.
 185        Then there are also the locales with encodings other than US-ASCII
 186        and UTF-8. These locales can be occasionally useful to users (e.g.
 187        when grepping through ISO-8859-1 encoded text files), when all their
 188        file names are in US-ASCII.
 189      */
 190     { "ARMSCII-8",  "ARMSCII-8" },
 191     { "Big5",       "BIG5" },
 192     { "Big5HKSCS",  "BIG5-HKSCS" },
 193     { "CP1131",     "CP1131" },
 194     { "CP1251",     "CP1251" },
 195     { "CP866",      "CP866" },
 196     { "CP949",      "CP949" },
 197     { "GB18030",    "GB18030" },
 198     { "GB2312",     "GB2312" },
 199     { "GBK",        "GBK" },
 200   /*{ "ISCII-DEV",  "?" },*/
 201     { "ISO8859-1",  "ISO-8859-1" },
 202     { "ISO8859-13", "ISO-8859-13" },
 203     { "ISO8859-15", "ISO-8859-15" },
 204     { "ISO8859-2",  "ISO-8859-2" },
 205     { "ISO8859-4",  "ISO-8859-4" },
 206     { "ISO8859-5",  "ISO-8859-5" },
 207     { "ISO8859-7",  "ISO-8859-7" },
 208     { "ISO8859-9",  "ISO-8859-9" },
 209     { "KOI8-R",     "KOI8-R" },
 210     { "KOI8-U",     "KOI8-U" },
 211     { "PT154",      "PT154" },
 212     { "SJIS",       "SHIFT_JIS" },
 213     { "eucCN",      "GB2312" },
 214     { "eucJP",      "EUC-JP" },
 215     { "eucKR",      "EUC-KR" }
 216 #   define alias_table_defined
 217 #  endif
 218 #  if defined _AIX                                          /* AIX */
 219   /*{ "GBK",        "GBK" },*/
 220     { "IBM-1046",   "CP1046" },
 221     { "IBM-1124",   "CP1124" },
 222     { "IBM-1129",   "CP1129" },
 223     { "IBM-1252",   "CP1252" },
 224     { "IBM-850",    "CP850" },
 225     { "IBM-856",    "CP856" },
 226     { "IBM-921",    "ISO-8859-13" },
 227     { "IBM-922",    "CP922" },
 228     { "IBM-932",    "CP932" },
 229     { "IBM-943",    "CP943" },
 230     { "IBM-eucCN",  "GB2312" },
 231     { "IBM-eucJP",  "EUC-JP" },
 232     { "IBM-eucKR",  "EUC-KR" },
 233     { "IBM-eucTW",  "EUC-TW" },
 234     { "ISO8859-1",  "ISO-8859-1" },
 235     { "ISO8859-15", "ISO-8859-15" },
 236     { "ISO8859-2",  "ISO-8859-2" },
 237     { "ISO8859-5",  "ISO-8859-5" },
 238     { "ISO8859-6",  "ISO-8859-6" },
 239     { "ISO8859-7",  "ISO-8859-7" },
 240     { "ISO8859-8",  "ISO-8859-8" },
 241     { "ISO8859-9",  "ISO-8859-9" },
 242     { "TIS-620",    "TIS-620" },
 243   /*{ "UTF-8",      "UTF-8" },*/
 244     { "big5",       "BIG5" }
 245 #   define alias_table_defined
 246 #  endif
 247 #  if defined __hpux                                        /* HP-UX */
 248     { "SJIS",      "SHIFT_JIS" },
 249     { "arabic8",   "HP-ARABIC8" },
 250     { "big5",      "BIG5" },
 251     { "cp1251",    "CP1251" },
 252     { "eucJP",     "EUC-JP" },
 253     { "eucKR",     "EUC-KR" },
 254     { "eucTW",     "EUC-TW" },
 255     { "gb18030",   "GB18030" },
 256     { "greek8",    "HP-GREEK8" },
 257     { "hebrew8",   "HP-HEBREW8" },
 258     { "hkbig5",    "BIG5-HKSCS" },
 259     { "hp15CN",    "GB2312" },
 260     { "iso88591",  "ISO-8859-1" },
 261     { "iso885913", "ISO-8859-13" },
 262     { "iso885915", "ISO-8859-15" },
 263     { "iso88592",  "ISO-8859-2" },
 264     { "iso88594",  "ISO-8859-4" },
 265     { "iso88595",  "ISO-8859-5" },
 266     { "iso88596",  "ISO-8859-6" },
 267     { "iso88597",  "ISO-8859-7" },
 268     { "iso88598",  "ISO-8859-8" },
 269     { "iso88599",  "ISO-8859-9" },
 270     { "kana8",     "HP-KANA8" },
 271     { "koi8r",     "KOI8-R" },
 272     { "roman8",    "HP-ROMAN8" },
 273     { "tis620",    "TIS-620" },
 274     { "turkish8",  "HP-TURKISH8" },
 275     { "utf8",      "UTF-8" }
 276 #   define alias_table_defined
 277 #  endif
 278 #  if defined __sgi                                         /* IRIX */
 279     { "ISO8859-1",  "ISO-8859-1" },
 280     { "ISO8859-15", "ISO-8859-15" },
 281     { "ISO8859-2",  "ISO-8859-2" },
 282     { "ISO8859-5",  "ISO-8859-5" },
 283     { "ISO8859-7",  "ISO-8859-7" },
 284     { "ISO8859-9",  "ISO-8859-9" },
 285     { "eucCN",      "GB2312" },
 286     { "eucJP",      "EUC-JP" },
 287     { "eucKR",      "EUC-KR" },
 288     { "eucTW",      "EUC-TW" }
 289 #   define alias_table_defined
 290 #  endif
 291 #  if defined __osf__                                       /* OSF/1 */
 292   /*{ "GBK",        "GBK" },*/
 293     { "ISO8859-1",  "ISO-8859-1" },
 294     { "ISO8859-15", "ISO-8859-15" },
 295     { "ISO8859-2",  "ISO-8859-2" },
 296     { "ISO8859-4",  "ISO-8859-4" },
 297     { "ISO8859-5",  "ISO-8859-5" },
 298     { "ISO8859-7",  "ISO-8859-7" },
 299     { "ISO8859-8",  "ISO-8859-8" },
 300     { "ISO8859-9",  "ISO-8859-9" },
 301     { "KSC5601",    "CP949" },
 302     { "SJIS",       "SHIFT_JIS" },
 303     { "TACTIS",     "TIS-620" },
 304   /*{ "UTF-8",      "UTF-8" },*/
 305     { "big5",       "BIG5" },
 306     { "cp850",      "CP850" },
 307     { "dechanyu",   "DEC-HANYU" },
 308     { "dechanzi",   "GB2312" },
 309     { "deckanji",   "DEC-KANJI" },
 310     { "deckorean",  "EUC-KR" },
 311     { "eucJP",      "EUC-JP" },
 312     { "eucKR",      "EUC-KR" },
 313     { "eucTW",      "EUC-TW" },
 314     { "sdeckanji",  "EUC-JP" }
 315 #   define alias_table_defined
 316 #  endif
 317 #  if defined __sun                                         /* Solaris */
 318     { "5601",        "EUC-KR" },
 319     { "646",         "ASCII" },
 320   /*{ "BIG5",        "BIG5" },*/
 321     { "Big5-HKSCS",  "BIG5-HKSCS" },
 322     { "GB18030",     "GB18030" },
 323   /*{ "GBK",         "GBK" },*/
 324     { "ISO8859-1",   "ISO-8859-1" },
 325     { "ISO8859-11",  "TIS-620" },
 326     { "ISO8859-13",  "ISO-8859-13" },
 327     { "ISO8859-15",  "ISO-8859-15" },
 328     { "ISO8859-2",   "ISO-8859-2" },
 329     { "ISO8859-3",   "ISO-8859-3" },
 330     { "ISO8859-4",   "ISO-8859-4" },
 331     { "ISO8859-5",   "ISO-8859-5" },
 332     { "ISO8859-6",   "ISO-8859-6" },
 333     { "ISO8859-7",   "ISO-8859-7" },
 334     { "ISO8859-8",   "ISO-8859-8" },
 335     { "ISO8859-9",   "ISO-8859-9" },
 336     { "PCK",         "SHIFT_JIS" },
 337     { "TIS620.2533", "TIS-620" },
 338   /*{ "UTF-8",       "UTF-8" },*/
 339     { "ansi-1251",   "CP1251" },
 340     { "cns11643",    "EUC-TW" },
 341     { "eucJP",       "EUC-JP" },
 342     { "gb2312",      "GB2312" },
 343     { "koi8-r",      "KOI8-R" }
 344 #   define alias_table_defined
 345 #  endif
 346 #  if defined __minix                                       /* Minix */
 347     { "646", "ASCII" }
 348 #   define alias_table_defined
 349 #  endif
 350 #  if defined WINDOWS_NATIVE || defined __CYGWIN__          /* Windows */
 351     { "CP1361",  "JOHAB" },
 352     { "CP20127", "ASCII" },
 353     { "CP20866", "KOI8-R" },
 354     { "CP20936", "GB2312" },
 355     { "CP21866", "KOI8-RU" },
 356     { "CP28591", "ISO-8859-1" },
 357     { "CP28592", "ISO-8859-2" },
 358     { "CP28593", "ISO-8859-3" },
 359     { "CP28594", "ISO-8859-4" },
 360     { "CP28595", "ISO-8859-5" },
 361     { "CP28596", "ISO-8859-6" },
 362     { "CP28597", "ISO-8859-7" },
 363     { "CP28598", "ISO-8859-8" },
 364     { "CP28599", "ISO-8859-9" },
 365     { "CP28605", "ISO-8859-15" },
 366     { "CP38598", "ISO-8859-8" },
 367     { "CP51932", "EUC-JP" },
 368     { "CP51936", "GB2312" },
 369     { "CP51949", "EUC-KR" },
 370     { "CP51950", "EUC-TW" },
 371     { "CP54936", "GB18030" },
 372     { "CP65001", "UTF-8" },
 373     { "CP936",   "GBK" }
 374 #   define alias_table_defined
 375 #  endif
 376 #  if defined OS2                                           /* OS/2 */
 377     /* The list of encodings is taken from "List of OS/2 Codepages"
 378        by Alex Taylor:
 379        <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
 380        See also "IBM Globalization - Code page identifiers":
 381        <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>.  */
 382     { "CP1089", "ISO-8859-6" },
 383     { "CP1208", "UTF-8" },
 384     { "CP1381", "GB2312" },
 385     { "CP1386", "GBK" },
 386     { "CP3372", "EUC-JP" },
 387     { "CP813",  "ISO-8859-7" },
 388     { "CP819",  "ISO-8859-1" },
 389     { "CP878",  "KOI8-R" },
 390     { "CP912",  "ISO-8859-2" },
 391     { "CP913",  "ISO-8859-3" },
 392     { "CP914",  "ISO-8859-4" },
 393     { "CP915",  "ISO-8859-5" },
 394     { "CP916",  "ISO-8859-8" },
 395     { "CP920",  "ISO-8859-9" },
 396     { "CP921",  "ISO-8859-13" },
 397     { "CP923",  "ISO-8859-15" },
 398     { "CP954",  "EUC-JP" },
 399     { "CP964",  "EUC-TW" },
 400     { "CP970",  "EUC-KR" }
 401 #   define alias_table_defined
 402 #  endif
 403 #  if defined VMS                                           /* OpenVMS */
 404     /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
 405        "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
 406        section 10.7 "Handling Different Character Sets".  */
 407     { "DECHANYU",  "DEC-HANYU" },
 408     { "DECHANZI",  "GB2312" },
 409     { "DECKANJI",  "DEC-KANJI" },
 410     { "DECKOREAN", "EUC-KR" },
 411     { "ISO8859-1", "ISO-8859-1" },
 412     { "ISO8859-2", "ISO-8859-2" },
 413     { "ISO8859-5", "ISO-8859-5" },
 414     { "ISO8859-7", "ISO-8859-7" },
 415     { "ISO8859-8", "ISO-8859-8" },
 416     { "ISO8859-9", "ISO-8859-9" },
 417     { "SDECKANJI", "EUC-JP" },
 418     { "SJIS",      "SHIFT_JIS" },
 419     { "eucJP",     "EUC-JP" },
 420     { "eucTW",     "EUC-TW" }
 421 #   define alias_table_defined
 422 #  endif
 423 #  ifndef alias_table_defined
 424     /* Just a dummy entry, to avoid a C syntax error.  */
         /* It is emitted only when none of the platform sections above was
            selected, i.e. when the initializer would otherwise be empty.  */
 425     { "", "" }
 426 #  endif
 427   };
 428
 429 # endif
 430
 431 #else
 432
 433 /* On these platforms, we use a mapping from locale name to GNU canonical
 434    encoding name.  */
 435
 436 struct table_entry
 437 {
       /* One row of locale_table.  Both strings are stored inline in
          fixed-size arrays: 17 characters plus NUL for the locale name,
          11 characters plus NUL for the encoding name.  */
 438   const char locale[17+1];     /* locale name, e.g. "de_DE.ISO_8859-1" */
 439   const char canonical[11+1];  /* GNU canonical encoding name for that locale */
 440 };
 441
 442 /* Table of platform-dependent mappings, sorted in ascending order.
         Each entry maps a locale name to the GNU canonical name of the
         encoding used by that locale.  The entries of a section are kept in
         ascending ASCII order of the locale field (uppercase before lowercase
         letters).  This table is compiled only on platforms that have neither
         nl_langinfo (CODESET) nor the native Windows or OS/2 code paths.
         The platform sections are not separated by commas, so at most one of
         them may be active in a given build.  Every section defines
         locale_table_defined; when no section is active, the dummy entry at
         the end keeps the initializer non-empty.  */
 443 static const struct table_entry locale_table[] =
 444   {
 445 # if defined __FreeBSD__                                    /* FreeBSD 4.2 */
 446     { "cs_CZ.ISO_8859-2",  "ISO-8859-2" },
 447     { "da_DK.DIS_8859-15", "ISO-8859-15" },
 448     { "da_DK.ISO_8859-1",  "ISO-8859-1" },
 449     { "de_AT.DIS_8859-15", "ISO-8859-15" },
 450     { "de_AT.ISO_8859-1",  "ISO-8859-1" },
 451     { "de_CH.DIS_8859-15", "ISO-8859-15" },
 452     { "de_CH.ISO_8859-1",  "ISO-8859-1" },
 453     { "de_DE.DIS_8859-15", "ISO-8859-15" },
 454     { "de_DE.ISO_8859-1",  "ISO-8859-1" },
 455     { "en_AU.DIS_8859-15", "ISO-8859-15" },
 456     { "en_AU.ISO_8859-1",  "ISO-8859-1" },
 457     { "en_CA.DIS_8859-15", "ISO-8859-15" },
 458     { "en_CA.ISO_8859-1",  "ISO-8859-1" },
 459     { "en_GB.DIS_8859-15", "ISO-8859-15" },
 460     { "en_GB.ISO_8859-1",  "ISO-8859-1" },
 461     { "en_US.DIS_8859-15", "ISO-8859-15" },
 462     { "en_US.ISO_8859-1",  "ISO-8859-1" },
 463     { "es_ES.DIS_8859-15", "ISO-8859-15" },
 464     { "es_ES.ISO_8859-1",  "ISO-8859-1" },
 465     { "fi_FI.DIS_8859-15", "ISO-8859-15" },
 466     { "fi_FI.ISO_8859-1",  "ISO-8859-1" },
 467     { "fr_BE.DIS_8859-15", "ISO-8859-15" },
 468     { "fr_BE.ISO_8859-1",  "ISO-8859-1" },
 469     { "fr_CA.DIS_8859-15", "ISO-8859-15" },
 470     { "fr_CA.ISO_8859-1",  "ISO-8859-1" },
 471     { "fr_CH.DIS_8859-15", "ISO-8859-15" },
 472     { "fr_CH.ISO_8859-1",  "ISO-8859-1" },
 473     { "fr_FR.DIS_8859-15", "ISO-8859-15" },
 474     { "fr_FR.ISO_8859-1",  "ISO-8859-1" },
 475     { "hr_HR.ISO_8859-2",  "ISO-8859-2" },
 476     { "hu_HU.ISO_8859-2",  "ISO-8859-2" },
 477     { "is_IS.DIS_8859-15", "ISO-8859-15" },
 478     { "is_IS.ISO_8859-1",  "ISO-8859-1" },
 479     { "it_CH.DIS_8859-15", "ISO-8859-15" },
 480     { "it_CH.ISO_8859-1",  "ISO-8859-1" },
 481     { "it_IT.DIS_8859-15", "ISO-8859-15" },
 482     { "it_IT.ISO_8859-1",  "ISO-8859-1" },
 483     { "ja_JP.EUC",         "EUC-JP" },
 484     { "ja_JP.SJIS",        "SHIFT_JIS" },
 485     { "ja_JP.Shift_JIS",   "SHIFT_JIS" },
 486     { "ko_KR.EUC",         "EUC-KR" },
 487     { "la_LN.ASCII",       "ASCII" },
 488     { "la_LN.DIS_8859-15", "ISO-8859-15" },
 489     { "la_LN.ISO_8859-1",  "ISO-8859-1" },
 490     { "la_LN.ISO_8859-2",  "ISO-8859-2" },
 491     { "la_LN.ISO_8859-4",  "ISO-8859-4" },
 492     { "lt_LN.ASCII",       "ASCII" },
 493     { "lt_LN.DIS_8859-15", "ISO-8859-15" },
 494     { "lt_LN.ISO_8859-1",  "ISO-8859-1" },
 495     { "lt_LN.ISO_8859-2",  "ISO-8859-2" },
 496     { "lt_LT.ISO_8859-4",  "ISO-8859-4" },
 497     { "nl_BE.DIS_8859-15", "ISO-8859-15" },
 498     { "nl_BE.ISO_8859-1",  "ISO-8859-1" },
 499     { "nl_NL.DIS_8859-15", "ISO-8859-15" },
 500     { "nl_NL.ISO_8859-1",  "ISO-8859-1" },
 501     { "no_NO.DIS_8859-15", "ISO-8859-15" },
 502     { "no_NO.ISO_8859-1",  "ISO-8859-1" },
 503     { "pl_PL.ISO_8859-2",  "ISO-8859-2" },
 504     { "pt_PT.DIS_8859-15", "ISO-8859-15" },
 505     { "pt_PT.ISO_8859-1",  "ISO-8859-1" },
 506     { "ru_RU.CP866",       "CP866" },
 507     { "ru_RU.ISO_8859-5",  "ISO-8859-5" },
 508     { "ru_RU.KOI8-R",      "KOI8-R" },
 509     { "ru_SU.CP866",       "CP866" },
 510     { "ru_SU.ISO_8859-5",  "ISO-8859-5" },
 511     { "ru_SU.KOI8-R",      "KOI8-R" },
 512     { "sl_SI.ISO_8859-2",  "ISO-8859-2" },
 513     { "sv_SE.DIS_8859-15", "ISO-8859-15" },
 514     { "sv_SE.ISO_8859-1",  "ISO-8859-1" },
 515     { "uk_UA.KOI8-U",      "KOI8-U" },
 516     { "zh_CN.EUC",         "GB2312" },
 517     { "zh_TW.BIG5",        "BIG5" },
 518     { "zh_TW.Big5",        "BIG5" }
 519 #  define locale_table_defined
 520 # endif
 521 # if defined __DJGPP__                                      /* DOS / DJGPP 2.03 */
 522     /* The encodings given here may not all be correct.
 523        If you find that the encoding given for your language and
 524        country is not the one your DOS machine actually uses, just
 525        correct it in this file, and send a mail to
 526        Juan Manuel Guerrero <juan.guerrero@gmx.de>
 527        and <bug-gnulib@gnu.org>.  */
 528     { "C",     "ASCII" },
 529     { "ar",    "CP864" },
 530     { "ar_AE", "CP864" },
 531     { "ar_DZ", "CP864" },
 532     { "ar_EG", "CP864" },
 533     { "ar_IQ", "CP864" },
 534     { "ar_IR", "CP864" },
 535     { "ar_JO", "CP864" },
 536     { "ar_KW", "CP864" },
 537     { "ar_MA", "CP864" },
 538     { "ar_OM", "CP864" },
 539     { "ar_QA", "CP864" },
 540     { "ar_SA", "CP864" },
 541     { "ar_SY", "CP864" },
 542     { "be",    "CP866" },
 543     { "be_BE", "CP866" },
 544     { "bg",    "CP866" }, /* not CP855 ?? */
 545     { "bg_BG", "CP866" }, /* not CP855 ?? */
 546     { "ca",    "CP850" },
 547     { "ca_ES", "CP850" },
 548     { "cs",    "CP852" },
 549     { "cs_CZ", "CP852" },
 550     { "da",    "CP865" }, /* not CP850 ?? */
 551     { "da_DK", "CP865" }, /* not CP850 ?? */
 552     { "de",    "CP850" },
 553     { "de_AT", "CP850" },
 554     { "de_CH", "CP850" },
 555     { "de_DE", "CP850" },
 556     { "el",    "CP869" },
 557     { "el_GR", "CP869" },
 558     { "en",    "CP850" },
 559     { "en_AU", "CP850" }, /* not CP437 ?? */
 560     { "en_CA", "CP850" },
 561     { "en_GB", "CP850" },
 562     { "en_NZ", "CP437" },
 563     { "en_US", "CP437" },
 564     { "en_ZA", "CP850" }, /* not CP437 ?? */
 565     { "eo",    "CP850" },
 566     { "eo_EO", "CP850" },
 567     { "es",    "CP850" },
 568     { "es_AR", "CP850" },
 569     { "es_BO", "CP850" },
 570     { "es_CL", "CP850" },
 571     { "es_CO", "CP850" },
 572     { "es_CR", "CP850" },
 573     { "es_CU", "CP850" },
 574     { "es_DO", "CP850" },
 575     { "es_EC", "CP850" },
 576     { "es_ES", "CP850" },
 577     { "es_GT", "CP850" },
 578     { "es_HN", "CP850" },
 579     { "es_MX", "CP850" },
 580     { "es_NI", "CP850" },
 581     { "es_PA", "CP850" },
 582     { "es_PE", "CP850" },
 583     { "es_PY", "CP850" },
 584     { "es_SV", "CP850" },
 585     { "es_UY", "CP850" },
 586     { "es_VE", "CP850" },
 587     { "et",    "CP850" },
 588     { "et_EE", "CP850" },
 589     { "eu",    "CP850" },
 590     { "eu_ES", "CP850" },
 591     { "fi",    "CP850" },
 592     { "fi_FI", "CP850" },
 593     { "fr",    "CP850" },
 594     { "fr_BE", "CP850" },
 595     { "fr_CA", "CP850" },
 596     { "fr_CH", "CP850" },
 597     { "fr_FR", "CP850" },
 598     { "ga",    "CP850" },
 599     { "ga_IE", "CP850" },
 600     { "gd",    "CP850" },
 601     { "gd_GB", "CP850" },
 602     { "gl",    "CP850" },
 603     { "gl_ES", "CP850" },
 604     { "he",    "CP862" },
 605     { "he_IL", "CP862" },
 606     { "hr",    "CP852" },
 607     { "hr_HR", "CP852" },
 608     { "hu",    "CP852" },
 609     { "hu_HU", "CP852" },
 610     { "id",    "CP850" }, /* not CP437 ?? */
 611     { "id_ID", "CP850" }, /* not CP437 ?? */
 612     { "is",    "CP861" }, /* not CP850 ?? */
 613     { "is_IS", "CP861" }, /* not CP850 ?? */
 614     { "it",    "CP850" },
 615     { "it_CH", "CP850" },
 616     { "it_IT", "CP850" },
 617     { "ja",    "CP932" },
 618     { "ja_JP", "CP932" },
 619     { "kr",    "CP949" }, /* not CP934 ?? */
 620     { "kr_KR", "CP949" }, /* not CP934 ?? */
 621     { "lt",    "CP775" },
 622     { "lt_LT", "CP775" },
 623     { "lv",    "CP775" },
 624     { "lv_LV", "CP775" },
 625     { "mk",    "CP866" }, /* not CP855 ?? */
 626     { "mk_MK", "CP866" }, /* not CP855 ?? */
 627     { "mt",    "CP850" },
 628     { "mt_MT", "CP850" },
 629     { "nb",    "CP865" }, /* not CP850 ?? */
 630     { "nb_NO", "CP865" }, /* not CP850 ?? */
 631     { "nl",    "CP850" },
 632     { "nl_BE", "CP850" },
 633     { "nl_NL", "CP850" },
 634     { "nn",    "CP865" }, /* not CP850 ?? */
 635     { "nn_NO", "CP865" }, /* not CP850 ?? */
 636     { "no",    "CP865" }, /* not CP850 ?? */
 637     { "no_NO", "CP865" }, /* not CP850 ?? */
 638     { "pl",    "CP852" },
 639     { "pl_PL", "CP852" },
 640     { "pt",    "CP850" },
 641     { "pt_BR", "CP850" },
 642     { "pt_PT", "CP850" },
 643     { "ro",    "CP852" },
 644     { "ro_RO", "CP852" },
 645     { "ru",    "CP866" },
 646     { "ru_RU", "CP866" },
 647     { "sk",    "CP852" },
 648     { "sk_SK", "CP852" },
 649     { "sl",    "CP852" },
 650     { "sl_SI", "CP852" },
 651     { "sq",    "CP852" },
 652     { "sq_AL", "CP852" },
 653     { "sr",    "CP852" }, /* CP852 or CP866 or CP855 ?? */
 654     { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
 655     { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
 656     { "sv",    "CP850" },
 657     { "sv_SE", "CP850" },
 658     { "th",    "CP874" },
 659     { "th_TH", "CP874" },
 660     { "tr",    "CP857" },
 661     { "tr_TR", "CP857" },
 662     { "uk",    "CP1125" },
 663     { "uk_UA", "CP1125" },
 664     { "zh_CN", "GBK" },
 665     { "zh_TW", "CP950" } /* not CP938 ?? */
 666 #  define locale_table_defined
 667 # endif
 668 # ifndef locale_table_defined
 669     /* Just a dummy entry, to avoid a C syntax error.  */
         /* It is emitted only when none of the platform sections above was
            selected, i.e. when the initializer would otherwise be empty.  */
 670     { "", "" }
 671 # endif
 672   };
 673
 674 #endif
 675
 676
 677 /* Determine the current locale's character encoding, and canonicalize it
 678    into one of the canonical names listed in localcharset.h.
 679    The result must not be freed; it is statically allocated.
 680    If the canonical name cannot be determined, the result is a non-canonical
 681    name.  */
 682
 683 #ifdef STATIC
 684 STATIC
 685 #endif
 686 const char *
 687 locale_charset (void)
 688 {
 689   const char *codeset;
 690
 691 #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
 692
 693 # if HAVE_LANGINFO_CODESET
 694
 695   /* Most systems support nl_langinfo (CODESET) nowadays.  */
 696   codeset = nl_langinfo (CODESET);
 697
 698 #  ifdef __CYGWIN__
 699   /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
 700      returns "US-ASCII".  Return the suffix of the locale name from the
 701      environment variables (if present) or the codepage as a number.  */
 702   if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
 703     {
 704       const char *locale;
 705       static char buf[2 + 10 + 1];
 706
 707       locale = getenv ("LC_ALL");
 708       if (locale == NULL || locale[0] == '\0')
 709         {
 710           locale = getenv ("LC_CTYPE");
 711           if (locale == NULL || locale[0] == '\0')
 712             locale = getenv ("LANG");
 713         }
 714       if (locale != NULL && locale[0] != '\0')
 715         {
 716           /* If the locale name contains an encoding after the dot, return
 717              it.  */
 718           const char *dot = strchr (locale, '.');
 719
 720           if (dot != NULL)
 721             {
 722               const char *modifier;
 723
 724               dot++;
 725               /* Look for the possible @... trailer and remove it, if any.  */
 726               modifier = strchr (dot, '@');
 727               if (modifier == NULL)
 728                 return dot;
 729               if (modifier - dot < sizeof (buf))
 730                 {
 731                   memcpy (buf, dot, modifier - dot);
 732                   buf [modifier - dot] = '\0';
 733                   return buf;
 734                 }
 735             }
 736         }
 737
 738       /* The Windows API has a function returning the locale's codepage as a
 739          number: GetACP().  This encoding is used by Cygwin, unless the user
 740          has set the environment variable CYGWIN=codepage:oem (which very few
 741          people do).
 742          Output directed to console windows needs to be converted (to
 743          GetOEMCP() if the console is using a raster font, or to
 744          GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
 745          this conversion transparently (see winsup/cygwin/fhandler_console.cc),
 746          converting to GetConsoleOutputCP().  This leads to correct results,
 747          except when SetConsoleOutputCP has been called and a raster font is
 748          in use.  */
 749       sprintf (buf, "CP%u", GetACP ());
 750       codeset = buf;
 751     }
 752 #  endif
 753
 754   if (codeset == NULL)
 755     /* The canonical name cannot be determined.  */
 756     codeset = "";
 757
 758 # elif defined WINDOWS_NATIVE
 759
 760   static char buf[2 + 10 + 1];
 761
 762   /* The Windows API has a function returning the locale's codepage as
 763      a number, but the value doesn't change according to what the
 764      'setlocale' call specified.  So we use it as a last resort, in
 765      case the string returned by 'setlocale' doesn't specify the
 766      codepage.  */
 767   char *current_locale = setlocale (LC_ALL, NULL);
 768   char *pdot;
 769
 770   /* If they set different locales for different categories,
 771      'setlocale' will return a semi-colon separated list of locale
 772      values.  To make sure we use the correct one, we choose LC_CTYPE.  */
 773   if (strchr (current_locale, ';'))
 774     current_locale = setlocale (LC_CTYPE, NULL);
 775
 776   pdot = strrchr (current_locale, '.');
 777   if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
 778     sprintf (buf, "CP%s", pdot + 1);
 779   else
 780     {
 781       /* The Windows API has a function returning the locale's codepage as a
 782         number: GetACP().
 783         When the output goes to a console window, it needs to be provided in
 784         GetOEMCP() encoding if the console is using a raster font, or in
 785         GetConsoleOutputCP() encoding if it is using a TrueType font.
 786         But in GUI programs and for output sent to files and pipes, GetACP()
 787         encoding is the best bet.  */
 788       sprintf (buf, "CP%u", GetACP ());
 789     }
 790   /* For a locale name such as "French_France.65001", in Windows 10,
 791      setlocale now returns "French_France.utf8" instead.  */
 792   if (strcmp (buf + 2, "65001") == 0 || strcmp (buf + 2, "utf8") == 0)
 793     codeset = "UTF-8";
 794   else
 795     codeset = buf;
 796
 797 # elif defined OS2
 798
 799   const char *locale;
 800   static char buf[2 + 10 + 1];
 801   ULONG cp[3];
 802   ULONG cplen;
 803
 804   codeset = NULL;
 805
 806   /* Allow user to override the codeset, as set in the operating system,
 807      with standard language environment variables.  */
 808   locale = getenv ("LC_ALL");
 809   if (locale == NULL || locale[0] == '\0')
 810     {
 811       locale = getenv ("LC_CTYPE");
 812       if (locale == NULL || locale[0] == '\0')
 813         locale = getenv ("LANG");
 814     }
 815   if (locale != NULL && locale[0] != '\0')
 816     {
 817       /* If the locale name contains an encoding after the dot, return it.  */
 818       const char *dot = strchr (locale, '.');
 819
 820       if (dot != NULL)
 821         {
 822           const char *modifier;
 823
 824           dot++;
 825           /* Look for the possible @... trailer and remove it, if any.  */
 826           modifier = strchr (dot, '@');
 827           if (modifier == NULL)
 828             return dot;
 829           if (modifier - dot < sizeof (buf))
 830             {
 831               memcpy (buf, dot, modifier - dot);
 832               buf [modifier - dot] = '\0';
 833               return buf;
 834             }
 835         }
 836
 837       /* For the POSIX locale, don't use the system's codepage.  */
 838       if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
 839         codeset = "";
 840     }
 841
 842   if (codeset == NULL)
 843     {
 844       /* OS/2 has a function returning the locale's codepage as a number.  */
 845       if (DosQueryCp (sizeof (cp), cp, &cplen))
 846         codeset = "";
 847       else
 848         {
 849           sprintf (buf, "CP%u", cp[0]);
 850           codeset = buf;
 851         }
 852     }
 853
 854 # else
 855
 856 #  error "Add code for other platforms here."
 857
 858 # endif
 859
 860   /* Resolve alias.  */
 861   {
 862 # ifdef alias_table_defined
 863     /* On some platforms, UTF-8 locales are the most frequently used ones.
 864        Speed up the common case and slow down the less common cases by
 865        testing for this case first.  */
 866 #  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
 867     if (strcmp (codeset, "UTF-8") == 0)
 868       goto done_table_lookup;
 869     else
 870 #  endif
 871       {
 872         const struct table_entry * const table = alias_table;
 873         size_t const table_size =
 874           sizeof (alias_table) / sizeof (struct table_entry);
 875         /* The table is sorted.  Perform a binary search.  */
 876         size_t hi = table_size;
 877         size_t lo = 0;
 878         while (lo < hi)
 879           {
 880             /* Invariant:
 881                for i < lo, strcmp (table[i].alias, codeset) < 0,
 882                for i >= hi, strcmp (table[i].alias, codeset) > 0.  */
 883             size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
 884             int cmp = strcmp (table[mid].alias, codeset);
 885             if (cmp < 0)
 886               lo = mid + 1;
 887             else if (cmp > 0)
 888               hi = mid;
 889             else
 890               {
 891                 /* Found an i with
 892                      strcmp (table[i].alias, codeset) == 0.  */
 893                 codeset = table[mid].canonical;
 894                 goto done_table_lookup;
 895               }
 896           }
 897       }
 898     if (0)
 899       done_table_lookup: ;
 900     else
 901 # endif
 902       {
 903         /* Did not find it in the table.  */
 904         /* On Mac OS X, all modern locales use the UTF-8 encoding.
 905            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
 906 # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
 907         codeset = "UTF-8";
 908 # else
 909         /* Don't return an empty string.  GNU libc and GNU libiconv interpret
 910            the empty string as denoting "the locale's character encoding",
 911            thus GNU libiconv would call this function a second time.  */
 912         if (codeset[0] == '\0')
 913           codeset = "ASCII";
 914 # endif
 915       }
 916   }
 917
 918 #else
 919
 920   /* On old systems which lack it, use setlocale or getenv.  */
 921   const char *locale = NULL;
 922
 923   /* But most old systems don't have a complete set of locales.  Some
 924      (like DJGPP) have only the C locale.  Therefore we don't use setlocale
 925      here; it would return "C" when it doesn't support the locale name the
 926      user has set.  */
 927 # if 0
 928   locale = setlocale (LC_CTYPE, NULL);
 929 # endif
 930   if (locale == NULL || locale[0] == '\0')
 931     {
 932       locale = getenv ("LC_ALL");
 933       if (locale == NULL || locale[0] == '\0')
 934         {
 935           locale = getenv ("LC_CTYPE");
 936           if (locale == NULL || locale[0] == '\0')
 937             locale = getenv ("LANG");
 938             if (locale == NULL)
 939               locale = "";
 940         }
 941     }
 942
 943   /* Map locale name to canonical encoding name.  */
 944   {
 945 # ifdef locale_table_defined
 946     const struct table_entry * const table = locale_table;
 947     size_t const table_size =
 948       sizeof (locale_table) / sizeof (struct table_entry);
 949     /* The table is sorted.  Perform a binary search.  */
 950     size_t hi = table_size;
 951     size_t lo = 0;
 952     while (lo < hi)
 953       {
 954         /* Invariant:
 955            for i < lo, strcmp (table[i].locale, locale) < 0,
 956            for i >= hi, strcmp (table[i].locale, locale) > 0.  */
 957         size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
 958         int cmp = strcmp (table[mid].locale, locale);
 959         if (cmp < 0)
 960           lo = mid + 1;
 961         else if (cmp > 0)
 962           hi = mid;
 963         else
 964           {
 965             /* Found an i with
 966                  strcmp (table[i].locale, locale) == 0.  */
 967             codeset = table[mid].canonical;
 968             goto done_table_lookup;
 969           }
 970       }
 971     if (0)
 972       done_table_lookup: ;
 973     else
 974 # endif
 975       {
 976         /* Did not find it in the table.  */
 977         /* On Mac OS X, all modern locales use the UTF-8 encoding.
 978            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
 979 # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
 980         codeset = "UTF-8";
 981 # else
 982         /* The canonical name cannot be determined.  */
 983         /* Don't return an empty string.  GNU libc and GNU libiconv interpret
 984            the empty string as denoting "the locale's character encoding",
 985            thus GNU libiconv would call this function a second time.  */
 986         codeset = "ASCII";
 987 # endif
 988       }
 989   }
 990
 991 #endif
 992
 993 #ifdef DARWIN7
 994   /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
 995      (the default codeset) does not work when MB_CUR_MAX is 1.  */
 996   if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
 997     codeset = "ASCII";
 998 #endif
 999
1000   return codeset;
1001 }