1 /* Determine a canonical name for the current locale's character encoding.
3 Copyright (C) 2000-2006, 2008-2016 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, see <http://www.gnu.org/licenses/>. */
18 /* Written by Bruno Haible <bruno@clisp.org>. */
23 #include "localcharset.h"
31 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32 # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
35 #if defined _WIN32 || defined __WIN32__
36 # define WINDOWS_NATIVE
41 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
47 #if !defined WINDOWS_NATIVE
49 # if HAVE_LANGINFO_CODESET
50 # include <langinfo.h>
52 # if 0 /* see comment below */
57 # define WIN32_LEAN_AND_MEAN
60 #elif defined WINDOWS_NATIVE
61 # define WIN32_LEAN_AND_MEAN
69 /* For MB_CUR_MAX_L */
74 #if ENABLE_RELOCATABLE
75 # include "relocatable.h"
77 # define relocate(pathname) (pathname)
82 # include "configmake.h"
85 /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
90 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
91 /* Native Windows, Cygwin, OS/2, DOS */
92 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
95 #ifndef DIRECTORY_SEPARATOR
96 # define DIRECTORY_SEPARATOR '/'
100 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
103 #if HAVE_DECL_GETC_UNLOCKED
105 # define getc getc_unlocked
108 /* The following static variable is declared 'volatile' to avoid a
109 possible multithread problem in the function get_charset_aliases. If we
110 are running in a threaded environment, and if two threads initialize
111 'charset_aliases' simultaneously, both will produce the same value,
112 and everything will be ok if the two assignments to 'charset_aliases'
113 are atomic. But I don't know what will happen if the two assignments mix. */
115 # define volatile /* empty */
117 /* Pointer to the contents of the charset.alias file, if it has already been
118 read, else NULL. Its format is:
119 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
120 static const char * volatile charset_aliases
;
122 /* Return a pointer to the contents of the charset.alias file.
   The returned table has the format documented at 'charset_aliases':
   a sequence of NUL-terminated alias / canonical-name pairs, ended by an
   additional NUL.
   The function starts from the cached value in 'charset_aliases' and
   stores its result back there before finishing, so later calls can reuse
   it.  NOTE(review): the test that skips the work when the cache is
   already filled is not visible in this excerpt -- confirm.
   On platforms not excluded by the #if below, the table is built from a
   file named "charset.alias"; on the excluded platforms one of the string
   tables inlined further down is used instead.  */
124 get_charset_aliases (void)
/* Start from whatever an earlier call has cached.  */
128 cp
= charset_aliases
;
/* File-based lookup; the platforms named here use the inlined tables.  */
131 #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__ || defined OS2)
133 const char *base
= "charset.alias";
136 /* Make it possible to override the charset.alias location. This is
137 necessary for running the testsuite before "make install". */
138 dir
= getenv ("CHARSETALIASDIR");
/* An unset or empty CHARSETALIASDIR falls back to the (possibly
   relocated) LIBDIR.  */
139 if (dir
== NULL
|| dir
[0] == '\0')
140 dir
= relocate (LIBDIR
);
142 /* Concatenate dir and base into freshly allocated file_name. */
144 size_t dir_len
= strlen (dir
);
145 size_t base_len
= strlen (base
);
/* A separator is needed only when dir is non-empty and does not already
   end in one.  */
146 int add_slash
= (dir_len
> 0 && !ISSLASH (dir
[dir_len
- 1]));
/* Room for dir, the optional separator, base, and the terminating NUL.  */
147 file_name
= (char *) malloc (dir_len
+ add_slash
+ base_len
+ 1);
148 if (file_name
!= NULL
)
150 memcpy (file_name
, dir
, dir_len
);
152 file_name
[dir_len
] = DIRECTORY_SEPARATOR
;
/* Copy base together with its terminating NUL (hence base_len + 1).  */
153 memcpy (file_name
+ dir_len
+ add_slash
, base
, base_len
+ 1);
157 if (file_name
== NULL
)
158 /* Out of memory. Treat the file as empty. */
164 /* Open the file. Reject symbolic links on platforms that support
165 O_NOFOLLOW. This is a security feature. Without it, an attacker
166 could retrieve parts of the contents (namely, the tail of the
167 first line that starts with "* ") of an arbitrary file by placing
168 a symbolic link to that file under the name "charset.alias" in
169 some writable directory and defining the environment variable
170 CHARSETALIASDIR to point to that directory. */
171 fd
= open (file_name
,
172 O_RDONLY
| (HAVE_WORKING_O_NOFOLLOW
? O_NOFOLLOW
: 0));
174 /* File not found. Treat it as empty. */
/* Wrap the descriptor in a stdio stream so the contents can be read with
   getc / fscanf.  */
180 fp
= fdopen (fd
, "r");
183 /* Out of memory. Treat the file as empty. */
189 /* Parse the file's contents. */
190 char *res_ptr
= NULL
;
/* Newlines, spaces and tabs between entries are tested for here.  */
204 if (c
== '\n' || c
== ' ' || c
== '\t')
208 /* Skip comment, to end of line. */
211 while (!(c
== EOF
|| c
== '\n'));
/* Read one "alias canonical" pair.  Each %50s conversion stores at most
   50 characters plus a NUL; fewer than two converted fields is the
   failure case tested here.  */
217 if (fscanf (fp
, "%50s %50s", buf1
, buf2
) < 2)
/* Keep the previous buffer pointer across the (re)allocation below --
   presumably so it can still be released if that allocation fails; the
   failure handling is not visible in this excerpt.  */
221 old_res_ptr
= res_ptr
;
/* Size for the first pair: both strings with their NULs.  The allocation
   adds one more byte for the final table terminator.  l1 and l2 are
   presumably the lengths of buf1 and buf2 -- TODO confirm.  */
224 res_size
= l1
+ 1 + l2
+ 1;
225 res_ptr
= (char *) malloc (res_size
+ 1);
/* Later pairs: grow the buffer by the size of the new pair.  */
229 res_size
+= l1
+ 1 + l2
+ 1;
230 res_ptr
= (char *) realloc (res_ptr
, res_size
+ 1);
/* Append the new pair at the tail of the buffer: alias first, then the
   canonical name, each including its NUL.  */
239 strcpy (res_ptr
+ res_size
- (l2
+ 1) - (l1
+ 1), buf1
);
240 strcpy (res_ptr
+ res_size
- (l2
+ 1), buf2
);
/* Terminate the whole table with one additional NUL.  */
247 *(res_ptr
+ res_size
) = '\0';
259 /* To avoid the trouble of installing a file that is shared by many
260 GNU packages -- many packaging systems have problems with this --,
261 simply inline the aliases here. */
/* Each entry below is an alias followed by its canonical name.  The final
   "*" entry maps to "UTF-8"; the lookup loop in locale_charset accepts an
   alias that is exactly "*" for any codeset.  */
262 cp
= "ISO8859-1" "\0" "ISO-8859-1" "\0"
263 "ISO8859-2" "\0" "ISO-8859-2" "\0"
264 "ISO8859-4" "\0" "ISO-8859-4" "\0"
265 "ISO8859-5" "\0" "ISO-8859-5" "\0"
266 "ISO8859-7" "\0" "ISO-8859-7" "\0"
267 "ISO8859-9" "\0" "ISO-8859-9" "\0"
268 "ISO8859-13" "\0" "ISO-8859-13" "\0"
269 "ISO8859-15" "\0" "ISO-8859-15" "\0"
270 "KOI8-R" "\0" "KOI8-R" "\0"
271 "KOI8-U" "\0" "KOI8-U" "\0"
272 "CP866" "\0" "CP866" "\0"
273 "CP949" "\0" "CP949" "\0"
274 "CP1131" "\0" "CP1131" "\0"
275 "CP1251" "\0" "CP1251" "\0"
276 "eucCN" "\0" "GB2312" "\0"
277 "GB2312" "\0" "GB2312" "\0"
278 "eucJP" "\0" "EUC-JP" "\0"
279 "eucKR" "\0" "EUC-KR" "\0"
280 "Big5" "\0" "BIG5" "\0"
281 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
282 "GBK" "\0" "GBK" "\0"
283 "GB18030" "\0" "GB18030" "\0"
284 "SJIS" "\0" "SHIFT_JIS" "\0"
285 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
286 "PT154" "\0" "PT154" "\0"
287 /*"ISCII-DEV" "\0" "?" "\0"*/
288 "*" "\0" "UTF-8" "\0";
292 /* To avoid the troubles of an extra file charset.alias_vms in the
293 sources of many GNU packages, simply inline the aliases here. */
294 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
295 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
296 section 10.7 "Handling Different Character Sets". */
297 cp
= "ISO8859-1" "\0" "ISO-8859-1" "\0"
298 "ISO8859-2" "\0" "ISO-8859-2" "\0"
299 "ISO8859-5" "\0" "ISO-8859-5" "\0"
300 "ISO8859-7" "\0" "ISO-8859-7" "\0"
301 "ISO8859-8" "\0" "ISO-8859-8" "\0"
302 "ISO8859-9" "\0" "ISO-8859-9" "\0"
304 "eucJP" "\0" "EUC-JP" "\0"
305 "SJIS" "\0" "SHIFT_JIS" "\0"
306 "DECKANJI" "\0" "DEC-KANJI" "\0"
307 "SDECKANJI" "\0" "EUC-JP" "\0"
309 "eucTW" "\0" "EUC-TW" "\0"
310 "DECHANYU" "\0" "DEC-HANYU" "\0"
311 "DECHANZI" "\0" "GB2312" "\0"
313 "DECKOREAN" "\0" "EUC-KR" "\0";
316 # if defined WINDOWS_NATIVE || defined __CYGWIN__
317 /* To avoid the troubles of installing a separate file in the same
318 directory as the DLL and of retrieving the DLL's directory at
319 runtime, simply inline the aliases here. */
/* The aliases here have the "CP<number>" form that locale_charset builds
   from GetACP() with the "CP%u" format.  */
321 cp
= "CP936" "\0" "GBK" "\0"
322 "CP1361" "\0" "JOHAB" "\0"
323 "CP20127" "\0" "ASCII" "\0"
324 "CP20866" "\0" "KOI8-R" "\0"
325 "CP20936" "\0" "GB2312" "\0"
326 "CP21866" "\0" "KOI8-RU" "\0"
327 "CP28591" "\0" "ISO-8859-1" "\0"
328 "CP28592" "\0" "ISO-8859-2" "\0"
329 "CP28593" "\0" "ISO-8859-3" "\0"
330 "CP28594" "\0" "ISO-8859-4" "\0"
331 "CP28595" "\0" "ISO-8859-5" "\0"
332 "CP28596" "\0" "ISO-8859-6" "\0"
333 "CP28597" "\0" "ISO-8859-7" "\0"
334 "CP28598" "\0" "ISO-8859-8" "\0"
335 "CP28599" "\0" "ISO-8859-9" "\0"
336 "CP28605" "\0" "ISO-8859-15" "\0"
337 "CP38598" "\0" "ISO-8859-8" "\0"
338 "CP51932" "\0" "EUC-JP" "\0"
339 "CP51936" "\0" "GB2312" "\0"
340 "CP51949" "\0" "EUC-KR" "\0"
341 "CP51950" "\0" "EUC-TW" "\0"
342 "CP54936" "\0" "GB18030" "\0"
343 "CP65001" "\0" "UTF-8" "\0";
346 /* To avoid the troubles of installing a separate file in the same
347 directory as the DLL and of retrieving the DLL's directory at
348 runtime, simply inline the aliases here. */
350 /* The list of encodings is taken from "List of OS/2 Codepages"
352 <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
353 See also "IBM Globalization - Code page identifiers":
354 <http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. */
355 cp
= "CP813" "\0" "ISO-8859-7" "\0"
356 "CP878" "\0" "KOI8-R" "\0"
357 "CP819" "\0" "ISO-8859-1" "\0"
358 "CP912" "\0" "ISO-8859-2" "\0"
359 "CP913" "\0" "ISO-8859-3" "\0"
360 "CP914" "\0" "ISO-8859-4" "\0"
361 "CP915" "\0" "ISO-8859-5" "\0"
362 "CP916" "\0" "ISO-8859-8" "\0"
363 "CP920" "\0" "ISO-8859-9" "\0"
364 "CP921" "\0" "ISO-8859-13" "\0"
365 "CP923" "\0" "ISO-8859-15" "\0"
366 "CP954" "\0" "EUC-JP" "\0"
367 "CP964" "\0" "EUC-TW" "\0"
368 "CP970" "\0" "EUC-KR" "\0"
369 "CP1089" "\0" "ISO-8859-6" "\0"
370 "CP1208" "\0" "UTF-8" "\0"
371 "CP1381" "\0" "GB2312" "\0"
372 "CP1386" "\0" "GBK" "\0"
373 "CP3372" "\0" "EUC-JP" "\0";
/* Store the table in the static cache for subsequent calls.  */
377 charset_aliases
= cp
;
383 /* Determine the current locale's character encoding, and canonicalize it
384 into one of the canonical names listed in config.charset.
385 The result must not be freed; it is statically allocated.
386 If the canonical name cannot be determined, the result is a non-canonical name. */
393 locale_charset (void)
398 #if !(defined WINDOWS_NATIVE || defined OS2)
400 # if HAVE_LANGINFO_CODESET
402 /* Most systems support nl_langinfo (CODESET) nowadays. */
403 codeset
= nl_langinfo (CODESET
);
406 /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
407 returns "US-ASCII". Return the suffix of the locale name from the
408 environment variables (if present) or the codepage as a number. */
409 if (codeset
!= NULL
&& strcmp (codeset
, "US-ASCII") == 0)
412 static char buf
[2 + 10 + 1];
414 locale
= getenv ("LC_ALL");
415 if (locale
== NULL
|| locale
[0] == '\0')
417 locale
= getenv ("LC_CTYPE");
418 if (locale
== NULL
|| locale
[0] == '\0')
419 locale
= getenv ("LANG");
421 if (locale
!= NULL
&& locale
[0] != '\0')
423 /* If the locale name contains an encoding after the dot, return it. */
425 const char *dot
= strchr (locale
, '.');
429 const char *modifier
;
432 /* Look for the possible @... trailer and remove it, if any. */
433 modifier
= strchr (dot
, '@');
434 if (modifier
== NULL
)
436 if (modifier
- dot
< sizeof (buf
))
438 memcpy (buf
, dot
, modifier
- dot
);
439 buf
[modifier
- dot
] = '\0';
445 /* The Windows API has a function returning the locale's codepage as a
446 number: GetACP(). This encoding is used by Cygwin, unless the user
447 has set the environment variable CYGWIN=codepage:oem (which very few people do).
449 Output directed to console windows needs to be converted (to
450 GetOEMCP() if the console is using a raster font, or to
451 GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
452 this conversion transparently (see winsup/cygwin/fhandler_console.cc),
453 converting to GetConsoleOutputCP(). This leads to correct results,
454 except when SetConsoleOutputCP has been called and a raster font is in use. */
456 sprintf (buf
, "CP%u", GetACP ());
463 /* On old systems which lack it, use setlocale or getenv. */
464 const char *locale
= NULL
;
466 /* But most old systems don't have a complete set of locales. Some
467 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
468 use setlocale here; it would return "C" when it doesn't support the
469 locale name the user has set. */
471 locale
= setlocale (LC_CTYPE
, NULL
);
473 if (locale
== NULL
|| locale
[0] == '\0')
475 locale
= getenv ("LC_ALL");
476 if (locale
== NULL
|| locale
[0] == '\0')
478 locale
= getenv ("LC_CTYPE");
479 if (locale
== NULL
|| locale
[0] == '\0')
480 locale
= getenv ("LANG");
484 /* On some old systems, one used to set locale = "iso8859_1". On others,
485 you set it to "language_COUNTRY.charset". In any case, we resolve it
486 through the charset.alias file. */
491 #elif defined WINDOWS_NATIVE
493 static char buf
[2 + 10 + 1];
495 /* The Windows API has a function returning the locale's codepage as
496 a number, but the value doesn't change according to what the
497 'setlocale' call specified. So we use it as a last resort, in
498 case the string returned by 'setlocale' doesn't specify the codepage. */
500 char *current_locale
= setlocale (LC_ALL
, NULL
);
503 /* If they set different locales for different categories,
504 'setlocale' will return a semi-colon separated list of locale
505 values. To make sure we use the correct one, we choose LC_CTYPE. */
506 if (strchr (current_locale
, ';'))
507 current_locale
= setlocale (LC_CTYPE
, NULL
);
509 pdot
= strrchr (current_locale
, '.');
511 sprintf (buf
, "CP%s", pdot
+ 1);
514 /* The Windows API has a function returning the locale's codepage as a number: GetACP().
516 When the output goes to a console window, it needs to be provided in
517 GetOEMCP() encoding if the console is using a raster font, or in
518 GetConsoleOutputCP() encoding if it is using a TrueType font.
519 But in GUI programs and for output sent to files and pipes, GetACP()
520 encoding is the best bet. */
521 sprintf (buf
, "CP%u", GetACP ());
528 static char buf
[2 + 10 + 1];
534 /* Allow user to override the codeset, as set in the operating system,
535 with standard language environment variables. */
536 locale
= getenv ("LC_ALL");
537 if (locale
== NULL
|| locale
[0] == '\0')
539 locale
= getenv ("LC_CTYPE");
540 if (locale
== NULL
|| locale
[0] == '\0')
541 locale
= getenv ("LANG");
543 if (locale
!= NULL
&& locale
[0] != '\0')
545 /* If the locale name contains an encoding after the dot, return it. */
546 const char *dot
= strchr (locale
, '.');
550 const char *modifier
;
553 /* Look for the possible @... trailer and remove it, if any. */
554 modifier
= strchr (dot
, '@');
555 if (modifier
== NULL
)
557 if (modifier
- dot
< sizeof (buf
))
559 memcpy (buf
, dot
, modifier
- dot
);
560 buf
[modifier
- dot
] = '\0';
565 /* For the POSIX locale, don't use the system's codepage. */
566 if (strcmp (locale
, "C") == 0 || strcmp (locale
, "POSIX") == 0)
572 /* OS/2 has a function returning the locale's codepage as a number. */
573 if (DosQueryCp (sizeof (cp
), cp
, &cplen
))
577 sprintf (buf
, "CP%u", cp
[0]);
585 /* The canonical name cannot be determined. */
589 for (aliases
= get_charset_aliases ();
591 aliases
+= strlen (aliases
) + 1, aliases
+= strlen (aliases
) + 1)
592 if (strcmp (codeset
, aliases
) == 0
593 || (aliases
[0] == '*' && aliases
[1] == '\0'))
595 codeset
= aliases
+ strlen (aliases
) + 1;
599 /* Don't return an empty string. GNU libc and GNU libiconv interpret
600 the empty string as denoting "the locale's character encoding",
601 thus GNU libiconv would call this function a second time. */
602 if (codeset
[0] == '\0')
606 /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
607 (the default codeset) does not work when MB_CUR_MAX is 1. */
608 if (strcmp (codeset
, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL
)) <= 1)