1 /* Determine a canonical name for the current locale's character encoding.
3 Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 /* Written by Bruno Haible <bruno@clisp.org>. */
24 #include "localcharset.h"
31 #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32 # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
35 #if defined _WIN32 || defined __WIN32__
40 /* Assume EMX program runs on OS/2, even if compiled under DOS. */
46 #if !defined WIN32_NATIVE
47 # if HAVE_LANGINFO_CODESET
48 # include <langinfo.h>
50 # if 0 /* see comment below */
55 # define WIN32_LEAN_AND_MEAN
58 #elif defined WIN32_NATIVE
59 # define WIN32_LEAN_AND_MEAN
67 #if ENABLE_RELOCATABLE
68 # include "relocatable.h"
70 # define relocate(pathname) (pathname)
75 # include "configmake.h"
78 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
79 /* Win32, Cygwin, OS/2, DOS */
80 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
83 #ifndef DIRECTORY_SEPARATOR
84 # define DIRECTORY_SEPARATOR '/'
88 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
91 #if HAVE_DECL_GETC_UNLOCKED
93 # define getc getc_unlocked
96 /* The following static variable is declared 'volatile' to avoid a
97 possible multithread problem in the function get_charset_aliases. If we
98 are running in a threaded environment, and if two threads initialize
99 'charset_aliases' simultaneously, both will produce the same value,
100 and everything will be ok if the two assignments to 'charset_aliases'
101 are atomic. But I don't know what will happen if the two assignments mix. */
103 # define volatile /* empty */
105 /* Pointer to the contents of the charset.alias file, if it has already been
106 read, else NULL. Its format is:
107 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
108 static const char * volatile charset_aliases
;
110 /* Return a pointer to the contents of the charset.alias file. */
112 get_charset_aliases (void)
/* Start from the cached table pointer.  The same variable is stored back
   into charset_aliases at the end of the function.
   NOTE(review): presumably everything in between is skipped when the cache
   is already non-NULL -- the guarding test is not visible in this extract.  */
116 cp
= charset_aliases
;
/* File-based lookup: used on every platform except the ones named in the
   condition below, which get a table inlined further down.  */
119 #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
122 const char *base
= "charset.alias";
125 /* Make it possible to override the charset.alias location. This is
126 necessary for running the testsuite before "make install". */
127 dir
= getenv ("CHARSETALIASDIR");
128 if (dir
== NULL
|| dir
[0] == '\0')
129 dir
= relocate (LIBDIR
);
131 /* Concatenate dir and base into freshly allocated file_name. */
/* add_slash is 1 only when dir is non-empty and does not already end in a
   separator, so the allocation is sized for
   dir + optional separator + base + terminating NUL.  */
133 size_t dir_len
= strlen (dir
);
134 size_t base_len
= strlen (base
);
135 int add_slash
= (dir_len
> 0 && !ISSLASH (dir
[dir_len
- 1]));
136 file_name
= (char *) malloc (dir_len
+ add_slash
+ base_len
+ 1);
137 if (file_name
!= NULL
)
139 memcpy (file_name
, dir
, dir_len
);
141 file_name
[dir_len
] = DIRECTORY_SEPARATOR
;
/* Copying base_len + 1 bytes also copies base's terminating NUL, which
   terminates file_name.  */
142 memcpy (file_name
+ dir_len
+ add_slash
, base
, base_len
+ 1);
/* A failed allocation above and a failed fopen are handled by the same
   branch.  */
146 if (file_name
== NULL
|| (fp
= fopen (file_name
, "r")) == NULL
)
147 /* Out of memory or file not found, treat it as empty. */
151 /* Parse the file's contents. */
152 char *res_ptr
= NULL
;
166 if (c
== '\n' || c
== ' ' || c
== '\t')
170 /* Skip comment, to end of line. */
173 while (!(c
== EOF
|| c
== '\n'));
/* Read one "alias canonical" pair.  The %50s widths cap each token at 50
   characters, so buf1 and buf2 need room for 51 bytes each
   (NOTE(review): their declarations are not visible here -- confirm).
   A return value below 2 means no complete pair could be read.  */
179 if (fscanf (fp
, "%50s %50s", buf1
, buf2
) < 2)
/* Keep the previous buffer pointer before res_ptr is overwritten by the
   allocation calls below.
   NOTE(review): presumably so it can be released if growing the buffer
   fails -- the failure branch is not visible in this extract.  */
183 old_res_ptr
= res_ptr
;
/* Both sizing paths add room for the two strings plus their NULs; the
   allocation itself asks for one byte more, which holds the extra NUL that
   ends the whole table (written after the parsing code below).
   NOTE(review): l1 and l2 are presumably strlen (buf1) and strlen (buf2);
   their assignments are not visible here.  */
186 res_size
= l1
+ 1 + l2
+ 1;
187 res_ptr
= (char *) malloc (res_size
+ 1);
191 res_size
+= l1
+ 1 + l2
+ 1;
192 res_ptr
= (char *) realloc (res_ptr
, res_size
+ 1);
198 if (old_res_ptr
!= NULL
)
/* The new pair occupies the last (l1 + 1) + (l2 + 1) bytes of the buffer:
   the alias first, then the canonical name.  */
202 strcpy (res_ptr
+ res_size
- (l2
+ 1) - (l1
+ 1), buf1
);
203 strcpy (res_ptr
+ res_size
- (l2
+ 1), buf2
);
/* Write the additional NUL that marks the end of the table, matching the
   format described at the declaration of charset_aliases.  */
210 *(res_ptr
+ res_size
) = '\0';
215 if (file_name
!= NULL
)
221 /* To avoid the trouble of installing a file that is shared by many
222 GNU packages -- many packaging systems have problems with this --,
223 simply inline the aliases here. */
/* Each inlined table below is a run of ALIAS "\0" CANONICAL "\0" pairs; the
   string literal's own implicit terminating NUL supplies the final empty
   entry that ends the table.  The closing "*" alias in this first table is
   the entry that locale_charset treats as matching any codeset.
   NOTE(review): this first table is presumably the DARWIN7 one -- the
   preprocessor line selecting it is not visible in this extract.  */
224 cp
= "ISO8859-1" "\0" "ISO-8859-1" "\0"
225 "ISO8859-2" "\0" "ISO-8859-2" "\0"
226 "ISO8859-4" "\0" "ISO-8859-4" "\0"
227 "ISO8859-5" "\0" "ISO-8859-5" "\0"
228 "ISO8859-7" "\0" "ISO-8859-7" "\0"
229 "ISO8859-9" "\0" "ISO-8859-9" "\0"
230 "ISO8859-13" "\0" "ISO-8859-13" "\0"
231 "ISO8859-15" "\0" "ISO-8859-15" "\0"
232 "KOI8-R" "\0" "KOI8-R" "\0"
233 "KOI8-U" "\0" "KOI8-U" "\0"
234 "CP866" "\0" "CP866" "\0"
235 "CP949" "\0" "CP949" "\0"
236 "CP1131" "\0" "CP1131" "\0"
237 "CP1251" "\0" "CP1251" "\0"
238 "eucCN" "\0" "GB2312" "\0"
239 "GB2312" "\0" "GB2312" "\0"
240 "eucJP" "\0" "EUC-JP" "\0"
241 "eucKR" "\0" "EUC-KR" "\0"
242 "Big5" "\0" "BIG5" "\0"
243 "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
244 "GBK" "\0" "GBK" "\0"
245 "GB18030" "\0" "GB18030" "\0"
246 "SJIS" "\0" "SHIFT_JIS" "\0"
247 "ARMSCII-8" "\0" "ARMSCII-8" "\0"
248 "PT154" "\0" "PT154" "\0"
249 /*"ISCII-DEV" "\0" "?" "\0"*/
250 "*" "\0" "UTF-8" "\0";
254 /* To avoid the troubles of an extra file charset.alias_vms in the
255 sources of many GNU packages, simply inline the aliases here. */
256 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
257 "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
258 section 10.7 "Handling Different Character Sets". */
259 cp
= "ISO8859-1" "\0" "ISO-8859-1" "\0"
260 "ISO8859-2" "\0" "ISO-8859-2" "\0"
261 "ISO8859-5" "\0" "ISO-8859-5" "\0"
262 "ISO8859-7" "\0" "ISO-8859-7" "\0"
263 "ISO8859-8" "\0" "ISO-8859-8" "\0"
264 "ISO8859-9" "\0" "ISO-8859-9" "\0"
266 "eucJP" "\0" "EUC-JP" "\0"
267 "SJIS" "\0" "SHIFT_JIS" "\0"
268 "DECKANJI" "\0" "DEC-KANJI" "\0"
269 "SDECKANJI" "\0" "EUC-JP" "\0"
271 "eucTW" "\0" "EUC-TW" "\0"
272 "DECHANYU" "\0" "DEC-HANYU" "\0"
273 "DECHANZI" "\0" "GB2312" "\0"
275 "DECKOREAN" "\0" "EUC-KR" "\0";
278 # if defined WIN32_NATIVE || defined __CYGWIN__
279 /* To avoid the troubles of installing a separate file in the same
280 directory as the DLL and of retrieving the DLL's directory at
281 runtime, simply inline the aliases here. */
283 cp
= "CP936" "\0" "GBK" "\0"
284 "CP1361" "\0" "JOHAB" "\0"
285 "CP20127" "\0" "ASCII" "\0"
286 "CP20866" "\0" "KOI8-R" "\0"
287 "CP20936" "\0" "GB2312" "\0"
288 "CP21866" "\0" "KOI8-RU" "\0"
289 "CP28591" "\0" "ISO-8859-1" "\0"
290 "CP28592" "\0" "ISO-8859-2" "\0"
291 "CP28593" "\0" "ISO-8859-3" "\0"
292 "CP28594" "\0" "ISO-8859-4" "\0"
293 "CP28595" "\0" "ISO-8859-5" "\0"
294 "CP28596" "\0" "ISO-8859-6" "\0"
295 "CP28597" "\0" "ISO-8859-7" "\0"
296 "CP28598" "\0" "ISO-8859-8" "\0"
297 "CP28599" "\0" "ISO-8859-9" "\0"
298 "CP28605" "\0" "ISO-8859-15" "\0"
299 "CP38598" "\0" "ISO-8859-8" "\0"
300 "CP51932" "\0" "EUC-JP" "\0"
301 "CP51936" "\0" "GB2312" "\0"
302 "CP51949" "\0" "EUC-KR" "\0"
303 "CP51950" "\0" "EUC-TW" "\0"
304 "CP54936" "\0" "GB18030" "\0"
305 "CP65001" "\0" "UTF-8" "\0";
/* Store the table pointer in the file-scope cache, whose declaration
   describes it as NULL until the aliases have been read.  */
309 charset_aliases
= cp
;
315 /* Determine the current locale's character encoding, and canonicalize it
316 into one of the canonical names listed in config.charset.
317 The result must not be freed; it is statically allocated.
318 If the canonical name cannot be determined, the result is a non-canonical name. */
325 locale_charset (void)
330 #if !(defined WIN32_NATIVE || defined OS2)
332 # if HAVE_LANGINFO_CODESET
334 /* Most systems support nl_langinfo (CODESET) nowadays. */
335 codeset
= nl_langinfo (CODESET
);
338 /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always
339 returns "US-ASCII". As long as this is not fixed, return the suffix
340 of the locale name from the environment variables (if present) or
341 the codepage as a number. */
342 if (codeset
!= NULL
&& strcmp (codeset
, "US-ASCII") == 0)
345 static char buf
[2 + 10 + 1];
347 locale
= getenv ("LC_ALL");
348 if (locale
== NULL
|| locale
[0] == '\0')
350 locale
= getenv ("LC_CTYPE");
351 if (locale
== NULL
|| locale
[0] == '\0')
352 locale
= getenv ("LANG");
354 if (locale
!= NULL
&& locale
[0] != '\0')
356 /* If the locale name contains an encoding after the dot, return it. */
358 const char *dot
= strchr (locale
, '.');
362 const char *modifier
;
365 /* Look for the possible @... trailer and remove it, if any. */
366 modifier
= strchr (dot
, '@');
367 if (modifier
== NULL
)
369 if (modifier
- dot
< sizeof (buf
))
371 memcpy (buf
, dot
, modifier
- dot
);
372 buf
[modifier
- dot
] = '\0';
378 /* Woe32 has a function returning the locale's codepage as a number. */
379 sprintf (buf
, "CP%u", GetACP ());
386 /* On old systems which lack it, use setlocale or getenv. */
387 const char *locale
= NULL
;
389 /* But most old systems don't have a complete set of locales. Some
390 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
391 use setlocale here; it would return "C" when it doesn't support the
392 locale name the user has set. */
394 locale
= setlocale (LC_CTYPE
, NULL
);
396 if (locale
== NULL
|| locale
[0] == '\0')
398 locale
= getenv ("LC_ALL");
399 if (locale
== NULL
|| locale
[0] == '\0')
401 locale
= getenv ("LC_CTYPE");
402 if (locale
== NULL
|| locale
[0] == '\0')
403 locale
= getenv ("LANG");
407 /* On some old systems, one used to set locale = "iso8859_1". On others,
408 you set it to "language_COUNTRY.charset". In any case, we resolve it
409 through the charset.alias file. */
414 #elif defined WIN32_NATIVE
416 static char buf
[2 + 10 + 1];
418 /* Woe32 has a function returning the locale's codepage as a number. */
419 sprintf (buf
, "CP%u", GetACP ());
425 static char buf
[2 + 10 + 1];
429 /* Allow user to override the codeset, as set in the operating system,
430 with standard language environment variables. */
431 locale
= getenv ("LC_ALL");
432 if (locale
== NULL
|| locale
[0] == '\0')
434 locale
= getenv ("LC_CTYPE");
435 if (locale
== NULL
|| locale
[0] == '\0')
436 locale
= getenv ("LANG");
438 if (locale
!= NULL
&& locale
[0] != '\0')
440 /* If the locale name contains an encoding after the dot, return it. */
441 const char *dot
= strchr (locale
, '.');
445 const char *modifier
;
448 /* Look for the possible @... trailer and remove it, if any. */
449 modifier
= strchr (dot
, '@');
450 if (modifier
== NULL
)
452 if (modifier
- dot
< sizeof (buf
))
454 memcpy (buf
, dot
, modifier
- dot
);
455 buf
[modifier
- dot
] = '\0';
460 /* Resolve through the charset.alias file. */
465 /* OS/2 has a function returning the locale's codepage as a number. */
466 if (DosQueryCp (sizeof (cp
), cp
, &cplen
))
470 sprintf (buf
, "CP%u", cp
[0]);
478 /* The canonical name cannot be determined. */
482 for (aliases
= get_charset_aliases ();
484 aliases
+= strlen (aliases
) + 1, aliases
+= strlen (aliases
) + 1)
485 if (strcmp (codeset
, aliases
) == 0
486 || (aliases
[0] == '*' && aliases
[1] == '\0'))
488 codeset
= aliases
+ strlen (aliases
) + 1;
492 /* Don't return an empty string. GNU libc and GNU libiconv interpret
493 the empty string as denoting "the locale's character encoding",
494 thus GNU libiconv would call this function a second time. */
495 if (codeset
[0] == '\0')