1 /* gcharset.c - Charset information
3 * Copyright (C) 2011 Red Hat, Inc.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21 #include "gcharsetprivate.h"
26 #include "gmessages.h"
27 #include "gstrfuncs.h"
33 #include "libcharset/libcharset.h"
38 G_LOCK_DEFINE_STATIC (aliases
);
43 static GHashTable
*alias_hash
= NULL
;
50 alias_hash
= g_hash_table_new (g_str_hash
, g_str_equal
);
52 aliases
= _g_locale_get_charset_aliases ();
53 while (*aliases
!= '\0')
55 const char *canonical
;
57 const char **alias_array
;
61 aliases
+= strlen (aliases
) + 1;
63 aliases
+= strlen (aliases
) + 1;
65 alias_array
= g_hash_table_lookup (alias_hash
, canonical
);
68 while (alias_array
[count
])
72 alias_array
= g_renew (const char *, alias_array
, count
+ 2);
73 alias_array
[count
] = alias
;
74 alias_array
[count
+ 1] = NULL
;
76 g_hash_table_insert (alias_hash
, (char *)canonical
, alias_array
);
85 /* As an abuse of the alias table, the following routine gets
86 * the charsets that are aliases for the canonical name.
89 _g_charset_get_aliases (const char *canonical_name
)
91 GHashTable
*alias_hash
= get_alias_hash ();
93 return g_hash_table_lookup (alias_hash
, canonical_name
);
97 g_utf8_get_charset_internal (const char *raw_data
,
100 const char *charset
= g_getenv ("CHARSET");
102 if (charset
&& *charset
)
106 if (charset
&& strstr (charset
, "UTF-8"))
112 /* The libcharset code tries to be thread-safe without
113 * a lock, but has a memory leak and a missing memory
114 * barrier, so we lock for it
117 charset
= _g_locale_charset_unalias (raw_data
);
120 if (charset
&& *charset
)
124 if (charset
&& strstr (charset
, "UTF-8"))
130 /* Assume this for compatibility at present. */
136 typedef struct _GCharsetCache GCharsetCache
;
138 struct _GCharsetCache
{
145 charset_cache_free (gpointer data
)
147 GCharsetCache
*cache
= data
;
149 g_free (cache
->charset
);
155 * @charset: (out) (optional) (transfer none): return location for character set
158 * Obtains the character set for the [current locale][setlocale]; you
159 * might use this character set as an argument to g_convert(), to convert
160 * from the current locale's encoding to some other encoding. (Frequently
161 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, though.)
163 * On Windows the character set returned by this function is the
164 * so-called system default ANSI code-page. That is the character set
165 * used by the "narrow" versions of C library and Win32 functions that
166 * handle file names. It might be different from the character set
167 * used by the C library's current locale.
169 * The return value is %TRUE if the locale's encoding is UTF-8, in that
170 * case you can perhaps avoid calling g_convert().
172 * The string returned in @charset is not allocated, and should not be
175 * Returns: %TRUE if the returned charset is UTF-8
178 g_get_charset (const char **charset
)
180 static GPrivate cache_private
= G_PRIVATE_INIT (charset_cache_free
);
181 GCharsetCache
*cache
= g_private_get (&cache_private
);
186 cache
= g_new0 (GCharsetCache
, 1);
187 g_private_set (&cache_private
, cache
);
191 raw
= _g_locale_charset_raw ();
194 if (!(cache
->raw
&& strcmp (cache
->raw
, raw
) == 0))
196 const gchar
*new_charset
;
199 g_free (cache
->charset
);
200 cache
->raw
= g_strdup (raw
);
201 cache
->is_utf8
= g_utf8_get_charset_internal (raw
, &new_charset
);
202 cache
->charset
= g_strdup (new_charset
);
206 *charset
= cache
->charset
;
208 return cache
->is_utf8
;
214 * Gets the character set for the current locale.
216 * Returns: a newly allocated string containing the name
217 * of the character set. This string must be freed with g_free().
222 const gchar
*charset
;
224 g_get_charset (&charset
);
226 return g_strdup (charset
);
231 /* read an alias file for the locales */
233 read_aliases (gchar
*file
,
234 GHashTable
*alias_table
)
239 fp
= fopen (file
,"r");
242 while (fgets (buf
, 256, fp
))
248 /* Line is a comment */
249 if ((buf
[0] == '#') || (buf
[0] == '\0'))
252 /* Reads first column */
253 for (p
= buf
, q
= NULL
; *p
; p
++) {
254 if ((*p
== '\t') || (*p
== ' ') || (*p
== ':')) {
257 while ((*q
== '\t') || (*q
== ' ')) {
263 /* The line only had one column */
264 if (!q
|| *q
== '\0')
267 /* Read second column */
268 for (p
= q
; *p
; p
++) {
269 if ((*p
== '\t') || (*p
== ' ')) {
275 /* Add to alias table if necessary */
276 if (!g_hash_table_lookup (alias_table
, buf
)) {
277 g_hash_table_insert (alias_table
, g_strdup (buf
), g_strdup (q
));
286 unalias_lang (char *lang
)
289 static GHashTable
*alias_table
= NULL
;
293 if (g_once_init_enter (&alias_table
))
295 GHashTable
*table
= g_hash_table_new (g_str_hash
, g_str_equal
);
296 read_aliases ("/usr/share/locale/locale.alias", table
);
297 g_once_init_leave (&alias_table
, table
);
301 while ((p
= g_hash_table_lookup (alias_table
, lang
)) && (strcmp (p
, lang
) != 0))
306 static gboolean said_before
= FALSE
;
308 g_warning ("Too many alias levels for a locale, "
309 "may indicate a loop");
318 /* Mask for components of locale spec. The ordering here is from
319 * least significant to most significant
323 COMPONENT_CODESET
= 1 << 0,
324 COMPONENT_TERRITORY
= 1 << 1,
325 COMPONENT_MODIFIER
= 1 << 2
328 /* Break an X/Open style locale specification into components
331 explode_locale (const gchar
*locale
,
337 const gchar
*uscore_pos
;
339 const gchar
*dot_pos
;
343 uscore_pos
= strchr (locale
, '_');
344 dot_pos
= strchr (uscore_pos
? uscore_pos
: locale
, '.');
345 at_pos
= strchr (dot_pos
? dot_pos
: (uscore_pos
? uscore_pos
: locale
), '@');
349 mask
|= COMPONENT_MODIFIER
;
350 *modifier
= g_strdup (at_pos
);
353 at_pos
= locale
+ strlen (locale
);
357 mask
|= COMPONENT_CODESET
;
358 *codeset
= g_strndup (dot_pos
, at_pos
- dot_pos
);
365 mask
|= COMPONENT_TERRITORY
;
366 *territory
= g_strndup (uscore_pos
, dot_pos
- uscore_pos
);
369 uscore_pos
= dot_pos
;
371 *language
= g_strndup (locale
, uscore_pos
- locale
);
377 * Compute all interesting variants for a given locale name -
378 * by stripping off different components of the value.
380 * For simplicity, we assume that the locale is in
381 * X/Open format: language[_territory][.codeset][@modifier]
383 * TODO: Extend this to handle the CEN format (see the GNU libc docs)
384 * as well. We could just copy the code from glibc wholesale
385 * but it is big, ugly, and complicated, so I'm reluctant
386 * to do so when this should handle 99% of cases...
389 append_locale_variants (GPtrArray
*array
,
392 gchar
*language
= NULL
;
393 gchar
*territory
= NULL
;
394 gchar
*codeset
= NULL
;
395 gchar
*modifier
= NULL
;
400 g_return_if_fail (locale
!= NULL
);
402 mask
= explode_locale (locale
, &language
, &territory
, &codeset
, &modifier
);
404 /* Iterate through all possible combinations, from most attractive
405 * to least attractive.
407 for (j
= 0; j
<= mask
; ++j
)
411 if ((i
& ~mask
) == 0)
413 gchar
*val
= g_strconcat (language
,
414 (i
& COMPONENT_TERRITORY
) ? territory
: "",
415 (i
& COMPONENT_CODESET
) ? codeset
: "",
416 (i
& COMPONENT_MODIFIER
) ? modifier
: "",
418 g_ptr_array_add (array
, val
);
423 if (mask
& COMPONENT_CODESET
)
425 if (mask
& COMPONENT_TERRITORY
)
427 if (mask
& COMPONENT_MODIFIER
)
432 * g_get_locale_variants:
433 * @locale: a locale identifier
435 * Returns a list of derived variants of @locale, which can be used to
436 * e.g. construct locale-dependent filenames or search paths. The returned
437 * list is sorted from most desirable to least desirable.
438 * This function handles territory, charset and extra locale modifiers.
440 * For example, if @locale is "fr_BE", then the returned list
443 * If you need the list of variants for the current locale,
444 * use g_get_language_names().
446 * Returns: (transfer full) (array zero-terminated=1) (element-type utf8): a newly
447 * allocated array of newly allocated strings with the locale variants. Free with
453 g_get_locale_variants (const gchar
*locale
)
457 g_return_val_if_fail (locale
!= NULL
, NULL
);
459 array
= g_ptr_array_sized_new (8);
460 append_locale_variants (array
, locale
);
461 g_ptr_array_add (array
, NULL
);
463 return (gchar
**) g_ptr_array_free (array
, FALSE
);
466 /* The following is (partly) taken from the gettext package.
467 Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc. */
470 guess_category_value (const gchar
*category_name
)
474 /* The highest priority value is the 'LANGUAGE' environment
475 variable. This is a GNU extension. */
476 retval
= g_getenv ("LANGUAGE");
477 if ((retval
!= NULL
) && (retval
[0] != '\0'))
480 /* 'LANGUAGE' is not set. So we have to proceed with the POSIX
481 methods of looking to 'LC_ALL', 'LC_xxx', and 'LANG'. On some
482 systems this can be done by the 'setlocale' function itself. */
484 /* Setting LC_ALL overrides all other settings. */
485 retval
= g_getenv ("LC_ALL");
486 if ((retval
!= NULL
) && (retval
[0] != '\0'))
489 /* Next comes the name of the desired category. */
490 retval
= g_getenv (category_name
);
491 if ((retval
!= NULL
) && (retval
[0] != '\0'))
494 /* Last possibility is the LANG environment variable. */
495 retval
= g_getenv ("LANG");
496 if ((retval
!= NULL
) && (retval
[0] != '\0'))
499 #ifdef G_PLATFORM_WIN32
500 /* g_win32_getlocale() first checks for LC_ALL, LC_MESSAGES and
501 * LANG, which we already did above. Oh well. The main point of
502 * calling g_win32_getlocale() is to get the thread's locale as used
503 * by Windows and the Microsoft C runtime (in the "English_United
504 * States" format) translated into the Unixish format.
507 char *locale
= g_win32_getlocale ();
508 retval
= g_intern_string (locale
);
517 typedef struct _GLanguageNamesCache GLanguageNamesCache
;
519 struct _GLanguageNamesCache
{
521 gchar
**language_names
;
525 language_names_cache_free (gpointer data
)
527 GLanguageNamesCache
*cache
= data
;
528 g_free (cache
->languages
);
529 g_strfreev (cache
->language_names
);
534 * g_get_language_names:
536 * Computes a list of applicable locale names, which can be used to
537 * e.g. construct locale-dependent filenames or search paths. The returned
538 * list is sorted from most desirable to least desirable and always contains
539 * the default locale "C".
541 * For example, if LANGUAGE=de:en_US, then the returned list is
542 * "de", "en_US", "en", "C".
544 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
545 * `LC_MESSAGES` and `LANG` to find the list of locales specified by the
548 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
549 * that must not be modified or freed.
553 const gchar
* const *
554 g_get_language_names (void)
556 static GPrivate cache_private
= G_PRIVATE_INIT (language_names_cache_free
);
557 GLanguageNamesCache
*cache
= g_private_get (&cache_private
);
562 cache
= g_new0 (GLanguageNamesCache
, 1);
563 g_private_set (&cache_private
, cache
);
566 value
= guess_category_value ("LC_MESSAGES");
570 if (!(cache
->languages
&& strcmp (cache
->languages
, value
) == 0))
575 g_free (cache
->languages
);
576 g_strfreev (cache
->language_names
);
577 cache
->languages
= g_strdup (value
);
579 array
= g_ptr_array_sized_new (8);
581 alist
= g_strsplit (value
, ":", 0);
582 for (a
= alist
; *a
; a
++)
583 append_locale_variants (array
, unalias_lang (*a
));
585 g_ptr_array_add (array
, g_strdup ("C"));
586 g_ptr_array_add (array
, NULL
);
588 cache
->language_names
= (gchar
**) g_ptr_array_free (array
, FALSE
);
591 return (const gchar
* const *) cache
->language_names
;