utf8: add unit test for g_utf8_make_valid
[glib.git] / glib / gcharset.c
blobd47541cba89cdb36b7d033197ed56791ae6679c7
1 /* gcharset.c - Charset information
3 * Copyright (C) 2011 Red Hat, Inc.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 #include "config.h"
21 #include "gcharsetprivate.h"
23 #include "garray.h"
24 #include "genviron.h"
25 #include "ghash.h"
26 #include "gmessages.h"
27 #include "gstrfuncs.h"
28 #include "gthread.h"
29 #ifdef G_OS_WIN32
30 #include "gwin32.h"
31 #endif
33 #include "libcharset/libcharset.h"
35 #include <string.h>
36 #include <stdio.h>
/* Lock taken around the lazy construction of the charset alias table and
 * around the calls into the libcharset helpers made in this file.
 */
G_LOCK_DEFINE_STATIC (aliases);
40 static GHashTable *
41 get_alias_hash (void)
43 static GHashTable *alias_hash = NULL;
44 const char *aliases;
46 G_LOCK (aliases);
48 if (!alias_hash)
50 alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
52 aliases = _g_locale_get_charset_aliases ();
53 while (*aliases != '\0')
55 const char *canonical;
56 const char *alias;
57 const char **alias_array;
58 int count = 0;
60 alias = aliases;
61 aliases += strlen (aliases) + 1;
62 canonical = aliases;
63 aliases += strlen (aliases) + 1;
65 alias_array = g_hash_table_lookup (alias_hash, canonical);
66 if (alias_array)
68 while (alias_array[count])
69 count++;
72 alias_array = g_renew (const char *, alias_array, count + 2);
73 alias_array[count] = alias;
74 alias_array[count + 1] = NULL;
76 g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
80 G_UNLOCK (aliases);
82 return alias_hash;
85 /* As an abuse of the alias table, the following routines gets
86 * the charsets that are aliases for the canonical name.
88 const char **
89 _g_charset_get_aliases (const char *canonical_name)
91 GHashTable *alias_hash = get_alias_hash ();
93 return g_hash_table_lookup (alias_hash, canonical_name);
96 static gboolean
97 g_utf8_get_charset_internal (const char *raw_data,
98 const char **a)
100 const char *charset = g_getenv ("CHARSET");
102 if (charset && *charset)
104 *a = charset;
106 if (charset && strstr (charset, "UTF-8"))
107 return TRUE;
108 else
109 return FALSE;
112 /* The libcharset code tries to be thread-safe without
113 * a lock, but has a memory leak and a missing memory
114 * barrier, so we lock for it
116 G_LOCK (aliases);
117 charset = _g_locale_charset_unalias (raw_data);
118 G_UNLOCK (aliases);
120 if (charset && *charset)
122 *a = charset;
124 if (charset && strstr (charset, "UTF-8"))
125 return TRUE;
126 else
127 return FALSE;
130 /* Assume this for compatibility at present. */
131 *a = "US-ASCII";
133 return FALSE;
136 typedef struct _GCharsetCache GCharsetCache;
138 struct _GCharsetCache {
139 gboolean is_utf8;
140 gchar *raw;
141 gchar *charset;
144 static void
145 charset_cache_free (gpointer data)
147 GCharsetCache *cache = data;
148 g_free (cache->raw);
149 g_free (cache->charset);
150 g_free (cache);
154 * g_get_charset:
155 * @charset: (out) (optional) (transfer none): return location for character set
156 * name, or %NULL.
158 * Obtains the character set for the [current locale][setlocale]; you
159 * might use this character set as an argument to g_convert(), to convert
160 * from the current locale's encoding to some other encoding. (Frequently
161 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, though.)
163 * On Windows the character set returned by this function is the
164 * so-called system default ANSI code-page. That is the character set
165 * used by the "narrow" versions of C library and Win32 functions that
166 * handle file names. It might be different from the character set
167 * used by the C library's current locale.
169 * The return value is %TRUE if the locale's encoding is UTF-8, in that
170 * case you can perhaps avoid calling g_convert().
172 * The string returned in @charset is not allocated, and should not be
173 * freed.
175 * Returns: %TRUE if the returned charset is UTF-8
177 gboolean
178 g_get_charset (const char **charset)
180 static GPrivate cache_private = G_PRIVATE_INIT (charset_cache_free);
181 GCharsetCache *cache = g_private_get (&cache_private);
182 const gchar *raw;
184 if (!cache)
186 cache = g_new0 (GCharsetCache, 1);
187 g_private_set (&cache_private, cache);
190 G_LOCK (aliases);
191 raw = _g_locale_charset_raw ();
192 G_UNLOCK (aliases);
194 if (!(cache->raw && strcmp (cache->raw, raw) == 0))
196 const gchar *new_charset;
198 g_free (cache->raw);
199 g_free (cache->charset);
200 cache->raw = g_strdup (raw);
201 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
202 cache->charset = g_strdup (new_charset);
205 if (charset)
206 *charset = cache->charset;
208 return cache->is_utf8;
212 * g_get_codeset:
214 * Gets the character set for the current locale.
216 * Returns: a newly allocated string containing the name
217 * of the character set. This string must be freed with g_free().
219 gchar *
220 g_get_codeset (void)
222 const gchar *charset;
224 g_get_charset (&charset);
226 return g_strdup (charset);
229 #ifndef G_OS_WIN32
231 /* read an alias file for the locales */
232 static void
233 read_aliases (gchar *file,
234 GHashTable *alias_table)
236 FILE *fp;
237 char buf[256];
239 fp = fopen (file,"r");
240 if (!fp)
241 return;
242 while (fgets (buf, 256, fp))
244 char *p, *q;
246 g_strstrip (buf);
248 /* Line is a comment */
249 if ((buf[0] == '#') || (buf[0] == '\0'))
250 continue;
252 /* Reads first column */
253 for (p = buf, q = NULL; *p; p++) {
254 if ((*p == '\t') || (*p == ' ') || (*p == ':')) {
255 *p = '\0';
256 q = p+1;
257 while ((*q == '\t') || (*q == ' ')) {
258 q++;
260 break;
263 /* The line only had one column */
264 if (!q || *q == '\0')
265 continue;
267 /* Read second column */
268 for (p = q; *p; p++) {
269 if ((*p == '\t') || (*p == ' ')) {
270 *p = '\0';
271 break;
275 /* Add to alias table if necessary */
276 if (!g_hash_table_lookup (alias_table, buf)) {
277 g_hash_table_insert (alias_table, g_strdup (buf), g_strdup (q));
280 fclose (fp);
283 #endif
285 static char *
286 unalias_lang (char *lang)
288 #ifndef G_OS_WIN32
289 static GHashTable *alias_table = NULL;
290 char *p;
291 int i;
293 if (g_once_init_enter (&alias_table))
295 GHashTable *table = g_hash_table_new (g_str_hash, g_str_equal);
296 read_aliases ("/usr/share/locale/locale.alias", table);
297 g_once_init_leave (&alias_table, table);
300 i = 0;
301 while ((p = g_hash_table_lookup (alias_table, lang)) && (strcmp (p, lang) != 0))
303 lang = p;
304 if (i++ == 30)
306 static gboolean said_before = FALSE;
307 if (!said_before)
308 g_warning ("Too many alias levels for a locale, "
309 "may indicate a loop");
310 said_before = TRUE;
311 return lang;
314 #endif
315 return lang;
/* Mask for components of locale spec. The ordering here is from
 * least significant to most significant
 */
enum
{
  COMPONENT_CODESET   = 1 << 0,
  COMPONENT_TERRITORY = 1 << 1,
  COMPONENT_MODIFIER  = 1 << 2
};
328 /* Break an X/Open style locale specification into components
330 static guint
331 explode_locale (const gchar *locale,
332 gchar **language,
333 gchar **territory,
334 gchar **codeset,
335 gchar **modifier)
337 const gchar *uscore_pos;
338 const gchar *at_pos;
339 const gchar *dot_pos;
341 guint mask = 0;
343 uscore_pos = strchr (locale, '_');
344 dot_pos = strchr (uscore_pos ? uscore_pos : locale, '.');
345 at_pos = strchr (dot_pos ? dot_pos : (uscore_pos ? uscore_pos : locale), '@');
347 if (at_pos)
349 mask |= COMPONENT_MODIFIER;
350 *modifier = g_strdup (at_pos);
352 else
353 at_pos = locale + strlen (locale);
355 if (dot_pos)
357 mask |= COMPONENT_CODESET;
358 *codeset = g_strndup (dot_pos, at_pos - dot_pos);
360 else
361 dot_pos = at_pos;
363 if (uscore_pos)
365 mask |= COMPONENT_TERRITORY;
366 *territory = g_strndup (uscore_pos, dot_pos - uscore_pos);
368 else
369 uscore_pos = dot_pos;
371 *language = g_strndup (locale, uscore_pos - locale);
373 return mask;
377 * Compute all interesting variants for a given locale name -
378 * by stripping off different components of the value.
380 * For simplicity, we assume that the locale is in
381 * X/Open format: language[_territory][.codeset][@modifier]
383 * TODO: Extend this to handle the CEN format (see the GNUlibc docs)
384 * as well. We could just copy the code from glibc wholesale
385 * but it is big, ugly, and complicated, so I'm reluctant
386 * to do so when this should handle 99% of the time...
388 static void
389 append_locale_variants (GPtrArray *array,
390 const gchar *locale)
392 gchar *language = NULL;
393 gchar *territory = NULL;
394 gchar *codeset = NULL;
395 gchar *modifier = NULL;
397 guint mask;
398 guint i, j;
400 g_return_if_fail (locale != NULL);
402 mask = explode_locale (locale, &language, &territory, &codeset, &modifier);
404 /* Iterate through all possible combinations, from least attractive
405 * to most attractive.
407 for (j = 0; j <= mask; ++j)
409 i = mask - j;
411 if ((i & ~mask) == 0)
413 gchar *val = g_strconcat (language,
414 (i & COMPONENT_TERRITORY) ? territory : "",
415 (i & COMPONENT_CODESET) ? codeset : "",
416 (i & COMPONENT_MODIFIER) ? modifier : "",
417 NULL);
418 g_ptr_array_add (array, val);
422 g_free (language);
423 if (mask & COMPONENT_CODESET)
424 g_free (codeset);
425 if (mask & COMPONENT_TERRITORY)
426 g_free (territory);
427 if (mask & COMPONENT_MODIFIER)
428 g_free (modifier);
432 * g_get_locale_variants:
433 * @locale: a locale identifier
435 * Returns a list of derived variants of @locale, which can be used to
436 * e.g. construct locale-dependent filenames or search paths. The returned
437 * list is sorted from most desirable to least desirable.
438 * This function handles territory, charset and extra locale modifiers.
440 * For example, if @locale is "fr_BE", then the returned list
441 * is "fr_BE", "fr".
443 * If you need the list of variants for the current locale,
444 * use g_get_language_names().
446 * Returns: (transfer full) (array zero-terminated=1) (element-type utf8): a newly
447 * allocated array of newly allocated strings with the locale variants. Free with
448 * g_strfreev().
450 * Since: 2.28
452 gchar **
453 g_get_locale_variants (const gchar *locale)
455 GPtrArray *array;
457 g_return_val_if_fail (locale != NULL, NULL);
459 array = g_ptr_array_sized_new (8);
460 append_locale_variants (array, locale);
461 g_ptr_array_add (array, NULL);
463 return (gchar **) g_ptr_array_free (array, FALSE);
466 /* The following is (partly) taken from the gettext package.
467 Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc. */
469 static const gchar *
470 guess_category_value (const gchar *category_name)
472 const gchar *retval;
474 /* The highest priority value is the 'LANGUAGE' environment
475 variable. This is a GNU extension. */
476 retval = g_getenv ("LANGUAGE");
477 if ((retval != NULL) && (retval[0] != '\0'))
478 return retval;
480 /* 'LANGUAGE' is not set. So we have to proceed with the POSIX
481 methods of looking to 'LC_ALL', 'LC_xxx', and 'LANG'. On some
482 systems this can be done by the 'setlocale' function itself. */
484 /* Setting of LC_ALL overwrites all other. */
485 retval = g_getenv ("LC_ALL");
486 if ((retval != NULL) && (retval[0] != '\0'))
487 return retval;
489 /* Next comes the name of the desired category. */
490 retval = g_getenv (category_name);
491 if ((retval != NULL) && (retval[0] != '\0'))
492 return retval;
494 /* Last possibility is the LANG environment variable. */
495 retval = g_getenv ("LANG");
496 if ((retval != NULL) && (retval[0] != '\0'))
497 return retval;
499 #ifdef G_PLATFORM_WIN32
500 /* g_win32_getlocale() first checks for LC_ALL, LC_MESSAGES and
501 * LANG, which we already did above. Oh well. The main point of
502 * calling g_win32_getlocale() is to get the thread's locale as used
503 * by Windows and the Microsoft C runtime (in the "English_United
504 * States" format) translated into the Unixish format.
507 char *locale = g_win32_getlocale ();
508 retval = g_intern_string (locale);
509 g_free (locale);
510 return retval;
512 #endif
514 return NULL;
517 typedef struct _GLanguageNamesCache GLanguageNamesCache;
519 struct _GLanguageNamesCache {
520 gchar *languages;
521 gchar **language_names;
524 static void
525 language_names_cache_free (gpointer data)
527 GLanguageNamesCache *cache = data;
528 g_free (cache->languages);
529 g_strfreev (cache->language_names);
530 g_free (cache);
534 * g_get_language_names:
536 * Computes a list of applicable locale names, which can be used to
537 * e.g. construct locale-dependent filenames or search paths. The returned
538 * list is sorted from most desirable to least desirable and always contains
539 * the default locale "C".
541 * For example, if LANGUAGE=de:en_US, then the returned list is
542 * "de", "en_US", "en", "C".
544 * This function consults the environment variables `LANGUAGE`, `LC_ALL`,
545 * `LC_MESSAGES` and `LANG` to find the list of locales specified by the
546 * user.
548 * Returns: (array zero-terminated=1) (transfer none): a %NULL-terminated array of strings owned by GLib
549 * that must not be modified or freed.
551 * Since: 2.6
553 const gchar * const *
554 g_get_language_names (void)
556 static GPrivate cache_private = G_PRIVATE_INIT (language_names_cache_free);
557 GLanguageNamesCache *cache = g_private_get (&cache_private);
558 const gchar *value;
560 if (!cache)
562 cache = g_new0 (GLanguageNamesCache, 1);
563 g_private_set (&cache_private, cache);
566 value = guess_category_value ("LC_MESSAGES");
567 if (!value)
568 value = "C";
570 if (!(cache->languages && strcmp (cache->languages, value) == 0))
572 GPtrArray *array;
573 gchar **alist, **a;
575 g_free (cache->languages);
576 g_strfreev (cache->language_names);
577 cache->languages = g_strdup (value);
579 array = g_ptr_array_sized_new (8);
581 alist = g_strsplit (value, ":", 0);
582 for (a = alist; *a; a++)
583 append_locale_variants (array, unalias_lang (*a));
584 g_strfreev (alist);
585 g_ptr_array_add (array, g_strdup ("C"));
586 g_ptr_array_add (array, NULL);
588 cache->language_names = (gchar **) g_ptr_array_free (array, FALSE);
591 return (const gchar * const *) cache->language_names;