2 * Copyright © 2014 Canonical Limited
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
17 * Author: Ryan Lortie <desrt@desrt.ca>
22 #include "gstrfuncs.h"
47 #include "gtranslit-data.h"
/* Fetch source character number `index` for an encoded table value.
 *
 * When the 0x8000 flag is set in `encoded`, its low 12 bits are an
 * offset into `array` and the result is array[offset + index].
 * Otherwise `encoded` is itself the character and `index` is ignored.
 *
 * Every parameter is parenthesised so that expression arguments
 * (for example `a | b` or `cond ? 1 : 2`) are treated as one operand.
 * Note that `encoded` is evaluated more than once.
 */
#define get_src_char(array, encoded, index) (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))
/* Number of characters described by an encoded table value.
 *
 * When the 0x8000 flag is set, the length is stored in the bits
 * selected by the 0x7000 mask (shifted down by 12).  Otherwise the
 * value is a single inline character and the length is 1.
 *
 * `encoded` is parenthesised so that expression arguments are treated
 * as one operand; it is evaluated more than once.
 */
#define get_length(encoded) (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)
52 #if G_BYTE_ORDER == G_BIG_ENDIAN
/* Pointer to the ASCII replacement for an encoded value (variant used
 * when G_BYTE_ORDER == G_BIG_ENDIAN).
 *
 * When the 0x8000 flag is set, the low 12 bits index into `array` and
 * the address of that element is returned.  Otherwise the result
 * points at the second byte of `encoded` itself, so `encoded` must be
 * an lvalue.
 * NOTE(review): presumably `encoded` is a 16-bit field whose low byte
 * holds the inline character on big-endian hosts -- confirm.
 */
#define get_ascii_item(array, encoded) (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
/* Pointer to the ASCII replacement for an encoded value.
 *
 * When the 0x8000 flag is set, the low 12 bits index into `array` and
 * the address of that element is returned.  Otherwise the result
 * points at the first byte of `encoded` itself, so `encoded` must be
 * an lvalue.
 * NOTE(review): presumably this is the non-big-endian branch of the
 * G_BYTE_ORDER conditional -- confirm.
 */
#define get_ascii_item(array, encoded) (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
58 static const gchar
* lookup_in_item (guint item_id
,
64 compare_mapping_entry (gconstpointer user_data
,
67 const struct mapping_entry
*entry
= data
;
68 const gunichar
*key
= user_data
;
71 G_STATIC_ASSERT(MAX_KEY_SIZE
== 2);
73 src_0
= get_src_char (src_table
, entry
->src
, 0);
77 else if (key
[0] < src_0
)
80 if (get_length (entry
->src
) > 1)
84 src_1
= get_src_char (src_table
, entry
->src
, 1);
88 else if (key
[1] < src_1
)
98 lookup_in_mapping (const struct mapping_entry
*mapping
,
104 const struct mapping_entry
*hit
;
106 hit
= bsearch (key
, mapping
, mapping_size
, sizeof (struct mapping_entry
), compare_mapping_entry
);
111 *key_consumed
= get_length (hit
->src
);
112 *result_len
= get_length (hit
->ascii
);
114 return get_ascii_item(ascii_table
, hit
->ascii
);
118 lookup_in_chain (const guint8
*chain
,
125 while (*chain
!= 0xff)
127 result
= lookup_in_item (*chain
, key
, result_len
, key_consumed
);
139 lookup_in_item (guint item_id
,
146 const guint8
*chain
= chains_table
+ chain_starts
[item_id
& 0x7f];
148 return lookup_in_chain (chain
, key
, result_len
, key_consumed
);
152 const struct mapping_range
*range
= &mapping_ranges
[item_id
];
154 return lookup_in_mapping (mappings_table
+ range
->start
, range
->length
, key
, result_len
, key_consumed
);
159 compare_locale_entry (gconstpointer user_data
,
162 const struct locale_entry
*entry
= data
;
163 const gchar
*key
= user_data
;
165 return strcmp (key
, &locale_names
[entry
->name_offset
]);
169 lookup_item_id_for_one_locale (const gchar
*key
,
172 const struct locale_entry
*hit
;
174 hit
= bsearch (key
, locale_index
, G_N_ELEMENTS (locale_index
), sizeof (struct locale_entry
), compare_locale_entry
);
179 *item_id
= hit
->item_id
;
184 lookup_item_id_for_locale (const gchar
*locale
)
186 gchar key
[MAX_LOCALE_NAME
+ 1];
187 const gchar
*language
;
189 const gchar
*territory
= NULL
;
190 guint territory_len
= 0;
191 const gchar
*modifier
= NULL
;
192 guint modifier_len
= 0;
193 const gchar
*next_char
;
196 /* As per POSIX, a valid locale looks like:
198 * language[_territory][.codeset][@modifier]
201 language_len
= strcspn (language
, "_.@");
202 next_char
= language
+ language_len
;
204 if (*next_char
== '_')
206 territory
= next_char
;
207 territory_len
= strcspn (territory
+ 1, "_.@") + 1;
208 next_char
= territory
+ territory_len
;
211 if (*next_char
== '.')
213 const gchar
*codeset
;
217 codeset_len
= strcspn (codeset
+ 1, "_.@") + 1;
218 next_char
= codeset
+ codeset_len
;
221 if (*next_char
== '@')
223 modifier
= next_char
;
224 modifier_len
= strcspn (modifier
+ 1, "_.@") + 1;
225 next_char
= modifier
+ modifier_len
;
228 /* What madness is this? */
229 if (language_len
== 0 || *next_char
)
230 return default_item_id
;
232 /* We are not interested in codeset.
240 * Note: we have no locales of the form aa_BB@cc in the database.
248 if (modifier_len
&& language_len
+ modifier_len
<= MAX_LOCALE_NAME
)
250 memcpy (key
, language
, language_len
);
251 memcpy (key
+ language_len
, modifier
, modifier_len
);
252 key
[language_len
+ modifier_len
] = '\0';
254 if (lookup_item_id_for_one_locale (key
, &id
))
259 if (territory_len
&& language_len
+ territory_len
<= MAX_LOCALE_NAME
)
261 memcpy (key
, language
, language_len
);
262 memcpy (key
+ language_len
, territory
, territory_len
);
263 key
[language_len
+ territory_len
] = '\0';
265 if (lookup_item_id_for_one_locale (key
, &id
))
270 if (language_len
<= MAX_LOCALE_NAME
)
272 memcpy (key
, language
, language_len
);
273 key
[language_len
] = '\0';
275 if (lookup_item_id_for_one_locale (key
, &id
))
279 return default_item_id
;
283 get_default_item_id (void)
285 static guint item_id
;
286 static gboolean done
;
288 /* Doesn't need to be locked -- no harm in doing it twice. */
293 locale
= setlocale (LC_CTYPE
, NULL
);
294 item_id
= lookup_item_id_for_locale (locale
);
303 * @str: a string, in UTF-8
304 * @from_locale: (nullable): the source locale, if known
306 * Transliterate @str to plain ASCII.
308 * For best results, @str should be in composed normalised form.
310 * This function performs a reasonably good set of character
311 * replacements. The particular set of replacements that is done may
312 * change by version or even by runtime environment.
314 * If the source language of @str is known, it can be used to improve the
315 * accuracy of the transliteration by passing it as @from_locale. It should
316 * be a valid POSIX locale string (of the form
317 * `language[_territory][.codeset][@modifier]`).
319 * If @from_locale is %NULL then the current locale is used.
321 * If you want to do transliteration for no specific locale, and you want it
322 * to be done independently of the current locale, specify `"C"` for
325 * Returns: a string in plain ASCII
330 g_str_to_ascii (const gchar
*str
,
331 const gchar
*from_locale
)
336 g_return_val_if_fail (str
!= NULL
, NULL
);
338 if (g_str_is_ascii (str
))
339 return g_strdup (str
);
342 item_id
= lookup_item_id_for_locale (from_locale
);
344 item_id
= get_default_item_id ();
346 result
= g_string_sized_new (strlen (str
));
350 /* We only need to transliterate non-ASCII values... */
353 gunichar key
[MAX_KEY_SIZE
];
359 G_STATIC_ASSERT(MAX_KEY_SIZE
== 2);
361 c
= g_utf8_get_char (str
);
363 /* This is where it gets evil...
365 * We know that MAX_KEY_SIZE is 2. We also know that we
366 * only want to try another character if it's non-ascii.
368 str
= g_utf8_next_char (str
);
372 key
[1] = g_utf8_get_char (str
);
376 r
= lookup_in_item (item_id
, key
, &r_len
, &consumed
);
378 /* If we failed to map two characters, try again with one.
380 * gconv behaviour is a bit weird here -- it seems to
381 * depend on the randomness of the binary search and the
382 * size of the input buffer as to what result we get here.
384 * Doing it this way is more work, but should be
387 if (r
== NULL
&& key
[1])
390 r
= lookup_in_item (item_id
, key
, &r_len
, &consumed
);
395 g_string_append_len (result
, r
, r_len
);
397 /* If it took both then skip again */
398 str
= g_utf8_next_char (str
);
400 else /* no match found */
401 g_string_append_c (result
, '?');
403 else if (*str
& 0x80) /* Out-of-range non-ASCII case */
405 g_string_append_c (result
, '?');
406 str
= g_utf8_next_char (str
);
408 else /* ASCII case */
409 g_string_append_c (result
, *str
++);
412 return g_string_free (result
, FALSE
);