utf8: add unit test for g_utf8_make_valid
[glib.git] / glib / update-gtranslit.py
blob01f7c7f84f424684d6c2e30658a44cd6ff3e06bc
1 #!/usr/bin/env python3
3 # Run this script like so:
5 # ./update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h
7 import sys, os
# Directory holding the glibc locale source files, taken from the first
# command-line argument.  Chain.__init__ resolves locale names against it.
if len(sys.argv) < 2:
    # A missing argument used to surface as a bare IndexError traceback.
    sys.exit('usage: update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h')
localedir = sys.argv[1]
# Returns True if the name looks like a POSIX locale name, that is
# "<lang>_<land>" with an optional "@variant" suffix.
def looks_like_locale(name):
    # Any "@variant" suffix plays no part in the decision.
    base = name.split('@', 1)[0]

    if '_' not in base:
        return False

    lang, land = base.split('_', 1)

    # NOTE(review): in the original one-line expression "and" binds
    # tighter than "or", so a two-letter language is accepted whatever
    # follows the underscore; only three-letter languages need a
    # two-letter land.  That grouping is spelled out here unchanged --
    # confirm whether it was intended.
    if len(lang) == 2:
        return True

    return len(lang) == 3 and len(land) == 2
# Expands <U1234> style escapes into the characters they name.
#
# Text outside of escapes is copied through unchanged.  Every '<' must
# open a well-formed "<Uhex>" escape: anything else trips an assertion,
# and digits that are not hexadecimal make int() raise ValueError.
def unescape(string):
    chunks = []
    pos = 0

    while True:
        start_escape = string.find('<', pos)

        if start_escape == -1:
            # No more escapes: the remainder is literal text.
            chunks.append(string[pos:])
            break

        # Keep the literal text in front of the escape.  It used to be
        # dropped, so 'a<U0062>' came out as 'b' instead of 'ab'.
        chunks.append(string[pos:start_escape])

        assert string.startswith('<U', start_escape), 'escape must start with <U'
        start_escape += 2

        end_escape = string.find('>', start_escape)
        assert end_escape != -1, 'unterminated <U...> escape'

        chunks.append(chr(int(string[start_escape:end_escape], 16)))
        pos = end_escape + 1

    return ''.join(chunks)
# Tells whether every character of the string has a code point below
# 0x80.  The empty string counts as ascii.
def is_ascii(string):
    for char in string:
        if ord(char) >= 0x80:
            return False

    return True
# A Mapping is a map from non-ascii strings to ascii strings.
#
# It corresponds to a sequence of one or more mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# in a file.
class Mapping:
    def __init__(self):
        # Id handed out by the serialiser; None until serialise() has run.
        self.serialised = None
        # Unescaped source string -> ascii replacement ('' for IGNORE).
        self.mapping = {}

    # Scans a string like
    #
    #   <U00C4> "<U0041><U0308>";"<U0041><U0045>" % LATIN CAPITAL LETTER A WITH DIAERESIS.
    #
    # and adds the first all-ascii choice (or IGNORE) to the mapping
    # dictionary, with the origin string as the key.  In the case of
    # IGNORE, stores the empty string.  Nothing is stored when no
    # alternative qualifies.
    def consider_mapping_line(self, line):
        # The dummy comment appended here guarantees a third field, so
        # lines with and without a trailing comment unpack the same way.
        key, value, _ = (line + ' % comment').split(maxsplit=2)

        key = unescape(key)

        for alternative in value.split(';'):
            # startswith()/endswith() rather than indexing, so that an
            # empty alternative (e.g. from a stray ';') is skipped instead
            # of raising IndexError.
            if alternative == 'IGNORE':
                replacement = ''
            elif alternative.startswith('"') and alternative.endswith('"'):
                replacement = unescape(alternative[1:-1])
            elif alternative.startswith('<') and alternative.endswith('>'):
                replacement = unescape(alternative)
            else:
                continue

            if is_ascii(replacement):
                self.mapping[key] = replacement
                break

    # Performs a normal dictionary merge, but ensures that there are no
    # conflicting entries between the original dictionary and the requested
    # changes
    def merge_mapping(self, changes):
        for key, value in changes.mapping.items():
            if key in self.mapping:
                assert self.mapping[key] == value, 'conflicting entries for %r' % (key,)

        self.mapping.update(changes.mapping)

    # Can't get much flatter...
    def get_flattened(self):
        return [self]

    # Registers the dictionary with the serialiser on first use and
    # returns the id it was given; later calls return the cached id.
    def serialise(self, serialiser):
        if self.serialised is None:
            self.serialised = serialiser.add_mapping(self.mapping)

        return self.serialised
# A Chain is a sequence of mappings and chains.
#
# A chain contains another chain whenever "copy" or "include" is
# encountered in a source file.
#
# A chain contains a mapping whenever a sequence of mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# is encountered in a file.
#
# The order of lookup is reverse: later entries override earlier ones.
class Chain:
    def __init__(self, name):
        # Id handed out by the serialiser; None until serialise() has run.
        self.serialised = None
        self.name = name
        # Mapping and Chain items, in file order.
        self.chain = []
        # Number of users of this chain; incremented by whoever refers to it.
        self.links = 0

        self.read_from_file(os.path.join(localedir, name))

    # Parses the LC_CTYPE section of a locale source file and appends the
    # mappings and included chains found there to self.chain.
    def read_from_file(self, filename):
        current_mapping = None
        in_lc_ctype = False
        in_translit = False

        # "with" makes sure the file is closed again; it used to be left
        # open for the garbage collector.
        with open(filename, encoding='ascii', errors='surrogateescape') as fp:
            for line in fp:
                line = line.strip()

                if not in_lc_ctype:
                    # Everything before the LC_CTYPE section is skipped.
                    if line == 'LC_CTYPE':
                        in_lc_ctype = True
                    continue

                if line == 'END LC_CTYPE':
                    break

                if line.startswith(('copy', 'include')):
                    # An include ends the current run of mapping lines.
                    if current_mapping is not None:
                        self.chain.append(current_mapping)

                    copyname = unescape(line.split('"', 3)[1])
                    copyfile = get_chain(copyname)
                    self.chain.append(copyfile)
                    copyfile.links += 1

                    current_mapping = None

                elif line == 'translit_start':
                    in_translit = True

                elif line == 'translit_end':
                    in_translit = False

                elif in_translit and line.startswith('<U'):
                    if current_mapping is None:
                        current_mapping = Mapping()

                    current_mapping.consider_mapping_line(line)

                elif line == '' or line.startswith('%'):
                    pass

                elif line == 'default_missing <U003F>':
                    pass

                elif in_translit:
                    # This test used to sit behind a bare string literal,
                    # which is always true, so it could never fire.  It is
                    # reported on stderr (stdout carries the generated
                    # header) and only as a warning, so that input which
                    # was silently accepted before still produces the same
                    # output.
                    print('unknown line:', line, file=sys.stderr)

        if current_mapping is not None:
            self.chain.append(current_mapping)

    # If there is only one link to this chain, we may as well just
    # return the contents of the chain so that they can be merged into
    # our sole parent directly.  Otherwise, return ourselves.
    def get_flattened(self):
        if self.links != 1:
            return [self]

        return [leaf for item in self.chain for leaf in item.get_flattened()]

    # Serialises the chain on first use and returns its id; later calls
    # return the cached id.
    def serialise(self, serialiser):
        if self.serialised is None:
            # Before we serialise, see if we can optimise a bit
            self.chain = [leaf for item in self.chain for leaf in item.get_flattened()]

            i = 0
            while i < len(self.chain) - 1:
                if isinstance(self.chain[i], Mapping) and isinstance(self.chain[i + 1], Mapping):
                    # We have two mappings in a row.  Try to merge them.
                    # The index stays put so that the merged mapping is
                    # compared with its new neighbour next.
                    self.chain[i].merge_mapping(self.chain[i + 1])
                    del self.chain[i + 1]
                else:
                    i += 1

            # If all that is left is one item, just serialise that directly
            if len(self.chain) == 1:
                self.serialised = self.chain[0].serialise(serialiser)
            else:
                ids = [item.serialise(serialiser) for item in self.chain]
                self.serialised = serialiser.add_chain(ids)

        return self.serialised
# Chain cache -- allows sharing of common chains
chains = {}


# Returns the Chain for the given name, creating and caching it the first
# time the name is requested.
def get_chain(name):
    try:
        return chains[name]
    except KeyError:
        pass

    created = chains[name] = Chain(name)
    return created
# Remove the country name from a locale, preserving variant
# eg: 'sr_RS@latin' -> 'sr@latin'
def remove_country(string):
    head, at, variant = string.partition('@')

    # Only the part in front of the first '_' is kept.
    language = head.split('_', 1)[0]

    return language + at + variant
# Packs a table range into a 16-bit value: 0x8000, plus the length
# (end - start, at most 7) shifted up by 12 bits, plus the start offset
# (below 0x1000).
def encode_range(start, end):
    assert start <= end
    span = end - start

    assert start < 0x1000
    assert span < 0x8

    packed = 0x8000 + span * 0x1000 + start

    assert packed < 0x10000

    return packed
# Formats a sequence of integer pairs as a C array initialiser, with the
# terminating semicolon included.
def c_pair_array(array):
    entries = ['{ %u, %u }' % pair for pair in array]
    body = ', '.join(entries)
    return '{ ' + body + ' };'
# Collects mappings, chains and locale assignments and writes them out as
# C tables.
#
# Item ids below 128 refer to mappings, ids from 128 upwards to chains.
class Serialiser:
    def __init__(self):
        # Mapping dictionaries; the list index is the mapping id.
        self.mappings = []
        # Lists of item ids; the list index plus 128 is the chain id.
        self.chains = []
        # Locale name -> item id.
        self.locales = {}
        # Item id of the default rules; None until add_default() is called.
        self.default = None

    # Registers a mapping dictionary and returns its id.  An equal
    # dictionary that was added earlier is reused.
    def add_mapping(self, mapping):
        try:
            mapping_id = self.mappings.index(mapping)
        except ValueError:
            mapping_id = len(self.mappings)
            self.mappings.append(mapping)

        assert mapping_id < 128
        return mapping_id

    # Registers a list of item ids and returns its chain id.  An equal
    # list that was added earlier is reused.
    def add_chain(self, chain):
        try:
            chain_id = self.chains.index(chain)
        except ValueError:
            chain_id = len(self.chains)
            self.chains.append(chain)

        assert chain_id < 128
        return 128 + chain_id

    def add_locale(self, name, item_id):
        self.locales[name] = item_id

    def add_default(self, item_id):
        self.default = item_id

    # Shrinks the locale table: collapses languages whose regions all
    # agree, drops variants equal to their base and drops entries equal to
    # the default.
    def optimise_locales(self):
        # Check if all regions of a language/variant agree
        languages = sorted({remove_country(locale) for locale in self.locales})

        for language in languages:
            locales = [locale for locale in self.locales if remove_country(locale) == language]

            item_id = self.locales[locales[0]]
            if all(self.locales[locale] == item_id for locale in locales):
                self.locales[language] = item_id
                for locale in locales:
                    # A name that is already in its country-less form is
                    # the entry just written above; deleting it would lose
                    # the language altogether.
                    if locale != language:
                        del self.locales[locale]

        # Check if a variant is the same as the non-variant form
        # eg: 'de@euro' and 'de'
        for variant in [locale for locale in self.locales if '@' in locale]:
            base, _, _ = variant.partition('@')
            if base in self.locales and self.locales[base] == self.locales[variant]:
                del self.locales[variant]

        # Eliminate any entries that are just the same as the C locale
        for locale in list(self.locales):
            if self.locales[locale] == self.default:
                del self.locales[locale]

    # Prints the collected data to stdout as C definitions.
    def to_c(self):
        src_table = ''
        ascii_table = ''
        mappings_table = []
        mapping_ranges = []
        chains_table = []
        chain_starts = []
        locale_names = ''
        locale_index = []
        max_lookup = 0
        max_localename = 0

        for mapping in self.mappings:
            # (offset of the first entry in mappings_table, entry count)
            mapping_ranges.append((len(mappings_table), len(mapping)))

            for key in sorted(mapping):
                if len(key) == 1 and ord(key[0]) < 0x8000:
                    # A single small code point is stored inline.
                    src_range = ord(key[0])
                else:
                    # Every key that lives in src_table counts towards the
                    # maximum, including one found already present there;
                    # previously only newly appended keys were counted.
                    max_lookup = max(max_lookup, len(key))

                    existing = src_table.find(key)
                    if existing == -1:
                        start = len(src_table)
                        assert all(ord(c) <= 0x10ffff for c in key)
                        src_table += key
                        src_range = encode_range(start, len(src_table))
                    else:
                        src_range = encode_range(existing, existing + len(key))

                value = mapping[key]
                if len(value) == 1 and ord(value[0]) < 0x80:
                    # A single ascii character is stored inline.
                    ascii_range = ord(value[0])
                else:
                    existing = ascii_table.find(value)
                    if existing == -1:
                        start = len(ascii_table)
                        assert all(ord(c) < 0x80 for c in value)
                        ascii_table += value
                        ascii_range = encode_range(start, len(ascii_table))
                    else:
                        ascii_range = encode_range(existing, existing + len(value))

                mappings_table.append((src_range, ascii_range))

        for chain in self.chains:
            chain_starts.append(len(chains_table))

            # Items are written in reverse, each chain ending with 0xff.
            for item_id in reversed(chain):
                assert item_id < 0xff
                chains_table.append(item_id)
            chains_table.append(0xff)

        for locale in sorted(self.locales):
            max_localename = max(max_localename, len(locale))
            name_offset = len(locale_names)
            assert all(ord(c) <= 0x7f for c in locale)
            locale_names += (locale + '\0')

            item_id = self.locales[locale]

            assert name_offset < 256
            assert item_id < 256
            locale_index.append((name_offset, item_id))

        print('/* Generated by update-gtranslit.py */')
        print('#define MAX_KEY_SIZE', max_lookup)
        print('#define MAX_LOCALE_NAME', max_localename)
        print('static const gunichar src_table[] = {', ', '.join(str(ord(c)) for c in src_table), '};')
        # cannot do this in plain ascii because of trigraphs... :(
        print('static const gchar ascii_table[] = {', ', '.join(str(ord(c)) for c in ascii_table), '};')
        print('static const struct mapping_entry mappings_table[] =', c_pair_array(mappings_table))
        print('static const struct mapping_range mapping_ranges[] =', c_pair_array(mapping_ranges))
        print('static const guint8 chains_table[] = {', ', '.join(str(i) for i in chains_table), '};')
        print('static const guint8 chain_starts[] = {', ', '.join(str(i) for i in chain_starts), '};')
        print('static const gchar locale_names[] = "' + locale_names.replace('\0', '\\0') + '";')
        print('static const struct locale_entry locale_index[] = ', c_pair_array(locale_index))
        print('static const guint8 default_item_id = %u;' % (self.default,))

    # Debugging aid: prints the raw collected data.
    def dump(self):
        print(self.mappings)
        print(self.chains)
        print(self.locales)
# Build a chain for every locale file in the directory.  The names are
# sorted because os.listdir() returns them in arbitrary order and the ids
# handed out by the serialiser depend on the order of serialisation;
# sorting keeps the generated tables reproducible from run to run.
locales = []
for name in sorted(os.listdir(localedir)):
    if looks_like_locale(name):
        chain = get_chain(name)
        locales.append(chain)
        # The locale itself counts as one user of its chain.
        chain.links += 1

serialiser = Serialiser()

for locale in locales:
    serialiser.add_locale(locale.name, locale.serialise(serialiser))

# The default item is a chain of 'i18n' and 'translit_combining'.
i18n = get_chain('i18n').serialise(serialiser)
combining = get_chain('translit_combining').serialise(serialiser)
serialiser.add_default(serialiser.add_chain([i18n, combining]))

serialiser.optimise_locales()

serialiser.to_c()