Add ICU message format support
[chromium-blink-merge.git] / ui / base / ime / chromeos / generate_character_composer_data.py
bloba0becd22479ec324c98a8e310cc07ceb14892956
1 #!/usr/bin/env python
2 # Copyright 2015 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Generate a compact character composition table.
8 Normal use:
9 ./generate_character_composer_data.py \
10 --output character_composer_data.h \
11 character_composer_sequences.txt
13 Run this script with --help for a description of arguments.
15 Input file format:
17 Comments begin with '#' and extend to the end of the line.
19 Each non-comment line is a sequence of two or more keys, separated by
20 space, the last of which is the result of composing the initial part.
21 A sequence must begin with a dead key or Compose, and the result must
22 be a character key.
24 Each key can either be a character key, a dead key, or the compose key.
26 A character key is written as one of the following inside matched delimiters:
27 - a single printable ASCII character;
28 - a Unicode character name;
29 - 'U+' followed by one or more hexadecimal digits.
30 Delimiter pairs are any of '' "" () <> [] or {}.
32 A dead key is written as the word 'Dead' followed (immediately, without space)
33 by the combining character written in the same form as a character key.
35 A compose key is written as the word 'Compose'.
37 Output file format:
39 The output file is a C++ header containing a small header structure
40 |ui::TreeComposeChecker::CompositionData| and a tree of composition sequences.
42 For space efficiency, the tree is stored in a single array of 16-bit values,
43 which represent either characters (printable or dead-key) or subtree array
44 indices.
46 Each tree level consists for four tables: two key kinds (character or dead)
47 by two node types (internal or leaf), in the order:
48 - character internal
49 - character leaf
50 - dead-key internal
51 - dead-key leaf
52 This order is used because character key entries are more common than dead-key
53 entries, and internal entries are more common than leaf entries.
55 Each table consists of a word containing the number of table entries |n|,
56 followed immediately by |n| key-value pairs of words, ordered by key.
57 For internal edges, the value is the array index of the resulting subtree.
58 For leaf edges, the value is the unicode character code of the composition
59 result.
60 """
62 import argparse
63 import codecs
64 import collections
65 import sys
66 import unicodedata
68 # Global error counter.
69 g_fail = 0
71 class Key(str):
72 """Represents an element of a composition sequence.
74 Supports only Compose, dead keys, and BMP unicode characters.
75 Based on |str| for easy comparison and sorting.
76 The representation is 'C' (for unicode characters) or 'D' (for dead keys)
77 followed by 4 hex digits for the character value. The Compose key is
78 represented as dead key with combining character 0.
79 """
80 _kinds = ['character', 'dead_key']
82 def __new__(cls, key, character, location=None):
83 """Construct a Key.
84 Call as:
85 - Key(None, character_code)
86 - Key('Dead', combining_character_code)
87 - Key('Compose', 0)
88 """
89 global g_fail
90 if character > 0xFFFF:
91 print '{}: unsupported non-BMP character {}'.format(location, character)
92 g_fail += 1
93 s = 'ERROR'
94 elif key is None:
95 s = 'C{:04X}'.format(character)
96 elif key.lower() == 'dead':
97 s = 'D{:04X}'.format(character)
98 elif key.lower() == 'compose':
99 s = 'D0000'
100 else:
101 print '{}: unexpected combination {}<{}>'.format(location, key, character)
102 g_fail += 1
103 s = 'ERROR'
104 return str.__new__(cls, s)
106 def Kind(self):
107 return {'C': 'character', 'D': 'dead_key'}[self[0]]
109 def CharacterCode(self):
110 return int(self[1:], 16)
112 def UnicodeName(self):
113 v = self.CharacterCode()
114 try:
115 return unicodedata.name(unichr(v)).lower()
116 except ValueError:
117 return 'U+{:04X}'.format(v)
119 def ShorterUnicodeName(self):
120 s = self.UnicodeName()
121 if s.startswith('latin ') or s.startswith('greek '):
122 s = s[6:]
123 if s.startswith('small '):
124 s = s[6:]
125 return s.replace(' letter ', ' ')
127 def Pretty(self):
128 if self == 'D0000':
129 return 'Compose'
130 return ('Dead' if self[0] == 'D' else '') + '<' + self.UnicodeName() + '>'
133 class Input:
135 Takes a sequences of file names and presents them as a single input stream,
136 with location reporting for error messages.
138 def __init__(self, filenames):
139 self._pending = filenames
140 self._filename = None
141 self._file = None
142 self._line = None
143 self._lineno = 0
144 self._column = 0
146 def Where(self):
147 """Report the current input location, for error messages."""
148 if self._file:
149 return '{}:{}:{}'.format(self._filename, self._lineno, self._column)
150 if self._pending:
151 return '<before>'
152 return '<eof>'
154 def Get(self):
155 """Return the next input character, or None when inputs are exhausted."""
156 if self._line is None:
157 if self._file is None:
158 if not self._pending:
159 return None
160 self._filename = self._pending[0]
161 self._pending = self._pending[1:]
162 self._file = codecs.open(self._filename, mode='rb', encoding='utf-8')
163 self._lineno = 0
164 self._lineno += 1
165 self._line = self._file.readline()
166 if not self._line:
167 self._file = None
168 self._filename = None
169 return self.Get()
170 self._column = 0
171 if self._column >= len(self._line):
172 self._line = None
173 return self.Get()
174 c = self._line[self._column]
175 self._column += 1
176 return c
179 class Lexer:
181 Breaks the input stream into a sequence of tokens, each of which is either
182 a Key or the string 'EOL'.
184 def __init__(self, compose_input):
185 self._input = compose_input
187 _delimiters = {
188 '"': '"',
189 "'": "'",
190 '<': '>',
191 '(': ')',
192 '[': ']',
193 '{': '}',
196 def GetUntilDelimiter(self, e):
197 text = ''
198 c = self._input.Get()
199 while c and c != e:
200 text += c
201 c = self._input.Get()
202 return text
204 def Get(self):
205 global g_fail
206 c = ' '
207 while c and c.isspace() and c != '\n':
208 c = self._input.Get()
209 if not c:
210 return None
211 location = self._input.Where()
212 if c == '\n':
213 return 'EOL'
214 if c == '#':
215 self.GetUntilDelimiter('\n')
216 return 'EOL'
217 if c == '\\':
218 self.GetUntilDelimiter('\n')
219 return self.Get()
220 key = None
221 character = None
222 if c.isalnum():
223 key = ''
224 while c and c.isalnum():
225 key += c
226 c = self._input.Get()
227 if c in Lexer._delimiters:
228 s = self.GetUntilDelimiter(Lexer._delimiters[c])
229 if len(s) == 1:
230 character = ord(s)
231 elif s.startswith('U+'):
232 character = int(s[2:], 16)
233 else:
234 try:
235 character = ord(unicodedata.lookup(s.upper()))
236 except KeyError:
237 g_fail += 1
238 character = None
239 print '{}: unknown character name "{}"'.format(location,
240 s.encode('utf-8'))
241 return Key(key, character, location)
243 def Where(self):
244 return self._input.Where()
247 class Parser:
249 Takes a sequence of tokens from a Lexer and returns a tree of
250 composition sequences, represented as nested dictionaries where each
251 composition source element key is a dictionary key, and the final
252 composition result has a dictionary key of |None|.
254 def __init__(self, lexer):
255 self._lexer = lexer
256 self._trie = {}
258 def Parse(self):
259 global g_fail
260 self._trie = {}
261 while True:
262 seq = []
263 t = self._lexer.Get()
264 if not t:
265 break
266 if t and t != 'EOL' and t.Kind() != 'dead_key':
267 g_fail += 1
268 print ('{}: sequence does not begin with Compose or Dead key'
269 .format(self._lexer.Where()))
270 break
271 while t and t != 'EOL':
272 seq.append(t)
273 t = self._lexer.Get()
274 if not seq:
275 continue
276 self.AddToSimpleTree(self._trie, seq)
277 return self._trie
279 def AddToSimpleTree(self, tree, seq):
280 first = seq[0]
281 rest = seq[1:]
282 if first not in tree:
283 tree[first] = {}
284 if len(rest) == 1:
285 # Leaf
286 tree[first][None] = rest[0]
287 else:
288 self.AddToSimpleTree(tree[first], rest)
291 class GroupedTree:
293 Represents composition sequences in a manner close to the output format.
295 The core of the representation is the |_tables| dictionary, which has
296 an entry for each kind of |Key|, each of which is a dictionary with
297 two entries, 'internal' and 'leaf', for the output tables, each being
298 a dictionary indexed by a composition sequence |Key|. For 'internal'
299 tables the dictionary values are |GroupedTree|s at the next level;
300 for 'leaf' tables the dictionary values are |Key| composition results.
302 _key_kinds = Key._kinds
303 _sub_parts = ['internal', 'leaf']
305 def __init__(self, simple_trie, path=None):
306 if path is None:
307 path = []
308 self.path = path
309 self.depth = len(path)
310 self.height = 0
311 self.empty = True
312 self.location = -1
314 # Initialize |_tables|.
315 self._tables = {}
316 for k in self._key_kinds:
317 self._tables[k] = {}
318 for p in self._sub_parts:
319 self._tables[k][p] = {}
321 # Build the tables.
322 for key in simple_trie:
323 if key is not None:
324 # Leaf table entry.
325 if None in simple_trie[key]:
326 self.empty = False
327 self._tables[key.Kind()]['leaf'][key] = simple_trie[key][None]
328 # Internal subtree entries.
329 v = GroupedTree(simple_trie[key], path + [key])
330 if not v.empty:
331 self.empty = False
332 self._tables[key.Kind()]['internal'][key] = v
333 if self.height < v.height:
334 self.height = v.height
335 self.height += 1
337 def SubTrees(self):
338 """Returns a list of all sub-|GroupedTree|s of the current GroupTree."""
339 r = []
340 for k in self._key_kinds:
341 for key in sorted(self._tables[k]['internal']):
342 r.append(self._tables[k]['internal'][key])
343 return r
346 class Assembler:
347 """Convert a parse tree via a GroupedTree to a C++ header."""
349 def __init__(self, args, dtree):
350 self._name = args.data_name
351 self._type = args.type_name
352 self._gtree = GroupedTree(dtree)
354 def Write(self, out):
355 # First pass: determine table sizes and locations.
356 self.Pass(None, self._gtree)
358 # Second pass: write the array.
359 out.write('\nstatic const uint16_t {}Tree[] = {{\n'.format(self._name))
360 end = self.Pass(out, self._gtree)
361 out.write('};\n\n')
363 # Write the description structure.
364 out.write('static const {} {} = {{\n'
365 .format(self._type, self._name))
366 out.write(' {}, // maximum sequence length\n'.format(self._gtree.height))
367 out.write(' {}, // tree array entries\n'.format(end))
368 out.write(' {}Tree\n'.format(self._name))
369 out.write('};\n\n')
371 def Pass(self, out, gtree, location=0):
372 gtree.location = location
374 # Write tree header.
375 if out:
376 out.write('\n // offset 0x{:04X}:\n'.format(location))
377 if gtree.path:
378 out.write(' // prefix:\n')
379 for key in gtree.path:
380 out.write(' // {}\n'.format(key.Pretty()))
382 # Write tables.
383 for k in gtree._key_kinds:
384 for p in gtree._sub_parts:
385 # Write table size.
386 location += 1
387 if out:
388 out.write(' // {} {} table\n'.format(p, k))
389 out.write(' 0x{:04X}, // number of entries\n'
390 .format(len(gtree._tables[k][p])))
391 # Write table body.
392 for key in sorted(gtree._tables[k][p]):
393 location += 2
394 if out:
395 out.write(' 0x{:04X}, // {}\n'
396 .format(key.CharacterCode(), key.ShorterUnicodeName()))
397 result = gtree._tables[k][p][key]
398 if p == 'internal':
399 out.write(' 0x{:04X},\n'.format(result.location))
400 else:
401 out.write(' 0x{:04X}, // -> {}\n'
402 .format(result.CharacterCode(),
403 result.ShorterUnicodeName()))
405 # Assemble subtrees of the current tree.
406 for t in gtree.SubTrees():
407 location = self.Pass(out, t, location)
409 return location
412 def main(argv):
413 parser = argparse.ArgumentParser()
414 parser.add_argument('--type_name',
415 default='ui::TreeComposeChecker::CompositionData')
416 parser.add_argument('--data_name', default='kCompositions')
417 parser.add_argument('--output', default='character_composer_data.h')
418 parser.add_argument('--guard', default=None)
419 parser.add_argument('inputs', nargs='*')
420 args = parser.parse_args(argv[1:])
422 parse_tree = Parser(Lexer(Input(args.inputs))).Parse()
424 with (sys.stdout if args.output == '-' else open(args.output, 'wb')) as out:
425 out.write('// Copyright 2015 The Chromium Authors. All rights reserved.\n')
426 out.write('// Use of this source code is governed by a BSD-style license')
427 out.write(' that can be\n// found in the LICENSE file.\n\n')
428 out.write('// DO NOT EDIT.\n')
429 out.write('// GENERATED BY {}\n'.format(sys.argv[0]))
430 out.write('// FROM {}\n\n'.format(' '.join(args.inputs)))
431 guard = args.guard if args.guard else args.output
432 guard = ''.join([c.upper() if c.isalpha() else '_' for c in guard])
433 out.write('#ifndef {0}_\n#define {0}_\n'.format(guard))
434 Assembler(args, parse_tree).Write(out)
435 out.write('#endif // {}_\n'.format(guard))
437 return g_fail
439 if __name__ == '__main__':
440 sys.exit(main(sys.argv))