2 # Copyright 2015 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Generate a compact character composition table.
9 ./generate_character_composer_data.py \
10 --output character_composer_data.h \
11 character_composer_sequences.txt
13 Run this script with --help for a description of arguments.
17 Comments begin with '#' and extend to the end of the line.
19 Each non-comment line is a sequence of two or more keys, separated by
20 space, the last of which is the result of composing the initial part.
21 A sequence must begin with a dead key or Compose, and the result must
24 Each key can either be a character key, a dead key, or the compose key.
26 A character key is written as one of the following inside matched delimiters:
27 - a single printable ASCII character;
28 - a Unicode character name;
29 - 'U+' followed by one or more hexadecimal digits.
30 Delimiter pairs are any of '' "" () <> [] or {}.
32 A dead key is written as the word 'Dead' followed (immediately, without space)
33 by the combining character written in the same form as a character key.
35 A compose key is written as the word 'Compose'.
39 The output file is a C++ header containing a small header structure
40 |ui::TreeComposeChecker::CompositionData| and a tree of composition sequences.
42 For space efficiency, the tree is stored in a single array of 16-bit values,
43 which represent either characters (printable or dead-key) or subtree array
46 Each tree level consists for four tables: two key kinds (character or dead)
47 by two node types (internal or leaf), in the order:
52 This order is used because character key entries are more common than dead-key
53 entries, and internal entries are more common than leaf entries.
55 Each table consists of a word containing the number of table entries |n|,
56 followed immediately by |n| key-value pairs of words, ordered by key.
57 For internal edges, the value is the array index of the resulting subtree.
58 For leaf edges, the value is the unicode character code of the composition
68 # Global error counter.
72 """Represents an element of a composition sequence.
74 Supports only Compose, dead keys, and BMP unicode characters.
75 Based on |str| for easy comparison and sorting.
76 The representation is 'C' (for unicode characters) or 'D' (for dead keys)
77 followed by 4 hex digits for the character value. The Compose key is
78 represented as dead key with combining character 0.
80 _kinds
= ['character', 'dead_key']
82 def __new__(cls
, key
, character
, location
=None):
85 - Key(None, character_code)
86 - Key('Dead', combining_character_code)
90 if character
> 0xFFFF:
91 print '{}: unsupported non-BMP character {}'.format(location
, character
)
95 s
= 'C{:04X}'.format(character
)
96 elif key
.lower() == 'dead':
97 s
= 'D{:04X}'.format(character
)
98 elif key
.lower() == 'compose':
101 print '{}: unexpected combination {}<{}>'.format(location
, key
, character
)
104 return str.__new
__(cls
, s
)
107 return {'C': 'character', 'D': 'dead_key'}[self
[0]]
109 def CharacterCode(self
):
110 return int(self
[1:], 16)
112 def UnicodeName(self
):
113 v
= self
.CharacterCode()
115 return unicodedata
.name(unichr(v
)).lower()
117 return 'U+{:04X}'.format(v
)
119 def ShorterUnicodeName(self
):
120 s
= self
.UnicodeName()
121 if s
.startswith('latin ') or s
.startswith('greek '):
123 if s
.startswith('small '):
125 return s
.replace(' letter ', ' ')
130 return ('Dead' if self
[0] == 'D' else '') + '<' + self
.UnicodeName() + '>'
135 Takes a sequences of file names and presents them as a single input stream,
136 with location reporting for error messages.
138 def __init__(self
, filenames
):
139 self
._pending
= filenames
140 self
._filename
= None
147 """Report the current input location, for error messages."""
149 return '{}:{}:{}'.format(self
._filename
, self
._lineno
, self
._column
)
155 """Return the next input character, or None when inputs are exhausted."""
156 if self
._line
is None:
157 if self
._file
is None:
158 if not self
._pending
:
160 self
._filename
= self
._pending
[0]
161 self
._pending
= self
._pending
[1:]
162 self
._file
= codecs
.open(self
._filename
, mode
='rb', encoding
='utf-8')
165 self
._line
= self
._file
.readline()
168 self
._filename
= None
171 if self
._column
>= len(self
._line
):
174 c
= self
._line
[self
._column
]
181 Breaks the input stream into a sequence of tokens, each of which is either
182 a Key or the string 'EOL'.
184 def __init__(self
, compose_input
):
185 self
._input
= compose_input
196 def GetUntilDelimiter(self
, e
):
198 c
= self
._input
.Get()
201 c
= self
._input
.Get()
207 while c
and c
.isspace() and c
!= '\n':
208 c
= self
._input
.Get()
211 location
= self
._input
.Where()
215 self
.GetUntilDelimiter('\n')
218 self
.GetUntilDelimiter('\n')
224 while c
and c
.isalnum():
226 c
= self
._input
.Get()
227 if c
in Lexer
._delimiters
:
228 s
= self
.GetUntilDelimiter(Lexer
._delimiters
[c
])
231 elif s
.startswith('U+'):
232 character
= int(s
[2:], 16)
235 character
= ord(unicodedata
.lookup(s
.upper()))
239 print '{}: unknown character name "{}"'.format(location
,
241 return Key(key
, character
, location
)
244 return self
._input
.Where()
249 Takes a sequence of tokens from a Lexer and returns a tree of
250 composition sequences, represented as nested dictionaries where each
251 composition source element key is a dictionary key, and the final
252 composition result has a dictionary key of |None|.
254 def __init__(self
, lexer
):
263 t
= self
._lexer
.Get()
266 if t
and t
!= 'EOL' and t
.Kind() != 'dead_key':
268 print ('{}: sequence does not begin with Compose or Dead key'
269 .format(self
._lexer
.Where()))
271 while t
and t
!= 'EOL':
273 t
= self
._lexer
.Get()
276 self
.AddToSimpleTree(self
._trie
, seq
)
279 def AddToSimpleTree(self
, tree
, seq
):
282 if first
not in tree
:
286 tree
[first
][None] = rest
[0]
288 self
.AddToSimpleTree(tree
[first
], rest
)
293 Represents composition sequences in a manner close to the output format.
295 The core of the representation is the |_tables| dictionary, which has
296 an entry for each kind of |Key|, each of which is a dictionary with
297 two entries, 'internal' and 'leaf', for the output tables, each being
298 a dictionary indexed by a composition sequence |Key|. For 'internal'
299 tables the dictionary values are |GroupedTree|s at the next level;
300 for 'leaf' tables the dictionary values are |Key| composition results.
302 _key_kinds
= Key
._kinds
303 _sub_parts
= ['internal', 'leaf']
305 def __init__(self
, simple_trie
, path
=None):
309 self
.depth
= len(path
)
314 # Initialize |_tables|.
316 for k
in self
._key
_kinds
:
318 for p
in self
._sub
_parts
:
319 self
._tables
[k
][p
] = {}
322 for key
in simple_trie
:
325 if None in simple_trie
[key
]:
327 self
._tables
[key
.Kind()]['leaf'][key
] = simple_trie
[key
][None]
328 # Internal subtree entries.
329 v
= GroupedTree(simple_trie
[key
], path
+ [key
])
332 self
._tables
[key
.Kind()]['internal'][key
] = v
333 if self
.height
< v
.height
:
334 self
.height
= v
.height
338 """Returns a list of all sub-|GroupedTree|s of the current GroupTree."""
340 for k
in self
._key
_kinds
:
341 for key
in sorted(self
._tables
[k
]['internal']):
342 r
.append(self
._tables
[k
]['internal'][key
])
347 """Convert a parse tree via a GroupedTree to a C++ header."""
349 def __init__(self
, args
, dtree
):
350 self
._name
= args
.data_name
351 self
._type
= args
.type_name
352 self
._gtree
= GroupedTree(dtree
)
354 def Write(self
, out
):
355 # First pass: determine table sizes and locations.
356 self
.Pass(None, self
._gtree
)
358 # Second pass: write the array.
359 out
.write('\nstatic const uint16_t {}Tree[] = {{\n'.format(self
._name
))
360 end
= self
.Pass(out
, self
._gtree
)
363 # Write the description structure.
364 out
.write('static const {} {} = {{\n'
365 .format(self
._type
, self
._name
))
366 out
.write(' {}, // maximum sequence length\n'.format(self
._gtree
.height
))
367 out
.write(' {}, // tree array entries\n'.format(end
))
368 out
.write(' {}Tree\n'.format(self
._name
))
371 def Pass(self
, out
, gtree
, location
=0):
372 gtree
.location
= location
376 out
.write('\n // offset 0x{:04X}:\n'.format(location
))
378 out
.write(' // prefix:\n')
379 for key
in gtree
.path
:
380 out
.write(' // {}\n'.format(key
.Pretty()))
383 for k
in gtree
._key
_kinds
:
384 for p
in gtree
._sub
_parts
:
388 out
.write(' // {} {} table\n'.format(p
, k
))
389 out
.write(' 0x{:04X}, // number of entries\n'
390 .format(len(gtree
._tables
[k
][p
])))
392 for key
in sorted(gtree
._tables
[k
][p
]):
395 out
.write(' 0x{:04X}, // {}\n'
396 .format(key
.CharacterCode(), key
.ShorterUnicodeName()))
397 result
= gtree
._tables
[k
][p
][key
]
399 out
.write(' 0x{:04X},\n'.format(result
.location
))
401 out
.write(' 0x{:04X}, // -> {}\n'
402 .format(result
.CharacterCode(),
403 result
.ShorterUnicodeName()))
405 # Assemble subtrees of the current tree.
406 for t
in gtree
.SubTrees():
407 location
= self
.Pass(out
, t
, location
)
413 parser
= argparse
.ArgumentParser()
414 parser
.add_argument('--type_name',
415 default
='ui::TreeComposeChecker::CompositionData')
416 parser
.add_argument('--data_name', default
='kCompositions')
417 parser
.add_argument('--output', default
='character_composer_data.h')
418 parser
.add_argument('--guard', default
=None)
419 parser
.add_argument('inputs', nargs
='*')
420 args
= parser
.parse_args(argv
[1:])
422 parse_tree
= Parser(Lexer(Input(args
.inputs
))).Parse()
424 with (sys
.stdout
if args
.output
== '-' else open(args
.output
, 'wb')) as out
:
425 out
.write('// Copyright 2015 The Chromium Authors. All rights reserved.\n')
426 out
.write('// Use of this source code is governed by a BSD-style license')
427 out
.write(' that can be\n// found in the LICENSE file.\n\n')
428 out
.write('// DO NOT EDIT.\n')
429 out
.write('// GENERATED BY {}\n'.format(sys
.argv
[0]))
430 out
.write('// FROM {}\n\n'.format(' '.join(args
.inputs
)))
431 guard
= args
.guard
if args
.guard
else args
.output
432 guard
= ''.join([c
.upper() if c
.isalpha() else '_' for c
in guard
])
433 out
.write('#ifndef {0}_\n#define {0}_\n'.format(guard
))
434 Assembler(args
, parse_tree
).Write(out
)
435 out
.write('#endif // {}_\n'.format(guard
))
439 if __name__
== '__main__':
440 sys
.exit(main(sys
.argv
))