ui/base/ime/chromeos/generate_character_composer_data.py

   1 #!/usr/bin/env python
   2 # Copyright 2015 The Chromium Authors. All rights reserved.
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5
   6 """Generate a compact character composition table.
   7
   8 Normal use:
   9   ./generate_character_composer_data.py \
  10       --output character_composer_data.h \
  11       character_composer_sequences.txt
  12
  13 Run this script with --help for a description of arguments.
  14
  15 Input file format:
  16
  17   Comments begin with '#' and extend to the end of the line.
  18
  19   Each non-comment line is a sequence of two or more keys, separated by
  20   space, the last of which is the result of composing the initial part.
  21   A sequence must begin with a dead key or Compose, and the result must
  22   be a character key.
  23
  24   Each key can either be a character key, a dead key, or the compose key.
  25
  26   A character key is written as one of the following inside matched delimiters:
  27   - a single printable ASCII character;
  28   - a Unicode character name;
  29   - 'U+' followed by one or more hexadecimal digits.
  30   Delimiter pairs are any of '' "" () <> [] or {}.
  31
  32   A dead key is written as the word 'Dead' followed (immediately, without space)
  33   by the combining character written in the same form as a character key.
  34
  35   A compose key is written as the word 'Compose'.
  36
  37 Output file format:
  38
  39   The output file is a C++ header containing a small header structure
  40   |ui::TreeComposeChecker::CompositionData| and a tree of composition sequences.
  41
  42   For space efficiency, the tree is stored in a single array of 16-bit values,
  43   which represent either characters (printable or dead-key) or subtree array
  44   indices.
  45
  46   Each tree level consists for four tables: two key kinds (character or dead)
  47   by two node types (internal or leaf), in the order:
  48   - character internal
  49   - character leaf
  50   - dead-key internal
  51   - dead-key leaf
  52   This order is used because character key entries are more common than dead-key
  53   entries, and internal entries are more common than leaf entries.
  54
  55   Each table consists of a word containing the number of table entries |n|,
  56   followed immediately by |n| key-value pairs of words, ordered by key.
  57   For internal edges, the value is the array index of the resulting subtree.
  58   For leaf edges, the value is the unicode character code of the composition
  59   result.
  60 """
  61
  62 import argparse
  63 import codecs
  64 import collections
  65 import sys
  66 import unicodedata
  67
  68 # Global error counter.
  69 g_fail = 0
  70
  71 class Key(str):
  72   """Represents an element of a composition sequence.
  73
  74   Supports only Compose, dead keys, and BMP unicode characters.
  75   Based on |str| for easy comparison and sorting.
  76   The representation is 'C' (for unicode characters) or 'D' (for dead keys)
  77   followed by 4 hex digits for the character value. The Compose key is
  78   represented as dead key with combining character 0.
  79   """
  80   _kinds = ['character', 'dead_key']
  81
  82   def __new__(cls, key, character, location=None):
  83     """Construct a Key.
  84     Call as:
  85     - Key(None, character_code)
  86     - Key('Dead', combining_character_code)
  87     - Key('Compose', 0)
  88     """
  89     global g_fail
  90     if character > 0xFFFF:
  91       print '{}: unsupported non-BMP character {}'.format(location, character)
  92       g_fail += 1
  93       s = 'ERROR'
  94     elif key is None:
  95       s = 'C{:04X}'.format(character)
  96     elif key.lower() == 'dead':
  97       s = 'D{:04X}'.format(character)
  98     elif key.lower() == 'compose':
  99       s = 'D0000'
 100     else:
 101       print '{}: unexpected combination {}<{}>'.format(location, key, character)
 102       g_fail += 1
 103       s = 'ERROR'
 104     return str.__new__(cls, s)
 105
 106   def Kind(self):
 107     return {'C': 'character', 'D': 'dead_key'}[self[0]]
 108
 109   def CharacterCode(self):
 110     return int(self[1:], 16)
 111
 112   def UnicodeName(self):
 113     v = self.CharacterCode()
 114     try:
 115       return unicodedata.name(unichr(v)).lower()
 116     except ValueError:
 117       return 'U+{:04X}'.format(v)
 118
 119   def ShorterUnicodeName(self):
 120     s = self.UnicodeName()
 121     if s.startswith('latin ') or s.startswith('greek '):
 122       s = s[6:]
 123     if s.startswith('small '):
 124       s = s[6:]
 125     return s.replace(' letter ', ' ')
 126
 127   def Pretty(self):
 128     if self == 'D0000':
 129       return 'Compose'
 130     return ('Dead' if self[0] == 'D' else '') + '<' + self.UnicodeName() + '>'
 131
 132
 133 class Input:
 134   """
 135   Takes a sequences of file names and presents them as a single input stream,
 136   with location reporting for error messages.
 137   """
 138   def __init__(self, filenames):
 139     self._pending = filenames
 140     self._filename = None
 141     self._file = None
 142     self._line = None
 143     self._lineno = 0
 144     self._column = 0
 145
 146   def Where(self):
 147     """Report the current input location, for error messages."""
 148     if self._file:
 149       return '{}:{}:{}'.format(self._filename, self._lineno, self._column)
 150     if self._pending:
 151       return '<before>'
 152     return '<eof>'
 153
 154   def Get(self):
 155     """Return the next input character, or None when inputs are exhausted."""
 156     if self._line is None:
 157       if self._file is None:
 158         if not self._pending:
 159           return None
 160         self._filename = self._pending[0]
 161         self._pending = self._pending[1:]
 162         self._file = codecs.open(self._filename, mode='rb', encoding='utf-8')
 163         self._lineno = 0
 164       self._lineno += 1
 165       self._line = self._file.readline()
 166       if not self._line:
 167         self._file = None
 168         self._filename = None
 169         return self.Get()
 170       self._column = 0
 171     if self._column >= len(self._line):
 172       self._line = None
 173       return self.Get()
 174     c = self._line[self._column]
 175     self._column += 1
 176     return c
 177
 178
 179 class Lexer:
 180   """
 181   Breaks the input stream into a sequence of tokens, each of which is either
 182   a Key or the string 'EOL'.
 183   """
 184   def __init__(self, compose_input):
 185     self._input = compose_input
 186
 187   _delimiters = {
 188       '"': '"',
 189       "'": "'",
 190       '<': '>',
 191       '(': ')',
 192       '[': ']',
 193       '{': '}',
 194   }
 195
 196   def GetUntilDelimiter(self, e):
 197     text = ''
 198     c = self._input.Get()
 199     while c and c != e:
 200       text += c
 201       c = self._input.Get()
 202     return text
 203
 204   def Get(self):
 205     global g_fail
 206     c = ' '
 207     while c and c.isspace() and c != '\n':
 208       c = self._input.Get()
 209     if not c:
 210       return None
 211     location = self._input.Where()
 212     if c == '\n':
 213       return 'EOL'
 214     if c == '#':
 215       self.GetUntilDelimiter('\n')
 216       return 'EOL'
 217     if c == '\\':
 218       self.GetUntilDelimiter('\n')
 219       return self.Get()
 220     key = None
 221     character = None
 222     if c.isalnum():
 223       key = ''
 224       while c and c.isalnum():
 225         key += c
 226         c = self._input.Get()
 227     if c in Lexer._delimiters:
 228       s = self.GetUntilDelimiter(Lexer._delimiters[c])
 229       if len(s) == 1:
 230         character = ord(s)
 231       elif s.startswith('U+'):
 232         character = int(s[2:], 16)
 233       else:
 234         try:
 235           character = ord(unicodedata.lookup(s.upper()))
 236         except KeyError:
 237           g_fail += 1
 238           character = None
 239           print '{}: unknown character name "{}"'.format(location,
 240                                                          s.encode('utf-8'))
 241     return Key(key, character, location)
 242
 243   def Where(self):
 244     return self._input.Where()
 245
 246
 247 class Parser:
 248   """
 249   Takes a sequence of tokens from a Lexer and returns a tree of
 250   composition sequences, represented as nested dictionaries where each
 251   composition source element key is a dictionary key, and the final
 252   composition result has a dictionary key of |None|.
 253   """
 254   def __init__(self, lexer):
 255     self._lexer = lexer
 256     self._trie = {}
 257
 258   def Parse(self):
 259     global g_fail
 260     self._trie = {}
 261     while True:
 262       seq = []
 263       t = self._lexer.Get()
 264       if not t:
 265         break
 266       if t and t != 'EOL' and t.Kind() != 'dead_key':
 267         g_fail += 1
 268         print ('{}: sequence does not begin with Compose or Dead key'
 269                .format(self._lexer.Where()))
 270         break
 271       while t and t != 'EOL':
 272         seq.append(t)
 273         t = self._lexer.Get()
 274       if not seq:
 275         continue
 276       self.AddToSimpleTree(self._trie, seq)
 277     return self._trie
 278
 279   def AddToSimpleTree(self, tree, seq):
 280     first = seq[0]
 281     rest = seq[1:]
 282     if first not in tree:
 283       tree[first] = {}
 284     if len(rest) == 1:
 285       # Leaf
 286       tree[first][None] = rest[0]
 287     else:
 288       self.AddToSimpleTree(tree[first], rest)
 289
 290
 291 class GroupedTree:
 292   """
 293   Represents composition sequences in a manner close to the output format.
 294
 295   The core of the representation is the |_tables| dictionary, which has
 296   an entry for each kind of |Key|, each of which is a dictionary with
 297   two entries, 'internal' and 'leaf', for the output tables, each being
 298   a dictionary indexed by a composition sequence |Key|. For 'internal'
 299   tables the dictionary values are |GroupedTree|s at the next level;
 300   for 'leaf' tables the dictionary values are |Key| composition results.
 301   """
 302   _key_kinds = Key._kinds
 303   _sub_parts = ['internal', 'leaf']
 304
 305   def __init__(self, simple_trie, path=None):
 306     if path is None:
 307       path = []
 308     self.path = path
 309     self.depth = len(path)
 310     self.height = 0
 311     self.empty = True
 312     self.location = -1
 313
 314     # Initialize |_tables|.
 315     self._tables = {}
 316     for k in self._key_kinds:
 317       self._tables[k] = {}
 318       for p in self._sub_parts:
 319         self._tables[k][p] = {}
 320
 321     # Build the tables.
 322     for key in simple_trie:
 323       if key is not None:
 324         # Leaf table entry.
 325         if None in simple_trie[key]:
 326           self.empty = False
 327           self._tables[key.Kind()]['leaf'][key] = simple_trie[key][None]
 328         # Internal subtree entries.
 329         v = GroupedTree(simple_trie[key], path + [key])
 330         if not v.empty:
 331           self.empty = False
 332           self._tables[key.Kind()]['internal'][key] = v
 333           if self.height < v.height:
 334             self.height = v.height
 335     self.height += 1
 336
 337   def SubTrees(self):
 338     """Returns a list of all sub-|GroupedTree|s of the current GroupTree."""
 339     r = []
 340     for k in self._key_kinds:
 341       for key in sorted(self._tables[k]['internal']):
 342         r.append(self._tables[k]['internal'][key])
 343     return r
 344
 345
 346 class Assembler:
 347   """Convert a parse tree via a GroupedTree to a C++ header."""
 348
 349   def __init__(self, args, dtree):
 350     self._name = args.data_name
 351     self._type = args.type_name
 352     self._gtree = GroupedTree(dtree)
 353
 354   def Write(self, out):
 355     # First pass: determine table sizes and locations.
 356     self.Pass(None, self._gtree)
 357
 358     # Second pass: write the array.
 359     out.write('\nstatic const uint16_t {}Tree[] = {{\n'.format(self._name))
 360     end = self.Pass(out, self._gtree)
 361     out.write('};\n\n')
 362
 363     # Write the description structure.
 364     out.write('static const {} {} = {{\n'
 365               .format(self._type, self._name))
 366     out.write('  {}, // maximum sequence length\n'.format(self._gtree.height))
 367     out.write('  {}, // tree array entries\n'.format(end))
 368     out.write('  {}Tree\n'.format(self._name))
 369     out.write('};\n\n')
 370
 371   def Pass(self, out, gtree, location=0):
 372     gtree.location = location
 373
 374     # Write tree header.
 375     if out:
 376       out.write('\n    // offset 0x{:04X}:\n'.format(location))
 377       if gtree.path:
 378         out.write('    //   prefix:\n')
 379         for key in gtree.path:
 380           out.write('    //     {}\n'.format(key.Pretty()))
 381
 382     # Write tables.
 383     for k in gtree._key_kinds:
 384       for p in gtree._sub_parts:
 385         # Write table size.
 386         location += 1
 387         if out:
 388           out.write('    //   {} {} table\n'.format(p, k))
 389           out.write('    0x{:04X},          // number of entries\n'
 390                     .format(len(gtree._tables[k][p])))
 391         # Write table body.
 392         for key in sorted(gtree._tables[k][p]):
 393           location += 2
 394           if out:
 395             out.write('    0x{:04X},  // {}\n'
 396                       .format(key.CharacterCode(), key.ShorterUnicodeName()))
 397             result = gtree._tables[k][p][key]
 398             if p == 'internal':
 399               out.write('    0x{:04X},\n'.format(result.location))
 400             else:
 401               out.write('    0x{:04X},  // -> {}\n'
 402                         .format(result.CharacterCode(),
 403                                 result.ShorterUnicodeName()))
 404
 405     # Assemble subtrees of the current tree.
 406     for t in gtree.SubTrees():
 407       location = self.Pass(out, t, location)
 408
 409     return location
 410
 411
 412 def main(argv):
 413   parser = argparse.ArgumentParser()
 414   parser.add_argument('--type_name',
 415                       default='ui::TreeComposeChecker::CompositionData')
 416   parser.add_argument('--data_name', default='kCompositions')
 417   parser.add_argument('--output', default='character_composer_data.h')
 418   parser.add_argument('--guard', default=None)
 419   parser.add_argument('inputs', nargs='*')
 420   args = parser.parse_args(argv[1:])
 421
 422   parse_tree = Parser(Lexer(Input(args.inputs))).Parse()
 423
 424   with (sys.stdout if args.output == '-' else open(args.output, 'wb')) as out:
 425     out.write('// Copyright 2015 The Chromium Authors. All rights reserved.\n')
 426     out.write('// Use of this source code is governed by a BSD-style license')
 427     out.write(' that can be\n// found in the LICENSE file.\n\n')
 428     out.write('// DO NOT EDIT.\n')
 429     out.write('//   GENERATED BY {}\n'.format(sys.argv[0]))
 430     out.write('//   FROM {}\n\n'.format(' '.join(args.inputs)))
 431     guard = args.guard if args.guard else args.output
 432     guard = ''.join([c.upper() if c.isalpha() else '_' for c in guard])
 433     out.write('#ifndef {0}_\n#define {0}_\n'.format(guard))
 434     Assembler(args, parse_tree).Write(out)
 435     out.write('#endif  // {}_\n'.format(guard))
 436
 437   return g_fail
 438
 439 if __name__ == '__main__':
 440   sys.exit(main(sys.argv))