Tools/unicode/makeunicodedata.py

   1 #
   2 # (re)generate unicode property and type databases
   3 #
   4 # this script converts a unicode 3.0 database file to
   5 # Modules/unicodedata_db.h, Modules/unicodename_db.h,
   6 # and Objects/unicodetype_db.h
   7 #
   8 # history:
   9 # 2000-09-24 fl   created (based on bits and pieces from unidb)
  10 # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
  11 # 2000-09-25 fl   added character type table
  12 # 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
  13 # 2000-11-03 fl   expand first/last ranges
  14 # 2001-01-19 fl   added character name tables (2.1)
  15 # 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
  16 #
  17 # written by Fredrik Lundh (fredrik@pythonware.com)
  18 #
  19
  20 import sys
  21
  22 SCRIPT = sys.argv[0]
  23 VERSION = "2.1"
  24
  25 UNICODE_DATA = "UnicodeData-Latest.txt"
  26
  27 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
  28     "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
  29     "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
  30     "So" ]
  31
  32 BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
  33     "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
  34     "ON" ]
  35
  36 # note: should match definitions in Objects/unicodectype.c
  37 ALPHA_MASK = 0x01
  38 DECIMAL_MASK = 0x02
  39 DIGIT_MASK = 0x04
  40 LOWER_MASK = 0x08
  41 LINEBREAK_MASK = 0x10
  42 SPACE_MASK = 0x20
  43 TITLE_MASK = 0x40
  44 UPPER_MASK = 0x80
  45
  46 def maketables(trace=0):
  47
  48     print "--- Reading", UNICODE_DATA, "..."
  49
  50     unicode = UnicodeData(UNICODE_DATA)
  51
  52     print len(filter(None, unicode.table)), "characters"
  53
  54     makeunicodename(unicode, trace)
  55     makeunicodedata(unicode, trace)
  56     makeunicodetype(unicode, trace)
  57
  58 # --------------------------------------------------------------------
  59 # unicode character properties
  60
  61 def makeunicodedata(unicode, trace):
  62
  63     dummy = (0, 0, 0, 0)
  64     table = [dummy]
  65     cache = {0: dummy}
  66     index = [0] * len(unicode.chars)
  67
  68     FILE = "Modules/unicodedata_db.h"
  69
  70     print "--- Preparing", FILE, "..."
  71
  72     # 1) database properties
  73
  74     for char in unicode.chars:
  75         record = unicode.table[char]
  76         if record:
  77             # extract database properties
  78             category = CATEGORY_NAMES.index(record[2])
  79             combining = int(record[3])
  80             bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
  81             mirrored = record[9] == "Y"
  82             item = (
  83                 category, combining, bidirectional, mirrored
  84                 )
  85             # add entry to index and item tables
  86             i = cache.get(item)
  87             if i is None:
  88                 cache[item] = i = len(table)
  89                 table.append(item)
  90             index[char] = i
  91
  92     # 2) decomposition data
  93
  94     decomp_data = [0]
  95     decomp_prefix = [""]
  96     decomp_index = [0] * len(unicode.chars)
  97     decomp_size = 0
  98
  99     for char in unicode.chars:
 100         record = unicode.table[char]
 101         if record:
 102             if record[5]:
 103                 decomp = string.split(record[5])
 104                 # prefix
 105                 if decomp[0][0] == "<":
 106                     prefix = decomp.pop(0)
 107                 else:
 108                     prefix = ""
 109                 try:
 110                     i = decomp_prefix.index(prefix)
 111                 except ValueError:
 112                     i = len(decomp_prefix)
 113                     decomp_prefix.append(prefix)
 114                 prefix = i
 115                 assert prefix < 256
 116                 # content
 117                 decomp = [prefix + (len(decomp)<<8)] +\
 118                          map(lambda s: int(s, 16), decomp)
 119                 try:
 120                     i = decomp_data.index(decomp)
 121                 except ValueError:
 122                     i = len(decomp_data)
 123                     decomp_data.extend(decomp)
 124                     decomp_size = decomp_size + len(decomp) * 2
 125             else:
 126                 i = 0
 127             decomp_index[char] = i
 128
 129     print len(table), "unique properties"
 130     print len(decomp_prefix), "unique decomposition prefixes"
 131     print len(decomp_data), "unique decomposition entries:",
 132     print decomp_size, "bytes"
 133
 134     print "--- Writing", FILE, "..."
 135
 136     fp = open(FILE, "w")
 137     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
 138     print >>fp
 139     print >>fp, "/* a list of unique database records */"
 140     print >>fp, \
 141           "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
 142     for item in table:
 143         print >>fp, "    {%d, %d, %d, %d}," % item
 144     print >>fp, "};"
 145     print >>fp
 146
 147     # FIXME: <fl> the following tables could be made static, and
 148     # the support code moved into unicodedatabase.c
 149
 150     print >>fp, "/* string literals */"
 151     print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
 152     for name in CATEGORY_NAMES:
 153         print >>fp, "    \"%s\"," % name
 154     print >>fp, "    NULL"
 155     print >>fp, "};"
 156
 157     print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
 158     for name in BIDIRECTIONAL_NAMES:
 159         print >>fp, "    \"%s\"," % name
 160     print >>fp, "    NULL"
 161     print >>fp, "};"
 162
 163     print >>fp, "static const char *decomp_prefix[] = {"
 164     for name in decomp_prefix:
 165         print >>fp, "    \"%s\"," % name
 166     print >>fp, "    NULL"
 167     print >>fp, "};"
 168
 169     # split record index table
 170     index1, index2, shift = splitbins(index, trace)
 171
 172     print >>fp, "/* index tables for the database records */"
 173     print >>fp, "#define SHIFT", shift
 174     Array("index1", index1).dump(fp, trace)
 175     Array("index2", index2).dump(fp, trace)
 176
 177     # split decomposition index table
 178     index1, index2, shift = splitbins(decomp_index, trace)
 179
 180     print >>fp, "/* decomposition data */"
 181     Array("decomp_data", decomp_data).dump(fp, trace)
 182
 183     print >>fp, "/* index tables for the decomposition data */"
 184     print >>fp, "#define DECOMP_SHIFT", shift
 185     Array("decomp_index1", index1).dump(fp, trace)
 186     Array("decomp_index2", index2).dump(fp, trace)
 187
 188     fp.close()
 189
 190 # --------------------------------------------------------------------
 191 # unicode character type tables
 192
 193 def makeunicodetype(unicode, trace):
 194
 195     FILE = "Objects/unicodetype_db.h"
 196
 197     print "--- Preparing", FILE, "..."
 198
 199     # extract unicode types
 200     dummy = (0, 0, 0, 0, 0, 0)
 201     table = [dummy]
 202     cache = {0: dummy}
 203     index = [0] * len(unicode.chars)
 204
 205     for char in unicode.chars:
 206         record = unicode.table[char]
 207         if record:
 208             # extract database properties
 209             category = record[2]
 210             bidirectional = record[4]
 211             flags = 0
 212             if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
 213                 flags |= ALPHA_MASK
 214             if category == "Ll":
 215                 flags |= LOWER_MASK
 216             if category == "Zl" or bidirectional == "B":
 217                 flags |= LINEBREAK_MASK
 218             if category == "Zs" or bidirectional in ("WS", "B", "S"):
 219                 flags |= SPACE_MASK
 220             if category == "Lt":
 221                 flags |= TITLE_MASK
 222             if category == "Lu":
 223                 flags |= UPPER_MASK
 224             # use delta predictor for upper/lower/title
 225             if record[12]:
 226                 upper = (int(record[12], 16) - char) & 0xffff
 227             else:
 228                 upper = 0
 229             if record[13]:
 230                 lower = (int(record[13], 16) - char) & 0xffff
 231             else:
 232                 lower = 0
 233             if record[14]:
 234                 title = (int(record[14], 16) - char) & 0xffff
 235             else:
 236                 title = 0
 237             # decimal digit, integer digit
 238             decimal = 0
 239             if record[6]:
 240                 flags |= DECIMAL_MASK
 241                 decimal = int(record[6])
 242             digit = 0
 243             if record[7]:
 244                 flags |= DIGIT_MASK
 245                 digit = int(record[7])
 246             item = (
 247                 flags, upper, lower, title, decimal, digit
 248                 )
 249             # add entry to index and item tables
 250             i = cache.get(item)
 251             if i is None:
 252                 cache[item] = i = len(table)
 253                 table.append(item)
 254             index[char] = i
 255
 256     print len(table), "unique character type entries"
 257
 258     print "--- Writing", FILE, "..."
 259
 260     fp = open(FILE, "w")
 261     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
 262     print >>fp
 263     print >>fp, "/* a list of unique character type descriptors */"
 264     print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
 265     for item in table:
 266         print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
 267     print >>fp, "};"
 268     print >>fp
 269
 270     # split decomposition index table
 271     index1, index2, shift = splitbins(index, trace)
 272
 273     print >>fp, "/* type indexes */"
 274     print >>fp, "#define SHIFT", shift
 275     Array("index1", index1).dump(fp, trace)
 276     Array("index2", index2).dump(fp, trace)
 277
 278     fp.close()
 279
 280 # --------------------------------------------------------------------
 281 # unicode name database
 282
 283 def makeunicodename(unicode, trace):
 284
 285     FILE = "Modules/unicodename_db.h"
 286
 287     print "--- Preparing", FILE, "..."
 288
 289     # collect names
 290     names = [None] * len(unicode.chars)
 291
 292     for char in unicode.chars:
 293         record = unicode.table[char]
 294         if record:
 295             name = record[1].strip()
 296             if name and name[0] != "<":
 297                 names[char] = name + chr(0)
 298
 299     print len(filter(lambda n: n is not None, names)), "distinct names"
 300
 301     # collect unique words from names (note that we differ between
 302     # words inside a sentence, and words ending a sentence.  the
 303     # latter includes the trailing null byte.
 304
 305     words = {}
 306     n = b = 0
 307     for char in unicode.chars:
 308         name = names[char]
 309         if name:
 310             w = name.split()
 311             b = b + len(name)
 312             n = n + len(w)
 313             for w in w:
 314                 l = words.get(w)
 315                 if l:
 316                     l.append(None)
 317                 else:
 318                     words[w] = [len(words)]
 319
 320     print n, "words in text;", b, "bytes"
 321
 322     wordlist = words.items()
 323
 324     # sort on falling frequency
 325     wordlist.sort(lambda a, b: len(b[1])-len(a[1]))
 326
 327     # figure out how many phrasebook escapes we need
 328     escapes = 0
 329     while escapes * 256 < len(wordlist):
 330         escapes = escapes + 1
 331     print escapes, "escapes"
 332
 333     short = 256 - escapes
 334
 335     assert short > 0
 336
 337     print short, "short indexes in lexicon"
 338
 339     # statistics
 340     n = 0
 341     for i in range(short):
 342         n = n + len(wordlist[i][1])
 343     print n, "short indexes in phrasebook"
 344
 345     # pick the most commonly used words, and sort the rest on falling
 346     # length (to maximize overlap)
 347
 348     wordlist, wordtail = wordlist[:short], wordlist[short:]
 349     wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
 350     wordlist.extend(wordtail)
 351
 352     # generate lexicon from words
 353
 354     lexicon_offset = [0]
 355     lexicon = ""
 356     words = {}
 357
 358     # build a lexicon string
 359     offset = 0
 360     for w, x in wordlist:
 361         # encoding: bit 7 indicates last character in word (chr(128)
 362         # indicates the last character in an entire string)
 363         ww = w[:-1] + chr(ord(w[-1])+128)
 364         # reuse string tails, when possible
 365         o = string.find(lexicon, ww)
 366         if o < 0:
 367             o = offset
 368             lexicon = lexicon + ww
 369             offset = offset + len(w)
 370         words[w] = len(lexicon_offset)
 371         lexicon_offset.append(o)
 372
 373     lexicon = map(ord, lexicon)
 374
 375     # generate phrasebook from names and lexicon
 376     phrasebook = [0]
 377     phrasebook_offset = [0] * len(unicode.chars)
 378     for char in unicode.chars:
 379         name = names[char]
 380         if name:
 381             w = name.split()
 382             phrasebook_offset[char] = len(phrasebook)
 383             for w in w:
 384                 i = words[w]
 385                 if i < short:
 386                     phrasebook.append(i)
 387                 else:
 388                     # store as two bytes
 389                     phrasebook.append((i>>8) + short)
 390                     phrasebook.append(i&255)
 391
 392     assert getsize(phrasebook) == 1
 393
 394     #
 395     # unicode name hash table
 396
 397     # extract names
 398     data = []
 399     for char in unicode.chars:
 400         record = unicode.table[char]
 401         if record:
 402             name = record[1].strip()
 403             if name and name[0] != "<":
 404                 data.append((name, char))
 405
 406     # the magic number 47 was chosen to minimize the number of
 407     # collisions on the current data set.  if you like, change it
 408     # and see what happens...
 409
 410     codehash = Hash("code", data, 47)
 411
 412     print "--- Writing", FILE, "..."
 413
 414     fp = open(FILE, "w")
 415     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
 416     print >>fp
 417     print >>fp, "#define NAME_MAXLEN", 256
 418     print >>fp
 419     print >>fp, "/* lexicon */"
 420     Array("lexicon", lexicon).dump(fp, trace)
 421     Array("lexicon_offset", lexicon_offset).dump(fp, trace)
 422
 423     # split decomposition index table
 424     offset1, offset2, shift = splitbins(phrasebook_offset, trace)
 425
 426     print >>fp, "/* code->name phrasebook */"
 427     print >>fp, "#define phrasebook_shift", shift
 428     print >>fp, "#define phrasebook_short", short
 429
 430     Array("phrasebook", phrasebook).dump(fp, trace)
 431     Array("phrasebook_offset1", offset1).dump(fp, trace)
 432     Array("phrasebook_offset2", offset2).dump(fp, trace)
 433
 434     print >>fp, "/* name->code dictionary */"
 435     codehash.dump(fp, trace)
 436
 437     fp.close()
 438
 439 # --------------------------------------------------------------------
 440 # the following support code is taken from the unidb utilities
 441 # Copyright (c) 1999-2000 by Secret Labs AB
 442
 443 # load a unicode-data file from disk
 444
 445 import string, sys
 446
 447 class UnicodeData:
 448
 449     def __init__(self, filename, expand=1):
 450         file = open(filename)
 451         table = [None] * 65536
 452         while 1:
 453             s = file.readline()
 454             if not s:
 455                 break
 456             s = string.split(string.strip(s), ";")
 457             char = string.atoi(s[0], 16)
 458             table[char] = s
 459
 460         # expand first-last ranges (ignore surrogates and private use)
 461         if expand:
 462             field = None
 463             for i in range(0, 0xD800):
 464                 s = table[i]
 465                 if s:
 466                     if s[1][-6:] == "First>":
 467                         s[1] = ""
 468                         field = s[:]
 469                     elif s[1][-5:] == "Last>":
 470                         s[1] = ""
 471                         field = None
 472                 elif field:
 473                     field[0] = hex(i)
 474                     table[i] = field
 475
 476         # public attributes
 477         self.filename = filename
 478         self.table = table
 479         self.chars = range(65536) # unicode
 480
 481     def uselatin1(self):
 482         # restrict character range to ISO Latin 1
 483         self.chars = range(256)
 484
 485 # hash table tools
 486
 487 # this is a straight-forward reimplementation of Python's built-in
 488 # dictionary type, using a static data structure, and a custom string
 489 # hash algorithm.
 490
 491 def myhash(s, magic):
 492     h = 0
 493     for c in map(ord, string.upper(s)):
 494         h = (h * magic) + c
 495         ix = h & 0xff000000
 496         if ix:
 497             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
 498     return h
 499
 500 SIZES = [
 501     (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
 502     (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
 503     (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
 504     (2097152,5), (4194304,3), (8388608,33), (16777216,27)
 505 ]
 506
 507 class Hash:
 508     def __init__(self, name, data, magic):
 509         # turn a (key, value) list into a static hash table structure
 510
 511         # determine table size
 512         for size, poly in SIZES:
 513             if size > len(data):
 514                 poly = size + poly
 515                 break
 516         else:
 517             raise AssertionError, "ran out of polynominals"
 518
 519         print size, "slots in hash table"
 520
 521         table = [None] * size
 522
 523         mask = size-1
 524
 525         n = 0
 526
 527         hash = myhash
 528
 529         # initialize hash table
 530         for key, value in data:
 531             h = hash(key, magic)
 532             i = (~h) & mask
 533             v = table[i]
 534             if v is None:
 535                 table[i] = value
 536                 continue
 537             incr = (h ^ (h >> 3)) & mask;
 538             if not incr:
 539                 incr = mask
 540             while 1:
 541                 n = n + 1
 542                 i = (i + incr) & mask
 543                 v = table[i]
 544                 if v is None:
 545                     table[i] = value
 546                     break
 547                 incr = incr << 1
 548                 if incr > mask:
 549                     incr = incr ^ poly
 550
 551         print n, "collisions"
 552         self.collisions = n
 553
 554         for i in range(len(table)):
 555             if table[i] is None:
 556                 table[i] = 0
 557
 558         self.data = Array(name + "_hash", table)
 559         self.magic = magic
 560         self.name = name
 561         self.size = size
 562         self.poly = poly
 563
 564     def dump(self, file, trace):
 565         # write data to file, as a C array
 566         self.data.dump(file, trace)
 567         file.write("#define %s_magic %d\n" % (self.name, self.magic))
 568         file.write("#define %s_size %d\n" % (self.name, self.size))
 569         file.write("#define %s_poly %d\n" % (self.name, self.poly))
 570
 571 # stuff to deal with arrays of unsigned integers
 572
 573 class Array:
 574
 575     def __init__(self, name, data):
 576         self.name = name
 577         self.data = data
 578
 579     def dump(self, file, trace=0):
 580         # write data to file, as a C array
 581         size = getsize(self.data)
 582         if trace:
 583             print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
 584         file.write("static ")
 585         if size == 1:
 586             file.write("unsigned char")
 587         elif size == 2:
 588             file.write("unsigned short")
 589         else:
 590             file.write("unsigned int")
 591         file.write(" " + self.name + "[] = {\n")
 592         if self.data:
 593             s = "    "
 594             for item in self.data:
 595                 i = str(item) + ", "
 596                 if len(s) + len(i) > 78:
 597                     file.write(s + "\n")
 598                     s = "    " + i
 599                 else:
 600                     s = s + i
 601             if string.strip(s):
 602                 file.write(s + "\n")
 603         file.write("};\n\n")
 604
 605 def getsize(data):
 606     # return smallest possible integer size for the given array
 607     maxdata = max(data)
 608     if maxdata < 256:
 609         return 1
 610     elif maxdata < 65536:
 611         return 2
 612     else:
 613         return 4
 614
 615 def splitbins(t, trace=0):
 616     """t, trace=0 -> (t1, t2, shift).  Split a table to save space.
 617
 618     t is a sequence of ints.  This function can be useful to save space if
 619     many of the ints are the same.  t1 and t2 are lists of ints, and shift
 620     is an int, chosen to minimize the combined size of t1 and t2 (in C
 621     code), and where for each i in range(len(t)),
 622         t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
 623     where mask is a bitmask isolating the last "shift" bits.
 624
 625     If optional arg trace is non-zero (default zero), progress info
 626     is printed to sys.stderr.  The higher the value, the more info
 627     you'll get.
 628     """
 629
 630     import sys
 631     if trace:
 632         def dump(t1, t2, shift, bytes):
 633             print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
 634                 len(t1), len(t2), shift, bytes)
 635         print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
 636                             "bytes"
 637     n = len(t)-1    # last valid index
 638     maxshift = 0    # the most we can shift n and still have something left
 639     if n > 0:
 640         while n >> 1:
 641             n >>= 1
 642             maxshift += 1
 643     del n
 644     bytes = sys.maxint  # smallest total size so far
 645     t = tuple(t)    # so slices can be dict keys
 646     for shift in range(maxshift + 1):
 647         t1 = []
 648         t2 = []
 649         size = 2**shift
 650         bincache = {}
 651         for i in range(0, len(t), size):
 652             bin = t[i:i+size]
 653             index = bincache.get(bin)
 654             if index is None:
 655                 index = len(t2)
 656                 bincache[bin] = index
 657                 t2.extend(bin)
 658             t1.append(index >> shift)
 659         # determine memory size
 660         b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
 661         if trace > 1:
 662             dump(t1, t2, shift, b)
 663         if b < bytes:
 664             best = t1, t2, shift
 665             bytes = b
 666     t1, t2, shift = best
 667     if trace:
 668         print >>sys.stderr, "Best:",
 669         dump(t1, t2, shift, bytes)
 670     if __debug__:
 671         # exhaustively verify that the decomposition is correct
 672         mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
 673         for i in xrange(len(t)):
 674             assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
 675     return best
 676
 677 if __name__ == "__main__":
 678     maketables(1)