#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.1"

UNICODE_DATA = "UnicodeData-Latest.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
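
# (the character type table built below stores, for each character,
#  the bitwise OR of the masks that apply to it; the C code in
#  Objects/unicodectype.c tests individual bits)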

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA, "..."

    unicode = UnicodeData(UNICODE_DATA)

    print len(filter(None, unicode.table)), "characters"

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_cache = {}   # maps decomposition runs to their offsets (see below)
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = string.split(record[5])
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
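                # the leading word of each run packs the prefix index
                # into bits 0-7 and the decomposition length into bits
                # 8-15 (the assert above keeps the prefix in range)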
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # share identical runs; decomp_data is a flat list of
                # ints, so list.index can never match the decomp list
                # itself -- use a cache keyed on the tuple form instead
                key = tuple(decomp)
                i = decomp_cache.get(key)
                if i is None:
                    i = decomp_cache[key] = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)
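    # (at runtime, the C code looks up the record for character i as
    #  index2[(index1[i >> SHIFT] << SHIFT) + (i & ((1 << SHIFT) - 1))],
    #  per the splitbins() invariant documented below)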

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
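            # (case mappings are stored as 16-bit deltas from the
            #  character's own code point; characters at the same
            #  distance from their case variant share a type record)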
            if record[12]:
                upper = (int(record[12], 16) - char) & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = (int(record[13], 16) - char) & 0xffff
            else:
                lower = 0
            if record[14]:
                title = (int(record[14], 16) - char) & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
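    # each entry in words maps a word to a list whose length is that
    # word's frequency (the initial element is a creation counter, and
    # Nones are appended as padding on each later occurrence)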
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for word in w:
                l = words.get(word)
                if l:
                    l.append(None)
                else:
                    words[word] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency
    wordlist.sort(lambda a, b: len(b[1])-len(a[1]))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"
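
    # (word indexes below "short" fit in a single phrasebook byte;
    #  larger indexes are written as an escape byte carrying the high
    #  bits, followed by a byte with the low bits -- see the
    #  phrasebook generation below)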

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = string.find(lexicon, ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for word in w:
                i = words[word]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)
412 print "--- Writing", FILE, "..."
414 fp = open(FILE, "w")
415 print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
416 print >>fp
417 print >>fp, "#define NAME_MAXLEN", 256
418 print >>fp
419 print >>fp, "/* lexicon */"
420 Array("lexicon", lexicon).dump(fp, trace)
421 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
423 # split decomposition index table
424 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
426 print >>fp, "/* code->name phrasebook */"
427 print >>fp, "#define phrasebook_shift", shift
428 print >>fp, "#define phrasebook_short", short
430 Array("phrasebook", phrasebook).dump(fp, trace)
431 Array("phrasebook_offset1", offset1).dump(fp, trace)
432 Array("phrasebook_offset2", offset2).dump(fp, trace)
434 print >>fp, "/* name->code dictionary */"
435 codehash.dump(fp, trace)
437 fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import string, sys

class UnicodeData:

    def __init__(self, filename, expand=1):
        file = open(filename)
        table = [None] * 65536
        while 1:
            s = file.readline()
            if not s:
                break
            s = string.split(string.strip(s), ";")
            char = string.atoi(s[0], 16)
            table[char] = s

        # expand first-last ranges (ignore surrogates and private use)
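        # (characters between a <..., First> and a <..., Last> record
        #  inherit the fields of the First record, with the code point
        #  field patched in)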
        if expand:
            field = None
            for i in range(0, 0xD800):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    field[0] = hex(i)
                    table[i] = field

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.
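#
# the hash function multiplies the running value by a per-table magic
# constant and folds any bits above bit 23 back into the low bits,
# keeping intermediate results within 24 bits (see myhash below)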

def myhash(s, magic):
    h = 0
    for c in map(ord, string.upper(s)):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
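        # (collision resolution works like Python's dictobject: the
        #  probe increment is derived from the hash value, doubled on
        #  every miss, and reduced modulo the table polynomial when it
        #  overflows the mask)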
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if string.strip(s):
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
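    # try each candidate shift: slice t into bins of 2**shift entries,
    # share identical bins through bincache, and remember the split
    # with the smallest combined size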
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)