# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
#
# written by Fredrik Lundh (fredrik@pythonware.com)
import string, sys

SCRIPT = sys.argv[0]
VERSION = "2.1"

UNICODE_DATA = "UnicodeData-Latest.txt"
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]
# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
def maketables(trace=0):

    print "--- Reading", UNICODE_DATA, "..."

    unicode = UnicodeData(UNICODE_DATA)

    print len(filter(None, unicode.table)), "characters"

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
# --------------------------------------------------------------------
# unicode character properties
def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = string.split(record[5])
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content: first word packs the prefix index and the
                # decomposition length; the rest are the code points
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
                decomp_index[char] = i
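    # Worked example (illustrative, not in the original script): for
    # U+00A8 the decomposition field record[5] is "<compat> 0020 0308",
    # so the prefix is "<compat>" and two code points remain; the first
    # packed word is then prefix_index + (2<<8), letting the consumer
    # read the prefix index from the low byte and the decomposition
    # length from the high byte of a single 16-bit entry.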
    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")

    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp
    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"
    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    fp.close()
# --------------------------------------------------------------------
# unicode character type tables
def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = (int(record[12], 16) - char) & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = (int(record[13], 16) - char) & 0xffff
            else:
                lower = 0
            if record[14]:
                title = (int(record[14], 16) - char) & 0xffff
            else:
                title = 0
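            # Worked example (illustrative, not in the original): for
            # 'a' (U+0061) record[12] is "0041", so
            #     upper = (0x41 - 0x61) & 0xffff = 0xffe0
            # and the consumer undoes the delta with
            #     (0x61 + 0xffe0) & 0xffff == 0x41
            # the same small delta repeats across the whole alphabet,
            # so many characters share a single type record.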
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")

    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split character type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()
# --------------------------------------------------------------------
# unicode name database
def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"
    # collect unique words from names (note that we distinguish
    # between words inside a sentence and words ending a sentence;
    # the latter include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"
    wordlist = words.items()

    # sort on falling frequency
    wordlist.sort(lambda a, b: len(b[1])-len(a[1]))

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"
    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"
    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)
    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = string.find(lexicon, ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)
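    # Worked example (illustrative, hypothetical words): "CAP" is
    # stored as "CA" plus chr(ord("P")+128), i.e. the high bit marks
    # the final character, so no length or terminator byte is needed.
    # And because string.find searches the whole lexicon, a word like
    # "IGN" can reuse the tail of an already-stored "SIGN": its
    # encoded form "IG" + chr(ord("N")+128) is a substring of the
    # encoded "SIGN".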
    # generate phrasebook from names and lexicon

    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
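    # Worked example (illustrative): with escapes = 3 and short = 253,
    # a word index i = 700 is emitted as the two bytes
    #     (700 >> 8) + 253 = 255    and    700 & 255 = 188
    # a decoder that sees a byte >= short combines it with the next
    # byte: (255 - 253) << 8 | 188 == 700.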
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)
    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")

    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
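# An input line looks like this (illustrative sample for U+0041; the
# actual file is whatever UNICODE_DATA points at):
#
#   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
#
# i.e. semicolon-separated fields: code point, name, general category,
# combining class, bidirectional category, decomposition, the
# decimal/digit/numeric values, mirrored flag, Unicode 1.0 name,
# comment, and the upper/lower/title case mappings used above.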
class UnicodeData:

    def __init__(self, filename, expand=1):
        file = open(filename)
        table = [None] * 65536
        while 1:
            s = file.readline()
            if not s:
                break
            s = string.split(string.strip(s), ";")
            char = string.atoi(s[0], 16)
            table[char] = s

        # expand first-last ranges (ignore surrogates and private use)
        if expand:
            field = None
            for i in range(0, 0xD800):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f = field[:]
                    f[0] = "%X" % i
                    table[i] = f

        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

        # uncomment to restrict the character range to ISO Latin 1:
        # self.chars = range(256)
# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, string.upper(s)):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
# candidate (table size, polynomial) pairs for the hash table below
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
class Hash:

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            # open addressing: derive a probe increment from the hash
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly
    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
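# Minimal usage sketch (illustrative; the one-entry key set and the
# output file name are made up):
#
#     h = Hash("code", [("LATIN SMALL LETTER A", 0x61)], 47)
#     h.dump(open("demo.h", "w"), 0)
#
# this emits a static code_hash[] array plus the code_magic, code_size
# and code_poly constants a C lookup routine needs to reproduce the
# same probe sequence.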
# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if string.strip(s):
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
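    # Worked example (illustrative, not from the original): for
    #     t = (5, 5, 5, 5, 5, 5, 5, 5)
    # a shift of 2 splits t into two identical 4-element bins, so
    # t2 = [5, 5, 5, 5] is stored once and t1 = [0, 0]; a lookup of
    # i = 6 then evaluates t2[(t1[6 >> 2] << 2) + (6 & 3)] = t2[2] = 5.
    # real tables win because many 2**shift slices repeat and land in
    # t2 only once.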
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
if __name__ == "__main__":
    maketables(1)