import re, unicodedata, sys

if sys.maxunicode == 65535:
    raise RuntimeError("need UCS-4 Python")
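
# Helper generators: yield every code point whose Unicode category /
# bidirectional class is one of the given values.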
def gen_category(cats):
    for i in range(0, 0x110000):
        if unicodedata.category(unichr(i)) in cats:
            yield i

def gen_bidirectional(cats):
    for i in range(0, 0x110000):
        if unicodedata.bidirectional(unichr(i)) in cats:
            yield i

def compact_set(l):
    single = []
    tuple = []
    prev = None
    span = 0
    for e in l:
        if prev is None:
            prev = e
            continue
        if prev+span+1 != e:
            if span > 2:
                tuple.append((prev,prev+span+1))
            else:
                for i in range(prev, prev+span+1):
                    single.append(i)
            prev = e
            span = 0
        else:
            span += 1
    if span:
        tuple.append((prev,prev+span+1))
    else:
        single.append(prev)
    tuple = " + ".join(["range(%d,%d)" % t for t in tuple])
    if not single:
        return "set(%s)" % tuple
    if not tuple:
        return "set(%s)" % repr(single)
    return "set(%s + %s)" % (repr(single),tuple)
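
# For example (illustrative, not part of the original script):
#   compact_set(range(10)) -> "set(range(0,10))"
#   compact_set([0x20])    -> "set([32])"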

############## Read the tables in the RFC #######################

data = open("rfc3454.txt").readlines()

tables = []
curname = None

for l in data:
    l = l.strip()
    if not l:
        continue
    # Skip RFC page breaks
    if l.startswith("Hoffman & Blanchet") or\
       l.startswith("RFC 3454"):
        continue
    # Find start/end lines
    m = re.match("----- (Start|End) Table ([A-Z](.[0-9])+) -----", l)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError("Double Start", (curname, l))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
            continue
        else:
            if not curname:
                raise RuntimeError("End without start", l)
            curname = None
            continue
    if curname is None:
        continue
    # Now we are in a table
    fields = l.split(";")
    if len(fields) < 3:
        # <code> or <code range>, optionally followed by a comment:
        # an entry in one of the set tables (A.*, C.*, D.*)
        fields = fields[0].split("-")
        if len(fields) > 1:
            try:
                start, end = fields
            except ValueError:
                raise RuntimeError("Unpacking problem", l)
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        # <code>; <value>; <comment>: an entry in a mapping table (B.*)
        code, value = fields[0].strip(), fields[1].strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # "Map to nothing" entries have an empty value field
            value = []
        table[int(code, 16)] = value
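
# At this point tables is a list of (name, dict) pairs in the order the
# tables appear in the RFC; the sections below consume them one by one.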

########### Generate compact Python versions of the tables #############

print """# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"

import unicodedata
"""

print "assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version)
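
# Everything printed by this script ends up in the generated stringprep module.
# Illustrative use of that module (not part of this script):
#   import stringprep
#   stringprep.in_table_c12(u"\xa0")   # True: NO-BREAK SPACE is a non-ASCII space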

# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables[0]
del tables[0]
assert name == "A.1"
table = set(table.keys())
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn -= set(range(0xFDD0, 0xFDF0))
Cn -= set(range(0xFFFE, 0x110000, 0x10000))
Cn -= set(range(0xFFFF, 0x110000, 0x10000))

print """
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
"""

# B.1 cannot easily be derived
name, table = tables[0]
del tables[0]
assert name == "B.1"
table = sorted(table.keys())

print """
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
"""

# B.2 and B.3 are case folding. They take CaseFolding.txt
# into account, which is not available in the Python database.
# Since B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

name, table_b2 = tables[0]
del tables[0]
assert name == "B.2"

name, table_b3 = tables[0]
del tables[0]
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.

b3_exceptions = {}

for k,v in table_b2.items():
    if map(ord, unichr(k).lower()) != v:
        b3_exceptions[k] = u"".join(map(unichr,v))

b3 = sorted(b3_exceptions.items())

print """
b3_exceptions = {"""

for i,(k,v) in enumerate(b3):
    print "0x%x:%s," % (k, repr(v)),
    if i % 4 == 3:
        print
print "}"

print """
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
"""

def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
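
# Quick sanity check (added for illustration; not in the original script):
# LATIN SMALL LETTER SHARP S folds to "ss", which plain .lower() cannot do.
assert map_table_b3(u"\u00df") == u"ss"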

# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
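
# Quick sanity check (added for illustration; not in the original script):
# MICRO SIGN folds to GREEK SMALL LETTER MU and survives the NFKC round trip.
assert map_table_b2(u"\u00b5") == u"\u03bc"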

specials = {}
for k,v in table_b2.items():
    if map(ord, map_table_b2(unichr(k))) != v:
        specials[k] = v

# B.3 should not add any additional special cases
assert specials == {}

print """
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
"""

# C.1.1 is a table with a single character
name, table = tables[0]
del tables[0]
assert name == "C.1.1"
assert table == {0x20:0x20}

print """
def in_table_c11(code):
    return code == u" "
"""

# C.1.2 is the rest of all space characters
name, table = tables[0]
del tables[0]
assert name == "C.1.2"

# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - set([0x20])

print """
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != u" "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
"""

# C.2.1 ASCII control characters
name, table_c21 = tables[0]
del tables[0]
assert name == "C.2.1"

Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc & set(range(128))
table_c21 = set(table_c21.keys())
assert Cc_ascii == table_c21

print """
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
"""

# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables[0]
del tables[0]
assert name == "C.2.2"

Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22.keys())
assert len(Cc_nonascii - table_c22) == 0

specials = list(table_c22 - Cc_nonascii)
specials.sort()

print """c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
"""
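
# Note (added for illustration; not in the original script): c22_specials picks
# up the non-Cc entries of C.2.2, mostly format (Cf) characters such as
# ZWNJ/ZWJ (U+200C/U+200D) and the BOM (U+FEFF), since only category Cc is
# tested dynamically in in_table_c22.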

# C.3 Private use
name, table = tables[0]
del tables[0]
assert name == "C.3"

Co = set(gen_category(["Co"]))
assert set(table.keys()) == Co

print """
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
"""

# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables[0]
del tables[0]
assert name == "C.4"

nonchar = set(range(0xFDD0,0xFDF0) +
              range(0xFFFE,0x110000,0x10000) +
              range(0xFFFF,0x110000,0x10000))
table = set(table.keys())
assert table == nonchar

print """
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
"""

# C.5 Surrogate codes
name, table = tables[0]
del tables[0]
assert name == "C.5"

Cs = set(gen_category(["Cs"]))
assert set(table.keys()) == Cs

print """
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
"""

# C.6 Inappropriate for plain text
name, table = tables[0]
del tables[0]
assert name == "C.6"

table = sorted(table.keys())

print """
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
"""

# C.7 Inappropriate for canonical representation
name, table = tables[0]
del tables[0]
assert name == "C.7"

table = sorted(table.keys())

print """
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
"""

# C.8 Change display properties or are deprecated
name, table = tables[0]
del tables[0]
assert name == "C.8"

table = sorted(table.keys())

print """
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
"""

# C.9 Tagging characters
name, table = tables[0]
del tables[0]
assert name == "C.9"

table = sorted(table.keys())

print """
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
"""

# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables[0]
del tables[0]
assert name == "D.1"

RandAL = set(gen_bidirectional(["R","AL"]))
assert set(table.keys()) == RandAL

print """
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
"""

# D.2 Characters with bidirectional property "L"
name, table = tables[0]
del tables[0]
assert name == "D.2"

L = set(gen_bidirectional(["L"]))
assert set(table.keys()) == L

print """
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
"""