import re, unicodedata, sys, sets
from sets import Set

if sys.maxunicode == 65535:
    raise RuntimeError, "need UCS-4 Python"

def gen_category(cats):
    for i in range(0, 0x110000):
        if unicodedata.category(unichr(i)) in cats:
            yield i

def gen_bidirectional(cats):
    for i in range(0, 0x110000):
        if unicodedata.bidirectional(unichr(i)) in cats:
            yield i
def compact_set(l):
    single = []
    tuple = []
    prev = None
    span = 0
    for e in l:
        if prev is None:
            prev = e
            span = 0
            continue
        if prev+span+1 != e:
            if span > 2:
                tuple.append((prev,prev+span+1))
            else:
                for i in range(prev, prev+span+1):
                    single.append(i)
            prev = e
            span = 0
        else:
            span += 1
    if span:
        tuple.append((prev,prev+span+1))
    else:
        single.append(prev)
    tuple = " + ".join(["range(%d,%d)" % t for t in tuple])
    if not single:
        return "sets.Set(%s)" % tuple
    if not tuple:
        return "sets.Set(%s)" % repr(single)
    return "sets.Set(%s + %s)" % (repr(single), tuple)
############## Read the tables in the RFC #######################

data = open("rfc3454.txt").readlines()

tables = []
curname = None
for l in data:
    l = l.strip()
    if not l:
        continue
    # Skip RFC page breaks
    if l.startswith("Hoffman & Blanchet") or\
       l.startswith("RFC 3454"):
        continue
    # Find start/end lines
    m = re.match(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----", l)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError("Double Start", (curname, l))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
            continue
        else:
            if not curname:
                raise RuntimeError("End without start", l)
            curname = None
            continue
    if not curname:
        continue
    # Now we are in a table
    fields = l.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError:
                raise RuntimeError("Unpacking problem", l)
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value
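
# At this point, tables is a list of (name, dict) pairs in RFC order.
# Set-style tables map every code point to itself; mapping tables map a
# code point to a list of replacement code points, or to None for the
# "map to nothing" table B.1.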
########### Generate compact Python versions of the tables #############

print """# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"

import unicodedata, sets
"""

print "assert unicodedata.unidata_version == %s" % repr(unicodedata.unidata_version)

# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables[0]
del tables[0]
assert name == "A.1"
table = Set(table.keys())
Cn = Set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn -= Set(range(0xFDD0, 0xFDF0))

# not a character
Cn -= Set(range(0xFFFE, 0x110000, 0x10000))
Cn -= Set(range(0xFFFF, 0x110000, 0x10000))

# assert table == Cn
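# (left disabled, presumably because of the Plane 15 PUA discrepancy
# noted in the XXX comment above)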

print """
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
"""

# B.1 cannot easily be derived
name, table = tables[0]
del tables[0]
assert name == "B.1"
table = table.keys()
table.sort()

print """
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
"""

# B.2 and B.3 are case folding.
# They take CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

name, table_b2 = tables[0]
del tables[0]
assert name == "B.2"

name, table_b3 = tables[0]
del tables[0]
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.

b3_exceptions = {}

for k,v in table_b2.items():
    if map(ord, unichr(k).lower()) != v:
        b3_exceptions[k] = u"".join(map(unichr,v))
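# Example (with the Unicode 3.2 data the RFC is based on): CaseFolding maps
# U+00DF LATIN SMALL LETTER SHARP S to "ss", while u"\xdf".lower() is still
# u"\xdf", so 0xdf lands in b3_exceptions with the value u"ss".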

b3 = b3_exceptions.items()
b3.sort()

print """
b3_exceptions = {"""

for i,(k,v) in enumerate(b3):
    print "0x%x:%s," % (k, repr(v)),
    if i % 4 == 3:
        print
print "}"

print """
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
"""

def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
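
# map_table_b3 is both emitted into the generated module (above) and
# defined locally, so the script itself can use it when deriving and
# verifying table B.2 below; map_table_b2 follows the same pattern.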

# B.2 is case folding for NFKC. This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al

specials = {}
for k,v in table_b2.items():
    if map(ord, map_table_b2(unichr(k))) != v:
        specials[k] = v

# B.3 should not add any additional special cases
assert specials == {}

print """
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = u"".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
"""

# C.1.1 is a table with a single character
name, table = tables[0]
del tables[0]
assert name == "C.1.1"
assert table == {0x20:0x20}

print """
def in_table_c11(code):
    return code == u" "
"""

# C.1.2 is the rest of all space characters
name, table = tables[0]
del tables[0]
assert name == "C.1.2"

# table = Set(table.keys())
# Zs = Set(gen_category(["Zs"])) - Set([0x20])
# assert Zs == table

print """
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != u" "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
"""

# C.2.1 ASCII control characters
name, table_c21 = tables[0]
del tables[0]
assert name == "C.2.1"

Cc = Set(gen_category(["Cc"]))
Cc_ascii = Cc & Set(range(128))
table_c21 = Set(table_c21.keys())
assert Cc_ascii == table_c21

print """
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
"""

# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables[0]
del tables[0]
assert name == "C.2.2"

Cc_nonascii = Cc - Cc_ascii
table_c22 = Set(table_c22.keys())
assert len(Cc_nonascii - table_c22) == 0

specials = list(table_c22 - Cc_nonascii)
specials.sort()
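# specials now holds the C.2.2 entries that are not category Cc, e.g.
# various format (Cf) characters and the line/paragraph separators;
# they are emitted as an explicit set alongside the category test below.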

print """c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
"""

# C.3 Private use
name, table = tables[0]
del tables[0]
assert name == "C.3"

Co = Set(gen_category(["Co"]))
assert Set(table.keys()) == Co

print """
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
"""

# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables[0]
del tables[0]
assert name == "C.4"

nonchar = Set(range(0xFDD0,0xFDF0) +
              range(0xFFFE,0x110000,0x10000) +
              range(0xFFFF,0x110000,0x10000))
table = Set(table.keys())
assert table == nonchar
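# nonchar is U+FDD0..U+FDEF plus the last two code points (xxFFFE, xxFFFF)
# of each of the 17 planes; the assertion confirms C.4 matches this exactly.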

print """
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
"""

# C.5 Surrogate codes
name, table = tables[0]
del tables[0]
assert name == "C.5"

Cs = Set(gen_category(["Cs"]))
assert Set(table.keys()) == Cs

print """
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
"""

# C.6 Inappropriate for plain text
name, table = tables[0]
del tables[0]
assert name == "C.6"

table = table.keys()
table.sort()

print """
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
"""

# C.7 Inappropriate for canonical representation
name, table = tables[0]
del tables[0]
assert name == "C.7"

table = table.keys()
table.sort()

print """
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
"""

# C.8 Change display properties or are deprecated
name, table = tables[0]
del tables[0]
assert name == "C.8"

table = table.keys()
table.sort()

print """
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
"""

# C.9 Tagging characters
name, table = tables[0]
del tables[0]
assert name == "C.9"

table = table.keys()
table.sort()

print """
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
"""

# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables[0]
del tables[0]
assert name == "D.1"

RandAL = Set(gen_bidirectional(["R","AL"]))
assert Set(table.keys()) == RandAL

print """
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
"""

# D.2 Characters with bidirectional property "L"
name, table = tables[0]
del tables[0]
assert name == "D.2"

L = Set(gen_bidirectional(["L"]))
assert Set(table.keys()) == L

print """
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
"""