Lib/test/test_normalization.py

   1 from test.test_support import verbose, TestFailed, TestSkipped, verify
   2 import sys
   3 import os
   4 from unicodedata import normalize
   5
   6 TESTDATAFILE = "NormalizationTest.txt"
   7
   8 # This search allows using a build directory just inside the source
   9 # directory, and saving just one copy of the test data in the source
  10 # tree, rather than having a copy in each build directory.
  11 # There might be a better way to do this.
  12
  13 for path in [os.path.curdir, os.path.pardir]:
  14     fn = os.path.join(path, TESTDATAFILE)
  15     skip_expected = not os.path.exists(fn)
  16     if not skip_expected:
  17         TESTDATAFILE = fn
  18         break
  19
  20 class RangeError:
  21     pass
  22
  23 def NFC(str):
  24     return normalize("NFC", str)
  25
  26 def NFKC(str):
  27     return normalize("NFKC", str)
  28
  29 def NFD(str):
  30     return normalize("NFD", str)
  31
  32 def NFKD(str):
  33     return normalize("NFKD", str)
  34
  35 def unistr(data):
  36     data = [int(x, 16) for x in data.split(" ")]
  37     for x in data:
  38         if x > sys.maxunicode:
  39             raise RangeError
  40     return u"".join([unichr(x) for x in data])
  41
  42 def test_main():
  43     if skip_expected:
  44         raise TestSkipped(TESTDATAFILE + " not found, download from " +
  45                     "http://www.unicode.org/Public/UNIDATA/" + TESTDATAFILE)
  46
  47     part1_data = {}
  48     for line in open(TESTDATAFILE):
  49         if '#' in line:
  50             line = line.split('#')[0]
  51         line = line.strip()
  52         if not line:
  53             continue
  54         if line.startswith("@Part"):
  55             part = line
  56             continue
  57         try:
  58             c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
  59         except RangeError:
  60             # Skip unsupported characters
  61             continue
  62
  63         if verbose:
  64             print line
  65
  66         # Perform tests
  67         verify(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
  68         verify(c4 ==  NFC(c4) ==  NFC(c5), line)
  69         verify(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
  70         verify(c5 ==  NFD(c4) ==  NFD(c5), line)
  71         verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5),
  72                line)
  73         verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5),
  74                line)
  75
  76         # Record part 1 data
  77         if part == "@Part1":
  78             part1_data[c1] = 1
  79
  80     # Perform tests for all other data
  81     for c in range(sys.maxunicode+1):
  82         X = unichr(c)
  83         if X in part1_data:
  84             continue
  85         assert X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c
  86
# Run the normalization test when this file is executed as a script.
if __name__ == "__main__":
    test_main()