Lib/test/test_unicode.py

   1 """ Test script for the Unicode implementation.
   2
   3 Written by Marc-Andre Lemburg (mal@lemburg.com).
   4
   5 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   6
   7 """#"
   8 from test_support import verify, verbose, TestFailed
   9 import sys
  10
  11 if not sys.platform.startswith('java'):
  12     # Test basic sanity of repr()
  13     verify(repr(u'abc') == "u'abc'")
  14     verify(repr(u'ab\\c') == "u'ab\\\\c'")
  15     verify(repr(u'ab\\') == "u'ab\\\\'")
  16     verify(repr(u'\\c') == "u'\\\\c'")
  17     verify(repr(u'\\') == "u'\\\\'")
  18     verify(repr(u'\n') == "u'\\n'")
  19     verify(repr(u'\r') == "u'\\r'")
  20     verify(repr(u'\t') == "u'\\t'")
  21     verify(repr(u'\b') == "u'\\x08'")
  22     verify(repr(u"'\"") == """u'\\'"'""")
  23     verify(repr(u"'\"") == """u'\\'"'""")
  24     verify(repr(u"'") == '''u"'"''')
  25     verify(repr(u'"') == """u'"'""")
  26     verify(repr(u''.join(map(unichr, range(256)))) ==
  27        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
  28        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
  29        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
  30        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
  31        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
  32        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
  33        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
  34        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
  35        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
  36        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
  37        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
  38        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
  39        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
  40        "\\xfe\\xff'")
  41
  42 def test(method, input, output, *args):
  43     if verbose:
  44         print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
  45     try:
  46         f = getattr(input, method)
  47         value = apply(f, args)
  48     except:
  49         value = sys.exc_type
  50         exc = sys.exc_info()[:2]
  51     else:
  52         exc = None
  53     if value != output or type(value) is not type(output):
  54         if verbose:
  55             print 'no'
  56         print '*',f, `input`, `output`, `value`
  57         if exc:
  58             print '  value == %s: %s' % (exc)
  59     else:
  60         if verbose:
  61             print 'yes'
  62
  63 test('capitalize', u' hello ', u' hello ')
  64 test('capitalize', u'hello ', u'Hello ')
  65 test('capitalize', u'aaaa', u'Aaaa')
  66 test('capitalize', u'AaAa', u'Aaaa')
  67
  68 test('count', u'aaa', 3, u'a')
  69 test('count', u'aaa', 0, u'b')
  70 test('count', 'aaa', 3, u'a')
  71 test('count', 'aaa', 0, u'b')
  72 test('count', u'aaa', 3, 'a')
  73 test('count', u'aaa', 0, 'b')
  74
  75 test('title', u' hello ', u' Hello ')
  76 test('title', u'hello ', u'Hello ')
  77 test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
  78 test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
  79 test('title', u"getInt", u'Getint')
  80
  81 test('find', u'abcdefghiabc', 0, u'abc')
  82 test('find', u'abcdefghiabc', 9, u'abc', 1)
  83 test('find', u'abcdefghiabc', -1, u'def', 4)
  84
  85 test('rfind', u'abcdefghiabc', 9, u'abc')
  86
  87 test('lower', u'HeLLo', u'hello')
  88 test('lower', u'hello', u'hello')
  89
  90 test('upper', u'HeLLo', u'HELLO')
  91 test('upper', u'HELLO', u'HELLO')
  92
  93 if 0:
  94     transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'
  95
  96     test('maketrans', u'abc', transtable, u'xyz')
  97     test('maketrans', u'abc', ValueError, u'xyzq')
  98
  99 test('split', u'this is the split function',
 100      [u'this', u'is', u'the', u'split', u'function'])
 101 test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
 102 test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
 103 test('split', u'a b c d', [u'a', u'b c d'], None, 1)
 104 test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
 105 test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
 106 test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
 107 test('split', u'a b c d', [u'a b c d'], None, 0)
 108 test('split', u'a  b  c  d', [u'a', u'b', u'c  d'], None, 2)
 109 test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
 110 test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
 111 test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
 112 test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
 113 test('split', u'endcase test', [u'endcase ', u''], u'test')
 114 test('split', u'endcase test', [u'endcase ', u''], 'test')
 115 test('split', 'endcase test', [u'endcase ', u''], u'test')
 116
 117
 118 # join now works with any sequence type
 119 class Sequence:
 120     def __init__(self, seq): self.seq = seq
 121     def __len__(self): return len(self.seq)
 122     def __getitem__(self, i): return self.seq[i]
 123
 124 test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
 125 test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
 126 test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
 127 test('join', u' ', u'w x y z', Sequence('wxyz'))
 128 test('join', u' ', TypeError, 7)
 129 test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
 130 test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
 131 test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
 132 test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
 133 test('join', ' ', u'w x y z', Sequence(u'wxyz'))
 134 test('join', ' ', TypeError, 7)
 135
 136 result = u''
 137 for i in range(10):
 138     if i > 0:
 139         result = result + u':'
 140     result = result + u'x'*10
 141 test('join', u':', result, [u'x' * 10] * 10)
 142 test('join', u':', result, (u'x' * 10,) * 10)
 143
 144 test('strip', u'   hello   ', u'hello')
 145 test('lstrip', u'   hello   ', u'hello   ')
 146 test('rstrip', u'   hello   ', u'   hello')
 147 test('strip', u'hello', u'hello')
 148
 149 test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
 150
 151 if 0:
 152     test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')
 153
 154     table = string.maketrans('a', u'A')
 155     test('translate', u'abc', u'Abc', table)
 156     test('translate', u'xyz', u'xyz', table)
 157
 158 test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
 159 test('replace', u'one!two!three!', u'onetwothree', '!', '')
 160 test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
 161 test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
 162 test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
 163 test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
 164 test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
 165 test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
 166 test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)
 167
 168 test('startswith', u'hello', 1, u'he')
 169 test('startswith', u'hello', 1, u'hello')
 170 test('startswith', u'hello', 0, u'hello world')
 171 test('startswith', u'hello', 1, u'')
 172 test('startswith', u'hello', 0, u'ello')
 173 test('startswith', u'hello', 1, u'ello', 1)
 174 test('startswith', u'hello', 1, u'o', 4)
 175 test('startswith', u'hello', 0, u'o', 5)
 176 test('startswith', u'hello', 1, u'', 5)
 177 test('startswith', u'hello', 0, u'lo', 6)
 178 test('startswith', u'helloworld', 1, u'lowo', 3)
 179 test('startswith', u'helloworld', 1, u'lowo', 3, 7)
 180 test('startswith', u'helloworld', 0, u'lowo', 3, 6)
 181
 182 test('endswith', u'hello', 1, u'lo')
 183 test('endswith', u'hello', 0, u'he')
 184 test('endswith', u'hello', 1, u'')
 185 test('endswith', u'hello', 0, u'hello world')
 186 test('endswith', u'helloworld', 0, u'worl')
 187 test('endswith', u'helloworld', 1, u'worl', 3, 9)
 188 test('endswith', u'helloworld', 1, u'world', 3, 12)
 189 test('endswith', u'helloworld', 1, u'lowo', 1, 7)
 190 test('endswith', u'helloworld', 1, u'lowo', 2, 7)
 191 test('endswith', u'helloworld', 1, u'lowo', 3, 7)
 192 test('endswith', u'helloworld', 0, u'lowo', 4, 7)
 193 test('endswith', u'helloworld', 0, u'lowo', 3, 8)
 194 test('endswith', u'ab', 0, u'ab', 0, 1)
 195 test('endswith', u'ab', 0, u'ab', 0, 0)
 196
 197 test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi')
 198 test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi', 8)
 199 test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab  def\ng   hi', 4)
 200 test('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab  def\ng   hi', 4)
 201
 202 if 0:
 203     test('capwords', u'abc def ghi', u'Abc Def Ghi')
 204     test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
 205     test('capwords', u'abc\t   def  \nghi', u'Abc Def Ghi')
 206
 207 # Comparisons:
 208 print 'Testing Unicode comparisons...',
 209 verify(u'abc' == 'abc')
 210 verify('abc' == u'abc')
 211 verify(u'abc' == u'abc')
 212 verify(u'abcd' > 'abc')
 213 verify('abcd' > u'abc')
 214 verify(u'abcd' > u'abc')
 215 verify(u'abc' < 'abcd')
 216 verify('abc' < u'abcd')
 217 verify(u'abc' < u'abcd')
 218 print 'done.'
 219
 220 if 0:
 221     # Move these tests to a Unicode collation module test...
 222
 223     print 'Testing UTF-16 code point order comparisons...',
 224     #No surrogates, no fixup required.
 225     verify(u'\u0061' < u'\u20ac')
 226     # Non surrogate below surrogate value, no fixup required
 227     verify(u'\u0061' < u'\ud800\udc02')
 228
 229     # Non surrogate above surrogate value, fixup required
 230     def test_lecmp(s, s2):
 231         verify(s <  s2 , "comparison failed on %s < %s" % (s, s2))
 232
 233     def test_fixup(s):
 234         s2 = u'\ud800\udc01'
 235         test_lecmp(s, s2)
 236         s2 = u'\ud900\udc01'
 237         test_lecmp(s, s2)
 238         s2 = u'\uda00\udc01'
 239         test_lecmp(s, s2)
 240         s2 = u'\udb00\udc01'
 241         test_lecmp(s, s2)
 242         s2 = u'\ud800\udd01'
 243         test_lecmp(s, s2)
 244         s2 = u'\ud900\udd01'
 245         test_lecmp(s, s2)
 246         s2 = u'\uda00\udd01'
 247         test_lecmp(s, s2)
 248         s2 = u'\udb00\udd01'
 249         test_lecmp(s, s2)
 250         s2 = u'\ud800\ude01'
 251         test_lecmp(s, s2)
 252         s2 = u'\ud900\ude01'
 253         test_lecmp(s, s2)
 254         s2 = u'\uda00\ude01'
 255         test_lecmp(s, s2)
 256         s2 = u'\udb00\ude01'
 257         test_lecmp(s, s2)
 258         s2 = u'\ud800\udfff'
 259         test_lecmp(s, s2)
 260         s2 = u'\ud900\udfff'
 261         test_lecmp(s, s2)
 262         s2 = u'\uda00\udfff'
 263         test_lecmp(s, s2)
 264         s2 = u'\udb00\udfff'
 265         test_lecmp(s, s2)
 266
 267     test_fixup(u'\ue000')
 268     test_fixup(u'\uff61')
 269
 270     # Surrogates on both sides, no fixup required
 271     verify(u'\ud800\udc02' < u'\ud84d\udc56')
 272     print 'done.'
 273
 274 test('ljust', u'abc',  u'abc       ', 10)
 275 test('rjust', u'abc',  u'       abc', 10)
 276 test('center', u'abc', u'   abc    ', 10)
 277 test('ljust', u'abc',  u'abc   ', 6)
 278 test('rjust', u'abc',  u'   abc', 6)
 279 test('center', u'abc', u' abc  ', 6)
 280 test('ljust', u'abc', u'abc', 2)
 281 test('rjust', u'abc', u'abc', 2)
 282 test('center', u'abc', u'abc', 2)
 283
 284 test('islower', u'a', 1)
 285 test('islower', u'A', 0)
 286 test('islower', u'\n', 0)
 287 test('islower', u'\u1FFc', 0)
 288 test('islower', u'abc', 1)
 289 test('islower', u'aBc', 0)
 290 test('islower', u'abc\n', 1)
 291
 292 test('isupper', u'a', 0)
 293 test('isupper', u'A', 1)
 294 test('isupper', u'\n', 0)
 295 if sys.platform[:4] != 'java':
 296     test('isupper', u'\u1FFc', 0)
 297 test('isupper', u'ABC', 1)
 298 test('isupper', u'AbC', 0)
 299 test('isupper', u'ABC\n', 1)
 300
 301 test('istitle', u'a', 0)
 302 test('istitle', u'A', 1)
 303 test('istitle', u'\n', 0)
 304 test('istitle', u'\u1FFc', 1)
 305 test('istitle', u'A Titlecased Line', 1)
 306 test('istitle', u'A\nTitlecased Line', 1)
 307 test('istitle', u'A Titlecased, Line', 1)
 308 test('istitle', u'Greek \u1FFcitlecases ...', 1)
 309 test('istitle', u'Not a capitalized String', 0)
 310 test('istitle', u'Not\ta Titlecase String', 0)
 311 test('istitle', u'Not--a Titlecase String', 0)
 312
 313 test('isalpha', u'a', 1)
 314 test('isalpha', u'A', 1)
 315 test('isalpha', u'\n', 0)
 316 test('isalpha', u'\u1FFc', 1)
 317 test('isalpha', u'abc', 1)
 318 test('isalpha', u'aBc123', 0)
 319 test('isalpha', u'abc\n', 0)
 320
 321 test('isalnum', u'a', 1)
 322 test('isalnum', u'A', 1)
 323 test('isalnum', u'\n', 0)
 324 test('isalnum', u'123abc456', 1)
 325 test('isalnum', u'a1b3c', 1)
 326 test('isalnum', u'aBc000 ', 0)
 327 test('isalnum', u'abc\n', 0)
 328
 329 test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
 330 test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
 331 test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
 332 test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
 333 test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
 334 test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
 335 test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], 1)
 336
 337 test('translate', u"abababc", u'bbbc', {ord('a'):None})
 338 test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
 339 test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
 340
 341 # Contains:
 342 print 'Testing Unicode contains method...',
 343 verify(('a' in u'abdb') == 1)
 344 verify(('a' in u'bdab') == 1)
 345 verify(('a' in u'bdaba') == 1)
 346 verify(('a' in u'bdba') == 1)
 347 verify(('a' in u'bdba') == 1)
 348 verify((u'a' in u'bdba') == 1)
 349 verify((u'a' in u'bdb') == 0)
 350 verify((u'a' in 'bdb') == 0)
 351 verify((u'a' in 'bdba') == 1)
 352 verify((u'a' in ('a',1,None)) == 1)
 353 verify((u'a' in (1,None,'a')) == 1)
 354 verify((u'a' in (1,None,u'a')) == 1)
 355 verify(('a' in ('a',1,None)) == 1)
 356 verify(('a' in (1,None,'a')) == 1)
 357 verify(('a' in (1,None,u'a')) == 1)
 358 verify(('a' in ('x',1,u'y')) == 0)
 359 verify(('a' in ('x',1,None)) == 0)
 360 print 'done.'
 361
 362 # Formatting:
 363 print 'Testing Unicode formatting strings...',
 364 verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
 365 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000,  3.00')
 366 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000,  3.00')
 367 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000,  3.50')
 368 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000,  3.57')
 369 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
 370 verify(u"%c" % (u"a",) == u'a')
 371 verify(u"%c" % ("a",) == u'a')
 372 verify(u"%c" % (34,) == u'"')
 373 verify(u"%c" % (36,) == u'$')
 374 if sys.platform[:4] != 'java':
 375     value = u"%r, %r" % (u"abc", "abc")
 376     if value != u"u'abc', 'abc'":
 377         print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
 378
 379 verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
 380 try:
 381     value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
 382 except KeyError:
 383     print '*** formatting failed for "%s"' % "u'abc, def'"
 384 else:
 385     verify(value == u'abc, def')
 386
 387 # formatting jobs delegated from the string implementation:
 388 verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
 389 verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
 390 verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
 391 verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
 392 verify('...%(foo)s...' % {u'foo':u"abc",'def':123} ==  u'...abc...')
 393 verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
 394 verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
 395 verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
 396 verify('...%s...' % u"abc" == u'...abc...')
 397 verify('%*s' % (5,u'abc',) == u'  abc')
 398 verify('%*s' % (-5,u'abc',) == u'abc  ')
 399 verify('%*.*s' % (5,2,u'abc',) == u'   ab')
 400 verify('%*.*s' % (5,3,u'abc',) == u'  abc')
 401 verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10   abc')
 402 verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103   abc')
 403 print 'done.'
 404
 405 print 'Testing builtin unicode()...',
 406
 407 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
 408
 409 verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
 410
class UnicodeSubclass(unicode):
    # Trivial subclass of the builtin unicode type; it adds no attributes
    # or methods of its own.
    pass
 413
 414 verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
 415        == u'unicode subclass becomes unicode')
 416
 417 verify(unicode('strings are converted to unicode')
 418        == u'strings are converted to unicode')
 419
 420 class UnicodeCompat:
 421     def __init__(self, x):
 422         self.x = x
 423     def __unicode__(self):
 424         return self.x
 425
 426 verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
 427        == u'__unicode__ compatible objects are recognized')
 428
 429 class StringCompat:
 430     def __init__(self, x):
 431         self.x = x
 432     def __str__(self):
 433         return self.x
 434
 435 verify(unicode(StringCompat('__str__ compatible objects are recognized'))
 436        == u'__str__ compatible objects are recognized')
 437
 438 # unicode(obj) is compatible to str():
 439
 440 o = StringCompat('unicode(obj) is compatible to str()')
 441 verify(unicode(o) == u'unicode(obj) is compatible to str()')
 442 verify(str(o) == 'unicode(obj) is compatible to str()')
 443
 444 for obj in (123, 123.45, 123L):
 445     verify(unicode(obj) == unicode(str(obj)))
 446
 447 # unicode(obj, encoding, error) tests (this maps to
 448 # PyUnicode_FromEncodedObject() at C level)
 449
 450 if not sys.platform.startswith('java'):
 451     try:
 452         unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
 453     except TypeError:
 454         pass
 455     else:
 456         raise TestFailed, "decoding unicode should NOT be supported"
 457
 458 verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
 459        == u'strings are decoded to unicode')
 460
 461 if not sys.platform.startswith('java'):
 462     verify(unicode(buffer('character buffers are decoded to unicode'),
 463                    'utf-8', 'strict')
 464            == u'character buffers are decoded to unicode')
 465
 466 print 'done.'
 467
 468 # Test builtin codecs
 469 print 'Testing builtin codecs...',
 470
 471 # UTF-7 specific encoding tests:
 472 utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'),  # RFC2152 example
 473  (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
 474  (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
 475  (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
 476  (u'+', '+-'),
 477  (u'+-', '+--'),
 478  (u'+?', '+-?'),
 479  (u'\?', '+AFw?'),
 480  (u'+?', '+-?'),
 481  (ur'\\?', '+AFwAXA?'),
 482  (ur'\\\?', '+AFwAXABc?'),
 483  (ur'++--', '+-+---')]
 484
 485 for x,y in utfTests:
 486     verify( x.encode('utf-7') == y )
 487
 488 try:
 489     unicode('+3ADYAA-', 'utf-7') # surrogates not supported
 490 except UnicodeError:
 491     pass
 492 else:
 493     raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
 494
 495 verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
 496
 497 # UTF-8 specific encoding tests:
 498 verify(u'\u20ac'.encode('utf-8') == \
 499        ''.join((chr(0xe2), chr(0x82), chr(0xac))) )
 500 verify(u'\ud800\udc02'.encode('utf-8') == \
 501        ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))) )
 502 verify(u'\ud84d\udc56'.encode('utf-8') == \
 503        ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
 504 # UTF-8 specific decoding tests
 505 verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
 506                'utf-8') == u'\U00023456' )
 507 verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
 508                'utf-8') == u'\U00010002' )
 509 verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
 510                'utf-8') == u'\u20ac' )
 511
 512 # Other possible utf-8 test cases:
 513 # * strict decoding testing for all of the
 514 #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
 515
 516 verify(unicode('hello','ascii') == u'hello')
 517 verify(unicode('hello','utf-8') == u'hello')
 518 verify(unicode('hello','utf8') == u'hello')
 519 verify(unicode('hello','latin-1') == u'hello')
 520
 521 # Error handling
 522 try:
 523     u'Andr\202 x'.encode('ascii')
 524     u'Andr\202 x'.encode('ascii','strict')
 525 except ValueError:
 526     pass
 527 else:
 528     raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
 529 verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
 530 verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")
 531
 532 try:
 533     unicode('Andr\202 x','ascii')
 534     unicode('Andr\202 x','ascii','strict')
 535 except ValueError:
 536     pass
 537 else:
 538     raise TestFailed, "unicode('Andr\202') failed to raise an exception"
 539 verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
 540 verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
 541
 542 verify("\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx")
 543 try:
 544     "\\".decode("unicode-escape")
 545 except ValueError:
 546     pass
 547 else:
 548     raise TestFailed, '"\\".decode("unicode-escape") should fail'
 549
 550 verify(u'hello'.encode('ascii') == 'hello')
 551 verify(u'hello'.encode('utf-7') == 'hello')
 552 verify(u'hello'.encode('utf-8') == 'hello')
 553 verify(u'hello'.encode('utf8') == 'hello')
 554 verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
 555 verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
 556 verify(u'hello'.encode('latin-1') == 'hello')
 557
 558 # Roundtrip safety for BMP (just the first 1024 chars)
 559 u = u''.join(map(unichr, range(1024)))
 560 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
 561                  'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
 562     verify(unicode(u.encode(encoding),encoding) == u)
 563
 564 # Roundtrip safety for non-BMP (just a few chars)
 565 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
 566 for encoding in ('utf-8',
 567                  'utf-16', 'utf-16-le', 'utf-16-be',
 568                  #'raw_unicode_escape',
 569                  'unicode_escape', 'unicode_internal'):
 570     verify(unicode(u.encode(encoding),encoding) == u)
 571
 572 u = u''.join(map(unichr, range(256)))
 573 for encoding in (
 574     'latin-1',
 575     ):
 576     try:
 577         verify(unicode(u.encode(encoding),encoding) == u)
 578     except TestFailed:
 579         print '*** codec "%s" failed round-trip' % encoding
 580     except ValueError,why:
 581         print '*** codec for "%s" failed: %s' % (encoding, why)
 582
 583 u = u''.join(map(unichr, range(128)))
 584 for encoding in (
 585     'ascii',
 586     ):
 587     try:
 588         verify(unicode(u.encode(encoding),encoding) == u)
 589     except TestFailed:
 590         print '*** codec "%s" failed round-trip' % encoding
 591     except ValueError,why:
 592         print '*** codec for "%s" failed: %s' % (encoding, why)
 593
 594 print 'done.'
 595
 596 print 'Testing standard mapping codecs...',
 597
 598 print '0-127...',
 599 s = ''.join(map(chr, range(128)))
 600 for encoding in (
 601     'cp037', 'cp1026',
 602     'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
 603     'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
 604     'cp863', 'cp865', 'cp866',
 605     'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
 606     'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
 607     'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
 608     'mac_cyrillic', 'mac_latin2',
 609
 610     'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
 611     'cp1256', 'cp1257', 'cp1258',
 612     'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
 613
 614     'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
 615     'cp1006', 'iso8859_8',
 616
 617     ### These have undefined mappings:
 618     #'cp424',
 619
 620     ### These fail the round-trip:
 621     #'cp875'
 622
 623     ):
 624     try:
 625         verify(unicode(s,encoding).encode(encoding) == s)
 626     except TestFailed:
 627         print '*** codec "%s" failed round-trip' % encoding
 628     except ValueError,why:
 629         print '*** codec for "%s" failed: %s' % (encoding, why)
 630
 631 print '128-255...',
 632 s = ''.join(map(chr, range(128,256)))
 633 for encoding in (
 634     'cp037', 'cp1026',
 635     'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
 636     'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
 637     'cp863', 'cp865', 'cp866',
 638     'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
 639     'iso8859_2', 'iso8859_4', 'iso8859_5',
 640     'iso8859_9', 'koi8_r', 'latin_1',
 641     'mac_cyrillic', 'mac_latin2',
 642
 643     ### These have undefined mappings:
 644     #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
 645     #'cp1256', 'cp1257', 'cp1258',
 646     #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
 647     #'iso8859_3', 'iso8859_6', 'iso8859_7',
 648     #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
 649
 650     ### These fail the round-trip:
 651     #'cp1006', 'cp875', 'iso8859_8',
 652
 653     ):
 654     try:
 655         verify(unicode(s,encoding).encode(encoding) == s)
 656     except TestFailed:
 657         print '*** codec "%s" failed round-trip' % encoding
 658     except ValueError,why:
 659         print '*** codec for "%s" failed: %s' % (encoding, why)
 660
 661 print 'done.'
 662
 663 print 'Testing Unicode string concatenation...',
 664 verify((u"abc" u"def") == u"abcdef")
 665 verify(("abc" u"def") == u"abcdef")
 666 verify((u"abc" "def") == u"abcdef")
 667 verify((u"abc" u"def" "ghi") == u"abcdefghi")
 668 verify(("abc" "def" u"ghi") == u"abcdefghi")
 669 print 'done.'
 670
 671 print 'Testing Unicode printing...',
 672 print u'abc'
 673 print u'abc', u'def'
 674 print u'abc', 'def'
 675 print 'abc', u'def'
 676 print u'abc\n'
 677 print u'abc\n',
 678 print u'abc\n',
 679 print u'def\n'
 680 print u'def\n'
 681 print 'done.'