# This commit was manufactured by cvs2svn to create tag 'r221'.
# [python/dscho.git] / Lib / test / test_unicode.py
# blob df5d6159ba0b611a1f2316fdc7bcf62029d6ec0c
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
8 from test_support import verify, verbose, TestFailed
9 import sys
if not sys.platform.startswith('java'):
    # Test basic sanity of repr()
    verify(repr(u'abc') == "u'abc'")
    verify(repr(u'ab\\c') == "u'ab\\\\c'")
    verify(repr(u'ab\\') == "u'ab\\\\'")
    verify(repr(u'\\c') == "u'\\\\c'")
    verify(repr(u'\\') == "u'\\\\'")
    verify(repr(u'\n') == "u'\\n'")
    verify(repr(u'\r') == "u'\\r'")
    verify(repr(u'\t') == "u'\\t'")
    verify(repr(u'\b') == "u'\\x08'")
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'") == '''u"'"''')
    verify(repr(u'"') == """u'"'""")
    # repr() of all 256 Latin-1 code points in one string.
    verify(repr(u''.join(map(unichr, range(256)))) ==
       "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
       "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
       "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
       "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
       "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
       "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
       "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
       "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
       "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
       "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
       "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
       "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
       "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
       "\\xfe\\xff'")
42 def test(method, input, output, *args):
43 if verbose:
44 print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
45 try:
46 f = getattr(input, method)
47 value = apply(f, args)
48 except:
49 value = sys.exc_type
50 exc = sys.exc_info()[:2]
51 else:
52 exc = None
53 if value != output or type(value) is not type(output):
54 if verbose:
55 print 'no'
56 print '*',f, `input`, `output`, `value`
57 if exc:
58 print ' value == %s: %s' % (exc)
59 else:
60 if verbose:
61 print 'yes'
# capitalize()
test('capitalize', u' hello ', u' hello ')
test('capitalize', u'hello ', u'Hello ')
test('capitalize', u'aaaa', u'Aaaa')
test('capitalize', u'AaAa', u'Aaaa')

# count(), including mixed str/unicode operands
test('count', u'aaa', 3, u'a')
test('count', u'aaa', 0, u'b')
test('count', 'aaa', 3, u'a')
test('count', 'aaa', 0, u'b')
test('count', u'aaa', 3, 'a')
test('count', u'aaa', 0, 'b')

# title()
test('title', u' hello ', u' Hello ')
test('title', u'hello ', u'Hello ')
test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
test('title', u"getInt", u'Getint')

# find() / rfind()
test('find', u'abcdefghiabc', 0, u'abc')
test('find', u'abcdefghiabc', 9, u'abc', 1)
test('find', u'abcdefghiabc', -1, u'def', 4)

test('rfind', u'abcdefghiabc', 9, u'abc')

# lower() / upper()
test('lower', u'HeLLo', u'hello')
test('lower', u'hello', u'hello')

test('upper', u'HeLLo', u'HELLO')
test('upper', u'HELLO', u'HELLO')
# Disabled: these calls sit behind "if 0" and never run.
if 0:
    transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'

    test('maketrans', u'abc', transtable, u'xyz')
    test('maketrans', u'abc', ValueError, u'xyzq')
# split(): whitespace splitting, explicit separators, maxsplit and
# mixed str/unicode operands
test('split', u'this is the split function',
     [u'this', u'is', u'the', u'split', u'function'])
test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
test('split', u'a b c d', [u'a', u'b c d'], None, 1)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
test('split', u'a b c d', [u'a b c d'], None, 0)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'endcase test', [u'endcase ', u''], u'test')
test('split', u'endcase test', [u'endcase ', u''], 'test')
test('split', 'endcase test', [u'endcase ', u''], u'test')
# join now works with any sequence type
class Sequence:
    """Minimal sequence wrapper exposing only __len__ and __getitem__.

    It lets the join() tests pass something that is a sequence without
    being a list or a tuple.
    """

    def __init__(self, seq):
        # seq is the wrapped object; it is stored as given, not copied.
        self.seq = seq

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, i):
        # Indexing (and any IndexError) is delegated to the wrapped object.
        return self.seq[i]
124 test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
125 test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
126 test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
127 test('join', u' ', u'w x y z', Sequence('wxyz'))
128 test('join', u' ', TypeError, 7)
129 test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
130 test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
131 test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
132 test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
133 test('join', ' ', u'w x y z', Sequence(u'wxyz'))
134 test('join', ' ', TypeError, 7)
136 result = u''
137 for i in range(10):
138 if i > 0:
139 result = result + u':'
140 result = result + u'x'*10
141 test('join', u':', result, [u'x' * 10] * 10)
142 test('join', u':', result, (u'x' * 10,) * 10)
# strip() family
test('strip', u' hello ', u'hello')
test('lstrip', u' hello ', u'hello ')
test('rstrip', u' hello ', u' hello')
test('strip', u'hello', u'hello')

test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')

# Disabled: these calls sit behind "if 0" and never run.
if 0:
    test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')

    table = string.maketrans('a', u'A')
    test('translate', u'abc', u'Abc', table)
    test('translate', u'xyz', u'xyz', table)
# replace(), with and without a maximum count
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
test('replace', u'one!two!three!', u'onetwothree', '!', '')
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)

# startswith(), with optional start/end offsets
test('startswith', u'hello', 1, u'he')
test('startswith', u'hello', 1, u'hello')
test('startswith', u'hello', 0, u'hello world')
test('startswith', u'hello', 1, u'')
test('startswith', u'hello', 0, u'ello')
test('startswith', u'hello', 1, u'ello', 1)
test('startswith', u'hello', 1, u'o', 4)
test('startswith', u'hello', 0, u'o', 5)
test('startswith', u'hello', 1, u'', 5)
test('startswith', u'hello', 0, u'lo', 6)
test('startswith', u'helloworld', 1, u'lowo', 3)
test('startswith', u'helloworld', 1, u'lowo', 3, 7)
test('startswith', u'helloworld', 0, u'lowo', 3, 6)

# endswith(), with optional start/end offsets
test('endswith', u'hello', 1, u'lo')
test('endswith', u'hello', 0, u'he')
test('endswith', u'hello', 1, u'')
test('endswith', u'hello', 0, u'hello world')
test('endswith', u'helloworld', 0, u'worl')
test('endswith', u'helloworld', 1, u'worl', 3, 9)
test('endswith', u'helloworld', 1, u'world', 3, 12)
test('endswith', u'helloworld', 1, u'lowo', 1, 7)
test('endswith', u'helloworld', 1, u'lowo', 2, 7)
test('endswith', u'helloworld', 1, u'lowo', 3, 7)
test('endswith', u'helloworld', 0, u'lowo', 4, 7)
test('endswith', u'helloworld', 0, u'lowo', 3, 8)
test('endswith', u'ab', 0, u'ab', 0, 1)
test('endswith', u'ab', 0, u'ab', 0, 0)
# expandtabs(): the padding is written as explicit repetitions so the
# expected values stay correct even if whitespace runs get collapsed.
# The column restarts after '\r' and '\n'.  With the default tab size
# of 8, "ab" is padded with 6 spaces and "g" with 7; with a tab size
# of 4 the padding is 2 and 3 spaces respectively.
test('expandtabs', u'abc\rab\tdef\ng\thi',
     u'abc\rab' + u' ' * 6 + u'def\ng' + u' ' * 7 + u'hi')
test('expandtabs', u'abc\rab\tdef\ng\thi',
     u'abc\rab' + u' ' * 6 + u'def\ng' + u' ' * 7 + u'hi', 8)
test('expandtabs', u'abc\rab\tdef\ng\thi',
     u'abc\rab' + u' ' * 2 + u'def\ng' + u' ' * 3 + u'hi', 4)
test('expandtabs', u'abc\r\nab\tdef\ng\thi',
     u'abc\r\nab' + u' ' * 2 + u'def\ng' + u' ' * 3 + u'hi', 4)
# Disabled: these calls sit behind "if 0" and never run.
if 0:
    test('capwords', u'abc def ghi', u'Abc Def Ghi')
    test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
    test('capwords', u'abc\t def \nghi', u'Abc Def Ghi')
207 # Comparisons:
208 print 'Testing Unicode comparisons...',
209 verify(u'abc' == 'abc')
210 verify('abc' == u'abc')
211 verify(u'abc' == u'abc')
212 verify(u'abcd' > 'abc')
213 verify('abcd' > u'abc')
214 verify(u'abcd' > u'abc')
215 verify(u'abc' < 'abcd')
216 verify('abc' < u'abcd')
217 verify(u'abc' < u'abcd')
218 print 'done.'
220 if 0:
221 # Move these tests to a Unicode collation module test...
223 print 'Testing UTF-16 code point order comparisons...',
224 #No surrogates, no fixup required.
225 verify(u'\u0061' < u'\u20ac')
226 # Non surrogate below surrogate value, no fixup required
227 verify(u'\u0061' < u'\ud800\udc02')
229 # Non surrogate above surrogate value, fixup required
230 def test_lecmp(s, s2):
231 verify(s < s2 , "comparison failed on %s < %s" % (s, s2))
233 def test_fixup(s):
234 s2 = u'\ud800\udc01'
235 test_lecmp(s, s2)
236 s2 = u'\ud900\udc01'
237 test_lecmp(s, s2)
238 s2 = u'\uda00\udc01'
239 test_lecmp(s, s2)
240 s2 = u'\udb00\udc01'
241 test_lecmp(s, s2)
242 s2 = u'\ud800\udd01'
243 test_lecmp(s, s2)
244 s2 = u'\ud900\udd01'
245 test_lecmp(s, s2)
246 s2 = u'\uda00\udd01'
247 test_lecmp(s, s2)
248 s2 = u'\udb00\udd01'
249 test_lecmp(s, s2)
250 s2 = u'\ud800\ude01'
251 test_lecmp(s, s2)
252 s2 = u'\ud900\ude01'
253 test_lecmp(s, s2)
254 s2 = u'\uda00\ude01'
255 test_lecmp(s, s2)
256 s2 = u'\udb00\ude01'
257 test_lecmp(s, s2)
258 s2 = u'\ud800\udfff'
259 test_lecmp(s, s2)
260 s2 = u'\ud900\udfff'
261 test_lecmp(s, s2)
262 s2 = u'\uda00\udfff'
263 test_lecmp(s, s2)
264 s2 = u'\udb00\udfff'
265 test_lecmp(s, s2)
267 test_fixup(u'\ue000')
268 test_fixup(u'\uff61')
270 # Surrogates on both sides, no fixup required
271 verify(u'\ud800\udc02' < u'\ud84d\udc56')
272 print 'done.'
# ljust() / rjust() / center(): the padding is written as explicit
# repetitions so the expected widths cannot be lost to whitespace
# collapsing.  When the padding is odd, center() puts the extra space
# on the right for these inputs.
test('ljust', u'abc', u'abc' + u' ' * 7, 10)
test('rjust', u'abc', u' ' * 7 + u'abc', 10)
test('center', u'abc', u' ' * 3 + u'abc' + u' ' * 4, 10)
test('ljust', u'abc', u'abc' + u' ' * 3, 6)
test('rjust', u'abc', u' ' * 3 + u'abc', 6)
test('center', u'abc', u' ' * 1 + u'abc' + u' ' * 2, 6)
# A width smaller than the string leaves it unchanged.
test('ljust', u'abc', u'abc', 2)
test('rjust', u'abc', u'abc', 2)
test('center', u'abc', u'abc', 2)
# islower()
test('islower', u'a', 1)
test('islower', u'A', 0)
test('islower', u'\n', 0)
test('islower', u'\u1FFc', 0)
test('islower', u'abc', 1)
test('islower', u'aBc', 0)
test('islower', u'abc\n', 1)

# isupper()
test('isupper', u'a', 0)
test('isupper', u'A', 1)
test('isupper', u'\n', 0)
if sys.platform[:4] != 'java':
    test('isupper', u'\u1FFc', 0)
test('isupper', u'ABC', 1)
test('isupper', u'AbC', 0)
test('isupper', u'ABC\n', 1)

# istitle()
test('istitle', u'a', 0)
test('istitle', u'A', 1)
test('istitle', u'\n', 0)
test('istitle', u'\u1FFc', 1)
test('istitle', u'A Titlecased Line', 1)
test('istitle', u'A\nTitlecased Line', 1)
test('istitle', u'A Titlecased, Line', 1)
test('istitle', u'Greek \u1FFcitlecases ...', 1)
test('istitle', u'Not a capitalized String', 0)
test('istitle', u'Not\ta Titlecase String', 0)
test('istitle', u'Not--a Titlecase String', 0)

# isalpha()
test('isalpha', u'a', 1)
test('isalpha', u'A', 1)
test('isalpha', u'\n', 0)
test('isalpha', u'\u1FFc', 1)
test('isalpha', u'abc', 1)
test('isalpha', u'aBc123', 0)
test('isalpha', u'abc\n', 0)

# isalnum()
test('isalnum', u'a', 1)
test('isalnum', u'A', 1)
test('isalnum', u'\n', 0)
test('isalnum', u'123abc456', 1)
test('isalnum', u'a1b3c', 1)
test('isalnum', u'aBc000 ', 0)
test('isalnum', u'abc\n', 0)
# splitlines(); the final call passes 1 to keep the line endings
test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], 1)

# translate() with a mapping: None deletes, an ordinal or a unicode
# string replaces
test('translate', u"abababc", u'bbbc', {ord('a'):None})
test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
341 # Contains:
342 print 'Testing Unicode contains method...',
343 verify(('a' in u'abdb') == 1)
344 verify(('a' in u'bdab') == 1)
345 verify(('a' in u'bdaba') == 1)
346 verify(('a' in u'bdba') == 1)
347 verify(('a' in u'bdba') == 1)
348 verify((u'a' in u'bdba') == 1)
349 verify((u'a' in u'bdb') == 0)
350 verify((u'a' in 'bdb') == 0)
351 verify((u'a' in 'bdba') == 1)
352 verify((u'a' in ('a',1,None)) == 1)
353 verify((u'a' in (1,None,'a')) == 1)
354 verify((u'a' in (1,None,u'a')) == 1)
355 verify(('a' in ('a',1,None)) == 1)
356 verify(('a' in (1,None,'a')) == 1)
357 verify(('a' in (1,None,u'a')) == 1)
358 verify(('a' in ('x',1,u'y')) == 0)
359 verify(('a' in ('x',1,None)) == 0)
360 print 'done.'
362 # Formatting:
363 print 'Testing Unicode formatting strings...',
364 verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
365 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000, 3.00')
366 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000, 3.00')
367 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000, 3.50')
368 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000, 3.57')
369 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
370 verify(u"%c" % (u"a",) == u'a')
371 verify(u"%c" % ("a",) == u'a')
372 verify(u"%c" % (34,) == u'"')
373 verify(u"%c" % (36,) == u'$')
374 if sys.platform[:4] != 'java':
375 value = u"%r, %r" % (u"abc", "abc")
376 if value != u"u'abc', 'abc'":
377 print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
379 verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
380 try:
381 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
382 except KeyError:
383 print '*** formatting failed for "%s"' % "u'abc, def'"
384 else:
385 verify(value == u'abc, def')
387 # formatting jobs delegated from the string implementation:
388 verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
389 verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
390 verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
391 verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
392 verify('...%(foo)s...' % {u'foo':u"abc",'def':123} == u'...abc...')
393 verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
394 verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
395 verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
396 verify('...%s...' % u"abc" == u'...abc...')
397 verify('%*s' % (5,u'abc',) == u' abc')
398 verify('%*s' % (-5,u'abc',) == u'abc ')
399 verify('%*.*s' % (5,2,u'abc',) == u' ab')
400 verify('%*.*s' % (5,3,u'abc',) == u' abc')
401 verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10 abc')
402 verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
403 print 'done.'
405 print 'Testing builtin unicode()...',
407 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
409 verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
class UnicodeSubclass(unicode):
    # Empty subclass of unicode; it adds nothing to the base type.
    pass
verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
       == u'unicode subclass becomes unicode')

verify(unicode('strings are converted to unicode')
       == u'strings are converted to unicode')
class UnicodeCompat:
    """Object that supplies its unicode() conversion via __unicode__."""

    def __init__(self, x):
        # x is the value handed back by __unicode__().
        self.x = x

    def __unicode__(self):
        return self.x
verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
       == u'__unicode__ compatible objects are recognized')
class StringCompat:
    """Object that only defines __str__ and has no __unicode__ method."""

    def __init__(self, x):
        # x is the value handed back by __str__().
        self.x = x

    def __str__(self):
        return self.x
435 verify(unicode(StringCompat('__str__ compatible objects are recognized'))
436 == u'__str__ compatible objects are recognized')
438 # unicode(obj) is compatible to str():
440 o = StringCompat('unicode(obj) is compatible to str()')
441 verify(unicode(o) == u'unicode(obj) is compatible to str()')
442 verify(str(o) == 'unicode(obj) is compatible to str()')
444 for obj in (123, 123.45, 123L):
445 verify(unicode(obj) == unicode(str(obj)))
447 # unicode(obj, encoding, error) tests (this maps to
448 # PyUnicode_FromEncodedObject() at C level)
450 if not sys.platform.startswith('java'):
451 try:
452 unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
453 except TypeError:
454 pass
455 else:
456 raise TestFailed, "decoding unicode should NOT be supported"
458 verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
459 == u'strings are decoded to unicode')
461 if not sys.platform.startswith('java'):
462 verify(unicode(buffer('character buffers are decoded to unicode'),
463 'utf-8', 'strict')
464 == u'character buffers are decoded to unicode')
466 print 'done.'
468 # Test builtin codecs
469 print 'Testing builtin codecs...',
471 # UTF-7 specific encoding tests:
472 utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
473 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
474 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
475 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
476 (u'+', '+-'),
477 (u'+-', '+--'),
478 (u'+?', '+-?'),
479 (u'\?', '+AFw?'),
480 (u'+?', '+-?'),
481 (ur'\\?', '+AFwAXA?'),
482 (ur'\\\?', '+AFwAXABc?'),
483 (ur'++--', '+-+---')]
485 for x,y in utfTests:
486 verify( x.encode('utf-7') == y )
488 try:
489 unicode('+3ADYAA-', 'utf-7') # surrogates not supported
490 except UnicodeError:
491 pass
492 else:
493 raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
495 verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
# UTF-8 specific encoding tests:
verify(u'\u20ac'.encode('utf-8') == \
       ''.join((chr(0xe2), chr(0x82), chr(0xac))) )
verify(u'\ud800\udc02'.encode('utf-8') == \
       ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))) )
verify(u'\ud84d\udc56'.encode('utf-8') == \
       ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
# UTF-8 specific decoding tests
verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
               'utf-8') == u'\U00023456' )
verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
               'utf-8') == u'\U00010002' )
verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
               'utf-8') == u'\u20ac' )

# Other possible utf-8 test cases:
# * strict decoding testing for all of the
#   UTF8_ERROR cases in PyUnicode_DecodeUTF8

# Plain ASCII text decodes identically under each of these codecs.
verify(unicode('hello','ascii') == u'hello')
verify(unicode('hello','utf-8') == u'hello')
verify(unicode('hello','utf8') == u'hello')
verify(unicode('hello','latin-1') == u'hello')
521 # Error handling
522 try:
523 u'Andr\202 x'.encode('ascii')
524 u'Andr\202 x'.encode('ascii','strict')
525 except ValueError:
526 pass
527 else:
528 raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
529 verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
530 verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")
532 try:
533 unicode('Andr\202 x','ascii')
534 unicode('Andr\202 x','ascii','strict')
535 except ValueError:
536 pass
537 else:
538 raise TestFailed, "unicode('Andr\202') failed to raise an exception"
539 verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
540 verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
542 verify("\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx")
543 try:
544 "\\".decode("unicode-escape")
545 except ValueError:
546 pass
547 else:
548 raise TestFailed, '"\\".decode("unicode-escape") should fail'
# Encoding plain ASCII text
verify(u'hello'.encode('ascii') == 'hello')
verify(u'hello'.encode('utf-7') == 'hello')
verify(u'hello'.encode('utf-8') == 'hello')
verify(u'hello'.encode('utf8') == 'hello')
verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
verify(u'hello'.encode('latin-1') == 'hello')

# Roundtrip safety for BMP (just the first 1024 chars)
u = u''.join(map(unichr, range(1024)))
for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)

# Roundtrip safety for non-BMP (just a few chars)
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
for encoding in ('utf-8',
                 'utf-16', 'utf-16-le', 'utf-16-be',
                 #'raw_unicode_escape',
                 'unicode_escape', 'unicode_internal'):
    verify(unicode(u.encode(encoding),encoding) == u)
572 u = u''.join(map(unichr, range(256)))
573 for encoding in (
574 'latin-1',
576 try:
577 verify(unicode(u.encode(encoding),encoding) == u)
578 except TestFailed:
579 print '*** codec "%s" failed round-trip' % encoding
580 except ValueError,why:
581 print '*** codec for "%s" failed: %s' % (encoding, why)
583 u = u''.join(map(unichr, range(128)))
584 for encoding in (
585 'ascii',
587 try:
588 verify(unicode(u.encode(encoding),encoding) == u)
589 except TestFailed:
590 print '*** codec "%s" failed round-trip' % encoding
591 except ValueError,why:
592 print '*** codec for "%s" failed: %s' % (encoding, why)
594 print 'done.'
596 print 'Testing standard mapping codecs...',
598 print '0-127...',
599 s = ''.join(map(chr, range(128)))
600 for encoding in (
601 'cp037', 'cp1026',
602 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
603 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
604 'cp863', 'cp865', 'cp866',
605 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
606 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
607 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
608 'mac_cyrillic', 'mac_latin2',
610 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
611 'cp1256', 'cp1257', 'cp1258',
612 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
614 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
615 'cp1006', 'iso8859_8',
617 ### These have undefined mappings:
618 #'cp424',
620 ### These fail the round-trip:
621 #'cp875'
624 try:
625 verify(unicode(s,encoding).encode(encoding) == s)
626 except TestFailed:
627 print '*** codec "%s" failed round-trip' % encoding
628 except ValueError,why:
629 print '*** codec for "%s" failed: %s' % (encoding, why)
631 print '128-255...',
632 s = ''.join(map(chr, range(128,256)))
633 for encoding in (
634 'cp037', 'cp1026',
635 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
636 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
637 'cp863', 'cp865', 'cp866',
638 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
639 'iso8859_2', 'iso8859_4', 'iso8859_5',
640 'iso8859_9', 'koi8_r', 'latin_1',
641 'mac_cyrillic', 'mac_latin2',
643 ### These have undefined mappings:
644 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
645 #'cp1256', 'cp1257', 'cp1258',
646 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
647 #'iso8859_3', 'iso8859_6', 'iso8859_7',
648 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
650 ### These fail the round-trip:
651 #'cp1006', 'cp875', 'iso8859_8',
654 try:
655 verify(unicode(s,encoding).encode(encoding) == s)
656 except TestFailed:
657 print '*** codec "%s" failed round-trip' % encoding
658 except ValueError,why:
659 print '*** codec for "%s" failed: %s' % (encoding, why)
661 print 'done.'
663 print 'Testing Unicode string concatenation...',
664 verify((u"abc" u"def") == u"abcdef")
665 verify(("abc" u"def") == u"abcdef")
666 verify((u"abc" "def") == u"abcdef")
667 verify((u"abc" u"def" "ghi") == u"abcdefghi")
668 verify(("abc" "def" u"ghi") == u"abcdefghi")
669 print 'done.'
671 print 'Testing Unicode printing...',
672 print u'abc'
673 print u'abc', u'def'
674 print u'abc', 'def'
675 print 'abc', u'def'
676 print u'abc\n'
677 print u'abc\n',
678 print u'abc\n',
679 print u'def\n'
680 print u'def\n'
681 print 'done.'