- Got rid of newmodule.c
[python/dscho.git] / Lib / test / test_unicode.py
blob546505176f2b0d5cb1c0f15764c7487fc6a9ab4b
1 """ Test script for the Unicode implementation.
3 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7 """#"
8 from test_support import verify, verbose, TestFailed
9 import sys, string
if not sys.platform.startswith('java'):
    # repr() sanity checks for unicode objects.  The whole group is
    # skipped when the platform name starts with 'java'.

    # Plain text and backslashes.
    verify(repr(u'abc') == "u'abc'")
    verify(repr(u'ab\\c') == "u'ab\\\\c'")
    verify(repr(u'ab\\') == "u'ab\\\\'")
    verify(repr(u'\\c') == "u'\\\\c'")
    verify(repr(u'\\') == "u'\\\\'")

    # Control characters.
    verify(repr(u'\n') == "u'\\n'")
    verify(repr(u'\r') == "u'\\r'")
    verify(repr(u'\t') == "u'\\t'")
    verify(repr(u'\b') == "u'\\x08'")

    # Quote characters.
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'\"") == """u'\\'"'""")
    verify(repr(u"'") == '''u"'"''')
    verify(repr(u'"') == """u'"'""")

    # Expected repr() of the first 256 code points joined into one string.
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    testrepr = repr(u''.join(map(unichr, range(256))))
    verify(testrepr == latin1repr)
44 def test(method, input, output, *args):
45 if verbose:
46 print '%s.%s%s =? %s... ' % (repr(input), method, args, repr(output)),
47 try:
48 f = getattr(input, method)
49 value = apply(f, args)
50 except:
51 value = sys.exc_type
52 exc = sys.exc_info()[:2]
53 else:
54 exc = None
55 if value == output and type(value) is type(output):
56 # if the original is returned make sure that
57 # this doesn't happen with subclasses
58 if value is input:
59 class usub(unicode):
60 def __repr__(self):
61 return 'usub(%r)' % unicode.__repr__(self)
62 input = usub(input)
63 try:
64 f = getattr(input, method)
65 value = apply(f, args)
66 except:
67 value = sys.exc_type
68 exc = sys.exc_info()[:2]
69 if value is input:
70 if verbose:
71 print 'no'
72 print '*',f, `input`, `output`, `value`
73 return
74 if value != output or type(value) is not type(output):
75 if verbose:
76 print 'no'
77 print '*',f, `input`, `output`, `value`
78 if exc:
79 print ' value == %s: %s' % (exc)
80 else:
81 if verbose:
82 print 'yes'
# capitalize()
test('capitalize', u' hello ', u' hello ')
test('capitalize', u'Hello ', u'Hello ')
test('capitalize', u'hello ', u'Hello ')
test('capitalize', u'aaaa', u'Aaaa')
test('capitalize', u'AaAa', u'Aaaa')

# count(), mixing unicode and str receivers/arguments
test('count', u'aaa', 3, u'a')
test('count', u'aaa', 0, u'b')
test('count', 'aaa', 3, u'a')
test('count', 'aaa', 0, u'b')
test('count', u'aaa', 3, 'a')
test('count', u'aaa', 0, 'b')

# title()
test('title', u' hello ', u' Hello ')
test('title', u'Hello ', u'Hello ')
test('title', u'hello ', u'Hello ')
test('title', u"fOrMaT thIs aS titLe String", u'Format This As Title String')
test('title', u"fOrMaT,thIs-aS*titLe;String", u'Format,This-As*Title;String')
test('title', u"getInt", u'Getint')

# find() / rfind()
test('find', u'abcdefghiabc', 0, u'abc')
test('find', u'abcdefghiabc', 9, u'abc', 1)
test('find', u'abcdefghiabc', -1, u'def', 4)

test('rfind', u'abcdefghiabc', 9, u'abc')

# lower() / upper()
test('lower', u'HeLLo', u'hello')
test('lower', u'hello', u'hello')

test('upper', u'HeLLo', u'HELLO')
test('upper', u'HELLO', u'HELLO')
if 0:
    # Disabled: 256-entry translation table that maps 'abc' to 'xyz'.
    transtable = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`xyzdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377'

    test('maketrans', u'abc', transtable, u'xyz')
    test('maketrans', u'abc', ValueError, u'xyzq')
# split() on whitespace (no separator given)
test('split', u'this is the split function',
     [u'this', u'is', u'the', u'split', u'function'])

# split() with an explicit separator and/or maxsplit
test('split', u'a|b|c|d', [u'a', u'b', u'c', u'd'], u'|')
test('split', u'a|b|c|d', [u'a', u'b', u'c|d'], u'|', 2)
test('split', u'a b c d', [u'a', u'b c d'], None, 1)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 3)
test('split', u'a b c d', [u'a', u'b', u'c', u'd'], None, 4)
test('split', u'a b c d', [u'a b c d'], None, 0)
test('split', u'a b c d', [u'a', u'b', u'c d'], None, 2)
test('split', u'a b c d ', [u'a', u'b', u'c', u'd'])

# Multi-character separators, mixing unicode and str operands
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')
test('split', u'a//b//c//d', [u'a', u'b', u'c', u'd'], '//')
test('split', 'a//b//c//d', [u'a', u'b', u'c', u'd'], u'//')

# Separator found at the very end of the string
test('split', u'endcase test', [u'endcase ', u''], u'test')
test('split', u'endcase test', [u'endcase ', u''], 'test')
test('split', 'endcase test', [u'endcase ', u''], u'test')
# join now works with any sequence type
class Sequence:
    """Minimal sequence wrapper: supports only len() and indexing.

    Used by the join tests to pass an argument that is neither a list
    nor a tuple.
    """

    def __init__(self, seq):
        # Keep a reference to the wrapped sequence; it is not copied.
        self.seq = seq

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, i):
        return self.seq[i]
147 test('join', u' ', u'a b c d', [u'a', u'b', u'c', u'd'])
148 test('join', u' ', u'a b c d', ['a', 'b', u'c', u'd'])
149 test('join', u'', u'abcd', (u'a', u'b', u'c', u'd'))
150 test('join', u' ', u'w x y z', Sequence('wxyz'))
151 test('join', u' ', TypeError, 7)
152 test('join', u' ', TypeError, Sequence([7, u'hello', 123L]))
153 test('join', ' ', u'a b c d', [u'a', u'b', u'c', u'd'])
154 test('join', ' ', u'a b c d', ['a', 'b', u'c', u'd'])
155 test('join', '', u'abcd', (u'a', u'b', u'c', u'd'))
156 test('join', ' ', u'w x y z', Sequence(u'wxyz'))
157 test('join', ' ', TypeError, 7)
159 result = u''
160 for i in range(10):
161 if i > 0:
162 result = result + u':'
163 result = result + u'x'*10
164 test('join', u':', result, [u'x' * 10] * 10)
165 test('join', u':', result, (u'x' * 10,) * 10)
# strip/lstrip/rstrip without an argument
test('strip', u' hello ', u'hello')
test('lstrip', u' hello ', u'hello ')
test('rstrip', u' hello ', u' hello')
test('strip', u'hello', u'hello')

# strip/lstrip/rstrip with None arg
test('strip', u' hello ', u'hello', None)
test('lstrip', u' hello ', u'hello ', None)
test('rstrip', u' hello ', u' hello', None)
test('strip', u'hello', u'hello', None)

# strip/lstrip/rstrip with unicode arg
test('strip', u'xyzzyhelloxyzzy', u'hello', u'xyz')
test('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', u'xyz')
test('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', u'xyz')
test('strip', u'hello', u'hello', u'xyz')

# strip/lstrip/rstrip with str arg
test('strip', u'xyzzyhelloxyzzy', u'hello', 'xyz')
test('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', 'xyz')
test('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', 'xyz')
test('strip', u'hello', u'hello', 'xyz')

# swapcase()
test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS')
if 0:
    # Disabled: translate() with a str translation table.  The first call
    # relies on `transtable`, which is only defined in the other disabled
    # (if 0) section of this file.
    test('translate', u'xyzabcdef', u'xyzxyz', transtable, u'def')

    table = string.maketrans('a', u'A')
    test('translate', u'abc', u'Abc', table)
    test('translate', u'xyz', u'xyz', table)
# replace(), with and without a maximum replacement count
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
test('replace', u'one!two!three!', u'onetwothree', '!', '')
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
test('replace', u'one!two!three!', u'one!two!three!', u'!', u'@', 0)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@')
test('replace', u'one!two!three!', u'one!two!three!', u'x', u'@', 2)

# startswith(), optionally with start/end positions
test('startswith', u'hello', True, u'he')
test('startswith', u'hello', True, u'hello')
test('startswith', u'hello', False, u'hello world')
test('startswith', u'hello', True, u'')
test('startswith', u'hello', False, u'ello')
test('startswith', u'hello', True, u'ello', 1)
test('startswith', u'hello', True, u'o', 4)
test('startswith', u'hello', False, u'o', 5)
test('startswith', u'hello', True, u'', 5)
test('startswith', u'hello', False, u'lo', 6)
test('startswith', u'helloworld', True, u'lowo', 3)
test('startswith', u'helloworld', True, u'lowo', 3, 7)
test('startswith', u'helloworld', False, u'lowo', 3, 6)

# endswith(), optionally with start/end positions
test('endswith', u'hello', True, u'lo')
test('endswith', u'hello', False, u'he')
test('endswith', u'hello', True, u'')
test('endswith', u'hello', False, u'hello world')
test('endswith', u'helloworld', False, u'worl')
test('endswith', u'helloworld', True, u'worl', 3, 9)
test('endswith', u'helloworld', True, u'world', 3, 12)
test('endswith', u'helloworld', True, u'lowo', 1, 7)
test('endswith', u'helloworld', True, u'lowo', 2, 7)
test('endswith', u'helloworld', True, u'lowo', 3, 7)
test('endswith', u'helloworld', False, u'lowo', 4, 7)
test('endswith', u'helloworld', False, u'lowo', 3, 8)
test('endswith', u'ab', False, u'ab', 0, 1)
test('endswith', u'ab', False, u'ab', 0, 0)
# expandtabs(): each tab is replaced by enough spaces to reach the next
# multiple of the tab size (default 8); the column count restarts after
# '\r' and '\n'.  The expected strings therefore contain runs of spaces:
# 'ab' + tab -> 6 spaces (size 8) or 2 spaces (size 4),
# 'g'  + tab -> 7 spaces (size 8) or 3 spaces (size 4).
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi')
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab      def\ng       hi', 8)
test('expandtabs', u'abc\rab\tdef\ng\thi', u'abc\rab  def\ng   hi', 4)
test('expandtabs', u'abc\r\nab\tdef\ng\thi', u'abc\r\nab  def\ng   hi', 4)
# No tabs at all: the string must come back unchanged.
test('expandtabs', u'abc\r\nab\r\ndef\ng\r\nhi', u'abc\r\nab\r\ndef\ng\r\nhi', 4)
if 0:
    # Disabled: capwords tests.
    test('capwords', u'abc def ghi', u'Abc Def Ghi')
    test('capwords', u'abc\tdef\nghi', u'Abc Def Ghi')
    test('capwords', u'abc\t def \nghi', u'Abc Def Ghi')

# zfill(): unsigned input
test('zfill', u'123', u'123', 2)
test('zfill', u'123', u'123', 3)
test('zfill', u'123', u'0123', 4)
# zfill(): the zeros go after a leading sign
test('zfill', u'+123', u'+123', 3)
test('zfill', u'+123', u'+123', 4)
test('zfill', u'+123', u'+0123', 5)
test('zfill', u'-123', u'-123', 3)
test('zfill', u'-123', u'-123', 4)
test('zfill', u'-123', u'-0123', 5)
# zfill(): empty and short input
test('zfill', u'', u'000', 3)
test('zfill', u'34', u'34', 1)
test('zfill', u'34', u'00034', 5)
262 # Comparisons:
263 print 'Testing Unicode comparisons...',
264 verify(u'abc' == 'abc')
265 verify('abc' == u'abc')
266 verify(u'abc' == u'abc')
267 verify(u'abcd' > 'abc')
268 verify('abcd' > u'abc')
269 verify(u'abcd' > u'abc')
270 verify(u'abc' < 'abcd')
271 verify('abc' < u'abcd')
272 verify(u'abc' < u'abcd')
273 print 'done.'
275 if 0:
276 # Move these tests to a Unicode collation module test...
278 print 'Testing UTF-16 code point order comparisons...',
279 #No surrogates, no fixup required.
280 verify(u'\u0061' < u'\u20ac')
281 # Non surrogate below surrogate value, no fixup required
282 verify(u'\u0061' < u'\ud800\udc02')
284 # Non surrogate above surrogate value, fixup required
285 def test_lecmp(s, s2):
286 verify(s < s2 , "comparison failed on %s < %s" % (s, s2))
288 def test_fixup(s):
289 s2 = u'\ud800\udc01'
290 test_lecmp(s, s2)
291 s2 = u'\ud900\udc01'
292 test_lecmp(s, s2)
293 s2 = u'\uda00\udc01'
294 test_lecmp(s, s2)
295 s2 = u'\udb00\udc01'
296 test_lecmp(s, s2)
297 s2 = u'\ud800\udd01'
298 test_lecmp(s, s2)
299 s2 = u'\ud900\udd01'
300 test_lecmp(s, s2)
301 s2 = u'\uda00\udd01'
302 test_lecmp(s, s2)
303 s2 = u'\udb00\udd01'
304 test_lecmp(s, s2)
305 s2 = u'\ud800\ude01'
306 test_lecmp(s, s2)
307 s2 = u'\ud900\ude01'
308 test_lecmp(s, s2)
309 s2 = u'\uda00\ude01'
310 test_lecmp(s, s2)
311 s2 = u'\udb00\ude01'
312 test_lecmp(s, s2)
313 s2 = u'\ud800\udfff'
314 test_lecmp(s, s2)
315 s2 = u'\ud900\udfff'
316 test_lecmp(s, s2)
317 s2 = u'\uda00\udfff'
318 test_lecmp(s, s2)
319 s2 = u'\udb00\udfff'
320 test_lecmp(s, s2)
322 test_fixup(u'\ue000')
323 test_fixup(u'\uff61')
325 # Surrogates on both sides, no fixup required
326 verify(u'\ud800\udc02' < u'\ud84d\udc56')
327 print 'done.'
# ljust/rjust/center pad with spaces up to the requested width, so the
# expected strings must be exactly `width` characters long.
# Width 10: seven pad characters (center puts 3 left, 4 right).
test('ljust', u'abc', u'abc       ', 10)
test('rjust', u'abc', u'       abc', 10)
test('center', u'abc', u'   abc    ', 10)
# Width 6: three pad characters (center puts 1 left, 2 right).
test('ljust', u'abc', u'abc   ', 6)
test('rjust', u'abc', u'   abc', 6)
test('center', u'abc', u' abc  ', 6)
# Width smaller than the string: returned unchanged.
test('ljust', u'abc', u'abc', 2)
test('rjust', u'abc', u'abc', 2)
test('center', u'abc', u'abc', 2)
# islower()
test('islower', u'a', True)
test('islower', u'A', False)
test('islower', u'\n', False)
test('islower', u'\u1FFc', False)
test('islower', u'abc', True)
test('islower', u'aBc', False)
test('islower', u'abc\n', True)

# isupper()
test('isupper', u'a', False)
test('isupper', u'A', True)
test('isupper', u'\n', False)
if sys.platform[:4] != 'java':
    # This case is skipped on 'java' platforms.
    test('isupper', u'\u1FFc', False)
test('isupper', u'ABC', True)
test('isupper', u'AbC', False)
test('isupper', u'ABC\n', True)

# istitle()
test('istitle', u'a', False)
test('istitle', u'A', True)
test('istitle', u'\n', False)
test('istitle', u'\u1FFc', True)
test('istitle', u'A Titlecased Line', True)
test('istitle', u'A\nTitlecased Line', True)
test('istitle', u'A Titlecased, Line', True)
test('istitle', u'Greek \u1FFcitlecases ...', True)
test('istitle', u'Not a capitalized String', False)
test('istitle', u'Not\ta Titlecase String', False)
test('istitle', u'Not--a Titlecase String', False)

# isalpha()
test('isalpha', u'a', True)
test('isalpha', u'A', True)
test('isalpha', u'\n', False)
test('isalpha', u'\u1FFc', True)
test('isalpha', u'abc', True)
test('isalpha', u'aBc123', False)
test('isalpha', u'abc\n', False)

# isalnum()
test('isalnum', u'a', True)
test('isalnum', u'A', True)
test('isalnum', u'\n', False)
test('isalnum', u'123abc456', True)
test('isalnum', u'a1b3c', True)
test('isalnum', u'aBc000 ', False)
test('isalnum', u'abc\n', False)

# splitlines(); the last case passes True to keep the line endings
test('splitlines', u"abc\ndef\n\rghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\n\r\nghi", [u'abc', u'def', u'', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n", [u'abc', u'def', u'ghi'])
test('splitlines', u"abc\ndef\r\nghi\n\r", [u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'', u'abc', u'def', u'ghi', u''])
test('splitlines', u"\nabc\ndef\r\nghi\n\r", [u'\n', u'abc\n', u'def\r\n', u'ghi\n', u'\r'], True)

# translate() with a dict mapping: None deletes the character, an
# ordinal or a unicode string replaces it
test('translate', u"abababc", u'bbbc', {ord('a'):None})
test('translate', u"abababc", u'iiic', {ord('a'):None, ord('b'):ord('i')})
test('translate', u"abababc", u'iiix', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
396 # Contains:
397 print 'Testing Unicode contains method...',
398 verify(('a' in u'abdb') == 1)
399 verify(('a' in u'bdab') == 1)
400 verify(('a' in u'bdaba') == 1)
401 verify(('a' in u'bdba') == 1)
402 verify(('a' in u'bdba') == 1)
403 verify((u'a' in u'bdba') == 1)
404 verify((u'a' in u'bdb') == 0)
405 verify((u'a' in 'bdb') == 0)
406 verify((u'a' in 'bdba') == 1)
407 verify((u'a' in ('a',1,None)) == 1)
408 verify((u'a' in (1,None,'a')) == 1)
409 verify((u'a' in (1,None,u'a')) == 1)
410 verify(('a' in ('a',1,None)) == 1)
411 verify(('a' in (1,None,'a')) == 1)
412 verify(('a' in (1,None,u'a')) == 1)
413 verify(('a' in ('x',1,u'y')) == 0)
414 verify(('a' in ('x',1,None)) == 0)
415 print 'done.'
417 # Formatting:
418 print 'Testing Unicode formatting strings...',
419 verify(u"%s, %s" % (u"abc", "abc") == u'abc, abc')
420 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3) == u'abc, abc, 1, 2.000000, 3.00')
421 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3) == u'abc, abc, 1, -2.000000, 3.00')
422 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5) == u'abc, abc, -1, -2.000000, 3.50')
423 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57) == u'abc, abc, -1, -2.000000, 3.57')
424 verify(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57) == u'abc, abc, -1, -2.000000, 1003.57')
425 verify(u"%c" % (u"a",) == u'a')
426 verify(u"%c" % ("a",) == u'a')
427 verify(u"%c" % (34,) == u'"')
428 verify(u"%c" % (36,) == u'$')
429 if sys.platform[:4] != 'java':
430 value = u"%r, %r" % (u"abc", "abc")
431 if value != u"u'abc', 'abc'":
432 print '*** formatting failed for "%s"' % 'u"%r, %r" % (u"abc", "abc")'
434 verify(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"} == u'abc, def')
435 try:
436 value = u"%(x)s, %(ä)s" % {'x':u"abc", u'ä':"def"}
437 except KeyError:
438 print '*** formatting failed for "%s"' % "u'abc, def'"
439 else:
440 verify(value == u'abc, def')
442 # formatting jobs delegated from the string implementation:
443 verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
444 verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
445 verify('...%(foo)s...' % {u'foo':"abc"} == '...abc...')
446 verify('...%(foo)s...' % {u'foo':u"abc"} == u'...abc...')
447 verify('...%(foo)s...' % {u'foo':u"abc",'def':123} == u'...abc...')
448 verify('...%(foo)s...' % {u'foo':u"abc",u'def':123} == u'...abc...')
449 verify('...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...1...2...3...abc...')
450 verify('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc") == u'...%...%s...1...2...3...abc...')
451 verify('...%s...' % u"abc" == u'...abc...')
452 verify('%*s' % (5,u'abc',) == u' abc')
453 verify('%*s' % (-5,u'abc',) == u'abc ')
454 verify('%*.*s' % (5,2,u'abc',) == u' ab')
455 verify('%*.*s' % (5,3,u'abc',) == u' abc')
456 verify('%i %*.*s' % (10, 5,3,u'abc',) == u'10 abc')
457 verify('%i%s %*.*s' % (10, 3, 5,3,u'abc',) == u'103 abc')
458 print 'done.'
460 print 'Testing builtin unicode()...',
462 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
464 verify(unicode(u'unicode remains unicode') == u'unicode remains unicode')
466 class UnicodeSubclass(unicode):
467 pass
469 verify(unicode(UnicodeSubclass('unicode subclass becomes unicode'))
470 == u'unicode subclass becomes unicode')
472 verify(unicode('strings are converted to unicode')
473 == u'strings are converted to unicode')
class UnicodeCompat:
    """Object that exposes its payload through __unicode__()."""

    def __init__(self, x):
        # Value handed back by __unicode__().
        self.x = x

    def __unicode__(self):
        return self.x
# unicode() of an object that defines __unicode__().
verify(unicode(UnicodeCompat('__unicode__ compatible objects are recognized'))
       == u'__unicode__ compatible objects are recognized')
class StringCompat:
    """Object that exposes its payload through __str__() only."""

    def __init__(self, x):
        # Value handed back by __str__().
        self.x = x

    def __str__(self):
        return self.x
490 verify(unicode(StringCompat('__str__ compatible objects are recognized'))
491 == u'__str__ compatible objects are recognized')
493 # unicode(obj) is compatible to str():
495 o = StringCompat('unicode(obj) is compatible to str()')
496 verify(unicode(o) == u'unicode(obj) is compatible to str()')
497 verify(str(o) == 'unicode(obj) is compatible to str()')
499 for obj in (123, 123.45, 123L):
500 verify(unicode(obj) == unicode(str(obj)))
502 # unicode(obj, encoding, error) tests (this maps to
503 # PyUnicode_FromEncodedObject() at C level)
505 if not sys.platform.startswith('java'):
506 try:
507 unicode(u'decoding unicode is not supported', 'utf-8', 'strict')
508 except TypeError:
509 pass
510 else:
511 raise TestFailed, "decoding unicode should NOT be supported"
513 verify(unicode('strings are decoded to unicode', 'utf-8', 'strict')
514 == u'strings are decoded to unicode')
516 if not sys.platform.startswith('java'):
517 verify(unicode(buffer('character buffers are decoded to unicode'),
518 'utf-8', 'strict')
519 == u'character buffers are decoded to unicode')
521 print 'done.'
523 # Test builtin codecs
524 print 'Testing builtin codecs...',
526 # UTF-7 specific encoding tests:
527 utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
528 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
529 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
530 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
531 (u'+', '+-'),
532 (u'+-', '+--'),
533 (u'+?', '+-?'),
534 (u'\?', '+AFw?'),
535 (u'+?', '+-?'),
536 (ur'\\?', '+AFwAXA?'),
537 (ur'\\\?', '+AFwAXABc?'),
538 (ur'++--', '+-+---')]
540 for x,y in utfTests:
541 verify( x.encode('utf-7') == y )
543 try:
544 unicode('+3ADYAA-', 'utf-7') # surrogates not supported
545 except UnicodeError:
546 pass
547 else:
548 raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
550 verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
# UTF-8 specific encoding tests:
verify(u''.encode('utf-8') == '')
verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
# Surrogate pairs are expected to encode as one 4-byte sequence ...
verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
# ... lone surrogates as a 3-byte sequence.
verify(u'\ud800'.encode('utf-8') == '\xed\xa0\x80')
verify(u'\udc00'.encode('utf-8') == '\xed\xb0\x80')
verify((u'\ud800\udc02'*1000).encode('utf-8') ==
       '\xf0\x90\x80\x82'*1000)
# A longer text mixing multi-byte characters and ASCII.
verify(u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
       u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
       u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
       u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
       u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
       u' Nunstuck git und'.encode('utf-8') ==
       '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
       '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
       '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
       '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
       '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
       '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
       '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
       '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
       '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
       '\xe3\x80\x8cWenn ist das Nunstuck git und')

# UTF-8 specific decoding tests
verify(unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456' )
verify(unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002' )
verify(unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' )

# Other possible utf-8 test cases:
# * strict decoding testing for all of the
#   UTF8_ERROR cases in PyUnicode_DecodeUTF8

# Pure ASCII input decodes identically under several codec names.
verify(unicode('hello','ascii') == u'hello')
verify(unicode('hello','utf-8') == u'hello')
verify(unicode('hello','utf8') == u'hello')
verify(unicode('hello','latin-1') == u'hello')
592 # Error handling
593 try:
594 u'Andr\202 x'.encode('ascii')
595 u'Andr\202 x'.encode('ascii','strict')
596 except ValueError:
597 pass
598 else:
599 raise TestFailed, "u'Andr\202'.encode('ascii') failed to raise an exception"
600 verify(u'Andr\202 x'.encode('ascii','ignore') == "Andr x")
601 verify(u'Andr\202 x'.encode('ascii','replace') == "Andr? x")
603 try:
604 unicode('Andr\202 x','ascii')
605 unicode('Andr\202 x','ascii','strict')
606 except ValueError:
607 pass
608 else:
609 raise TestFailed, "unicode('Andr\202') failed to raise an exception"
610 verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
611 verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
613 verify("\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx")
614 try:
615 "\\".decode("unicode-escape")
616 except ValueError:
617 pass
618 else:
619 raise TestFailed, '"\\".decode("unicode-escape") should fail'
621 verify(u'hello'.encode('ascii') == 'hello')
622 verify(u'hello'.encode('utf-7') == 'hello')
623 verify(u'hello'.encode('utf-8') == 'hello')
624 verify(u'hello'.encode('utf8') == 'hello')
625 verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
626 verify(u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o')
627 verify(u'hello'.encode('latin-1') == 'hello')
629 # Roundtrip safety for BMP (just the first 1024 chars)
630 u = u''.join(map(unichr, range(1024)))
631 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
632 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
633 verify(unicode(u.encode(encoding),encoding) == u)
635 # Roundtrip safety for BMP (just the first 256 chars)
636 u = u''.join(map(unichr, range(256)))
637 for encoding in (
638 'latin-1',
640 try:
641 verify(unicode(u.encode(encoding),encoding) == u)
642 except TestFailed:
643 print '*** codec "%s" failed round-trip' % encoding
644 except ValueError,why:
645 print '*** codec for "%s" failed: %s' % (encoding, why)
647 # Roundtrip safety for BMP (just the first 128 chars)
648 u = u''.join(map(unichr, range(128)))
649 for encoding in (
650 'ascii',
652 try:
653 verify(unicode(u.encode(encoding),encoding) == u)
654 except TestFailed:
655 print '*** codec "%s" failed round-trip' % encoding
656 except ValueError,why:
657 print '*** codec for "%s" failed: %s' % (encoding, why)
659 # Roundtrip safety for non-BMP (just a few chars)
660 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
661 for encoding in ('utf-8',
662 'utf-16', 'utf-16-le', 'utf-16-be',
663 #'raw_unicode_escape',
664 'unicode_escape', 'unicode_internal'):
665 verify(unicode(u.encode(encoding),encoding) == u)
667 # UTF-8 must be roundtrip safe for all UCS-2 code points
668 u = u''.join(map(unichr, range(0x10000)))
669 for encoding in ('utf-8',):
670 verify(unicode(u.encode(encoding),encoding) == u)
672 print 'done.'
674 print 'Testing standard mapping codecs...',
676 print '0-127...',
677 s = ''.join(map(chr, range(128)))
678 for encoding in (
679 'cp037', 'cp1026',
680 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
681 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
682 'cp863', 'cp865', 'cp866',
683 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
684 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
685 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
686 'mac_cyrillic', 'mac_latin2',
688 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
689 'cp1256', 'cp1257', 'cp1258',
690 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
692 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
693 'cp1006', 'iso8859_8',
695 ### These have undefined mappings:
696 #'cp424',
698 ### These fail the round-trip:
699 #'cp875'
702 try:
703 verify(unicode(s,encoding).encode(encoding) == s)
704 except TestFailed:
705 print '*** codec "%s" failed round-trip' % encoding
706 except ValueError,why:
707 print '*** codec for "%s" failed: %s' % (encoding, why)
709 print '128-255...',
710 s = ''.join(map(chr, range(128,256)))
711 for encoding in (
712 'cp037', 'cp1026',
713 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
714 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
715 'cp863', 'cp865', 'cp866',
716 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
717 'iso8859_2', 'iso8859_4', 'iso8859_5',
718 'iso8859_9', 'koi8_r', 'latin_1',
719 'mac_cyrillic', 'mac_latin2',
721 ### These have undefined mappings:
722 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
723 #'cp1256', 'cp1257', 'cp1258',
724 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
725 #'iso8859_3', 'iso8859_6', 'iso8859_7',
726 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
728 ### These fail the round-trip:
729 #'cp1006', 'cp875', 'iso8859_8',
732 try:
733 verify(unicode(s,encoding).encode(encoding) == s)
734 except TestFailed:
735 print '*** codec "%s" failed round-trip' % encoding
736 except ValueError,why:
737 print '*** codec for "%s" failed: %s' % (encoding, why)
739 print 'done.'
741 print 'Testing Unicode string concatenation...',
742 verify((u"abc" u"def") == u"abcdef")
743 verify(("abc" u"def") == u"abcdef")
744 verify((u"abc" "def") == u"abcdef")
745 verify((u"abc" u"def" "ghi") == u"abcdefghi")
746 verify(("abc" "def" u"ghi") == u"abcdefghi")
747 print 'done.'
749 print 'Testing Unicode printing...',
750 print u'abc'
751 print u'abc', u'def'
752 print u'abc', 'def'
753 print 'abc', u'def'
754 print u'abc\n'
755 print u'abc\n',
756 print u'abc\n',
757 print u'def\n'
758 print u'def\n'
759 print 'done.'