This commit was manufactured by cvs2svn to create tag 'r234c1'.
[python/dscho.git] / Lib / test / test_unicode.py
blobeda7d223b2c4c8b1518fcabcea68f7586de20f9f
1 # -*- coding: iso-8859-1 -*-
2 """ Test script for the Unicode implementation.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8 """#"
9 import unittest, sys, string, codecs, new
10 from test import test_support, string_tests
class UnicodeTest(
    string_tests.CommonTest,
    string_tests.MixinStrUnicodeUserStringTest
    ):
    # Test case for the unicode type, built on the two shared test mixins
    # from string_tests.
    # NOTE(review): type2test is presumably read by the string_tests
    # helpers to pick the type under test -- confirm in string_tests.
    type2test = unicode
def checkequalnofix(self, result, object, methodname, *args):
    """Call ``object.methodname(*args)`` and verify value and exact type.

    The outcome must compare equal to ``result`` and have exactly the same
    type.  When the method hands back the receiver itself, the call is
    repeated on an instance of a unicode subclass and the outcome must
    then be a different object from that receiver.
    """
    actual = getattr(object, methodname)(*args)
    self.assertEqual(actual, result)
    self.assert_(type(actual) is type(result))

    if actual is not object:
        return

    # if the original is returned make sure that
    # this doesn't happen with subclasses
    class usub(unicode):
        def __repr__(self):
            return 'usub(%r)' % unicode.__repr__(self)

    wrapped = usub(object)
    actual = getattr(wrapped, methodname)(*args)
    self.assertEqual(actual, result)
    self.assert_(wrapped is not actual)
def test_repr(self):
    # Checks repr() of unicode objects; the whole test is skipped when
    # sys.platform starts with 'java'.
    if sys.platform.startswith('java'):
        return

    # Test basic sanity of repr()
    sanity_cases = (
        (u'abc', "u'abc'"),
        (u'ab\\c', "u'ab\\\\c'"),
        (u'ab\\', "u'ab\\\\'"),
        (u'\\c', "u'\\\\c'"),
        (u'\\', "u'\\\\'"),
        (u'\n', "u'\\n'"),
        (u'\r', "u'\\r'"),
        (u'\t', "u'\\t'"),
        (u'\b', "u'\\x08'"),
        (u"'\"", """u'\\'"'"""),
        (u"'\"", """u'\\'"'"""),
        (u"'", '''u"'"'''),
        (u'"', """u'"'"""),
    )
    for value, expected in sanity_cases:
        self.assertEqual(repr(value), expected)

    # Expected repr() of the string holding code points 0..255 in order.
    latin1repr = (
        "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
        "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
        "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
        "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
        "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
        "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
        "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
        "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
        "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
        "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
        "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
        "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
        "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
        "\\xfe\\xff'")
    all_latin1 = u''.join(map(unichr, xrange(256)))
    self.assertEqual(repr(all_latin1), latin1repr)
def test_count(self):
    # Shared count() checks, then count() with str/unicode mixed between
    # receiver and argument, including negative start/end indices.
    string_tests.CommonTest.test_count(self)

    # check mixed argument types
    mixed_cases = (
        (3, 'aaa', (u'a',)),
        (0, 'aaa', (u'b',)),
        (3, u'aaa', ('a',)),
        (0, u'aaa', ('b',)),
        (0, u'aaa', ('b',)),
        (1, u'aaa', ('a', -1)),
        (3, u'aaa', ('a', -10)),
        (2, u'aaa', ('a', 0, -1)),
        (0, u'aaa', ('a', 0, -10)),
    )
    for expected, subject, call_args in mixed_cases:
        self.checkequalnofix(expected, subject, 'count', *call_args)
def test_find(self):
    # Run the shared find() checks first.  This override would otherwise
    # shadow the inherited test without calling it, unlike the
    # test_count/test_rfind/test_index/test_rindex overrides in this
    # class, which all delegate to string_tests.CommonTest.
    string_tests.CommonTest.test_find(self)

    # unicode receiver and unicode argument, with and without a start index
    self.checkequalnofix(0, u'abcdefghiabc', 'find', u'abc')
    self.checkequalnofix(9, u'abcdefghiabc', 'find', u'abc', 1)
    self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)

    # missing or non-string argument must raise TypeError
    self.assertRaises(TypeError, u'hello'.find)
    self.assertRaises(TypeError, u'hello'.find, 42)
def test_rfind(self):
    # Shared rfind() checks, then rfind() with str/unicode mixed between
    # receiver and argument.
    string_tests.CommonTest.test_rfind(self)

    # check mixed argument types
    for expected, subject, needle in (
            (9, 'abcdefghiabc', u'abc'),
            (12, 'abcdefghiabc', u''),
            (12, u'abcdefghiabc', ''),
    ):
        self.checkequalnofix(expected, subject, 'rfind', needle)
def test_index(self):
    # Shared index() checks, then index() with the receiver and the
    # argument built from different string types (str/unicode both ways).
    string_tests.CommonTest.test_index(self)

    # check mixed argument types
    for subject_type, needle_type in ((str, unicode), (unicode, str)):
        found_cases = (
            (0, ('',)),
            (3, ('def',)),
            (0, ('abc',)),
            (9, ('abc', 1)),
        )
        for expected, call in found_cases:
            needle = needle_type(call[0])
            self.checkequalnofix(expected, subject_type('abcdefghiabc'),
                                 'index', needle, *call[1:])

        # substring absent from the searched range -> ValueError
        missing_cases = (
            ('abcdefghiabc', ('hib',)),
            ('abcdefghiab', ('abc', 1)),
            ('abcdefghi', ('ghi', 8)),
            ('abcdefghi', ('ghi', -1)),
        )
        for text, call in missing_cases:
            needle = needle_type(call[0])
            self.assertRaises(ValueError, subject_type(text).index,
                              needle, *call[1:])
def test_rindex(self):
    # Shared rindex() checks, then rindex() with the receiver and the
    # argument built from different string types (str/unicode both ways).
    string_tests.CommonTest.test_rindex(self)

    # check mixed argument types
    for subject_type, needle_type in ((str, unicode), (unicode, str)):
        found_cases = (
            (12, ('',)),
            (3, ('def',)),
            (9, ('abc',)),
            (0, ('abc', 0, -1)),
        )
        for expected, call in found_cases:
            needle = needle_type(call[0])
            self.checkequalnofix(expected, subject_type('abcdefghiabc'),
                                 'rindex', needle, *call[1:])

        # substring absent from the searched range -> ValueError
        missing_cases = (
            ('abcdefghiabc', ('hib',)),
            ('defghiabc', ('def', 1)),
            ('defghiabc', ('abc', 0, -1)),
            ('abcdefghi', ('ghi', 0, 8)),
            ('abcdefghi', ('ghi', 0, -1)),
        )
        for text, call in missing_cases:
            needle = needle_type(call[0])
            self.assertRaises(ValueError, subject_type(text).rindex,
                              needle, *call[1:])
def test_translate(self):
    # unicode.translate() with mapping values of None, an ordinal, and
    # unicode strings of length zero, one and more than one.
    a, b, c, z = ord('a'), ord('b'), ord('c'), ord('z')
    cases = (
        (u'bbbc', u'abababc', {a: None}),
        (u'iiic', u'abababc', {a: None, b: ord('i')}),
        (u'iiix', u'abababc', {a: None, b: ord('i'), c: u'x'}),
        (u'<i><i><i>c', u'abababc', {a: None, b: u'<i>'}),
        (u'c', u'abababc', {a: None, b: u''}),
        (u'xyyx', u'xzx', {z: u'yy'}),
    )
    for expected, text, table in cases:
        self.checkequalnofix(expected, text, 'translate', table)

    # a missing table, or a str (non-unicode) mapping value, is a TypeError
    self.assertRaises(TypeError, u'hello'.translate)
    self.assertRaises(TypeError, u'abababc'.translate, {a: ''})
def test_split(self):
    # Shared split() checks, then split() with str/unicode mixed between
    # receiver and separator; the expected pieces are unicode either way.
    string_tests.CommonTest.test_split(self)

    # Mixed arguments
    pieces = [u'a', u'b', u'c', u'd']
    self.checkequalnofix(pieces, u'a//b//c//d', 'split', '//')
    self.checkequalnofix(pieces, 'a//b//c//d', 'split', u'//')
    self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split',
                         'test')
def test_join(self):
    # Shared join() checks, then join() where the separator and the items
    # mix str and unicode; every expected result is unicode.
    string_tests.MixinStrUnicodeUserStringTest.test_join(self)

    # mixed arguments
    cases = (
        (u'a b c d', u' ', ['a', 'b', u'c', u'd']),
        (u'abcd', u'', (u'a', u'b', u'c', u'd')),
        (u'w x y z', u' ', string_tests.Sequence('wxyz')),
        (u'a b c d', ' ', [u'a', u'b', u'c', u'd']),
        (u'a b c d', ' ', ['a', 'b', u'c', u'd']),
        (u'abcd', '', (u'a', u'b', u'c', u'd')),
        (u'w x y z', ' ', string_tests.Sequence(u'wxyz')),
    )
    for expected, separator, items in cases:
        self.checkequalnofix(expected, separator, 'join', items)
def test_strip(self):
    # Shared strip() checks, then a str argument containing the byte
    # '\xff' must make unicode.strip() raise UnicodeError.
    string_tests.CommonTest.test_strip(self)
    strip_hello = u"hello".strip
    self.assertRaises(UnicodeError, strip_hello, "\xff")
def test_replace(self):
    # Shared replace() checks, then str.replace() called with unicode
    # arguments.
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    self.checkequalnofix(
        u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1)
    replace_on_str = 'replace'.replace
    self.assertRaises(TypeError, replace_on_str, u"r", 42)
168 def test_comparison(self):
169 # Comparisons:
170 self.assertEqual(u'abc', 'abc')
171 self.assertEqual('abc', u'abc')
172 self.assertEqual(u'abc', u'abc')
173 self.assert_(u'abcd' > 'abc')
174 self.assert_('abcd' > u'abc')
175 self.assert_(u'abcd' > u'abc')
176 self.assert_(u'abc' < 'abcd')
177 self.assert_('abc' < u'abcd')
178 self.assert_(u'abc' < u'abcd')
180 if 0:
181 # Move these tests to a Unicode collation module test...
182 # Testing UTF-16 code point order comparisons...
184 # No surrogates, no fixup required.
185 self.assert_(u'\u0061' < u'\u20ac')
186 # Non surrogate below surrogate value, no fixup required
187 self.assert_(u'\u0061' < u'\ud800\udc02')
189 # Non surrogate above surrogate value, fixup required
190 def test_lecmp(s, s2):
191 self.assert_(s < s2)
193 def test_fixup(s):
194 s2 = u'\ud800\udc01'
195 test_lecmp(s, s2)
196 s2 = u'\ud900\udc01'
197 test_lecmp(s, s2)
198 s2 = u'\uda00\udc01'
199 test_lecmp(s, s2)
200 s2 = u'\udb00\udc01'
201 test_lecmp(s, s2)
202 s2 = u'\ud800\udd01'
203 test_lecmp(s, s2)
204 s2 = u'\ud900\udd01'
205 test_lecmp(s, s2)
206 s2 = u'\uda00\udd01'
207 test_lecmp(s, s2)
208 s2 = u'\udb00\udd01'
209 test_lecmp(s, s2)
210 s2 = u'\ud800\ude01'
211 test_lecmp(s, s2)
212 s2 = u'\ud900\ude01'
213 test_lecmp(s, s2)
214 s2 = u'\uda00\ude01'
215 test_lecmp(s, s2)
216 s2 = u'\udb00\ude01'
217 test_lecmp(s, s2)
218 s2 = u'\ud800\udfff'
219 test_lecmp(s, s2)
220 s2 = u'\ud900\udfff'
221 test_lecmp(s, s2)
222 s2 = u'\uda00\udfff'
223 test_lecmp(s, s2)
224 s2 = u'\udb00\udfff'
225 test_lecmp(s, s2)
227 test_fixup(u'\ue000')
228 test_fixup(u'\uff61')
230 # Surrogates on both sides, no fixup required
231 self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
def test_islower(self):
    # Shared islower() checks, then U+1FFC must not count as lowercase.
    shared = string_tests.MixinStrUnicodeUserStringTest
    shared.test_islower(self)
    self.checkequalnofix(False, u'\u1FFc', 'islower')
def test_isupper(self):
    # Shared isupper() checks, then U+1FFC must not count as uppercase;
    # that extra check is skipped when sys.platform starts with 'java'.
    shared = string_tests.MixinStrUnicodeUserStringTest
    shared.test_isupper(self)
    if sys.platform.startswith('java'):
        return
    self.checkequalnofix(False, u'\u1FFc', 'isupper')
def test_istitle(self):
    # Delegate to the shared istitle() checks.  The original delegated to
    # test_title instead, which is a different test, so the inherited
    # istitle() checks were shadowed by this override and never ran.
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)

    # U+1FFC counts as titlecase, alone and as the first letter of a word
    self.checkequalnofix(True, u'\u1FFc', 'istitle')
    self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
def test_isspace(self):
    # Shared isspace() checks, then U+2000 and U+200A are whitespace
    # while U+2014 is not.
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    for expected, char in ((True, u'\u2000'),
                           (True, u'\u200a'),
                           (False, u'\u2014')):
        self.checkequalnofix(expected, char, 'isspace')
def test_isalpha(self):
    # Shared isalpha() checks, then U+1FFC must count as alphabetic.
    shared = string_tests.MixinStrUnicodeUserStringTest
    shared.test_isalpha(self)
    self.checkequalnofix(True, u'\u1FFc', 'isalpha')
def test_isdecimal(self):
    # unicode.isdecimal() on empty, alphabetic, ASCII-digit and
    # non-ASCII numeric inputs.
    cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'),  # CIRCLED DIGIT ONE
        (False, u'\xbc'),    # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'),   # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # an unexpected extra argument is a TypeError
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
def test_isdigit(self):
    # Shared isdigit() checks, then three non-ASCII characters:
    # U+2460 and U+0660 are digits, U+00BC is not.
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    for expected, char in ((True, u'\u2460'),
                           (False, u'\xbc'),
                           (True, u'\u0660')):
        self.checkequalnofix(expected, char, 'isdigit')
def test_isnumeric(self):
    # unicode.isnumeric() on empty, alphabetic, ASCII-digit and
    # non-ASCII numeric inputs.
    cases = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # an unexpected extra argument is a TypeError
    isnumeric_of_abc = u"abc".isnumeric
    self.assertRaises(TypeError, isnumeric_of_abc, 42)
def test_contains(self):
    # Testing Unicode contains method
    #
    # Each table row is (needle, container, expected membership); the
    # rows are checked in order with the ``in`` / ``not in`` operators.
    def check_rows(rows):
        for needle, container, present in rows:
            if present:
                self.assert_(needle in container)
            else:
                self.assert_(needle not in container)

    check_rows((
        ('a', u'abdb', True),
        ('a', u'bdab', True),
        ('a', u'bdaba', True),
        ('a', u'bdba', True),
        ('a', u'bdba', True),
        (u'a', u'bdba', True),
        (u'a', u'bdb', False),
        (u'a', 'bdb', False),
        (u'a', 'bdba', True),
        (u'a', ('a', 1, None), True),
        (u'a', (1, None, 'a'), True),
        (u'a', (1, None, u'a'), True),
        ('a', ('a', 1, None), True),
        ('a', (1, None, 'a'), True),
        ('a', (1, None, u'a'), True),
        ('a', ('x', 1, u'y'), False),
        ('a', ('x', 1, None), False),
        (u'abcd', u'abcxxxx', False),
        (u'ab', u'abcd', True),
        ('ab', u'abc', True),
        (u'ab', 'abc', True),
        (u'ab', (1, None, u'ab'), True),
        (u'', u'abc', True),
        ('', u'abc', True),
    ))

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

    check_rows((
        (u'', '', True),
        ('', u'', True),
        (u'', u'', True),
        (u'', 'abc', True),
        ('', u'abc', True),
        (u'', u'abc', True),
        (u'\0', 'abc', False),
        ('\0', u'abc', False),
        (u'\0', u'abc', False),
        (u'\0', '\0abc', True),
        ('\0', u'\0abc', True),
        (u'\0', u'\0abc', True),
        (u'\0', 'abc\0', True),
        ('\0', u'abc\0', True),
        (u'\0', u'abc\0', True),
        (u'a', '\0abc', True),
        ('a', u'\0abc', True),
        (u'a', u'\0abc', True),
        (u'asdf', 'asdf', True),
        ('asdf', u'asdf', True),
        (u'asdf', u'asdf', True),
        (u'asdf', 'asd', False),
        ('asdf', u'asd', False),
        (u'asdf', u'asd', False),
        (u'asdf', '', False),
        ('asdf', u'', False),
        (u'asdf', u'', False),
    ))

    # calling __contains__ without an argument is a TypeError
    self.assertRaises(TypeError, u"abc".__contains__)
def test_formatting(self):
    # Shared formatting checks, then %-formatting with unicode format
    # strings and with str format strings that receive unicode arguments.
    #
    # The expected strings for %5.2f, %*s and %*.*s carry their padding
    # spaces here.  The mangled listing showed them with runs of spaces
    # collapsed to one, which the format operations cannot produce.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # %5.2f is right-justified in a minimum width of five, so "3.00" gains
    # one pad space on top of the literal space in the format string.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
    # "1003.57" already exceeds the minimum width: no padding is added
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # '*' takes the field width (and precision) from the argument tuple;
    # a negative width left-justifies.  Results are padded to width 5.
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')
385 def test_constructor(self):
386 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
388 self.assertEqual(
389 unicode(u'unicode remains unicode'),
390 u'unicode remains unicode'
393 class UnicodeSubclass(unicode):
394 pass
396 self.assertEqual(
397 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
398 u'unicode subclass becomes unicode'
401 self.assertEqual(
402 unicode('strings are converted to unicode'),
403 u'strings are converted to unicode'
406 class UnicodeCompat:
407 def __init__(self, x):
408 self.x = x
409 def __unicode__(self):
410 return self.x
412 self.assertEqual(
413 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
414 u'__unicode__ compatible objects are recognized')
416 class StringCompat:
417 def __init__(self, x):
418 self.x = x
419 def __str__(self):
420 return self.x
422 self.assertEqual(
423 unicode(StringCompat('__str__ compatible objects are recognized')),
424 u'__str__ compatible objects are recognized'
427 # unicode(obj) is compatible to str():
429 o = StringCompat('unicode(obj) is compatible to str()')
430 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
431 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
433 for obj in (123, 123.45, 123L):
434 self.assertEqual(unicode(obj), unicode(str(obj)))
436 # unicode(obj, encoding, error) tests (this maps to
437 # PyUnicode_FromEncodedObject() at C level)
439 if not sys.platform.startswith('java'):
440 self.assertRaises(
441 TypeError,
442 unicode,
443 u'decoding unicode is not supported',
444 'utf-8',
445 'strict'
448 self.assertEqual(
449 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
450 u'strings are decoded to unicode'
453 if not sys.platform.startswith('java'):
454 self.assertEqual(
455 unicode(
456 buffer('character buffers are decoded to unicode'),
457 'utf-8',
458 'strict'
460 u'character buffers are decoded to unicode'
463 self.assertRaises(TypeError, unicode, 42, 42, 42)
465 def test_codecs_utf7(self):
466 utfTests = [
467 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
468 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
469 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
470 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
471 (u'+', '+-'),
472 (u'+-', '+--'),
473 (u'+?', '+-?'),
474 (u'\?', '+AFw?'),
475 (u'+?', '+-?'),
476 (ur'\\?', '+AFwAXA?'),
477 (ur'\\\?', '+AFwAXABc?'),
478 (ur'++--', '+-+---')
481 for (x, y) in utfTests:
482 self.assertEqual(x.encode('utf-7'), y)
484 # surrogates not supported
485 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
487 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
def test_codecs_utf8(self):
    # UTF-8 encoding of empty, BMP, surrogate-pair and lone-surrogate
    # input, followed by a few decoding checks.
    # The closing parentheses of the two multi-line assertions, absent
    # from the mangled listing, are restored here.
    self.assertEqual(u''.encode('utf-8'), '')
    self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
    # surrogate pairs encode as a single four-byte sequence
    self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
    self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
    # lone surrogates encode as three-byte sequences
    self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
    self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
    self.assertEqual(
        (u'\ud800\udc02'*1000).encode('utf-8'),
        '\xf0\x90\x80\x82'*1000
    )
    self.assertEqual(
        u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
        u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
        u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
        u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
        u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
        u' Nunstuck git und'.encode('utf-8'),
        '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
        '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
        '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
        '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
        '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
        '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
        '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
        '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
        '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
        '\xe3\x80\x8cWenn ist das Nunstuck git und'
    )

    # UTF-8 specific decoding tests
    self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456' )
    self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002' )
    self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac' )

    # Other possible utf-8 test cases:
    # * strict decoding testing for all of the
    #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
def test_codecs_idna(self):
    # Test whether trailing dot is preserved
    encoded = u"www.python.org.".encode("idna")
    self.assertEqual(encoded, "www.python.org.")
def test_codecs_errors(self):
    # Error handling of the codec machinery: strict/ignore/replace modes,
    # malformed escapes, codecs returning the wrong types, and wrong
    # argument types.
    # The closing parenthesis of the imp.find_module assertion, absent
    # from the mangled listing, is restored here.

    # Error handling (encoding)
    self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
    self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
    self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
    self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")

    # Error handling (decoding)
    self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
    self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
    self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
    self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')

    # Error handling (unknown character names)
    self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")

    # Error handling (truncated escape sequence)
    self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")

    # Error handling (bad decoder return)
    def search_function(encoding):
        # Codec lookup hook serving two deliberately broken codecs:
        # "test.unicode1" returns a non-tuple and "test.unicode2" a tuple
        # whose first item is not a string.  Other names yield None.
        def decode1(input, errors="strict"):
            return 42 # not a tuple
        def encode1(input, errors="strict"):
            return 42 # not a tuple
        def encode2(input, errors="strict"):
            return (42, 42) # no unicode
        def decode2(input, errors="strict"):
            return (42, 42) # no unicode
        if encoding=="test.unicode1":
            return (encode1, decode1, None, None)
        elif encoding=="test.unicode2":
            return (encode2, decode2, None, None)
        else:
            return None
    # NOTE(review): the hook is registered here and never removed in
    # this test.
    codecs.register(search_function)
    self.assertRaises(TypeError, "hello".decode, "test.unicode1")
    self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
    self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
    # executes PyUnicode_Encode()
    import imp
    self.assertRaises(
        ImportError,
        imp.find_module,
        "non-existing module",
        [u"non-existing dir"]
    )

    # Error handling (wrong arguments)
    self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)

    # Error handling (PyUnicode_EncodeDecimal())
    self.assertRaises(UnicodeError, int, u"\u0200")
def test_codecs(self):
    # Encodes u'hello' with several codecs, then verifies that
    # encode-then-decode returns the original text for a number of code
    # point ranges and codecs.

    # Encoding
    hello_cases = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for codec_name, encoded in hello_cases:
        self.assertEqual(u'hello'.encode(codec_name), encoded)

    def assert_roundtrip(text, codec_names):
        # encode with each codec, decode again, and expect the input back
        for codec_name in codec_names:
            decoded = unicode(text.encode(codec_name), codec_name)
            self.assertEqual(decoded, text)

    # Roundtrip safety for BMP (just the first 1024 chars)
    assert_roundtrip(
        u''.join(map(unichr, xrange(1024))),
        ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
         'raw_unicode_escape', 'unicode_escape', 'unicode_internal'))

    # Roundtrip safety for BMP (just the first 256 chars)
    assert_roundtrip(u''.join(map(unichr, xrange(256))), ('latin-1',))

    # Roundtrip safety for BMP (just the first 128 chars)
    assert_roundtrip(u''.join(map(unichr, xrange(128))), ('ascii',))

    # Roundtrip safety for non-BMP (just a few chars)
    assert_roundtrip(
        u'\U00010001\U00020002\U00030003\U00040004\U00050005',
        ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
         #'raw_unicode_escape',
         'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    non_surrogates = range(0,0xd800)+range(0xe000,0x10000)
    assert_roundtrip(u''.join(map(unichr, non_surrogates)), ('utf-8',))
def test_codecs_charmap(self):
    # Decodes a byte string with each listed charmap codec, encodes it
    # again, and expects the original bytes back: first for bytes 0-127,
    # then for bytes 128-255.
    # The ``):`` closing each codec tuple, absent from the mangled
    # listing, is restored here.

    # 0-127
    s = ''.join(map(chr, xrange(128)))
    for encoding in (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
        'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        'cp1256', 'cp1257', 'cp1258',
        'cp856', 'cp857', 'cp864', 'cp869', 'cp874',

        'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
        'cp1006', 'iso8859_8',

        ### These have undefined mappings:
        #'cp424',

        ### These fail the round-trip:
        #'cp875'

        ):
        self.assertEqual(unicode(s, encoding).encode(encoding), s)

    # 128-255
    s = ''.join(map(chr, xrange(128, 256)))
    for encoding in (
        'cp037', 'cp1026',
        'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
        'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
        'cp863', 'cp865', 'cp866',
        'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
        'iso8859_2', 'iso8859_4', 'iso8859_5',
        'iso8859_9', 'koi8_r', 'latin_1',
        'mac_cyrillic', 'mac_latin2',

        ### These have undefined mappings:
        #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
        #'cp1256', 'cp1257', 'cp1258',
        #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
        #'iso8859_3', 'iso8859_6', 'iso8859_7',
        #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',

        ### These fail the round-trip:
        #'cp1006', 'cp875', 'iso8859_8',

        ):
        self.assertEqual(unicode(s, encoding).encode(encoding), s)
682 def test_concatenation(self):
683 self.assertEqual((u"abc" u"def"), u"abcdef")
684 self.assertEqual(("abc" u"def"), u"abcdef")
685 self.assertEqual((u"abc" "def"), u"abcdef")
686 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
687 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
def test_printing(self):
    # Exercises the print statement with unicode operands, redirected
    # to an object whose write() discards everything.  Nothing is
    # asserted; the test passes if no statement raises.
    class BitBucket:
        def write(self, text):
            pass

    sink = BitBucket()
    print >>sink, u'abc'
    print >>sink, u'abc', u'def'
    print >>sink, u'abc', 'def'
    print >>sink, 'abc', u'def'
    print >>sink, u'abc\n'
    # trailing commas below are part of the statements under test
    print >>sink, u'abc\n',
    print >>sink, u'abc\n',
    print >>sink, u'def\n'
    print >>sink, u'def\n'
705 def test_ucs4(self):
706 if sys.maxunicode == 0xFFFF:
707 return
708 x = u'\U00100000'
709 y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
710 self.assertEqual(x, y)
def test_main():
    # Entry point: hands the UnicodeTest case to test_support.run_unittest.
    test_support.run_unittest(UnicodeTest)
# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()