Update version number and release date.
[python/dscho.git] / Lib / test / test_unicode.py
blob 28837b4364c3565177b652345f871b3398c86f62
# -*- coding: iso-8859-1 -*-
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
import unittest, sys, string, codecs, new
from test import test_support, string_tests
12 class UnicodeTest(
13 string_tests.CommonTest,
14 string_tests.MixinStrUnicodeUserStringTest
16 type2test = unicode
18 def checkequalnofix(self, result, object, methodname, *args):
19 method = getattr(object, methodname)
20 realresult = method(*args)
21 self.assertEqual(realresult, result)
22 self.assert_(type(realresult) is type(result))
24 # if the original is returned make sure that
25 # this doesn't happen with subclasses
26 if realresult is object:
27 class usub(unicode):
28 def __repr__(self):
29 return 'usub(%r)' % unicode.__repr__(self)
30 object = usub(object)
31 method = getattr(object, methodname)
32 realresult = method(*args)
33 self.assertEqual(realresult, result)
34 self.assert_(object is not realresult)
36 def test_repr(self):
37 if not sys.platform.startswith('java'):
38 # Test basic sanity of repr()
39 self.assertEqual(repr(u'abc'), "u'abc'")
40 self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
41 self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
42 self.assertEqual(repr(u'\\c'), "u'\\\\c'")
43 self.assertEqual(repr(u'\\'), "u'\\\\'")
44 self.assertEqual(repr(u'\n'), "u'\\n'")
45 self.assertEqual(repr(u'\r'), "u'\\r'")
46 self.assertEqual(repr(u'\t'), "u'\\t'")
47 self.assertEqual(repr(u'\b'), "u'\\x08'")
48 self.assertEqual(repr(u"'\""), """u'\\'"'""")
49 self.assertEqual(repr(u"'\""), """u'\\'"'""")
50 self.assertEqual(repr(u"'"), '''u"'"''')
51 self.assertEqual(repr(u'"'), """u'"'""")
52 latin1repr = (
53 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
54 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
55 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
56 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
57 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
58 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
59 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
60 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
61 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
62 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
63 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
64 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
65 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
66 "\\xfe\\xff'")
67 testrepr = repr(u''.join(map(unichr, xrange(256))))
68 self.assertEqual(testrepr, latin1repr)
70 def test_count(self):
71 string_tests.CommonTest.test_count(self)
72 # check mixed argument types
73 self.checkequalnofix(3, 'aaa', 'count', u'a')
74 self.checkequalnofix(0, 'aaa', 'count', u'b')
75 self.checkequalnofix(3, u'aaa', 'count', 'a')
76 self.checkequalnofix(0, u'aaa', 'count', 'b')
77 self.checkequalnofix(0, u'aaa', 'count', 'b')
78 self.checkequalnofix(1, u'aaa', 'count', 'a', -1)
79 self.checkequalnofix(3, u'aaa', 'count', 'a', -10)
80 self.checkequalnofix(2, u'aaa', 'count', 'a', 0, -1)
81 self.checkequalnofix(0, u'aaa', 'count', 'a', 0, -10)
83 def test_find(self):
84 self.checkequalnofix(0, u'abcdefghiabc', 'find', u'abc')
85 self.checkequalnofix(9, u'abcdefghiabc', 'find', u'abc', 1)
86 self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)
88 self.assertRaises(TypeError, u'hello'.find)
89 self.assertRaises(TypeError, u'hello'.find, 42)
91 def test_rfind(self):
92 string_tests.CommonTest.test_rfind(self)
93 # check mixed argument types
94 self.checkequalnofix(9, 'abcdefghiabc', 'rfind', u'abc')
95 self.checkequalnofix(12, 'abcdefghiabc', 'rfind', u'')
96 self.checkequalnofix(12, u'abcdefghiabc', 'rfind', '')
98 def test_index(self):
99 string_tests.CommonTest.test_index(self)
100 # check mixed argument types
101 for (t1, t2) in ((str, unicode), (unicode, str)):
102 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2(''))
103 self.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def'))
104 self.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc'))
105 self.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1)
106 self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib'))
107 self.assertRaises(ValueError, t1('abcdefghiab').index, t2('abc'), 1)
108 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), 8)
109 self.assertRaises(ValueError, t1('abcdefghi').index, t2('ghi'), -1)
111 def test_rindex(self):
112 string_tests.CommonTest.test_rindex(self)
113 # check mixed argument types
114 for (t1, t2) in ((str, unicode), (unicode, str)):
115 self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex', t2(''))
116 self.checkequalnofix(3, t1('abcdefghiabc'), 'rindex', t2('def'))
117 self.checkequalnofix(9, t1('abcdefghiabc'), 'rindex', t2('abc'))
118 self.checkequalnofix(0, t1('abcdefghiabc'), 'rindex', t2('abc'), 0, -1)
120 self.assertRaises(ValueError, t1('abcdefghiabc').rindex, t2('hib'))
121 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('def'), 1)
122 self.assertRaises(ValueError, t1('defghiabc').rindex, t2('abc'), 0, -1)
123 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, 8)
124 self.assertRaises(ValueError, t1('abcdefghi').rindex, t2('ghi'), 0, -1)
126 def test_translate(self):
127 self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None})
128 self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
129 self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
130 self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'})
131 self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''})
133 self.assertRaises(TypeError, u'hello'.translate)
134 self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
136 def test_split(self):
137 string_tests.CommonTest.test_split(self)
139 # Mixed arguments
140 self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//')
141 self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//')
142 self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
144 def test_join(self):
145 string_tests.MixinStrUnicodeUserStringTest.test_join(self)
147 # mixed arguments
148 self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd'])
149 self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd'))
150 self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz'))
151 self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd'])
152 self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd'])
153 self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd'))
154 self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz'))
156 def test_strip(self):
157 string_tests.CommonTest.test_strip(self)
158 self.assertRaises(UnicodeError, u"hello".strip, "\xff")
160 def test_replace(self):
161 string_tests.CommonTest.test_replace(self)
163 # method call forwarded from str implementation because of unicode argument
164 self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1)
165 self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
167 def test_comparison(self):
168 # Comparisons:
169 self.assertEqual(u'abc', 'abc')
170 self.assertEqual('abc', u'abc')
171 self.assertEqual(u'abc', u'abc')
172 self.assert_(u'abcd' > 'abc')
173 self.assert_('abcd' > u'abc')
174 self.assert_(u'abcd' > u'abc')
175 self.assert_(u'abc' < 'abcd')
176 self.assert_('abc' < u'abcd')
177 self.assert_(u'abc' < u'abcd')
179 if 0:
180 # Move these tests to a Unicode collation module test...
181 # Testing UTF-16 code point order comparisons...
183 # No surrogates, no fixup required.
184 self.assert_(u'\u0061' < u'\u20ac')
185 # Non surrogate below surrogate value, no fixup required
186 self.assert_(u'\u0061' < u'\ud800\udc02')
188 # Non surrogate above surrogate value, fixup required
189 def test_lecmp(s, s2):
190 self.assert_(s < s2)
192 def test_fixup(s):
193 s2 = u'\ud800\udc01'
194 test_lecmp(s, s2)
195 s2 = u'\ud900\udc01'
196 test_lecmp(s, s2)
197 s2 = u'\uda00\udc01'
198 test_lecmp(s, s2)
199 s2 = u'\udb00\udc01'
200 test_lecmp(s, s2)
201 s2 = u'\ud800\udd01'
202 test_lecmp(s, s2)
203 s2 = u'\ud900\udd01'
204 test_lecmp(s, s2)
205 s2 = u'\uda00\udd01'
206 test_lecmp(s, s2)
207 s2 = u'\udb00\udd01'
208 test_lecmp(s, s2)
209 s2 = u'\ud800\ude01'
210 test_lecmp(s, s2)
211 s2 = u'\ud900\ude01'
212 test_lecmp(s, s2)
213 s2 = u'\uda00\ude01'
214 test_lecmp(s, s2)
215 s2 = u'\udb00\ude01'
216 test_lecmp(s, s2)
217 s2 = u'\ud800\udfff'
218 test_lecmp(s, s2)
219 s2 = u'\ud900\udfff'
220 test_lecmp(s, s2)
221 s2 = u'\uda00\udfff'
222 test_lecmp(s, s2)
223 s2 = u'\udb00\udfff'
224 test_lecmp(s, s2)
226 test_fixup(u'\ue000')
227 test_fixup(u'\uff61')
229 # Surrogates on both sides, no fixup required
230 self.assert_(u'\ud800\udc02' < u'\ud84d\udc56')
232 def test_islower(self):
233 string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
234 self.checkequalnofix(False, u'\u1FFc', 'islower')
236 def test_isupper(self):
237 string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
238 if not sys.platform.startswith('java'):
239 self.checkequalnofix(False, u'\u1FFc', 'isupper')
241 def test_istitle(self):
242 string_tests.MixinStrUnicodeUserStringTest.test_title(self)
243 self.checkequalnofix(True, u'\u1FFc', 'istitle')
244 self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
246 def test_isspace(self):
247 string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
248 self.checkequalnofix(True, u'\u2000', 'isspace')
249 self.checkequalnofix(True, u'\u200a', 'isspace')
250 self.checkequalnofix(False, u'\u2014', 'isspace')
252 def test_isalpha(self):
253 string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
254 self.checkequalnofix(True, u'\u1FFc', 'isalpha')
256 def test_isdecimal(self):
257 self.checkequalnofix(False, u'', 'isdecimal')
258 self.checkequalnofix(False, u'a', 'isdecimal')
259 self.checkequalnofix(True, u'0', 'isdecimal')
260 self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE
261 self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
262 self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
263 self.checkequalnofix(True, u'0123456789', 'isdecimal')
264 self.checkequalnofix(False, u'0123456789a', 'isdecimal')
266 self.checkraises(TypeError, 'abc', 'isdecimal', 42)
268 def test_isdigit(self):
269 string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
270 self.checkequalnofix(True, u'\u2460', 'isdigit')
271 self.checkequalnofix(False, u'\xbc', 'isdigit')
272 self.checkequalnofix(True, u'\u0660', 'isdigit')
274 def test_isnumeric(self):
275 self.checkequalnofix(False, u'', 'isnumeric')
276 self.checkequalnofix(False, u'a', 'isnumeric')
277 self.checkequalnofix(True, u'0', 'isnumeric')
278 self.checkequalnofix(True, u'\u2460', 'isnumeric')
279 self.checkequalnofix(True, u'\xbc', 'isnumeric')
280 self.checkequalnofix(True, u'\u0660', 'isnumeric')
281 self.checkequalnofix(True, u'0123456789', 'isnumeric')
282 self.checkequalnofix(False, u'0123456789a', 'isnumeric')
284 self.assertRaises(TypeError, u"abc".isnumeric, 42)
286 def test_contains(self):
287 # Testing Unicode contains method
288 self.assert_('a' in u'abdb')
289 self.assert_('a' in u'bdab')
290 self.assert_('a' in u'bdaba')
291 self.assert_('a' in u'bdba')
292 self.assert_('a' in u'bdba')
293 self.assert_(u'a' in u'bdba')
294 self.assert_(u'a' not in u'bdb')
295 self.assert_(u'a' not in 'bdb')
296 self.assert_(u'a' in 'bdba')
297 self.assert_(u'a' in ('a',1,None))
298 self.assert_(u'a' in (1,None,'a'))
299 self.assert_(u'a' in (1,None,u'a'))
300 self.assert_('a' in ('a',1,None))
301 self.assert_('a' in (1,None,'a'))
302 self.assert_('a' in (1,None,u'a'))
303 self.assert_('a' not in ('x',1,u'y'))
304 self.assert_('a' not in ('x',1,None))
305 self.assert_(u'abcd' not in u'abcxxxx')
306 self.assert_(u'ab' in u'abcd')
307 self.assert_('ab' in u'abc')
308 self.assert_(u'ab' in 'abc')
309 self.assert_(u'ab' in (1,None,u'ab'))
310 self.assert_(u'' in u'abc')
311 self.assert_('' in u'abc')
313 # If the following fails either
314 # the contains operator does not propagate UnicodeErrors or
315 # someone has changed the default encoding
316 self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')
318 self.assert_(u'' in '')
319 self.assert_('' in u'')
320 self.assert_(u'' in u'')
321 self.assert_(u'' in 'abc')
322 self.assert_('' in u'abc')
323 self.assert_(u'' in u'abc')
324 self.assert_(u'\0' not in 'abc')
325 self.assert_('\0' not in u'abc')
326 self.assert_(u'\0' not in u'abc')
327 self.assert_(u'\0' in '\0abc')
328 self.assert_('\0' in u'\0abc')
329 self.assert_(u'\0' in u'\0abc')
330 self.assert_(u'\0' in 'abc\0')
331 self.assert_('\0' in u'abc\0')
332 self.assert_(u'\0' in u'abc\0')
333 self.assert_(u'a' in '\0abc')
334 self.assert_('a' in u'\0abc')
335 self.assert_(u'a' in u'\0abc')
336 self.assert_(u'asdf' in 'asdf')
337 self.assert_('asdf' in u'asdf')
338 self.assert_(u'asdf' in u'asdf')
339 self.assert_(u'asdf' not in 'asd')
340 self.assert_('asdf' not in u'asd')
341 self.assert_(u'asdf' not in u'asd')
342 self.assert_(u'asdf' not in '')
343 self.assert_('asdf' not in u'')
344 self.assert_(u'asdf' not in u'')
346 self.assertRaises(TypeError, u"abc".__contains__)
348 def test_formatting(self):
349 string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
350 # Testing Unicode formatting strings...
351 self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
352 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000, 3.00')
353 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000, 3.00')
354 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000, 3.50')
355 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000, 3.57')
356 self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
357 if not sys.platform.startswith('java'):
358 self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
359 self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
360 self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')
362 self.assertEqual(u'%c' % 0x1234, u'\u1234')
363 self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
365 # formatting jobs delegated from the string implementation:
366 self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
367 self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
368 self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
369 self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
370 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
371 self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
372 self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
373 self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
374 self.assertEqual('...%s...' % u"abc", u'...abc...')
375 self.assertEqual('%*s' % (5,u'abc',), u' abc')
376 self.assertEqual('%*s' % (-5,u'abc',), u'abc ')
377 self.assertEqual('%*.*s' % (5,2,u'abc',), u' ab')
378 self.assertEqual('%*.*s' % (5,3,u'abc',), u' abc')
379 self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10 abc')
380 self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103 abc')
381 self.assertEqual('%c' % u'a', u'a')
384 def test_constructor(self):
385 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
387 self.assertEqual(
388 unicode(u'unicode remains unicode'),
389 u'unicode remains unicode'
392 class UnicodeSubclass(unicode):
393 pass
395 self.assertEqual(
396 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
397 u'unicode subclass becomes unicode'
400 self.assertEqual(
401 unicode('strings are converted to unicode'),
402 u'strings are converted to unicode'
405 class UnicodeCompat:
406 def __init__(self, x):
407 self.x = x
408 def __unicode__(self):
409 return self.x
411 self.assertEqual(
412 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
413 u'__unicode__ compatible objects are recognized')
415 class StringCompat:
416 def __init__(self, x):
417 self.x = x
418 def __str__(self):
419 return self.x
421 self.assertEqual(
422 unicode(StringCompat('__str__ compatible objects are recognized')),
423 u'__str__ compatible objects are recognized'
426 # unicode(obj) is compatible to str():
428 o = StringCompat('unicode(obj) is compatible to str()')
429 self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
430 self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
432 for obj in (123, 123.45, 123L):
433 self.assertEqual(unicode(obj), unicode(str(obj)))
435 # unicode(obj, encoding, error) tests (this maps to
436 # PyUnicode_FromEncodedObject() at C level)
438 if not sys.platform.startswith('java'):
439 self.assertRaises(
440 TypeError,
441 unicode,
442 u'decoding unicode is not supported',
443 'utf-8',
444 'strict'
447 self.assertEqual(
448 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
449 u'strings are decoded to unicode'
452 if not sys.platform.startswith('java'):
453 self.assertEqual(
454 unicode(
455 buffer('character buffers are decoded to unicode'),
456 'utf-8',
457 'strict'
459 u'character buffers are decoded to unicode'
462 self.assertRaises(TypeError, unicode, 42, 42, 42)
464 def test_codecs_utf7(self):
465 utfTests = [
466 (u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
467 (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
468 (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
469 (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
470 (u'+', '+-'),
471 (u'+-', '+--'),
472 (u'+?', '+-?'),
473 (u'\?', '+AFw?'),
474 (u'+?', '+-?'),
475 (ur'\\?', '+AFwAXA?'),
476 (ur'\\\?', '+AFwAXABc?'),
477 (ur'++--', '+-+---')
480 for (x, y) in utfTests:
481 self.assertEqual(x.encode('utf-7'), y)
483 # surrogates not supported
484 self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
486 self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd')
488 def test_codecs_utf8(self):
489 self.assertEqual(u''.encode('utf-8'), '')
490 self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
491 self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
492 self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
493 self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
494 self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
495 self.assertEqual(
496 (u'\ud800\udc02'*1000).encode('utf-8'),
497 '\xf0\x90\x80\x82'*1000
499 self.assertEqual(
500 u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
501 u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
502 u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
503 u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
504 u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
505 u' Nunstuck git und'.encode('utf-8'),
506 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
507 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
508 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
509 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
510 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
511 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
512 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
513 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
514 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
515 '\xe3\x80\x8cWenn ist das Nunstuck git und'
518 # UTF-8 specific decoding tests
519 self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456' )
520 self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002' )
521 self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac' )
523 # Other possible utf-8 test cases:
524 # * strict decoding testing for all of the
525 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
527 def test_codecs_errors(self):
528 # Error handling (encoding)
529 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
530 self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
531 self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
532 self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
534 # Error handling (decoding)
535 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
536 self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
537 self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
538 self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
540 # Error handling (unknown character names)
541 self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
543 # Error handling (truncated escape sequence)
544 self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
546 # Error handling (bad decoder return)
547 def search_function(encoding):
548 def decode1(input, errors="strict"):
549 return 42 # not a tuple
550 def encode1(input, errors="strict"):
551 return 42 # not a tuple
552 def encode2(input, errors="strict"):
553 return (42, 42) # no unicode
554 def decode2(input, errors="strict"):
555 return (42, 42) # no unicode
556 if encoding=="test.unicode1":
557 return (encode1, decode1, None, None)
558 elif encoding=="test.unicode2":
559 return (encode2, decode2, None, None)
560 else:
561 return None
562 codecs.register(search_function)
563 self.assertRaises(TypeError, "hello".decode, "test.unicode1")
564 self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
565 self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
566 self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
567 # executes PyUnicode_Encode()
568 import imp
569 self.assertRaises(
570 ImportError,
571 imp.find_module,
572 "non-existing module",
573 [u"non-existing dir"]
576 # Error handling (wrong arguments)
577 self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
579 # Error handling (PyUnicode_EncodeDecimal())
580 self.assertRaises(UnicodeError, int, u"\u0200")
582 def test_codecs(self):
583 # Encoding
584 self.assertEqual(u'hello'.encode('ascii'), 'hello')
585 self.assertEqual(u'hello'.encode('utf-7'), 'hello')
586 self.assertEqual(u'hello'.encode('utf-8'), 'hello')
587 self.assertEqual(u'hello'.encode('utf8'), 'hello')
588 self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
589 self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
590 self.assertEqual(u'hello'.encode('latin-1'), 'hello')
592 # Roundtrip safety for BMP (just the first 1024 chars)
593 u = u''.join(map(unichr, xrange(1024)))
594 for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
595 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
596 self.assertEqual(unicode(u.encode(encoding),encoding), u)
598 # Roundtrip safety for BMP (just the first 256 chars)
599 u = u''.join(map(unichr, xrange(256)))
600 for encoding in ('latin-1',):
601 self.assertEqual(unicode(u.encode(encoding),encoding), u)
603 # Roundtrip safety for BMP (just the first 128 chars)
604 u = u''.join(map(unichr, xrange(128)))
605 for encoding in ('ascii',):
606 self.assertEqual(unicode(u.encode(encoding),encoding), u)
608 # Roundtrip safety for non-BMP (just a few chars)
609 u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
610 for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
611 #'raw_unicode_escape',
612 'unicode_escape', 'unicode_internal'):
613 self.assertEqual(unicode(u.encode(encoding),encoding), u)
615 # UTF-8 must be roundtrip safe for all UCS-2 code points
616 # This excludes surrogates: in the full range, there would be
617 # a surrogate pair (\udbff\udc00), which gets converted back
618 # to a non-BMP character (\U0010fc00)
619 u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
620 for encoding in ('utf-8',):
621 self.assertEqual(unicode(u.encode(encoding),encoding), u)
623 def test_codecs_charmap(self):
624 # 0-127
625 s = ''.join(map(chr, xrange(128)))
626 for encoding in (
627 'cp037', 'cp1026',
628 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
629 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
630 'cp863', 'cp865', 'cp866',
631 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
632 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
633 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
634 'mac_cyrillic', 'mac_latin2',
636 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
637 'cp1256', 'cp1257', 'cp1258',
638 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
640 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
641 'cp1006', 'iso8859_8',
643 ### These have undefined mappings:
644 #'cp424',
646 ### These fail the round-trip:
647 #'cp875'
650 self.assertEqual(unicode(s, encoding).encode(encoding), s)
652 # 128-255
653 s = ''.join(map(chr, xrange(128, 256)))
654 for encoding in (
655 'cp037', 'cp1026',
656 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
657 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
658 'cp863', 'cp865', 'cp866',
659 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
660 'iso8859_2', 'iso8859_4', 'iso8859_5',
661 'iso8859_9', 'koi8_r', 'latin_1',
662 'mac_cyrillic', 'mac_latin2',
664 ### These have undefined mappings:
665 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
666 #'cp1256', 'cp1257', 'cp1258',
667 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
668 #'iso8859_3', 'iso8859_6', 'iso8859_7',
669 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
671 ### These fail the round-trip:
672 #'cp1006', 'cp875', 'iso8859_8',
675 self.assertEqual(unicode(s, encoding).encode(encoding), s)
677 def test_concatenation(self):
678 self.assertEqual((u"abc" u"def"), u"abcdef")
679 self.assertEqual(("abc" u"def"), u"abcdef")
680 self.assertEqual((u"abc" "def"), u"abcdef")
681 self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
682 self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
684 def test_printing(self):
685 class BitBucket:
686 def write(self, text):
687 pass
689 out = BitBucket()
690 print >>out, u'abc'
691 print >>out, u'abc', u'def'
692 print >>out, u'abc', 'def'
693 print >>out, 'abc', u'def'
694 print >>out, u'abc\n'
695 print >>out, u'abc\n',
696 print >>out, u'abc\n',
697 print >>out, u'def\n'
698 print >>out, u'def\n'
def test_main():
    """Build a suite from every UnicodeTest test method and run it."""
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(UnicodeTest))
    test_support.run_suite(suite)
# Run the tests when this file is executed as a script.
if __name__ == "__main__":
    test_main()