1 # -*- coding: iso-8859-1 -*-
2 """ Test script for the Unicode implementation.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
9 import unittest
, sys
, string
, codecs
, new
10 from test
import test_support
, string_tests
13 string_tests
.CommonTest
,
14 string_tests
.MixinStrUnicodeUserStringTest
18 def checkequalnofix(self
, result
, object, methodname
, *args
):
19 method
= getattr(object, methodname
)
20 realresult
= method(*args
)
21 self
.assertEqual(realresult
, result
)
22 self
.assert_(type(realresult
) is type(result
))
24 # if the original is returned make sure that
25 # this doesn't happen with subclasses
26 if realresult
is object:
29 return 'usub(%r)' % unicode.__repr
__(self
)
31 method
= getattr(object, methodname
)
32 realresult
= method(*args
)
33 self
.assertEqual(realresult
, result
)
34 self
.assert_(object is not realresult
)
37 if not sys
.platform
.startswith('java'):
38 # Test basic sanity of repr()
39 self
.assertEqual(repr(u
'abc'), "u'abc'")
40 self
.assertEqual(repr(u
'ab\\c'), "u'ab\\\\c'")
41 self
.assertEqual(repr(u
'ab\\'), "u'ab\\\\'")
42 self
.assertEqual(repr(u
'\\c'), "u'\\\\c'")
43 self
.assertEqual(repr(u
'\\'), "u'\\\\'")
44 self
.assertEqual(repr(u
'\n'), "u'\\n'")
45 self
.assertEqual(repr(u
'\r'), "u'\\r'")
46 self
.assertEqual(repr(u
'\t'), "u'\\t'")
47 self
.assertEqual(repr(u
'\b'), "u'\\x08'")
48 self
.assertEqual(repr(u
"'\""), """u'\\'"'""")
49 self
.assertEqual(repr(u
"'\""), """u'\\'"'""")
50 self
.assertEqual(repr(u
"'"), '''u"'"''')
51 self
.assertEqual(repr(u
'"'), """u'"'""")
53 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
54 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
55 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
56 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
57 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
58 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
59 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
60 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
61 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
62 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
63 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
64 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
65 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
67 testrepr
= repr(u
''.join(map(unichr, xrange(256))))
68 self
.assertEqual(testrepr
, latin1repr
)
71 string_tests
.CommonTest
.test_count(self
)
72 # check mixed argument types
73 self
.checkequalnofix(3, 'aaa', 'count', u
'a')
74 self
.checkequalnofix(0, 'aaa', 'count', u
'b')
75 self
.checkequalnofix(3, u
'aaa', 'count', 'a')
76 self
.checkequalnofix(0, u
'aaa', 'count', 'b')
77 self
.checkequalnofix(0, u
'aaa', 'count', 'b')
78 self
.checkequalnofix(1, u
'aaa', 'count', 'a', -1)
79 self
.checkequalnofix(3, u
'aaa', 'count', 'a', -10)
80 self
.checkequalnofix(2, u
'aaa', 'count', 'a', 0, -1)
81 self
.checkequalnofix(0, u
'aaa', 'count', 'a', 0, -10)
84 self
.checkequalnofix(0, u
'abcdefghiabc', 'find', u
'abc')
85 self
.checkequalnofix(9, u
'abcdefghiabc', 'find', u
'abc', 1)
86 self
.checkequalnofix(-1, u
'abcdefghiabc', 'find', u
'def', 4)
88 self
.assertRaises(TypeError, u
'hello'.find
)
89 self
.assertRaises(TypeError, u
'hello'.find
, 42)
92 string_tests
.CommonTest
.test_rfind(self
)
93 # check mixed argument types
94 self
.checkequalnofix(9, 'abcdefghiabc', 'rfind', u
'abc')
95 self
.checkequalnofix(12, 'abcdefghiabc', 'rfind', u
'')
96 self
.checkequalnofix(12, u
'abcdefghiabc', 'rfind', '')
99 string_tests
.CommonTest
.test_index(self
)
100 # check mixed argument types
101 for (t1
, t2
) in ((str, unicode), (unicode, str)):
102 self
.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2(''))
103 self
.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def'))
104 self
.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc'))
105 self
.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1)
106 self
.assertRaises(ValueError, t1('abcdefghiabc').index
, t2('hib'))
107 self
.assertRaises(ValueError, t1('abcdefghiab').index
, t2('abc'), 1)
108 self
.assertRaises(ValueError, t1('abcdefghi').index
, t2('ghi'), 8)
109 self
.assertRaises(ValueError, t1('abcdefghi').index
, t2('ghi'), -1)
def test_rindex(self):
    """Check rindex() when the string and the substring differ in type.

    Runs the shared CommonTest.test_rindex checks first, then repeats a
    fixed set of lookups with (str, unicode) and (unicode, str) operands.
    """
    string_tests.CommonTest.test_rindex(self)

    # (expected index, text, substring, extra start/end arguments)
    found = (
        (12, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (9, 'abcdefghiabc', 'abc', ()),
        (0, 'abcdefghiabc', 'abc', (0, -1)),
    )
    # (text, substring, extra start/end arguments) -- each must raise
    # ValueError because the substring is not inside the searched range.
    missing = (
        ('abcdefghiabc', 'hib', ()),
        ('defghiabc', 'def', (1,)),
        ('defghiabc', 'abc', (0, -1)),
        ('abcdefghi', 'ghi', (0, 8)),
        ('abcdefghi', 'ghi', (0, -1)),
    )

    # check mixed argument types
    for text_type, sub_type in ((str, unicode), (unicode, str)):
        for expected, text, sub, bounds in found:
            self.checkequalnofix(expected, text_type(text), 'rindex',
                                 sub_type(sub), *bounds)
        for text, sub, bounds in missing:
            self.assertRaises(ValueError, text_type(text).rindex,
                              sub_type(sub), *bounds)
def test_translate(self):
    """Check unicode.translate() with None, ordinal and unicode replacements."""
    a, b, c = ord('a'), ord('b'), ord('c')

    # (expected result, translation table); every table maps 'a' to None
    # and the expected strings contain no 'a'.
    cases = (
        (u'bbbc', {a: None}),
        (u'iiic', {a: None, b: ord('i')}),
        (u'iiix', {a: None, b: ord('i'), c: u'x'}),
        (u'<i><i><i>c', {a: None, b: u'<i>'}),
        (u'c', {a: None, b: u''}),
    )
    for expected, table in cases:
        self.checkequalnofix(expected, u'abababc', 'translate', table)

    # Calling without a table is a TypeError ...
    self.assertRaises(TypeError, u'hello'.translate)
    # ... and so is a table whose replacement value is a byte string.
    self.assertRaises(TypeError, u'abababc'.translate, {a: ''})
def test_split(self):
    """Check split() when the string and the separator differ in type."""
    string_tests.CommonTest.test_split(self)

    # The text and the separator are of different types in each call;
    # the expected pieces are written as unicode literals.
    letters = [u'a', u'b', u'c', u'd']
    self.checkequalnofix(letters, u'a//b//c//d', 'split', '//')
    self.checkequalnofix(letters, 'a//b//c//d', 'split', u'//')

    # A separator at the very end leaves a trailing empty piece.
    self.checkequalnofix([u'endcase ', u''],
                         u'endcase test', 'split', 'test')
145 string_tests
.MixinStrUnicodeUserStringTest
.test_join(self
)
148 self
.checkequalnofix(u
'a b c d', u
' ', 'join', ['a', 'b', u
'c', u
'd'])
149 self
.checkequalnofix(u
'abcd', u
'', 'join', (u
'a', u
'b', u
'c', u
'd'))
150 self
.checkequalnofix(u
'w x y z', u
' ', 'join', string_tests
.Sequence('wxyz'))
151 self
.checkequalnofix(u
'a b c d', ' ', 'join', [u
'a', u
'b', u
'c', u
'd'])
152 self
.checkequalnofix(u
'a b c d', ' ', 'join', ['a', 'b', u
'c', u
'd'])
153 self
.checkequalnofix(u
'abcd', '', 'join', (u
'a', u
'b', u
'c', u
'd'))
154 self
.checkequalnofix(u
'w x y z', ' ', 'join', string_tests
.Sequence(u
'wxyz'))
def test_strip(self):
    """Run the shared strip() checks, then one unicode-specific error case."""
    string_tests.CommonTest.test_strip(self)

    # A byte string holding "\xff" as the chars argument must surface as
    # a UnicodeError.
    strip = u"hello".strip
    self.assertRaises(UnicodeError, strip, "\xff")
def test_replace(self):
    """Run the shared replace() checks, then str.replace() with unicode args."""
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    replace_args = (u'!', u'@', 1)
    self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace',
                         *replace_args)

    # A non-string replacement value is a TypeError.
    str_replace = 'replace'.replace
    self.assertRaises(TypeError, str_replace, u"r", 42)
167 def test_comparison(self
):
169 self
.assertEqual(u
'abc', 'abc')
170 self
.assertEqual('abc', u
'abc')
171 self
.assertEqual(u
'abc', u
'abc')
172 self
.assert_(u
'abcd' > 'abc')
173 self
.assert_('abcd' > u
'abc')
174 self
.assert_(u
'abcd' > u
'abc')
175 self
.assert_(u
'abc' < 'abcd')
176 self
.assert_('abc' < u
'abcd')
177 self
.assert_(u
'abc' < u
'abcd')
180 # Move these tests to a Unicode collation module test...
181 # Testing UTF-16 code point order comparisons...
183 # No surrogates, no fixup required.
184 self
.assert_(u
'\u0061' < u
'\u20ac')
185 # Non surrogate below surrogate value, no fixup required
186 self
.assert_(u
'\u0061' < u
'\ud800\udc02')
188 # Non surrogate above surrogate value, fixup required
189 def test_lecmp(s
, s2
):
226 test_fixup(u
'\ue000')
227 test_fixup(u
'\uff61')
229 # Surrogates on both sides, no fixup required
230 self
.assert_(u
'\ud800\udc02' < u
'\ud84d\udc56')
def test_islower(self):
    """Run the shared islower() checks, then one non-ASCII character."""
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)

    # U+1FFC is expected not to count as lowercase.
    probe = u'\u1FFc'
    self.checkequalnofix(False, probe, 'islower')
def test_isupper(self):
    """Run the shared isupper() checks, then one non-ASCII character."""
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)

    # The extra check is not run on platforms whose name starts with 'java'.
    if sys.platform.startswith('java'):
        return

    # U+1FFC is expected not to count as uppercase.
    self.checkequalnofix(False, u'\u1FFc', 'isupper')
def test_istitle(self):
    """Check istitle() on strings containing U+1FFC."""
    # NOTE(review): this delegates to the shared test_title, whereas the
    # sibling tests here delegate to the same-named shared test
    # (test_islower, test_isupper, ...). Confirm whether test_istitle
    # was intended.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)

    for sample in (u'\u1FFc', u'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, sample, 'istitle')
def test_isspace(self):
    """Run the shared isspace() checks, then three non-ASCII characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)

    # (expected, character): U+2000 and U+200A count as whitespace,
    # U+2014 does not.
    samples = (
        (True, u'\u2000'),
        (True, u'\u200a'),
        (False, u'\u2014'),
    )
    for expected, char in samples:
        self.checkequalnofix(expected, char, 'isspace')
def test_isalpha(self):
    """Run the shared isalpha() checks, then one non-ASCII character."""
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)

    # U+1FFC is expected to count as alphabetic.
    probe = u'\u1FFc'
    self.checkequalnofix(True, probe, 'isalpha')
def test_isdecimal(self):
    """Check unicode.isdecimal() on empty, ASCII and non-ASCII input."""
    # (expected, text)
    samples = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'),  # CIRCLED DIGIT ONE
        (False, u'\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in samples:
        self.checkequalnofix(expected, text, 'isdecimal')

    # isdecimal() takes no arguments.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
def test_isdigit(self):
    """Run the shared isdigit() checks, then three non-ASCII characters."""
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)

    # (expected, character)
    samples = (
        (True, u'\u2460'),
        (False, u'\xbc'),
        (True, u'\u0660'),
    )
    for expected, char in samples:
        self.checkequalnofix(expected, char, 'isdigit')
def test_isnumeric(self):
    """Check unicode.isnumeric() on empty, ASCII and non-ASCII input."""
    # (expected, text)
    samples = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in samples:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments.
    isnumeric = u"abc".isnumeric
    self.assertRaises(TypeError, isnumeric, 42)
def test_contains(self):
    """Check the ``in`` operator across str/unicode operand combinations."""
    # Testing Unicode contains method

    # (item, container, whether ``item in container`` is expected to hold)
    mixed = (
        ('a', u'abdb', True),
        ('a', u'bdab', True),
        ('a', u'bdaba', True),
        ('a', u'bdba', True),
        ('a', u'bdba', True),
        (u'a', u'bdba', True),
        (u'a', u'bdb', False),
        (u'a', 'bdb', False),
        (u'a', 'bdba', True),
        (u'a', ('a', 1, None), True),
        (u'a', (1, None, 'a'), True),
        (u'a', (1, None, u'a'), True),
        ('a', ('a', 1, None), True),
        ('a', (1, None, 'a'), True),
        ('a', (1, None, u'a'), True),
        ('a', ('x', 1, u'y'), False),
        ('a', ('x', 1, None), False),
        (u'abcd', u'abcxxxx', False),
        (u'ab', u'abcd', True),
        ('ab', u'abc', True),
        (u'ab', 'abc', True),
        (u'ab', (1, None, u'ab'), True),
        (u'', u'abc', True),
        ('', u'abc', True),
    )
    for item, container, present in mixed:
        self.assert_((item in container) == present)

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

    # (substring, text, expected); each row is tried three ways below.
    plain = (
        ('', '', True),
        ('', 'abc', True),
        ('\0', 'abc', False),
        ('\0', '\0abc', True),
        ('\0', 'abc\0', True),
        ('a', '\0abc', True),
        ('asdf', 'asdf', True),
        ('asdf', 'asd', False),
        ('asdf', '', False),
    )
    for sub, text, present in plain:
        # unicode in str, str in unicode, unicode in unicode -- in that order
        operand_pairs = (
            (unicode(sub), text),
            (sub, unicode(text)),
            (unicode(sub), unicode(text)),
        )
        for needle, haystack in operand_pairs:
            self.assert_((needle in haystack) == present)

    # __contains__ needs exactly one argument.
    self.assertRaises(TypeError, u"abc".__contains__)
def test_formatting(self):
    """Check %-formatting when the format and/or the arguments are unicode.

    Runs the shared test_formatting checks first, then formats with a
    unicode format string, and finally with a byte-string format whose
    arguments (or mapping keys) are unicode.

    Expected strings spell out the padding produced by explicit field
    widths: a width of 5 applied to a shorter value adds blanks.
    """
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)

    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # %5.2f pads four-character values such as "3.00" to five characters,
    # hence two blanks after the last comma; "1003.57" needs no padding.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3),
                     u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3),
                     u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5),
                     u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57),
                     u'abc, abc, -1, -2.000000,  3.57')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57),
                     u'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x': u"abc", 'y': "def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x': u"abc", u'\xfc': "def"},
                     u'abc, def')

    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode + 1,))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo': u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo': "abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo': "abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo': u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo': u"abc", 'def': 123},
                     u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo': u"abc", u'def': 123},
                     u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1, 2, 3, u"abc"),
                     u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1, 2, 3, u"abc"),
                     u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')

    # '*' takes the width (and precision) from the argument tuple. A width
    # of 5 right-aligns within five characters; a negative width
    # left-aligns; the precision truncates the string first.
    self.assertEqual('%*s' % (5, u'abc',), u'  abc')
    self.assertEqual('%*s' % (-5, u'abc',), u'abc  ')
    self.assertEqual('%*.*s' % (5, 2, u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5, 3, u'abc',), u'  abc')
    self.assertEqual('%i %*.*s' % (10, 5, 3, u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')
384 def test_constructor(self
):
385 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
388 unicode(u
'unicode remains unicode'),
389 u
'unicode remains unicode'
392 class UnicodeSubclass(unicode):
396 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
397 u
'unicode subclass becomes unicode'
401 unicode('strings are converted to unicode'),
402 u
'strings are converted to unicode'
406 def __init__(self
, x
):
408 def __unicode__(self
):
412 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
413 u
'__unicode__ compatible objects are recognized')
416 def __init__(self
, x
):
422 unicode(StringCompat('__str__ compatible objects are recognized')),
423 u
'__str__ compatible objects are recognized'
426 # unicode(obj) is compatible to str():
428 o
= StringCompat('unicode(obj) is compatible to str()')
429 self
.assertEqual(unicode(o
), u
'unicode(obj) is compatible to str()')
430 self
.assertEqual(str(o
), 'unicode(obj) is compatible to str()')
432 for obj
in (123, 123.45, 123L):
433 self
.assertEqual(unicode(obj
), unicode(str(obj
)))
435 # unicode(obj, encoding, error) tests (this maps to
436 # PyUnicode_FromEncodedObject() at C level)
438 if not sys
.platform
.startswith('java'):
442 u
'decoding unicode is not supported',
448 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
449 u
'strings are decoded to unicode'
452 if not sys
.platform
.startswith('java'):
455 buffer('character buffers are decoded to unicode'),
459 u
'character buffers are decoded to unicode'
462 self
.assertRaises(TypeError, unicode, 42, 42, 42)
464 def test_codecs_utf7(self
):
466 (u
'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
467 (u
'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
468 (u
'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
469 (u
'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
475 (ur
'\\?', '+AFwAXA?'),
476 (ur
'\\\?', '+AFwAXABc?'),
480 for (x
, y
) in utfTests
:
481 self
.assertEqual(x
.encode('utf-7'), y
)
483 # surrogates not supported
484 self
.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
486 self
.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u
'\ufffd')
488 def test_codecs_utf8(self
):
489 self
.assertEqual(u
''.encode('utf-8'), '')
490 self
.assertEqual(u
'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
491 self
.assertEqual(u
'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
492 self
.assertEqual(u
'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
493 self
.assertEqual(u
'\ud800'.encode('utf-8'), '\xed\xa0\x80')
494 self
.assertEqual(u
'\udc00'.encode('utf-8'), '\xed\xb0\x80')
496 (u
'\ud800\udc02'*1000).encode('utf-8'),
497 '\xf0\x90\x80\x82'*1000
500 u
'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
501 u
'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
502 u
'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
503 u
'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
504 u
'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
505 u
' Nunstuck git und'.encode('utf-8'),
506 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
507 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
508 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
509 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
510 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
511 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
512 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
513 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
514 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
515 '\xe3\x80\x8cWenn ist das Nunstuck git und'
518 # UTF-8 specific decoding tests
519 self
.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u
'\U00023456' )
520 self
.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u
'\U00010002' )
521 self
.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u
'\u20ac' )
523 # Other possible utf-8 test cases:
524 # * strict decoding testing for all of the
525 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
527 def test_codecs_errors(self
):
528 # Error handling (encoding)
529 self
.assertRaises(UnicodeError, u
'Andr\202 x'.encode
, 'ascii')
530 self
.assertRaises(UnicodeError, u
'Andr\202 x'.encode
, 'ascii','strict')
531 self
.assertEqual(u
'Andr\202 x'.encode('ascii','ignore'), "Andr x")
532 self
.assertEqual(u
'Andr\202 x'.encode('ascii','replace'), "Andr? x")
534 # Error handling (decoding)
535 self
.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
536 self
.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
537 self
.assertEqual(unicode('Andr\202 x','ascii','ignore'), u
"Andr x")
538 self
.assertEqual(unicode('Andr\202 x','ascii','replace'), u
'Andr\uFFFD x')
540 # Error handling (unknown character names)
541 self
.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u
"xx")
543 # Error handling (truncated escape sequence)
544 self
.assertRaises(UnicodeError, "\\".decode
, "unicode-escape")
546 # Error handling (bad decoder return)
547 def search_function(encoding
):
548 def decode1(input, errors
="strict"):
549 return 42 # not a tuple
550 def encode1(input, errors
="strict"):
551 return 42 # not a tuple
552 def encode2(input, errors
="strict"):
553 return (42, 42) # no unicode
554 def decode2(input, errors
="strict"):
555 return (42, 42) # no unicode
556 if encoding
=="test.unicode1":
557 return (encode1
, decode1
, None, None)
558 elif encoding
=="test.unicode2":
559 return (encode2
, decode2
, None, None)
562 codecs
.register(search_function
)
563 self
.assertRaises(TypeError, "hello".decode
, "test.unicode1")
564 self
.assertRaises(TypeError, unicode, "hello", "test.unicode2")
565 self
.assertRaises(TypeError, u
"hello".encode
, "test.unicode1")
566 self
.assertRaises(TypeError, u
"hello".encode
, "test.unicode2")
567 # executes PyUnicode_Encode()
572 "non-existing module",
573 [u
"non-existing dir"]
576 # Error handling (wrong arguments)
577 self
.assertRaises(TypeError, u
"hello".encode
, 42, 42, 42)
579 # Error handling (PyUnicode_EncodeDecimal())
580 self
.assertRaises(UnicodeError, int, u
"\u0200")
def test_codecs(self):
    """Check a few fixed encodings and encode/decode round trips."""
    # Encoding
    hello = u'hello'
    expected_bytes = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for codec_name, encoded in expected_bytes:
        self.assertEqual(hello.encode(codec_name), encoded)

    def roundtrip(text, encodings):
        # Encoding and then decoding with the same codec must give back
        # the original text.
        for encoding in encodings:
            self.assertEqual(unicode(text.encode(encoding), encoding), text)

    # Roundtrip safety for BMP (just the first 1024 chars)
    roundtrip(u''.join(map(unichr, xrange(1024))),
              ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
               'raw_unicode_escape', 'unicode_escape', 'unicode_internal'))

    # Roundtrip safety for BMP (just the first 256 chars)
    roundtrip(u''.join(map(unichr, xrange(256))), ('latin-1',))

    # Roundtrip safety for BMP (just the first 128 chars)
    roundtrip(u''.join(map(unichr, xrange(128))), ('ascii',))

    # Roundtrip safety for non-BMP (just a few chars);
    # 'raw_unicode_escape' is deliberately left out of this list.
    roundtrip(u'\U00010001\U00020002\U00030003\U00040004\U00050005',
              ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
               'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    code_points = range(0, 0xd800) + range(0xe000, 0x10000)
    roundtrip(u''.join(map(unichr, code_points)), ('utf-8',))
623 def test_codecs_charmap(self
):
625 s
= ''.join(map(chr, xrange(128)))
628 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
629 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
630 'cp863', 'cp865', 'cp866',
631 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
632 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
633 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
634 'mac_cyrillic', 'mac_latin2',
636 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
637 'cp1256', 'cp1257', 'cp1258',
638 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
640 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
641 'cp1006', 'iso8859_8',
643 ### These have undefined mappings:
646 ### These fail the round-trip:
650 self
.assertEqual(unicode(s
, encoding
).encode(encoding
), s
)
653 s
= ''.join(map(chr, xrange(128, 256)))
656 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
657 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
658 'cp863', 'cp865', 'cp866',
659 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
660 'iso8859_2', 'iso8859_4', 'iso8859_5',
661 'iso8859_9', 'koi8_r', 'latin_1',
662 'mac_cyrillic', 'mac_latin2',
664 ### These have undefined mappings:
665 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
666 #'cp1256', 'cp1257', 'cp1258',
667 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
668 #'iso8859_3', 'iso8859_6', 'iso8859_7',
669 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
671 ### These fail the round-trip:
672 #'cp1006', 'cp875', 'iso8859_8',
675 self
.assertEqual(unicode(s
, encoding
).encode(encoding
), s
)
677 def test_concatenation(self
):
678 self
.assertEqual((u
"abc" u
"def"), u
"abcdef")
679 self
.assertEqual(("abc" u
"def"), u
"abcdef")
680 self
.assertEqual((u
"abc" "def"), u
"abcdef")
681 self
.assertEqual((u
"abc" u
"def" "ghi"), u
"abcdefghi")
682 self
.assertEqual(("abc" "def" u
"ghi"), u
"abcdefghi")
684 def test_printing(self
):
686 def write(self
, text
):
691 print >>out
, u
'abc', u
'def'
692 print >>out
, u
'abc', 'def'
693 print >>out
, 'abc', u
'def'
694 print >>out
, u
'abc\n'
695 print >>out
, u
'abc\n',
696 print >>out
, u
'abc\n',
697 print >>out
, u
'def\n'
698 print >>out
, u
'def\n'
701 suite
= unittest
.TestSuite()
702 suite
.addTest(unittest
.makeSuite(UnicodeTest
))
703 test_support
.run_suite(suite
)
705 if __name__
== "__main__":