# -*- coding: iso-8859-1 -*-
""" Test script for the Unicode implementation.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""
9 import unittest
, sys
, string
, codecs
, new
10 from test
import test_support
, string_tests
13 string_tests
.CommonTest
,
14 string_tests
.MixinStrUnicodeUserStringTest
18 def checkequalnofix(self
, result
, object, methodname
, *args
):
19 method
= getattr(object, methodname
)
20 realresult
= method(*args
)
21 self
.assertEqual(realresult
, result
)
22 self
.assert_(type(realresult
) is type(result
))
24 # if the original is returned make sure that
25 # this doesn't happen with subclasses
26 if realresult
is object:
29 return 'usub(%r)' % unicode.__repr
__(self
)
31 method
= getattr(object, methodname
)
32 realresult
= method(*args
)
33 self
.assertEqual(realresult
, result
)
34 self
.assert_(object is not realresult
)
37 if not sys
.platform
.startswith('java'):
38 # Test basic sanity of repr()
39 self
.assertEqual(repr(u
'abc'), "u'abc'")
40 self
.assertEqual(repr(u
'ab\\c'), "u'ab\\\\c'")
41 self
.assertEqual(repr(u
'ab\\'), "u'ab\\\\'")
42 self
.assertEqual(repr(u
'\\c'), "u'\\\\c'")
43 self
.assertEqual(repr(u
'\\'), "u'\\\\'")
44 self
.assertEqual(repr(u
'\n'), "u'\\n'")
45 self
.assertEqual(repr(u
'\r'), "u'\\r'")
46 self
.assertEqual(repr(u
'\t'), "u'\\t'")
47 self
.assertEqual(repr(u
'\b'), "u'\\x08'")
48 self
.assertEqual(repr(u
"'\""), """u'\\'"'""")
49 self
.assertEqual(repr(u
"'\""), """u'\\'"'""")
50 self
.assertEqual(repr(u
"'"), '''u"'"''')
51 self
.assertEqual(repr(u
'"'), """u'"'""")
53 "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
54 "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
55 "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
56 "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
57 "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
58 "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
59 "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
60 "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
61 "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
62 "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
63 "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
64 "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
65 "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
67 testrepr
= repr(u
''.join(map(unichr, xrange(256))))
68 self
.assertEqual(testrepr
, latin1repr
)
71 string_tests
.CommonTest
.test_count(self
)
72 # check mixed argument types
73 self
.checkequalnofix(3, 'aaa', 'count', u
'a')
74 self
.checkequalnofix(0, 'aaa', 'count', u
'b')
75 self
.checkequalnofix(3, u
'aaa', 'count', 'a')
76 self
.checkequalnofix(0, u
'aaa', 'count', 'b')
77 self
.checkequalnofix(0, u
'aaa', 'count', 'b')
78 self
.checkequalnofix(1, u
'aaa', 'count', 'a', -1)
79 self
.checkequalnofix(3, u
'aaa', 'count', 'a', -10)
80 self
.checkequalnofix(2, u
'aaa', 'count', 'a', 0, -1)
81 self
.checkequalnofix(0, u
'aaa', 'count', 'a', 0, -10)
84 self
.checkequalnofix(0, u
'abcdefghiabc', 'find', u
'abc')
85 self
.checkequalnofix(9, u
'abcdefghiabc', 'find', u
'abc', 1)
86 self
.checkequalnofix(-1, u
'abcdefghiabc', 'find', u
'def', 4)
88 self
.assertRaises(TypeError, u
'hello'.find
)
89 self
.assertRaises(TypeError, u
'hello'.find
, 42)
92 string_tests
.CommonTest
.test_rfind(self
)
93 # check mixed argument types
94 self
.checkequalnofix(9, 'abcdefghiabc', 'rfind', u
'abc')
95 self
.checkequalnofix(12, 'abcdefghiabc', 'rfind', u
'')
96 self
.checkequalnofix(12, u
'abcdefghiabc', 'rfind', '')
99 string_tests
.CommonTest
.test_index(self
)
100 # check mixed argument types
101 for (t1
, t2
) in ((str, unicode), (unicode, str)):
102 self
.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2(''))
103 self
.checkequalnofix(3, t1('abcdefghiabc'), 'index', t2('def'))
104 self
.checkequalnofix(0, t1('abcdefghiabc'), 'index', t2('abc'))
105 self
.checkequalnofix(9, t1('abcdefghiabc'), 'index', t2('abc'), 1)
106 self
.assertRaises(ValueError, t1('abcdefghiabc').index
, t2('hib'))
107 self
.assertRaises(ValueError, t1('abcdefghiab').index
, t2('abc'), 1)
108 self
.assertRaises(ValueError, t1('abcdefghi').index
, t2('ghi'), 8)
109 self
.assertRaises(ValueError, t1('abcdefghi').index
, t2('ghi'), -1)
def test_rindex(self):
    # Run the shared rindex checks, then repeat a set of lookups with the
    # haystack and the needle built from different string types.
    string_tests.CommonTest.test_rindex(self)

    # (expected index, haystack, needle, extra positional arguments)
    hits = (
        (12, 'abcdefghiabc', '', ()),
        (3, 'abcdefghiabc', 'def', ()),
        (9, 'abcdefghiabc', 'abc', ()),
        (0, 'abcdefghiabc', 'abc', (0, -1)),
    )
    # (haystack, needle, extra positional arguments) -- all must raise ValueError
    misses = (
        ('abcdefghiabc', 'hib', ()),
        ('defghiabc', 'def', (1,)),
        ('defghiabc', 'abc', (0, -1)),
        ('abcdefghi', 'ghi', (0, 8)),
        ('abcdefghi', 'ghi', (0, -1)),
    )

    # check mixed argument types
    for haystack_type, needle_type in ((str, unicode), (unicode, str)):
        for expected, haystack, needle, bounds in hits:
            self.checkequalnofix(expected, haystack_type(haystack), 'rindex',
                                 needle_type(needle), *bounds)

        for haystack, needle, bounds in misses:
            self.assertRaises(ValueError, haystack_type(haystack).rindex,
                              needle_type(needle), *bounds)
def test_translate(self):
    # Code points used as keys of the translation tables below.
    a, b, c, z = ord('a'), ord('b'), ord('c'), ord('z')

    # (expected result, input text, translation table)
    # A value of None deletes the character; an int or a unicode string
    # (of any length, including empty) replaces it.
    cases = (
        (u'bbbc', u'abababc', {a: None}),
        (u'iiic', u'abababc', {a: None, b: ord('i')}),
        (u'iiix', u'abababc', {a: None, b: ord('i'), c: u'x'}),
        (u'<i><i><i>c', u'abababc', {a: None, b: u'<i>'}),
        (u'c', u'abababc', {a: None, b: u''}),
        (u'xyyx', u'xzx', {z: u'yy'}),
    )
    for expected, text, table in cases:
        self.checkequalnofix(expected, text, 'translate', table)

    # The table argument is mandatory ...
    self.assertRaises(TypeError, u'hello'.translate)
    # ... and a byte-string replacement value is rejected.
    self.assertRaises(TypeError, u'abababc'.translate, {a: ''})
def test_split(self):
    # Shared split checks first, then calls that mix str and unicode
    # between the subject and the separator; the pieces must be unicode.
    string_tests.CommonTest.test_split(self)

    for subject, separator in ((u'a//b//c//d', '//'), ('a//b//c//d', u'//')):
        self.checkequalnofix([u'a', u'b', u'c', u'd'], subject, 'split', separator)

    # A separator at the very end leaves an empty trailing piece.
    self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
146 string_tests
.MixinStrUnicodeUserStringTest
.test_join(self
)
149 self
.checkequalnofix(u
'a b c d', u
' ', 'join', ['a', 'b', u
'c', u
'd'])
150 self
.checkequalnofix(u
'abcd', u
'', 'join', (u
'a', u
'b', u
'c', u
'd'))
151 self
.checkequalnofix(u
'w x y z', u
' ', 'join', string_tests
.Sequence('wxyz'))
152 self
.checkequalnofix(u
'a b c d', ' ', 'join', [u
'a', u
'b', u
'c', u
'd'])
153 self
.checkequalnofix(u
'a b c d', ' ', 'join', ['a', 'b', u
'c', u
'd'])
154 self
.checkequalnofix(u
'abcd', '', 'join', (u
'a', u
'b', u
'c', u
'd'))
155 self
.checkequalnofix(u
'w x y z', ' ', 'join', string_tests
.Sequence(u
'wxyz'))
def test_strip(self):
    # Shared strip checks, plus: a byte string that is not valid in the
    # default encoding must surface as a UnicodeError.
    string_tests.CommonTest.test_strip(self)

    strip_hello = u"hello".strip
    self.assertRaises(UnicodeError, strip_hello, "\xff")
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # A str subject with unicode arguments: the call is forwarded from the
    # str implementation, and the expected result is a unicode object.
    unicode_args = (u'!', u'@', 1)
    self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace',
                         *unicode_args)

    # A replacement value that is not a string at all must be rejected.
    str_replace = 'replace'.replace
    self.assertRaises(TypeError, str_replace, u"r", 42)
168 def test_comparison(self
):
170 self
.assertEqual(u
'abc', 'abc')
171 self
.assertEqual('abc', u
'abc')
172 self
.assertEqual(u
'abc', u
'abc')
173 self
.assert_(u
'abcd' > 'abc')
174 self
.assert_('abcd' > u
'abc')
175 self
.assert_(u
'abcd' > u
'abc')
176 self
.assert_(u
'abc' < 'abcd')
177 self
.assert_('abc' < u
'abcd')
178 self
.assert_(u
'abc' < u
'abcd')
181 # Move these tests to a Unicode collation module test...
182 # Testing UTF-16 code point order comparisons...
184 # No surrogates, no fixup required.
185 self
.assert_(u
'\u0061' < u
'\u20ac')
186 # Non surrogate below surrogate value, no fixup required
187 self
.assert_(u
'\u0061' < u
'\ud800\udc02')
189 # Non surrogate above surrogate value, fixup required
190 def test_lecmp(s
, s2
):
227 test_fixup(u
'\ue000')
228 test_fixup(u
'\uff61')
230 # Surrogates on both sides, no fixup required
231 self
.assert_(u
'\ud800\udc02' < u
'\ud84d\udc56')
def test_islower(self):
    # Shared islower checks, then one extra code point (U+1FFC) that must
    # not be reported as lowercase.
    shared = string_tests.MixinStrUnicodeUserStringTest
    shared.test_islower(self)

    sample = u'\u1FFc'
    self.checkequalnofix(False, sample, 'islower')
def test_isupper(self):
    # Shared isupper checks always run; the extra U+1FFC check is skipped
    # on platforms whose name starts with 'java'.
    shared = string_tests.MixinStrUnicodeUserStringTest
    shared.test_isupper(self)

    if sys.platform.startswith('java'):
        return
    self.checkequalnofix(False, u'\u1FFc', 'isupper')
def test_istitle(self):
    # Run the shared istitle() checks first.  The sibling overrides
    # (test_islower, test_isupper, test_isspace, ...) each delegate to the
    # base test of the same name; this one delegated to test_title(), so
    # the base istitle() checks were shadowed by this override and never
    # executed.
    # NOTE(review): assumes MixinStrUnicodeUserStringTest defines
    # test_istitle -- confirm against string_tests.
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)

    # U+1FFC must count as titlecase both on its own and as the first
    # letter of a word inside a longer title-cased phrase.
    self.checkequalnofix(True, u'\u1FFc', 'istitle')
    self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
def test_isspace(self):
    # Shared isspace checks, then a few non-ASCII code points.
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)

    # (expected answer, single character)
    samples = (
        (True, u'\u2000'),
        (True, u'\u200a'),
        (False, u'\u2014'),
    )
    for expected, char in samples:
        self.checkequalnofix(expected, char, 'isspace')
def test_isalpha(self):
    # Shared isalpha checks, then U+1FFC, which must count as alphabetic.
    shared = string_tests.MixinStrUnicodeUserStringTest
    shared.test_isalpha(self)

    sample = u'\u1FFc'
    self.checkequalnofix(True, sample, 'isalpha')
def test_isdecimal(self):
    # (expected answer, text) pairs for unicode.isdecimal()
    samples = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (False, u'\u2460'),  # CIRCLED DIGIT ONE
        (False, u'\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, u'\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in samples:
        self.checkequalnofix(expected, text, 'isdecimal')

    # Must raise TypeError; goes through the checkraises helper, which is
    # defined outside this part of the file.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)
def test_isdigit(self):
    # Shared isdigit checks, then a few non-ASCII code points.
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)

    # (expected answer, single character)
    samples = (
        (True, u'\u2460'),
        (False, u'\xbc'),
        (True, u'\u0660'),
    )
    for expected, char in samples:
        self.checkequalnofix(expected, char, 'isdigit')
def test_isnumeric(self):
    # (expected answer, text) pairs for unicode.isnumeric()
    samples = (
        (False, u''),
        (False, u'a'),
        (True, u'0'),
        (True, u'\u2460'),
        (True, u'\xbc'),
        (True, u'\u0660'),
        (True, u'0123456789'),
        (False, u'0123456789a'),
    )
    for expected, text in samples:
        self.checkequalnofix(expected, text, 'isnumeric')

    # isnumeric() takes no arguments.
    isnumeric = u"abc".isnumeric
    self.assertRaises(TypeError, isnumeric, 42)
def test_contains(self):
    # Testing Unicode contains method

    def verify(cases):
        # Each case is (should_be_found, needle, container); the checks run
        # in table order using the `in` / `not in` operators directly.
        for should_be_found, needle, container in cases:
            if should_be_found:
                self.assert_(needle in container)
            else:
                self.assert_(needle not in container)

    verify((
        (True, 'a', u'abdb'),
        (True, 'a', u'bdab'),
        (True, 'a', u'bdaba'),
        (True, 'a', u'bdba'),
        (True, 'a', u'bdba'),
        (True, u'a', u'bdba'),
        (False, u'a', u'bdb'),
        (False, u'a', 'bdb'),
        (True, u'a', 'bdba'),
        (True, u'a', ('a',1,None)),
        (True, u'a', (1,None,'a')),
        (True, u'a', (1,None,u'a')),
        (True, 'a', ('a',1,None)),
        (True, 'a', (1,None,'a')),
        (True, 'a', (1,None,u'a')),
        (False, 'a', ('x',1,u'y')),
        (False, 'a', ('x',1,None)),
        (False, u'abcd', u'abcxxxx'),
        (True, u'ab', u'abcd'),
        (True, 'ab', u'abc'),
        (True, u'ab', 'abc'),
        (True, u'ab', (1,None,u'ab')),
        (True, u'', u'abc'),
        (True, '', u'abc'),
    ))

    # If the following fails either
    # the contains operator does not propagate UnicodeErrors or
    # someone has changed the default encoding
    self.assertRaises(UnicodeError, 'g\xe2teau'.__contains__, u'\xe2')

    verify((
        (True, u'', ''),
        (True, '', u''),
        (True, u'', u''),
        (True, u'', 'abc'),
        (True, '', u'abc'),
        (True, u'', u'abc'),
        (False, u'\0', 'abc'),
        (False, '\0', u'abc'),
        (False, u'\0', u'abc'),
        (True, u'\0', '\0abc'),
        (True, '\0', u'\0abc'),
        (True, u'\0', u'\0abc'),
        (True, u'\0', 'abc\0'),
        (True, '\0', u'abc\0'),
        (True, u'\0', u'abc\0'),
        (True, u'a', '\0abc'),
        (True, 'a', u'\0abc'),
        (True, u'a', u'\0abc'),
        (True, u'asdf', 'asdf'),
        (True, 'asdf', u'asdf'),
        (True, u'asdf', u'asdf'),
        (False, u'asdf', 'asd'),
        (False, 'asdf', u'asd'),
        (False, u'asdf', u'asd'),
        (False, u'asdf', ''),
        (False, 'asdf', u''),
        (False, u'asdf', u''),
    ))

    # __contains__ requires exactly one argument.
    self.assertRaises(TypeError, u"abc".__contains__)
def test_formatting(self):
    # Shared %-formatting checks first, then unicode-specific ones.
    #
    # Several expected values below are padded by a field width.  Their
    # padding spaces had been collapsed to a single space, which made the
    # assertions wrong; they are restored here.  The rules relied on:
    # '%5.2f' pads to at least 5 characters, and '%*s' / '%*.*s' take the
    # width (negative = left-justify) and precision from the argument tuple.
    string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
    # Testing Unicode formatting strings...
    self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
    # '%5.2f' of 3 is ' 3.00' (one pad space), hence two spaces after the comma.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
    # '1003.57' is already wider than 5 characters, so no padding is added.
    self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
    if not sys.platform.startswith('java'):
        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
    self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
    self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')

    self.assertEqual(u'%c' % 0x1234, u'\u1234')
    # A code point one past sys.maxunicode cannot be formatted with %c.
    self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))

    # formatting jobs delegated from the string implementation:
    self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123}, u'...abc...')
    self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
    self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
    self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
    self.assertEqual('...%s...' % u"abc", u'...abc...')
    # Width 5 around a 3-character value: two pad spaces, right-justified.
    self.assertEqual('%*s' % (5,u'abc',), u'  abc')
    # Negative width left-justifies: the two pad spaces follow the value.
    self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
    # Precision 2 truncates to 'ab', leaving three pad spaces.
    self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
    self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
    # One literal space from the format plus two pad spaces.
    self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
    self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
    self.assertEqual('%c' % u'a', u'a')
385 def test_constructor(self
):
386 # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
389 unicode(u
'unicode remains unicode'),
390 u
'unicode remains unicode'
393 class UnicodeSubclass(unicode):
397 unicode(UnicodeSubclass('unicode subclass becomes unicode')),
398 u
'unicode subclass becomes unicode'
402 unicode('strings are converted to unicode'),
403 u
'strings are converted to unicode'
407 def __init__(self
, x
):
409 def __unicode__(self
):
413 unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
414 u
'__unicode__ compatible objects are recognized')
417 def __init__(self
, x
):
423 unicode(StringCompat('__str__ compatible objects are recognized')),
424 u
'__str__ compatible objects are recognized'
427 # unicode(obj) is compatible to str():
429 o
= StringCompat('unicode(obj) is compatible to str()')
430 self
.assertEqual(unicode(o
), u
'unicode(obj) is compatible to str()')
431 self
.assertEqual(str(o
), 'unicode(obj) is compatible to str()')
433 for obj
in (123, 123.45, 123L):
434 self
.assertEqual(unicode(obj
), unicode(str(obj
)))
436 # unicode(obj, encoding, error) tests (this maps to
437 # PyUnicode_FromEncodedObject() at C level)
439 if not sys
.platform
.startswith('java'):
443 u
'decoding unicode is not supported',
449 unicode('strings are decoded to unicode', 'utf-8', 'strict'),
450 u
'strings are decoded to unicode'
453 if not sys
.platform
.startswith('java'):
456 buffer('character buffers are decoded to unicode'),
460 u
'character buffers are decoded to unicode'
463 self
.assertRaises(TypeError, unicode, 42, 42, 42)
465 def test_codecs_utf7(self
):
467 (u
'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example
468 (u
'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example
469 (u
'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example
470 (u
'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
476 (ur
'\\?', '+AFwAXA?'),
477 (ur
'\\\?', '+AFwAXABc?'),
481 for (x
, y
) in utfTests
:
482 self
.assertEqual(x
.encode('utf-7'), y
)
484 # surrogates not supported
485 self
.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
487 self
.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u
'\ufffd')
489 def test_codecs_utf8(self
):
490 self
.assertEqual(u
''.encode('utf-8'), '')
491 self
.assertEqual(u
'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
492 self
.assertEqual(u
'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
493 self
.assertEqual(u
'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
494 self
.assertEqual(u
'\ud800'.encode('utf-8'), '\xed\xa0\x80')
495 self
.assertEqual(u
'\udc00'.encode('utf-8'), '\xed\xb0\x80')
497 (u
'\ud800\udc02'*1000).encode('utf-8'),
498 '\xf0\x90\x80\x82'*1000
501 u
'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
502 u
'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
503 u
'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
504 u
'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
505 u
'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
506 u
' Nunstuck git und'.encode('utf-8'),
507 '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
508 '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
509 '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
510 '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
511 '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
512 '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
513 '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
514 '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
515 '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
516 '\xe3\x80\x8cWenn ist das Nunstuck git und'
519 # UTF-8 specific decoding tests
520 self
.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u
'\U00023456' )
521 self
.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u
'\U00010002' )
522 self
.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u
'\u20ac' )
524 # Other possible utf-8 test cases:
525 # * strict decoding testing for all of the
526 # UTF8_ERROR cases in PyUnicode_DecodeUTF8
def test_codecs_idna(self):
    # The "idna" codec must keep a trailing dot on the host name.
    hostname = u"www.python.org."
    encoded = hostname.encode("idna")
    self.assertEqual(encoded, "www.python.org.")
532 def test_codecs_errors(self
):
533 # Error handling (encoding)
534 self
.assertRaises(UnicodeError, u
'Andr\202 x'.encode
, 'ascii')
535 self
.assertRaises(UnicodeError, u
'Andr\202 x'.encode
, 'ascii','strict')
536 self
.assertEqual(u
'Andr\202 x'.encode('ascii','ignore'), "Andr x")
537 self
.assertEqual(u
'Andr\202 x'.encode('ascii','replace'), "Andr? x")
539 # Error handling (decoding)
540 self
.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
541 self
.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
542 self
.assertEqual(unicode('Andr\202 x','ascii','ignore'), u
"Andr x")
543 self
.assertEqual(unicode('Andr\202 x','ascii','replace'), u
'Andr\uFFFD x')
545 # Error handling (unknown character names)
546 self
.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u
"xx")
548 # Error handling (truncated escape sequence)
549 self
.assertRaises(UnicodeError, "\\".decode
, "unicode-escape")
551 # Error handling (bad decoder return)
552 def search_function(encoding
):
553 def decode1(input, errors
="strict"):
554 return 42 # not a tuple
555 def encode1(input, errors
="strict"):
556 return 42 # not a tuple
557 def encode2(input, errors
="strict"):
558 return (42, 42) # no unicode
559 def decode2(input, errors
="strict"):
560 return (42, 42) # no unicode
561 if encoding
=="test.unicode1":
562 return (encode1
, decode1
, None, None)
563 elif encoding
=="test.unicode2":
564 return (encode2
, decode2
, None, None)
567 codecs
.register(search_function
)
568 self
.assertRaises(TypeError, "hello".decode
, "test.unicode1")
569 self
.assertRaises(TypeError, unicode, "hello", "test.unicode2")
570 self
.assertRaises(TypeError, u
"hello".encode
, "test.unicode1")
571 self
.assertRaises(TypeError, u
"hello".encode
, "test.unicode2")
572 # executes PyUnicode_Encode()
577 "non-existing module",
578 [u
"non-existing dir"]
581 # Error handling (wrong arguments)
582 self
.assertRaises(TypeError, u
"hello".encode
, 42, 42, 42)
584 # Error handling (PyUnicode_EncodeDecimal())
585 self
.assertRaises(UnicodeError, int, u
"\u0200")
def test_codecs(self):
    # Encoding: (codec name, expected bytes) for the text u'hello'
    encodings_of_hello = (
        ('ascii', 'hello'),
        ('utf-7', 'hello'),
        ('utf-8', 'hello'),
        ('utf8', 'hello'),
        ('utf-16-le', 'h\000e\000l\000l\000o\000'),
        ('utf-16-be', '\000h\000e\000l\000l\000o'),
        ('latin-1', 'hello'),
    )
    for codec_name, expected in encodings_of_hello:
        self.assertEqual(u'hello'.encode(codec_name), expected)

    def roundtrip(text, codec_names):
        # Encoding and then decoding with the same codec must give back
        # the text that went in.
        for codec_name in codec_names:
            self.assertEqual(unicode(text.encode(codec_name),codec_name), text)

    # Roundtrip safety for BMP (just the first 1024 chars)
    first_1024 = u''.join(map(unichr, xrange(1024)))
    roundtrip(first_1024,
              ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
               'raw_unicode_escape', 'unicode_escape', 'unicode_internal'))

    # Roundtrip safety for BMP (just the first 256 chars)
    first_256 = u''.join(map(unichr, xrange(256)))
    roundtrip(first_256, ('latin-1',))

    # Roundtrip safety for BMP (just the first 128 chars)
    first_128 = u''.join(map(unichr, xrange(128)))
    roundtrip(first_128, ('ascii',))

    # Roundtrip safety for non-BMP (just a few chars)
    astral = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
    roundtrip(astral,
              ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
               #'raw_unicode_escape',
               'unicode_escape', 'unicode_internal'))

    # UTF-8 must be roundtrip safe for all UCS-2 code points
    # This excludes surrogates: in the full range, there would be
    # a surrogate pair (\udbff\udc00), which gets converted back
    # to a non-BMP character (\U0010fc00)
    ucs2_without_surrogates = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
    roundtrip(ucs2_without_surrogates, ('utf-8',))
628 def test_codecs_charmap(self
):
630 s
= ''.join(map(chr, xrange(128)))
633 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
634 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
635 'cp863', 'cp865', 'cp866',
636 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
637 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
638 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
639 'mac_cyrillic', 'mac_latin2',
641 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
642 'cp1256', 'cp1257', 'cp1258',
643 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
645 'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
646 'cp1006', 'iso8859_8',
648 ### These have undefined mappings:
651 ### These fail the round-trip:
655 self
.assertEqual(unicode(s
, encoding
).encode(encoding
), s
)
658 s
= ''.join(map(chr, xrange(128, 256)))
661 'cp437', 'cp500', 'cp737', 'cp775', 'cp850',
662 'cp852', 'cp855', 'cp860', 'cp861', 'cp862',
663 'cp863', 'cp865', 'cp866',
664 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
665 'iso8859_2', 'iso8859_4', 'iso8859_5',
666 'iso8859_9', 'koi8_r', 'latin_1',
667 'mac_cyrillic', 'mac_latin2',
669 ### These have undefined mappings:
670 #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
671 #'cp1256', 'cp1257', 'cp1258',
672 #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
673 #'iso8859_3', 'iso8859_6', 'iso8859_7',
674 #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
676 ### These fail the round-trip:
677 #'cp1006', 'cp875', 'iso8859_8',
680 self
.assertEqual(unicode(s
, encoding
).encode(encoding
), s
)
def test_concatenation(self):
    # Adjacent string literals are joined by the compiler; as soon as one
    # of them is a unicode literal the joined value must equal the unicode
    # text.  Each pair is (adjacent literals, expected value).
    literal_pairs = (
        ((u"abc" u"def"), u"abcdef"),
        (("abc" u"def"), u"abcdef"),
        ((u"abc" "def"), u"abcdef"),
        ((u"abc" u"def" "ghi"), u"abcdefghi"),
        (("abc" "def" u"ghi"), u"abcdefghi"),
    )
    for joined, expected in literal_pairs:
        self.assertEqual(joined, expected)
689 def test_printing(self
):
691 def write(self
, text
):
696 print >>out
, u
'abc', u
'def'
697 print >>out
, u
'abc', 'def'
698 print >>out
, 'abc', u
'def'
699 print >>out
, u
'abc\n'
700 print >>out
, u
'abc\n',
701 print >>out
, u
'abc\n',
702 print >>out
, u
'def\n'
703 print >>out
, u
'def\n'
706 if sys
.maxunicode
== 0xFFFF:
709 y
= x
.encode("raw-unicode-escape").decode("raw-unicode-escape")
710 self
.assertEqual(x
, y
)
713 test_support
.run_unittest(UnicodeTest
)
715 if __name__
== "__main__":