1 import test
.test_support
, unittest
2 import sys
, codecs
, htmlentitydefs
, unicodedata
4 class CodecCallbackTest(unittest
.TestCase
):
6 def test_xmlcharrefreplace(self
):
7 # replace unencodable characters which numeric character entities.
8 # For ascii, latin-1 and charmaps this is completely implemented
9 # in C and should be reasonably fast.
10 s
= u
"\u30b9\u30d1\u30e2 \xe4nd eggs"
12 s
.encode("ascii", "xmlcharrefreplace"),
13 "スパモ änd eggs"
16 s
.encode("latin-1", "xmlcharrefreplace"),
17 "スパモ \xe4nd eggs"
20 def test_xmlcharnamereplace(self
):
21 # This time use a named character entity for unencodable
22 # characters, if one is available.
24 for (key
, value
) in htmlentitydefs
.entitydefs
.items():
26 names
[unicode(value
, "latin-1")] = unicode(key
, "latin-1")
28 names
[unichr(int(value
[2:-1]))] = unicode(key
, "latin-1")
30 def xmlcharnamereplace(exc
):
31 if not isinstance(exc
, UnicodeEncodeError):
32 raise TypeError("don't know how to handle %r" % exc
)
34 for c
in exc
.object[exc
.start
:exc
.end
]:
36 l
.append(u
"&%s;" % names
[c
])
38 l
.append(u
"&#%d;" % ord(c
))
39 return (u
"".join(l
), exc
.end
)
41 codecs
.register_error(
42 "test.xmlcharnamereplace", xmlcharnamereplace
)
44 sin
= u
"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
45 sout
= "«ℜ» = ⟨ሴ€⟩"
46 self
.assertEqual(sin
.encode("ascii", "test.xmlcharnamereplace"), sout
)
47 sout
= "\xabℜ\xbb = ⟨ሴ€⟩"
48 self
.assertEqual(sin
.encode("latin-1", "test.xmlcharnamereplace"), sout
)
49 sout
= "\xabℜ\xbb = ⟨ሴ\xa4⟩"
50 self
.assertEqual(sin
.encode("iso-8859-15", "test.xmlcharnamereplace"), sout
)
52 def test_uninamereplace(self
):
53 # We're using the names from the unicode database this time,
54 # and we're doing "systax highlighting" here, i.e. we include
55 # the replaced text in ANSI escape sequences. For this it is
56 # useful that the error handler is not called for every single
57 # unencodable character, but for a complete sequence of
58 # unencodable characters, otherwise we would output many
59 # unneccessary escape sequences.
61 def uninamereplace(exc
):
62 if not isinstance(exc
, UnicodeEncodeError):
63 raise TypeError("don't know how to handle %r" % exc
)
65 for c
in exc
.object[exc
.start
:exc
.end
]:
66 l
.append(unicodedata
.name(c
, u
"0x%x" % ord(c
)))
67 return (u
"\033[1m%s\033[0m" % u
", ".join(l
), exc
.end
)
69 codecs
.register_error(
70 "test.uninamereplace", uninamereplace
)
72 sin
= u
"\xac\u1234\u20ac\u8000"
73 sout
= "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m"
74 self
.assertEqual(sin
.encode("ascii", "test.uninamereplace"), sout
)
76 sout
= "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m"
77 self
.assertEqual(sin
.encode("latin-1", "test.uninamereplace"), sout
)
79 sout
= "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1m0x8000\033[0m"
80 self
.assertEqual(sin
.encode("iso-8859-15", "test.uninamereplace"), sout
)
82 def test_backslashescape(self
):
83 # Does the same as the "unicode-escape" encoding, but with different
85 sin
= u
"a\xac\u1234\u20ac\u8000"
86 if sys
.maxunicode
> 0xffff:
87 sin
+= unichr(sys
.maxunicode
)
88 sout
= "a\\xac\\u1234\\u20ac\\u8000"
89 if sys
.maxunicode
> 0xffff:
90 sout
+= "\\U%08x" % sys
.maxunicode
91 self
.assertEqual(sin
.encode("ascii", "backslashreplace"), sout
)
93 sout
= "a\xac\\u1234\\u20ac\\u8000"
94 if sys
.maxunicode
> 0xffff:
95 sout
+= "\\U%08x" % sys
.maxunicode
96 self
.assertEqual(sin
.encode("latin-1", "backslashreplace"), sout
)
98 sout
= "a\xac\\u1234\xa4\\u8000"
99 if sys
.maxunicode
> 0xffff:
100 sout
+= "\\U%08x" % sys
.maxunicode
101 self
.assertEqual(sin
.encode("iso-8859-15", "backslashreplace"), sout
)
103 def test_relaxedutf8(self
):
104 # This is the test for a decoding callback handler,
105 # that relaxes the UTF-8 minimal encoding restriction.
106 # A null byte that is encoded as "\xc0\x80" will be
107 # decoded as a null byte. All other illegal sequences
108 # will be handled strictly.
109 def relaxedutf8(exc
):
110 if not isinstance(exc
, UnicodeDecodeError):
111 raise TypeError("don't know how to handle %r" % exc
)
112 if exc
.object[exc
.start
:exc
.end
].startswith("\xc0\x80"):
113 return (u
"\x00", exc
.start
+2) # retry after two bytes
117 codecs
.register_error(
118 "test.relaxedutf8", relaxedutf8
)
120 sin
= "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
121 sout
= u
"a\x00b\x00c\xfc\x00\x00"
122 self
.assertEqual(sin
.decode("utf-8", "test.relaxedutf8"), sout
)
123 sin
= "\xc0\x80\xc0\x81"
124 self
.assertRaises(UnicodeError, sin
.decode
, "utf-8", "test.relaxedutf8")
126 def test_charmapencode(self
):
127 # For charmap encodings the replacement string will be
128 # mapped through the encoding again. This means, that
129 # to be able to use e.g. the "replace" handler, the
130 # charmap has to have a mapping for "?".
131 charmap
= dict([ (ord(c
), 2*c
.upper()) for c
in "abcdefgh"])
134 self
.assertEquals(codecs
.charmap_encode(sin
, "strict", charmap
)[0], sout
)
137 self
.assertRaises(UnicodeError, codecs
.charmap_encode
, sin
, "strict", charmap
)
139 charmap
[ord("?")] = "XYZ"
141 sout
= "AABBCCXYZXYZXYZ"
142 self
.assertEquals(codecs
.charmap_encode(sin
, "replace", charmap
)[0], sout
)
144 charmap
[ord("?")] = u
"XYZ"
145 self
.assertRaises(TypeError, codecs
.charmap_encode
, sin
, "replace", charmap
)
147 charmap
[ord("?")] = u
"XYZ"
148 self
.assertRaises(TypeError, codecs
.charmap_encode
, sin
, "replace", charmap
)
150 def test_callbacks(self
):
152 if not isinstance(exc
, UnicodeEncodeError) \
153 and not isinstance(exc
, UnicodeDecodeError):
154 raise TypeError("don't know how to handle %r" % exc
)
155 l
= [u
"<%d>" % ord(exc
.object[pos
]) for pos
in xrange(exc
.start
, exc
.end
)]
156 return (u
"[%s]" % u
"".join(l
), exc
.end
)
158 codecs
.register_error("test.handler1", handler1
)
161 if not isinstance(exc
, UnicodeDecodeError):
162 raise TypeError("don't know how to handle %r" % exc
)
163 l
= [u
"<%d>" % ord(exc
.object[pos
]) for pos
in xrange(exc
.start
, exc
.end
)]
164 return (u
"[%s]" % u
"".join(l
), exc
.end
+1) # skip one character
166 codecs
.register_error("test.handler2", handler2
)
168 s
= "\x00\x81\x7f\x80\xff"
171 s
.decode("ascii", "test.handler1"),
172 u
"\x00[<129>]\x7f[<128>][<255>]"
175 s
.decode("ascii", "test.handler2"),
176 u
"\x00[<129>][<128>]"
180 "\\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
181 u
"\u3042[<92><117><51><120>]xx"
185 "\\u3042\u3xx".decode("unicode-escape", "test.handler1"),
186 u
"\u3042[<92><117><51><120><120>]"
190 codecs
.charmap_decode("abc", "test.handler1", {ord("a"): u
"z"})[0],
195 u
"g\xfc\xdfrk".encode("ascii", "test.handler1"),
200 u
"g\xfc\xdf".encode("ascii", "test.handler1"),
204 def test_longstrings(self
):
205 # test long strings to check for memory overflow problems
206 errors
= [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
207 # register the handlers under different names,
208 # to prevent the codec from recognizing the name
210 codecs
.register_error("test." + err
, codecs
.lookup_error(err
))
212 errors
+= [ "test." + err
for err
in errors
]
213 for uni
in [ s
*l
for s
in (u
"x", u
"\u3042", u
"a\xe4") ]:
214 for enc
in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
221 def check_exceptionobjectargs(self
, exctype
, args
, msg
):
222 # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
223 # check with one missing argument
224 self
.assertRaises(TypeError, exctype
, *args
[:-1])
225 # check with one missing argument
226 self
.assertRaises(TypeError, exctype
, *(args
+ ["too much"]))
227 # check with one argument of the wrong type
228 wrongargs
= [ "spam", u
"eggs", 42, 1.0, None ]
229 for i
in xrange(len(args
)):
230 for wrongarg
in wrongargs
:
231 if type(wrongarg
) is type(args
[i
]):
233 # build argument array
235 for j
in xrange(len(args
)):
237 callargs
.append(wrongarg
)
239 callargs
.append(args
[i
])
240 self
.assertRaises(TypeError, exctype
, *callargs
)
242 self
.assertEquals(str(exc
), msg
)
244 def test_unicodeencodeerror(self
):
245 self
.check_exceptionobjectargs(
247 ["ascii", u
"g\xfcrk", 1, 2, "ouch"],
248 "'ascii' codec can't encode character '\ufc' in position 1: ouch"
250 self
.check_exceptionobjectargs(
252 ["ascii", u
"g\xfcrk", 1, 4, "ouch"],
253 "'ascii' codec can't encode characters in position 1-3: ouch"
255 self
.check_exceptionobjectargs(
257 ["ascii", u
"\xfcx", 0, 1, "ouch"],
258 "'ascii' codec can't encode character '\ufc' in position 0: ouch"
261 def test_unicodedecodeerror(self
):
262 self
.check_exceptionobjectargs(
264 ["ascii", "g\xfcrk", 1, 2, "ouch"],
265 "'ascii' codec can't decode byte 0xfc in position 1: ouch"
267 self
.check_exceptionobjectargs(
269 ["ascii", "g\xfcrk", 1, 3, "ouch"],
270 "'ascii' codec can't decode bytes in position 1-2: ouch"
273 def test_unicodetranslateerror(self
):
274 self
.check_exceptionobjectargs(
275 UnicodeTranslateError,
276 [u
"g\xfcrk", 1, 2, "ouch"],
277 "can't translate character '\\ufc' in position 1: ouch"
279 self
.check_exceptionobjectargs(
280 UnicodeTranslateError,
281 [u
"g\xfcrk", 1, 3, "ouch"],
282 "can't translate characters in position 1-2: ouch"
285 def test_badandgoodstrictexceptions(self
):
288 codecs
.strict_errors
,
293 codecs
.strict_errors
,
299 codecs
.strict_errors
,
300 UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")
303 def test_badandgoodignoreexceptions(self
):
306 codecs
.ignore_errors
,
311 codecs
.ignore_errors
,
315 codecs
.ignore_errors(UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")),
319 codecs
.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
323 codecs
.ignore_errors(UnicodeTranslateError(u
"\u3042", 0, 1, "ouch")),
327 def test_badandgoodreplaceexceptions(self
):
330 codecs
.replace_errors
,
335 codecs
.replace_errors
,
339 codecs
.replace_errors(UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")),
343 codecs
.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
347 codecs
.replace_errors(UnicodeTranslateError(u
"\u3042", 0, 1, "ouch")),
351 def test_badandgoodxmlcharrefreplaceexceptions(self
):
354 codecs
.xmlcharrefreplace_errors
,
359 codecs
.xmlcharrefreplace_errors
,
363 codecs
.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")),
364 (u
"&#%d;" % 0x3042, 1)
368 codecs
.xmlcharrefreplace_errors
,
373 codecs
.xmlcharrefreplace_errors
,
374 UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
378 codecs
.xmlcharrefreplace_errors
,
379 UnicodeTranslateError(u
"\u3042", 0, 1, "ouch")
382 def test_badandgoodbackslashreplaceexceptions(self
):
385 codecs
.backslashreplace_errors
,
390 codecs
.backslashreplace_errors
,
394 codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\u3042", 0, 1, "ouch")),
398 codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\x00", 0, 1, "ouch")),
402 codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\xff", 0, 1, "ouch")),
406 codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\u0100", 0, 1, "ouch")),
410 codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\uffff", 0, 1, "ouch")),
413 if sys
.maxunicode
>0xffff:
415 codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\U00010000", 0, 1, "ouch")),
419 codecs
.backslashreplace_errors(UnicodeEncodeError("ascii", u
"\U0010ffff", 0, 1, "ouch")),
425 codecs
.backslashreplace_errors
,
430 codecs
.backslashreplace_errors
,
431 UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
435 codecs
.backslashreplace_errors
,
436 UnicodeTranslateError(u
"\u3042", 0, 1, "ouch")
439 def test_badhandlerresults(self
):
440 results
= ( 42, u
"foo", (1,2,3), (u
"foo", 1, 3), (u
"foo", None), (u
"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
441 encs
= ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
444 codecs
.register_error("test.badhandler", lambda: res
)
452 for (enc
, bytes
) in (
464 def test_lookup(self
):
465 self
.assertEquals(codecs
.strict_errors
, codecs
.lookup_error("strict"))
466 self
.assertEquals(codecs
.ignore_errors
, codecs
.lookup_error("ignore"))
467 self
.assertEquals(codecs
.strict_errors
, codecs
.lookup_error("strict"))
469 codecs
.xmlcharrefreplace_errors
,
470 codecs
.lookup_error("xmlcharrefreplace")
473 codecs
.backslashreplace_errors
,
474 codecs
.lookup_error("backslashreplace")
477 def test_unencodablereplacement(self
):
479 if isinstance(exc
, UnicodeEncodeError):
480 return (u
"\u4242", exc
.end
)
482 raise TypeError("don't know how to handle %r" % exc
)
483 codecs
.register_error("test.unencreplhandler", unencrepl
)
484 for enc
in ("ascii", "iso-8859-1", "iso-8859-15"):
489 "test.unencreplhandler"
def test_main():
    """Run all CodecCallbackTest cases through the regression-test runner."""
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(CodecCallbackTest))
    test.test_support.run_suite(suite)
# Allow the module to be run directly as a script.
if __name__ == "__main__":
    test_main()