This commit was manufactured by cvs2svn to create tag 'r23b1-mac'.
[python/dscho.git] / Lib / test / test_re.py
blob2430790301315fc00bce16490506a47eaca72b0d
1 import sys
2 sys.path = ['.'] + sys.path
4 from test.test_support import verbose, run_suite
5 import re
6 from sre import Scanner
7 import sys, os, traceback
9 # Misc tests from Tim Peters' re.doc
11 import unittest
13 class ReTests(unittest.TestCase):
14 def test_search_star_plus(self):
15 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
16 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
17 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
18 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
19 self.assertEqual(re.search('x', 'aaa'), None)
20 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
21 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
22 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
23 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
24 self.assertEqual(re.match('a+', 'xxx'), None)
26 def bump_num(self, matchobj):
27 int_value = int(matchobj.group(0))
28 return str(int_value + 1)
30 def test_basic_re_sub(self):
31 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
32 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
33 '9.3 -3 24x100y')
34 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
35 '9.3 -3 23x99y')
37 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
38 self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
40 s = r"\1\1"
41 self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
42 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
43 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
45 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
46 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
47 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
48 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
50 self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
51 '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
52 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
53 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
54 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
56 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
58 def test_bug_449964(self):
59 # fails for group followed by other escape
60 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
61 'xx\bxx\b')
63 def test_bug_449000(self):
64 # Test for sub() on escaped characters
65 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
66 'abc\ndef\n')
67 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
68 'abc\ndef\n')
69 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
70 'abc\ndef\n')
71 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
72 'abc\ndef\n')
74 def test_qualified_re_sub(self):
75 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
76 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
78 def test_bug_114660(self):
79 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
80 'hello there')
82 def test_bug_462270(self):
83 # Test for empty sub() behaviour, see SF bug #462270
84 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
85 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
87 def test_symbolic_refs(self):
88 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
89 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
90 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
91 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
92 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
93 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
94 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
95 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
97 def test_re_subn(self):
98 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
99 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
100 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
101 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
102 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
104 def test_re_split(self):
105 self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
106 self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
107 self.assertEqual(re.split("(:*)", ":a:b::c"),
108 ['', ':', 'a', ':', 'b', '::', 'c'])
109 self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
110 self.assertEqual(re.split("(:)*", ":a:b::c"),
111 ['', ':', 'a', ':', 'b', ':', 'c'])
112 self.assertEqual(re.split("([b:]+)", ":a:b::c"),
113 ['', ':', 'a', ':b::', 'c'])
114 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
115 ['', None, ':', 'a', None, ':', '', 'b', None, '',
116 None, '::', 'c'])
117 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
118 ['', 'a', '', '', 'c'])
120 def test_qualified_re_split(self):
121 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
122 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
123 self.assertEqual(re.split("(:)", ":a:b::c", 2),
124 ['', ':', 'a', ':', 'b::c'])
125 self.assertEqual(re.split("(:*)", ":a:b::c", 2),
126 ['', ':', 'a', ':', 'b::c'])
128 def test_re_findall(self):
129 self.assertEqual(re.findall(":+", "abc"), [])
130 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
131 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
132 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
133 (":", ":"),
134 (":", "::")])
136 def test_bug_117612(self):
137 self.assertEqual(re.findall(r"(a|(b))", "aba"),
138 [("a", ""),("b", "b"),("a", "")])
140 def test_re_match(self):
141 self.assertEqual(re.match('a', 'a').groups(), ())
142 self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
143 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
144 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
145 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
147 pat = re.compile('((a)|(b))(c)?')
148 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
149 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
150 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
151 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
152 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
154 # A single group
155 m = re.match('(a)', 'a')
156 self.assertEqual(m.group(0), 'a')
157 self.assertEqual(m.group(0), 'a')
158 self.assertEqual(m.group(1), 'a')
159 self.assertEqual(m.group(1, 1), ('a', 'a'))
161 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
162 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
163 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
164 (None, 'b', None))
165 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
167 def test_re_escape(self):
168 p=""
169 for i in range(0, 256):
170 p = p + chr(i)
171 self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
172 True)
173 self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
175 pat=re.compile(re.escape(p))
176 self.assertEqual(pat.match(p) is not None, True)
177 self.assertEqual(pat.match(p).span(), (0,256))
179 def test_pickling(self):
180 import pickle
181 self.pickle_test(pickle)
182 import cPickle
183 self.pickle_test(cPickle)
185 def pickle_test(self, pickle):
186 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
187 s = pickle.dumps(oldpat)
188 newpat = pickle.loads(s)
189 self.assertEqual(oldpat, newpat)
191 def test_constants(self):
192 self.assertEqual(re.I, re.IGNORECASE)
193 self.assertEqual(re.L, re.LOCALE)
194 self.assertEqual(re.M, re.MULTILINE)
195 self.assertEqual(re.S, re.DOTALL)
196 self.assertEqual(re.X, re.VERBOSE)
198 def test_flags(self):
199 for flag in [re.I, re.M, re.X, re.S, re.L]:
200 self.assertNotEqual(re.compile('^pattern$', flag), None)
202 def test_sre_character_literals(self):
203 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
204 self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None)
205 self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None)
206 self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None)
207 self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
208 self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
209 self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
210 self.assertRaises(re.error, re.match, "\911", "")
212 def test_bug_113254(self):
213 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
214 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
215 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
217 def test_bug_527371(self):
218 # bug described in patches 527371/672491
219 self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
220 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
221 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
222 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
223 self.assertEqual(re.match("((a))", "a").lastindex, 1)
225 def test_bug_545855(self):
226 # bug 545855 -- This pattern failed to cause a compile error as it
227 # should, instead provoking a TypeError.
228 self.assertRaises(re.error, re.compile, 'foo[a-')
230 def test_bug_418626(self):
231 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
232 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
233 # pattern '*?' on a long string.
234 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
235 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
236 20003)
237 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
238 # non-simple '*?' still recurses and hits the recursion limit
239 self.assertRaises(RuntimeError, re.search, '(a|b)*?c', 10000*'ab'+'cd')
241 def test_bug_612074(self):
242 pat=u"["+re.escape(u"\u2039")+u"]"
243 self.assertEqual(re.compile(pat) and 1, 1)
245 def test_stack_overflow(self):
246 # nasty case that overflows the straightforward recursive
247 # implementation of repeated groups.
248 self.assertRaises(RuntimeError, re.match, '(x)*', 50000*'x')
249 self.assertRaises(RuntimeError, re.match, '(x)*y', 50000*'x'+'y')
250 self.assertRaises(RuntimeError, re.match, '(x)*?y', 50000*'x'+'y')
252 def test_scanner(self):
253 def s_ident(scanner, token): return token
254 def s_operator(scanner, token): return "op%s" % token
255 def s_float(scanner, token): return float(token)
256 def s_int(scanner, token): return int(token)
258 scanner = Scanner([
259 (r"[a-zA-Z_]\w*", s_ident),
260 (r"\d+\.\d*", s_float),
261 (r"\d+", s_int),
262 (r"=|\+|-|\*|/", s_operator),
263 (r"\s+", None),
266 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
267 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
268 'op+', 'bar'], ''))
270 def test_bug_448951(self):
271 # bug 448951 (similar to 429357, but with single char match)
272 # (Also test greedy matches.)
273 for op in '','?','*':
274 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
275 (None, None))
276 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
277 ('a:', 'a'))
279 def test_finditer(self):
280 iter = re.finditer(r":+", "a:b::c:::d")
281 self.assertEqual([item.group(0) for item in iter],
282 [":", "::", ":::"])
284 def run_re_tests():
285 from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR
286 if verbose:
287 print 'Running re_tests test suite'
288 else:
289 # To save time, only run the first and last 10 tests
290 #tests = tests[:10] + tests[-10:]
291 pass
293 for t in tests:
294 sys.stdout.flush()
295 pattern = s = outcome = repl = expected = None
296 if len(t) == 5:
297 pattern, s, outcome, repl, expected = t
298 elif len(t) == 3:
299 pattern, s, outcome = t
300 else:
301 raise ValueError, ('Test tuples should have 3 or 5 fields', t)
303 try:
304 obj = re.compile(pattern)
305 except re.error:
306 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
307 else:
308 print '=== Syntax error:', t
309 except KeyboardInterrupt: raise KeyboardInterrupt
310 except:
311 print '*** Unexpected error ***', t
312 if verbose:
313 traceback.print_exc(file=sys.stdout)
314 else:
315 try:
316 result = obj.search(s)
317 except re.error, msg:
318 print '=== Unexpected exception', t, repr(msg)
319 if outcome == SYNTAX_ERROR:
320 # This should have been a syntax error; forget it.
321 pass
322 elif outcome == FAIL:
323 if result is None: pass # No match, as expected
324 else: print '=== Succeeded incorrectly', t
325 elif outcome == SUCCEED:
326 if result is not None:
327 # Matched, as expected, so now we compute the
328 # result string and compare it to our expected result.
329 start, end = result.span(0)
330 vardict={'found': result.group(0),
331 'groups': result.group(),
332 'flags': result.re.flags}
333 for i in range(1, 100):
334 try:
335 gi = result.group(i)
336 # Special hack because else the string concat fails:
337 if gi is None:
338 gi = "None"
339 except IndexError:
340 gi = "Error"
341 vardict['g%d' % i] = gi
342 for i in result.re.groupindex.keys():
343 try:
344 gi = result.group(i)
345 if gi is None:
346 gi = "None"
347 except IndexError:
348 gi = "Error"
349 vardict[i] = gi
350 repl = eval(repl, vardict)
351 if repl != expected:
352 print '=== grouping error', t,
353 print repr(repl) + ' should be ' + repr(expected)
354 else:
355 print '=== Failed incorrectly', t
357 # Try the match on a unicode string, and check that it
358 # still succeeds.
359 try:
360 result = obj.search(unicode(s, "latin-1"))
361 if result is None:
362 print '=== Fails on unicode match', t
363 except NameError:
364 continue # 1.5.2
365 except TypeError:
366 continue # unicode test case
368 # Try the match on a unicode pattern, and check that it
369 # still succeeds.
370 obj=re.compile(unicode(pattern, "latin-1"))
371 result = obj.search(s)
372 if result is None:
373 print '=== Fails on unicode pattern match', t
375 # Try the match with the search area limited to the extent
376 # of the match and see if it still succeeds. \B will
377 # break (because it won't match at the end or start of a
378 # string), so we'll ignore patterns that feature it.
380 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
381 and result is not None:
382 obj = re.compile(pattern)
383 result = obj.search(s, result.start(0), result.end(0) + 1)
384 if result is None:
385 print '=== Failed on range-limited match', t
387 # Try the match with IGNORECASE enabled, and check that it
388 # still succeeds.
389 obj = re.compile(pattern, re.IGNORECASE)
390 result = obj.search(s)
391 if result is None:
392 print '=== Fails on case-insensitive match', t
394 # Try the match with LOCALE enabled, and check that it
395 # still succeeds.
396 obj = re.compile(pattern, re.LOCALE)
397 result = obj.search(s)
398 if result is None:
399 print '=== Fails on locale-sensitive match', t
401 # Try the match with UNICODE locale enabled, and check
402 # that it still succeeds.
403 obj = re.compile(pattern, re.UNICODE)
404 result = obj.search(s)
405 if result is None:
406 print '=== Fails on unicode-sensitive match', t
def test_main():
    """Run the ReTests unit tests, then the table-driven re_tests cases."""
    unit_tests = unittest.makeSuite(ReTests)
    run_suite(unittest.TestSuite([unit_tests]))
    run_re_tests()


# Allow running this test module directly as a script.
if __name__ == "__main__":
    test_main()