2 # Secret Labs' Regular Expression Engine
4 # convert re-style regular expression to sre pattern
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8 # See the sre.py file for information on usage and redistribution.
11 # XXX: show string offset and offending character for all errors
13 # this module works under 1.5.2 and later. don't use string methods
16 from sre_constants
import *
18 SPECIAL_CHARS
= ".\\[{()*+?^$|"
21 DIGITS
= tuple("0123456789")
23 OCTDIGITS
= tuple("01234567")
24 HEXDIGITS
= tuple("0123456789abcdefABCDEF")
26 WHITESPACE
= tuple(" \t\n\r\v\f")
29 r
"\a": (LITERAL
, ord("\a")),
30 r
"\b": (LITERAL
, ord("\b")),
31 r
"\f": (LITERAL
, ord("\f")),
32 r
"\n": (LITERAL
, ord("\n")),
33 r
"\r": (LITERAL
, ord("\r")),
34 r
"\t": (LITERAL
, ord("\t")),
35 r
"\v": (LITERAL
, ord("\v")),
36 r
"\\": (LITERAL
, ord("\\"))
40 r
"\A": (AT
, AT_BEGINNING_STRING
), # start of string
41 r
"\b": (AT
, AT_BOUNDARY
),
42 r
"\B": (AT
, AT_NON_BOUNDARY
),
43 r
"\d": (IN
, [(CATEGORY
, CATEGORY_DIGIT
)]),
44 r
"\D": (IN
, [(CATEGORY
, CATEGORY_NOT_DIGIT
)]),
45 r
"\s": (IN
, [(CATEGORY
, CATEGORY_SPACE
)]),
46 r
"\S": (IN
, [(CATEGORY
, CATEGORY_NOT_SPACE
)]),
47 r
"\w": (IN
, [(CATEGORY
, CATEGORY_WORD
)]),
48 r
"\W": (IN
, [(CATEGORY
, CATEGORY_NOT_WORD
)]),
49 r
"\Z": (AT
, AT_END_STRING
), # end of string
54 "i": SRE_FLAG_IGNORECASE
,
56 "m": SRE_FLAG_MULTILINE
,
58 "x": SRE_FLAG_VERBOSE
,
60 "t": SRE_FLAG_TEMPLATE
,
61 "u": SRE_FLAG_UNICODE
,
64 # figure out best way to convert hex/octal numbers to integers
67 atoi
= int # 2.0 and later
69 atoi
= string
.atoi
# 1.5.2
72 # master pattern object. keeps track of global attributes
78 def opengroup(self
, name
=None):
82 self
.groupdict
[name
] = gid
85 def closegroup(self
, gid
):
87 def checkgroup(self
, gid
):
88 return gid
< self
.groups
and gid
not in self
.open
91 # a subpattern, in intermediate form
92 def __init__(self
, pattern
, data
=None):
93 self
.pattern
= pattern
98 def dump(self
, level
=0):
100 for op
, av
in self
.data
:
101 print level
*" " + op
,; nl
= 0
106 print (level
+1)*" " + op
, a
112 print level
*" " + "or"
113 a
.dump(level
+1); nl
= 1
115 elif type(av
) in (type(()), type([])):
117 if isinstance(a
, SubPattern
):
119 a
.dump(level
+1); nl
= 1
126 return repr(self
.data
)
128 return len(self
.data
)
129 def __delitem__(self
, index
):
131 def __getitem__(self
, index
):
132 return self
.data
[index
]
133 def __setitem__(self
, index
, code
):
134 self
.data
[index
] = code
135 def __getslice__(self
, start
, stop
):
136 return SubPattern(self
.pattern
, self
.data
[start
:stop
])
137 def insert(self
, index
, code
):
138 self
.data
.insert(index
, code
)
139 def append(self
, code
):
140 self
.data
.append(code
)
142 # determine the width (min, max) for this subpattern
146 for op
, av
in self
.data
:
160 elif op
is SUBPATTERN
:
161 i
, j
= av
[1].getwidth()
164 elif op
in (MIN_REPEAT
, MAX_REPEAT
):
165 i
, j
= av
[2].getwidth()
166 lo
= lo
+ long(i
) * av
[0]
167 hi
= hi
+ long(j
) * av
[1]
168 elif op
in (ANY
, RANGE
, IN
, LITERAL
, NOT_LITERAL
, CATEGORY
):
173 self
.width
= int(min(lo
, sys
.maxint
)), int(min(hi
, sys
.maxint
))
177 def __init__(self
, string
):
182 if self
.index
>= len(self
.string
):
185 char
= self
.string
[self
.index
]
188 c
= self
.string
[self
.index
+ 1]
190 raise error
, "bogus escape"
192 self
.index
= self
.index
+ len(char
)
194 def match(self
, char
, skip
=1):
195 if char
== self
.next
:
205 return self
.index
, self
.next
206 def seek(self
, index
):
207 self
.index
, self
.next
= index
210 return "a" <= char
<= "z" or "A" <= char
<= "Z" or char
== "_"
213 return "0" <= char
<= "9"
216 # check that group name is a valid string
217 if not isident(name
[0]):
220 if not isident(char
) and not isdigit(char
):
224 def _group(escape
, groups
):
225 # check if the escape string represents a valid group
227 gid
= atoi(escape
[1:])
228 if gid
and gid
< groups
:
232 return None # not a valid group
234 def _class_escape(source
, escape
):
235 # handle escape code inside character class
236 code
= ESCAPES
.get(escape
)
239 code
= CATEGORIES
.get(escape
)
243 if escape
[1:2] == "x":
244 # hexadecimal escape (exactly two digits)
245 while source
.next
in HEXDIGITS
and len(escape
) < 4:
246 escape
= escape
+ source
.get()
249 raise error
, "bogus escape: %s" % repr("\\" + escape
)
250 return LITERAL
, atoi(escape
, 16) & 0xff
251 elif str(escape
[1:2]) in OCTDIGITS
:
252 # octal escape (up to three digits)
253 while source
.next
in OCTDIGITS
and len(escape
) < 5:
254 escape
= escape
+ source
.get()
256 return LITERAL
, atoi(escape
, 8) & 0xff
258 return LITERAL
, ord(escape
[1])
261 raise error
, "bogus escape: %s" % repr(escape
)
263 def _escape(source
, escape
, state
):
264 # handle escape code in expression
265 code
= CATEGORIES
.get(escape
)
268 code
= ESCAPES
.get(escape
)
272 if escape
[1:2] == "x":
274 while source
.next
in HEXDIGITS
and len(escape
) < 4:
275 escape
= escape
+ source
.get()
278 return LITERAL
, atoi(escape
[2:], 16) & 0xff
279 elif escape
[1:2] == "0":
281 while source
.next
in OCTDIGITS
and len(escape
) < 4:
282 escape
= escape
+ source
.get()
283 return LITERAL
, atoi(escape
[1:], 8) & 0xff
284 elif escape
[1:2] in DIGITS
:
285 # octal escape *or* decimal group reference (sigh)
287 if source
.next
in DIGITS
:
288 escape
= escape
+ source
.get()
289 if (escape
[1] in OCTDIGITS
and escape
[2] in OCTDIGITS
and
290 source
.next
in OCTDIGITS
):
291 # got three octal digits; this is an octal escape
292 escape
= escape
+ source
.get()
293 return LITERAL
, atoi(escape
[1:], 8) & 0xff
294 # got at least one decimal digit; this is a group reference
295 group
= _group(escape
, state
.groups
)
297 if not state
.checkgroup(group
):
298 raise error
, "cannot refer to open group"
299 return GROUPREF
, group
302 return LITERAL
, ord(escape
[1])
305 raise error
, "bogus escape: %s" % repr(escape
)
307 def _parse_sub(source
, state
, nested
=1):
308 # parse an alternation: a|b|c
312 items
.append(_parse(source
, state
))
313 if source
.match("|"):
317 if not source
.next
or source
.match(")", 0):
320 raise error
, "pattern not properly closed"
325 subpattern
= SubPattern(state
)
327 # check if all items share a common prefix
335 elif item
[0] != prefix
:
338 # all subitems start with a common "prefix".
339 # move it out of the branch
342 subpattern
.append(prefix
)
343 continue # check next one
346 # check if the branch can be replaced by a character set
348 if len(item
) != 1 or item
[0][0] != LITERAL
:
351 # we can store this as a character set instead of a
352 # branch (the compiler may optimize this even more)
356 subpattern
.append((IN
, set))
359 subpattern
.append((BRANCH
, (None, items
)))
362 def _parse(source
, state
):
363 # parse a simple pattern
365 subpattern
= SubPattern(state
)
369 if source
.next
in ("|", ")"):
370 break # end of subpattern
373 break # end of pattern
375 if state
.flags
& SRE_FLAG_VERBOSE
:
376 # skip whitespace and comments
377 if this
in WHITESPACE
:
382 if this
in (None, "\n"):
386 if this
and this
[0] not in SPECIAL_CHARS
:
387 subpattern
.append((LITERAL
, ord(this
)))
392 ## if source.match(":"):
393 ## pass # handle character classes
394 if source
.match("^"):
395 set.append((NEGATE
, None))
396 # check remaining characters
400 if this
== "]" and set != start
:
402 elif this
and this
[0] == "\\":
403 code1
= _class_escape(source
, this
)
405 code1
= LITERAL
, ord(this
)
407 raise error
, "unexpected end of regular expression"
408 if source
.match("-"):
415 set.append((LITERAL
, ord("-")))
419 code2
= _class_escape(source
, this
)
421 code2
= LITERAL
, ord(this
)
422 if code1
[0] != LITERAL
or code2
[0] != LITERAL
:
423 raise error
, "bad character range"
427 raise error
, "bad character range"
428 set.append((RANGE
, (lo
, hi
)))
434 # XXX: <fl> should move set optimization to compiler!
435 if len(set)==1 and set[0][0] is LITERAL
:
436 subpattern
.append(set[0]) # optimization
437 elif len(set)==2 and set[0][0] is NEGATE
and set[1][0] is LITERAL
:
438 subpattern
.append((NOT_LITERAL
, set[1][1])) # optimization
440 # XXX: <fl> should add charmap optimization here
441 subpattern
.append((IN
, set))
443 elif this
and this
[0] in REPEAT_CHARS
:
444 # repeat previous item
448 min, max = 0, MAXREPEAT
451 min, max = 1, MAXREPEAT
454 min, max = 0, MAXREPEAT
456 while source
.next
in DIGITS
:
457 lo
= lo
+ source
.get()
458 if source
.match(","):
459 while source
.next
in DIGITS
:
460 hi
= hi
+ source
.get()
463 if not source
.match("}"):
464 subpattern
.append((LITERAL
, ord(this
)))
472 raise error
, "bad repeat interval"
474 raise error
, "not supported"
475 # figure out which item to repeat
477 item
= subpattern
[-1:]
480 if not item
or (len(item
) == 1 and item
[0][0] == AT
):
481 raise error
, "nothing to repeat"
482 if item
[0][0] in (MIN_REPEAT
, MAX_REPEAT
):
483 raise error
, "multiple repeat"
484 if source
.match("?"):
485 subpattern
[-1] = (MIN_REPEAT
, (min, max, item
))
487 subpattern
[-1] = (MAX_REPEAT
, (min, max, item
))
490 subpattern
.append((ANY
, None))
495 if source
.match("?"):
498 if source
.match("P"):
500 if source
.match("<"):
501 # named group: skip forward to end of name
506 raise error
, "unterminated name"
512 raise error
, "bad character in group name"
513 elif source
.match("="):
514 # named backreference
519 raise error
, "unterminated name"
524 raise error
, "bad character in group name"
525 gid
= state
.groupdict
.get(name
)
527 raise error
, "unknown group name"
528 subpattern
.append((GROUPREF
, gid
))
533 raise error
, "unexpected end of pattern"
534 raise error
, "unknown specifier: ?P%s" % char
535 elif source
.match(":"):
536 # non-capturing group
538 elif source
.match("#"):
541 if source
.next
is None or source
.next
== ")":
544 if not source
.match(")"):
545 raise error
, "unbalanced parenthesis"
547 elif source
.next
in ("=", "!", "<"):
548 # lookahead assertions
552 if source
.next
not in ("=", "!"):
553 raise error
, "syntax error"
554 dir = -1 # lookbehind
556 p
= _parse_sub(source
, state
)
557 if not source
.match(")"):
558 raise error
, "unbalanced parenthesis"
560 subpattern
.append((ASSERT
, (dir, p
)))
562 subpattern
.append((ASSERT_NOT
, (dir, p
)))
566 if not FLAGS
.has_key(source
.next
):
567 raise error
, "unexpected end of pattern"
568 while FLAGS
.has_key(source
.next
):
569 state
.flags
= state
.flags | FLAGS
[source
.get()]
571 # parse group contents
576 group
= state
.opengroup(name
)
577 p
= _parse_sub(source
, state
)
578 if not source
.match(")"):
579 raise error
, "unbalanced parenthesis"
580 if group
is not None:
581 state
.closegroup(group
)
582 subpattern
.append((SUBPATTERN
, (group
, p
)))
587 raise error
, "unexpected end of pattern"
590 raise error
, "unknown extension"
593 subpattern
.append((AT
, AT_BEGINNING
))
596 subpattern
.append((AT
, AT_END
))
598 elif this
and this
[0] == "\\":
599 code
= _escape(source
, this
, state
)
600 subpattern
.append(code
)
603 raise error
, "parser error"
607 def parse(str, flags
=0, pattern
=None):
608 # parse 're' pattern into list of (opcode, argument) tuples
610 source
= Tokenizer(str)
614 pattern
.flags
= flags
617 p
= _parse_sub(source
, pattern
, 0)
621 raise error
, "unbalanced parenthesis"
623 raise error
, "bogus characters at end of regular expression"
625 if flags
& SRE_FLAG_DEBUG
:
628 if not (flags
& SRE_FLAG_VERBOSE
) and p
.pattern
.flags
& SRE_FLAG_VERBOSE
:
629 # the VERBOSE flag was switched on inside the pattern. to be
630 # on the safe side, we'll parse the whole thing again...
631 return parse(str, p
.pattern
.flags
)
635 def parse_template(source
, pattern
):
636 # parse 're' replacement string into list of literals and
638 s
= Tokenizer(source
)
644 break # end of replacement string
645 if this
and this
[0] == "\\":
653 raise error
, "unterminated group name"
658 raise error
, "bad group name"
663 raise error
, "bad character in group name"
665 index
= pattern
.groupindex
[name
]
667 raise IndexError, "unknown group name"
669 elif len(this
) > 1 and this
[1] in DIGITS
:
672 group
= _group(this
, pattern
.groups
+1)
674 if (s
.next
not in DIGITS
or
675 not _group(this
+ s
.next
, pattern
.groups
+1)):
678 elif s
.next
in OCTDIGITS
:
679 this
= this
+ s
.get()
684 code
= LITERAL
, atoi(this
[-6:], 8) & 0xff
693 a((LITERAL
, ord(this
)))
696 def expand_template(template
, match
):
697 # XXX: <fl> this is sooooo slow. drop in the slicelist code instead
700 sep
= match
.string
[:0]
701 if type(sep
) is type(""):
705 for c
, s
in template
:
711 raise error
, "empty group"
713 return string
.join(p
, sep
)