2 # Secret Labs' Regular Expression Engine
4 # convert re-style regular expression to sre pattern
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
8 # See the sre.py file for information on usage and redistribution.
11 """Internal support module for sre"""
13 # XXX: show string offset and offending character for all errors
15 # this module works under 1.5.2 and later. don't use string methods
18 from sre_constants
import *
20 SPECIAL_CHARS
= ".\\[{()*+?^$|"
23 DIGITS
= tuple("0123456789")
25 OCTDIGITS
= tuple("01234567")
26 HEXDIGITS
= tuple("0123456789abcdefABCDEF")
28 WHITESPACE
= tuple(" \t\n\r\v\f")
31 r
"\a": (LITERAL
, ord("\a")),
32 r
"\b": (LITERAL
, ord("\b")),
33 r
"\f": (LITERAL
, ord("\f")),
34 r
"\n": (LITERAL
, ord("\n")),
35 r
"\r": (LITERAL
, ord("\r")),
36 r
"\t": (LITERAL
, ord("\t")),
37 r
"\v": (LITERAL
, ord("\v")),
38 r
"\\": (LITERAL
, ord("\\"))
42 r
"\A": (AT
, AT_BEGINNING_STRING
), # start of string
43 r
"\b": (AT
, AT_BOUNDARY
),
44 r
"\B": (AT
, AT_NON_BOUNDARY
),
45 r
"\d": (IN
, [(CATEGORY
, CATEGORY_DIGIT
)]),
46 r
"\D": (IN
, [(CATEGORY
, CATEGORY_NOT_DIGIT
)]),
47 r
"\s": (IN
, [(CATEGORY
, CATEGORY_SPACE
)]),
48 r
"\S": (IN
, [(CATEGORY
, CATEGORY_NOT_SPACE
)]),
49 r
"\w": (IN
, [(CATEGORY
, CATEGORY_WORD
)]),
50 r
"\W": (IN
, [(CATEGORY
, CATEGORY_NOT_WORD
)]),
51 r
"\Z": (AT
, AT_END_STRING
), # end of string
56 "i": SRE_FLAG_IGNORECASE
,
58 "m": SRE_FLAG_MULTILINE
,
60 "x": SRE_FLAG_VERBOSE
,
62 "t": SRE_FLAG_TEMPLATE
,
63 "u": SRE_FLAG_UNICODE
,
66 # figure out best way to convert hex/octal numbers to integers
69 atoi
= int # 2.0 and later
71 atoi
= string
.atoi
# 1.5.2
74 # master pattern object. keeps track of global attributes
80 def opengroup(self
, name
=None):
84 ogid
= self
.groupdict
.get(name
, None)
86 raise error
, ("redefinition of group name %s as group %d; "
87 "was group %d" % (repr(name
), gid
, ogid
))
88 self
.groupdict
[name
] = gid
91 def closegroup(self
, gid
):
93 def checkgroup(self
, gid
):
94 return gid
< self
.groups
and gid
not in self
.open
97 # a subpattern, in intermediate form
98 def __init__(self
, pattern
, data
=None):
99 self
.pattern
= pattern
104 def dump(self
, level
=0):
106 for op
, av
in self
.data
:
107 print level
*" " + op
,; nl
= 0
112 print (level
+1)*" " + op
, a
118 print level
*" " + "or"
119 a
.dump(level
+1); nl
= 1
121 elif type(av
) in (type(()), type([])):
123 if isinstance(a
, SubPattern
):
125 a
.dump(level
+1); nl
= 1
132 return repr(self
.data
)
134 return len(self
.data
)
135 def __delitem__(self
, index
):
137 def __getitem__(self
, index
):
138 return self
.data
[index
]
139 def __setitem__(self
, index
, code
):
140 self
.data
[index
] = code
141 def __getslice__(self
, start
, stop
):
142 return SubPattern(self
.pattern
, self
.data
[start
:stop
])
143 def insert(self
, index
, code
):
144 self
.data
.insert(index
, code
)
145 def append(self
, code
):
146 self
.data
.append(code
)
148 # determine the width (min, max) for this subpattern
152 for op
, av
in self
.data
:
166 elif op
is SUBPATTERN
:
167 i
, j
= av
[1].getwidth()
170 elif op
in (MIN_REPEAT
, MAX_REPEAT
):
171 i
, j
= av
[2].getwidth()
172 lo
= lo
+ long(i
) * av
[0]
173 hi
= hi
+ long(j
) * av
[1]
174 elif op
in (ANY
, RANGE
, IN
, LITERAL
, NOT_LITERAL
, CATEGORY
):
179 self
.width
= int(min(lo
, sys
.maxint
)), int(min(hi
, sys
.maxint
))
183 def __init__(self
, string
):
188 if self
.index
>= len(self
.string
):
191 char
= self
.string
[self
.index
]
194 c
= self
.string
[self
.index
+ 1]
196 raise error
, "bogus escape (end of line)"
198 self
.index
= self
.index
+ len(char
)
200 def match(self
, char
, skip
=1):
201 if char
== self
.next
:
211 return self
.index
, self
.next
212 def seek(self
, index
):
213 self
.index
, self
.next
= index
216 return "a" <= char
<= "z" or "A" <= char
<= "Z" or char
== "_"
219 return "0" <= char
<= "9"
222 # check that group name is a valid string
223 if not isident(name
[0]):
226 if not isident(char
) and not isdigit(char
):
230 def _group(escape
, groups
):
231 # check if the escape string represents a valid group
233 gid
= atoi(escape
[1:])
234 if gid
and gid
< groups
:
238 return None # not a valid group
240 def _class_escape(source
, escape
):
241 # handle escape code inside character class
242 code
= ESCAPES
.get(escape
)
245 code
= CATEGORIES
.get(escape
)
249 if escape
[1:2] == "x":
250 # hexadecimal escape (exactly two digits)
251 while source
.next
in HEXDIGITS
and len(escape
) < 4:
252 escape
= escape
+ source
.get()
255 raise error
, "bogus escape: %s" % repr("\\" + escape
)
256 return LITERAL
, atoi(escape
, 16) & 0xff
257 elif str(escape
[1:2]) in OCTDIGITS
:
258 # octal escape (up to three digits)
259 while source
.next
in OCTDIGITS
and len(escape
) < 5:
260 escape
= escape
+ source
.get()
262 return LITERAL
, atoi(escape
, 8) & 0xff
264 return LITERAL
, ord(escape
[1])
267 raise error
, "bogus escape: %s" % repr(escape
)
269 def _escape(source
, escape
, state
):
270 # handle escape code in expression
271 code
= CATEGORIES
.get(escape
)
274 code
= ESCAPES
.get(escape
)
278 if escape
[1:2] == "x":
280 while source
.next
in HEXDIGITS
and len(escape
) < 4:
281 escape
= escape
+ source
.get()
284 return LITERAL
, atoi(escape
[2:], 16) & 0xff
285 elif escape
[1:2] == "0":
287 while source
.next
in OCTDIGITS
and len(escape
) < 4:
288 escape
= escape
+ source
.get()
289 return LITERAL
, atoi(escape
[1:], 8) & 0xff
290 elif escape
[1:2] in DIGITS
:
291 # octal escape *or* decimal group reference (sigh)
292 if source
.next
in DIGITS
:
293 escape
= escape
+ source
.get()
294 if (escape
[1] in OCTDIGITS
and escape
[2] in OCTDIGITS
and
295 source
.next
in OCTDIGITS
):
296 # got three octal digits; this is an octal escape
297 escape
= escape
+ source
.get()
298 return LITERAL
, atoi(escape
[1:], 8) & 0xff
299 # got at least one decimal digit; this is a group reference
300 group
= _group(escape
, state
.groups
)
302 if not state
.checkgroup(group
):
303 raise error
, "cannot refer to open group"
304 return GROUPREF
, group
307 return LITERAL
, ord(escape
[1])
310 raise error
, "bogus escape: %s" % repr(escape
)
312 def _parse_sub(source
, state
, nested
=1):
313 # parse an alternation: a|b|c
317 items
.append(_parse(source
, state
))
318 if source
.match("|"):
322 if not source
.next
or source
.match(")", 0):
325 raise error
, "pattern not properly closed"
330 subpattern
= SubPattern(state
)
332 # check if all items share a common prefix
340 elif item
[0] != prefix
:
343 # all subitems start with a common "prefix".
344 # move it out of the branch
347 subpattern
.append(prefix
)
348 continue # check next one
351 # check if the branch can be replaced by a character set
353 if len(item
) != 1 or item
[0][0] != LITERAL
:
356 # we can store this as a character set instead of a
357 # branch (the compiler may optimize this even more)
361 subpattern
.append((IN
, set))
364 subpattern
.append((BRANCH
, (None, items
)))
367 def _parse(source
, state
):
368 # parse a simple pattern
370 subpattern
= SubPattern(state
)
374 if source
.next
in ("|", ")"):
375 break # end of subpattern
378 break # end of pattern
380 if state
.flags
& SRE_FLAG_VERBOSE
:
381 # skip whitespace and comments
382 if this
in WHITESPACE
:
387 if this
in (None, "\n"):
391 if this
and this
[0] not in SPECIAL_CHARS
:
392 subpattern
.append((LITERAL
, ord(this
)))
397 ## if source.match(":"):
398 ## pass # handle character classes
399 if source
.match("^"):
400 set.append((NEGATE
, None))
401 # check remaining characters
405 if this
== "]" and set != start
:
407 elif this
and this
[0] == "\\":
408 code1
= _class_escape(source
, this
)
410 code1
= LITERAL
, ord(this
)
412 raise error
, "unexpected end of regular expression"
413 if source
.match("-"):
420 set.append((LITERAL
, ord("-")))
424 code2
= _class_escape(source
, this
)
426 code2
= LITERAL
, ord(this
)
427 if code1
[0] != LITERAL
or code2
[0] != LITERAL
:
428 raise error
, "bad character range"
432 raise error
, "bad character range"
433 set.append((RANGE
, (lo
, hi
)))
439 # XXX: <fl> should move set optimization to compiler!
440 if len(set)==1 and set[0][0] is LITERAL
:
441 subpattern
.append(set[0]) # optimization
442 elif len(set)==2 and set[0][0] is NEGATE
and set[1][0] is LITERAL
:
443 subpattern
.append((NOT_LITERAL
, set[1][1])) # optimization
445 # XXX: <fl> should add charmap optimization here
446 subpattern
.append((IN
, set))
448 elif this
and this
[0] in REPEAT_CHARS
:
449 # repeat previous item
453 min, max = 0, MAXREPEAT
456 min, max = 1, MAXREPEAT
459 min, max = 0, MAXREPEAT
461 while source
.next
in DIGITS
:
462 lo
= lo
+ source
.get()
463 if source
.match(","):
464 while source
.next
in DIGITS
:
465 hi
= hi
+ source
.get()
468 if not source
.match("}"):
469 subpattern
.append((LITERAL
, ord(this
)))
477 raise error
, "bad repeat interval"
479 raise error
, "not supported"
480 # figure out which item to repeat
482 item
= subpattern
[-1:]
485 if not item
or (len(item
) == 1 and item
[0][0] == AT
):
486 raise error
, "nothing to repeat"
487 if item
[0][0] in (MIN_REPEAT
, MAX_REPEAT
):
488 raise error
, "multiple repeat"
489 if source
.match("?"):
490 subpattern
[-1] = (MIN_REPEAT
, (min, max, item
))
492 subpattern
[-1] = (MAX_REPEAT
, (min, max, item
))
495 subpattern
.append((ANY
, None))
500 if source
.match("?"):
503 if source
.match("P"):
505 if source
.match("<"):
506 # named group: skip forward to end of name
511 raise error
, "unterminated name"
517 raise error
, "bad character in group name"
518 elif source
.match("="):
519 # named backreference
524 raise error
, "unterminated name"
529 raise error
, "bad character in group name"
530 gid
= state
.groupdict
.get(name
)
532 raise error
, "unknown group name"
533 subpattern
.append((GROUPREF
, gid
))
538 raise error
, "unexpected end of pattern"
539 raise error
, "unknown specifier: ?P%s" % char
540 elif source
.match(":"):
541 # non-capturing group
543 elif source
.match("#"):
546 if source
.next
is None or source
.next
== ")":
549 if not source
.match(")"):
550 raise error
, "unbalanced parenthesis"
552 elif source
.next
in ("=", "!", "<"):
553 # lookahead assertions
557 if source
.next
not in ("=", "!"):
558 raise error
, "syntax error"
559 dir = -1 # lookbehind
561 p
= _parse_sub(source
, state
)
562 if not source
.match(")"):
563 raise error
, "unbalanced parenthesis"
565 subpattern
.append((ASSERT
, (dir, p
)))
567 subpattern
.append((ASSERT_NOT
, (dir, p
)))
571 if not source
.next
in FLAGS
:
572 raise error
, "unexpected end of pattern"
573 while source
.next
in FLAGS
:
574 state
.flags
= state
.flags | FLAGS
[source
.get()]
576 # parse group contents
581 group
= state
.opengroup(name
)
582 p
= _parse_sub(source
, state
)
583 if not source
.match(")"):
584 raise error
, "unbalanced parenthesis"
585 if group
is not None:
586 state
.closegroup(group
)
587 subpattern
.append((SUBPATTERN
, (group
, p
)))
592 raise error
, "unexpected end of pattern"
595 raise error
, "unknown extension"
598 subpattern
.append((AT
, AT_BEGINNING
))
601 subpattern
.append((AT
, AT_END
))
603 elif this
and this
[0] == "\\":
604 code
= _escape(source
, this
, state
)
605 subpattern
.append(code
)
608 raise error
, "parser error"
612 def parse(str, flags
=0, pattern
=None):
613 # parse 're' pattern into list of (opcode, argument) tuples
615 source
= Tokenizer(str)
619 pattern
.flags
= flags
622 p
= _parse_sub(source
, pattern
, 0)
626 raise error
, "unbalanced parenthesis"
628 raise error
, "bogus characters at end of regular expression"
630 if flags
& SRE_FLAG_DEBUG
:
633 if not (flags
& SRE_FLAG_VERBOSE
) and p
.pattern
.flags
& SRE_FLAG_VERBOSE
:
634 # the VERBOSE flag was switched on inside the pattern. to be
635 # on the safe side, we'll parse the whole thing again...
636 return parse(str, p
.pattern
.flags
)
640 def parse_template(source
, pattern
):
641 # parse 're' replacement string into list of literals and
643 s
= Tokenizer(source
)
646 def literal(literal
, p
=p
):
647 if p
and p
[-1][0] is LITERAL
:
648 p
[-1] = LITERAL
, p
[-1][1] + literal
650 p
.append((LITERAL
, literal
))
652 if type(sep
) is type(""):
659 break # end of replacement string
660 if this
and this
[0] == "\\":
668 raise error
, "unterminated group name"
673 raise error
, "bad group name"
678 raise error
, "bad character in group name"
680 index
= pattern
.groupindex
[name
]
682 raise IndexError, "unknown group name"
684 elif len(this
) > 1 and this
[1] in DIGITS
:
687 group
= _group(this
, pattern
.groups
+1)
689 if (s
.next
not in DIGITS
or
690 not _group(this
+ s
.next
, pattern
.groups
+1)):
693 elif s
.next
in OCTDIGITS
:
694 this
= this
+ s
.get()
699 code
= LITERAL
, makechar(atoi(this
[-6:], 8) & 0xff)
700 if code
[0] is LITERAL
:
706 this
= makechar(ESCAPES
[this
][1])
712 # convert template to groups and literals lists
718 groups
.append((i
, s
))
719 literals
.append(None)
723 return groups
, literals
725 def expand_template(template
, match
):
727 sep
= match
.string
[:0]
728 groups
, literals
= template
729 literals
= literals
[:]
731 for index
, group
in groups
:
732 literals
[index
] = s
= g(group
)
736 raise error
, "empty group"
737 return string
.join(literals
, sep
)