2 # Secret Labs' Regular Expression Engine
4 # convert re-style regular expression to sre pattern
6 # Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
8 # Portions of this engine have been developed in cooperation with
9 # CNRI. Hewlett-Packard provided funding for 1.6 integration and
10 # other compatibility work.
17 from sre_constants
import *
19 # FIXME: should be 65535, but the arraymodule is still broken
22 # FIXME: might change in 2.0 final. but for now, this seems
23 # to be the best way to be compatible with 1.5.2
26 SPECIAL_CHARS
= ".\\[{()*+?^$|"
29 DIGITS
= tuple(string
.digits
)
31 OCTDIGITS
= tuple("01234567")
32 HEXDIGITS
= tuple("0123456789abcdefABCDEF")
34 WHITESPACE
= tuple(string
.whitespace
)
44 r
"\\": (LITERAL
, ord("\\"))
48 r
"\A": (AT
, AT_BEGINNING
), # start of string
49 r
"\b": (AT
, AT_BOUNDARY
),
50 r
"\B": (AT
, AT_NON_BOUNDARY
),
51 r
"\d": (IN
, [(CATEGORY
, CATEGORY_DIGIT
)]),
52 r
"\D": (IN
, [(CATEGORY
, CATEGORY_NOT_DIGIT
)]),
53 r
"\s": (IN
, [(CATEGORY
, CATEGORY_SPACE
)]),
54 r
"\S": (IN
, [(CATEGORY
, CATEGORY_NOT_SPACE
)]),
55 r
"\w": (IN
, [(CATEGORY
, CATEGORY_WORD
)]),
56 r
"\W": (IN
, [(CATEGORY
, CATEGORY_NOT_WORD
)]),
57 r
"\Z": (AT
, AT_END
), # end of string
62 "i": SRE_FLAG_IGNORECASE
,
64 "m": SRE_FLAG_MULTILINE
,
66 "x": SRE_FLAG_VERBOSE
,
68 "t": SRE_FLAG_TEMPLATE
,
69 "u": SRE_FLAG_UNICODE
,
77 def getgroup(self
, name
=None):
81 self
.groupdict
[name
] = gid
85 # a subpattern, in intermediate form
86 def __init__(self
, pattern
, data
=None):
87 self
.pattern
= pattern
93 return repr(self
.data
)
96 def __delitem__(self
, index
):
98 def __getitem__(self
, index
):
99 return self
.data
[index
]
100 def __setitem__(self
, index
, code
):
101 self
.data
[index
] = code
102 def __getslice__(self
, start
, stop
):
103 return SubPattern(self
.pattern
, self
.data
[start
:stop
])
104 def insert(self
, index
, code
):
105 self
.data
.insert(index
, code
)
106 def append(self
, code
):
107 self
.data
.append(code
)
109 # determine the width (min, max) for this subpattern
113 for op
, av
in self
.data
:
127 elif op
is SUBPATTERN
:
128 i
, j
= av
[1].getwidth()
131 elif op
in (MIN_REPEAT
, MAX_REPEAT
):
132 i
, j
= av
[2].getwidth()
133 lo
= lo
+ long(i
) * av
[0]
134 hi
= hi
+ long(j
) * av
[1]
135 elif op
in (ANY
, RANGE
, IN
, LITERAL
, NOT_LITERAL
, CATEGORY
):
140 self
.width
= int(min(lo
, sys
.maxint
)), int(min(hi
, sys
.maxint
))
144 def __init__(self
, string
):
147 self
.next
= self
.__next
()
149 if self
.index
>= len(self
.string
):
151 char
= self
.string
[self
.index
]
154 c
= self
.string
[self
.index
+ 1]
156 raise error
, "bogus escape"
158 self
.index
= self
.index
+ len(char
)
160 def match(self
, char
):
161 if char
== self
.next
:
162 self
.next
= self
.__next
()
165 def match_set(self
, set):
166 if self
.next
and self
.next
in set:
167 self
.next
= self
.__next
()
172 self
.next
= self
.__next
()
176 return "a" <= char
<= "z" or "A" <= char
<= "Z" or char
== "_"
179 return "0" <= char
<= "9"
182 # check that group name is a valid string
183 if not isident(name
[0]):
186 if not isident(char
) and not isdigit(char
):
190 def _group(escape
, groups
):
191 # check if the escape string represents a valid group
193 gid
= int(escape
[1:])
194 if gid
and gid
< groups
:
198 return None # not a valid group
200 def _class_escape(source
, escape
):
201 # handle escape code inside character class
202 code
= ESCAPES
.get(escape
)
205 code
= CATEGORIES
.get(escape
)
209 if escape
[1:2] == "x":
210 while source
.next
in HEXDIGITS
:
211 escape
= escape
+ source
.get()
213 return LITERAL
, int(escape
[-4:], 16) & CHARMASK
214 elif str(escape
[1:2]) in OCTDIGITS
:
215 while source
.next
in OCTDIGITS
:
216 escape
= escape
+ source
.get()
218 return LITERAL
, int(escape
[-6:], 8) & CHARMASK
220 return LITERAL
, ord(escape
[1])
223 raise error
, "bogus escape: %s" % repr(escape
)
225 def _escape(source
, escape
, state
):
226 # handle escape code in expression
227 code
= CATEGORIES
.get(escape
)
230 code
= ESCAPES
.get(escape
)
234 if escape
[1:2] == "x":
235 while source
.next
in HEXDIGITS
:
236 escape
= escape
+ source
.get()
238 return LITERAL
, int(escape
[-4:], 16) & CHARMASK
239 elif escape
[1:2] in DIGITS
:
241 group
= _group(escape
, state
.groups
)
243 if (not source
.next
or
244 not _group(escape
+ source
.next
, state
.groups
)):
246 escape
= escape
+ source
.get()
247 elif source
.next
in OCTDIGITS
:
248 escape
= escape
+ source
.get()
252 return LITERAL
, int(escape
[-6:], 8) & CHARMASK
254 return LITERAL
, ord(escape
[1])
257 raise error
, "bogus escape: %s" % repr(escape
)
259 def _branch(pattern
, items
):
260 # form a branch operator from a set of items
262 subpattern
= SubPattern(pattern
)
264 # check if all items share a common prefix
272 elif item
[0] != prefix
:
275 # all subitems start with a common "prefix".
276 # move it out of the branch
279 subpattern
.append(prefix
)
280 continue # check next one
283 # check if the branch can be replaced by a character set
285 if len(item
) != 1 or item
[0][0] != LITERAL
:
288 # we can store this as a character set instead of a
289 # branch (FIXME: use a range if possible)
293 subpattern
.append((IN
, set))
296 subpattern
.append((BRANCH
, (None, items
)))
299 def _parse(source
, state
):
301 # parse regular expression pattern into an operator list.
303 subpattern
= SubPattern(state
)
307 if source
.next
in ("|", ")"):
308 break # end of subpattern
311 break # end of pattern
313 if state
.flags
& SRE_FLAG_VERBOSE
:
314 # skip whitespace and comments
315 if this
in WHITESPACE
:
320 if this
in (None, "\n"):
324 if this
and this
[0] not in SPECIAL_CHARS
:
325 subpattern
.append((LITERAL
, ord(this
)))
330 ## if source.match(":"):
331 ## pass # handle character classes
332 if source
.match("^"):
333 set.append((NEGATE
, None))
334 # check remaining characters
338 if this
== "]" and set != start
:
340 elif this
and this
[0] == "\\":
341 code1
= _class_escape(source
, this
)
343 code1
= LITERAL
, ord(this
)
345 raise error
, "unexpected end of regular expression"
346 if source
.match("-"):
351 set.append((LITERAL
, ord("-")))
355 code2
= _class_escape(source
, this
)
357 code2
= LITERAL
, ord(this
)
358 if code1
[0] != LITERAL
or code2
[0] != LITERAL
:
359 raise error
, "illegal range"
360 set.append((RANGE
, (code1
[1], code2
[1])))
366 # FIXME: <fl> move set optimization to compiler!
367 if len(set)==1 and set[0][0] is LITERAL
:
368 subpattern
.append(set[0]) # optimization
369 elif len(set)==2 and set[0][0] is NEGATE
and set[1][0] is LITERAL
:
370 subpattern
.append((NOT_LITERAL
, set[1][1])) # optimization
372 # FIXME: <fl> add charmap optimization
373 subpattern
.append((IN
, set))
375 elif this
and this
[0] in REPEAT_CHARS
:
376 # repeat previous item
380 min, max = 0, MAXREPEAT
382 min, max = 1, MAXREPEAT
384 min, max = 0, MAXREPEAT
386 while source
.next
in DIGITS
:
387 lo
= lo
+ source
.get()
388 if source
.match(","):
389 while source
.next
in DIGITS
:
390 hi
= hi
+ source
.get()
393 if not source
.match("}"):
394 raise error
, "bogus range"
399 # FIXME: <fl> check that hi >= lo!
401 raise error
, "not supported"
402 # figure out which item to repeat
404 item
= subpattern
[-1:]
406 raise error
, "nothing to repeat"
407 if source
.match("?"):
408 subpattern
[-1] = (MIN_REPEAT
, (min, max, item
))
410 subpattern
[-1] = (MAX_REPEAT
, (min, max, item
))
413 subpattern
.append((ANY
, None))
418 if source
.match("?"):
421 if source
.match("P"):
423 if source
.match("<"):
424 # named group: skip forward to end of name
429 raise error
, "unterminated name"
435 raise error
, "illegal character in group name"
436 elif source
.match("="):
437 # named backreference
442 raise error
, "unterminated name"
447 raise error
, "illegal character in group name"
448 gid
= state
.groupdict
.get(name
)
450 raise error
, "unknown group name"
451 subpattern
.append((GROUP
, gid
))
455 raise error
, "unexpected end of pattern"
456 raise error
, "unknown specifier: ?P%s" % char
457 elif source
.match(":"):
458 # non-capturing group
460 elif source
.match("#"):
463 if source
.next
is None or source
.next
== ")":
466 elif source
.next
in ("=", "!"):
467 # lookahead assertions
471 p
= _parse(source
, state
)
472 if source
.next
== ")":
475 p
= _branch(state
, b
)
477 subpattern
.append((ASSERT
, p
))
479 subpattern
.append((ASSERT_NOT
, p
))
481 elif source
.match("|"):
484 raise error
, "pattern not properly closed"
487 while FLAGS
.has_key(source
.next
):
488 state
.flags
= state
.flags | FLAGS
[source
.get()]
490 # parse group contents
496 group
= state
.getgroup(name
)
498 p
= _parse(source
, state
)
499 if source
.match(")"):
502 p
= _branch(state
, b
)
503 subpattern
.append((SUBPATTERN
, (group
, p
)))
505 elif source
.match("|"):
508 raise error
, "group not properly closed"
512 if char
is None or char
== ")":
514 raise error
, "unknown extension"
517 subpattern
.append((AT
, AT_BEGINNING
))
520 subpattern
.append((AT
, AT_END
))
522 elif this
and this
[0] == "\\":
523 code
= _escape(source
, this
, state
)
524 subpattern
.append(code
)
527 raise error
, "parser error"
531 def parse(pattern
, flags
=0):
532 # parse 're' pattern into list of (opcode, argument) tuples
533 source
= Tokenizer(pattern
)
538 p
= _parse(source
, state
)
543 raise error
, "unbalanced parenthesis"
547 p
= _branch(state
, b
)
550 raise error
, "bogus characters at end of regular expression"
553 def parse_template(source
, pattern
):
554 # parse 're' replacement string into list of literals and
556 s
= Tokenizer(source
)
562 break # end of replacement string
563 if this
and this
[0] == "\\":
571 raise error
, "unterminated group name"
576 raise error
, "bad group name"
581 raise error
, "illegal character in group name"
583 index
= pattern
.groupindex
[name
]
585 raise IndexError, "unknown group name"
587 elif len(this
) > 1 and this
[1] in DIGITS
:
590 group
= _group(this
, pattern
.groups
+1)
593 not _group(this
+ s
.next
, pattern
.groups
+1)):
594 code
= MARK
, int(group
)
596 elif s
.next
in OCTDIGITS
:
597 this
= this
+ s
.get()
602 code
= LITERAL
, int(this
[-6:], 8) & CHARMASK
611 a((LITERAL
, ord(this
)))
614 def expand_template(template
, match
):
615 # FIXME: <fl> this is sooooo slow. drop in the slicelist
619 sep
= match
.string
[:0]
620 if type(sep
) is type(""):
624 for c
, s
in template
:
630 raise error
, "empty group"