Lib/dos-8x3/sre_pars.py

   1 #
   2 # Secret Labs' Regular Expression Engine
   3 #
   4 # convert re-style regular expression to sre pattern
   5 #
   6 # Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
   7 #
   8 # Portions of this engine have been developed in cooperation with
   9 # CNRI.  Hewlett-Packard provided funding for 1.6 integration and
  10 # other compatibility work.
  11 #
  12
  13 import string, sys
  14
  15 import _sre
  16
  17 from sre_constants import *
  18
# FIXME: should be 65535, but the arraymodule is still broken
# upper repeat count; _parse uses it as the maximum for "*", "+" and
# for "{m,}" ranges that give no explicit upper bound
MAXREPEAT = 32767

# FIXME: might change in 2.0 final.  but for now, this seems
# to be the best way to be compatible with 1.5.2
# mask applied to the value of every hexadecimal and octal escape
CHARMASK = 0xff

# a token whose first character is not in SPECIAL_CHARS is taken as a
# plain literal by _parse; REPEAT_CHARS are the characters that start
# a repeat operator
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS  = "*+?{"

# character groups, stored as tuples of one-character strings.  the
# parser tests tokens (including Tokenizer.next, which is None at the
# end of the input) for membership in these
DIGITS = tuple(string.digits)

OCTDIGITS = tuple("01234567")
HEXDIGITS = tuple("0123456789abcdefABCDEF")

WHITESPACE = tuple(string.whitespace)

# escapes that stand for a single character, as (LITERAL, code).
# note that r"\b" is listed both here (backspace) and in CATEGORIES
# (word boundary): _escape looks in CATEGORIES first, _class_escape
# looks in ESCAPES first
ESCAPES = {
    r"\a": (LITERAL, 7),
    r"\b": (LITERAL, 8),
    r"\f": (LITERAL, 12),
    r"\n": (LITERAL, 10),
    r"\r": (LITERAL, 13),
    r"\t": (LITERAL, 9),
    r"\v": (LITERAL, 11),
    r"\\": (LITERAL, ord("\\"))
}

# escapes that stand for a position (AT, ...) or for a character
# category, the latter wrapped in a one-item set (IN, [...])
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END), # end of string
}

# flag letters accepted by _parse after "(?", mapped to the flag bit
# that gets OR:ed into State.flags
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
  71
  72 class State:
  73     def __init__(self):
  74         self.flags = 0
  75         self.groups = 1
  76         self.groupdict = {}
  77     def getgroup(self, name=None):
  78         gid = self.groups
  79         self.groups = gid + 1
  80         if name:
  81             self.groupdict[name] = gid
  82         return gid
  83
  84 class SubPattern:
  85     # a subpattern, in intermediate form
  86     def __init__(self, pattern, data=None):
  87         self.pattern = pattern
  88         if not data:
  89             data = []
  90         self.data = data
  91         self.width = None
  92     def __repr__(self):
  93         return repr(self.data)
  94     def __len__(self):
  95         return len(self.data)
  96     def __delitem__(self, index):
  97         del self.data[index]
  98     def __getitem__(self, index):
  99         return self.data[index]
 100     def __setitem__(self, index, code):
 101         self.data[index] = code
 102     def __getslice__(self, start, stop):
 103         return SubPattern(self.pattern, self.data[start:stop])
 104     def insert(self, index, code):
 105         self.data.insert(index, code)
 106     def append(self, code):
 107         self.data.append(code)
 108     def getwidth(self):
 109         # determine the width (min, max) for this subpattern
 110         if self.width:
 111             return self.width
 112         lo = hi = 0L
 113         for op, av in self.data:
 114             if op is BRANCH:
 115                 l = sys.maxint
 116                 h = 0
 117                 for av in av[1]:
 118                     i, j = av.getwidth()
 119                     l = min(l, i)
 120                     h = min(h, j)
 121                 lo = lo + i
 122                 hi = hi + j
 123             elif op is CALL:
 124                 i, j = av.getwidth()
 125                 lo = lo + i
 126                 hi = hi + j
 127             elif op is SUBPATTERN:
 128                 i, j = av[1].getwidth()
 129                 lo = lo + i
 130                 hi = hi + j
 131             elif op in (MIN_REPEAT, MAX_REPEAT):
 132                 i, j = av[2].getwidth()
 133                 lo = lo + long(i) * av[0]
 134                 hi = hi + long(j) * av[1]
 135             elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
 136                 lo = lo + 1
 137                 hi = hi + 1
 138             elif op == SUCCESS:
 139                 break
 140         self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
 141         return self.width
 142
 143 class Tokenizer:
 144     def __init__(self, string):
 145         self.index = 0
 146         self.string = string
 147         self.next = self.__next()
 148     def __next(self):
 149         if self.index >= len(self.string):
 150             return None
 151         char = self.string[self.index]
 152         if char[0] == "\\":
 153             try:
 154                 c = self.string[self.index + 1]
 155             except IndexError:
 156                 raise error, "bogus escape"
 157             char = char + c
 158         self.index = self.index + len(char)
 159         return char
 160     def match(self, char):
 161         if char == self.next:
 162             self.next = self.__next()
 163             return 1
 164         return 0
 165     def match_set(self, set):
 166         if self.next and self.next in set:
 167             self.next = self.__next()
 168             return 1
 169         return 0
 170     def get(self):
 171         this = self.next
 172         self.next = self.__next()
 173         return this
 174
 175 def isident(char):
 176     return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
 177
 178 def isdigit(char):
 179     return "0" <= char <= "9"
 180
 181 def isname(name):
 182     # check that group name is a valid string
 183     if not isident(name[0]):
 184         return 0
 185     for char in name:
 186         if not isident(char) and not isdigit(char):
 187             return 0
 188     return 1
 189
 190 def _group(escape, groups):
 191     # check if the escape string represents a valid group
 192     try:
 193         gid = int(escape[1:])
 194         if gid and gid < groups:
 195             return gid
 196     except ValueError:
 197         pass
 198     return None # not a valid group
 199
 200 def _class_escape(source, escape):
 201     # handle escape code inside character class
 202     code = ESCAPES.get(escape)
 203     if code:
 204         return code
 205     code = CATEGORIES.get(escape)
 206     if code:
 207         return code
 208     try:
 209         if escape[1:2] == "x":
 210             while source.next in HEXDIGITS:
 211                 escape = escape + source.get()
 212             escape = escape[2:]
 213             return LITERAL, int(escape[-4:], 16) & CHARMASK
 214         elif str(escape[1:2]) in OCTDIGITS:
 215             while source.next in OCTDIGITS:
 216                 escape = escape + source.get()
 217             escape = escape[1:]
 218             return LITERAL, int(escape[-6:], 8) & CHARMASK
 219         if len(escape) == 2:
 220             return LITERAL, ord(escape[1])
 221     except ValueError:
 222         pass
 223     raise error, "bogus escape: %s" % repr(escape)
 224
 225 def _escape(source, escape, state):
 226     # handle escape code in expression
 227     code = CATEGORIES.get(escape)
 228     if code:
 229         return code
 230     code = ESCAPES.get(escape)
 231     if code:
 232         return code
 233     try:
 234         if escape[1:2] == "x":
 235             while source.next in HEXDIGITS:
 236                 escape = escape + source.get()
 237             escape = escape[2:]
 238             return LITERAL, int(escape[-4:], 16) & CHARMASK
 239         elif escape[1:2] in DIGITS:
 240             while 1:
 241                 group = _group(escape, state.groups)
 242                 if group:
 243                     if (not source.next or
 244                         not _group(escape + source.next, state.groups)):
 245                         return GROUP, group
 246                     escape = escape + source.get()
 247                 elif source.next in OCTDIGITS:
 248                     escape = escape + source.get()
 249                 else:
 250                     break
 251             escape = escape[1:]
 252             return LITERAL, int(escape[-6:], 8) & CHARMASK
 253         if len(escape) == 2:
 254             return LITERAL, ord(escape[1])
 255     except ValueError:
 256         pass
 257     raise error, "bogus escape: %s" % repr(escape)
 258
 259 def _branch(pattern, items):
 260     # form a branch operator from a set of items
 261
 262     subpattern = SubPattern(pattern)
 263
 264     # check if all items share a common prefix
 265     while 1:
 266         prefix = None
 267         for item in items:
 268             if not item:
 269                 break
 270             if prefix is None:
 271                 prefix = item[0]
 272             elif item[0] != prefix:
 273                 break
 274         else:
 275             # all subitems start with a common "prefix".
 276             # move it out of the branch
 277             for item in items:
 278                 del item[0]
 279             subpattern.append(prefix)
 280             continue # check next one
 281         break
 282
 283     # check if the branch can be replaced by a character set
 284     for item in items:
 285         if len(item) != 1 or item[0][0] != LITERAL:
 286             break
 287     else:
 288         # we can store this as a character set instead of a
 289         # branch (FIXME: use a range if possible)
 290         set = []
 291         for item in items:
 292             set.append(item[0])
 293         subpattern.append((IN, set))
 294         return subpattern
 295
 296     subpattern.append((BRANCH, (None, items)))
 297     return subpattern
 298
 299 def _parse(source, state):
 300
 301     # parse regular expression pattern into an operator list.
 302
 303     subpattern = SubPattern(state)
 304
 305     while 1:
 306
 307         if source.next in ("|", ")"):
 308             break # end of subpattern
 309         this = source.get()
 310         if this is None:
 311             break # end of pattern
 312
 313         if state.flags & SRE_FLAG_VERBOSE:
 314             # skip whitespace and comments
 315             if this in WHITESPACE:
 316                 continue
 317             if this == "#":
 318                 while 1:
 319                     this = source.get()
 320                     if this in (None, "\n"):
 321                         break
 322                 continue
 323
 324         if this and this[0] not in SPECIAL_CHARS:
 325             subpattern.append((LITERAL, ord(this)))
 326
 327         elif this == "[":
 328             # character set
 329             set = []
 330 ##          if source.match(":"):
 331 ##              pass # handle character classes
 332             if source.match("^"):
 333                 set.append((NEGATE, None))
 334             # check remaining characters
 335             start = set[:]
 336             while 1:
 337                 this = source.get()
 338                 if this == "]" and set != start:
 339                     break
 340                 elif this and this[0] == "\\":
 341                     code1 = _class_escape(source, this)
 342                 elif this:
 343                     code1 = LITERAL, ord(this)
 344                 else:
 345                     raise error, "unexpected end of regular expression"
 346                 if source.match("-"):
 347                     # potential range
 348                     this = source.get()
 349                     if this == "]":
 350                         set.append(code1)
 351                         set.append((LITERAL, ord("-")))
 352                         break
 353                     else:
 354                         if this[0] == "\\":
 355                             code2 = _class_escape(source, this)
 356                         else:
 357                             code2 = LITERAL, ord(this)
 358                         if code1[0] != LITERAL or code2[0] != LITERAL:
 359                             raise error, "illegal range"
 360                         set.append((RANGE, (code1[1], code2[1])))
 361                 else:
 362                     if code1[0] is IN:
 363                         code1 = code1[1][0]
 364                     set.append(code1)
 365
 366             # FIXME: <fl> move set optimization to compiler!
 367             if len(set)==1 and set[0][0] is LITERAL:
 368                 subpattern.append(set[0]) # optimization
 369             elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
 370                 subpattern.append((NOT_LITERAL, set[1][1])) # optimization
 371             else:
 372                 # FIXME: <fl> add charmap optimization
 373                 subpattern.append((IN, set))
 374
 375         elif this and this[0] in REPEAT_CHARS:
 376             # repeat previous item
 377             if this == "?":
 378                 min, max = 0, 1
 379             elif this == "*":
 380                 min, max = 0, MAXREPEAT
 381             elif this == "+":
 382                 min, max = 1, MAXREPEAT
 383             elif this == "{":
 384                 min, max = 0, MAXREPEAT
 385                 lo = hi = ""
 386                 while source.next in DIGITS:
 387                     lo = lo + source.get()
 388                 if source.match(","):
 389                     while source.next in DIGITS:
 390                         hi = hi + source.get()
 391                 else:
 392                     hi = lo
 393                 if not source.match("}"):
 394                     raise error, "bogus range"
 395                 if lo:
 396                     min = int(lo)
 397                 if hi:
 398                     max = int(hi)
 399                 # FIXME: <fl> check that hi >= lo!
 400             else:
 401                 raise error, "not supported"
 402             # figure out which item to repeat
 403             if subpattern:
 404                 item = subpattern[-1:]
 405             else:
 406                 raise error, "nothing to repeat"
 407             if source.match("?"):
 408                 subpattern[-1] = (MIN_REPEAT, (min, max, item))
 409             else:
 410                 subpattern[-1] = (MAX_REPEAT, (min, max, item))
 411
 412         elif this == ".":
 413             subpattern.append((ANY, None))
 414
 415         elif this == "(":
 416             group = 1
 417             name = None
 418             if source.match("?"):
 419                 group = 0
 420                 # options
 421                 if source.match("P"):
 422                     # python extensions
 423                     if source.match("<"):
 424                         # named group: skip forward to end of name
 425                         name = ""
 426                         while 1:
 427                             char = source.get()
 428                             if char is None:
 429                                 raise error, "unterminated name"
 430                             if char == ">":
 431                                 break
 432                             name = name + char
 433                         group = 1
 434                         if not isname(name):
 435                             raise error, "illegal character in group name"
 436                     elif source.match("="):
 437                         # named backreference
 438                         name = ""
 439                         while 1:
 440                             char = source.get()
 441                             if char is None:
 442                                 raise error, "unterminated name"
 443                             if char == ")":
 444                                 break
 445                             name = name + char
 446                         if not isname(name):
 447                             raise error, "illegal character in group name"
 448                         gid = state.groupdict.get(name)
 449                         if gid is None:
 450                             raise error, "unknown group name"
 451                         subpattern.append((GROUP, gid))
 452                     else:
 453                         char = source.get()
 454                         if char is None:
 455                             raise error, "unexpected end of pattern"
 456                         raise error, "unknown specifier: ?P%s" % char
 457                 elif source.match(":"):
 458                     # non-capturing group
 459                     group = 2
 460                 elif source.match("#"):
 461                     # comment
 462                     while 1:
 463                         if source.next is None or source.next == ")":
 464                             break
 465                         source.get()
 466                 elif source.next in ("=", "!"):
 467                     # lookahead assertions
 468                     char = source.get()
 469                     b = []
 470                     while 1:
 471                         p = _parse(source, state)
 472                         if source.next == ")":
 473                             if b:
 474                                 b.append(p)
 475                                 p = _branch(state, b)
 476                             if char == "=":
 477                                 subpattern.append((ASSERT, p))
 478                             else:
 479                                 subpattern.append((ASSERT_NOT, p))
 480                             break
 481                         elif source.match("|"):
 482                             b.append(p)
 483                         else:
 484                             raise error, "pattern not properly closed"
 485                 else:
 486                     # flags
 487                     while FLAGS.has_key(source.next):
 488                         state.flags = state.flags | FLAGS[source.get()]
 489             if group:
 490                 # parse group contents
 491                 b = []
 492                 if group == 2:
 493                     # anonymous group
 494                     group = None
 495                 else:
 496                     group = state.getgroup(name)
 497                 while 1:
 498                     p = _parse(source, state)
 499                     if source.match(")"):
 500                         if b:
 501                             b.append(p)
 502                             p = _branch(state, b)
 503                         subpattern.append((SUBPATTERN, (group, p)))
 504                         break
 505                     elif source.match("|"):
 506                         b.append(p)
 507                     else:
 508                         raise error, "group not properly closed"
 509             else:
 510                 while 1:
 511                     char = source.get()
 512                     if char is None or char == ")":
 513                         break
 514                     raise error, "unknown extension"
 515
 516         elif this == "^":
 517             subpattern.append((AT, AT_BEGINNING))
 518
 519         elif this == "$":
 520             subpattern.append((AT, AT_END))
 521
 522         elif this and this[0] == "\\":
 523             code = _escape(source, this, state)
 524             subpattern.append(code)
 525
 526         else:
 527             raise error, "parser error"
 528
 529     return subpattern
 530
 531 def parse(pattern, flags=0):
 532     # parse 're' pattern into list of (opcode, argument) tuples
 533     source = Tokenizer(pattern)
 534     state = State()
 535     state.flags = flags
 536     b = []
 537     while 1:
 538         p = _parse(source, state)
 539         tail = source.get()
 540         if tail == "|":
 541             b.append(p)
 542         elif tail == ")":
 543             raise error, "unbalanced parenthesis"
 544         elif tail is None:
 545             if b:
 546                 b.append(p)
 547                 p = _branch(state, b)
 548             break
 549         else:
 550             raise error, "bogus characters at end of regular expression"
 551     return p
 552
 553 def parse_template(source, pattern):
 554     # parse 're' replacement string into list of literals and
 555     # group references
 556     s = Tokenizer(source)
 557     p = []
 558     a = p.append
 559     while 1:
 560         this = s.get()
 561         if this is None:
 562             break # end of replacement string
 563         if this and this[0] == "\\":
 564             # group
 565             if this == "\\g":
 566                 name = ""
 567                 if s.match("<"):
 568                     while 1:
 569                         char = s.get()
 570                         if char is None:
 571                             raise error, "unterminated group name"
 572                         if char == ">":
 573                             break
 574                         name = name + char
 575                 if not name:
 576                     raise error, "bad group name"
 577                 try:
 578                     index = int(name)
 579                 except ValueError:
 580                     if not isname(name):
 581                         raise error, "illegal character in group name"
 582                     try:
 583                         index = pattern.groupindex[name]
 584                     except KeyError:
 585                         raise IndexError, "unknown group name"
 586                 a((MARK, index))
 587             elif len(this) > 1 and this[1] in DIGITS:
 588                 code = None
 589                 while 1:
 590                     group = _group(this, pattern.groups+1)
 591                     if group:
 592                         if (not s.next or
 593                             not _group(this + s.next, pattern.groups+1)):
 594                             code = MARK, int(group)
 595                             break
 596                     elif s.next in OCTDIGITS:
 597                         this = this + s.get()
 598                     else:
 599                         break
 600                 if not code:
 601                     this = this[1:]
 602                     code = LITERAL, int(this[-6:], 8) & CHARMASK
 603                 a(code)
 604             else:
 605                 try:
 606                     a(ESCAPES[this])
 607                 except KeyError:
 608                     for c in this:
 609                         a((LITERAL, ord(c)))
 610         else:
 611             a((LITERAL, ord(this)))
 612     return p
 613
 614 def expand_template(template, match):
 615     # FIXME: <fl> this is sooooo slow.  drop in the slicelist
 616     # code instead
 617     p = []
 618     a = p.append
 619     sep = match.string[:0]
 620     if type(sep) is type(""):
 621         char = chr
 622     else:
 623         char = unichr
 624     for c, s in template:
 625         if c is LITERAL:
 626             a(char(s))
 627         elif c is MARK:
 628             s = match.group(s)
 629             if s is None:
 630                 raise error, "empty group"
 631             a(s)
 632     return sep.join(p)