Lib/pre.py

   1 # module 're' -- A collection of regular expression operations
   2
   3 r"""Support for regular expressions (RE).
   4
   5 This module provides regular expression matching operations similar to
   6 those found in Perl. It's 8-bit clean: the strings being processed may
   7 contain both null bytes and characters whose high bit is set. Regular
   8 expression pattern strings may not contain null bytes, but can specify
   9 the null byte using the \number notation. Characters with the high
  10 bit set may be included.
  11
  12 Regular expressions can contain both special and ordinary
  13 characters. Most ordinary characters, like "A", "a", or "0", are the
  14 simplest regular expressions; they simply match themselves. You can
  15 concatenate ordinary characters, so last matches the string 'last'.
  16
  17 The special characters are:
  18     "."      Matches any character except a newline.
  19     "^"      Matches the start of the string.
  20     "$"      Matches the end of the string.
  21     "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
  22              Greedy means that it will match as many repetitions as possible.
  23     "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
  24     "?"      Matches 0 or 1 (greedy) of the preceding RE.
  25     *?,+?,?? Non-greedy versions of the previous three special characters.
  26     {m,n}    Matches from m to n repetitions of the preceding RE.
  27     {m,n}?   Non-greedy version of the above.
  28     "\"      Either escapes special characters or signals a special sequence.
  29     []       Indicates a set of characters.
  30              A "^" as the first character indicates a complementing set.
  31     "|"      A|B, creates an RE that will match either A or B.
  32     (...)    Matches the RE inside the parentheses.
  33              The contents can be retrieved or matched later in the string.
  34     (?iLmsx) Set the I, L, M, S, or X flag for the RE.
  35     (?:...)  Non-grouping version of regular parentheses.
  36     (?P<name>...) The substring matched by the group is accessible by name.
  37     (?P=name)     Matches the text matched earlier by the group named name.
  38     (?#...)  A comment; ignored.
  39     (?=...)  Matches if ... matches next, but doesn't consume the string.
  40     (?!...)  Matches if ... doesn't match next.
  41
  42 The special sequences consist of "\" and a character from the list
  43 below. If the ordinary character is not on the list, then the
  44 resulting RE will match the second character.
  45     \number  Matches the contents of the group of the same number.
  46     \A       Matches only at the start of the string.
  47     \Z       Matches only at the end of the string.
  48     \b       Matches the empty string, but only at the start or end of a word.
  49     \B       Matches the empty string, but not at the start or end of a word.
  50     \d       Matches any decimal digit; equivalent to the set [0-9].
  51     \D       Matches any non-digit character; equivalent to the set [^0-9].
  52     \s       Matches any whitespace character; equivalent to [ \t\n\r\f\v].
  53     \S       Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
  54     \w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
  55              With LOCALE, it will match the set [0-9_] plus characters defined
  56              as letters for the current locale.
  57     \W       Matches the complement of \w.
  58     \\       Matches a literal backslash.
  59
  60 This module exports the following functions:
  61     match    Match a regular expression pattern to the beginning of a string.
  62     search   Search a string for the presence of a pattern.
  63     sub      Substitute occurrences of a pattern found in a string.
  64     subn     Same as sub, but also return the number of substitutions made.
  65     split    Split a string by the occurrences of a pattern.
  66     findall  Find all occurrences of a pattern in a string.
  67     compile  Compile a pattern into a RegexObject.
  68     escape   Backslash all non-alphanumerics in a string.
  69
  70 This module exports the following classes:
  71     RegexObject    Holds a compiled regular expression pattern.
  72     MatchObject    Contains information about pattern matches.
  73
  74 Some of the functions in this module take flags as optional parameters:
  75     I  IGNORECASE  Perform case-insensitive matching.
  76     L  LOCALE      Make \w, \W, \b, \B, dependent on the current locale.
  77     M  MULTILINE   "^" matches the beginning of lines as well as the string.
  78                    "$" matches the end of lines as well as the string.
  79     S  DOTALL      "." matches any character at all, including the newline.
  80     X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
  81
  82 This module also defines an exception 'error'.
  83
  84 """
  85
  86
  87 import sys
  88 from pcre import *
  89
  90 # XXX This module is deprecated as of Python 2.3, and should be removed
  91 # in the version that follows 2.3.
  92 import warnings as _warnings
  93 _warnings.warn("Please use the 're' module, not the 'pre' module",
  94                DeprecationWarning)
  95
  96 __all__ = ["match","search","sub","subn","split","findall","escape","compile",
  97            "I","L","M","S","X","IGNORECASE","LOCALE","MULTILINE","DOTALL",
  98            "VERBOSE","error"]  # the names bound by "from pre import *"
  99
 100 #
 101 # First, the public part of the interface:
 102 #
 103
 104 # pcre.error and re.error should be the same, since exceptions can be
 105 # raised from either module.
 106
 107 # compilation flags: one-letter aliases for the long flag names, which are
 108 # not defined in this file (presumably supplied by "from pcre import *")
 109 I = IGNORECASE
 110 L = LOCALE
 111 M = MULTILINE
 112 S = DOTALL
 113 X = VERBOSE
 114
 115
 116 #
 117 #
 118 #
 119
 120 _cache = {}
 121 _MAXCACHE = 20
 122
 123 def _cachecompile(pattern, flags=0):
 124     key = (pattern, flags)
 125     try:
 126         return _cache[key]
 127     except KeyError:
 128         pass
 129     value = compile(pattern, flags)
 130     if len(_cache) >= _MAXCACHE:
 131         _cache.clear()
 132     _cache[key] = value
 133     return value
 134
 135 def match(pattern, string, flags=0):
 136     """match (pattern, string[, flags]) -> MatchObject or None
 137
 138     If zero or more characters at the beginning of string match the
 139     regular expression pattern, return a corresponding MatchObject
 140     instance. Return None if the string does not match the pattern;
 141     note that this is different from a zero-length match.
 142
 143     Note: If you want to locate a match anywhere in string, use
 144     search() instead.
 145
 146     """
 147
 148     return _cachecompile(pattern, flags).match(string)
 149
 150 def search(pattern, string, flags=0):
 151     """search (pattern, string[, flags]) -> MatchObject or None
 152
 153     Scan through string looking for a location where the regular
 154     expression pattern produces a match, and return a corresponding
 155     MatchObject instance. Return None if no position in the string
 156     matches the pattern; note that this is different from finding a
 157     zero-length match at some point in the string.
 158
 159     """
 160     return _cachecompile(pattern, flags).search(string)
 161
 162 def sub(pattern, repl, string, count=0):
 163     """sub(pattern, repl, string[, count=0]) -> string
 164
 165     Return the string obtained by replacing the leftmost
 166     non-overlapping occurrences of pattern in string by the
 167     replacement repl. If the pattern isn't found, string is returned
 168     unchanged. repl can be a string or a function; if a function, it
 169     is called for every non-overlapping occurrence of pattern. The
 170     function takes a single match object argument, and returns the
 171     replacement string.
 172
 173     The pattern may be a string or a regex object; if you need to
 174     specify regular expression flags, you must use a regex object, or
 175     use embedded modifiers in a pattern; e.g.
 176     sub("(?i)b+", "x", "bbbb BBBB") returns 'x x'.
 177
 178     The optional argument count is the maximum number of pattern
 179     occurrences to be replaced; count must be a non-negative integer,
 180     and the default value of 0 means to replace all occurrences.
 181
 182     """
 183     if type(pattern) == type(''):
 184         pattern = _cachecompile(pattern)
 185     return pattern.sub(repl, string, count)
 186
 187 def subn(pattern, repl, string, count=0):
 188     """subn(pattern, repl, string[, count=0]) -> (string, num substitutions)
 189
 190     Perform the same operation as sub(), but return a tuple
 191     (new_string, number_of_subs_made).
 192
 193     """
 194     if type(pattern) == type(''):
 195         pattern = _cachecompile(pattern)
 196     return pattern.subn(repl, string, count)
 197
 198 def split(pattern, string, maxsplit=0):
 199     """split(pattern, string[, maxsplit=0]) -> list of strings
 200
 201     Split string by the occurrences of pattern. If capturing
 202     parentheses are used in pattern, then the text of all groups in
 203     the pattern are also returned as part of the resulting list. If
 204     maxsplit is nonzero, at most maxsplit splits occur, and the
 205     remainder of the string is returned as the final element of the
 206     list.
 207
 208     """
 209     if type(pattern) == type(''):
 210         pattern = _cachecompile(pattern)
 211     return pattern.split(string, maxsplit)
 212
 213 def findall(pattern, string):
 214     """findall(pattern, string) -> list
 215
 216     Return a list of all non-overlapping matches of pattern in
 217     string. If one or more groups are present in the pattern, return a
 218     list of groups; this will be a list of tuples if the pattern has
 219     more than one group. Empty matches are included in the result.
 220
 221     """
 222     if type(pattern) == type(''):
 223         pattern = _cachecompile(pattern)
 224     return pattern.findall(string)
 225
 226 def escape(pattern):
 227     """escape(string) -> string
 228
 229     Return string with all non-alphanumerics backslashed; this is
 230     useful if you want to match an arbitrary literal string that may
 231     have regular expression metacharacters in it.
 232
 233     """
 234     result = list(pattern)
 235     for i in range(len(pattern)):
 236         char = pattern[i]
 237         if not char.isalnum():
 238             if char=='\000': result[i] = '\\000'
 239             else: result[i] = '\\'+char
 240     return ''.join(result)
 241
 242 def compile(pattern, flags=0):
 243     """compile(pattern[, flags]) -> RegexObject
 244
 245     Compile a regular expression pattern into a regular expression
 246     object, which can be used for matching using its match() and
 247     search() methods.
 248
 249     """
 250     groupindex={}
 251     code=pcre_compile(pattern, flags, groupindex)
 252     return RegexObject(pattern, flags, code, groupindex)
 253
 254
 255 #
 256 #   Class definitions
 257 #
 258
 259 class RegexObject:
 260     """Holds a compiled regular expression pattern.
 261
 262     Methods:
 263     match    Match the pattern to the beginning of a string.
 264     search   Search a string for the presence of the pattern.
 265     sub      Substitute occurrences of the pattern found in a string.
 266     subn     Same as sub, but also return the number of substitutions made.
 267     split    Split a string by the occurrences of the pattern.
 268     findall  Find all occurrences of the pattern in a string.
 269
 270     """
 271
 272     def __init__(self, pattern, flags, code, groupindex):
 273         self.code = code
 274         self.flags = flags
 275         self.pattern = pattern
 276         self.groupindex = groupindex
 277
 278     def search(self, string, pos=0, endpos=None):
 279         """search(string[, pos][, endpos]) -> MatchObject or None
 280
 281         Scan through string looking for a location where this regular
 282         expression produces a match, and return a corresponding
 283         MatchObject instance. Return None if no position in the string
 284         matches the pattern; note that this is different from finding
 285         a zero-length match at some point in the string. The optional
 286         pos and endpos parameters have the same meaning as for the
 287         match() method.
 288
 289         """
 290         if endpos is None or endpos>len(string):
 291             endpos=len(string)
 292         if endpos<pos: endpos=pos
 293         regs = self.code.match(string, pos, endpos, 0)
 294         if regs is None:
 295             return None
 296         self._num_regs=len(regs)
 297
 298         return MatchObject(self,
 299                            string,
 300                            pos, endpos,
 301                            regs)
 302
 303     def match(self, string, pos=0, endpos=None):
 304         """match(string[, pos][, endpos]) -> MatchObject or None
 305
 306         If zero or more characters at the beginning of string match
 307         this regular expression, return a corresponding MatchObject
 308         instance. Return None if the string does not match the
 309         pattern; note that this is different from a zero-length match.
 310
 311         Note: If you want to locate a match anywhere in string, use
 312         search() instead.
 313
 314         The optional second parameter pos gives an index in the string
 315         where the search is to start; it defaults to 0.  This is not
 316         completely equivalent to slicing the string; the '' pattern
 317         character matches at the real beginning of the string and at
 318         positions just after a newline, but not necessarily at the
 319         index where the search is to start.
 320
 321         The optional parameter endpos limits how far the string will
 322         be searched; it will be as if the string is endpos characters
 323         long, so only the characters from pos to endpos will be
 324         searched for a match.
 325
 326         """
 327         if endpos is None or endpos>len(string):
 328             endpos=len(string)
 329         if endpos<pos: endpos=pos
 330         regs = self.code.match(string, pos, endpos, ANCHORED)
 331         if regs is None:
 332             return None
 333         self._num_regs=len(regs)
 334         return MatchObject(self,
 335                            string,
 336                            pos, endpos,
 337                            regs)
 338
 339     def sub(self, repl, string, count=0):
 340         """sub(repl, string[, count=0]) -> string
 341
 342         Return the string obtained by replacing the leftmost
 343         non-overlapping occurrences of the compiled pattern in string
 344         by the replacement repl. If the pattern isn't found, string is
 345         returned unchanged.
 346
 347         Identical to the sub() function, using the compiled pattern.
 348
 349         """
 350         return self.subn(repl, string, count)[0]
 351
 352     def subn(self, repl, source, count=0):
 353         """subn(repl, string[, count=0]) -> tuple
 354
 355         Perform the same operation as sub(), but return a tuple
 356         (new_string, number_of_subs_made).
 357
 358         """
 359         if count < 0:
 360             raise error, "negative substitution count"
 361         if count == 0:
 362             count = sys.maxint
 363         n = 0           # Number of matches
 364         pos = 0         # Where to start searching
 365         lastmatch = -1  # End of last match
 366         results = []    # Substrings making up the result
 367         end = len(source)
 368
 369         if type(repl) is type(''):
 370             # See if repl contains group references (if it does,
 371             # pcre_expand will attempt to call _Dummy.group, which
 372             # results in a TypeError)
 373             try:
 374                 repl = pcre_expand(_Dummy, repl)
 375             except (error, TypeError):
 376                 m = MatchObject(self, source, 0, end, [])
 377                 repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
 378             else:
 379                 m = None
 380         else:
 381             m = MatchObject(self, source, 0, end, [])
 382
 383         match = self.code.match
 384         append = results.append
 385         while n < count and pos <= end:
 386             regs = match(source, pos, end, 0)
 387             if not regs:
 388                 break
 389             self._num_regs = len(regs)
 390             i, j = regs[0]
 391             if i == j == lastmatch:
 392                 # Empty match adjacent to previous match
 393                 pos = pos + 1
 394                 append(source[lastmatch:pos])
 395                 continue
 396             if pos < i:
 397                 append(source[pos:i])
 398             if m:
 399                 m.pos = pos
 400                 m.regs = regs
 401                 append(repl(m))
 402             else:
 403                 append(repl)
 404             pos = lastmatch = j
 405             if i == j:
 406                 # Last match was empty; don't try here again
 407                 pos = pos + 1
 408                 append(source[lastmatch:pos])
 409             n = n + 1
 410         append(source[pos:])
 411         return (''.join(results), n)
 412
 413     def split(self, source, maxsplit=0):
 414         """split(source[, maxsplit=0]) -> list of strings
 415
 416         Split string by the occurrences of the compiled pattern. If
 417         capturing parentheses are used in the pattern, then the text
 418         of all groups in the pattern are also returned as part of the
 419         resulting list. If maxsplit is nonzero, at most maxsplit
 420         splits occur, and the remainder of the string is returned as
 421         the final element of the list.
 422
 423         """
 424         if maxsplit < 0:
 425             raise error, "negative split count"
 426         if maxsplit == 0:
 427             maxsplit = sys.maxint
 428         n = 0
 429         pos = 0
 430         lastmatch = 0
 431         results = []
 432         end = len(source)
 433         match = self.code.match
 434         append = results.append
 435         while n < maxsplit:
 436             regs = match(source, pos, end, 0)
 437             if not regs:
 438                 break
 439             i, j = regs[0]
 440             if i == j:
 441                 # Empty match
 442                 if pos >= end:
 443                     break
 444                 pos = pos+1
 445                 continue
 446             append(source[lastmatch:i])
 447             rest = regs[1:]
 448             if rest:
 449                 for a, b in rest:
 450                     if a == -1 or b == -1:
 451                         group = None
 452                     else:
 453                         group = source[a:b]
 454                     append(group)
 455             pos = lastmatch = j
 456             n = n + 1
 457         append(source[lastmatch:])
 458         return results
 459
 460     def findall(self, source):
 461         """findall(source) -> list
 462
 463         Return a list of all non-overlapping matches of the compiled
 464         pattern in string. If one or more groups are present in the
 465         pattern, return a list of groups; this will be a list of
 466         tuples if the pattern has more than one group. Empty matches
 467         are included in the result.
 468
 469         """
 470         pos = 0
 471         end = len(source)
 472         results = []
 473         match = self.code.match
 474         append = results.append
 475         while pos <= end:
 476             regs = match(source, pos, end, 0)
 477             if not regs:
 478                 break
 479             i, j = regs[0]
 480             rest = regs[1:]
 481             if not rest:
 482                 gr = source[i:j]
 483             elif len(rest) == 1:
 484                 a, b = rest[0]
 485                 gr = source[a:b]
 486             else:
 487                 gr = []
 488                 for (a, b) in rest:
 489                     gr.append(source[a:b])
 490                 gr = tuple(gr)
 491             append(gr)
 492             pos = max(j, pos+1)
 493         return results
 494
 495     # The following 3 functions were contributed by Mike Fletcher, and
 496     # allow pickling and unpickling of RegexObject instances.
 497     def __getinitargs__(self):
 498         return (None,None,None,None) # any 4 elements, to work around
 499                                      # problems with the
 500                                      # pickle/cPickle modules not yet
 501                                      # ignoring the __init__ function
 502     def __getstate__(self):
 503         return self.pattern, self.flags, self.groupindex
 504     def __setstate__(self, statetuple):
 505         self.pattern = statetuple[0]
 506         self.flags = statetuple[1]
 507         self.groupindex = statetuple[2]
 508         self.code = pcre_compile(*statetuple)
 509
 510 class _Dummy:
 511     # Stand-in passed to pcre_expand() by RegexObject.subn().  Has 'group' to avoid core dump.
 512     group = None
 513
 514 class MatchObject:
 515     """Holds a compiled regular expression pattern.
 516
 517     Methods:
 518     start      Return the index of the start of a matched substring.
 519     end        Return the index of the end of a matched substring.
 520     span       Return a tuple of (start, end) of a matched substring.
 521     groups     Return a tuple of all the subgroups of the match.
 522     group      Return one or more subgroups of the match.
 523     groupdict  Return a dictionary of all the named subgroups of the match.
 524
 525     """
 526
 527     def __init__(self, re, string, pos, endpos, regs):
 528         self.re = re
 529         self.string = string
 530         self.pos = pos
 531         self.endpos = endpos
 532         self.regs = regs
 533
 534     def start(self, g = 0):
 535         """start([group=0]) -> int or None
 536
 537         Return the index of the start of the substring matched by
 538         group; group defaults to zero (meaning the whole matched
 539         substring). Return -1 if group exists but did not contribute
 540         to the match.
 541
 542         """
 543         if type(g) == type(''):
 544             try:
 545                 g = self.re.groupindex[g]
 546             except (KeyError, TypeError):
 547                 raise IndexError, 'group %s is undefined' % `g`
 548         return self.regs[g][0]
 549
 550     def end(self, g = 0):
 551         """end([group=0]) -> int or None
 552
 553         Return the indices of the end of the substring matched by
 554         group; group defaults to zero (meaning the whole matched
 555         substring). Return -1 if group exists but did not contribute
 556         to the match.
 557
 558         """
 559         if type(g) == type(''):
 560             try:
 561                 g = self.re.groupindex[g]
 562             except (KeyError, TypeError):
 563                 raise IndexError, 'group %s is undefined' % `g`
 564         return self.regs[g][1]
 565
 566     def span(self, g = 0):
 567         """span([group=0]) -> tuple
 568
 569         Return the 2-tuple (m.start(group), m.end(group)). Note that
 570         if group did not contribute to the match, this is (-1,
 571         -1). Group defaults to zero (meaning the whole matched
 572         substring).
 573
 574         """
 575         if type(g) == type(''):
 576             try:
 577                 g = self.re.groupindex[g]
 578             except (KeyError, TypeError):
 579                 raise IndexError, 'group %s is undefined' % `g`
 580         return self.regs[g]
 581
 582     def groups(self, default=None):
 583         """groups([default=None]) -> tuple
 584
 585         Return a tuple containing all the subgroups of the match, from
 586         1 up to however many groups are in the pattern. The default
 587         argument is used for groups that did not participate in the
 588         match.
 589
 590         """
 591         result = []
 592         for g in range(1, self.re._num_regs):
 593             a, b = self.regs[g]
 594             if a == -1 or b == -1:
 595                 result.append(default)
 596             else:
 597                 result.append(self.string[a:b])
 598         return tuple(result)
 599
 600     def group(self, *groups):
 601         """group([group1, group2, ...]) -> string or tuple
 602
 603         Return one or more subgroups of the match. If there is a
 604         single argument, the result is a single string; if there are
 605         multiple arguments, the result is a tuple with one item per
 606         argument. Without arguments, group1 defaults to zero (i.e. the
 607         whole match is returned). If a groupN argument is zero, the
 608         corresponding return value is the entire matching string; if
 609         it is in the inclusive range [1..99], it is the string
 610         matching the the corresponding parenthesized group. If a group
 611         number is negative or larger than the number of groups defined
 612         in the pattern, an IndexError exception is raised. If a group
 613         is contained in a part of the pattern that did not match, the
 614         corresponding result is None. If a group is contained in a
 615         part of the pattern that matched multiple times, the last
 616         match is returned.
 617
 618         If the regular expression uses the (?P<name>...) syntax, the
 619         groupN arguments may also be strings identifying groups by
 620         their group name. If a string argument is not used as a group
 621         name in the pattern, an IndexError exception is raised.
 622
 623         """
 624         if len(groups) == 0:
 625             groups = (0,)
 626         result = []
 627         for g in groups:
 628             if type(g) == type(''):
 629                 try:
 630                     g = self.re.groupindex[g]
 631                 except (KeyError, TypeError):
 632                     raise IndexError, 'group %s is undefined' % `g`
 633             if g >= len(self.regs):
 634                 raise IndexError, 'group %s is undefined' % `g`
 635             a, b = self.regs[g]
 636             if a == -1 or b == -1:
 637                 result.append(None)
 638             else:
 639                 result.append(self.string[a:b])
 640         if len(result) > 1:
 641             return tuple(result)
 642         elif len(result) == 1:
 643             return result[0]
 644         else:
 645             return ()
 646
 647     def groupdict(self, default=None):
 648         """groupdict([default=None]) -> dictionary
 649
 650         Return a dictionary containing all the named subgroups of the
 651         match, keyed by the subgroup name. The default argument is
 652         used for groups that did not participate in the match.
 653
 654         """
 655         dict = {}
 656         for name, index in self.re.groupindex.items():
 657             a, b = self.regs[index]
 658             if a == -1 or b == -1:
 659                 dict[name] = default
 660             else:
 661                 dict[name] = self.string[a:b]
 662         return dict