Lib/pre.py

   1 # module 're' -- A collection of regular expression operations
   2
   3 r"""Support for regular expressions (RE).
   4
   5 This module provides regular expression matching operations similar to
   6 those found in Perl. It's 8-bit clean: the strings being processed may
   7 contain both null bytes and characters whose high bit is set. Regular
   8 expression pattern strings may not contain null bytes, but can specify
   9 the null byte using the \number notation. Characters with the high
  10 bit set may be included.
  11
  12 Regular expressions can contain both special and ordinary
  13 characters. Most ordinary characters, like "A", "a", or "0", are the
  14 simplest regular expressions; they simply match themselves. You can
  15 concatenate ordinary characters, so last matches the string 'last'.
  16
  17 The special characters are:
  18     "."      Matches any character except a newline.
  19     "^"      Matches the start of the string.
  20     "$"      Matches the end of the string.
  21     "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
  22              Greedy means that it will match as many repetitions as possible.
  23     "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
  24     "?"      Matches 0 or 1 (greedy) of the preceding RE.
  25     *?,+?,?? Non-greedy versions of the previous three special characters.
  26     {m,n}    Matches from m to n repetitions of the preceding RE.
  27     {m,n}?   Non-greedy version of the above.
  28     "\"       Either escapes special characters or signals a special sequence.
  29     []       Indicates a set of characters.
  30              A "^" as the first character indicates a complementing set.
  31     "|"      A|B, creates an RE that will match either A or B.
  32     (...)    Matches the RE inside the parentheses.
  33              The contents can be retrieved or matched later in the string.
  34     (?iLmsx) Set the I, L, M, S, or X flag for the RE.
  35     (?:...)  Non-grouping version of regular parentheses.
  36     (?P<name>...) The substring matched by the group is accessible by name.
  37     (?P=name)     Matches the text matched earlier by the group named name.
  38     (?#...)  A comment; ignored.
  39     (?=...)  Matches if ... matches next, but doesn't consume the string.
  40     (?!...)  Matches if ... doesn't match next.
  41
  42 The special sequences consist of "\" and a character from the list
  43 below. If the ordinary character is not on the list, then the
  44 resulting RE will match the second character.
  45     \number   Matches the contents of the group of the same number.
  46     \A        Matches only at the start of the string.
  47     \Z        Matches only at the end of the string.
  48     \b        Matches the empty string, but only at the start or end of a word.
  49     \B        Matches the empty string, but not at the start or end of a word.
  50     \d        Matches any decimal digit; equivalent to the set [0-9].
  51     \D        Matches any non-digit character; equivalent to the set [^0-9].
  52     \s        Matches any whitespace character; equivalent to [ \t\n\r\f\v].
  53     \S        Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
  54     \w        Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
  55              With LOCALE, it will match the set [0-9_] plus characters defined
  56              as letters for the current locale.
  57     \W        Matches the complement of \w.
  58     \\        Matches a literal backslash.
  59
  60 This module exports the following functions:
  61     match    Match a regular expression pattern to the beginning of a string.
  62     search   Search a string for the presence of a pattern.
  63     sub      Substitute occurrences of a pattern found in a string.
  64     subn     Same as sub, but also return the number of substitutions made.
  65     split    Split a string by the occurrences of a pattern.
  66     findall  Find all occurrences of a pattern in a string.
  67     compile  Compile a pattern into a RegexObject.
  68     escape   Backslash all non-alphanumerics in a string.
  69
  70 This module exports the following classes:
  71     RegexObject    Holds a compiled regular expression pattern.
  72     MatchObject    Contains information about pattern matches.
  73
  74 Some of the functions in this module take flags as optional parameters:
  75     I  IGNORECASE  Perform case-insensitive matching.
  76     L  LOCALE      Make \w, \W, \b, \B, dependent on the current locale.
  77     M  MULTILINE   "^" matches the beginning of lines as well as the string.
  78                    "$" matches the end of lines as well as the string.
  79     S  DOTALL      "." matches any character at all, including the newline.
  80     X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
  81
  82 This module also defines an exception 'error'.
  83
  84 """
  85
  86
  87 import sys
  88 from pcre import *
  89
# Public names exported by a star-import of this module: the module-level
# functions, the short and long spellings of the compilation flags, and the
# 'error' exception.
__all__ = ["match","search","sub","subn","split","findall","escape","compile",
           "I","L","M","S","X","IGNORECASE","LOCALE","MULTILINE","DOTALL",
           "VERBOSE","error"]
  93
  94 #
  95 # First, the public part of the interface:
  96 #
  97
  98 # pcre.error and re.error should be the same, since exceptions can be
  99 # raised from either module.
 100
# compilation flags
#
# One-letter aliases for the long flag names.  The long names are not
# defined in this file; presumably they arrive via the star-import from
# pcre above.

I = IGNORECASE
L = LOCALE
M = MULTILINE
S = DOTALL
X = VERBOSE
 108
 109
 110 #
 111 #
 112 #
 113
# Cache of compiled patterns used by _cachecompile(), keyed by
# (pattern, flags).
_cache = {}
# Once the cache holds this many entries, _cachecompile() empties it
# completely before storing the next one.
_MAXCACHE = 20
 116
 117 def _cachecompile(pattern, flags=0):
 118     key = (pattern, flags)
 119     try:
 120         return _cache[key]
 121     except KeyError:
 122         pass
 123     value = compile(pattern, flags)
 124     if len(_cache) >= _MAXCACHE:
 125         _cache.clear()
 126     _cache[key] = value
 127     return value
 128
 129 def match(pattern, string, flags=0):
 130     """match (pattern, string[, flags]) -> MatchObject or None
 131
 132     If zero or more characters at the beginning of string match the
 133     regular expression pattern, return a corresponding MatchObject
 134     instance. Return None if the string does not match the pattern;
 135     note that this is different from a zero-length match.
 136
 137     Note: If you want to locate a match anywhere in string, use
 138     search() instead.
 139
 140     """
 141
 142     return _cachecompile(pattern, flags).match(string)
 143
 144 def search(pattern, string, flags=0):
 145     """search (pattern, string[, flags]) -> MatchObject or None
 146
 147     Scan through string looking for a location where the regular
 148     expression pattern produces a match, and return a corresponding
 149     MatchObject instance. Return None if no position in the string
 150     matches the pattern; note that this is different from finding a
 151     zero-length match at some point in the string.
 152
 153     """
 154     return _cachecompile(pattern, flags).search(string)
 155
 156 def sub(pattern, repl, string, count=0):
 157     """sub(pattern, repl, string[, count=0]) -> string
 158
 159     Return the string obtained by replacing the leftmost
 160     non-overlapping occurrences of pattern in string by the
 161     replacement repl. If the pattern isn't found, string is returned
 162     unchanged. repl can be a string or a function; if a function, it
 163     is called for every non-overlapping occurrence of pattern. The
 164     function takes a single match object argument, and returns the
 165     replacement string.
 166
 167     The pattern may be a string or a regex object; if you need to
 168     specify regular expression flags, you must use a regex object, or
 169     use embedded modifiers in a pattern; e.g.
 170     sub("(?i)b+", "x", "bbbb BBBB") returns 'x x'.
 171
 172     The optional argument count is the maximum number of pattern
 173     occurrences to be replaced; count must be a non-negative integer,
 174     and the default value of 0 means to replace all occurrences.
 175
 176     """
 177     if type(pattern) == type(''):
 178         pattern = _cachecompile(pattern)
 179     return pattern.sub(repl, string, count)
 180
 181 def subn(pattern, repl, string, count=0):
 182     """subn(pattern, repl, string[, count=0]) -> (string, num substitutions)
 183
 184     Perform the same operation as sub(), but return a tuple
 185     (new_string, number_of_subs_made).
 186
 187     """
 188     if type(pattern) == type(''):
 189         pattern = _cachecompile(pattern)
 190     return pattern.subn(repl, string, count)
 191
 192 def split(pattern, string, maxsplit=0):
 193     """split(pattern, string[, maxsplit=0]) -> list of strings
 194
 195     Split string by the occurrences of pattern. If capturing
 196     parentheses are used in pattern, then the text of all groups in
 197     the pattern are also returned as part of the resulting list. If
 198     maxsplit is nonzero, at most maxsplit splits occur, and the
 199     remainder of the string is returned as the final element of the
 200     list.
 201
 202     """
 203     if type(pattern) == type(''):
 204         pattern = _cachecompile(pattern)
 205     return pattern.split(string, maxsplit)
 206
 207 def findall(pattern, string):
 208     """findall(pattern, string) -> list
 209
 210     Return a list of all non-overlapping matches of pattern in
 211     string. If one or more groups are present in the pattern, return a
 212     list of groups; this will be a list of tuples if the pattern has
 213     more than one group. Empty matches are included in the result.
 214
 215     """
 216     if type(pattern) == type(''):
 217         pattern = _cachecompile(pattern)
 218     return pattern.findall(string)
 219
 220 def escape(pattern):
 221     """escape(string) -> string
 222
 223     Return string with all non-alphanumerics backslashed; this is
 224     useful if you want to match an arbitrary literal string that may
 225     have regular expression metacharacters in it.
 226
 227     """
 228     result = list(pattern)
 229     for i in range(len(pattern)):
 230         char = pattern[i]
 231         if not char.isalnum():
 232             if char=='\000': result[i] = '\\000'
 233             else: result[i] = '\\'+char
 234     return ''.join(result)
 235
 236 def compile(pattern, flags=0):
 237     """compile(pattern[, flags]) -> RegexObject
 238
 239     Compile a regular expression pattern into a regular expression
 240     object, which can be used for matching using its match() and
 241     search() methods.
 242
 243     """
 244     groupindex={}
 245     code=pcre_compile(pattern, flags, groupindex)
 246     return RegexObject(pattern, flags, code, groupindex)
 247
 248
 249 #
 250 #   Class definitions
 251 #
 252
 253 class RegexObject:
 254     """Holds a compiled regular expression pattern.
 255
 256     Methods:
 257     match    Match the pattern to the beginning of a string.
 258     search   Search a string for the presence of the pattern.
 259     sub      Substitute occurrences of the pattern found in a string.
 260     subn     Same as sub, but also return the number of substitutions made.
 261     split    Split a string by the occurrences of the pattern.
 262     findall  Find all occurrences of the pattern in a string.
 263
 264     """
 265
 266     def __init__(self, pattern, flags, code, groupindex):
 267         self.code = code
 268         self.flags = flags
 269         self.pattern = pattern
 270         self.groupindex = groupindex
 271
 272     def search(self, string, pos=0, endpos=None):
 273         """search(string[, pos][, endpos]) -> MatchObject or None
 274
 275         Scan through string looking for a location where this regular
 276         expression produces a match, and return a corresponding
 277         MatchObject instance. Return None if no position in the string
 278         matches the pattern; note that this is different from finding
 279         a zero-length match at some point in the string. The optional
 280         pos and endpos parameters have the same meaning as for the
 281         match() method.
 282
 283         """
 284         if endpos is None or endpos>len(string):
 285             endpos=len(string)
 286         if endpos<pos: endpos=pos
 287         regs = self.code.match(string, pos, endpos, 0)
 288         if regs is None:
 289             return None
 290         self._num_regs=len(regs)
 291
 292         return MatchObject(self,
 293                            string,
 294                            pos, endpos,
 295                            regs)
 296
 297     def match(self, string, pos=0, endpos=None):
 298         """match(string[, pos][, endpos]) -> MatchObject or None
 299
 300         If zero or more characters at the beginning of string match
 301         this regular expression, return a corresponding MatchObject
 302         instance. Return None if the string does not match the
 303         pattern; note that this is different from a zero-length match.
 304
 305         Note: If you want to locate a match anywhere in string, use
 306         search() instead.
 307
 308         The optional second parameter pos gives an index in the string
 309         where the search is to start; it defaults to 0.  This is not
 310         completely equivalent to slicing the string; the '' pattern
 311         character matches at the real beginning of the string and at
 312         positions just after a newline, but not necessarily at the
 313         index where the search is to start.
 314
 315         The optional parameter endpos limits how far the string will
 316         be searched; it will be as if the string is endpos characters
 317         long, so only the characters from pos to endpos will be
 318         searched for a match.
 319
 320         """
 321         if endpos is None or endpos>len(string):
 322             endpos=len(string)
 323         if endpos<pos: endpos=pos
 324         regs = self.code.match(string, pos, endpos, ANCHORED)
 325         if regs is None:
 326             return None
 327         self._num_regs=len(regs)
 328         return MatchObject(self,
 329                            string,
 330                            pos, endpos,
 331                            regs)
 332
 333     def sub(self, repl, string, count=0):
 334         """sub(repl, string[, count=0]) -> string
 335
 336         Return the string obtained by replacing the leftmost
 337         non-overlapping occurrences of the compiled pattern in string
 338         by the replacement repl. If the pattern isn't found, string is
 339         returned unchanged.
 340
 341         Identical to the sub() function, using the compiled pattern.
 342
 343         """
 344         return self.subn(repl, string, count)[0]
 345
 346     def subn(self, repl, source, count=0):
 347         """subn(repl, string[, count=0]) -> tuple
 348
 349         Perform the same operation as sub(), but return a tuple
 350         (new_string, number_of_subs_made).
 351
 352         """
 353         if count < 0:
 354             raise error, "negative substitution count"
 355         if count == 0:
 356             count = sys.maxint
 357         n = 0           # Number of matches
 358         pos = 0         # Where to start searching
 359         lastmatch = -1  # End of last match
 360         results = []    # Substrings making up the result
 361         end = len(source)
 362
 363         if type(repl) is type(''):
 364             # See if repl contains group references (if it does,
 365             # pcre_expand will attempt to call _Dummy.group, which
 366             # results in a TypeError)
 367             try:
 368                 repl = pcre_expand(_Dummy, repl)
 369             except (error, TypeError):
 370                 m = MatchObject(self, source, 0, end, [])
 371                 repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
 372             else:
 373                 m = None
 374         else:
 375             m = MatchObject(self, source, 0, end, [])
 376
 377         match = self.code.match
 378         append = results.append
 379         while n < count and pos <= end:
 380             regs = match(source, pos, end, 0)
 381             if not regs:
 382                 break
 383             self._num_regs = len(regs)
 384             i, j = regs[0]
 385             if i == j == lastmatch:
 386                 # Empty match adjacent to previous match
 387                 pos = pos + 1
 388                 append(source[lastmatch:pos])
 389                 continue
 390             if pos < i:
 391                 append(source[pos:i])
 392             if m:
 393                 m.pos = pos
 394                 m.regs = regs
 395                 append(repl(m))
 396             else:
 397                 append(repl)
 398             pos = lastmatch = j
 399             if i == j:
 400                 # Last match was empty; don't try here again
 401                 pos = pos + 1
 402                 append(source[lastmatch:pos])
 403             n = n + 1
 404         append(source[pos:])
 405         return (''.join(results), n)
 406
 407     def split(self, source, maxsplit=0):
 408         """split(source[, maxsplit=0]) -> list of strings
 409
 410         Split string by the occurrences of the compiled pattern. If
 411         capturing parentheses are used in the pattern, then the text
 412         of all groups in the pattern are also returned as part of the
 413         resulting list. If maxsplit is nonzero, at most maxsplit
 414         splits occur, and the remainder of the string is returned as
 415         the final element of the list.
 416
 417         """
 418         if maxsplit < 0:
 419             raise error, "negative split count"
 420         if maxsplit == 0:
 421             maxsplit = sys.maxint
 422         n = 0
 423         pos = 0
 424         lastmatch = 0
 425         results = []
 426         end = len(source)
 427         match = self.code.match
 428         append = results.append
 429         while n < maxsplit:
 430             regs = match(source, pos, end, 0)
 431             if not regs:
 432                 break
 433             i, j = regs[0]
 434             if i == j:
 435                 # Empty match
 436                 if pos >= end:
 437                     break
 438                 pos = pos+1
 439                 continue
 440             append(source[lastmatch:i])
 441             rest = regs[1:]
 442             if rest:
 443                 for a, b in rest:
 444                     if a == -1 or b == -1:
 445                         group = None
 446                     else:
 447                         group = source[a:b]
 448                     append(group)
 449             pos = lastmatch = j
 450             n = n + 1
 451         append(source[lastmatch:])
 452         return results
 453
 454     def findall(self, source):
 455         """findall(source) -> list
 456
 457         Return a list of all non-overlapping matches of the compiled
 458         pattern in string. If one or more groups are present in the
 459         pattern, return a list of groups; this will be a list of
 460         tuples if the pattern has more than one group. Empty matches
 461         are included in the result.
 462
 463         """
 464         pos = 0
 465         end = len(source)
 466         results = []
 467         match = self.code.match
 468         append = results.append
 469         while pos <= end:
 470             regs = match(source, pos, end, 0)
 471             if not regs:
 472                 break
 473             i, j = regs[0]
 474             rest = regs[1:]
 475             if not rest:
 476                 gr = source[i:j]
 477             elif len(rest) == 1:
 478                 a, b = rest[0]
 479                 gr = source[a:b]
 480             else:
 481                 gr = []
 482                 for (a, b) in rest:
 483                     gr.append(source[a:b])
 484                 gr = tuple(gr)
 485             append(gr)
 486             pos = max(j, pos+1)
 487         return results
 488
 489     # The following 3 functions were contributed by Mike Fletcher, and
 490     # allow pickling and unpickling of RegexObject instances.
 491     def __getinitargs__(self):
 492         return (None,None,None,None) # any 4 elements, to work around
 493                                      # problems with the
 494                                      # pickle/cPickle modules not yet
 495                                      # ignoring the __init__ function
 496     def __getstate__(self):
 497         return self.pattern, self.flags, self.groupindex
 498     def __setstate__(self, statetuple):
 499         self.pattern = statetuple[0]
 500         self.flags = statetuple[1]
 501         self.groupindex = statetuple[2]
 502         self.code = apply(pcre_compile, statetuple)
 503
class _Dummy:
    # Dummy class used by RegexObject.subn().  It is passed to
    # pcre_expand() in place of a match object to find out whether a
    # replacement string contains group references: 'group' is None, so
    # an attempt to call it raises TypeError, which subn() catches.
    # Has 'group' to avoid core dump.
    group = None
 507
 508 class MatchObject:
 509     """Holds a compiled regular expression pattern.
 510
 511     Methods:
 512     start      Return the index of the start of a matched substring.
 513     end        Return the index of the end of a matched substring.
 514     span       Return a tuple of (start, end) of a matched substring.
 515     groups     Return a tuple of all the subgroups of the match.
 516     group      Return one or more subgroups of the match.
 517     groupdict  Return a dictionary of all the named subgroups of the match.
 518
 519     """
 520
 521     def __init__(self, re, string, pos, endpos, regs):
 522         self.re = re
 523         self.string = string
 524         self.pos = pos
 525         self.endpos = endpos
 526         self.regs = regs
 527
 528     def start(self, g = 0):
 529         """start([group=0]) -> int or None
 530
 531         Return the index of the start of the substring matched by
 532         group; group defaults to zero (meaning the whole matched
 533         substring). Return -1 if group exists but did not contribute
 534         to the match.
 535
 536         """
 537         if type(g) == type(''):
 538             try:
 539                 g = self.re.groupindex[g]
 540             except (KeyError, TypeError):
 541                 raise IndexError, 'group %s is undefined' % `g`
 542         return self.regs[g][0]
 543
 544     def end(self, g = 0):
 545         """end([group=0]) -> int or None
 546
 547         Return the indices of the end of the substring matched by
 548         group; group defaults to zero (meaning the whole matched
 549         substring). Return -1 if group exists but did not contribute
 550         to the match.
 551
 552         """
 553         if type(g) == type(''):
 554             try:
 555                 g = self.re.groupindex[g]
 556             except (KeyError, TypeError):
 557                 raise IndexError, 'group %s is undefined' % `g`
 558         return self.regs[g][1]
 559
 560     def span(self, g = 0):
 561         """span([group=0]) -> tuple
 562
 563         Return the 2-tuple (m.start(group), m.end(group)). Note that
 564         if group did not contribute to the match, this is (-1,
 565         -1). Group defaults to zero (meaning the whole matched
 566         substring).
 567
 568         """
 569         if type(g) == type(''):
 570             try:
 571                 g = self.re.groupindex[g]
 572             except (KeyError, TypeError):
 573                 raise IndexError, 'group %s is undefined' % `g`
 574         return self.regs[g]
 575
 576     def groups(self, default=None):
 577         """groups([default=None]) -> tuple
 578
 579         Return a tuple containing all the subgroups of the match, from
 580         1 up to however many groups are in the pattern. The default
 581         argument is used for groups that did not participate in the
 582         match.
 583
 584         """
 585         result = []
 586         for g in range(1, self.re._num_regs):
 587             a, b = self.regs[g]
 588             if a == -1 or b == -1:
 589                 result.append(default)
 590             else:
 591                 result.append(self.string[a:b])
 592         return tuple(result)
 593
 594     def group(self, *groups):
 595         """group([group1, group2, ...]) -> string or tuple
 596
 597         Return one or more subgroups of the match. If there is a
 598         single argument, the result is a single string; if there are
 599         multiple arguments, the result is a tuple with one item per
 600         argument. Without arguments, group1 defaults to zero (i.e. the
 601         whole match is returned). If a groupN argument is zero, the
 602         corresponding return value is the entire matching string; if
 603         it is in the inclusive range [1..99], it is the string
 604         matching the the corresponding parenthesized group. If a group
 605         number is negative or larger than the number of groups defined
 606         in the pattern, an IndexError exception is raised. If a group
 607         is contained in a part of the pattern that did not match, the
 608         corresponding result is None. If a group is contained in a
 609         part of the pattern that matched multiple times, the last
 610         match is returned.
 611
 612         If the regular expression uses the (?P<name>...) syntax, the
 613         groupN arguments may also be strings identifying groups by
 614         their group name. If a string argument is not used as a group
 615         name in the pattern, an IndexError exception is raised.
 616
 617         """
 618         if len(groups) == 0:
 619             groups = (0,)
 620         result = []
 621         for g in groups:
 622             if type(g) == type(''):
 623                 try:
 624                     g = self.re.groupindex[g]
 625                 except (KeyError, TypeError):
 626                     raise IndexError, 'group %s is undefined' % `g`
 627             if g >= len(self.regs):
 628                 raise IndexError, 'group %s is undefined' % `g`
 629             a, b = self.regs[g]
 630             if a == -1 or b == -1:
 631                 result.append(None)
 632             else:
 633                 result.append(self.string[a:b])
 634         if len(result) > 1:
 635             return tuple(result)
 636         elif len(result) == 1:
 637             return result[0]
 638         else:
 639             return ()
 640
 641     def groupdict(self, default=None):
 642         """groupdict([default=None]) -> dictionary
 643
 644         Return a dictionary containing all the named subgroups of the
 645         match, keyed by the subgroup name. The default argument is
 646         used for groups that did not participate in the match.
 647
 648         """
 649         dict = {}
 650         for name, index in self.re.groupindex.items():
 651             a, b = self.regs[index]
 652             if a == -1 or b == -1:
 653                 dict[name] = default
 654             else:
 655                 dict[name] = self.string[a:b]
 656         return dict