Lib/pre.py

   1 # module 're' -- A collection of regular expression operations
   2
   3 """Support for regular expressions (RE).
   4
   5 This module provides regular expression matching operations similar to
   6 those found in Perl. It's 8-bit clean: the strings being processed may
   7 contain both null bytes and characters whose high bit is set. Regular
   8 expression pattern strings may not contain null bytes, but can specify
   9 the null byte using the \\number notation. Characters with the high
  10 bit set may be included.
  11
  12 Regular expressions can contain both special and ordinary
  13 characters. Most ordinary characters, like "A", "a", or "0", are the
  14 simplest regular expressions; they simply match themselves. You can
  15 concatenate ordinary characters, so last matches the string 'last'.
  16
  17 The special characters are:
  18     "."      Matches any character except a newline.
  19     "^"      Matches the start of the string.
  20     "$"      Matches the end of the string.
  21     "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
  22              Greedy means that it will match as many repetitions as possible.
  23     "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
  24     "?"      Matches 0 or 1 (greedy) of the preceding RE.
  25     *?,+?,?? Non-greedy versions of the previous three special characters.
  26     {m,n}    Matches from m to n repetitions of the preceding RE.
  27     {m,n}?   Non-greedy version of the above.
  28     "\\"      Either escapes special characters or signals a special sequence.
  29     []       Indicates a set of characters.
  30              A "^" as the first character indicates a complementing set.
  31     "|"      A|B, creates an RE that will match either A or B.
  32     (...)    Matches the RE inside the parentheses.
  33              The contents can be retrieved or matched later in the string.
  34     (?iLmsx) Set the I, L, M, S, or X flag for the RE.
  35     (?:...)  Non-grouping version of regular parentheses.
  36     (?P<name>...) The substring matched by the group is accessible by name.
  37     (?P=name)     Matches the text matched earlier by the group named name.
  38     (?#...)  A comment; ignored.
  39     (?=...)  Matches if ... matches next, but doesn't consume the string.
  40     (?!...)  Matches if ... doesn't match next.
  41
  42 The special sequences consist of "\\" and a character from the list
  43 below. If the ordinary character is not on the list, then the
  44 resulting RE will match the second character.
  45     \\number  Matches the contents of the group of the same number.
  46     \\A       Matches only at the start of the string.
  47     \\Z       Matches only at the end of the string.
  48     \\b       Matches the empty string, but only at the start or end of a word.
  49     \\B       Matches the empty string, but not at the start or end of a word.
  50     \\d       Matches any decimal digit; equivalent to the set [0-9].
  51     \\D       Matches any non-digit character; equivalent to the set [^0-9].
  52     \\s       Matches any whitespace character; equivalent to [ \\t\\n\\r\\f\\v].
  53     \\S       Matches any non-whitespace character; equiv. to [^ \\t\\n\\r\\f\\v].
  54     \\w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
  55              With LOCALE, it will match the set [0-9_] plus characters defined
  56              as letters for the current locale.
  57     \\W       Matches the complement of \\w.
  58     \\\\       Matches a literal backslash.
  59
  60 This module exports the following functions:
  61     match    Match a regular expression pattern to the beginning of a string.
  62     search   Search a string for the presence of a pattern.
  63     sub      Substitute occurrences of a pattern found in a string.
  64     subn     Same as sub, but also return the number of substitutions made.
  65     split    Split a string by the occurrences of a pattern.
  66     findall  Find all occurrences of a pattern in a string.
  67     compile  Compile a pattern into a RegexObject.
  68     escape   Backslash all non-alphanumerics in a string.
  69
  70 This module exports the following classes:
  71     RegexObject    Holds a compiled regular expression pattern.
  72     MatchObject    Contains information about pattern matches.
  73
   74 Some of the functions in this module take flags as optional parameters:
   75     I  IGNORECASE  Perform case-insensitive matching.
   76     L  LOCALE      Make \\w, \\W, \\b, \\B, dependent on the current locale.
  77     M  MULTILINE   "^" matches the beginning of lines as well as the string.
  78                    "$" matches the end of lines as well as the string.
  79     S  DOTALL      "." matches any character at all, including the newline.
  80     X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
  81
  82 This module also defines an exception 'error'.
  83
  84 """
  85
  86
  87 import sys
  88 from pcre import *
  89
  90 __all__ = ["match","search","sub","subn","split","findall","escape","compile",
  91            "I","L","M","S","X","IGNORECASE","LOCALE","MULTILINE","DOTALL",
  92            "VERBOSE","error"]
  93
  94 #
  95 # First, the public part of the interface:
  96 #
  97
  98 # pcre.error and re.error should be the same, since exceptions can be
  99 # raised from either module.
 100
 101 # compilation flags
 102
 103 I = IGNORECASE
 104 L = LOCALE
 105 M = MULTILINE
 106 S = DOTALL
 107 X = VERBOSE
 108
 109
 110 #
 111 #
 112 #
 113
 114 _cache = {}
 115 _MAXCACHE = 20
 116
 117 def _cachecompile(pattern, flags=0):
 118     key = (pattern, flags)
 119     try:
 120         return _cache[key]
 121     except KeyError:
 122         pass
 123     value = compile(pattern, flags)
 124     if len(_cache) >= _MAXCACHE:
 125         _cache.clear()
 126     _cache[key] = value
 127     return value
 128
 129 def match(pattern, string, flags=0):
 130     """match (pattern, string[, flags]) -> MatchObject or None
 131
 132     If zero or more characters at the beginning of string match the
 133     regular expression pattern, return a corresponding MatchObject
 134     instance. Return None if the string does not match the pattern;
 135     note that this is different from a zero-length match.
 136
 137     Note: If you want to locate a match anywhere in string, use
 138     search() instead.
 139
 140     """
 141
 142     return _cachecompile(pattern, flags).match(string)
 143
 144 def search(pattern, string, flags=0):
 145     """search (pattern, string[, flags]) -> MatchObject or None
 146
 147     Scan through string looking for a location where the regular
 148     expression pattern produces a match, and return a corresponding
 149     MatchObject instance. Return None if no position in the string
 150     matches the pattern; note that this is different from finding a
 151     zero-length match at some point in the string.
 152
 153     """
 154     return _cachecompile(pattern, flags).search(string)
 155
 156 def sub(pattern, repl, string, count=0):
 157     """sub(pattern, repl, string[, count=0]) -> string
 158
 159     Return the string obtained by replacing the leftmost
 160     non-overlapping occurrences of pattern in string by the
 161     replacement repl. If the pattern isn't found, string is returned
 162     unchanged. repl can be a string or a function; if a function, it
 163     is called for every non-overlapping occurrence of pattern. The
 164     function takes a single match object argument, and returns the
 165     replacement string.
 166
 167     The pattern may be a string or a regex object; if you need to
 168     specify regular expression flags, you must use a regex object, or
 169     use embedded modifiers in a pattern; e.g.
 170     sub("(?i)b+", "x", "bbbb BBBB") returns 'x x'.
 171
 172     The optional argument count is the maximum number of pattern
 173     occurrences to be replaced; count must be a non-negative integer,
 174     and the default value of 0 means to replace all occurrences.
 175
 176     """
 177     if type(pattern) == type(''):
 178         pattern = _cachecompile(pattern)
 179     return pattern.sub(repl, string, count)
 180
 181 def subn(pattern, repl, string, count=0):
 182     """subn(pattern, repl, string[, count=0]) -> (string, num substitutions)
 183
 184     Perform the same operation as sub(), but return a tuple
 185     (new_string, number_of_subs_made).
 186
 187     """
 188     if type(pattern) == type(''):
 189         pattern = _cachecompile(pattern)
 190     return pattern.subn(repl, string, count)
 191
 192 def split(pattern, string, maxsplit=0):
 193     """split(pattern, string[, maxsplit=0]) -> list of strings
 194
 195     Split string by the occurrences of pattern. If capturing
 196     parentheses are used in pattern, then the text of all groups in
 197     the pattern are also returned as part of the resulting list. If
 198     maxsplit is nonzero, at most maxsplit splits occur, and the
 199     remainder of the string is returned as the final element of the
 200     list.
 201
 202     """
 203     if type(pattern) == type(''):
 204         pattern = _cachecompile(pattern)
 205     return pattern.split(string, maxsplit)
 206
 207 def findall(pattern, string):
 208     """findall(pattern, string) -> list
 209
 210     Return a list of all non-overlapping matches of pattern in
 211     string. If one or more groups are present in the pattern, return a
 212     list of groups; this will be a list of tuples if the pattern has
 213     more than one group. Empty matches are included in the result.
 214
 215     """
 216     if type(pattern) == type(''):
 217         pattern = _cachecompile(pattern)
 218     return pattern.findall(string)
 219
 220 def escape(pattern):
 221     """escape(string) -> string
 222
 223     Return string with all non-alphanumerics backslashed; this is
 224     useful if you want to match an arbitrary literal string that may
 225     have regular expression metacharacters in it.
 226
 227     """
 228     result = list(pattern)
 229     for i in range(len(pattern)):
 230         char = pattern[i]
 231         if not char.isalnum():
 232             if char=='\000': result[i] = '\\000'
 233             else: result[i] = '\\'+char
 234     return ''.join(result)
 235
 236 def compile(pattern, flags=0):
 237     """compile(pattern[, flags]) -> RegexObject
 238
 239     Compile a regular expression pattern into a regular expression
 240     object, which can be used for matching using its match() and
 241     search() methods.
 242
 243     """
 244     groupindex={}
 245     code=pcre_compile(pattern, flags, groupindex)
 246     return RegexObject(pattern, flags, code, groupindex)
 247
 248
 249 #
 250 #   Class definitions
 251 #
 252
 253 class RegexObject:
 254     """Holds a compiled regular expression pattern.
 255
 256     Methods:
 257     match    Match the pattern to the beginning of a string.
 258     search   Search a string for the presence of the pattern.
 259     sub      Substitute occurrences of the pattern found in a string.
 260     subn     Same as sub, but also return the number of substitutions made.
 261     split    Split a string by the occurrences of the pattern.
 262     findall  Find all occurrences of the pattern in a string.
 263
 264     """
 265
 266     def __init__(self, pattern, flags, code, groupindex):
 267         self.code = code
 268         self.flags = flags
 269         self.pattern = pattern
 270         self.groupindex = groupindex
 271
 272     def search(self, string, pos=0, endpos=None):
 273         """search(string[, pos][, endpos]) -> MatchObject or None
 274
 275         Scan through string looking for a location where this regular
 276         expression produces a match, and return a corresponding
 277         MatchObject instance. Return None if no position in the string
 278         matches the pattern; note that this is different from finding
 279         a zero-length match at some point in the string. The optional
 280         pos and endpos parameters have the same meaning as for the
 281         match() method.
 282
 283         """
 284         if endpos is None or endpos>len(string):
 285             endpos=len(string)
 286         if endpos<pos: endpos=pos
 287         regs = self.code.match(string, pos, endpos, 0)
 288         if regs is None:
 289             return None
 290         self._num_regs=len(regs)
 291
 292         return MatchObject(self,
 293                            string,
 294                            pos, endpos,
 295                            regs)
 296
 297     def match(self, string, pos=0, endpos=None):
 298         """match(string[, pos][, endpos]) -> MatchObject or None
 299
 300         If zero or more characters at the beginning of string match
 301         this regular expression, return a corresponding MatchObject
 302         instance. Return None if the string does not match the
 303         pattern; note that this is different from a zero-length match.
 304
 305         Note: If you want to locate a match anywhere in string, use
 306         search() instead.
 307
 308         The optional second parameter pos gives an index in the string
 309         where the search is to start; it defaults to 0.  This is not
 310         completely equivalent to slicing the string; the '' pattern
 311         character matches at the real beginning of the string and at
 312         positions just after a newline, but not necessarily at the
 313         index where the search is to start.
 314
 315         The optional parameter endpos limits how far the string will
 316         be searched; it will be as if the string is endpos characters
 317         long, so only the characters from pos to endpos will be
 318         searched for a match.
 319
 320         """
 321         if endpos is None or endpos>len(string):
 322             endpos=len(string)
 323         if endpos<pos: endpos=pos
 324         regs = self.code.match(string, pos, endpos, ANCHORED)
 325         if regs is None:
 326             return None
 327         self._num_regs=len(regs)
 328         return MatchObject(self,
 329                            string,
 330                            pos, endpos,
 331                            regs)
 332
 333     def sub(self, repl, string, count=0):
 334         """sub(repl, string[, count=0]) -> string
 335
 336         Return the string obtained by replacing the leftmost
 337         non-overlapping occurrences of the compiled pattern in string
 338         by the replacement repl. If the pattern isn't found, string is
 339         returned unchanged.
 340
 341         Identical to the sub() function, using the compiled pattern.
 342
 343         """
 344         return self.subn(repl, string, count)[0]
 345
 346     def subn(self, repl, source, count=0):
 347         """subn(repl, string[, count=0]) -> tuple
 348
 349         Perform the same operation as sub(), but return a tuple
 350         (new_string, number_of_subs_made).
 351
 352         """
 353         if count < 0:
 354             raise error, "negative substitution count"
 355         if count == 0:
 356             count = sys.maxint
 357         n = 0           # Number of matches
 358         pos = 0         # Where to start searching
 359         lastmatch = -1  # End of last match
 360         results = []    # Substrings making up the result
 361         end = len(source)
 362
 363         if type(repl) is type(''):
 364             # See if repl contains group references
 365             try:
 366                 repl = pcre_expand(_Dummy, repl)
 367             except:
 368                 m = MatchObject(self, source, 0, end, [])
 369                 repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
 370             else:
 371                 m = None
 372         else:
 373             m = MatchObject(self, source, 0, end, [])
 374
 375         match = self.code.match
 376         append = results.append
 377         while n < count and pos <= end:
 378             regs = match(source, pos, end, 0)
 379             if not regs:
 380                 break
 381             self._num_regs = len(regs)
 382             i, j = regs[0]
 383             if i == j == lastmatch:
 384                 # Empty match adjacent to previous match
 385                 pos = pos + 1
 386                 append(source[lastmatch:pos])
 387                 continue
 388             if pos < i:
 389                 append(source[pos:i])
 390             if m:
 391                 m.pos = pos
 392                 m.regs = regs
 393                 append(repl(m))
 394             else:
 395                 append(repl)
 396             pos = lastmatch = j
 397             if i == j:
 398                 # Last match was empty; don't try here again
 399                 pos = pos + 1
 400                 append(source[lastmatch:pos])
 401             n = n + 1
 402         append(source[pos:])
 403         return (''.join(results), n)
 404
 405     def split(self, source, maxsplit=0):
 406         """split(source[, maxsplit=0]) -> list of strings
 407
 408         Split string by the occurrences of the compiled pattern. If
 409         capturing parentheses are used in the pattern, then the text
 410         of all groups in the pattern are also returned as part of the
 411         resulting list. If maxsplit is nonzero, at most maxsplit
 412         splits occur, and the remainder of the string is returned as
 413         the final element of the list.
 414
 415         """
 416         if maxsplit < 0:
 417             raise error, "negative split count"
 418         if maxsplit == 0:
 419             maxsplit = sys.maxint
 420         n = 0
 421         pos = 0
 422         lastmatch = 0
 423         results = []
 424         end = len(source)
 425         match = self.code.match
 426         append = results.append
 427         while n < maxsplit:
 428             regs = match(source, pos, end, 0)
 429             if not regs:
 430                 break
 431             i, j = regs[0]
 432             if i == j:
 433                 # Empty match
 434                 if pos >= end:
 435                     break
 436                 pos = pos+1
 437                 continue
 438             append(source[lastmatch:i])
 439             rest = regs[1:]
 440             if rest:
 441                 for a, b in rest:
 442                     if a == -1 or b == -1:
 443                         group = None
 444                     else:
 445                         group = source[a:b]
 446                     append(group)
 447             pos = lastmatch = j
 448             n = n + 1
 449         append(source[lastmatch:])
 450         return results
 451
 452     def findall(self, source):
 453         """findall(source) -> list
 454
 455         Return a list of all non-overlapping matches of the compiled
 456         pattern in string. If one or more groups are present in the
 457         pattern, return a list of groups; this will be a list of
 458         tuples if the pattern has more than one group. Empty matches
 459         are included in the result.
 460
 461         """
 462         pos = 0
 463         end = len(source)
 464         results = []
 465         match = self.code.match
 466         append = results.append
 467         while pos <= end:
 468             regs = match(source, pos, end, 0)
 469             if not regs:
 470                 break
 471             i, j = regs[0]
 472             rest = regs[1:]
 473             if not rest:
 474                 gr = source[i:j]
 475             elif len(rest) == 1:
 476                 a, b = rest[0]
 477                 gr = source[a:b]
 478             else:
 479                 gr = []
 480                 for (a, b) in rest:
 481                     gr.append(source[a:b])
 482                 gr = tuple(gr)
 483             append(gr)
 484             pos = max(j, pos+1)
 485         return results
 486
 487     # The following 3 functions were contributed by Mike Fletcher, and
 488     # allow pickling and unpickling of RegexObject instances.
 489     def __getinitargs__(self):
 490         return (None,None,None,None) # any 4 elements, to work around
 491                                      # problems with the
 492                                      # pickle/cPickle modules not yet
 493                                      # ignoring the __init__ function
 494     def __getstate__(self):
 495         return self.pattern, self.flags, self.groupindex
 496     def __setstate__(self, statetuple):
 497         self.pattern = statetuple[0]
 498         self.flags = statetuple[1]
 499         self.groupindex = statetuple[2]
 500         self.code = apply(pcre_compile, statetuple)
 501
class _Dummy:
    # Stand-in for a match object.  RegexObject.subn() passes this class
    # to pcre_expand() when it expands a replacement string before any
    # real match exists.  Has 'group' to avoid core dump.
    group = None
 505
 506 class MatchObject:
 507     """Holds a compiled regular expression pattern.
 508
 509     Methods:
 510     start      Return the index of the start of a matched substring.
 511     end        Return the index of the end of a matched substring.
 512     span       Return a tuple of (start, end) of a matched substring.
 513     groups     Return a tuple of all the subgroups of the match.
 514     group      Return one or more subgroups of the match.
 515     groupdict  Return a dictionary of all the named subgroups of the match.
 516
 517     """
 518
 519     def __init__(self, re, string, pos, endpos, regs):
 520         self.re = re
 521         self.string = string
 522         self.pos = pos
 523         self.endpos = endpos
 524         self.regs = regs
 525
 526     def start(self, g = 0):
 527         """start([group=0]) -> int or None
 528
 529         Return the index of the start of the substring matched by
 530         group; group defaults to zero (meaning the whole matched
 531         substring). Return -1 if group exists but did not contribute
 532         to the match.
 533
 534         """
 535         if type(g) == type(''):
 536             try:
 537                 g = self.re.groupindex[g]
 538             except (KeyError, TypeError):
 539                 raise IndexError, 'group %s is undefined' % `g`
 540         return self.regs[g][0]
 541
 542     def end(self, g = 0):
 543         """end([group=0]) -> int or None
 544
 545         Return the indices of the end of the substring matched by
 546         group; group defaults to zero (meaning the whole matched
 547         substring). Return -1 if group exists but did not contribute
 548         to the match.
 549
 550         """
 551         if type(g) == type(''):
 552             try:
 553                 g = self.re.groupindex[g]
 554             except (KeyError, TypeError):
 555                 raise IndexError, 'group %s is undefined' % `g`
 556         return self.regs[g][1]
 557
 558     def span(self, g = 0):
 559         """span([group=0]) -> tuple
 560
 561         Return the 2-tuple (m.start(group), m.end(group)). Note that
 562         if group did not contribute to the match, this is (-1,
 563         -1). Group defaults to zero (meaning the whole matched
 564         substring).
 565
 566         """
 567         if type(g) == type(''):
 568             try:
 569                 g = self.re.groupindex[g]
 570             except (KeyError, TypeError):
 571                 raise IndexError, 'group %s is undefined' % `g`
 572         return self.regs[g]
 573
 574     def groups(self, default=None):
 575         """groups([default=None]) -> tuple
 576
 577         Return a tuple containing all the subgroups of the match, from
 578         1 up to however many groups are in the pattern. The default
 579         argument is used for groups that did not participate in the
 580         match.
 581
 582         """
 583         result = []
 584         for g in range(1, self.re._num_regs):
 585             a, b = self.regs[g]
 586             if a == -1 or b == -1:
 587                 result.append(default)
 588             else:
 589                 result.append(self.string[a:b])
 590         return tuple(result)
 591
 592     def group(self, *groups):
 593         """group([group1, group2, ...]) -> string or tuple
 594
 595         Return one or more subgroups of the match. If there is a
 596         single argument, the result is a single string; if there are
 597         multiple arguments, the result is a tuple with one item per
 598         argument. Without arguments, group1 defaults to zero (i.e. the
 599         whole match is returned). If a groupN argument is zero, the
 600         corresponding return value is the entire matching string; if
 601         it is in the inclusive range [1..99], it is the string
 602         matching the the corresponding parenthesized group. If a group
 603         number is negative or larger than the number of groups defined
 604         in the pattern, an IndexError exception is raised. If a group
 605         is contained in a part of the pattern that did not match, the
 606         corresponding result is None. If a group is contained in a
 607         part of the pattern that matched multiple times, the last
 608         match is returned.
 609
 610         If the regular expression uses the (?P<name>...) syntax, the
 611         groupN arguments may also be strings identifying groups by
 612         their group name. If a string argument is not used as a group
 613         name in the pattern, an IndexError exception is raised.
 614
 615         """
 616         if len(groups) == 0:
 617             groups = (0,)
 618         result = []
 619         for g in groups:
 620             if type(g) == type(''):
 621                 try:
 622                     g = self.re.groupindex[g]
 623                 except (KeyError, TypeError):
 624                     raise IndexError, 'group %s is undefined' % `g`
 625             if g >= len(self.regs):
 626                 raise IndexError, 'group %s is undefined' % `g`
 627             a, b = self.regs[g]
 628             if a == -1 or b == -1:
 629                 result.append(None)
 630             else:
 631                 result.append(self.string[a:b])
 632         if len(result) > 1:
 633             return tuple(result)
 634         elif len(result) == 1:
 635             return result[0]
 636         else:
 637             return ()
 638
 639     def groupdict(self, default=None):
 640         """groupdict([default=None]) -> dictionary
 641
 642         Return a dictionary containing all the named subgroups of the
 643         match, keyed by the subgroup name. The default argument is
 644         used for groups that did not participate in the match.
 645
 646         """
 647         dict = {}
 648         for name, index in self.re.groupindex.items():
 649             a, b = self.regs[index]
 650             if a == -1 or b == -1:
 651                 dict[name] = default
 652             else:
 653                 dict[name] = self.string[a:b]
 654         return dict