Lib/pre.py

   1 # module 're' -- A collection of regular expression operations
   2
   3 """Support for regular expressions (RE).
   4
   5 This module provides regular expression matching operations similar to
   6 those found in Perl. It's 8-bit clean: the strings being processed may
   7 contain both null bytes and characters whose high bit is set. Regular
   8 expression pattern strings may not contain null bytes, but can specify
   9 the null byte using the \\number notation. Characters with the high
  10 bit set may be included.
  11
  12 Regular expressions can contain both special and ordinary
  13 characters. Most ordinary characters, like "A", "a", or "0", are the
  14 simplest regular expressions; they simply match themselves. You can
  15 concatenate ordinary characters, so last matches the string 'last'.
  16
  17 The special characters are:
  18     "."      Matches any character except a newline.
  19     "^"      Matches the start of the string.
  20     "$"      Matches the end of the string.
  21     "*"      Matches 0 or more (greedy) repetitions of the preceding RE.
  22              Greedy means that it will match as many repetitions as possible.
  23     "+"      Matches 1 or more (greedy) repetitions of the preceding RE.
  24     "?"      Matches 0 or 1 (greedy) of the preceding RE.
  25     *?,+?,?? Non-greedy versions of the previous three special characters.
  26     {m,n}    Matches from m to n repetitions of the preceding RE.
  27     {m,n}?   Non-greedy version of the above.
  28     "\\"      Either escapes special characters or signals a special sequence.
  29     []       Indicates a set of characters.
  30              A "^" as the first character indicates a complementing set.
  31     "|"      A|B, creates an RE that will match either A or B.
  32     (...)    Matches the RE inside the parentheses.
  33              The contents can be retrieved or matched later in the string.
  34     (?iLmsx) Set the I, L, M, S, or X flag for the RE.
  35     (?:...)  Non-grouping version of regular parentheses.
  36     (?P<name>...) The substring matched by the group is accessible by name.
  37     (?P=name)     Matches the text matched earlier by the group named name.
  38     (?#...)  A comment; ignored.
  39     (?=...)  Matches if ... matches next, but doesn't consume the string.
  40     (?!...)  Matches if ... doesn't match next.
  41
  42 The special sequences consist of "\\" and a character from the list
  43 below. If the ordinary character is not on the list, then the
  44 resulting RE will match the second character.
  45     \\number  Matches the contents of the group of the same number.
  46     \\A       Matches only at the start of the string.
  47     \\Z       Matches only at the end of the string.
  48     \\b       Matches the empty string, but only at the start or end of a word.
  49     \\B       Matches the empty string, but not at the start or end of a word.
  50     \\d       Matches any decimal digit; equivalent to the set [0-9].
  51     \\D       Matches any non-digit character; equivalent to the set [^0-9].
  52     \\s       Matches any whitespace character; equivalent to [ \\t\\n\\r\\f\\v].
  53     \\S       Matches any non-whitespace character; equiv. to [^ \\t\\n\\r\\f\\v].
  54     \\w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
  55              With LOCALE, it will match the set [0-9_] plus characters defined
  56              as letters for the current locale.
  57     \\W       Matches the complement of \\w.
  58     \\\\       Matches a literal backslash.
  59
  60 This module exports the following functions:
  61     match    Match a regular expression pattern to the beginning of a string.
  62     search   Search a string for the presence of a pattern.
  63     sub      Substitute occurrences of a pattern found in a string.
  64     subn     Same as sub, but also return the number of substitutions made.
  65     split    Split a string by the occurrences of a pattern.
  66     findall  Find all occurrences of a pattern in a string.
  67     compile  Compile a pattern into a RegexObject.
  68     escape   Backslash all non-alphanumerics in a string.
  69
  70 This module exports the following classes:
  71     RegexObject    Holds a compiled regular expression pattern.
  72     MatchObject    Contains information about pattern matches.
  73
  74 Some of the functions in this module take flags as optional parameters:
  75     I  IGNORECASE  Perform case-insensitive matching.
  76     L  LOCALE      Make \\w, \\W, \\b, \\B, dependent on the current locale.
  77     M  MULTILINE   "^" matches the beginning of lines as well as the string.
  78                    "$" matches the end of lines as well as the string.
  79     S  DOTALL      "." matches any character at all, including the newline.
  80     X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
  81
  82 This module also defines an exception 'error'.
  83
  84 """
  85
  86
  87 import sys
  88 import string
  89 from pcre import *
  90
  91 #
  92 # First, the public part of the interface:
  93 #
  94
  95 # pcre.error and re.error should be the same, since exceptions can be
  96 # raised from either module.
  97
  98 # compilation flags
  99
 100 I = IGNORECASE
 101 L = LOCALE
 102 M = MULTILINE
 103 S = DOTALL
 104 X = VERBOSE
 105
 106
 107 #
 108 #
 109 #
 110
 111 _cache = {}
 112 _MAXCACHE = 20
 113
 114 def _cachecompile(pattern, flags=0):
 115     key = (pattern, flags)
 116     try:
 117         return _cache[key]
 118     except KeyError:
 119         pass
 120     value = compile(pattern, flags)
 121     if len(_cache) >= _MAXCACHE:
 122         _cache.clear()
 123     _cache[key] = value
 124     return value
 125
 126 def match(pattern, string, flags=0):
 127     """match (pattern, string[, flags]) -> MatchObject or None
 128
 129     If zero or more characters at the beginning of string match the
 130     regular expression pattern, return a corresponding MatchObject
 131     instance. Return None if the string does not match the pattern;
 132     note that this is different from a zero-length match.
 133
 134     Note: If you want to locate a match anywhere in string, use
 135     search() instead.
 136
 137     """
 138
 139     return _cachecompile(pattern, flags).match(string)
 140
 141 def search(pattern, string, flags=0):
 142     """search (pattern, string[, flags]) -> MatchObject or None
 143
 144     Scan through string looking for a location where the regular
 145     expression pattern produces a match, and return a corresponding
 146     MatchObject instance. Return None if no position in the string
 147     matches the pattern; note that this is different from finding a
 148     zero-length match at some point in the string.
 149
 150     """
 151     return _cachecompile(pattern, flags).search(string)
 152
 153 def sub(pattern, repl, string, count=0):
 154     """sub(pattern, repl, string[, count=0]) -> string
 155
 156     Return the string obtained by replacing the leftmost
 157     non-overlapping occurrences of pattern in string by the
 158     replacement repl. If the pattern isn't found, string is returned
 159     unchanged. repl can be a string or a function; if a function, it
 160     is called for every non-overlapping occurrence of pattern. The
 161     function takes a single match object argument, and returns the
 162     replacement string.
 163
 164     The pattern may be a string or a regex object; if you need to
 165     specify regular expression flags, you must use a regex object, or
 166     use embedded modifiers in a pattern; e.g.
 167     sub("(?i)b+", "x", "bbbb BBBB") returns 'x x'.
 168
 169     The optional argument count is the maximum number of pattern
 170     occurrences to be replaced; count must be a non-negative integer,
 171     and the default value of 0 means to replace all occurrences.
 172
 173     """
 174     if type(pattern) == type(''):
 175         pattern = _cachecompile(pattern)
 176     return pattern.sub(repl, string, count)
 177
 178 def subn(pattern, repl, string, count=0):
 179     """subn(pattern, repl, string[, count=0]) -> (string, num substitutions)
 180
 181     Perform the same operation as sub(), but return a tuple
 182     (new_string, number_of_subs_made).
 183
 184     """
 185     if type(pattern) == type(''):
 186         pattern = _cachecompile(pattern)
 187     return pattern.subn(repl, string, count)
 188
 189 def split(pattern, string, maxsplit=0):
 190     """split(pattern, string[, maxsplit=0]) -> list of strings
 191
 192     Split string by the occurrences of pattern. If capturing
 193     parentheses are used in pattern, then the text of all groups in
 194     the pattern are also returned as part of the resulting list. If
 195     maxsplit is nonzero, at most maxsplit splits occur, and the
 196     remainder of the string is returned as the final element of the
 197     list.
 198
 199     """
 200     if type(pattern) == type(''):
 201         pattern = _cachecompile(pattern)
 202     return pattern.split(string, maxsplit)
 203
 204 def findall(pattern, string):
 205     """findall(pattern, string) -> list
 206
 207     Return a list of all non-overlapping matches of pattern in
 208     string. If one or more groups are present in the pattern, return a
 209     list of groups; this will be a list of tuples if the pattern has
 210     more than one group. Empty matches are included in the result.
 211
 212     """
 213     if type(pattern) == type(''):
 214         pattern = _cachecompile(pattern)
 215     return pattern.findall(string)
 216
 217 def escape(pattern):
 218     """escape(string) -> string
 219
 220     Return string with all non-alphanumerics backslashed; this is
 221     useful if you want to match an arbitrary literal string that may
 222     have regular expression metacharacters in it.
 223
 224     """
 225     result = list(pattern)
 226     alphanum=string.letters+'_'+string.digits
 227     for i in range(len(pattern)):
 228         char = pattern[i]
 229         if char not in alphanum:
 230             if char=='\000': result[i] = '\\000'
 231             else: result[i] = '\\'+char
 232     return string.join(result, '')
 233
 234 def compile(pattern, flags=0):
 235     """compile(pattern[, flags]) -> RegexObject
 236
 237     Compile a regular expression pattern into a regular expression
 238     object, which can be used for matching using its match() and
 239     search() methods.
 240
 241     """
 242     groupindex={}
 243     code=pcre_compile(pattern, flags, groupindex)
 244     return RegexObject(pattern, flags, code, groupindex)
 245
 246
 247 #
 248 #   Class definitions
 249 #
 250
class RegexObject:
    """Holds a compiled regular expression pattern.

    Methods:
    match    Match the pattern to the beginning of a string.
    search   Search a string for the presence of the pattern.
    sub      Substitute occurrences of the pattern found in a string.
    subn     Same as sub, but also return the number of substitutions made.
    split    Split a string by the occurrences of the pattern.
    findall  Find all occurrences of the pattern in a string.

    Attributes:
    pattern     The pattern passed to the constructor.
    flags       The flags passed to the constructor.
    code        The compiled object; its match() method does the matching.
    groupindex  Mapping used by MatchObject to turn group names into
                group numbers.

    """

    def __init__(self, pattern, flags, code, groupindex):
        """Store the pattern, its flags, the compiled code and the
        group-name mapping; nothing is compiled here."""
        self.code = code
        self.flags = flags
        self.pattern = pattern
        self.groupindex = groupindex

    def search(self, string, pos=0, endpos=None):
        """search(string[, pos][, endpos]) -> MatchObject or None

        Scan through string looking for a location where this regular
        expression produces a match, and return a corresponding
        MatchObject instance. Return None if no position in the string
        matches the pattern; note that this is different from finding
        a zero-length match at some point in the string. The optional
        pos and endpos parameters have the same meaning as for the
        match() method.

        """
        # Clamp endpos to the length of the string, and never let it
        # fall before pos.
        if endpos is None or endpos>len(string):
            endpos=len(string)
        if endpos<pos: endpos=pos
        # Same call as in match(), but with 0 where match() passes
        # ANCHORED.
        regs = self.code.match(string, pos, endpos, 0)
        if regs is None:
            return None
        # Read back later by MatchObject.groups().
        self._num_regs=len(regs)

        return MatchObject(self,
                           string,
                           pos, endpos,
                           regs)

    def match(self, string, pos=0, endpos=None):
        """match(string[, pos][, endpos]) -> MatchObject or None

        If zero or more characters at the beginning of string match
        this regular expression, return a corresponding MatchObject
        instance. Return None if the string does not match the
        pattern; note that this is different from a zero-length match.

        Note: If you want to locate a match anywhere in string, use
        search() instead.

        The optional second parameter pos gives an index in the string
        where the search is to start; it defaults to 0.  This is not
        completely equivalent to slicing the string; the '^' pattern
        character matches at the real beginning of the string and at
        positions just after a newline, but not necessarily at the
        index where the search is to start.

        The optional parameter endpos limits how far the string will
        be searched; it will be as if the string is endpos characters
        long, so only the characters from pos to endpos will be
        searched for a match.

        """
        # Clamp endpos to the length of the string, and never let it
        # fall before pos.
        if endpos is None or endpos>len(string):
            endpos=len(string)
        if endpos<pos: endpos=pos
        regs = self.code.match(string, pos, endpos, ANCHORED)
        if regs is None:
            return None
        # Read back later by MatchObject.groups().
        self._num_regs=len(regs)
        return MatchObject(self,
                           string,
                           pos, endpos,
                           regs)

    def sub(self, repl, string, count=0):
        """sub(repl, string[, count=0]) -> string

        Return the string obtained by replacing the leftmost
        non-overlapping occurrences of the compiled pattern in string
        by the replacement repl. If the pattern isn't found, string is
        returned unchanged.

        Identical to the sub() function, using the compiled pattern.

        """
        # subn() does the work; only the new string is kept.
        return self.subn(repl, string, count)[0]

    def subn(self, repl, source, count=0):
        """subn(repl, string[, count=0]) -> tuple

        Perform the same operation as sub(), but return a tuple
        (new_string, number_of_subs_made).

        A count of 0 places no limit on the number of substitutions.
        Raises error if count is negative.

        """
        if count < 0:
            raise error, "negative substitution count"
        if count == 0:
            count = sys.maxint
        n = 0           # Number of matches
        pos = 0         # Where to start searching
        lastmatch = -1  # End of last match
        results = []    # Substrings making up the result
        end = len(source)

        # After this block, m is None when repl is a ready-made
        # string to append as is; otherwise m is a single MatchObject
        # that is updated for each match and handed to the callable
        # repl.
        if type(repl) is type(''):
            # See if repl contains group references
            try:
                # Expand once against _Dummy, whose 'group' is None.
                # NOTE(review): presumably this only succeeds when the
                # template needs no group lookups -- confirm against
                # pcre_expand.
                repl = pcre_expand(_Dummy, repl)
            except:
                # NOTE(review): the bare except treats any failure as
                # "needs a real match" and defers the expansion to
                # each match.
                m = MatchObject(self, source, 0, end, [])
                repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
            else:
                m = None
        else:
            m = MatchObject(self, source, 0, end, [])

        # Local aliases for the two calls made on every iteration.
        match = self.code.match
        append = results.append
        while n < count and pos <= end:
            regs = match(source, pos, end, 0)
            if not regs:
                break
            # Read back by MatchObject.groups().
            self._num_regs = len(regs)
            i, j = regs[0]
            if i == j == lastmatch:
                # Empty match adjacent to previous match
                # Not counted as a substitution: copy one character of
                # source through and retry from the next position.
                pos = pos + 1
                append(source[lastmatch:pos])
                continue
            if pos < i:
                # Text between the search start and the match is kept.
                append(source[pos:i])
            if m:
                m.pos = pos
                m.regs = regs
                append(repl(m))
            else:
                append(repl)
            pos = lastmatch = j
            if i == j:
                # Last match was empty; don't try here again
                pos = pos + 1
                append(source[lastmatch:pos])
            n = n + 1
        # Whatever follows the last search position is kept unchanged.
        append(source[pos:])
        return (string.join(results, ''), n)

    def split(self, source, maxsplit=0):
        """split(source[, maxsplit=0]) -> list of strings

        Split string by the occurrences of the compiled pattern. If
        capturing parentheses are used in the pattern, then the text
        of all groups in the pattern are also returned as part of the
        resulting list. If maxsplit is nonzero, at most maxsplit
        splits occur, and the remainder of the string is returned as
        the final element of the list.

        Raises error if maxsplit is negative.

        """
        if maxsplit < 0:
            raise error, "negative split count"
        if maxsplit == 0:
            maxsplit = sys.maxint
        n = 0            # Number of splits made
        pos = 0          # Where to start searching
        lastmatch = 0    # End of the last match that caused a split
        results = []
        end = len(source)
        # Local aliases for the two calls made on every iteration.
        match = self.code.match
        append = results.append
        while n < maxsplit:
            regs = match(source, pos, end, 0)
            if not regs:
                break
            i, j = regs[0]
            if i == j:
                # Empty match
                # An empty match never splits: stop at the end of the
                # string, otherwise retry one character further on.
                if pos >= end:
                    break
                pos = pos+1
                continue
            append(source[lastmatch:i])
            rest = regs[1:]
            if rest:
                # Add the text of every group; a -1 bound yields None
                # instead of a slice.
                for a, b in rest:
                    if a == -1 or b == -1:
                        group = None
                    else:
                        group = source[a:b]
                    append(group)
            pos = lastmatch = j
            n = n + 1
        # The remainder of the string is the final element.
        append(source[lastmatch:])
        return results

    def findall(self, source):
        """findall(source) -> list

        Return a list of all non-overlapping matches of the compiled
        pattern in string. If one or more groups are present in the
        pattern, return a list of groups; this will be a list of
        tuples if the pattern has more than one group. Empty matches
        are included in the result.

        """
        pos = 0
        end = len(source)
        results = []
        # Local aliases for the two calls made on every iteration.
        match = self.code.match
        append = results.append
        while pos <= end:
            regs = match(source, pos, end, 0)
            if not regs:
                break
            i, j = regs[0]
            rest = regs[1:]
            if not rest:
                # No groups: report the whole match.
                gr = source[i:j]
            elif len(rest) == 1:
                # One group: report its text as a plain string.
                a, b = rest[0]
                gr = source[a:b]
            else:
                # Several groups: report a tuple of their texts.
                gr = []
                for (a, b) in rest:
                    gr.append(source[a:b])
                gr = tuple(gr)
            append(gr)
            # Always move forward, even after an empty match.
            pos = max(j, pos+1)
        return results

    # The following 3 functions were contributed by Mike Fletcher, and
    # allow pickling and unpickling of RegexObject instances.
    # The compiled code is not part of the pickled state; it is
    # rebuilt by __setstate__.
    def __getinitargs__(self):
        return (None,None,None,None) # any 4 elements, to work around
                                     # problems with the
                                     # pickle/cPickle modules not yet
                                     # ignoring the __init__ function
    def __getstate__(self):
        # State is the tuple (pattern, flags, groupindex).
        return self.pattern, self.flags, self.groupindex
    def __setstate__(self, statetuple):
        self.pattern = statetuple[0]
        self.flags = statetuple[1]
        self.groupindex = statetuple[2]
        # Recompile: calls pcre_compile(pattern, flags, groupindex).
        self.code = apply(pcre_compile, statetuple)
 499
class _Dummy:
    # Stand-in for a match object, passed to pcre_expand() by
    # RegexObject.subn() when it probes a replacement string.
    # Has 'group' to avoid core dump.
    group = None
 503
 504 class MatchObject:
 505     """Holds a compiled regular expression pattern.
 506
 507     Methods:
 508     start      Return the index of the start of a matched substring.
 509     end        Return the index of the end of a matched substring.
 510     span       Return a tuple of (start, end) of a matched substring.
 511     groups     Return a tuple of all the subgroups of the match.
 512     group      Return one or more subgroups of the match.
 513     groupdict  Return a dictionary of all the named subgroups of the match.
 514
 515     """
 516
 517     def __init__(self, re, string, pos, endpos, regs):
 518         self.re = re
 519         self.string = string
 520         self.pos = pos
 521         self.endpos = endpos
 522         self.regs = regs
 523
 524     def start(self, g = 0):
 525         """start([group=0]) -> int or None
 526
 527         Return the index of the start of the substring matched by
 528         group; group defaults to zero (meaning the whole matched
 529         substring). Return None if group exists but did not contribute
 530         to the match.
 531
 532         """
 533         if type(g) == type(''):
 534             try:
 535                 g = self.re.groupindex[g]
 536             except (KeyError, TypeError):
 537                 raise IndexError, 'group %s is undefined' % `g`
 538         return self.regs[g][0]
 539
 540     def end(self, g = 0):
 541         """end([group=0]) -> int or None
 542
 543         Return the indices of the end of the substring matched by
 544         group; group defaults to zero (meaning the whole matched
 545         substring). Return None if group exists but did not contribute
 546         to the match.
 547
 548         """
 549         if type(g) == type(''):
 550             try:
 551                 g = self.re.groupindex[g]
 552             except (KeyError, TypeError):
 553                 raise IndexError, 'group %s is undefined' % `g`
 554         return self.regs[g][1]
 555
 556     def span(self, g = 0):
 557         """span([group=0]) -> tuple
 558
 559         Return the 2-tuple (m.start(group), m.end(group)). Note that
 560         if group did not contribute to the match, this is (None,
 561         None). Group defaults to zero (meaning the whole matched
 562         substring).
 563
 564         """
 565         if type(g) == type(''):
 566             try:
 567                 g = self.re.groupindex[g]
 568             except (KeyError, TypeError):
 569                 raise IndexError, 'group %s is undefined' % `g`
 570         return self.regs[g]
 571
 572     def groups(self, default=None):
 573         """groups([default=None]) -> tuple
 574
 575         Return a tuple containing all the subgroups of the match, from
 576         1 up to however many groups are in the pattern. The default
 577         argument is used for groups that did not participate in the
 578         match.
 579
 580         """
 581         result = []
 582         for g in range(1, self.re._num_regs):
 583             a, b = self.regs[g]
 584             if a == -1 or b == -1:
 585                 result.append(default)
 586             else:
 587                 result.append(self.string[a:b])
 588         return tuple(result)
 589
 590     def group(self, *groups):
 591         """group([group1, group2, ...]) -> string or tuple
 592
 593         Return one or more subgroups of the match. If there is a
 594         single argument, the result is a single string; if there are
 595         multiple arguments, the result is a tuple with one item per
 596         argument. Without arguments, group1 defaults to zero (i.e. the
 597         whole match is returned). If a groupN argument is zero, the
 598         corresponding return value is the entire matching string; if
 599         it is in the inclusive range [1..99], it is the string
 600         matching the the corresponding parenthesized group. If a group
 601         number is negative or larger than the number of groups defined
 602         in the pattern, an IndexError exception is raised. If a group
 603         is contained in a part of the pattern that did not match, the
 604         corresponding result is None. If a group is contained in a
 605         part of the pattern that matched multiple times, the last
 606         match is returned.
 607
 608         If the regular expression uses the (?P<name>...) syntax, the
 609         groupN arguments may also be strings identifying groups by
 610         their group name. If a string argument is not used as a group
 611         name in the pattern, an IndexError exception is raised.
 612
 613         """
 614         if len(groups) == 0:
 615             groups = (0,)
 616         result = []
 617         for g in groups:
 618             if type(g) == type(''):
 619                 try:
 620                     g = self.re.groupindex[g]
 621                 except (KeyError, TypeError):
 622                     raise IndexError, 'group %s is undefined' % `g`
 623             if g >= len(self.regs):
 624                 raise IndexError, 'group %s is undefined' % `g`
 625             a, b = self.regs[g]
 626             if a == -1 or b == -1:
 627                 result.append(None)
 628             else:
 629                 result.append(self.string[a:b])
 630         if len(result) > 1:
 631             return tuple(result)
 632         elif len(result) == 1:
 633             return result[0]
 634         else:
 635             return ()
 636
 637     def groupdict(self, default=None):
 638         """groupdict([default=None]) -> dictionary
 639
 640         Return a dictionary containing all the named subgroups of the
 641         match, keyed by the subgroup name. The default argument is
 642         used for groups that did not participate in the match.
 643
 644         """
 645         dict = {}
 646         for name, index in self.re.groupindex.items():
 647             a, b = self.regs[index]
 648             if a == -1 or b == -1:
 649                 dict[name] = default
 650             else:
 651                 dict[name] = self.string[a:b]
 652         return dict