Lib/re.py

   1 import sys
   2 import string
   3 from pcre import *
   4
   5 #
   6 # First, the public part of the interface:
   7 #
   8
   9 # pcre.error and re.error should be the same, since exceptions can be
  10 # raised from either module.
  11
# Compilation flags: single-letter aliases for the flag constants that
# are brought into this namespace by "from pcre import *".

I = IGNORECASE
L = LOCALE
M = MULTILINE
S = DOTALL
X = VERBOSE

#
# Cache of compiled patterns, used by the module-level functions.
#

# Maps (pattern, flags) to the compiled object; see _cachecompile().
_cache = {}
# When the cache already holds this many entries it is emptied before
# the next entry is stored.
_MAXCACHE = 20
  26
  27 def _cachecompile(pattern, flags=0):
  28     key = (pattern, flags)
  29     try:
  30         return _cache[key]
  31     except KeyError:
  32         pass
  33     value = compile(pattern, flags)
  34     if len(_cache) >= _MAXCACHE:
  35         _cache.clear()
  36     _cache[key] = value
  37     return value
  38
  39 def match(pattern, string, flags=0):
  40     return _cachecompile(pattern, flags).match(string)
  41
  42 def search(pattern, string, flags=0):
  43     return _cachecompile(pattern, flags).search(string)
  44
  45 def sub(pattern, repl, string, count=0):
  46     if type(pattern) == type(''):
  47         pattern = _cachecompile(pattern)
  48     return pattern.sub(repl, string, count)
  49
  50 def subn(pattern, repl, string, count=0):
  51     if type(pattern) == type(''):
  52         pattern = _cachecompile(pattern)
  53     return pattern.subn(repl, string, count)
  54
  55 def split(pattern, string, maxsplit=0):
  56     if type(pattern) == type(''):
  57         pattern = _cachecompile(pattern)
  58     return pattern.split(string, maxsplit)
  59
  60 def findall(pattern, string):
  61     if type(pattern) == type(''):
  62         pattern = _cachecompile(pattern)
  63     return pattern.findall(string)
  64
  65 def escape(pattern):
  66     "Escape all non-alphanumeric characters in pattern."
  67     result = list(pattern)
  68     alphanum=string.letters+'_'+string.digits
  69     for i in range(len(pattern)):
  70         char = pattern[i]
  71         if char not in alphanum:
  72             if char=='\000': result[i] = '\\000'
  73             else: result[i] = '\\'+char
  74     return string.join(result, '')
  75
  76 def compile(pattern, flags=0):
  77     "Compile a regular expression pattern, returning a RegexObject."
  78     groupindex={}
  79     code=pcre_compile(pattern, flags, groupindex)
  80     return RegexObject(pattern, flags, code, groupindex)
  81
  82
  83 #
  84 #   Class definitions
  85 #
  86
  87 class RegexObject:
  88
  89     def __init__(self, pattern, flags, code, groupindex):
  90         self.code = code
  91         self.flags = flags
  92         self.pattern = pattern
  93         self.groupindex = groupindex
  94
  95     def search(self, string, pos=0, endpos=None):
  96         """Scan through string looking for a match to the pattern, returning
  97         a MatchObject instance, or None if no match was found."""
  98
  99         if endpos is None or endpos>len(string):
 100             endpos=len(string)
 101         if endpos<pos: endpos=pos
 102         regs = self.code.match(string, pos, endpos, 0)
 103         if regs is None:
 104             return None
 105         self._num_regs=len(regs)
 106
 107         return MatchObject(self,
 108                            string,
 109                            pos, endpos,
 110                            regs)
 111
 112     def match(self, string, pos=0, endpos=None):
 113         """Try to apply the pattern at the start of the string, returning
 114         a MatchObject instance, or None if no match was found."""
 115
 116         if endpos is None or endpos>len(string):
 117             endpos=len(string)
 118         if endpos<pos: endpos=pos
 119         regs = self.code.match(string, pos, endpos, ANCHORED)
 120         if regs is None:
 121             return None
 122         self._num_regs=len(regs)
 123         return MatchObject(self,
 124                            string,
 125                            pos, endpos,
 126                            regs)
 127
 128     def sub(self, repl, string, count=0):
 129         """Return the string obtained by replacing the leftmost
 130         non-overlapping occurrences of the pattern in string by the
 131         replacement repl"""
 132
 133         return self.subn(repl, string, count)[0]
 134
 135     def subn(self, repl, source, count=0):
 136         """Return a 2-tuple containing (new_string, number).
 137         new_string is the string obtained by replacing the leftmost
 138         non-overlapping occurrences of the pattern in the source
 139         string by the replacement repl.  number is the number of
 140         substitutions that were made."""
 141
 142         if count < 0:
 143             raise error, "negative substitution count"
 144         if count == 0:
 145             count = sys.maxint
 146         n = 0           # Number of matches
 147         pos = 0         # Where to start searching
 148         lastmatch = -1  # End of last match
 149         results = []    # Substrings making up the result
 150         end = len(source)
 151
 152         if type(repl) is type(''):
 153             # See if repl contains group references
 154             try:
 155                 repl = pcre_expand(_Dummy, repl)
 156             except:
 157                 m = MatchObject(self, source, 0, end, [])
 158                 repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
 159             else:
 160                 m = None
 161         else:
 162             m = MatchObject(self, source, 0, end, [])
 163
 164         match = self.code.match
 165         append = results.append
 166         while n < count and pos <= end:
 167             regs = match(source, pos, end, 0)
 168             if not regs:
 169                 break
 170             self._num_regs = len(regs)
 171             i, j = regs[0]
 172             if i == j == lastmatch:
 173                 # Empty match adjacent to previous match
 174                 pos = pos + 1
 175                 append(source[lastmatch:pos])
 176                 continue
 177             if pos < i:
 178                 append(source[pos:i])
 179             if m:
 180                 m.pos = pos
 181                 m.regs = regs
 182                 append(repl(m))
 183             else:
 184                 append(repl)
 185             pos = lastmatch = j
 186             if i == j:
 187                 # Last match was empty; don't try here again
 188                 pos = pos + 1
 189                 append(source[lastmatch:pos])
 190             n = n + 1
 191         append(source[pos:])
 192         return (string.join(results, ''), n)
 193
 194     def split(self, source, maxsplit=0):
 195         """Split the source string by the occurrences of the pattern,
 196         returning a list containing the resulting substrings."""
 197
 198         if maxsplit < 0:
 199             raise error, "negative split count"
 200         if maxsplit == 0:
 201             maxsplit = sys.maxint
 202         n = 0
 203         pos = 0
 204         lastmatch = 0
 205         results = []
 206         end = len(source)
 207         match = self.code.match
 208         append = results.append
 209         while n < maxsplit:
 210             regs = match(source, pos, end, 0)
 211             if not regs:
 212                 break
 213             i, j = regs[0]
 214             if i == j:
 215                 # Empty match
 216                 if pos >= end:
 217                     break
 218                 pos = pos+1
 219                 continue
 220             append(source[lastmatch:i])
 221             rest = regs[1:]
 222             if rest:
 223                 for a, b in rest:
 224                     if a == -1 or b == -1:
 225                         group = None
 226                     else:
 227                         group = source[a:b]
 228                     append(group)
 229             pos = lastmatch = j
 230             n = n + 1
 231         append(source[lastmatch:])
 232         return results
 233
 234     def findall(self, source):
 235         """Return a list of all non-overlapping matches in the string.
 236
 237         If one or more groups are present in the pattern, return a
 238         list of groups; this will be a list of tuples if the pattern
 239         has more than one group.
 240
 241         Empty matches are included in the result.
 242
 243         """
 244         pos = 0
 245         end = len(source)
 246         results = []
 247         match = self.code.match
 248         append = results.append
 249         while pos <= end:
 250             regs = match(source, pos, end, 0)
 251             if not regs:
 252                 break
 253             i, j = regs[0]
 254             rest = regs[1:]
 255             if not rest:
 256                 gr = source[i:j]
 257             elif len(rest) == 1:
 258                 a, b = rest[0]
 259                 gr = source[a:b]
 260             else:
 261                 gr = []
 262                 for (a, b) in rest:
 263                     gr.append(source[a:b])
 264                 gr = tuple(gr)
 265             append(gr)
 266             pos = max(j, pos+1)
 267         return results
 268
 269     # The following 3 functions were contributed by Mike Fletcher, and
 270     # allow pickling and unpickling of RegexObject instances.
 271     def __getinitargs__(self):
 272         return (None,None,None,None) # any 4 elements, to work around
 273                                      # problems with the
 274                                      # pickle/cPickle modules not yet
 275                                      # ignoring the __init__ function
 276     def __getstate__(self):
 277         return self.pattern, self.flags, self.groupindex
 278     def __setstate__(self, statetuple):
 279         self.pattern = statetuple[0]
 280         self.flags = statetuple[1]
 281         self.groupindex = statetuple[2]
 282         self.code = apply(pcre_compile, statetuple)
 283
class _Dummy:
    # Stand-in for a match object.  RegexObject.subn() passes this class
    # to pcre_expand() when probing a replacement template.  Has 'group'
    # (set to None) to avoid core dump.
    group = None
 287
 288 class MatchObject:
 289
 290     def __init__(self, re, string, pos, endpos, regs):
 291         self.re = re
 292         self.string = string
 293         self.pos = pos
 294         self.endpos = endpos
 295         self.regs = regs
 296
 297     def start(self, g = 0):
 298         "Return the start of the substring matched by group g"
 299         if type(g) == type(''):
 300             try:
 301                 g = self.re.groupindex[g]
 302             except (KeyError, TypeError):
 303                 raise IndexError, 'group %s is undefined' % `g`
 304         return self.regs[g][0]
 305
 306     def end(self, g = 0):
 307         "Return the end of the substring matched by group g"
 308         if type(g) == type(''):
 309             try:
 310                 g = self.re.groupindex[g]
 311             except (KeyError, TypeError):
 312                 raise IndexError, 'group %s is undefined' % `g`
 313         return self.regs[g][1]
 314
 315     def span(self, g = 0):
 316         "Return (start, end) of the substring matched by group g"
 317         if type(g) == type(''):
 318             try:
 319                 g = self.re.groupindex[g]
 320             except (KeyError, TypeError):
 321                 raise IndexError, 'group %s is undefined' % `g`
 322         return self.regs[g]
 323
 324     def groups(self, default=None):
 325         "Return a tuple containing all subgroups of the match object"
 326         result = []
 327         for g in range(1, self.re._num_regs):
 328             a, b = self.regs[g]
 329             if a == -1 or b == -1:
 330                 result.append(default)
 331             else:
 332                 result.append(self.string[a:b])
 333         return tuple(result)
 334
 335     def group(self, *groups):
 336         "Return one or more groups of the match"
 337         if len(groups) == 0:
 338             groups = (0,)
 339         result = []
 340         for g in groups:
 341             if type(g) == type(''):
 342                 try:
 343                     g = self.re.groupindex[g]
 344                 except (KeyError, TypeError):
 345                     raise IndexError, 'group %s is undefined' % `g`
 346             if g >= len(self.regs):
 347                 raise IndexError, 'group %s is undefined' % `g`
 348             a, b = self.regs[g]
 349             if a == -1 or b == -1:
 350                 result.append(None)
 351             else:
 352                 result.append(self.string[a:b])
 353         if len(result) > 1:
 354             return tuple(result)
 355         elif len(result) == 1:
 356             return result[0]
 357         else:
 358             return ()
 359
 360     def groupdict(self, default=None):
 361         "Return a dictionary containing all named subgroups of the match"
 362         dict = {}
 363         for name, index in self.re.groupindex.items():
 364             a, b = self.regs[index]
 365             if a == -1 or b == -1:
 366                 dict[name] = default
 367             else:
 368                 dict[name] = self.string[a:b]
 369         return dict