Lib/regsub.py

   1 """Regexp-based split and replace using the obsolete regex module.
   2
   3 This module is only for backward compatibility.  These operations
   4 are now provided by the new regular expression module, "re".
   5
   6 sub(pat, repl, str):        replace first occurrence of pattern in string
   7 gsub(pat, repl, str):       replace all occurrences of pattern in string
   8 split(str, pat, maxsplit):  split string using pattern as delimiter
   9 splitx(str, pat, maxsplit): split string using pattern as delimiter plus
  10                             return delimiters
  11 """
  12
  13 import regex
  14
  15
  16 # Replace first occurrence of pattern pat in string str by replacement
  17 # repl.  If the pattern isn't found, the string is returned unchanged.
  18 # The replacement may contain references \digit to subpatterns and
  19 # escaped backslashes.  The pattern may be a string or an already
  20 # compiled pattern.
  21
  22 def sub(pat, repl, str):
  23         prog = compile(pat)
  24         if prog.search(str) >= 0:
  25                 regs = prog.regs
  26                 a, b = regs[0]
  27                 str = str[:a] + expand(repl, regs, str) + str[b:]
  28         return str
  29
  30
  31 # Replace all (non-overlapping) occurrences of pattern pat in string
  32 # str by replacement repl.  The same rules as for sub() apply.
  33 # Empty matches for the pattern are replaced only when not adjacent to
  34 # a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
  35
  36 def gsub(pat, repl, str):
  37         prog = compile(pat)
  38         new = ''
  39         start = 0
  40         first = 1
  41         while prog.search(str, start) >= 0:
  42                 regs = prog.regs
  43                 a, b = regs[0]
  44                 if a == b == start and not first:
  45                         if start >= len(str) or prog.search(str, start+1) < 0:
  46                                 break
  47                         regs = prog.regs
  48                         a, b = regs[0]
  49                 new = new + str[start:a] + expand(repl, regs, str)
  50                 start = b
  51                 first = 0
  52         new = new + str[start:]
  53         return new
  54
  55
  56 # Split string str in fields separated by delimiters matching pattern
  57 # pat.  Only non-empty matches for the pattern are considered, so e.g.
  58 # split('abc', '') returns ['abc'].
  59 # The optional 3rd argument sets the number of splits that are performed.
  60
  61 def split(str, pat, maxsplit = 0):
  62         return intsplit(str, pat, maxsplit, 0)
  63
  64 # Split string str in fields separated by delimiters matching pattern
  65 # pat.  Only non-empty matches for the pattern are considered, so e.g.
  66 # split('abc', '') returns ['abc']. The delimiters are also included
  67 # in the list.
  68 # The optional 3rd argument sets the number of splits that are performed.
  69
  70
  71 def splitx(str, pat, maxsplit = 0):
  72         return intsplit(str, pat, maxsplit, 1)
  73
  74 # Internal function used to implement split() and splitx().
  75
  76 def intsplit(str, pat, maxsplit, retain):
  77         prog = compile(pat)
  78         res = []
  79         start = next = 0
  80         splitcount = 0
  81         while prog.search(str, next) >= 0:
  82                 regs = prog.regs
  83                 a, b = regs[0]
  84                 if a == b:
  85                         next = next + 1
  86                         if next >= len(str):
  87                                 break
  88                 else:
  89                         res.append(str[start:a])
  90                         if retain:
  91                                 res.append(str[a:b])
  92                         start = next = b
  93                         splitcount = splitcount + 1
  94                         if (maxsplit and (splitcount >= maxsplit)):
  95                             break
  96         res.append(str[start:])
  97         return res
  98
  99
 100 # Capitalize words split using a pattern
 101
 102 def capwords(str, pat='[^a-zA-Z0-9_]+'):
 103         import string
 104         words = splitx(str, pat)
 105         for i in range(0, len(words), 2):
 106                 words[i] = string.capitalize(words[i])
 107         return string.joinfields(words, "")
 108
 109
 110 # Internal subroutines:
 111 # compile(pat): compile a pattern, caching already compiled patterns
 112 # expand(repl, regs, str): expand \digit escapes in replacement string
 113
 114
 115 # Manage a cache of compiled regular expressions.
 116 #
 117 # If the pattern is a string a compiled version of it is returned.  If
 118 # the pattern has been used before we return an already compiled
 119 # version from the cache; otherwise we compile it now and save the
 120 # compiled version in the cache, along with the syntax it was compiled
 121 # with.  Instead of a string, a compiled regular expression can also
 122 # be passed.
 123
 124 cache = {}
 125
 126 def compile(pat):
 127         if type(pat) <> type(''):
 128                 return pat              # Assume it is a compiled regex
 129         key = (pat, regex.get_syntax())
 130         if cache.has_key(key):
 131                 prog = cache[key]       # Get it from the cache
 132         else:
 133                 prog = cache[key] = regex.compile(pat)
 134         return prog
 135
 136
 137 def clear_cache():
 138         global cache
 139         cache = {}
 140
 141
 142 # Expand \digit in the replacement.
 143 # Each occurrence of \digit is replaced by the substring of str
 144 # indicated by regs[digit].  To include a literal \ in the
 145 # replacement, double it; other \ escapes are left unchanged (i.e.
 146 # the \ and the following character are both copied).
 147
 148 def expand(repl, regs, str):
 149         if '\\' not in repl:
 150                 return repl
 151         new = ''
 152         i = 0
 153         ord0 = ord('0')
 154         while i < len(repl):
 155                 c = repl[i]; i = i+1
 156                 if c <> '\\' or i >= len(repl):
 157                         new = new + c
 158                 else:
 159                         c = repl[i]; i = i+1
 160                         if '0' <= c <= '9':
 161                                 a, b = regs[ord(c)-ord0]
 162                                 new = new + str[a:b]
 163                         elif c == '\\':
 164                                 new = new + c
 165                         else:
 166                                 new = new + '\\' + c
 167         return new
 168
 169
 170 # Test program, reads sequences "pat repl str" from stdin.
 171 # Optional argument specifies pattern used to split lines.
 172
 173 def test():
 174         import sys
 175         if sys.argv[1:]:
 176                 delpat = sys.argv[1]
 177         else:
 178                 delpat = '[ \t\n]+'
 179         while 1:
 180                 if sys.stdin.isatty(): sys.stderr.write('--> ')
 181                 line = sys.stdin.readline()
 182                 if not line: break
 183                 if line[-1] == '\n': line = line[:-1]
 184                 fields = split(line, delpat)
 185                 if len(fields) <> 3:
 186                         print 'Sorry, not three fields'
 187                         print 'split:', `fields`
 188                         continue
 189                 [pat, repl, str] = split(line, delpat)
 190                 print 'sub :', `sub(pat, repl, str)`
 191                 print 'gsub:', `gsub(pat, repl, str)`