Lib/regsub.py

   1 # Regular expression subroutines:
   2 # sub(pat, repl, str): replace first occurrence of pattern in string
   3 # gsub(pat, repl, str): replace all occurrences of pattern in string
   4 # split(str, pat, maxsplit): split string using pattern as delimiter
   5 # splitx(str, pat, maxsplit): split string using pattern as delimiter plus
   6 #                             return delimiters
   7
   8
   9 import regex
  10
  11
  12 # Replace first occurrence of pattern pat in string str by replacement
  13 # repl.  If the pattern isn't found, the string is returned unchanged.
  14 # The replacement may contain references \digit to subpatterns and
  15 # escaped backslashes.  The pattern may be a string or an already
  16 # compiled pattern.
  17
  18 def sub(pat, repl, str):
  19         prog = compile(pat)
  20         if prog.search(str) >= 0:
  21                 regs = prog.regs
  22                 a, b = regs[0]
  23                 str = str[:a] + expand(repl, regs, str) + str[b:]
  24         return str
  25
  26
  27 # Replace all (non-overlapping) occurrences of pattern pat in string
  28 # str by replacement repl.  The same rules as for sub() apply.
  29 # Empty matches for the pattern are replaced only when not adjacent to
  30 # a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
  31
  32 def gsub(pat, repl, str):
  33         prog = compile(pat)
  34         new = ''
  35         start = 0
  36         first = 1
  37         while prog.search(str, start) >= 0:
  38                 regs = prog.regs
  39                 a, b = regs[0]
  40                 if a == b == start and not first:
  41                         if start >= len(str) or prog.search(str, start+1) < 0:
  42                                 break
  43                         regs = prog.regs
  44                         a, b = regs[0]
  45                 new = new + str[start:a] + expand(repl, regs, str)
  46                 start = b
  47                 first = 0
  48         new = new + str[start:]
  49         return new
  50
  51
  52 # Split string str in fields separated by delimiters matching pattern
  53 # pat.  Only non-empty matches for the pattern are considered, so e.g.
  54 # split('abc', '') returns ['abc'].
  55 # The optional 3rd argument limits the number of splits that are performed; 0 (the default) means no limit.
  56
  57 def split(str, pat, maxsplit = 0):
  58         return intsplit(str, pat, maxsplit, 0)
  59
  60 # Split string str in fields separated by delimiters matching pattern
  61 # pat.  Only non-empty matches for the pattern are considered, so e.g.
  62 # split('abc', '') returns ['abc']. The delimiters are also included
  63 # in the list.
  64 # The optional 3rd argument limits the number of splits that are performed; 0 (the default) means no limit.
  65
  66
  67 def splitx(str, pat, maxsplit = 0):
  68         return intsplit(str, pat, maxsplit, 1)
  69
  70 # Internal function used to implement split() and splitx().
  71
  72 def intsplit(str, pat, maxsplit, retain):
  73         prog = compile(pat)
  74         res = []
  75         start = next = 0
  76         splitcount = 0
  77         while prog.search(str, next) >= 0:
  78                 regs = prog.regs
  79                 a, b = regs[0]
  80                 if a == b:
  81                         next = next + 1
  82                         if next >= len(str):
  83                                 break
  84                 else:
  85                         res.append(str[start:a])
  86                         if retain:
  87                                 res.append(str[a:b])
  88                         start = next = b
  89                         splitcount = splitcount + 1
  90                         if (maxsplit and (splitcount >= maxsplit)):
  91                             break
  92         res.append(str[start:])
  93         return res
  94
  95
  96 # Capitalize words split using a pattern
  97
  98 def capwords(str, pat='[^a-zA-Z0-9_]+'):
  99         import string
 100         words = splitx(str, pat)
 101         for i in range(0, len(words), 2):
 102                 words[i] = string.capitalize(words[i])
 103         return string.joinfields(words, "")
 104
 105
 106 # Internal subroutines:
 107 # compile(pat): compile a pattern, caching already compiled patterns
 108 # expand(repl, regs, str): expand \digit escapes in replacement string
 109
 110
 111 # Manage a cache of compiled regular expressions.
 112 #
 113 # If the pattern is a string a compiled version of it is returned.  If
 114 # the pattern has been used before we return an already compiled
 115 # version from the cache; otherwise we compile it now and save the
 116 # compiled version in the cache, along with the syntax it was compiled
 117 # with.  Instead of a string, a compiled regular expression can also
 118 # be passed.
 119
 120 cache = {}
 121
 122 def compile(pat):
 123         if type(pat) <> type(''):
 124                 return pat              # Assume it is a compiled regex
 125         key = (pat, regex.get_syntax())
 126         if cache.has_key(key):
 127                 prog = cache[key]       # Get it from the cache
 128         else:
 129                 prog = cache[key] = regex.compile(pat)
 130         return prog
 131
 132
 133 def clear_cache():
 134         global cache
 135         cache = {}
 136
 137
 138 # Expand \digit in the replacement.
 139 # Each occurrence of \digit is replaced by the substring of str
 140 # indicated by regs[digit].  To include a literal \ in the
 141 # replacement, double it; other \ escapes are left unchanged (i.e.
 142 # the \ and the following character are both copied).
 143
 144 def expand(repl, regs, str):
 145         if '\\' not in repl:
 146                 return repl
 147         new = ''
 148         i = 0
 149         ord0 = ord('0')
 150         while i < len(repl):
 151                 c = repl[i]; i = i+1
 152                 if c <> '\\' or i >= len(repl):
 153                         new = new + c
 154                 else:
 155                         c = repl[i]; i = i+1
 156                         if '0' <= c <= '9':
 157                                 a, b = regs[ord(c)-ord0]
 158                                 new = new + str[a:b]
 159                         elif c == '\\':
 160                                 new = new + c
 161                         else:
 162                                 new = new + '\\' + c
 163         return new
 164
 165
 166 # Test program, reads sequences "pat repl str" from stdin.
 167 # Optional argument specifies pattern used to split lines.
 168
 169 def test():
 170         import sys
 171         if sys.argv[1:]:
 172                 delpat = sys.argv[1]
 173         else:
 174                 delpat = '[ \t\n]+'
 175         while 1:
 176                 if sys.stdin.isatty(): sys.stderr.write('--> ')
 177                 line = sys.stdin.readline()
 178                 if not line: break
 179                 if line[-1] == '\n': line = line[:-1]
 180                 fields = split(line, delpat)
 181                 if len(fields) <> 3:
 182                         print 'Sorry, not three fields'
 183                         print 'split:', `fields`
 184                         continue
 185                 [pat, repl, str] = split(line, delpat)
 186                 print 'sub :', `sub(pat, repl, str)`
 187                 print 'gsub:', `gsub(pat, repl, str)`