At the release of 1.0.1.
[python/dscho.git] / Lib / regsub.py
blob7eb175b60ec5ca25a39d44a41fc3a1d93ce2f079
1 # Regular expression subroutines:
2 # sub(pat, repl, str): replace first occurrence of pattern in string
3 # gsub(pat, repl, str): replace all occurrences of pattern in string
4 # split(str, pat): split string using pattern as delimiter
7 import regex
10 # Replace first occurrence of pattern pat in string str by replacement
11 # repl. If the pattern isn't found, the string is returned unchanged.
12 # The replacement may contain references \digit to subpatterns and
13 # escaped backslashes. The pattern may be a string or an already
14 # compiled pattern.
def sub(pat, repl, str):
    # Replace the first match of pat in str by the expansion of repl.
    #   pat  -- pattern string, or an already compiled pattern object
    #   repl -- replacement text; \digit references are handled by expand()
    #   str  -- the string to search
    # Returns the new string, or str itself when nothing matches.
    matcher = compile(pat)
    # search() yields a negative value when there is no match.
    if matcher.search(str) < 0:
        return str
    regs = matcher.regs
    # regs[0] holds the (start, end) offsets of the whole match.
    begin, end = regs[0]
    return str[:begin] + expand(repl, regs, str) + str[end:]
25 # Replace all (non-overlapping) occurrences of pattern pat in string
26 # str by replacement repl. The same rules as for sub() apply.
27 # Empty matches for the pattern are replaced only when not adjacent to
28 # a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
def gsub(pat, repl, str):
    # Replace every non-overlapping match of pat in str by the expansion
    # of repl and return the resulting string.
    #   pat  -- pattern string, or an already compiled pattern object
    #   repl -- replacement text; \digit references are handled by expand()
    #   str  -- the string to search
    matcher = compile(pat)
    result = ''
    pos = 0             # offset where the next search begins
    seen_match = 0      # becomes 1 once a first replacement was made
    while 1:
        if matcher.search(str, pos) < 0:
            break
        lo, hi = matcher.regs[0]
        if seen_match and lo == hi and hi == pos:
            # An empty match sitting right where the previous match
            # ended is not replaced; retry one character further on,
            # or stop when the end of the string has been reached.
            if pos >= len(str):
                break
            if matcher.search(str, pos + 1) < 0:
                break
            lo, hi = matcher.regs[0]
        # Copy the unmatched text, then the expanded replacement.
        result = result + str[pos:lo] + expand(repl, matcher.regs, str)
        pos = hi
        seen_match = 1
    # Whatever follows the last match is copied unchanged.
    return result + str[pos:]
50 # Split string str in fields separated by delimiters matching pattern
51 # pat. Only non-empty matches for the pattern are considered, so e.g.
52 # split('abc', '') returns ['abc'].
def split(str, pat):
    # Split str into the list of fields delimited by matches of pat.
    #   str -- the string to split
    #   pat -- pattern string, or an already compiled pattern object
    # Empty matches never act as delimiters, so the result always holds
    # at least one field (the whole string when nothing matches).
    matcher = compile(pat)
    fields = []
    field_start = 0     # offset where the current field begins
    scan = 0            # offset where the next search begins
    while matcher.search(str, scan) >= 0:
        lo, hi = matcher.regs[0]
        if lo != hi:
            # Real delimiter: close the current field and resume
            # both the field and the scan right after the delimiter.
            fields.append(str[field_start:lo])
            field_start = hi
            scan = hi
            continue
        # Empty match: ignore it and move the scan one character on.
        scan = scan + 1
        if scan >= len(str):
            break
    # The remainder after the last delimiter is the final field.
    fields.append(str[field_start:])
    return fields
72 # Internal subroutines:
73 # compile(pat): compile a pattern, caching already compiled patterns
74 # expand(repl, regs, str): expand \digit escapes in replacement string
77 # Manage a cache of compiled regular expressions.
78 # If the pattern is a string a compiled version of it is returned.
79 # If the pattern has been used before we return an already compiled
80 # version from the cache; otherwise we compile it now and save the
81 # compiled version in the cache.
82 # Instead of a string, a compiled regular expression can also be
83 # passed.
84 # WARNING: if the pattern syntax is changed, the cache should be
85 # flushed!
# Maps pattern strings to their compiled form; filled in by compile().
cache = {}

def compile(pat):
    # Return a compiled pattern for pat.
    #   pat -- a pattern string, or anything else, which is assumed to
    #          be an already compiled pattern and is returned unchanged
    # Strings are compiled with regex.compile() at most once; later
    # calls with the same string get the object stored in the cache.
    # An error raised by regex.compile() propagates to the caller and
    # nothing is cached for that pattern.
    if type(pat) != type(''):
        return pat      # Assume it is a compiled regex
    try:
        return cache[pat]       # Seen before: reuse the compiled form
    except KeyError:
        pass
    prog = regex.compile(pat)
    cache[pat] = prog
    return prog
99 # Expand \digit in the replacement.
100 # Each occurrence of \digit is replaced by the substring of str
101 # indicated by regs[digit]. To include a literal \ in the
102 # replacement, double it; other \ escapes are left unchanged (i.e.
103 # the \ and the following character are both copied).
def expand(repl, regs, str):
    # Expand the backslash escapes in the replacement string repl.
    #   repl -- replacement text
    #   regs -- sequence of (start, end) offset pairs into str, indexed
    #           by group number (entry 0 is the whole match)
    #   str  -- the string the offsets in regs refer to
    # Returns repl with each \digit replaced by str[start:end] of
    # regs[digit] and each \\ replaced by a single backslash.  Any
    # other escape, and a lone backslash at the very end, is copied
    # unchanged.
    if '\\' not in repl:
        return repl     # Nothing to expand
    result = ''
    pos = 0
    end = len(repl)
    while pos < end:
        ch = repl[pos]
        pos = pos + 1
        if ch != '\\' or pos >= end:
            # Ordinary character, or a backslash with nothing after it.
            result = result + ch
            continue
        esc = repl[pos]
        pos = pos + 1
        if '0' <= esc <= '9':
            # Turn the digit character into its number arithmetically
            # instead of running it through eval().
            lo, hi = regs[ord(esc) - ord('0')]
            result = result + str[lo:hi]
        elif esc == '\\':
            result = result + esc
        else:
            result = result + '\\' + esc
    return result
126 # Test program, reads sequences "pat repl str" from stdin.
127 # Optional argument specifies pattern used to split lines.
def test():
    # Interactive test driver.  Reads lines from stdin until end of
    # file, splits each one into the three fields "pat repl str" and
    # prints the results of sub() and gsub() for them.  The first
    # command line argument, when present, is the pattern used to
    # split the input lines; the default is runs of blanks, tabs and
    # newlines.  A '--> ' prompt is written to stderr when stdin is a
    # terminal.
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    # Output goes through sys.stdout.write() so that the text is the
    # same whether print is a statement or a function.
    write = sys.stdout.write
    while 1:
        if sys.stdin.isatty(): sys.stderr.write('--> ')
        line = sys.stdin.readline()
        if not line: break
        if line[-1] == '\n': line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            write('Sorry, not three fields\n')
            write('split: ' + repr(fields) + '\n')
            continue
        # Reuse the fields computed above instead of splitting again.
        [pat, repl, text] = fields
        write('sub : ' + repr(sub(pat, repl, text)) + '\n')
        write('gsub: ' + repr(gsub(pat, repl, text)) + '\n')