Added 'list_only' option (and modified 'run()' to respect it).
[python/dscho.git] / Lib / regsub.py
blob8fb33065ed2269148227483fc0dadfd30b43b02b
1 # Regular expression subroutines:
2 # sub(pat, repl, str): replace first occurrence of pattern in string
3 # gsub(pat, repl, str): replace all occurrences of pattern in string
4 # split(str, pat, maxsplit): split string using pattern as delimiter
5 # splitx(str, pat, maxsplit): split string using pattern as delimiter plus
6 # return delimiters
9 import regex
12 # Replace first occurrence of pattern pat in string str by replacement
13 # repl. If the pattern isn't found, the string is returned unchanged.
14 # The replacement may contain references \digit to subpatterns and
15 # escaped backslashes. The pattern may be a string or an already
16 # compiled pattern.
def sub(pat, repl, str):
    """Replace the first match of pat in str by the expansion of repl.

    pat may be a pattern string or an already compiled pattern object
    (it is passed through compile()).  repl may contain \\digit
    references and doubled backslashes, which expand() resolves against
    the match registers.  When the pattern does not match, str is
    returned unchanged.
    """
    matcher = compile(pat)
    if matcher.search(str) < 0:
        # search() reports "no match" with a negative value.
        return str
    groups = matcher.regs
    # groups[0] is the (start, end) span of the whole match.
    match_start, match_end = groups[0]
    return str[:match_start] + expand(repl, groups, str) + str[match_end:]
27 # Replace all (non-overlapping) occurrences of pattern pat in string
28 # str by replacement repl. The same rules as for sub() apply.
29 # Empty matches for the pattern are replaced only when not adjacent to
30 # a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
def gsub(pat, repl, str):
    """Replace every non-overlapping match of pat in str by repl.

    pat and repl follow the same rules as for sub().  An empty match is
    only replaced when it is not adjacent to the previous match, so
    gsub('', '-', 'abc') yields '-a-b-c-'.
    """
    matcher = compile(pat)
    result = ''
    pos = 0              # where the next search starts / unconsumed text begins
    seen_match = 0       # becomes true once the first replacement is done
    while matcher.search(str, pos) >= 0:
        groups = matcher.regs
        lo, hi = groups[0]
        if seen_match and lo == hi == pos:
            # Empty match right where the previous match ended: skip it
            # and look for the next match one character further on.
            if pos >= len(str):
                break
            if matcher.search(str, pos + 1) < 0:
                break
            groups = matcher.regs
            lo, hi = groups[0]
        # Copy the text between matches, then the expanded replacement.
        result = result + str[pos:lo] + expand(repl, groups, str)
        pos = hi
        seen_match = 1
    # Whatever follows the last match is copied verbatim.
    return result + str[pos:]
52 # Split string str in fields separated by delimiters matching pattern
53 # pat. Only non-empty matches for the pattern are considered, so e.g.
54 # split('abc', '') returns ['abc'].
55 # The optional 3rd argument sets the number of splits that are performed.
def split(str, pat, maxsplit = 0):
    """Split str into fields separated by non-empty matches of pat.

    The delimiters themselves are dropped.  maxsplit limits the number
    of splits performed; the default 0 means no limit.  Returns the
    list of fields produced by intsplit().
    """
    fields = intsplit(str, pat, maxsplit, 0)
    return fields
60 # Split string str in fields separated by delimiters matching pattern
61 # pat. Only non-empty matches for the pattern are considered, so e.g.
62 # split('abc', '') returns ['abc']. The delimiters are also included
63 # in the list.
64 # The optional 3rd argument sets the number of splits that are performed.
def splitx(str, pat, maxsplit = 0):
    """Split str like split(), but keep the delimiters in the result.

    The returned list alternates fields and the delimiter text that
    followed them.  maxsplit limits the number of splits performed;
    the default 0 means no limit.
    """
    fields_and_delims = intsplit(str, pat, maxsplit, 1)
    return fields_and_delims
70 # Internal function used to implement split() and splitx().
def intsplit(str, pat, maxsplit, retain):
    """Shared implementation of split() and splitx().

    str      -- the string to split
    pat      -- pattern string or compiled pattern used as delimiter
    maxsplit -- stop after this many splits; a false value means no limit
    retain   -- when true, the matched delimiters are added to the result

    Only non-empty matches count as delimiters.  Returns a list that
    always ends with the text following the last delimiter used.
    """
    matcher = compile(pat)
    pieces = []
    field_start = 0      # start of the field currently being collected
    scan_from = 0        # position the next search begins at
    nsplits = 0
    while matcher.search(str, scan_from) >= 0:
        lo, hi = matcher.regs[0]
        if lo != hi:
            # A real delimiter: close the current field.
            pieces.append(str[field_start:lo])
            if retain:
                pieces.append(str[lo:hi])
            field_start = scan_from = hi
            nsplits = nsplits + 1
            if maxsplit and nsplits >= maxsplit:
                break
        else:
            # Empty match: ignore it and retry one character further on.
            scan_from = scan_from + 1
            if scan_from >= len(str):
                break
    pieces.append(str[field_start:])
    return pieces
96 # Capitalize words split using a pattern
def capwords(str, pat='[^a-zA-Z0-9_]+'):
    """Capitalize each word of str, where words are delimited by pat.

    str -- the string to process
    pat -- pattern matching the text between words; by default any run
           of characters other than letters, digits and underscore

    The delimiters are kept unchanged; only the words are capitalized.
    Returns the reassembled string.
    """
    pieces = splitx(str, pat)
    # splitx() alternates fields and delimiters, so the words sit at
    # the even indices and the delimiters at the odd ones.
    for i in range(0, len(pieces), 2):
        pieces[i] = pieces[i].capitalize()
    # String methods replace the deprecated string.capitalize() and
    # string.joinfields() helpers.
    return "".join(pieces)
106 # Internal subroutines:
107 # compile(pat): compile a pattern, caching already compiled patterns
108 # expand(repl, regs, str): expand \digit escapes in replacement string
111 # Manage a cache of compiled regular expressions.
113 # If the pattern is a string a compiled version of it is returned. If
114 # the pattern has been used before we return an already compiled
115 # version from the cache; otherwise we compile it now and save the
116 # compiled version in the cache, along with the syntax it was compiled
117 # with. Instead of a string, a compiled regular expression can also
118 # be passed.
# Maps (pattern string, regex syntax) -> compiled pattern object.
cache = {}

def compile(pat):
    """Return a compiled pattern for pat, using the module-level cache.

    pat -- a pattern string, or an already compiled pattern object

    Anything that is not a string is assumed to be a compiled pattern
    and is returned as is.  Strings are compiled with regex.compile()
    and remembered in the cache, keyed on the pattern together with the
    value of regex.get_syntax() at the time of the call, so changing
    the syntax does not return a stale compiled pattern.
    """
    if not isinstance(pat, str):
        return pat              # Assume it is a compiled regex
    key = (pat, regex.get_syntax())
    try:
        return cache[key]       # Get it from the cache
    except KeyError:
        prog = cache[key] = regex.compile(pat)
        return prog
def clear_cache():
    """Forget all cached compiled patterns.

    The module-level name 'cache' is rebound to a fresh empty
    dictionary; the previous dictionary object itself is left untouched.
    """
    global cache
    cache = dict()
138 # Expand \digit in the replacement.
139 # Each occurrence of \digit is replaced by the substring of str
140 # indicated by regs[digit]. To include a literal \ in the
141 # replacement, double it; other \ escapes are left unchanged (i.e.
142 # the \ and the following character are both copied).
def expand(repl, regs, str):
    """Expand the backslash escapes in the replacement string repl.

    repl -- replacement text, possibly containing \\digit references
    regs -- sequence of (start, end) index pairs; regs[digit] selects
            the slice of str that \\digit stands for
    str  -- the string the index pairs in regs refer to

    A doubled backslash yields a single backslash.  Any other escape is
    copied unchanged (backslash and following character), and so is a
    lone backslash at the very end of repl.  Returns the expanded
    string; repl itself is returned when it contains no backslash.
    """
    if '\\' not in repl:
        # Fast path: nothing to expand.
        return repl
    pieces = []          # collected fragments, joined once at the end
    i = 0
    n = len(repl)
    ord0 = ord('0')
    while i < n:
        c = repl[i]
        i = i + 1
        if c != '\\' or i >= n:
            # Ordinary character, or a backslash with nothing after it.
            pieces.append(c)
            continue
        # c was a backslash: look at the character it escapes.
        c = repl[i]
        i = i + 1
        if '0' <= c <= '9':
            a, b = regs[ord(c) - ord0]
            pieces.append(str[a:b])
        elif c == '\\':
            pieces.append(c)
        else:
            pieces.append('\\' + c)
    return ''.join(pieces)
166 # Test program, reads sequences "pat repl str" from stdin.
167 # Optional argument specifies pattern used to split lines.
def test():
    """Interactive test driver: read "pat repl str" lines from stdin.

    Each input line is split into three fields using the delimiter
    pattern given as the first command-line argument (default: runs of
    blanks, tabs and newlines).  The results of sub() and gsub() on
    those fields are written to stdout.  A '--> ' prompt is written to
    stderr when stdin is a terminal.  The loop ends at end of file.
    """
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    out = sys.stdout.write
    while 1:
        if sys.stdin.isatty():
            sys.stderr.write('--> ')
        line = sys.stdin.readline()
        if not line:
            break               # end of file
        if line[-1] == '\n':
            line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            out('Sorry, not three fields\n')
            out('split: %s\n' % repr(fields))
            continue
        # Reuse the fields computed above instead of splitting again.
        pat, repl, text = fields
        out('sub : %s\n' % repr(sub(pat, repl, text)))
        out('gsub: %s\n' % repr(gsub(pat, repl, text)))