Class around PixMap objects that allows more python-like access. By Joe Strout.
[python/dscho.git] / Lib / re.py
blob d190363df47581c0c0478b0d738749a5f1749e4b
1 import sys
2 import string
3 from pcre import *
6 # First, the public part of the interface:
9 # pcre.error and re.error should be the same, since exceptions can be
10 # raised from either module.
12 # compilation flags
# One-letter aliases for the compilation flags brought in by
# "from pcre import *".
I, L, M, S, X = IGNORECASE, LOCALE, MULTILINE, DOTALL, VERBOSE
# Compiled patterns keyed by (pattern, flags); at most _MAXCACHE entries.
_cache = {}
_MAXCACHE = 20

def _cachecompile(pattern, flags=0):
    "Return the compiled form of pattern, compiling it only on a cache miss."
    cache_key = (pattern, flags)
    compiled = _cache.get(cache_key)
    if compiled is not None:
        return compiled
    compiled = compile(pattern, flags)
    # No per-entry eviction: once the cache is full it is simply emptied
    # before the new entry is stored.
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[cache_key] = compiled
    return compiled
def match(pattern, string, flags=0):
    "Compile pattern (through the cache) and apply its match() method to string."
    compiled = _cachecompile(pattern, flags)
    return compiled.match(string)
def search(pattern, string, flags=0):
    "Compile pattern (through the cache) and apply its search() method to string."
    compiled = _cachecompile(pattern, flags)
    return compiled.search(string)
def sub(pattern, repl, string, count=0):
    "Call sub() on pattern; a pattern given as a string is compiled first."
    compiled = pattern
    if type(compiled) is type(''):
        # Pattern text: go through the compile cache. Anything else is
        # used as an already compiled object.
        compiled = _cachecompile(compiled)
    return compiled.sub(repl, string, count)
def subn(pattern, repl, string, count=0):
    "Call subn() on pattern; a pattern given as a string is compiled first."
    compiled = pattern
    if type(compiled) is type(''):
        # Pattern text: go through the compile cache. Anything else is
        # used as an already compiled object.
        compiled = _cachecompile(compiled)
    return compiled.subn(repl, string, count)
def split(pattern, string, maxsplit=0):
    "Call split() on pattern; a pattern given as a string is compiled first."
    compiled = pattern
    if type(compiled) is type(''):
        # Pattern text: go through the compile cache. Anything else is
        # used as an already compiled object.
        compiled = _cachecompile(compiled)
    return compiled.split(string, maxsplit)
def findall(pattern, string):
    "Call findall() on pattern; a pattern given as a string is compiled first."
    compiled = pattern
    if type(compiled) is type(''):
        # Pattern text: go through the compile cache. Anything else is
        # used as an already compiled object.
        compiled = _cachecompile(compiled)
    return compiled.findall(string)
65 def escape(pattern):
66 "Escape all non-alphanumeric characters in pattern."
67 result = list(pattern)
68 alphanum=string.letters+'_'+string.digits
69 for i in range(len(pattern)):
70 char = pattern[i]
71 if char not in alphanum:
72 if char=='\000': result[i] = '\\000'
73 else: result[i] = '\\'+char
74 return string.join(result, '')
def compile(pattern, flags=0):
    "Compile a regular expression pattern, returning a RegexObject."
    # The same dictionary object is handed to pcre_compile() and then to
    # the RegexObject (presumably pcre_compile() fills in the group
    # names -- confirm against the pcre module).
    names = {}
    compiled_code = pcre_compile(pattern, flags, names)
    return RegexObject(pattern, flags, compiled_code, names)
84 # Class definitions
87 class RegexObject:
89 def __init__(self, pattern, flags, code, groupindex):
90 self.code = code
91 self.flags = flags
92 self.pattern = pattern
93 self.groupindex = groupindex
95 def search(self, string, pos=0, endpos=None):
96 """Scan through string looking for a match to the pattern, returning
97 a MatchObject instance, or None if no match was found."""
99 if endpos is None or endpos>len(string):
100 endpos=len(string)
101 if endpos<pos: endpos=pos
102 regs = self.code.match(string, pos, endpos, 0)
103 if regs is None:
104 return None
105 self._num_regs=len(regs)
107 return MatchObject(self,
108 string,
109 pos, endpos,
110 regs)
112 def match(self, string, pos=0, endpos=None):
113 """Try to apply the pattern at the start of the string, returning
114 a MatchObject instance, or None if no match was found."""
116 if endpos is None or endpos>len(string):
117 endpos=len(string)
118 if endpos<pos: endpos=pos
119 regs = self.code.match(string, pos, endpos, ANCHORED)
120 if regs is None:
121 return None
122 self._num_regs=len(regs)
123 return MatchObject(self,
124 string,
125 pos, endpos,
126 regs)
128 def sub(self, repl, string, count=0):
129 """Return the string obtained by replacing the leftmost
130 non-overlapping occurrences of the pattern in string by the
131 replacement repl"""
133 return self.subn(repl, string, count)[0]
135 def subn(self, repl, source, count=0):
136 """Return a 2-tuple containing (new_string, number).
137 new_string is the string obtained by replacing the leftmost
138 non-overlapping occurrences of the pattern in the source
139 string by the replacement repl. number is the number of
140 substitutions that were made."""
142 if count < 0:
143 raise error, "negative substitution count"
144 if count == 0:
145 count = sys.maxint
146 n = 0 # Number of matches
147 pos = 0 # Where to start searching
148 lastmatch = -1 # End of last match
149 results = [] # Substrings making up the result
150 end = len(source)
152 if type(repl) is type(''):
153 # See if repl contains group references
154 try:
155 repl = pcre_expand(_Dummy, repl)
156 except:
157 m = MatchObject(self, source, 0, end, [])
158 repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl)
159 else:
160 m = None
161 else:
162 m = MatchObject(self, source, 0, end, [])
164 match = self.code.match
165 append = results.append
166 while n < count and pos <= end:
167 regs = match(source, pos, end, 0)
168 if not regs:
169 break
170 self._num_regs = len(regs)
171 i, j = regs[0]
172 if i == j == lastmatch:
173 # Empty match adjacent to previous match
174 pos = pos + 1
175 append(source[lastmatch:pos])
176 continue
177 if pos < i:
178 append(source[pos:i])
179 if m:
180 m.pos = pos
181 m.regs = regs
182 append(repl(m))
183 else:
184 append(repl)
185 pos = lastmatch = j
186 if i == j:
187 # Last match was empty; don't try here again
188 pos = pos + 1
189 append(source[lastmatch:pos])
190 n = n + 1
191 append(source[pos:])
192 return (string.join(results, ''), n)
194 def split(self, source, maxsplit=0):
195 """Split the source string by the occurrences of the pattern,
196 returning a list containing the resulting substrings."""
198 if maxsplit < 0:
199 raise error, "negative split count"
200 if maxsplit == 0:
201 maxsplit = sys.maxint
202 n = 0
203 pos = 0
204 lastmatch = 0
205 results = []
206 end = len(source)
207 match = self.code.match
208 append = results.append
209 while n < maxsplit:
210 regs = match(source, pos, end, 0)
211 if not regs:
212 break
213 i, j = regs[0]
214 if i == j:
215 # Empty match
216 if pos >= end:
217 break
218 pos = pos+1
219 continue
220 append(source[lastmatch:i])
221 rest = regs[1:]
222 if rest:
223 for a, b in rest:
224 if a == -1 or b == -1:
225 group = None
226 else:
227 group = source[a:b]
228 append(group)
229 pos = lastmatch = j
230 n = n + 1
231 append(source[lastmatch:])
232 return results
234 def findall(self, source):
235 """Return a list of all non-overlapping matches in the string.
237 If one or more groups are present in the pattern, return a
238 list of groups; this will be a list of tuples if the pattern
239 has more than one group.
241 Empty matches are included in the result.
244 pos = 0
245 end = len(source)
246 results = []
247 match = self.code.match
248 append = results.append
249 while pos <= end:
250 regs = match(source, pos, end, 0)
251 if not regs:
252 break
253 i, j = regs[0]
254 rest = regs[1:]
255 if not rest:
256 gr = source[i:j]
257 elif len(rest) == 1:
258 a, b = rest[0]
259 gr = source[a:b]
260 else:
261 gr = []
262 for (a, b) in rest:
263 gr.append(source[a:b])
264 gr = tuple(gr)
265 append(gr)
266 pos = max(j, pos+1)
267 return results
269 # The following 3 functions were contributed by Mike Fletcher, and
270 # allow pickling and unpickling of RegexObject instances.
271 def __getinitargs__(self):
272 return (None,None,None,None) # any 4 elements, to work around
273 # problems with the
274 # pickle/cPickle modules not yet
275 # ignoring the __init__ function
276 def __getstate__(self):
277 return self.pattern, self.flags, self.groupindex
278 def __setstate__(self, statetuple):
279 self.pattern = statetuple[0]
280 self.flags = statetuple[1]
281 self.groupindex = statetuple[2]
282 self.code = apply(pcre_compile, statetuple)
284 class _Dummy:
285 # Dummy class used by _subn_string(). Has 'group' to avoid core dump.
286 group = None
class MatchObject:
    """The result of matching a RegexObject against a string.

    Attributes:
        re      -- the RegexObject that produced the match
        string  -- the string that was searched
        pos     -- index in string where the search started
        endpos  -- index in string where the search stopped
        regs    -- sequence of (start, end) pairs, one per group; entry 0
                   is the whole match and -1 marks a group that did not
                   take part in the match
    """

    def __init__(self, re, string, pos, endpos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self.regs = regs

    def _group_number(self, g):
        """Translate a group name or number into a valid index into regs.

        Raises IndexError for an unknown name and for a number that is
        negative or not smaller than the number of groups."""
        if type(g) == type(''):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group %s is undefined' % repr(g))
        # A negative number must not wrap around to the last groups.
        if g < 0 or g >= len(self.regs):
            raise IndexError('group %s is undefined' % repr(g))
        return g

    def start(self, g = 0):
        "Return the start of the substring matched by group g"
        return self.regs[self._group_number(g)][0]

    def end(self, g = 0):
        "Return the end of the substring matched by group g"
        return self.regs[self._group_number(g)][1]

    def span(self, g = 0):
        "Return (start, end) of the substring matched by group g"
        return self.regs[self._group_number(g)]

    def groups(self, default=None):
        "Return a tuple containing all subgroups of the match object"
        # The group count comes from this match's own regs, not from state
        # stored on the (shared) pattern object.
        result = []
        for a, b in self.regs[1:]:
            if a == -1 or b == -1:
                result.append(default)
            else:
                result.append(self.string[a:b])
        return tuple(result)

    def group(self, *groups):
        """Return one or more groups of the match.

        With no argument the whole match is returned; with one argument
        a single string (or None for a group that did not match); with
        several arguments a tuple with one item per argument."""
        if not groups:
            groups = (0,)
        result = []
        for g in groups:
            a, b = self.regs[self._group_number(g)]
            if a == -1 or b == -1:
                result.append(None)
            else:
                result.append(self.string[a:b])
        if len(result) == 1:
            return result[0]
        return tuple(result)

    def groupdict(self, default=None):
        "Return a dictionary containing all named subgroups of the match"
        named = {}
        for name, index in self.re.groupindex.items():
            a, b = self.regs[index]
            if a == -1 or b == -1:
                named[name] = default
            else:
                named[name] = self.string[a:b]
        return named