# Lib/sre_compile.py  (historical copy from the python/dscho.git web viewer,
# blob e48be72285b4277747cf2be59b149693d33f1177)
# NOTE: the scraped page also carried an unrelated commit-message fragment
# ("Fix the availability statement for the spawn*() functions ..."); it is
# preserved here as a comment because it is not part of this module.
2 # Secret Labs' Regular Expression Engine
4 # convert template to internal format
6 # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
8 # See the sre.py file for information on usage and redistribution.
"""Internal support module for sre"""

import _sre,sys

from sre_constants import *

# Sanity check: the C accelerator module (_sre) and this Python-level
# compiler must agree on the opcode layout, or compiled programs would
# be misinterpreted by the matcher.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Largest value representable in one 16-bit code word of the compiled form.
MAXCODE = 65535
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern.
    #
    # pattern is a list of (opcode, argument) pairs as produced by
    # sre_parse; the generated opcodes are appended to the code list in
    # place.  flags is a bitmask of SRE_FLAG_* values.
    #
    # Fixes vs. the previous revision: raise statements now use call
    # syntax (identical meaning in Python 2, also valid in Python 3),
    # and the unreachable statements that followed the TEMPLATE raise
    # have been removed.
    emit = code.append
    for op, av in pattern:
        if op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                # lower-case set members while compiling the charset
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = lambda x: x
            skip = len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = len(code) - skip  # back-patch skip offset
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
                # template mode has no repeat support; the emit sequence
                # that used to follow this raise was unreachable and has
                # been deleted
                raise error("internal: unsupported template operator")
            elif _simple(av) and op == MAX_REPEAT:
                # single-width, no-group body: use the faster REPEAT_ONE
                emit(OPCODES[REPEAT_ONE])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = len(code) - skip
                if op == MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is the group number (0 means non-capturing)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in (SUCCESS, FAILURE):
            emit(OPCODES[op])
        elif op in (ASSERT, ASSERT_NOT):
            emit(OPCODES[op])
            skip = len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is AT:
            # anchors: remap per MULTILINE / LOCALE / UNICODE flags
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            for av in av[1]:
                skip = len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tail.append(len(code)); emit(0)
                code[skip] = len(code) - skip
            emit(0) # end of branch
            # back-patch each alternative's JUMP target to the join point
            for pos in tail:
                code[pos] = len(code) - pos
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        else:
            raise ValueError("unsupported operand type", op)
def _compile_charset(charset, flags, code, fixup=None):
    # internal: compile a charset subprogram.
    #
    # charset is a list of (opcode, argument) set items; the compiled
    # form is appended to code and terminated by a FAILURE opcode.
    # fixup (identity when omitted) maps literal code points, e.g. to
    # lower case for IGNORECASE.
    #
    # Fix vs. the previous revision: the raise uses call syntax
    # (identical meaning in Python 2, also valid in Python 3).
    emit = code.append
    if not fixup:
        fixup = lambda x: x
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)
        elif op is BIGCHARSET:
            code.extend(av)
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error("internal: unsupported set operator")
    emit(OPCODES[FAILURE])
def _optimize_charset(charset, fixup):
    # internal: optimize character set.
    #
    # Build a 256-slot occupancy map for the set.  If every member fits
    # in 8 bits, re-emit the set either as a couple of LITERAL/RANGE
    # items or as a CHARSET bitmap, whichever is more compact; sets that
    # reach outside the 8-bit range are handed to _optimize_unicode, and
    # CATEGORY items defeat the optimization entirely.
    out = []
    charmap = [0] * 256
    try:
        for op, av in charset:
            if op is NEGATE:
                out.append((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for point in range(fixup(av[0]), fixup(av[1]) + 1):
                    charmap[point] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset  # cannot compress
    except IndexError:
        # a code point above 255 was seen: unicode character set
        return _optimize_unicode(charset, fixup)
    # collect runs of consecutive occupied slots
    runs = []
    run_start = run_len = 0
    for pos in range(len(charmap)):
        if charmap[pos]:
            if run_len == 0:
                run_start = pos
            run_len = run_len + 1
        elif run_len:
            runs.append((run_start, run_len))
            run_len = 0
    if run_len:
        runs.append((run_start, run_len))
    if len(runs) <= 2:
        # one or two runs: literal/range form may beat the original
        for run_start, run_len in runs:
            if run_len == 1:
                out.append((LITERAL, run_start))
            else:
                out.append((RANGE, (run_start, run_start + run_len - 1)))
        if len(out) < len(charset):
            return out
    else:
        # many runs: a 256-bit bitmap is the compact choice
        out.append((CHARSET, _mk_bitmap(charmap)))
        return out
    # the original form was already minimal
    return charset
def _mk_bitmap(bits):
    # Pack a sequence of 0/1 flags into a list of 16-bit words
    # (MAXCODE-sized), least-significant bit first.  A trailing partial
    # word is discarded, matching the original behaviour.
    words = []
    value = 0
    mask = 1
    for bit in bits:
        if bit:
            value = value | mask
        mask = mask * 2
        if mask > MAXCODE:
            # word is full: flush it and start the next one
            words.append(value)
            value = 0
            mask = 1
    return words
235 # To represent a big charset, first a bitmap of all characters in the
236 # set is constructed. Then, this bitmap is sliced into chunks of 256
237 # characters, duplicate chunks are eliminated, and each chunk is
238 # given a number. In the compiled expression, the charset is
239 # represented by a 16-bit word sequence, consisting of one word for
240 # the number of different chunks, a sequence of 256 bytes (128 words)
241 # of chunk numbers indexed by their original chunk position, and a
242 # sequence of chunks (16 words each).
244 # Compression is normally good: in a typical charset, large ranges of
245 # Unicode will be either completely excluded (e.g. if only cyrillic
246 # letters are to be matched), or completely included (e.g. if large
247 # subranges of Kanji match). These ranges will be represented by
248 # chunks of all one-bits or all zero-bits.
250 # Matching can be also done efficiently: the more significant byte of
251 # the Unicode character is an index into the chunk number, and the
252 # less significant byte is a bit index in the chunk (just like the
253 # CHARSET matching).
def _optimize_unicode(charset, fixup):
    # Build a BIGCHARSET item for a set that contains code points above
    # 255 (see the block comment above for the compressed layout).
    # Returns the original charset untouched when it contains CATEGORY
    # items, which cannot be expanded into the bitmap.
    charmap = [0] * 65536
    negate = 0
    for op, av in charset:
        if op is NEGATE:
            negate = 1
        elif op is LITERAL:
            charmap[fixup(av)] = 1
        elif op is RANGE:
            for point in range(fixup(av[0]), fixup(av[1]) + 1):
                charmap[point] = 1
        elif op is CATEGORY:
            # XXX: could expand category
            return charset  # cannot compress
    if negate:
        # invert every bit; the NEGATE marker itself is consumed here
        for point in range(65536):
            charmap[point] = not charmap[point]
    # slice the 64k map into 256-character chunks, sharing duplicates
    comps = {}
    mapping = [0] * 256
    block = 0
    data = []
    for chunkno in range(256):
        chunk = tuple(charmap[chunkno * 256:(chunkno + 1) * 256])
        new = comps.setdefault(chunk, block)
        mapping[chunkno] = new
        if new == block:
            # first occurrence of this chunk: emit its bitmap
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    assert MAXCODE == 65535
    # pack the 256 byte-sized chunk numbers into 128 16-bit words,
    # honouring the host byte order
    for wordno in range(128):
        if sys.byteorder == 'big':
            header.append(256 * mapping[2 * wordno] + mapping[2 * wordno + 1])
        else:
            header.append(mapping[2 * wordno] + 256 * mapping[2 * wordno + 1])
    data[0:0] = header
    return [(BIGCHARSET, data)]
def _simple(av):
    # internal: check if av (the argument of a repeat operator) is a
    # "simple" operator, i.e. its body always matches exactly one
    # character and is not a capturing group.  Raises error for the
    # degenerate "nothing to repeat" case.
    #
    # Fix vs. the previous revision: the raise uses call syntax
    # (identical meaning in Python 2, also valid in Python 3).
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error("nothing to repeat")
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
def _branch_literal_prefix(branches):
    # internal helper: if every alternative in a BRANCH begins with a
    # LITERAL, return the list of those (LITERAL, char) items; return
    # None when any alternative is empty or starts otherwise.  This
    # replaces two verbatim copies of the same scan that used to live
    # inline in _compile_info.
    c = []
    for p in branches:
        if not p:
            return None
        op, av = p[0]
        if op is LITERAL:
            c.append((op, av))
        else:
            return None
    return c

def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map, all used by the matcher to skip
    # impossible starting positions quickly.
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                # prefix_skip tracks how much of the prefix lies outside
                # any group (only advanced while prefix and skip agree)
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefix.append(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefix.append(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    c = _branch_literal_prefix(av[1])
                    if c is not None:
                        charset = c
            elif op is BRANCH:
                c = _branch_literal_prefix(av[1])
                if c is not None:
                    charset = c
            elif op is IN:
                charset = av
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        # the whole pattern is one ungrouped literal string
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length (clamped to what fits in a code word)
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (Knuth-Morris-Pratt style failure links)
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, 0, code)
    code[skip] = len(code) - skip
# The string types this engine accepts as pattern sources: always the
# byte/plain string type, plus the unicode type when the interpreter
# provides one.
STRING_TYPES = [str]

try:
    STRING_TYPES.append(unicode)
except NameError:
    # no unicode support in this interpreter build
    pass
def _code(p, flags):
    # Assemble the complete code list for parsed pattern p: an info
    # block, the compiled pattern body, and a terminating SUCCESS.
    # Caller-supplied flags are merged with those parsed from the
    # pattern itself.
    flags = flags | p.pattern.flags
    code = []

    # leading info block (fast-scan hints)
    _compile_info(code, p, flags)

    # the pattern body itself
    _compile(code, p.data, flags)
    code.append(OPCODES[SUCCESS])

    return code
424 def compile(p, flags=0):
425 # internal: convert pattern list to internal format
427 if type(p) in STRING_TYPES:
428 import sre_parse
429 pattern = p
430 p = sre_parse.parse(p, flags)
431 else:
432 pattern = None
434 code = _code(p, flags)
436 # print code
438 # XXX: <fl> get rid of this limitation!
439 assert p.pattern.groups <= 100,\
440 "sorry, but this version only supports 100 named groups"
442 # map in either direction
443 groupindex = p.pattern.groupdict
444 indexgroup = [None] * p.pattern.groups
445 for k, i in groupindex.items():
446 indexgroup[i] = k
448 return _sre.compile(
449 pattern, flags, code,
450 p.pattern.groups-1,
451 groupindex, indexgroup