# Update for release.
# [python/dscho.git] / Lib / sre_compile.py
# blob e5adb7e46a7817a57e2c4f43c46547d947cd755b
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
11 """Internal support module for sre"""
13 import _sre, sys
15 from sre_constants import *
# fail early if the compiled _sre extension does not match this module
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
# largest value that fits in a single 16-bit code word
MAXCODE = 65535
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    #
    # Walks the parsed (op, av) nodes in `pattern` and appends the
    # corresponding opcodes to the `code` list, honouring `flags`.
    # Several constructs emit a placeholder word that is backpatched
    # with a skip offset once the enclosed code has been emitted.
    emit = code.append
    for op, av in pattern:
        if op is in (LITERAL, NOT_LITERAL) if False else op in (LITERAL, NOT_LITERAL):
            pass
        if op in (LITERAL, NOT_LITERAL):
            # single (possibly negated) literal character
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            # character set; the set body is emitted by _compile_charset
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    # lowercase set members for case-insensitive matching
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = lambda x: x
            skip = len(code); emit(0)   # placeholder, patched below
            _compile_charset(av, flags, code, fixup)
            code[skip] = len(code) - skip   # backpatch skip offset
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])  # '.' also matches newline
            else:
                emit(OPCODES[ANY])
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                # NOTE(review): the following lines are unreachable
                # (they follow the raise); kept from the original source
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            elif _simple(av) and op == MAX_REPEAT:
                # simple one-character body: use the faster REPEAT_ONE
                emit(OPCODES[REPEAT_ONE])
                skip = len(code); emit(0)
                emit(av[0])     # min count
                emit(av[1])     # max count
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            else:
                # general repeat: REPEAT ... MAX_UNTIL / MIN_UNTIL
                emit(OPCODES[REPEAT])
                skip = len(code); emit(0)
                emit(av[0])     # min count
                emit(av[1])     # max count
                _compile(code, av[2], flags)
                code[skip] = len(code) - skip
                if op == MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is the group number (0/None means non-capturing)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)       # group start mark
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)     # group end mark
        elif op in (SUCCESS, FAILURE):
            emit(OPCODES[op])
        elif op in (ASSERT, ASSERT_NOT):
            # lookahead/lookbehind assertion; av[0] gives direction
            emit(OPCODES[op])
            skip = len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is AT:
            # positional anchor; variant chosen by flags
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            # alternation: each branch is followed by a JUMP to the end,
            # and each JUMP target is backpatched once all branches exist
            emit(OPCODES[op])
            tail = []
            for av in av[1]:
                skip = len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tail.append(len(code)); emit(0)
                code[skip] = len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                code[tail] = len(code) - tail
        elif op is CATEGORY:
            # character category (e.g. digit/word); variant per flags
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            # back-reference to group number av (stored zero-based)
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        else:
            raise ValueError, ("unsupported operand type", op)
145 def _compile_charset(charset, flags, code, fixup=None):
146 # compile charset subprogram
147 emit = code.append
148 if fixup is None:
149 fixup = lambda x: x
150 for op, av in _optimize_charset(charset, fixup):
151 emit(OPCODES[op])
152 if op is NEGATE:
153 pass
154 elif op is LITERAL:
155 emit(fixup(av))
156 elif op is RANGE:
157 emit(fixup(av[0]))
158 emit(fixup(av[1]))
159 elif op is CHARSET:
160 code.extend(av)
161 elif op is BIGCHARSET:
162 code.extend(av)
163 elif op is CATEGORY:
164 if flags & SRE_FLAG_LOCALE:
165 emit(CHCODES[CH_LOCALE[av]])
166 elif flags & SRE_FLAG_UNICODE:
167 emit(CHCODES[CH_UNICODE[av]])
168 else:
169 emit(CHCODES[av])
170 else:
171 raise error, "internal: unsupported set operator"
172 emit(OPCODES[FAILURE])
def _optimize_charset(charset, fixup):
    # internal: optimize character set
    #
    # Returns an equivalent, usually smaller, list of set items:
    # either literal/range items (for up to two runs), a CHARSET
    # bitmap, or the original set unchanged when it cannot be
    # compressed.  `fixup` maps characters (used for case folding).
    out = []
    charmap = [0]*256
    try:
        for op, av in charset:
            if op is NEGATE:
                out.append((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # a character >= 256 overflowed the 256-entry charmap
        if sys.maxunicode != 65535:
            # XXX: big charsets don't work in UCS-4 builds
            return charset
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map: collect runs of consecutive set bits
    i = p = n = 0
    runs = []
    for c in charmap:
        if c:
            if n == 0:
                p = i       # start of a new run
            n = n + 1
        elif n:
            runs.append((p, n))
            n = 0
        i = i + 1
    if n:
        runs.append((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        return out
    # the literal/range form was no smaller; keep the original
    return charset
def _mk_bitmap(bits):
    # pack a sequence of truth values into a list of 16-bit words,
    # least significant bit first; a trailing partial word (when
    # len(bits) is not a multiple of 16) is not emitted
    words = []
    mask = 1
    word = 0
    for bit in bits:
        if bit:
            word = word | mask
        mask = mask << 1
        if mask > MAXCODE:
            # word is full: flush it and start the next one
            words.append(word)
            mask = 1
            word = 0
    return words
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).

# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.

# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
def _optimize_unicode(charset, fixup):
    # internal: build a BIGCHARSET representation for a set that
    # contains characters above 255 (see the block comment above
    # for the on-the-wire layout)
    charmap = [0]*65536
    negate = 0
    for op, av in charset:
        if op is NEGATE:
            negate = 1
        elif op is LITERAL:
            charmap[fixup(av)] = 1
        elif op is RANGE:
            for i in range(fixup(av[0]), fixup(av[1])+1):
                charmap[i] = 1
        elif op is CATEGORY:
            # XXX: could expand category
            return charset # cannot compress
    if negate:
        # fold the negation into the bitmap itself
        for i in range(65536):
            charmap[i] = not charmap[i]
    comps = {}          # chunk contents -> chunk number (dedup table)
    mapping = [0]*256   # original chunk position -> chunk number
    block = 0           # next unused chunk number
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        # reuse an identical chunk's number, or claim the next one
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first occurrence of this chunk: emit its bitmap words
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    assert MAXCODE == 65535
    # pack the 256 chunk numbers two per 16-bit word, matching the
    # native byte order so _sre can index them as bytes
    for i in range(128):
        if sys.byteorder == 'big':
            header.append(256*mapping[2*i]+mapping[2*i+1])
        else:
            header.append(mapping[2*i]+256*mapping[2*i+1])
    data[0:0] = header
    return [(BIGCHARSET, data)]
296 def _simple(av):
297 # check if av is a "simple" operator
298 lo, hi = av[2].getwidth()
299 if lo == 0 and hi == MAXREPEAT:
300 raise error, "nothing to repeat"
301 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                # prefix_skip tracks how many leading literals are
                # outside any group (those can be skipped on a match)
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefix.append(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # single-item group: look inside it for more literals
                op, av = av[1][0]
                if op is LITERAL:
                    prefix.append(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    # all branches must start with a literal
                    c = []
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            c.append((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                # same check as above, for a top-level alternation
                c = []
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        c.append((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            # the whole pattern is one literal string
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)     # 0 means "unbounded / too large to store"
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (failure function for prefix search)
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, 0, code)
    code[skip] = len(code) - skip
# types accepted as a pattern string by compile(); includes the
# unicode type only when this interpreter build provides it
STRING_TYPES = [type("")]
try:
    # builds without Unicode support raise NameError on `unicode`
    STRING_TYPES.append(type(unicode("")))
except NameError:
    pass
def _code(p, flags):
    # internal: assemble the full code list for parsed pattern p
    combined = p.pattern.flags | flags
    out = []
    # optional leading info block (fast-path data; may emit nothing)
    _compile_info(out, p, combined)
    # the pattern body, terminated by a SUCCESS opcode
    _compile(out, p.data, combined)
    out.append(OPCODES[SUCCESS])
    return out
427 def compile(p, flags=0):
428 # internal: convert pattern list to internal format
430 if type(p) in STRING_TYPES:
431 import sre_parse
432 pattern = p
433 p = sre_parse.parse(p, flags)
434 else:
435 pattern = None
437 code = _code(p, flags)
439 # print code
441 # XXX: <fl> get rid of this limitation!
442 assert p.pattern.groups <= 100,\
443 "sorry, but this version only supports 100 named groups"
445 # map in either direction
446 groupindex = p.pattern.groupdict
447 indexgroup = [None] * p.pattern.groups
448 for k, i in groupindex.items():
449 indexgroup[i] = k
451 return _sre.compile(
452 pattern, flags, code,
453 p.pattern.groups-1,
454 groupindex, indexgroup