Lib/sre_compile.py

   1 #
   2 # Secret Labs' Regular Expression Engine
   3 #
   4 # convert template to internal format
   5 #
   6 # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
   7 #
   8 # See the sre.py file for information on usage and redistribution.
   9 #
  10
  11 """Internal support module for sre"""
  12
  13 import _sre, sys
  14
  15 from sre_constants import *
  16
  17 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
  18
  19 MAXCODE = 65535
  20
  21 def _compile(code, pattern, flags):
  22     # internal: compile a (sub)pattern
  23     emit = code.append
  24     for op, av in pattern:
  25         if op in (LITERAL, NOT_LITERAL):
  26             if flags & SRE_FLAG_IGNORECASE:
  27                 emit(OPCODES[OP_IGNORE[op]])
  28                 emit(_sre.getlower(av, flags))
  29             else:
  30                 emit(OPCODES[op])
  31                 emit(av)
  32         elif op is IN:
  33             if flags & SRE_FLAG_IGNORECASE:
  34                 emit(OPCODES[OP_IGNORE[op]])
  35                 def fixup(literal, flags=flags):
  36                     return _sre.getlower(literal, flags)
  37             else:
  38                 emit(OPCODES[op])
  39                 fixup = lambda x: x
  40             skip = len(code); emit(0)
  41             _compile_charset(av, flags, code, fixup)
  42             code[skip] = len(code) - skip
  43         elif op is ANY:
  44             if flags & SRE_FLAG_DOTALL:
  45                 emit(OPCODES[ANY_ALL])
  46             else:
  47                 emit(OPCODES[ANY])
  48         elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
  49             if flags & SRE_FLAG_TEMPLATE:
  50                 raise error, "internal: unsupported template operator"
  51                 emit(OPCODES[REPEAT])
  52                 skip = len(code); emit(0)
  53                 emit(av[0])
  54                 emit(av[1])
  55                 _compile(code, av[2], flags)
  56                 emit(OPCODES[SUCCESS])
  57                 code[skip] = len(code) - skip
  58             elif _simple(av) and op == MAX_REPEAT:
  59                 emit(OPCODES[REPEAT_ONE])
  60                 skip = len(code); emit(0)
  61                 emit(av[0])
  62                 emit(av[1])
  63                 _compile(code, av[2], flags)
  64                 emit(OPCODES[SUCCESS])
  65                 code[skip] = len(code) - skip
  66             else:
  67                 emit(OPCODES[REPEAT])
  68                 skip = len(code); emit(0)
  69                 emit(av[0])
  70                 emit(av[1])
  71                 _compile(code, av[2], flags)
  72                 code[skip] = len(code) - skip
  73                 if op == MAX_REPEAT:
  74                     emit(OPCODES[MAX_UNTIL])
  75                 else:
  76                     emit(OPCODES[MIN_UNTIL])
  77         elif op is SUBPATTERN:
  78             if av[0]:
  79                 emit(OPCODES[MARK])
  80                 emit((av[0]-1)*2)
  81             # _compile_info(code, av[1], flags)
  82             _compile(code, av[1], flags)
  83             if av[0]:
  84                 emit(OPCODES[MARK])
  85                 emit((av[0]-1)*2+1)
  86         elif op in (SUCCESS, FAILURE):
  87             emit(OPCODES[op])
  88         elif op in (ASSERT, ASSERT_NOT):
  89             emit(OPCODES[op])
  90             skip = len(code); emit(0)
  91             if av[0] >= 0:
  92                 emit(0) # look ahead
  93             else:
  94                 lo, hi = av[1].getwidth()
  95                 if lo != hi:
  96                     raise error, "look-behind requires fixed-width pattern"
  97                 emit(lo) # look behind
  98             _compile(code, av[1], flags)
  99             emit(OPCODES[SUCCESS])
 100             code[skip] = len(code) - skip
 101         elif op is CALL:
 102             emit(OPCODES[op])
 103             skip = len(code); emit(0)
 104             _compile(code, av, flags)
 105             emit(OPCODES[SUCCESS])
 106             code[skip] = len(code) - skip
 107         elif op is AT:
 108             emit(OPCODES[op])
 109             if flags & SRE_FLAG_MULTILINE:
 110                 av = AT_MULTILINE.get(av, av)
 111             if flags & SRE_FLAG_LOCALE:
 112                 av = AT_LOCALE.get(av, av)
 113             elif flags & SRE_FLAG_UNICODE:
 114                 av = AT_UNICODE.get(av, av)
 115             emit(ATCODES[av])
 116         elif op is BRANCH:
 117             emit(OPCODES[op])
 118             tail = []
 119             for av in av[1]:
 120                 skip = len(code); emit(0)
 121                 # _compile_info(code, av, flags)
 122                 _compile(code, av, flags)
 123                 emit(OPCODES[JUMP])
 124                 tail.append(len(code)); emit(0)
 125                 code[skip] = len(code) - skip
 126             emit(0) # end of branch
 127             for tail in tail:
 128                 code[tail] = len(code) - tail
 129         elif op is CATEGORY:
 130             emit(OPCODES[op])
 131             if flags & SRE_FLAG_LOCALE:
 132                 av = CH_LOCALE[av]
 133             elif flags & SRE_FLAG_UNICODE:
 134                 av = CH_UNICODE[av]
 135             emit(CHCODES[av])
 136         elif op is GROUPREF:
 137             if flags & SRE_FLAG_IGNORECASE:
 138                 emit(OPCODES[OP_IGNORE[op]])
 139             else:
 140                 emit(OPCODES[op])
 141             emit(av-1)
 142         else:
 143             raise ValueError, ("unsupported operand type", op)
 144
 145 def _compile_charset(charset, flags, code, fixup=None):
 146     # compile charset subprogram
 147     emit = code.append
 148     if fixup is None:
 149         fixup = lambda x: x
 150     for op, av in _optimize_charset(charset, fixup):
 151         emit(OPCODES[op])
 152         if op is NEGATE:
 153             pass
 154         elif op is LITERAL:
 155             emit(fixup(av))
 156         elif op is RANGE:
 157             emit(fixup(av[0]))
 158             emit(fixup(av[1]))
 159         elif op is CHARSET:
 160             code.extend(av)
 161         elif op is BIGCHARSET:
 162             code.extend(av)
 163         elif op is CATEGORY:
 164             if flags & SRE_FLAG_LOCALE:
 165                 emit(CHCODES[CH_LOCALE[av]])
 166             elif flags & SRE_FLAG_UNICODE:
 167                 emit(CHCODES[CH_UNICODE[av]])
 168             else:
 169                 emit(CHCODES[av])
 170         else:
 171             raise error, "internal: unsupported set operator"
 172     emit(OPCODES[FAILURE])
 173
 174 def _optimize_charset(charset, fixup):
 175     # internal: optimize character set
 176     out = []
 177     charmap = [0]*256
 178     try:
 179         for op, av in charset:
 180             if op is NEGATE:
 181                 out.append((op, av))
 182             elif op is LITERAL:
 183                 charmap[fixup(av)] = 1
 184             elif op is RANGE:
 185                 for i in range(fixup(av[0]), fixup(av[1])+1):
 186                     charmap[i] = 1
 187             elif op is CATEGORY:
 188                 # XXX: could append to charmap tail
 189                 return charset # cannot compress
 190     except IndexError:
 191         if sys.maxunicode != 65535:
 192             # XXX: big charsets don't work in UCS-4 builds
 193             return charset
 194         # character set contains unicode characters
 195         return _optimize_unicode(charset, fixup)
 196     # compress character map
 197     i = p = n = 0
 198     runs = []
 199     for c in charmap:
 200         if c:
 201             if n == 0:
 202                 p = i
 203             n = n + 1
 204         elif n:
 205             runs.append((p, n))
 206             n = 0
 207         i = i + 1
 208     if n:
 209         runs.append((p, n))
 210     if len(runs) <= 2:
 211         # use literal/range
 212         for p, n in runs:
 213             if n == 1:
 214                 out.append((LITERAL, p))
 215             else:
 216                 out.append((RANGE, (p, p+n-1)))
 217         if len(out) < len(charset):
 218             return out
 219     else:
 220         # use bitmap
 221         data = _mk_bitmap(charmap)
 222         out.append((CHARSET, data))
 223         return out
 224     return charset
 225
 226 def _mk_bitmap(bits):
 227     data = []
 228     m = 1; v = 0
 229     for c in bits:
 230         if c:
 231             v = v + m
 232         m = m << 1
 233         if m > MAXCODE:
 234             data.append(v)
 235             m = 1; v = 0
 236     return data
 237
 238 # To represent a big charset, first a bitmap of all characters in the
 239 # set is constructed. Then, this bitmap is sliced into chunks of 256
 240 # characters, duplicate chunks are eliminitated, and each chunk is
 241 # given a number. In the compiled expression, the charset is
 242 # represented by a 16-bit word sequence, consisting of one word for
 243 # the number of different chunks, a sequence of 256 bytes (128 words)
 244 # of chunk numbers indexed by their original chunk position, and a
 245 # sequence of chunks (16 words each).
 246
 247 # Compression is normally good: in a typical charset, large ranges of
 248 # Unicode will be either completely excluded (e.g. if only cyrillic
 249 # letters are to be matched), or completely included (e.g. if large
 250 # subranges of Kanji match). These ranges will be represented by
 251 # chunks of all one-bits or all zero-bits.
 252
 253 # Matching can be also done efficiently: the more significant byte of
 254 # the Unicode character is an index into the chunk number, and the
 255 # less significant byte is a bit index in the chunk (just like the
 256 # CHARSET matching).
 257
 258 def _optimize_unicode(charset, fixup):
 259     charmap = [0]*65536
 260     negate = 0
 261     for op, av in charset:
 262         if op is NEGATE:
 263             negate = 1
 264         elif op is LITERAL:
 265             charmap[fixup(av)] = 1
 266         elif op is RANGE:
 267             for i in range(fixup(av[0]), fixup(av[1])+1):
 268                 charmap[i] = 1
 269         elif op is CATEGORY:
 270             # XXX: could expand category
 271             return charset # cannot compress
 272     if negate:
 273         for i in range(65536):
 274             charmap[i] = not charmap[i]
 275     comps = {}
 276     mapping = [0]*256
 277     block = 0
 278     data = []
 279     for i in range(256):
 280         chunk = tuple(charmap[i*256:(i+1)*256])
 281         new = comps.setdefault(chunk, block)
 282         mapping[i] = new
 283         if new == block:
 284             block = block + 1
 285             data = data + _mk_bitmap(chunk)
 286     header = [block]
 287     assert MAXCODE == 65535
 288     for i in range(128):
 289         if sys.byteorder == 'big':
 290             header.append(256*mapping[2*i]+mapping[2*i+1])
 291         else:
 292             header.append(mapping[2*i]+256*mapping[2*i+1])
 293     data[0:0] = header
 294     return [(BIGCHARSET, data)]
 295
 296 def _simple(av):
 297     # check if av is a "simple" operator
 298     lo, hi = av[2].getwidth()
 299     if lo == 0 and hi == MAXREPEAT:
 300         raise error, "nothing to repeat"
 301     return lo == hi == 1 and av[2][0][0] != SUBPATTERN
 302
 303 def _compile_info(code, pattern, flags):
 304     # internal: compile an info block.  in the current version,
 305     # this contains min/max pattern width, and an optional literal
 306     # prefix or a character map
 307     lo, hi = pattern.getwidth()
 308     if lo == 0:
 309         return # not worth it
 310     # look for a literal prefix
 311     prefix = []
 312     prefix_skip = 0
 313     charset = [] # not used
 314     if not (flags & SRE_FLAG_IGNORECASE):
 315         # look for literal prefix
 316         for op, av in pattern.data:
 317             if op is LITERAL:
 318                 if len(prefix) == prefix_skip:
 319                     prefix_skip = prefix_skip + 1
 320                 prefix.append(av)
 321             elif op is SUBPATTERN and len(av[1]) == 1:
 322                 op, av = av[1][0]
 323                 if op is LITERAL:
 324                     prefix.append(av)
 325                 else:
 326                     break
 327             else:
 328                 break
 329         # if no prefix, look for charset prefix
 330         if not prefix and pattern.data:
 331             op, av = pattern.data[0]
 332             if op is SUBPATTERN and av[1]:
 333                 op, av = av[1][0]
 334                 if op is LITERAL:
 335                     charset.append((op, av))
 336                 elif op is BRANCH:
 337                     c = []
 338                     for p in av[1]:
 339                         if not p:
 340                             break
 341                         op, av = p[0]
 342                         if op is LITERAL:
 343                             c.append((op, av))
 344                         else:
 345                             break
 346                     else:
 347                         charset = c
 348             elif op is BRANCH:
 349                 c = []
 350                 for p in av[1]:
 351                     if not p:
 352                         break
 353                     op, av = p[0]
 354                     if op is LITERAL:
 355                         c.append((op, av))
 356                     else:
 357                         break
 358                 else:
 359                     charset = c
 360             elif op is IN:
 361                 charset = av
 362 ##     if prefix:
 363 ##         print "*** PREFIX", prefix, prefix_skip
 364 ##     if charset:
 365 ##         print "*** CHARSET", charset
 366     # add an info block
 367     emit = code.append
 368     emit(OPCODES[INFO])
 369     skip = len(code); emit(0)
 370     # literal flag
 371     mask = 0
 372     if prefix:
 373         mask = SRE_INFO_PREFIX
 374         if len(prefix) == prefix_skip == len(pattern.data):
 375             mask = mask + SRE_INFO_LITERAL
 376     elif charset:
 377         mask = mask + SRE_INFO_CHARSET
 378     emit(mask)
 379     # pattern length
 380     if lo < MAXCODE:
 381         emit(lo)
 382     else:
 383         emit(MAXCODE)
 384         prefix = prefix[:MAXCODE]
 385     if hi < MAXCODE:
 386         emit(hi)
 387     else:
 388         emit(0)
 389     # add literal prefix
 390     if prefix:
 391         emit(len(prefix)) # length
 392         emit(prefix_skip) # skip
 393         code.extend(prefix)
 394         # generate overlap table
 395         table = [-1] + ([0]*len(prefix))
 396         for i in range(len(prefix)):
 397             table[i+1] = table[i]+1
 398             while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
 399                 table[i+1] = table[table[i+1]-1]+1
 400         code.extend(table[1:]) # don't store first entry
 401     elif charset:
 402         _compile_charset(charset, 0, code)
 403     code[skip] = len(code) - skip
 404
 405 STRING_TYPES = [type("")]
 406
 407 try:
 408     STRING_TYPES.append(type(unicode("")))
 409 except NameError:
 410     pass
 411
 412 def _code(p, flags):
 413
 414     flags = p.pattern.flags | flags
 415     code = []
 416
 417     # compile info block
 418     _compile_info(code, p, flags)
 419
 420     # compile the pattern
 421     _compile(code, p.data, flags)
 422
 423     code.append(OPCODES[SUCCESS])
 424
 425     return code
 426
 427 def compile(p, flags=0):
 428     # internal: convert pattern list to internal format
 429
 430     if type(p) in STRING_TYPES:
 431         import sre_parse
 432         pattern = p
 433         p = sre_parse.parse(p, flags)
 434     else:
 435         pattern = None
 436
 437     code = _code(p, flags)
 438
 439     # print code
 440
 441     # XXX: <fl> get rid of this limitation!
 442     assert p.pattern.groups <= 100,\
 443            "sorry, but this version only supports 100 named groups"
 444
 445     # map in either direction
 446     groupindex = p.pattern.groupdict
 447     indexgroup = [None] * p.pattern.groups
 448     for k, i in groupindex.items():
 449         indexgroup[i] = k
 450
 451     return _sre.compile(
 452         pattern, flags, code,
 453         p.pattern.groups-1,
 454         groupindex, indexgroup
 455         )