3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
34 /* The original code blithely assumed that sizeof(short) == 2. Not
35 * always true. Original instances of "(short)x" were replaced by
36 * SHORT(x), where SHORT is #defined below. */
38 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
40 /* The stack implementation is taken from an idea by Andrew Kuchling.
41 * It's a doubly linked list of arrays. The advantages of this over a
42 * simple linked list are that the number of mallocs required are
43 * reduced. It also makes it possible to statically allocate enough
44 * space so that small patterns don't ever need to call malloc.
46 * The advantages over a single array is that is periodically
47 * realloced when more space is needed is that we avoid ever copying
50 /* item_t is the basic stack element. Defined as a union of
51 * structures so that both registers, failure points, and counters can
52 * be pushed/popped from the stack. There's nothing built into the
53 * item to keep track of whether a certain stack item is a register, a
54 * failure point, or a counter. */
81 #define STACK_PAGE_SIZE 256
82 #define NUM_REGISTERS 256
84 /* A 'page' of stack items. */
86 typedef struct item_page_t
88 item_t items
[STACK_PAGE_SIZE
];
89 struct item_page_t
*prev
;
90 struct item_page_t
*next
;
94 typedef struct match_state
96 /* The number of registers that have been pushed onto the stack
97 * since the last failure point. */
101 /* Used to control when registers need to be pushed onto the
106 /* The number of failure points on the stack. */
110 /* Storage for the registers. Each register consists of two
111 * pointers to characters. So register N is represented as
112 * start[N] and end[N]. The pointers must be converted to
113 * offsets from the beginning of the string before returning the
114 * registers to the calling program. */
116 unsigned char *start
[NUM_REGISTERS
];
117 unsigned char *end
[NUM_REGISTERS
];
119 /* Keeps track of whether a register has changed recently. */
121 int changed
[NUM_REGISTERS
];
123 /* Structure to encapsulate the stack. */
126 /* index into the current page. If index == 0 and you need
127 * to pop an item, move to the previous page and set index
128 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
129 * push a page. If index == STACK_PAGE_SIZE and you need
130 * to push a page move to the next page and set index =
131 * 0. If there is no new next page, allocate a new page
132 * and link it in. Otherwise, increment index to push a
136 item_page_t
*current
; /* Pointer to the current page. */
137 item_page_t first
; /* First page is statically allocated. */
141 /* Initialize a state object */
143 /* #define NEW_STATE(state) \ */
144 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
145 /* state.stack.current = &state.stack.first; \ */
146 /* state.stack.first.prev = NULL; \ */
147 /* state.stack.first.next = NULL; \ */
148 /* state.stack.index = 0; \ */
149 /* state.level = 1 */
151 #define NEW_STATE(state, nregs) \
154 for (i = 0; i < nregs; i++) \
156 state.start[i] = NULL; \
157 state.end[i] = NULL; \
158 state.changed[i] = 0; \
160 state.stack.current = &state.stack.first; \
161 state.stack.first.prev = NULL; \
162 state.stack.first.next = NULL; \
163 state.stack.index = 0; \
170 /* Free any memory that might have been malloc'd */
172 #define FREE_STATE(state) \
173 while(state.stack.first.next != NULL) \
175 state.stack.current = state.stack.first.next; \
176 state.stack.first.next = state.stack.current->next; \
177 free(state.stack.current); \
180 /* Discard the top 'count' stack items. */
182 #define STACK_DISCARD(stack, count, on_error) \
183 stack.index -= count; \
184 while (stack.index < 0) \
186 if (stack.current->prev == NULL) \
188 stack.current = stack.current->prev; \
189 stack.index += STACK_PAGE_SIZE; \
192 /* Store a pointer to the previous item on the stack. Used to pop an
193 * item off of the stack. */
195 #define STACK_PREV(stack, top, on_error) \
196 if (stack.index == 0) \
198 if (stack.current->prev == NULL) \
200 stack.current = stack.current->prev; \
201 stack.index = STACK_PAGE_SIZE - 1; \
207 top = &(stack.current->items[stack.index])
209 /* Store a pointer to the next item on the stack. Used to push an item
210 * on to the stack. */
212 #define STACK_NEXT(stack, top, on_error) \
213 if (stack.index == STACK_PAGE_SIZE) \
215 if (stack.current->next == NULL) \
217 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
218 if (stack.current->next == NULL) \
220 stack.current->next->prev = stack.current; \
221 stack.current->next->next = NULL; \
223 stack.current = stack.current->next; \
226 top = &(stack.current->items[stack.index++])
228 /* Store a pointer to the item that is 'count' items back in the
229 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
230 * STACK_TOP(stack, top, on_error). */
232 #define STACK_BACK(stack, top, count, on_error) \
235 item_page_t *current; \
236 current = stack.current; \
237 index = stack.index - (count); \
240 if (current->prev == NULL) \
242 current = current->prev; \
243 index += STACK_PAGE_SIZE; \
245 top = &(current->items[index]); \
248 /* Store a pointer to the top item on the stack. Execute the
249 * 'on_error' code if there are no items on the stack. */
251 #define STACK_TOP(stack, top, on_error) \
252 if (stack.index == 0) \
254 if (stack.current->prev == NULL) \
256 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
260 top = &(stack.current->items[stack.index - 1]); \
263 /* Test to see if the stack is empty */
265 #define STACK_EMPTY(stack) ((stack.index == 0) && \
266 (stack.current->prev == NULL))
268 /* Return the start of register 'reg' */
270 #define GET_REG_START(state, reg) (state.start[reg])
272 /* Return the end of register 'reg' */
274 #define GET_REG_END(state, reg) (state.end[reg])
276 /* Set the start of register 'reg'. If the state of the register needs
277 * saving, push it on the stack. */
279 #define SET_REG_START(state, reg, text, on_error) \
280 if(state.changed[reg] < state.level) \
283 STACK_NEXT(state.stack, item, on_error); \
284 item->reg.num = reg; \
285 item->reg.start = state.start[reg]; \
286 item->reg.end = state.end[reg]; \
287 item->reg.level = state.changed[reg]; \
288 state.changed[reg] = state.level; \
291 state.start[reg] = text
293 /* Set the end of register 'reg'. If the state of the register needs
294 * saving, push it on the stack. */
296 #define SET_REG_END(state, reg, text, on_error) \
297 if(state.changed[reg] < state.level) \
300 STACK_NEXT(state.stack, item, on_error); \
301 item->reg.num = reg; \
302 item->reg.start = state.start[reg]; \
303 item->reg.end = state.end[reg]; \
304 item->reg.level = state.changed[reg]; \
305 state.changed[reg] = state.level; \
308 state.end[reg] = text
310 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
313 STACK_NEXT(state.stack, item, on_error); \
314 item->fail.code = xcode; \
315 item->fail.text = xtext; \
316 item->fail.count = state.count; \
317 item->fail.level = state.level; \
318 item->fail.phantom = 0; \
324 /* Update the last failure point with a new position in the text. */
326 #define UPDATE_FAILURE(state, xtext, on_error) \
329 STACK_BACK(state.stack, item, state.count + 1, on_error); \
330 if (!item->fail.phantom) \
333 STACK_NEXT(state.stack, item2, on_error); \
334 item2->fail.code = item->fail.code; \
335 item2->fail.text = xtext; \
336 item2->fail.count = state.count; \
337 item2->fail.level = state.level; \
338 item2->fail.phantom = 1; \
345 STACK_DISCARD(state.stack, state.count, on_error); \
346 STACK_TOP(state.stack, item, on_error); \
347 item->fail.text = xtext; \
353 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
358 while(state.count > 0) \
360 STACK_PREV(state.stack, item, on_error); \
361 state.start[item->reg.num] = item->reg.start; \
362 state.end[item->reg.num] = item->reg.end; \
363 state.changed[item->reg.num] = item->reg.level; \
366 STACK_PREV(state.stack, item, on_empty); \
367 xcode = item->fail.code; \
368 xtext = item->fail.text; \
369 state.count = item->fail.count; \
370 state.level = item->fail.level; \
373 while (item->fail.text == NULL); \
376 enum regexp_compiled_ops
/* opcodes for compiled regexp */
378 Cend
, /* end of pattern reached */
379 Cbol
, /* beginning of line */
380 Ceol
, /* end of line */
381 Cset
, /* character set. Followed by 32 bytes of set. */
382 Cexact
, /* followed by a byte to match */
383 Canychar
, /* matches any character except newline */
384 Cstart_memory
, /* set register start addr (followed by reg number) */
385 Cend_memory
, /* set register end addr (followed by reg number) */
386 Cmatch_memory
, /* match a duplicate of reg contents (regnum follows)*/
387 Cjump
, /* followed by two bytes (lsb,msb) of displacement. */
388 Cstar_jump
, /* will change to jump/update_failure_jump at runtime */
389 Cfailure_jump
, /* jump to addr on failure */
390 Cupdate_failure_jump
, /* update topmost failure point and jump */
391 Cdummy_failure_jump
, /* push a dummy failure point and jump */
392 Cbegbuf
, /* match at beginning of buffer */
393 Cendbuf
, /* match at end of buffer */
394 Cwordbeg
, /* match at beginning of word */
395 Cwordend
, /* match at end of word */
396 Cwordbound
, /* match if at word boundary */
397 Cnotwordbound
, /* match if not at word boundary */
398 Csyntaxspec
, /* matches syntax code (1 byte follows) */
399 Cnotsyntaxspec
, /* matches if syntax code does not match (1 byte follows) */
403 enum regexp_syntax_op
/* syntax codes for plain and quoted characters */
405 Rend
, /* special code for end of regexp */
406 Rnormal
, /* normal character */
407 Ranychar
, /* any character except newline */
408 Rquote
, /* the quote character */
409 Rbol
, /* match beginning of line */
410 Reol
, /* match end of line */
411 Roptional
, /* match preceding expression optionally */
412 Rstar
, /* match preceding expr zero or more times */
413 Rplus
, /* match preceding expr one or more times */
414 Ror
, /* match either of alternatives */
415 Ropenpar
, /* opening parenthesis */
416 Rclosepar
, /* closing parenthesis */
417 Rmemory
, /* match memory register */
418 Rextended_memory
, /* \vnn to match registers 10-99 */
419 Ropenset
, /* open set. Internal syntax hard-coded below. */
420 /* the following are gnu extensions to "normal" regexp syntax */
421 Rbegbuf
, /* beginning of buffer */
422 Rendbuf
, /* end of buffer */
423 Rwordchar
, /* word character */
424 Rnotwordchar
, /* not word character */
425 Rwordbeg
, /* beginning of word */
426 Rwordend
, /* end of word */
427 Rwordbound
, /* word bound */
428 Rnotwordbound
, /* not word bound */
432 static int re_compile_initialized
= 0;
433 static int regexp_syntax
= 0;
434 int re_syntax
= 0; /* Exported copy of regexp_syntax */
435 static unsigned char regexp_plain_ops
[256];
436 static unsigned char regexp_quoted_ops
[256];
437 static unsigned char regexp_precedences
[Rnum_ops
];
438 static int regexp_context_indep_ops
;
439 static int regexp_ansi_sequences
;
441 #define NUM_LEVELS 5 /* number of precedence levels in use */
442 #define MAX_NESTING 100 /* max nesting level of operators */
444 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
446 unsigned char re_syntax_table
[256];
448 void re_compile_initialize(void)
452 static int syntax_table_inited
= 0;
454 if (!syntax_table_inited
)
456 syntax_table_inited
= 1;
457 memset(re_syntax_table
, 0, 256);
458 for (a
= 'a'; a
<= 'z'; a
++)
459 re_syntax_table
[a
] = Sword
;
460 for (a
= 'A'; a
<= 'Z'; a
++)
461 re_syntax_table
[a
] = Sword
;
462 for (a
= '0'; a
<= '9'; a
++)
463 re_syntax_table
[a
] = Sword
| Sdigit
| Shexdigit
;
464 for (a
= '0'; a
<= '7'; a
++)
465 re_syntax_table
[a
] |= Soctaldigit
;
466 for (a
= 'A'; a
<= 'F'; a
++)
467 re_syntax_table
[a
] |= Shexdigit
;
468 for (a
= 'a'; a
<= 'f'; a
++)
469 re_syntax_table
[a
] |= Shexdigit
;
470 re_syntax_table
['_'] = Sword
;
471 for (a
= 9; a
<= 13; a
++)
472 re_syntax_table
[a
] = Swhitespace
;
473 re_syntax_table
[' '] = Swhitespace
;
475 re_compile_initialized
= 1;
476 for (a
= 0; a
< 256; a
++)
478 regexp_plain_ops
[a
] = Rnormal
;
479 regexp_quoted_ops
[a
] = Rnormal
;
481 for (a
= '0'; a
<= '9'; a
++)
482 regexp_quoted_ops
[a
] = Rmemory
;
483 regexp_plain_ops
['\134'] = Rquote
;
484 if (regexp_syntax
& RE_NO_BK_PARENS
)
486 regexp_plain_ops
['('] = Ropenpar
;
487 regexp_plain_ops
[')'] = Rclosepar
;
491 regexp_quoted_ops
['('] = Ropenpar
;
492 regexp_quoted_ops
[')'] = Rclosepar
;
494 if (regexp_syntax
& RE_NO_BK_VBAR
)
495 regexp_plain_ops
['\174'] = Ror
;
497 regexp_quoted_ops
['\174'] = Ror
;
498 regexp_plain_ops
['*'] = Rstar
;
499 if (regexp_syntax
& RE_BK_PLUS_QM
)
501 regexp_quoted_ops
['+'] = Rplus
;
502 regexp_quoted_ops
['?'] = Roptional
;
506 regexp_plain_ops
['+'] = Rplus
;
507 regexp_plain_ops
['?'] = Roptional
;
509 if (regexp_syntax
& RE_NEWLINE_OR
)
510 regexp_plain_ops
['\n'] = Ror
;
511 regexp_plain_ops
['\133'] = Ropenset
;
512 regexp_plain_ops
['\136'] = Rbol
;
513 regexp_plain_ops
['$'] = Reol
;
514 regexp_plain_ops
['.'] = Ranychar
;
515 if (!(regexp_syntax
& RE_NO_GNU_EXTENSIONS
))
517 regexp_quoted_ops
['w'] = Rwordchar
;
518 regexp_quoted_ops
['W'] = Rnotwordchar
;
519 regexp_quoted_ops
['<'] = Rwordbeg
;
520 regexp_quoted_ops
['>'] = Rwordend
;
521 regexp_quoted_ops
['b'] = Rwordbound
;
522 regexp_quoted_ops
['B'] = Rnotwordbound
;
523 regexp_quoted_ops
['`'] = Rbegbuf
;
524 regexp_quoted_ops
['\''] = Rendbuf
;
526 if (regexp_syntax
& RE_ANSI_HEX
)
527 regexp_quoted_ops
['v'] = Rextended_memory
;
528 for (a
= 0; a
< Rnum_ops
; a
++)
529 regexp_precedences
[a
] = 4;
530 if (regexp_syntax
& RE_TIGHT_VBAR
)
532 regexp_precedences
[Ror
] = 3;
533 regexp_precedences
[Rbol
] = 2;
534 regexp_precedences
[Reol
] = 2;
538 regexp_precedences
[Ror
] = 2;
539 regexp_precedences
[Rbol
] = 3;
540 regexp_precedences
[Reol
] = 3;
542 regexp_precedences
[Rclosepar
] = 1;
543 regexp_precedences
[Rend
] = 0;
544 regexp_context_indep_ops
= (regexp_syntax
& RE_CONTEXT_INDEP_OPS
) != 0;
545 regexp_ansi_sequences
= (regexp_syntax
& RE_ANSI_HEX
) != 0;
548 int re_set_syntax(int syntax
)
553 regexp_syntax
= syntax
;
554 re_syntax
= syntax
; /* Exported copy */
555 re_compile_initialize();
559 static int hex_char_to_decimal(int ch
)
561 if (ch
>= '0' && ch
<= '9')
563 if (ch
>= 'a' && ch
<= 'f')
564 return ch
- 'a' + 10;
565 if (ch
>= 'A' && ch
<= 'F')
566 return ch
- 'A' + 10;
570 static void re_compile_fastmap_aux(unsigned char *code
, int pos
,
571 unsigned char *visited
,
572 unsigned char *can_be_null
,
573 unsigned char *fastmap
)
580 return; /* we have already been here */
583 switch (code
[pos
++]) {
597 for (a
= 0; a
< 256; a
++)
603 syntaxcode
= code
[pos
++];
604 for (a
= 0; a
< 256; a
++)
605 if (SYNTAX(a
) & syntaxcode
)
611 syntaxcode
= code
[pos
++];
612 for (a
= 0; a
< 256; a
++)
613 if (!(SYNTAX(a
) & syntaxcode
) )
620 if (*can_be_null
== 0)
621 *can_be_null
= 2; /* can match null, but only at end of buffer*/
626 for (a
= 0; a
< 256/8; a
++)
627 if (code
[pos
+ a
] != 0)
628 for (b
= 0; b
< 8; b
++)
629 if (code
[pos
+ a
] & (1 << b
))
630 fastmap
[(a
<< 3) + b
] = 1;
636 fastmap
[(unsigned char)code
[pos
]] = 1;
641 for (a
= 0; a
< 256; a
++)
654 for (a
= 0; a
< 256; a
++)
660 case Cdummy_failure_jump
:
661 case Cupdate_failure_jump
:
664 a
= (unsigned char)code
[pos
++];
665 a
|= (unsigned char)code
[pos
++] << 8;
666 pos
+= (int)SHORT(a
);
669 /* argh... the regexp contains empty loops. This is not
670 good, as this may cause a failure stack overflow when
671 matching. Oh well. */
672 /* this path leads nowhere; pursue other paths. */
680 a
= (unsigned char)code
[pos
++];
681 a
|= (unsigned char)code
[pos
++] << 8;
682 a
= pos
+ (int)SHORT(a
);
683 re_compile_fastmap_aux(code
, a
, visited
, can_be_null
, fastmap
);
693 PyErr_SetString(PyExc_SystemError
, "Unknown regex opcode: memory corrupted?");
700 static int re_do_compile_fastmap(unsigned char *buffer
, int used
, int pos
,
701 unsigned char *can_be_null
,
702 unsigned char *fastmap
)
704 unsigned char small_visited
[512], *visited
;
706 if (used
<= sizeof(small_visited
))
707 visited
= small_visited
;
710 visited
= malloc(used
);
715 memset(fastmap
, 0, 256);
716 memset(visited
, 0, used
);
717 re_compile_fastmap_aux(buffer
, pos
, visited
, can_be_null
, fastmap
);
718 if (visited
!= small_visited
)
723 void re_compile_fastmap(regexp_t bufp
)
725 if (!bufp
->fastmap
|| bufp
->fastmap_accurate
)
727 assert(bufp
->used
> 0);
728 if (!re_do_compile_fastmap(bufp
->buffer
,
734 if (PyErr_Occurred()) return;
735 if (bufp
->buffer
[0] == Cbol
)
736 bufp
->anchor
= 1; /* begline */
738 if (bufp
->buffer
[0] == Cbegbuf
)
739 bufp
->anchor
= 2; /* begbuf */
741 bufp
->anchor
= 0; /* none */
742 bufp
->fastmap_accurate
= 1;
748 * ... code for operand of star
750 * 2: ... code after star
752 * We change the star_jump to update_failure_jump if we can determine
753 * that it is safe to do so; otherwise we change it to an ordinary
760 * 2: ... code for operand of plus
762 * 3: ... code after plus
764 * For star_jump considerations this is processed identically to star.
768 static int re_optimize_star_jump(regexp_t bufp
, unsigned char *code
)
770 unsigned char map
[256];
771 unsigned char can_be_null
;
777 int num_instructions
= 0;
779 a
= (unsigned char)*code
++;
780 a
|= (unsigned char)*code
++ << 8;
783 p1
= code
+ a
+ 3; /* skip the failure_jump */
784 /* Check that the jump is within the pattern */
785 if (p1
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
<p1
)
787 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (failure_jump opt)");
791 assert(p1
[-3] == Cfailure_jump
);
793 /* p1 points inside loop, p2 points to after loop */
794 if (!re_do_compile_fastmap(bufp
->buffer
, bufp
->used
,
795 (int)(p2
- bufp
->buffer
),
797 goto make_normal_jump
;
799 /* If we might introduce a new update point inside the
800 * loop, we can't optimize because then update_jump would
801 * update a wrong failure point. Thus we have to be
802 * quite careful here.
805 /* loop until we find something that consumes a character */
829 ch
= (unsigned char)*p1
++;
831 goto make_normal_jump
;
836 for (b
= 0; b
< 256; b
++)
837 if (b
!= '\n' && map
[b
])
838 goto make_normal_jump
;
843 for (b
= 0; b
< 256; b
++)
844 if ((p1
[b
>> 3] & (1 << (b
& 7))) && map
[b
])
845 goto make_normal_jump
;
851 goto make_normal_jump
;
854 /* now we know that we can't backtrack. */
894 case Cupdate_failure_jump
:
895 case Cdummy_failure_jump
:
897 goto make_normal_jump
;
906 /* make_update_jump: */
908 a
+= 3; /* jump to after the Cfailure_jump */
909 code
[0] = Cupdate_failure_jump
;
912 if (num_instructions
> 1)
914 assert(num_instructions
== 1);
915 /* if the only instruction matches a single character, we can do
917 p1
= code
+ 3 + a
; /* start of sole instruction */
918 if (*p1
== Cset
|| *p1
== Cexact
|| *p1
== Canychar
||
919 *p1
== Csyntaxspec
|| *p1
== Cnotsyntaxspec
)
929 static int re_optimize(regexp_t bufp
)
972 if (!re_optimize_star_jump(bufp
, code
))
978 case Cupdate_failure_jump
:
980 case Cdummy_failure_jump
:
995 #define NEXTCHAR(var) \
998 goto ends_prematurely; \
999 (var) = regex[pos]; \
1003 #define ALLOC(amount) \
1005 if (pattern_offset+(amount) > alloc) \
1007 alloc += 256 + (amount); \
1008 pattern = realloc(pattern, alloc); \
1010 goto out_of_memory; \
1014 #define STORE(ch) pattern[pattern_offset++] = (ch)
1016 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
1018 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
1020 #define PUSH_LEVEL_STARTS \
1021 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
1022 starts_base += NUM_LEVELS; \
1026 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
1028 #define PUT_ADDR(offset,addr) \
1030 int disp = (addr) - (offset) - 2; \
1031 pattern[(offset)] = disp & 0xff; \
1032 pattern[(offset)+1] = (disp>>8) & 0xff; \
1035 #define INSERT_JUMP(pos,type,addr) \
1037 int a, p = (pos), t = (type), ad = (addr); \
1038 for (a = pattern_offset - 1; a >= p; a--) \
1039 pattern[a + 3] = pattern[a]; \
1042 pattern_offset += 3; \
1045 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
1047 #define SET_FIELDS \
1049 bufp->allocated = alloc; \
1050 bufp->buffer = pattern; \
1051 bufp->used = pattern_offset; \
1054 #define GETHEX(var) \
1056 unsigned char gethex_ch, gethex_value; \
1057 NEXTCHAR(gethex_ch); \
1058 gethex_value = hex_char_to_decimal(gethex_ch); \
1059 if (gethex_value == 16) \
1061 NEXTCHAR(gethex_ch); \
1062 gethex_ch = hex_char_to_decimal(gethex_ch); \
1063 if (gethex_ch == 16) \
1065 (var) = gethex_value * 16 + gethex_ch; \
1068 #define ANSI_TRANSLATE(ch) \
1075 ch = 7; /* audible bell */ \
1081 ch = 8; /* backspace */ \
1087 ch = 12; /* form feed */ \
1093 ch = 10; /* line feed */ \
1099 ch = 13; /* carriage return */ \
1111 ch = 11; /* vertical tab */ \
1114 case 'x': /* hex code */ \
1122 /* other characters passed through */ \
1124 ch = translate[(unsigned char)ch]; \
1130 char *re_compile_pattern(unsigned char *regex
, int size
, regexp_t bufp
)
1138 int pattern_offset
= 0, alloc
;
1139 int starts
[NUM_LEVELS
* MAX_NESTING
];
1141 int future_jumps
[MAX_NESTING
];
1143 unsigned char ch
= '\0';
1144 unsigned char *pattern
;
1145 unsigned char *translate
;
1148 int num_open_registers
;
1149 int open_registers
[RE_NREGS
];
1150 int beginning_context
;
1152 if (!re_compile_initialized
)
1153 re_compile_initialize();
1155 bufp
->fastmap_accurate
= 0;
1156 bufp
->uses_registers
= 1;
1157 bufp
->num_registers
= 1;
1158 translate
= bufp
->translate
;
1159 pattern
= bufp
->buffer
;
1160 alloc
= bufp
->allocated
;
1161 if (alloc
== 0 || pattern
== NULL
)
1164 pattern
= malloc(alloc
);
1173 num_open_registers
= 0;
1176 beginning_context
= 1;
1178 /* we use Rend dummy to ensure that pending jumps are updated
1179 (due to low priority of Rend) before exiting the loop. */
1189 ch
= translate
[(unsigned char)ch
];
1190 op
= regexp_plain_ops
[(unsigned char)ch
];
1194 op
= regexp_quoted_ops
[(unsigned char)ch
];
1195 if (op
== Rnormal
&& regexp_ansi_sequences
)
1199 level
= regexp_precedences
[op
];
1200 /* printf("ch='%c' op=%d level=%d current_level=%d
1201 curlevstart=%d\n", ch, op, level, current_level,
1202 CURRENT_LEVEL_START); */
1203 if (level
> current_level
)
1205 for (current_level
++; current_level
< level
; current_level
++)
1210 if (level
< current_level
)
1212 current_level
= level
;
1213 for (;num_jumps
> 0 &&
1214 future_jumps
[num_jumps
-1] >= CURRENT_LEVEL_START
;
1216 PUT_ADDR(future_jumps
[num_jumps
-1], pattern_offset
);
1228 store_opcode_and_arg
: /* opcode & ch must be set */
1251 if (!beginning_context
) {
1252 if (regexp_context_indep_ops
)
1262 if (!((pos
>= size
) ||
1263 ((regexp_syntax
& RE_NO_BK_VBAR
) ?
1264 (regex
[pos
] == '\174') :
1265 (pos
+1 < size
&& regex
[pos
] == '\134' &&
1266 regex
[pos
+1] == '\174')) ||
1267 ((regexp_syntax
& RE_NO_BK_PARENS
)?
1268 (regex
[pos
] == ')'):
1269 (pos
+1 < size
&& regex
[pos
] == '\134' &&
1270 regex
[pos
+1] == ')')))) {
1271 if (regexp_context_indep_ops
)
1283 if (beginning_context
) {
1284 if (regexp_context_indep_ops
)
1289 if (CURRENT_LEVEL_START
== pattern_offset
)
1290 break; /* ignore empty patterns for ? */
1292 INSERT_JUMP(CURRENT_LEVEL_START
, Cfailure_jump
,
1293 pattern_offset
+ 3);
1299 if (beginning_context
) {
1300 if (regexp_context_indep_ops
)
1305 if (CURRENT_LEVEL_START
== pattern_offset
)
1306 break; /* ignore empty patterns for + and * */
1308 INSERT_JUMP(CURRENT_LEVEL_START
, Cfailure_jump
,
1309 pattern_offset
+ 6);
1310 INSERT_JUMP(pattern_offset
, Cstar_jump
, CURRENT_LEVEL_START
);
1311 if (op
== Rplus
) /* jump over initial failure_jump */
1312 INSERT_JUMP(CURRENT_LEVEL_START
, Cdummy_failure_jump
,
1313 CURRENT_LEVEL_START
+ 6);
1319 INSERT_JUMP(CURRENT_LEVEL_START
, Cfailure_jump
,
1320 pattern_offset
+ 6);
1321 if (num_jumps
>= MAX_NESTING
)
1324 future_jumps
[num_jumps
++] = pattern_offset
;
1333 if (next_register
< RE_NREGS
)
1335 bufp
->uses_registers
= 1;
1337 STORE(Cstart_memory
);
1338 STORE(next_register
);
1339 open_registers
[num_open_registers
++] = next_register
;
1340 bufp
->num_registers
++;
1351 if (paren_depth
<= 0)
1352 goto parenthesis_error
;
1354 current_level
= regexp_precedences
[Ropenpar
];
1356 if (paren_depth
< num_open_registers
)
1358 bufp
->uses_registers
= 1;
1361 num_open_registers
--;
1362 STORE(open_registers
[num_open_registers
]);
1369 goto bad_match_register
;
1370 assert(ch
>= '0' && ch
<= '9');
1371 bufp
->uses_registers
= 1;
1372 opcode
= Cmatch_memory
;
1374 goto store_opcode_and_arg
;
1376 case Rextended_memory
:
1379 if (ch
< '0' || ch
> '9')
1380 goto bad_match_register
;
1382 if (a
< '0' || a
> '9')
1383 goto bad_match_register
;
1384 ch
= 10 * (a
- '0') + ch
- '0';
1385 if (ch
== 0 || ch
>= RE_NREGS
)
1386 goto bad_match_register
;
1387 bufp
->uses_registers
= 1;
1388 opcode
= Cmatch_memory
;
1389 goto store_opcode_and_arg
;
1402 offset
= pattern_offset
;
1403 for (a
= 0; a
< 256/8; a
++)
1407 ch
= translate
[(unsigned char)ch
];
1413 ch
= translate
[(unsigned char)ch
];
1420 while (ch
!= '\135' || firstchar
)
1423 if (regexp_ansi_sequences
&& ch
== '\134')
1430 for (a
= prev
; a
<= (int)ch
; a
++)
1431 SETBIT(pattern
, offset
, a
);
1436 if (prev
!= -1 && ch
== '-')
1440 SETBIT(pattern
, offset
, ch
);
1445 ch
= translate
[(unsigned char)ch
];
1448 SETBIT(pattern
, offset
, '-');
1451 for (a
= 0; a
< 256/8; a
++)
1452 pattern
[offset
+a
] ^= 0xff;
1468 opcode
= Csyntaxspec
;
1470 goto store_opcode_and_arg
;
1474 opcode
= Cnotsyntaxspec
;
1476 goto store_opcode_and_arg
;
1490 opcode
= Cwordbound
;
1495 opcode
= Cnotwordbound
;
1503 beginning_context
= (op
== Ropenpar
|| op
== Ror
);
1505 if (starts_base
!= 0)
1506 goto parenthesis_error
;
1507 assert(num_jumps
== 0);
1511 if(!re_optimize(bufp
))
1512 return "Optimization error";
1517 return "Badly placed special character";
1521 return "Bad match register number";
1525 return "Bad hexadecimal number";
1529 return "Badly placed parenthesis";
1533 return "Out of memory";
1537 return "Regular expression ends prematurely";
1541 return "Regular expression too complex";
1549 #undef CURRENT_LEVEL_START
1550 #undef SET_LEVEL_START
1551 #undef PUSH_LEVEL_STARTS
1552 #undef POP_LEVEL_STARTS
1558 #define PREFETCH if (text == textend) goto fail
1560 #define NEXTCHAR(var) \
1562 var = (unsigned char)*text++; \
1564 var = translate[var]
1566 int re_match(regexp_t bufp
, unsigned char *string
, int size
, int pos
,
1567 regexp_registers_t old_regs
)
1569 unsigned char *code
;
1570 unsigned char *translate
;
1571 unsigned char *text
;
1572 unsigned char *textstart
;
1573 unsigned char *textend
;
1579 unsigned char *regstart
;
1580 unsigned char *regend
;
1584 assert(pos
>= 0 && size
>= 0);
1585 assert(pos
<= size
);
1587 text
= string
+ pos
;
1589 textend
= string
+ size
;
1591 code
= bufp
->buffer
;
1593 translate
= bufp
->translate
;
1595 NEW_STATE(state
, bufp
->num_registers
);
1602 match_end
= text
- textstart
;
1605 old_regs
->start
[0] = pos
;
1606 old_regs
->end
[0] = match_end
;
1607 if (!bufp
->uses_registers
)
1609 for (a
= 1; a
< RE_NREGS
; a
++)
1611 old_regs
->start
[a
] = -1;
1612 old_regs
->end
[a
] = -1;
1617 for (a
= 1; a
< bufp
->num_registers
; a
++)
1619 if ((GET_REG_START(state
, a
) == NULL
) ||
1620 (GET_REG_END(state
, a
) == NULL
))
1622 old_regs
->start
[a
] = -1;
1623 old_regs
->end
[a
] = -1;
1626 old_regs
->start
[a
] = GET_REG_START(state
, a
) - textstart
;
1627 old_regs
->end
[a
] = GET_REG_END(state
, a
) - textstart
;
1629 for (; a
< RE_NREGS
; a
++)
1631 old_regs
->start
[a
] = -1;
1632 old_regs
->end
[a
] = -1;
1637 return match_end
- pos
;
1641 if (text
== textstart
|| text
[-1] == '\n')
1642 goto continue_matching
;
1647 if (text
== textend
|| *text
== '\n')
1648 goto continue_matching
;
1654 if (code
[ch
/8] & (1<<(ch
& 7)))
1657 goto continue_matching
;
1664 if (ch
!= (unsigned char)*code
++)
1666 goto continue_matching
;
1673 goto continue_matching
;
1678 SET_REG_START(state
, reg
, text
, goto error
);
1679 goto continue_matching
;
1684 SET_REG_END(state
, reg
, text
, goto error
);
1685 goto continue_matching
;
1690 regstart
= GET_REG_START(state
, reg
);
1691 regend
= GET_REG_END(state
, reg
);
1692 if ((regstart
== NULL
) || (regend
== NULL
))
1693 goto fail
; /* or should we just match nothing? */
1694 regsize
= regend
- regstart
;
1696 if (regsize
> (textend
- text
))
1700 for (; regstart
< regend
; regstart
++, text
++)
1701 if (translate
[*regstart
] != translate
[*text
])
1705 for (; regstart
< regend
; regstart
++, text
++)
1706 if (*regstart
!= *text
)
1708 goto continue_matching
;
1710 case Cupdate_failure_jump
:
1712 UPDATE_FAILURE(state
, text
, goto error
);
1713 /* fall to next case */
1715 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1719 a
= (unsigned char)*code
++;
1720 a
|= (unsigned char)*code
++ << 8;
1721 code
+= (int)SHORT(a
);
1722 if (code
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
<code
) {
1723 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Cjump)");
1727 goto continue_matching
;
1729 case Cdummy_failure_jump
:
1731 unsigned char *failuredest
;
1733 a
= (unsigned char)*code
++;
1734 a
|= (unsigned char)*code
++ << 8;
1736 assert(*code
== Cfailure_jump
);
1737 b
= (unsigned char)code
[1];
1738 b
|= (unsigned char)code
[2] << 8;
1739 failuredest
= code
+ (int)SHORT(b
) + 3;
1740 if (failuredest
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
< failuredest
) {
1741 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1745 PUSH_FAILURE(state
, failuredest
, NULL
, goto error
);
1747 if (code
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
< code
) {
1748 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Cdummy_failure_jump code)");
1752 goto continue_matching
;
1756 a
= (unsigned char)*code
++;
1757 a
|= (unsigned char)*code
++ << 8;
1759 if (code
+a
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
< code
+a
) {
1760 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Cfailure_jump)");
1764 PUSH_FAILURE(state
, code
+ a
, text
, goto error
);
1765 goto continue_matching
;
1769 unsigned char *pinst
;
1770 a
= (unsigned char)*code
++;
1771 a
|= (unsigned char)*code
++ << 8;
1774 if (pinst
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
<pinst
) {
1775 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Crepeat1)");
1779 /* pinst is sole instruction in loop, and it matches a
1780 * single character. Since Crepeat1 was originally a
1781 * Cupdate_failure_jump, we also know that backtracking
1782 * is useless: so long as the single-character
1783 * expression matches, it must be used. Also, in the
1784 * case of +, we've already matched one character, so +
1785 * can't fail: nothing here can cause a failure. */
1792 while (text
< textend
)
1794 ch
= translate
[(unsigned char)*text
];
1795 if (pinst
[ch
/8] & (1<<(ch
& 7)))
1803 while (text
< textend
)
1805 ch
= (unsigned char)*text
;
1806 if (pinst
[ch
/8] & (1<<(ch
& 7)))
1816 ch
= (unsigned char)*pinst
;
1819 while (text
< textend
&&
1820 translate
[(unsigned char)*text
] == ch
)
1825 while (text
< textend
&& (unsigned char)*text
== ch
)
1832 while (text
< textend
&& (unsigned char)*text
!= '\n')
1838 a
= (unsigned char)*pinst
;
1841 while (text
< textend
&&
1842 (SYNTAX(translate
[*text
]) & a
) )
1847 while (text
< textend
&& (SYNTAX(*text
) & a
) )
1852 case Cnotsyntaxspec
:
1854 a
= (unsigned char)*pinst
;
1857 while (text
< textend
&&
1858 !(SYNTAX(translate
[*text
]) & a
) )
1863 while (text
< textend
&& !(SYNTAX(*text
) & a
) )
1871 PyErr_SetString(PyExc_SystemError
, "Unknown regex opcode: memory corrupted?");
1876 /* due to the funky way + and * are compiled, the top
1877 * failure- stack entry at this point is actually a
1878 * success entry -- update it & pop it */
1879 UPDATE_FAILURE(state
, text
, goto error
);
1880 goto fail
; /* i.e., succeed <wink/sigh> */
1884 if (text
== textstart
)
1885 goto continue_matching
;
1890 if (text
== textend
)
1891 goto continue_matching
;
1896 if (text
== textend
)
1898 if (!(SYNTAX(*text
) & Sword
))
1900 if (text
== textstart
)
1901 goto continue_matching
;
1902 if (!(SYNTAX(text
[-1]) & Sword
))
1903 goto continue_matching
;
1908 if (text
== textstart
)
1910 if (!(SYNTAX(text
[-1]) & Sword
))
1912 if (text
== textend
)
1913 goto continue_matching
;
1914 if (!(SYNTAX(*text
) & Sword
))
1915 goto continue_matching
;
1920 /* Note: as in gnu regexp, this also matches at the
1921 * beginning and end of buffer. */
1923 if (text
== textstart
|| text
== textend
)
1924 goto continue_matching
;
1925 if ((SYNTAX(text
[-1]) & Sword
) ^ (SYNTAX(*text
) & Sword
))
1926 goto continue_matching
;
1931 /* Note: as in gnu regexp, this never matches at the
1932 * beginning and end of buffer. */
1933 if (text
== textstart
|| text
== textend
)
1935 if (!((SYNTAX(text
[-1]) & Sword
) ^ (SYNTAX(*text
) & Sword
)))
1936 goto continue_matching
;
1942 if (!(SYNTAX(ch
) & (unsigned char)*code
++))
1944 goto continue_matching
;
1946 case Cnotsyntaxspec
:
1949 if (SYNTAX(ch
) & (unsigned char)*code
++)
1951 goto continue_matching
;
1956 PyErr_SetString(PyExc_SystemError
, "Unknown regex opcode: memory corrupted?");
1964 #if 0 /* This line is never reached --Guido */
1971 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1973 POP_FAILURE(state
, code
, text
, goto done_matching
, goto error
);
1974 goto continue_matching
;
1977 /* if(translated != NULL) */
1978 /* free(translated); */
1983 /* if (translated != NULL) */
1984 /* free(translated); */
1993 int re_search(regexp_t bufp
, unsigned char *string
, int size
, int pos
,
1994 int range
, regexp_registers_t regs
)
1996 unsigned char *fastmap
;
1997 unsigned char *translate
;
1998 unsigned char *text
;
1999 unsigned char *partstart
;
2000 unsigned char *partend
;
2003 unsigned char anchor
;
2005 assert(size
>= 0 && pos
>= 0);
2006 assert(pos
+ range
>= 0 && pos
+ range
<= size
); /* Bugfix by ylo */
2008 fastmap
= bufp
->fastmap
;
2009 translate
= bufp
->translate
;
2010 if (fastmap
&& !bufp
->fastmap_accurate
) {
2011 re_compile_fastmap(bufp
);
2012 if (PyErr_Occurred()) return -2;
2015 anchor
= bufp
->anchor
;
2016 if (bufp
->can_be_null
== 1) /* can_be_null == 2: can match null at eob */
2034 for (; range
>= 0; range
--, pos
+= dir
)
2039 { /* searching forwards */
2041 text
= string
+ pos
;
2042 partend
= string
+ size
;
2045 while (text
!= partend
&&
2046 !fastmap
[(unsigned char) translate
[(unsigned char)*text
]])
2049 while (text
!= partend
&& !fastmap
[(unsigned char)*text
])
2051 pos
+= text
- partstart
;
2052 range
-= text
- partstart
;
2053 if (pos
== size
&& bufp
->can_be_null
== 0)
2057 { /* searching backwards */
2058 text
= string
+ pos
;
2059 partstart
= string
+ pos
- range
;
2062 while (text
!= partstart
&&
2063 !fastmap
[(unsigned char)
2064 translate
[(unsigned char)*text
]])
2067 while (text
!= partstart
&&
2068 !fastmap
[(unsigned char)*text
])
2070 pos
-= partend
- text
;
2071 range
-= partend
- text
;
2075 { /* anchored to begline */
2076 if (pos
> 0 && (string
[pos
- 1] != '\n'))
2079 assert(pos
>= 0 && pos
<= size
);
2080 ret
= re_match(bufp
, string
, size
, pos
, regs
);
2092 ** c-file-style: "python"