3 * Author: Tatu Ylonen <ylo@ngs.fi>
5 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
7 * Permission to use, copy, modify, distribute, and sell this software
8 * and its documentation for any purpose is hereby granted without
9 * fee, provided that the above copyright notice appear in all copies.
10 * This software is provided "as is" without express or implied
13 * Created: Thu Sep 26 17:14:05 1991 ylo
14 * Last modified: Mon Nov 4 17:06:48 1991 ylo
15 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
17 * This code draws many ideas from the regular expression packages by
18 * Henry Spencer of the University of Toronto and Richard Stallman of
19 * the Free Software Foundation.
21 * Emacs-specific code and syntax table code is almost directly borrowed
24 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
25 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
26 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
27 * probably one or two others that I'm forgetting.
35 /* The original code blithely assumed that sizeof(short) == 2. Not
36 * always true. Original instances of "(short)x" were replaced by
37 * SHORT(x), where SHORT is #defined below. */
39 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
41 /* The stack implementation is taken from an idea by Andrew Kuchling.
42 * It's a doubly linked list of arrays. The advantages of this over a
43 * simple linked list are that the number of mallocs required are
44 * reduced. It also makes it possible to statically allocate enough
45 * space so that small patterns don't ever need to call malloc.
47 * The advantages over a single array is that is periodically
48 * realloced when more space is needed is that we avoid ever copying
51 /* item_t is the basic stack element. Defined as a union of
52 * structures so that both registers, failure points, and counters can
53 * be pushed/popped from the stack. There's nothing built into the
54 * item to keep track of whether a certain stack item is a register, a
55 * failure point, or a counter. */
82 #define STACK_PAGE_SIZE 256
83 #define NUM_REGISTERS 256
85 /* A 'page' of stack items. */
87 typedef struct item_page_t
89 item_t items
[STACK_PAGE_SIZE
];
90 struct item_page_t
*prev
;
91 struct item_page_t
*next
;
95 typedef struct match_state
97 /* The number of registers that have been pushed onto the stack
98 * since the last failure point. */
102 /* Used to control when registers need to be pushed onto the
107 /* The number of failure points on the stack. */
111 /* Storage for the registers. Each register consists of two
112 * pointers to characters. So register N is represented as
113 * start[N] and end[N]. The pointers must be converted to
114 * offsets from the beginning of the string before returning the
115 * registers to the calling program. */
117 unsigned char *start
[NUM_REGISTERS
];
118 unsigned char *end
[NUM_REGISTERS
];
120 /* Keeps track of whether a register has changed recently. */
122 int changed
[NUM_REGISTERS
];
124 /* Structure to encapsulate the stack. */
127 /* index into the current page. If index == 0 and you need
128 * to pop an item, move to the previous page and set index
129 * = STACK_PAGE_SIZE - 1. Otherwise decrement index to
130 * push a page. If index == STACK_PAGE_SIZE and you need
131 * to push a page move to the next page and set index =
132 * 0. If there is no new next page, allocate a new page
133 * and link it in. Otherwise, increment index to push a
137 item_page_t
*current
; /* Pointer to the current page. */
138 item_page_t first
; /* First page is statically allocated. */
142 /* Initialize a state object */
144 /* #define NEW_STATE(state) \ */
145 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
146 /* state.stack.current = &state.stack.first; \ */
147 /* state.stack.first.prev = NULL; \ */
148 /* state.stack.first.next = NULL; \ */
149 /* state.stack.index = 0; \ */
150 /* state.level = 1 */
152 #define NEW_STATE(state, nregs) \
155 for (i = 0; i < nregs; i++) \
157 state.start[i] = NULL; \
158 state.end[i] = NULL; \
159 state.changed[i] = 0; \
161 state.stack.current = &state.stack.first; \
162 state.stack.first.prev = NULL; \
163 state.stack.first.next = NULL; \
164 state.stack.index = 0; \
171 /* Free any memory that might have been malloc'd */
173 #define FREE_STATE(state) \
174 while(state.stack.first.next != NULL) \
176 state.stack.current = state.stack.first.next; \
177 state.stack.first.next = state.stack.current->next; \
178 free(state.stack.current); \
181 /* Discard the top 'count' stack items. */
183 #define STACK_DISCARD(stack, count, on_error) \
184 stack.index -= count; \
185 while (stack.index < 0) \
187 if (stack.current->prev == NULL) \
189 stack.current = stack.current->prev; \
190 stack.index += STACK_PAGE_SIZE; \
193 /* Store a pointer to the previous item on the stack. Used to pop an
194 * item off of the stack. */
196 #define STACK_PREV(stack, top, on_error) \
197 if (stack.index == 0) \
199 if (stack.current->prev == NULL) \
201 stack.current = stack.current->prev; \
202 stack.index = STACK_PAGE_SIZE - 1; \
208 top = &(stack.current->items[stack.index])
210 /* Store a pointer to the next item on the stack. Used to push an item
211 * on to the stack. */
213 #define STACK_NEXT(stack, top, on_error) \
214 if (stack.index == STACK_PAGE_SIZE) \
216 if (stack.current->next == NULL) \
218 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
219 if (stack.current->next == NULL) \
221 stack.current->next->prev = stack.current; \
222 stack.current->next->next = NULL; \
224 stack.current = stack.current->next; \
227 top = &(stack.current->items[stack.index++])
229 /* Store a pointer to the item that is 'count' items back in the
230 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
231 * STACK_TOP(stack, top, on_error). */
233 #define STACK_BACK(stack, top, count, on_error) \
236 item_page_t *current; \
237 current = stack.current; \
238 index = stack.index - (count); \
241 if (current->prev == NULL) \
243 current = current->prev; \
244 index += STACK_PAGE_SIZE; \
246 top = &(current->items[index]); \
249 /* Store a pointer to the top item on the stack. Execute the
250 * 'on_error' code if there are no items on the stack. */
252 #define STACK_TOP(stack, top, on_error) \
253 if (stack.index == 0) \
255 if (stack.current->prev == NULL) \
257 top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
261 top = &(stack.current->items[stack.index - 1]); \
264 /* Test to see if the stack is empty */
266 #define STACK_EMPTY(stack) ((stack.index == 0) && \
267 (stack.current->prev == NULL))
269 /* Return the start of register 'reg' */
271 #define GET_REG_START(state, reg) (state.start[reg])
273 /* Return the end of register 'reg' */
275 #define GET_REG_END(state, reg) (state.end[reg])
277 /* Set the start of register 'reg'. If the state of the register needs
278 * saving, push it on the stack. */
280 #define SET_REG_START(state, reg, text, on_error) \
281 if(state.changed[reg] < state.level) \
284 STACK_NEXT(state.stack, item, on_error); \
285 item->reg.num = reg; \
286 item->reg.start = state.start[reg]; \
287 item->reg.end = state.end[reg]; \
288 item->reg.level = state.changed[reg]; \
289 state.changed[reg] = state.level; \
292 state.start[reg] = text
294 /* Set the end of register 'reg'. If the state of the register needs
295 * saving, push it on the stack. */
297 #define SET_REG_END(state, reg, text, on_error) \
298 if(state.changed[reg] < state.level) \
301 STACK_NEXT(state.stack, item, on_error); \
302 item->reg.num = reg; \
303 item->reg.start = state.start[reg]; \
304 item->reg.end = state.end[reg]; \
305 item->reg.level = state.changed[reg]; \
306 state.changed[reg] = state.level; \
309 state.end[reg] = text
311 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
314 STACK_NEXT(state.stack, item, on_error); \
315 item->fail.code = xcode; \
316 item->fail.text = xtext; \
317 item->fail.count = state.count; \
318 item->fail.level = state.level; \
319 item->fail.phantom = 0; \
325 /* Update the last failure point with a new position in the text. */
327 #define UPDATE_FAILURE(state, xtext, on_error) \
330 STACK_BACK(state.stack, item, state.count + 1, on_error); \
331 if (!item->fail.phantom) \
334 STACK_NEXT(state.stack, item2, on_error); \
335 item2->fail.code = item->fail.code; \
336 item2->fail.text = xtext; \
337 item2->fail.count = state.count; \
338 item2->fail.level = state.level; \
339 item2->fail.phantom = 1; \
346 STACK_DISCARD(state.stack, state.count, on_error); \
347 STACK_TOP(state.stack, item, on_error); \
348 item->fail.text = xtext; \
354 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
359 while(state.count > 0) \
361 STACK_PREV(state.stack, item, on_error); \
362 state.start[item->reg.num] = item->reg.start; \
363 state.end[item->reg.num] = item->reg.end; \
364 state.changed[item->reg.num] = item->reg.level; \
367 STACK_PREV(state.stack, item, on_empty); \
368 xcode = item->fail.code; \
369 xtext = item->fail.text; \
370 state.count = item->fail.count; \
371 state.level = item->fail.level; \
374 while (item->fail.text == NULL); \
377 enum regexp_compiled_ops
/* opcodes for compiled regexp */
379 Cend
, /* end of pattern reached */
380 Cbol
, /* beginning of line */
381 Ceol
, /* end of line */
382 Cset
, /* character set. Followed by 32 bytes of set. */
383 Cexact
, /* followed by a byte to match */
384 Canychar
, /* matches any character except newline */
385 Cstart_memory
, /* set register start addr (followed by reg number) */
386 Cend_memory
, /* set register end addr (followed by reg number) */
387 Cmatch_memory
, /* match a duplicate of reg contents (regnum follows)*/
388 Cjump
, /* followed by two bytes (lsb,msb) of displacement. */
389 Cstar_jump
, /* will change to jump/update_failure_jump at runtime */
390 Cfailure_jump
, /* jump to addr on failure */
391 Cupdate_failure_jump
, /* update topmost failure point and jump */
392 Cdummy_failure_jump
, /* push a dummy failure point and jump */
393 Cbegbuf
, /* match at beginning of buffer */
394 Cendbuf
, /* match at end of buffer */
395 Cwordbeg
, /* match at beginning of word */
396 Cwordend
, /* match at end of word */
397 Cwordbound
, /* match if at word boundary */
398 Cnotwordbound
, /* match if not at word boundary */
399 Csyntaxspec
, /* matches syntax code (1 byte follows) */
400 Cnotsyntaxspec
, /* matches if syntax code does not match (1 byte follows) */
404 enum regexp_syntax_op
/* syntax codes for plain and quoted characters */
406 Rend
, /* special code for end of regexp */
407 Rnormal
, /* normal character */
408 Ranychar
, /* any character except newline */
409 Rquote
, /* the quote character */
410 Rbol
, /* match beginning of line */
411 Reol
, /* match end of line */
412 Roptional
, /* match preceding expression optionally */
413 Rstar
, /* match preceding expr zero or more times */
414 Rplus
, /* match preceding expr one or more times */
415 Ror
, /* match either of alternatives */
416 Ropenpar
, /* opening parenthesis */
417 Rclosepar
, /* closing parenthesis */
418 Rmemory
, /* match memory register */
419 Rextended_memory
, /* \vnn to match registers 10-99 */
420 Ropenset
, /* open set. Internal syntax hard-coded below. */
421 /* the following are gnu extensions to "normal" regexp syntax */
422 Rbegbuf
, /* beginning of buffer */
423 Rendbuf
, /* end of buffer */
424 Rwordchar
, /* word character */
425 Rnotwordchar
, /* not word character */
426 Rwordbeg
, /* beginning of word */
427 Rwordend
, /* end of word */
428 Rwordbound
, /* word bound */
429 Rnotwordbound
, /* not word bound */
433 static int re_compile_initialized
= 0;
434 static int regexp_syntax
= 0;
435 int re_syntax
= 0; /* Exported copy of regexp_syntax */
436 static unsigned char regexp_plain_ops
[256];
437 static unsigned char regexp_quoted_ops
[256];
438 static unsigned char regexp_precedences
[Rnum_ops
];
439 static int regexp_context_indep_ops
;
440 static int regexp_ansi_sequences
;
442 #define NUM_LEVELS 5 /* number of precedence levels in use */
443 #define MAX_NESTING 100 /* max nesting level of operators */
445 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
447 unsigned char re_syntax_table
[256];
449 void re_compile_initialize(void)
453 static int syntax_table_inited
= 0;
455 if (!syntax_table_inited
)
457 syntax_table_inited
= 1;
458 memset(re_syntax_table
, 0, 256);
459 for (a
= 'a'; a
<= 'z'; a
++)
460 re_syntax_table
[a
] = Sword
;
461 for (a
= 'A'; a
<= 'Z'; a
++)
462 re_syntax_table
[a
] = Sword
;
463 for (a
= '0'; a
<= '9'; a
++)
464 re_syntax_table
[a
] = Sword
| Sdigit
| Shexdigit
;
465 for (a
= '0'; a
<= '7'; a
++)
466 re_syntax_table
[a
] |= Soctaldigit
;
467 for (a
= 'A'; a
<= 'F'; a
++)
468 re_syntax_table
[a
] |= Shexdigit
;
469 for (a
= 'a'; a
<= 'f'; a
++)
470 re_syntax_table
[a
] |= Shexdigit
;
471 re_syntax_table
['_'] = Sword
;
472 for (a
= 9; a
<= 13; a
++)
473 re_syntax_table
[a
] = Swhitespace
;
474 re_syntax_table
[' '] = Swhitespace
;
476 re_compile_initialized
= 1;
477 for (a
= 0; a
< 256; a
++)
479 regexp_plain_ops
[a
] = Rnormal
;
480 regexp_quoted_ops
[a
] = Rnormal
;
482 for (a
= '0'; a
<= '9'; a
++)
483 regexp_quoted_ops
[a
] = Rmemory
;
484 regexp_plain_ops
['\134'] = Rquote
;
485 if (regexp_syntax
& RE_NO_BK_PARENS
)
487 regexp_plain_ops
['('] = Ropenpar
;
488 regexp_plain_ops
[')'] = Rclosepar
;
492 regexp_quoted_ops
['('] = Ropenpar
;
493 regexp_quoted_ops
[')'] = Rclosepar
;
495 if (regexp_syntax
& RE_NO_BK_VBAR
)
496 regexp_plain_ops
['\174'] = Ror
;
498 regexp_quoted_ops
['\174'] = Ror
;
499 regexp_plain_ops
['*'] = Rstar
;
500 if (regexp_syntax
& RE_BK_PLUS_QM
)
502 regexp_quoted_ops
['+'] = Rplus
;
503 regexp_quoted_ops
['?'] = Roptional
;
507 regexp_plain_ops
['+'] = Rplus
;
508 regexp_plain_ops
['?'] = Roptional
;
510 if (regexp_syntax
& RE_NEWLINE_OR
)
511 regexp_plain_ops
['\n'] = Ror
;
512 regexp_plain_ops
['\133'] = Ropenset
;
513 regexp_plain_ops
['\136'] = Rbol
;
514 regexp_plain_ops
['$'] = Reol
;
515 regexp_plain_ops
['.'] = Ranychar
;
516 if (!(regexp_syntax
& RE_NO_GNU_EXTENSIONS
))
518 regexp_quoted_ops
['w'] = Rwordchar
;
519 regexp_quoted_ops
['W'] = Rnotwordchar
;
520 regexp_quoted_ops
['<'] = Rwordbeg
;
521 regexp_quoted_ops
['>'] = Rwordend
;
522 regexp_quoted_ops
['b'] = Rwordbound
;
523 regexp_quoted_ops
['B'] = Rnotwordbound
;
524 regexp_quoted_ops
['`'] = Rbegbuf
;
525 regexp_quoted_ops
['\''] = Rendbuf
;
527 if (regexp_syntax
& RE_ANSI_HEX
)
528 regexp_quoted_ops
['v'] = Rextended_memory
;
529 for (a
= 0; a
< Rnum_ops
; a
++)
530 regexp_precedences
[a
] = 4;
531 if (regexp_syntax
& RE_TIGHT_VBAR
)
533 regexp_precedences
[Ror
] = 3;
534 regexp_precedences
[Rbol
] = 2;
535 regexp_precedences
[Reol
] = 2;
539 regexp_precedences
[Ror
] = 2;
540 regexp_precedences
[Rbol
] = 3;
541 regexp_precedences
[Reol
] = 3;
543 regexp_precedences
[Rclosepar
] = 1;
544 regexp_precedences
[Rend
] = 0;
545 regexp_context_indep_ops
= (regexp_syntax
& RE_CONTEXT_INDEP_OPS
) != 0;
546 regexp_ansi_sequences
= (regexp_syntax
& RE_ANSI_HEX
) != 0;
549 int re_set_syntax(int syntax
)
554 regexp_syntax
= syntax
;
555 re_syntax
= syntax
; /* Exported copy */
556 re_compile_initialize();
560 static int hex_char_to_decimal(int ch
)
562 if (ch
>= '0' && ch
<= '9')
564 if (ch
>= 'a' && ch
<= 'f')
565 return ch
- 'a' + 10;
566 if (ch
>= 'A' && ch
<= 'F')
567 return ch
- 'A' + 10;
571 static void re_compile_fastmap_aux(unsigned char *code
, int pos
,
572 unsigned char *visited
,
573 unsigned char *can_be_null
,
574 unsigned char *fastmap
)
581 return; /* we have already been here */
584 switch (code
[pos
++]) {
598 for (a
= 0; a
< 256; a
++)
604 syntaxcode
= code
[pos
++];
605 for (a
= 0; a
< 256; a
++)
606 if (SYNTAX(a
) & syntaxcode
)
612 syntaxcode
= code
[pos
++];
613 for (a
= 0; a
< 256; a
++)
614 if (!(SYNTAX(a
) & syntaxcode
) )
621 if (*can_be_null
== 0)
622 *can_be_null
= 2; /* can match null, but only at end of buffer*/
627 for (a
= 0; a
< 256/8; a
++)
628 if (code
[pos
+ a
] != 0)
629 for (b
= 0; b
< 8; b
++)
630 if (code
[pos
+ a
] & (1 << b
))
631 fastmap
[(a
<< 3) + b
] = 1;
637 fastmap
[(unsigned char)code
[pos
]] = 1;
642 for (a
= 0; a
< 256; a
++)
655 for (a
= 0; a
< 256; a
++)
661 case Cdummy_failure_jump
:
662 case Cupdate_failure_jump
:
665 a
= (unsigned char)code
[pos
++];
666 a
|= (unsigned char)code
[pos
++] << 8;
667 pos
+= (int)SHORT(a
);
670 /* argh... the regexp contains empty loops. This is not
671 good, as this may cause a failure stack overflow when
672 matching. Oh well. */
673 /* this path leads nowhere; pursue other paths. */
681 a
= (unsigned char)code
[pos
++];
682 a
|= (unsigned char)code
[pos
++] << 8;
683 a
= pos
+ (int)SHORT(a
);
684 re_compile_fastmap_aux(code
, a
, visited
, can_be_null
, fastmap
);
694 PyErr_SetString(PyExc_SystemError
, "Unknown regex opcode: memory corrupted?");
701 static int re_do_compile_fastmap(unsigned char *buffer
, int used
, int pos
,
702 unsigned char *can_be_null
,
703 unsigned char *fastmap
)
705 unsigned char small_visited
[512], *visited
;
707 if (used
<= sizeof(small_visited
))
708 visited
= small_visited
;
711 visited
= malloc(used
);
716 memset(fastmap
, 0, 256);
717 memset(visited
, 0, used
);
718 re_compile_fastmap_aux(buffer
, pos
, visited
, can_be_null
, fastmap
);
719 if (visited
!= small_visited
)
724 void re_compile_fastmap(regexp_t bufp
)
726 if (!bufp
->fastmap
|| bufp
->fastmap_accurate
)
728 assert(bufp
->used
> 0);
729 if (!re_do_compile_fastmap(bufp
->buffer
,
735 if (PyErr_Occurred()) return;
736 if (bufp
->buffer
[0] == Cbol
)
737 bufp
->anchor
= 1; /* begline */
739 if (bufp
->buffer
[0] == Cbegbuf
)
740 bufp
->anchor
= 2; /* begbuf */
742 bufp
->anchor
= 0; /* none */
743 bufp
->fastmap_accurate
= 1;
749 * ... code for operand of star
751 * 2: ... code after star
753 * We change the star_jump to update_failure_jump if we can determine
754 * that it is safe to do so; otherwise we change it to an ordinary
761 * 2: ... code for operand of plus
763 * 3: ... code after plus
765 * For star_jump considerations this is processed identically to star.
769 static int re_optimize_star_jump(regexp_t bufp
, unsigned char *code
)
771 unsigned char map
[256];
772 unsigned char can_be_null
;
778 int num_instructions
= 0;
780 a
= (unsigned char)*code
++;
781 a
|= (unsigned char)*code
++ << 8;
784 p1
= code
+ a
+ 3; /* skip the failure_jump */
785 /* Check that the jump is within the pattern */
786 if (p1
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
<p1
)
788 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (failure_jump opt)");
792 assert(p1
[-3] == Cfailure_jump
);
794 /* p1 points inside loop, p2 points to after loop */
795 if (!re_do_compile_fastmap(bufp
->buffer
, bufp
->used
,
796 (int)(p2
- bufp
->buffer
),
798 goto make_normal_jump
;
800 /* If we might introduce a new update point inside the
801 * loop, we can't optimize because then update_jump would
802 * update a wrong failure point. Thus we have to be
803 * quite careful here.
806 /* loop until we find something that consumes a character */
830 ch
= (unsigned char)*p1
++;
832 goto make_normal_jump
;
837 for (b
= 0; b
< 256; b
++)
838 if (b
!= '\n' && map
[b
])
839 goto make_normal_jump
;
844 for (b
= 0; b
< 256; b
++)
845 if ((p1
[b
>> 3] & (1 << (b
& 7))) && map
[b
])
846 goto make_normal_jump
;
852 goto make_normal_jump
;
855 /* now we know that we can't backtrack. */
895 case Cupdate_failure_jump
:
896 case Cdummy_failure_jump
:
898 goto make_normal_jump
;
907 /* make_update_jump: */
909 a
+= 3; /* jump to after the Cfailure_jump */
910 code
[0] = Cupdate_failure_jump
;
913 if (num_instructions
> 1)
915 assert(num_instructions
== 1);
916 /* if the only instruction matches a single character, we can do
918 p1
= code
+ 3 + a
; /* start of sole instruction */
919 if (*p1
== Cset
|| *p1
== Cexact
|| *p1
== Canychar
||
920 *p1
== Csyntaxspec
|| *p1
== Cnotsyntaxspec
)
930 static int re_optimize(regexp_t bufp
)
973 if (!re_optimize_star_jump(bufp
, code
))
979 case Cupdate_failure_jump
:
981 case Cdummy_failure_jump
:
996 #define NEXTCHAR(var) \
999 goto ends_prematurely; \
1000 (var) = regex[pos]; \
1004 #define ALLOC(amount) \
1006 if (pattern_offset+(amount) > alloc) \
1008 alloc += 256 + (amount); \
1009 pattern = realloc(pattern, alloc); \
1011 goto out_of_memory; \
1015 #define STORE(ch) pattern[pattern_offset++] = (ch)
1017 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
1019 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
1021 #define PUSH_LEVEL_STARTS \
1022 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
1023 starts_base += NUM_LEVELS; \
1027 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
1029 #define PUT_ADDR(offset,addr) \
1031 int disp = (addr) - (offset) - 2; \
1032 pattern[(offset)] = disp & 0xff; \
1033 pattern[(offset)+1] = (disp>>8) & 0xff; \
1036 #define INSERT_JUMP(pos,type,addr) \
1038 int a, p = (pos), t = (type), ad = (addr); \
1039 for (a = pattern_offset - 1; a >= p; a--) \
1040 pattern[a + 3] = pattern[a]; \
1043 pattern_offset += 3; \
1046 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
1048 #define SET_FIELDS \
1050 bufp->allocated = alloc; \
1051 bufp->buffer = pattern; \
1052 bufp->used = pattern_offset; \
1055 #define GETHEX(var) \
1057 unsigned char gethex_ch, gethex_value; \
1058 NEXTCHAR(gethex_ch); \
1059 gethex_value = hex_char_to_decimal(gethex_ch); \
1060 if (gethex_value == 16) \
1062 NEXTCHAR(gethex_ch); \
1063 gethex_ch = hex_char_to_decimal(gethex_ch); \
1064 if (gethex_ch == 16) \
1066 (var) = gethex_value * 16 + gethex_ch; \
1069 #define ANSI_TRANSLATE(ch) \
1076 ch = 7; /* audible bell */ \
1082 ch = 8; /* backspace */ \
1088 ch = 12; /* form feed */ \
1094 ch = 10; /* line feed */ \
1100 ch = 13; /* carriage return */ \
1112 ch = 11; /* vertical tab */ \
1115 case 'x': /* hex code */ \
1123 /* other characters passed through */ \
1125 ch = translate[(unsigned char)ch]; \
1131 char *re_compile_pattern(unsigned char *regex
, int size
, regexp_t bufp
)
1139 int pattern_offset
= 0, alloc
;
1140 int starts
[NUM_LEVELS
* MAX_NESTING
];
1142 int future_jumps
[MAX_NESTING
];
1144 unsigned char ch
= '\0';
1145 unsigned char *pattern
;
1146 unsigned char *translate
;
1149 int num_open_registers
;
1150 int open_registers
[RE_NREGS
];
1151 int beginning_context
;
1153 if (!re_compile_initialized
)
1154 re_compile_initialize();
1156 bufp
->fastmap_accurate
= 0;
1157 bufp
->uses_registers
= 1;
1158 bufp
->num_registers
= 1;
1159 translate
= bufp
->translate
;
1160 pattern
= bufp
->buffer
;
1161 alloc
= bufp
->allocated
;
1162 if (alloc
== 0 || pattern
== NULL
)
1165 pattern
= malloc(alloc
);
1174 num_open_registers
= 0;
1177 beginning_context
= 1;
1179 /* we use Rend dummy to ensure that pending jumps are updated
1180 (due to low priority of Rend) before exiting the loop. */
1190 ch
= translate
[(unsigned char)ch
];
1191 op
= regexp_plain_ops
[(unsigned char)ch
];
1195 op
= regexp_quoted_ops
[(unsigned char)ch
];
1196 if (op
== Rnormal
&& regexp_ansi_sequences
)
1200 level
= regexp_precedences
[op
];
1201 /* printf("ch='%c' op=%d level=%d current_level=%d
1202 curlevstart=%d\n", ch, op, level, current_level,
1203 CURRENT_LEVEL_START); */
1204 if (level
> current_level
)
1206 for (current_level
++; current_level
< level
; current_level
++)
1211 if (level
< current_level
)
1213 current_level
= level
;
1214 for (;num_jumps
> 0 &&
1215 future_jumps
[num_jumps
-1] >= CURRENT_LEVEL_START
;
1217 PUT_ADDR(future_jumps
[num_jumps
-1], pattern_offset
);
1229 store_opcode_and_arg
: /* opcode & ch must be set */
1252 if (!beginning_context
) {
1253 if (regexp_context_indep_ops
)
1263 if (!((pos
>= size
) ||
1264 ((regexp_syntax
& RE_NO_BK_VBAR
) ?
1265 (regex
[pos
] == '\174') :
1266 (pos
+1 < size
&& regex
[pos
] == '\134' &&
1267 regex
[pos
+1] == '\174')) ||
1268 ((regexp_syntax
& RE_NO_BK_PARENS
)?
1269 (regex
[pos
] == ')'):
1270 (pos
+1 < size
&& regex
[pos
] == '\134' &&
1271 regex
[pos
+1] == ')')))) {
1272 if (regexp_context_indep_ops
)
1284 if (beginning_context
) {
1285 if (regexp_context_indep_ops
)
1290 if (CURRENT_LEVEL_START
== pattern_offset
)
1291 break; /* ignore empty patterns for ? */
1293 INSERT_JUMP(CURRENT_LEVEL_START
, Cfailure_jump
,
1294 pattern_offset
+ 3);
1300 if (beginning_context
) {
1301 if (regexp_context_indep_ops
)
1306 if (CURRENT_LEVEL_START
== pattern_offset
)
1307 break; /* ignore empty patterns for + and * */
1309 INSERT_JUMP(CURRENT_LEVEL_START
, Cfailure_jump
,
1310 pattern_offset
+ 6);
1311 INSERT_JUMP(pattern_offset
, Cstar_jump
, CURRENT_LEVEL_START
);
1312 if (op
== Rplus
) /* jump over initial failure_jump */
1313 INSERT_JUMP(CURRENT_LEVEL_START
, Cdummy_failure_jump
,
1314 CURRENT_LEVEL_START
+ 6);
1320 INSERT_JUMP(CURRENT_LEVEL_START
, Cfailure_jump
,
1321 pattern_offset
+ 6);
1322 if (num_jumps
>= MAX_NESTING
)
1325 future_jumps
[num_jumps
++] = pattern_offset
;
1334 if (next_register
< RE_NREGS
)
1336 bufp
->uses_registers
= 1;
1338 STORE(Cstart_memory
);
1339 STORE(next_register
);
1340 open_registers
[num_open_registers
++] = next_register
;
1341 bufp
->num_registers
++;
1352 if (paren_depth
<= 0)
1353 goto parenthesis_error
;
1355 current_level
= regexp_precedences
[Ropenpar
];
1357 if (paren_depth
< num_open_registers
)
1359 bufp
->uses_registers
= 1;
1362 num_open_registers
--;
1363 STORE(open_registers
[num_open_registers
]);
1370 goto bad_match_register
;
1371 assert(ch
>= '0' && ch
<= '9');
1372 bufp
->uses_registers
= 1;
1373 opcode
= Cmatch_memory
;
1375 goto store_opcode_and_arg
;
1377 case Rextended_memory
:
1380 if (ch
< '0' || ch
> '9')
1381 goto bad_match_register
;
1383 if (a
< '0' || a
> '9')
1384 goto bad_match_register
;
1385 ch
= 10 * (a
- '0') + ch
- '0';
1386 if (ch
== 0 || ch
>= RE_NREGS
)
1387 goto bad_match_register
;
1388 bufp
->uses_registers
= 1;
1389 opcode
= Cmatch_memory
;
1390 goto store_opcode_and_arg
;
1403 offset
= pattern_offset
;
1404 for (a
= 0; a
< 256/8; a
++)
1408 ch
= translate
[(unsigned char)ch
];
1414 ch
= translate
[(unsigned char)ch
];
1421 while (ch
!= '\135' || firstchar
)
1424 if (regexp_ansi_sequences
&& ch
== '\134')
1431 for (a
= prev
; a
<= (int)ch
; a
++)
1432 SETBIT(pattern
, offset
, a
);
1437 if (prev
!= -1 && ch
== '-')
1441 SETBIT(pattern
, offset
, ch
);
1446 ch
= translate
[(unsigned char)ch
];
1449 SETBIT(pattern
, offset
, '-');
1452 for (a
= 0; a
< 256/8; a
++)
1453 pattern
[offset
+a
] ^= 0xff;
1469 opcode
= Csyntaxspec
;
1471 goto store_opcode_and_arg
;
1475 opcode
= Cnotsyntaxspec
;
1477 goto store_opcode_and_arg
;
1491 opcode
= Cwordbound
;
1496 opcode
= Cnotwordbound
;
1504 beginning_context
= (op
== Ropenpar
|| op
== Ror
);
1506 if (starts_base
!= 0)
1507 goto parenthesis_error
;
1508 assert(num_jumps
== 0);
1512 if(!re_optimize(bufp
))
1513 return "Optimization error";
1518 return "Badly placed special character";
1522 return "Bad match register number";
1526 return "Bad hexadecimal number";
1530 return "Badly placed parenthesis";
1534 return "Out of memory";
1538 return "Regular expression ends prematurely";
1542 return "Regular expression too complex";
1550 #undef CURRENT_LEVEL_START
1551 #undef SET_LEVEL_START
1552 #undef PUSH_LEVEL_STARTS
1553 #undef POP_LEVEL_STARTS
1559 #define PREFETCH if (text == textend) goto fail
1561 #define NEXTCHAR(var) \
1563 var = (unsigned char)*text++; \
1565 var = translate[var]
1567 int re_match(regexp_t bufp
, unsigned char *string
, int size
, int pos
,
1568 regexp_registers_t old_regs
)
1570 unsigned char *code
;
1571 unsigned char *translate
;
1572 unsigned char *text
;
1573 unsigned char *textstart
;
1574 unsigned char *textend
;
1580 unsigned char *regstart
;
1581 unsigned char *regend
;
1585 assert(pos
>= 0 && size
>= 0);
1586 assert(pos
<= size
);
1588 text
= string
+ pos
;
1590 textend
= string
+ size
;
1592 code
= bufp
->buffer
;
1594 translate
= bufp
->translate
;
1596 NEW_STATE(state
, bufp
->num_registers
);
1603 match_end
= text
- textstart
;
1606 old_regs
->start
[0] = pos
;
1607 old_regs
->end
[0] = match_end
;
1608 if (!bufp
->uses_registers
)
1610 for (a
= 1; a
< RE_NREGS
; a
++)
1612 old_regs
->start
[a
] = -1;
1613 old_regs
->end
[a
] = -1;
1618 for (a
= 1; a
< bufp
->num_registers
; a
++)
1620 if ((GET_REG_START(state
, a
) == NULL
) ||
1621 (GET_REG_END(state
, a
) == NULL
))
1623 old_regs
->start
[a
] = -1;
1624 old_regs
->end
[a
] = -1;
1627 old_regs
->start
[a
] = GET_REG_START(state
, a
) - textstart
;
1628 old_regs
->end
[a
] = GET_REG_END(state
, a
) - textstart
;
1630 for (; a
< RE_NREGS
; a
++)
1632 old_regs
->start
[a
] = -1;
1633 old_regs
->end
[a
] = -1;
1638 return match_end
- pos
;
1642 if (text
== textstart
|| text
[-1] == '\n')
1643 goto continue_matching
;
1648 if (text
== textend
|| *text
== '\n')
1649 goto continue_matching
;
1655 if (code
[ch
/8] & (1<<(ch
& 7)))
1658 goto continue_matching
;
1665 if (ch
!= (unsigned char)*code
++)
1667 goto continue_matching
;
1674 goto continue_matching
;
1679 SET_REG_START(state
, reg
, text
, goto error
);
1680 goto continue_matching
;
1685 SET_REG_END(state
, reg
, text
, goto error
);
1686 goto continue_matching
;
1691 regstart
= GET_REG_START(state
, reg
);
1692 regend
= GET_REG_END(state
, reg
);
1693 if ((regstart
== NULL
) || (regend
== NULL
))
1694 goto fail
; /* or should we just match nothing? */
1695 regsize
= regend
- regstart
;
1697 if (regsize
> (textend
- text
))
1701 for (; regstart
< regend
; regstart
++, text
++)
1702 if (translate
[*regstart
] != translate
[*text
])
1706 for (; regstart
< regend
; regstart
++, text
++)
1707 if (*regstart
!= *text
)
1709 goto continue_matching
;
1711 case Cupdate_failure_jump
:
1713 UPDATE_FAILURE(state
, text
, goto error
);
1714 /* fall to next case */
1716 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1720 a
= (unsigned char)*code
++;
1721 a
|= (unsigned char)*code
++ << 8;
1722 code
+= (int)SHORT(a
);
1723 if (code
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
<code
) {
1724 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Cjump)");
1728 goto continue_matching
;
1730 case Cdummy_failure_jump
:
1732 unsigned char *failuredest
;
1734 a
= (unsigned char)*code
++;
1735 a
|= (unsigned char)*code
++ << 8;
1737 assert(*code
== Cfailure_jump
);
1738 b
= (unsigned char)code
[1];
1739 b
|= (unsigned char)code
[2] << 8;
1740 failuredest
= code
+ (int)SHORT(b
) + 3;
1741 if (failuredest
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
< failuredest
) {
1742 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1746 PUSH_FAILURE(state
, failuredest
, NULL
, goto error
);
1748 if (code
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
< code
) {
1749 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Cdummy_failure_jump code)");
1753 goto continue_matching
;
1757 a
= (unsigned char)*code
++;
1758 a
|= (unsigned char)*code
++ << 8;
1760 if (code
+a
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
< code
+a
) {
1761 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Cfailure_jump)");
1765 PUSH_FAILURE(state
, code
+ a
, text
, goto error
);
1766 goto continue_matching
;
1770 unsigned char *pinst
;
1771 a
= (unsigned char)*code
++;
1772 a
|= (unsigned char)*code
++ << 8;
1775 if (pinst
<bufp
->buffer
|| bufp
->buffer
+bufp
->used
<pinst
) {
1776 PyErr_SetString(PyExc_SystemError
, "Regex VM jump out of bounds (Crepeat1)");
1780 /* pinst is sole instruction in loop, and it matches a
1781 * single character. Since Crepeat1 was originally a
1782 * Cupdate_failure_jump, we also know that backtracking
1783 * is useless: so long as the single-character
1784 * expression matches, it must be used. Also, in the
1785 * case of +, we've already matched one character, so +
1786 * can't fail: nothing here can cause a failure. */
1793 while (text
< textend
)
1795 ch
= translate
[(unsigned char)*text
];
1796 if (pinst
[ch
/8] & (1<<(ch
& 7)))
1804 while (text
< textend
)
1806 ch
= (unsigned char)*text
;
1807 if (pinst
[ch
/8] & (1<<(ch
& 7)))
1817 ch
= (unsigned char)*pinst
;
1820 while (text
< textend
&&
1821 translate
[(unsigned char)*text
] == ch
)
1826 while (text
< textend
&& (unsigned char)*text
== ch
)
1833 while (text
< textend
&& (unsigned char)*text
!= '\n')
1839 a
= (unsigned char)*pinst
;
1842 while (text
< textend
&&
1843 (SYNTAX(translate
[*text
]) & a
) )
1848 while (text
< textend
&& (SYNTAX(*text
) & a
) )
1853 case Cnotsyntaxspec
:
1855 a
= (unsigned char)*pinst
;
1858 while (text
< textend
&&
1859 !(SYNTAX(translate
[*text
]) & a
) )
1864 while (text
< textend
&& !(SYNTAX(*text
) & a
) )
1872 PyErr_SetString(PyExc_SystemError
, "Unknown regex opcode: memory corrupted?");
1877 /* due to the funky way + and * are compiled, the top
1878 * failure- stack entry at this point is actually a
1879 * success entry -- update it & pop it */
1880 UPDATE_FAILURE(state
, text
, goto error
);
1881 goto fail
; /* i.e., succeed <wink/sigh> */
1885 if (text
== textstart
)
1886 goto continue_matching
;
1891 if (text
== textend
)
1892 goto continue_matching
;
1897 if (text
== textend
)
1899 if (!(SYNTAX(*text
) & Sword
))
1901 if (text
== textstart
)
1902 goto continue_matching
;
1903 if (!(SYNTAX(text
[-1]) & Sword
))
1904 goto continue_matching
;
1909 if (text
== textstart
)
1911 if (!(SYNTAX(text
[-1]) & Sword
))
1913 if (text
== textend
)
1914 goto continue_matching
;
1915 if (!(SYNTAX(*text
) & Sword
))
1916 goto continue_matching
;
1921 /* Note: as in gnu regexp, this also matches at the
1922 * beginning and end of buffer. */
1924 if (text
== textstart
|| text
== textend
)
1925 goto continue_matching
;
1926 if ((SYNTAX(text
[-1]) & Sword
) ^ (SYNTAX(*text
) & Sword
))
1927 goto continue_matching
;
1932 /* Note: as in gnu regexp, this never matches at the
1933 * beginning and end of buffer. */
1934 if (text
== textstart
|| text
== textend
)
1936 if (!((SYNTAX(text
[-1]) & Sword
) ^ (SYNTAX(*text
) & Sword
)))
1937 goto continue_matching
;
1943 if (!(SYNTAX(ch
) & (unsigned char)*code
++))
1945 goto continue_matching
;
1947 case Cnotsyntaxspec
:
1950 if (SYNTAX(ch
) & (unsigned char)*code
++)
1952 goto continue_matching
;
1957 PyErr_SetString(PyExc_SystemError
, "Unknown regex opcode: memory corrupted?");
1965 #if 0 /* This line is never reached --Guido */
1972 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1974 POP_FAILURE(state
, code
, text
, goto done_matching
, goto error
);
1975 goto continue_matching
;
1978 /* if(translated != NULL) */
1979 /* free(translated); */
1984 /* if (translated != NULL) */
1985 /* free(translated); */
1994 int re_search(regexp_t bufp
, unsigned char *string
, int size
, int pos
,
1995 int range
, regexp_registers_t regs
)
1997 unsigned char *fastmap
;
1998 unsigned char *translate
;
1999 unsigned char *text
;
2000 unsigned char *partstart
;
2001 unsigned char *partend
;
2004 unsigned char anchor
;
2006 assert(size
>= 0 && pos
>= 0);
2007 assert(pos
+ range
>= 0 && pos
+ range
<= size
); /* Bugfix by ylo */
2009 fastmap
= bufp
->fastmap
;
2010 translate
= bufp
->translate
;
2011 if (fastmap
&& !bufp
->fastmap_accurate
) {
2012 re_compile_fastmap(bufp
);
2013 if (PyErr_Occurred()) return -2;
2016 anchor
= bufp
->anchor
;
2017 if (bufp
->can_be_null
== 1) /* can_be_null == 2: can match null at eob */
2035 for (; range
>= 0; range
--, pos
+= dir
)
2040 { /* searching forwards */
2042 text
= string
+ pos
;
2043 partend
= string
+ size
;
2046 while (text
!= partend
&&
2047 !fastmap
[(unsigned char) translate
[(unsigned char)*text
]])
2050 while (text
!= partend
&& !fastmap
[(unsigned char)*text
])
2052 pos
+= text
- partstart
;
2053 range
-= text
- partstart
;
2054 if (pos
== size
&& bufp
->can_be_null
== 0)
2058 { /* searching backwards */
2059 text
= string
+ pos
;
2060 partstart
= string
+ pos
- range
;
2063 while (text
!= partstart
&&
2064 !fastmap
[(unsigned char)
2065 translate
[(unsigned char)*text
]])
2068 while (text
!= partstart
&&
2069 !fastmap
[(unsigned char)*text
])
2071 pos
-= partend
- text
;
2072 range
-= partend
- text
;
2076 { /* anchored to begline */
2077 if (pos
> 0 && (string
[pos
- 1] != '\n'))
2080 assert(pos
>= 0 && pos
<= size
);
2081 ret
= re_match(bufp
, string
, size
, pos
, regs
);
2093 ** c-file-style: "python"