turns printfs back on
[freebsd-src/fkvm-freebsd.git] / contrib / cvs / lib / regex.c
blobe123284e39eb4fd709ddf9aa878f8dbe26f26fbd
1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P10003.2/D11.2, except for
3 internationalization features.)
5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998 Free Software Foundation, Inc.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA. */
22 /* AIX requires this to be the first thing in the file. */
23 #if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25 #endif
27 #undef _GNU_SOURCE
28 #define _GNU_SOURCE
30 #ifdef emacs
31 /* Converts the pointer to the char to BEG-based offset from the start. */
32 #define PTR_TO_OFFSET(d) \
33 POS_AS_IN_BUFFER (MATCHING_IN_FIRST_STRING \
34 ? (d) - string1 : (d) - (string2 - size1))
35 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
36 #else
37 #define PTR_TO_OFFSET(d) 0
38 #endif
40 #ifdef HAVE_CONFIG_H
41 #include <config.h>
42 #endif
44 /* We need this for `regex.h', and perhaps for the Emacs include files. */
45 #include <sys/types.h>
47 /* This is for other GNU distributions with internationalized messages. */
48 #if HAVE_LIBINTL_H || defined (_LIBC)
49 # include <libintl.h>
50 #else
51 # define gettext(msgid) (msgid)
52 #endif
54 #ifndef gettext_noop
55 /* This define is so xgettext can find the internationalizable
56 strings. */
57 #define gettext_noop(String) String
58 #endif
60 /* The `emacs' switch turns on certain matching commands
61 that make sense only in Emacs. */
62 #ifdef emacs
64 #include "lisp.h"
65 #include "buffer.h"
67 /* Make syntax table lookup grant data in gl_state. */
68 #define SYNTAX_ENTRY_VIA_PROPERTY
70 #include "syntax.h"
71 #include "charset.h"
72 #include "category.h"
74 #define malloc xmalloc
75 #define realloc xrealloc
76 #define free xfree
78 #else /* not emacs */
80 /* If we are not linking with Emacs proper,
81 we can't use the relocating allocator
82 even if config.h says that we can. */
83 #undef REL_ALLOC
85 #if defined (STDC_HEADERS) || defined (_LIBC)
86 #include <stdlib.h>
87 #else
88 char *malloc ();
89 char *realloc ();
90 #endif
92 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
93 If nothing else has been done, use the method below. */
94 #ifdef INHIBIT_STRING_HEADER
95 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
96 #if !defined (bzero) && !defined (bcopy)
97 #undef INHIBIT_STRING_HEADER
98 #endif
99 #endif
100 #endif
102 /* This is the normal way of making sure we have a bcopy and a bzero.
103 This is used in most programs--a few other programs avoid this
104 by defining INHIBIT_STRING_HEADER. */
105 #ifndef INHIBIT_STRING_HEADER
106 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
107 #include <string.h>
108 #ifndef bcmp
109 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
110 #endif
111 #ifndef bcopy
112 #define bcopy(s, d, n) memcpy ((d), (s), (n))
113 #endif
114 #ifndef bzero
115 #define bzero(s, n) memset ((s), 0, (n))
116 #endif
117 #else
118 #include <strings.h>
119 #endif
120 #endif
122 /* Define the syntax stuff for \<, \>, etc. */
124 /* This must be nonzero for the wordchar and notwordchar pattern
125 commands in re_match_2. */
126 #ifndef Sword
127 #define Sword 1
128 #endif
130 #ifdef SWITCH_ENUM_BUG
131 #define SWITCH_ENUM_CAST(x) ((int)(x))
132 #else
133 #define SWITCH_ENUM_CAST(x) (x)
134 #endif
136 #ifdef SYNTAX_TABLE
138 extern char *re_syntax_table;
140 #else /* not SYNTAX_TABLE */
142 /* How many characters in the character set. */
143 #define CHAR_SET_SIZE 256
145 static char re_syntax_table[CHAR_SET_SIZE];
147 static void
148 init_syntax_once ()
150 register int c;
151 static int done = 0;
153 if (done)
154 return;
156 bzero (re_syntax_table, sizeof re_syntax_table);
158 for (c = 'a'; c <= 'z'; c++)
159 re_syntax_table[c] = Sword;
161 for (c = 'A'; c <= 'Z'; c++)
162 re_syntax_table[c] = Sword;
164 for (c = '0'; c <= '9'; c++)
165 re_syntax_table[c] = Sword;
167 re_syntax_table['_'] = Sword;
169 done = 1;
172 #endif /* not SYNTAX_TABLE */
174 #define SYNTAX(c) re_syntax_table[c]
176 /* Dummy macros for non-Emacs environments. */
177 #define BASE_LEADING_CODE_P(c) (0)
178 #define WORD_BOUNDARY_P(c1, c2) (0)
179 #define CHAR_HEAD_P(p) (1)
180 #define SINGLE_BYTE_CHAR_P(c) (1)
181 #define SAME_CHARSET_P(c1, c2) (1)
182 #define MULTIBYTE_FORM_LENGTH(p, s) (1)
183 #define STRING_CHAR(p, s) (*(p))
184 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
185 #define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \
186 (c = ((p) == (end1) ? *(str2) : *(p)))
187 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
188 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
189 #endif /* not emacs */
191 /* Get the interface, including the syntax bits. */
192 #include "regex.h"
194 /* isalpha etc. are used for the character classes. */
195 #include <ctype.h>
197 /* Jim Meyering writes:
199 "... Some ctype macros are valid only for character codes that
200 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
201 using /bin/cc or gcc but without giving an ansi option). So, all
202 ctype uses should be through macros like ISPRINT... If
203 STDC_HEADERS is defined, then autoconf has verified that the ctype
204 macros don't need to be guarded with references to isascii. ...
205 Defining isascii to 1 should let any compiler worth its salt
206 eliminate the && through constant folding." */
208 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
209 #define ISASCII(c) 1
210 #else
211 #define ISASCII(c) isascii(c)
212 #endif
214 #ifdef isblank
215 #define ISBLANK(c) (ISASCII (c) && isblank (c))
216 #else
217 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
218 #endif
219 #ifdef isgraph
220 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
221 #else
222 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
223 #endif
225 #define ISPRINT(c) (ISASCII (c) && isprint (c))
226 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
227 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
228 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
229 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
230 #define ISLOWER(c) (ISASCII (c) && islower (c))
231 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
232 #define ISSPACE(c) (ISASCII (c) && isspace (c))
233 #define ISUPPER(c) (ISASCII (c) && isupper (c))
234 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
236 #ifndef NULL
237 #define NULL (void *)0
238 #endif
240 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
241 since ours (we hope) works properly with all combinations of
242 machines, compilers, `char' and `unsigned char' argument types.
243 (Per Bothner suggested the basic approach.) */
244 #undef SIGN_EXTEND_CHAR
245 #if __STDC__
246 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
247 #else /* not __STDC__ */
248 /* As in Harbison and Steele. */
249 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
250 #endif
252 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
253 use `alloca' instead of `malloc'. This is because using malloc in
254 re_search* or re_match* could cause memory leaks when C-g is used in
255 Emacs; also, malloc is slower and causes storage fragmentation. On
256 the other hand, malloc is more portable, and easier to debug.
258 Because we sometimes use alloca, some routines have to be macros,
259 not functions -- `alloca'-allocated space disappears at the end of the
260 function it is called in. */
262 #ifdef REGEX_MALLOC
264 #define REGEX_ALLOCATE malloc
265 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
266 #define REGEX_FREE free
268 #else /* not REGEX_MALLOC */
270 /* Emacs already defines alloca, sometimes. */
271 #ifndef alloca
273 /* Make alloca work the best possible way. */
274 #ifdef __GNUC__
275 #define alloca __builtin_alloca
276 #else /* not __GNUC__ */
277 #if HAVE_ALLOCA_H
278 #include <alloca.h>
279 #else /* not __GNUC__ or HAVE_ALLOCA_H */
280 #if 0 /* It is a bad idea to declare alloca. We always cast the result. */
281 #ifndef _AIX /* Already did AIX, up at the top. */
282 char *alloca ();
283 #endif /* not _AIX */
284 #endif
285 #endif /* not HAVE_ALLOCA_H */
286 #endif /* not __GNUC__ */
288 #endif /* not alloca */
290 #define REGEX_ALLOCATE alloca
292 /* Assumes a `char *destination' variable. */
293 #define REGEX_REALLOCATE(source, osize, nsize) \
294 (destination = (char *) alloca (nsize), \
295 bcopy (source, destination, osize), \
296 destination)
298 /* No need to do anything to free, after alloca. */
299 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
301 #endif /* not REGEX_MALLOC */
303 /* Define how to allocate the failure stack. */
305 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
307 #define REGEX_ALLOCATE_STACK(size) \
308 r_alloc (&failure_stack_ptr, (size))
309 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
310 r_re_alloc (&failure_stack_ptr, (nsize))
311 #define REGEX_FREE_STACK(ptr) \
312 r_alloc_free (&failure_stack_ptr)
314 #else /* not using relocating allocator */
316 #ifdef REGEX_MALLOC
318 #define REGEX_ALLOCATE_STACK malloc
319 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
320 #define REGEX_FREE_STACK free
322 #else /* not REGEX_MALLOC */
324 #define REGEX_ALLOCATE_STACK alloca
326 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
327 REGEX_REALLOCATE (source, osize, nsize)
328 /* No need to explicitly free anything. */
329 #define REGEX_FREE_STACK(arg)
331 #endif /* not REGEX_MALLOC */
332 #endif /* not using relocating allocator */
335 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
336 `string1' or just past its end. This works if PTR is NULL, which is
337 a good thing. */
338 #define FIRST_STRING_P(ptr) \
339 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
341 /* (Re)Allocate N items of type T using malloc, or fail. */
342 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
343 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
344 #define RETALLOC_IF(addr, n, t) \
345 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
346 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
348 #define BYTEWIDTH 8 /* In bits. */
350 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
352 #undef MAX
353 #undef MIN
354 #define MAX(a, b) ((a) > (b) ? (a) : (b))
355 #define MIN(a, b) ((a) < (b) ? (a) : (b))
357 typedef char boolean;
358 #define false 0
359 #define true 1
361 static int re_match_2_internal ();
363 /* These are the command codes that appear in compiled regular
364 expressions. Some opcodes are followed by argument bytes. A
365 command code can specify any interpretation whatsoever for its
366 arguments. Zero bytes may appear in the compiled regular expression. */
368 typedef enum
370 no_op = 0,
372 /* Succeed right away--no more backtracking. */
373 succeed,
375 /* Followed by one byte giving n, then by n literal bytes. */
376 exactn,
378 /* Matches any (more or less) character. */
379 anychar,
381 /* Matches any one char belonging to specified set. First
382 following byte is number of bitmap bytes. Then come bytes
383 for a bitmap saying which chars are in. Bits in each byte
384 are ordered low-bit-first. A character is in the set if its
385 bit is 1. A character too large to have a bit in the map is
386 automatically not in the set. */
387 charset,
389 /* Same parameters as charset, but match any character that is
390 not one of those specified. */
391 charset_not,
393 /* Start remembering the text that is matched, for storing in a
394 register. Followed by one byte with the register number, in
395 the range 0 to one less than the pattern buffer's re_nsub
396 field. Then followed by one byte with the number of groups
397 inner to this one. (This last has to be part of the
398 start_memory only because we need it in the on_failure_jump
399 of re_match_2.) */
400 start_memory,
402 /* Stop remembering the text that is matched and store it in a
403 memory register. Followed by one byte with the register
404 number, in the range 0 to one less than `re_nsub' in the
405 pattern buffer, and one byte with the number of inner groups,
406 just like `start_memory'. (We need the number of inner
407 groups here because we don't have any easy way of finding the
408 corresponding start_memory when we're at a stop_memory.) */
409 stop_memory,
411 /* Match a duplicate of something remembered. Followed by one
412 byte containing the register number. */
413 duplicate,
415 /* Fail unless at beginning of line. */
416 begline,
418 /* Fail unless at end of line. */
419 endline,
421 /* Succeeds if at beginning of buffer (if emacs) or at beginning
422 of string to be matched (if not). */
423 begbuf,
425 /* Analogously, for end of buffer/string. */
426 endbuf,
428 /* Followed by two byte relative address to which to jump. */
429 jump,
431 /* Same as jump, but marks the end of an alternative. */
432 jump_past_alt,
434 /* Followed by two-byte relative address of place to resume at
435 in case of failure. */
436 on_failure_jump,
438 /* Like on_failure_jump, but pushes a placeholder instead of the
439 current string position when executed. */
440 on_failure_keep_string_jump,
442 /* Throw away latest failure point and then jump to following
443 two-byte relative address. */
444 pop_failure_jump,
446 /* Change to pop_failure_jump if know won't have to backtrack to
447 match; otherwise change to jump. This is used to jump
448 back to the beginning of a repeat. If what follows this jump
449 clearly won't match what the repeat does, such that we can be
450 sure that there is no use backtracking out of repetitions
451 already matched, then we change it to a pop_failure_jump.
452 Followed by two-byte address. */
453 maybe_pop_jump,
455 /* Jump to following two-byte address, and push a dummy failure
456 point. This failure point will be thrown away if an attempt
457 is made to use it for a failure. A `+' construct makes this
458 before the first repeat. Also used as an intermediary kind
459 of jump when compiling an alternative. */
460 dummy_failure_jump,
462 /* Push a dummy failure point and continue. Used at the end of
463 alternatives. */
464 push_dummy_failure,
466 /* Followed by two-byte relative address and two-byte number n.
467 After matching N times, jump to the address upon failure. */
468 succeed_n,
470 /* Followed by two-byte relative address, and two-byte number n.
471 Jump to the address N times, then fail. */
472 jump_n,
474 /* Set the following two-byte relative address to the
475 subsequent two-byte number. The address *includes* the two
476 bytes of number. */
477 set_number_at,
479 wordchar, /* Matches any word-constituent character. */
480 notwordchar, /* Matches any char that is not a word-constituent. */
482 wordbeg, /* Succeeds if at word beginning. */
483 wordend, /* Succeeds if at word end. */
485 wordbound, /* Succeeds if at a word boundary. */
486 notwordbound /* Succeeds if not at a word boundary. */
488 #ifdef emacs
489 ,before_dot, /* Succeeds if before point. */
490 at_dot, /* Succeeds if at point. */
491 after_dot, /* Succeeds if after point. */
493 /* Matches any character whose syntax is specified. Followed by
494 a byte which contains a syntax code, e.g., Sword. */
495 syntaxspec,
497 /* Matches any character whose syntax is not that specified. */
498 notsyntaxspec,
500 /* Matches any character whose category-set contains the specified
501 category. The operator is followed by a byte which contains a
502 category code (mnemonic ASCII character). */
503 categoryspec,
505 /* Matches any character whose category-set does not contain the
506 specified category. The operator is followed by a byte which
507 contains the category code (mnemonic ASCII character). */
508 notcategoryspec
509 #endif /* emacs */
510 } re_opcode_t;
512 /* Common operations on the compiled pattern. */
514 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
516 #define STORE_NUMBER(destination, number) \
517 do { \
518 (destination)[0] = (number) & 0377; \
519 (destination)[1] = (number) >> 8; \
520 } while (0)
522 /* Same as STORE_NUMBER, except increment DESTINATION to
523 the byte after where the number is stored. Therefore, DESTINATION
524 must be an lvalue. */
526 #define STORE_NUMBER_AND_INCR(destination, number) \
527 do { \
528 STORE_NUMBER (destination, number); \
529 (destination) += 2; \
530 } while (0)
532 /* Put into DESTINATION a number stored in two contiguous bytes starting
533 at SOURCE. */
535 #define EXTRACT_NUMBER(destination, source) \
536 do { \
537 (destination) = *(source) & 0377; \
538 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
539 } while (0)
541 #ifdef DEBUG
542 static void
543 extract_number (dest, source)
544 int *dest;
545 unsigned char *source;
547 int temp = SIGN_EXTEND_CHAR (*(source + 1));
548 *dest = *source & 0377;
549 *dest += temp << 8;
552 #ifndef EXTRACT_MACROS /* To debug the macros. */
553 #undef EXTRACT_NUMBER
554 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
555 #endif /* not EXTRACT_MACROS */
557 #endif /* DEBUG */
559 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
560 SOURCE must be an lvalue. */
562 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
563 do { \
564 EXTRACT_NUMBER (destination, source); \
565 (source) += 2; \
566 } while (0)
568 #ifdef DEBUG
569 static void
570 extract_number_and_incr (destination, source)
571 int *destination;
572 unsigned char **source;
574 extract_number (destination, *source);
575 *source += 2;
578 #ifndef EXTRACT_MACROS
579 #undef EXTRACT_NUMBER_AND_INCR
580 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
581 extract_number_and_incr (&dest, &src)
582 #endif /* not EXTRACT_MACROS */
584 #endif /* DEBUG */
586 /* Store a multibyte character in three contiguous bytes starting
587 DESTINATION, and increment DESTINATION to the byte after where the
588 character is stored. Therefore, DESTINATION must be an lvalue. */
590 #define STORE_CHARACTER_AND_INCR(destination, character) \
591 do { \
592 (destination)[0] = (character) & 0377; \
593 (destination)[1] = ((character) >> 8) & 0377; \
594 (destination)[2] = (character) >> 16; \
595 (destination) += 3; \
596 } while (0)
598 /* Put into DESTINATION a character stored in three contiguous bytes
599 starting at SOURCE. */
601 #define EXTRACT_CHARACTER(destination, source) \
602 do { \
603 (destination) = ((source)[0] \
604 | ((source)[1] << 8) \
605 | ((source)[2] << 16)); \
606 } while (0)
609 /* Macros for charset. */
611 /* Size of bitmap of charset P in bytes. P is a start of charset,
612 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
613 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
615 /* Nonzero if charset P has range table. */
616 #define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
618 /* Return the address of range table of charset P. But not the start
619 of table itself, but the before where the number of ranges is
620 stored. `2 +' means to skip re_opcode_t and size of bitmap. */
621 #define CHARSET_RANGE_TABLE(p) (&(p)[2 + CHARSET_BITMAP_SIZE (p)])
623 /* Test if C is listed in the bitmap of charset P. */
624 #define CHARSET_LOOKUP_BITMAP(p, c) \
625 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
626 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
628 /* Return the address of end of RANGE_TABLE. COUNT is number of
629 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
630 is start of range and end of range. `* 3' is size of each start
631 and end. */
632 #define CHARSET_RANGE_TABLE_END(range_table, count) \
633 ((range_table) + (count) * 2 * 3)
635 /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
636 COUNT is number of ranges in RANGE_TABLE. */
637 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
638 do \
640 int range_start, range_end; \
641 unsigned char *p; \
642 unsigned char *range_table_end \
643 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
645 for (p = (range_table); p < range_table_end; p += 2 * 3) \
647 EXTRACT_CHARACTER (range_start, p); \
648 EXTRACT_CHARACTER (range_end, p + 3); \
650 if (range_start <= (c) && (c) <= range_end) \
652 (not) = !(not); \
653 break; \
657 while (0)
659 /* Test if C is in range table of CHARSET. The flag NOT is negated if
660 C is listed in it. */
661 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
662 do \
664 /* Number of ranges in range table. */ \
665 int count; \
666 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
668 EXTRACT_NUMBER_AND_INCR (count, range_table); \
669 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
671 while (0)
673 /* If DEBUG is defined, Regex prints many voluminous messages about what
674 it is doing (if the variable `debug' is nonzero). If linked with the
675 main program in `iregex.c', you can enter patterns and strings
676 interactively. And if linked with the main program in `main.c' and
677 the other test files, you can run the already-written tests. */
679 #ifdef DEBUG
681 /* We use standard I/O for debugging. */
682 #include <stdio.h>
684 /* It is useful to test things that ``must'' be true when debugging. */
685 #include <assert.h>
687 static int debug = 0;
689 #define DEBUG_STATEMENT(e) e
690 #define DEBUG_PRINT1(x) if (debug) printf (x)
691 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
692 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
693 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
694 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
695 if (debug) print_partial_compiled_pattern (s, e)
696 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
697 if (debug) print_double_string (w, s1, sz1, s2, sz2)
700 /* Print the fastmap in human-readable form. */
702 void
703 print_fastmap (fastmap)
704 char *fastmap;
706 unsigned was_a_range = 0;
707 unsigned i = 0;
709 while (i < (1 << BYTEWIDTH))
711 if (fastmap[i++])
713 was_a_range = 0;
714 putchar (i - 1);
715 while (i < (1 << BYTEWIDTH) && fastmap[i])
717 was_a_range = 1;
718 i++;
720 if (was_a_range)
722 printf ("-");
723 putchar (i - 1);
727 putchar ('\n');
731 /* Print a compiled pattern string in human-readable form, starting at
732 the START pointer into it and ending just before the pointer END. */
734 void
735 print_partial_compiled_pattern (start, end)
736 unsigned char *start;
737 unsigned char *end;
739 int mcnt, mcnt2;
740 unsigned char *p = start;
741 unsigned char *pend = end;
743 if (start == NULL)
745 printf ("(null)\n");
746 return;
749 /* Loop over pattern commands. */
750 while (p < pend)
752 printf ("%d:\t", p - start);
754 switch ((re_opcode_t) *p++)
756 case no_op:
757 printf ("/no_op");
758 break;
760 case exactn:
761 mcnt = *p++;
762 printf ("/exactn/%d", mcnt);
765 putchar ('/');
766 putchar (*p++);
768 while (--mcnt);
769 break;
771 case start_memory:
772 mcnt = *p++;
773 printf ("/start_memory/%d/%d", mcnt, *p++);
774 break;
776 case stop_memory:
777 mcnt = *p++;
778 printf ("/stop_memory/%d/%d", mcnt, *p++);
779 break;
781 case duplicate:
782 printf ("/duplicate/%d", *p++);
783 break;
785 case anychar:
786 printf ("/anychar");
787 break;
789 case charset:
790 case charset_not:
792 register int c, last = -100;
793 register int in_range = 0;
795 printf ("/charset [%s",
796 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
798 assert (p + *p < pend);
800 for (c = 0; c < 256; c++)
801 if (c / 8 < *p
802 && (p[1 + (c/8)] & (1 << (c % 8))))
804 /* Are we starting a range? */
805 if (last + 1 == c && ! in_range)
807 putchar ('-');
808 in_range = 1;
810 /* Have we broken a range? */
811 else if (last + 1 != c && in_range)
813 putchar (last);
814 in_range = 0;
817 if (! in_range)
818 putchar (c);
820 last = c;
823 if (in_range)
824 putchar (last);
826 putchar (']');
828 p += 1 + *p;
830 break;
832 case begline:
833 printf ("/begline");
834 break;
836 case endline:
837 printf ("/endline");
838 break;
840 case on_failure_jump:
841 extract_number_and_incr (&mcnt, &p);
842 printf ("/on_failure_jump to %d", p + mcnt - start);
843 break;
845 case on_failure_keep_string_jump:
846 extract_number_and_incr (&mcnt, &p);
847 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
848 break;
850 case dummy_failure_jump:
851 extract_number_and_incr (&mcnt, &p);
852 printf ("/dummy_failure_jump to %d", p + mcnt - start);
853 break;
855 case push_dummy_failure:
856 printf ("/push_dummy_failure");
857 break;
859 case maybe_pop_jump:
860 extract_number_and_incr (&mcnt, &p);
861 printf ("/maybe_pop_jump to %d", p + mcnt - start);
862 break;
864 case pop_failure_jump:
865 extract_number_and_incr (&mcnt, &p);
866 printf ("/pop_failure_jump to %d", p + mcnt - start);
867 break;
869 case jump_past_alt:
870 extract_number_and_incr (&mcnt, &p);
871 printf ("/jump_past_alt to %d", p + mcnt - start);
872 break;
874 case jump:
875 extract_number_and_incr (&mcnt, &p);
876 printf ("/jump to %d", p + mcnt - start);
877 break;
879 case succeed_n:
880 extract_number_and_incr (&mcnt, &p);
881 extract_number_and_incr (&mcnt2, &p);
882 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2);
883 break;
885 case jump_n:
886 extract_number_and_incr (&mcnt, &p);
887 extract_number_and_incr (&mcnt2, &p);
888 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2);
889 break;
891 case set_number_at:
892 extract_number_and_incr (&mcnt, &p);
893 extract_number_and_incr (&mcnt2, &p);
894 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2);
895 break;
897 case wordbound:
898 printf ("/wordbound");
899 break;
901 case notwordbound:
902 printf ("/notwordbound");
903 break;
905 case wordbeg:
906 printf ("/wordbeg");
907 break;
909 case wordend:
910 printf ("/wordend");
912 #ifdef emacs
913 case before_dot:
914 printf ("/before_dot");
915 break;
917 case at_dot:
918 printf ("/at_dot");
919 break;
921 case after_dot:
922 printf ("/after_dot");
923 break;
925 case syntaxspec:
926 printf ("/syntaxspec");
927 mcnt = *p++;
928 printf ("/%d", mcnt);
929 break;
931 case notsyntaxspec:
932 printf ("/notsyntaxspec");
933 mcnt = *p++;
934 printf ("/%d", mcnt);
935 break;
936 #endif /* emacs */
938 case wordchar:
939 printf ("/wordchar");
940 break;
942 case notwordchar:
943 printf ("/notwordchar");
944 break;
946 case begbuf:
947 printf ("/begbuf");
948 break;
950 case endbuf:
951 printf ("/endbuf");
952 break;
954 default:
955 printf ("?%d", *(p-1));
958 putchar ('\n');
961 printf ("%d:\tend of pattern.\n", p - start);
965 void
966 print_compiled_pattern (bufp)
967 struct re_pattern_buffer *bufp;
969 unsigned char *buffer = bufp->buffer;
971 print_partial_compiled_pattern (buffer, buffer + bufp->used);
972 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
974 if (bufp->fastmap_accurate && bufp->fastmap)
976 printf ("fastmap: ");
977 print_fastmap (bufp->fastmap);
980 printf ("re_nsub: %d\t", bufp->re_nsub);
981 printf ("regs_alloc: %d\t", bufp->regs_allocated);
982 printf ("can_be_null: %d\t", bufp->can_be_null);
983 printf ("newline_anchor: %d\n", bufp->newline_anchor);
984 printf ("no_sub: %d\t", bufp->no_sub);
985 printf ("not_bol: %d\t", bufp->not_bol);
986 printf ("not_eol: %d\t", bufp->not_eol);
987 printf ("syntax: %d\n", bufp->syntax);
988 /* Perhaps we should print the translate table? */
992 void
993 print_double_string (where, string1, size1, string2, size2)
994 const char *where;
995 const char *string1;
996 const char *string2;
997 int size1;
998 int size2;
1000 unsigned this_char;
1002 if (where == NULL)
1003 printf ("(null)");
1004 else
1006 if (FIRST_STRING_P (where))
1008 for (this_char = where - string1; this_char < size1; this_char++)
1009 putchar (string1[this_char]);
1011 where = string2;
1014 for (this_char = where - string2; this_char < size2; this_char++)
1015 putchar (string2[this_char]);
1019 #else /* not DEBUG */
1021 #undef assert
1022 #define assert(e)
1024 #define DEBUG_STATEMENT(e)
1025 #define DEBUG_PRINT1(x)
1026 #define DEBUG_PRINT2(x1, x2)
1027 #define DEBUG_PRINT3(x1, x2, x3)
1028 #define DEBUG_PRINT4(x1, x2, x3, x4)
1029 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1030 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1032 #endif /* not DEBUG */
1034 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1035 also be assigned to arbitrarily: each pattern buffer stores its own
1036 syntax, so it can be changed between regex compilations. */
1037 /* This has no initializer because initialized variables in Emacs
1038 become read-only after dumping. */
1039 reg_syntax_t re_syntax_options;
1042 /* Specify the precise syntax of regexps for compilation. This provides
1043 for compatibility for various utilities which historically have
1044 different, incompatible syntaxes.
1046 The argument SYNTAX is a bit mask comprised of the various bits
1047 defined in regex.h. We return the old syntax. */
1049 reg_syntax_t
1050 re_set_syntax (syntax)
1051 reg_syntax_t syntax;
1053 reg_syntax_t ret = re_syntax_options;
1055 re_syntax_options = syntax;
1056 return ret;
1059 /* This table gives an error message for each of the error codes listed
1060 in regex.h. Obviously the order here has to be same as there.
1061 POSIX doesn't require that we do anything for REG_NOERROR,
1062 but why not be nice? */
1064 static const char *re_error_msgid[] =
1066 gettext_noop ("Success"), /* REG_NOERROR */
1067 gettext_noop ("No match"), /* REG_NOMATCH */
1068 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1069 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1070 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1071 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1072 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1073 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1074 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1075 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1076 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1077 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1078 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1079 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1080 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1081 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1082 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1085 /* Avoiding alloca during matching, to placate r_alloc. */
1087 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1088 searching and matching functions should not call alloca. On some
1089 systems, alloca is implemented in terms of malloc, and if we're
1090 using the relocating allocator routines, then malloc could cause a
1091 relocation, which might (if the strings being searched are in the
1092 ralloc heap) shift the data out from underneath the regexp
1093 routines.
1095 Here's another reason to avoid allocation: Emacs
1096 processes input from X in a signal handler; processing X input may
1097 call malloc; if input arrives while a matching routine is calling
1098 malloc, then we're scrod. But Emacs can't just block input while
1099 calling matching routines; then we don't notice interrupts when
1100 they come in. So, Emacs blocks input around all regexp calls
1101 except the matching calls, which it leaves unprotected, in the
1102 faith that they will not malloc. */
1104 /* Normally, this is fine. */
1105 #define MATCH_MAY_ALLOCATE
1107 /* When using GNU C, we are not REALLY using the C alloca, no matter
1108 what config.h may say. So don't take precautions for it. */
1109 #ifdef __GNUC__
1110 #undef C_ALLOCA
1111 #endif
1113 /* The match routines may not allocate if (1) they would do it with malloc
1114 and (2) it's not safe for them to use malloc.
1115 Note that if REL_ALLOC is defined, matching would not use malloc for the
1116 failure stack, but we would still use it for the register vectors;
1117 so REL_ALLOC should not affect this. */
1118 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1119 #undef MATCH_MAY_ALLOCATE
1120 #endif
1123 /* Failure stack declarations and macros; both re_compile_fastmap and
1124 re_match_2 use a failure stack. These have to be macros because of
1125 REGEX_ALLOCATE_STACK. */
1128 /* Approximate number of failure points for which to initially allocate space
1129 when matching. If this number is exceeded, we allocate more
1130 space, so it is not a hard limit. */
1131 #ifndef INIT_FAILURE_ALLOC
1132 #define INIT_FAILURE_ALLOC 20
1133 #endif
1135 /* Roughly the maximum number of failure points on the stack. Would be
1136 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
1137 This is a variable only so users of regex can assign to it; we never
1138 change it ourselves. */
1139 #if defined (MATCH_MAY_ALLOCATE)
1140 /* Note that 4400 is enough to cause a crash on Alpha OSF/1,
1141 whose default stack limit is 2mb. In order for a larger
1142 value to work reliably, you have to try to make it accord
1143 with the process stack limit. */
1144 int re_max_failures = 40000;
1145 #else
1146 int re_max_failures = 4000;
1147 #endif
1149 union fail_stack_elt
1151 unsigned char *pointer;
1152 int integer;
1155 typedef union fail_stack_elt fail_stack_elt_t;
1157 typedef struct
1159 fail_stack_elt_t *stack;
1160 unsigned size;
1161 unsigned avail; /* Offset of next open position. */
1162 } fail_stack_type;
1164 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1165 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1166 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1169 /* Define macros to initialize and free the failure stack.
1170 Do `return -2' if the alloc fails. */
1172 #ifdef MATCH_MAY_ALLOCATE
1173 #define INIT_FAIL_STACK() \
1174 do { \
1175 fail_stack.stack = (fail_stack_elt_t *) \
1176 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1177 * sizeof (fail_stack_elt_t)); \
1179 if (fail_stack.stack == NULL) \
1180 return -2; \
1182 fail_stack.size = INIT_FAILURE_ALLOC; \
1183 fail_stack.avail = 0; \
1184 } while (0)
1186 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1187 #else
1188 #define INIT_FAIL_STACK() \
1189 do { \
1190 fail_stack.avail = 0; \
1191 } while (0)
1193 #define RESET_FAIL_STACK()
1194 #endif
1197 /* Double the size of FAIL_STACK, up to a limit
1198 which allows approximately `re_max_failures' items.
1200 Return 1 if succeeds, and 0 if either ran out of memory
1201 allocating space for it or it was already too large.
1203 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1205 /* Factor to increase the failure stack size by
1206 when we increase it.
1207 This used to be 2, but 2 was too wasteful
1208 because the old discarded stacks added up to as much space
1209 were as ultimate, maximum-size stack. */
1210 #define FAIL_STACK_GROWTH_FACTOR 4
1212 #define GROW_FAIL_STACK(fail_stack) \
1213 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1214 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
1215 ? 0 \
1216 : ((fail_stack).stack \
1217 = (fail_stack_elt_t *) \
1218 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1219 (fail_stack).size * sizeof (fail_stack_elt_t), \
1220 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1221 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1222 * FAIL_STACK_GROWTH_FACTOR))), \
1224 (fail_stack).stack == NULL \
1225 ? 0 \
1226 : ((fail_stack).size \
1227 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1228 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1229 * FAIL_STACK_GROWTH_FACTOR)) \
1230 / sizeof (fail_stack_elt_t)), \
1231 1)))
1234 /* Push pointer POINTER on FAIL_STACK.
1235 Return 1 if was able to do so and 0 if ran out of memory allocating
1236 space to do so. */
1237 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1238 ((FAIL_STACK_FULL () \
1239 && !GROW_FAIL_STACK (FAIL_STACK)) \
1240 ? 0 \
1241 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1244 /* Push a pointer value onto the failure stack.
1245 Assumes the variable `fail_stack'. Probably should only
1246 be called from within `PUSH_FAILURE_POINT'. */
1247 #define PUSH_FAILURE_POINTER(item) \
1248 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1250 /* This pushes an integer-valued item onto the failure stack.
1251 Assumes the variable `fail_stack'. Probably should only
1252 be called from within `PUSH_FAILURE_POINT'. */
1253 #define PUSH_FAILURE_INT(item) \
1254 fail_stack.stack[fail_stack.avail++].integer = (item)
1256 /* Push a fail_stack_elt_t value onto the failure stack.
1257 Assumes the variable `fail_stack'. Probably should only
1258 be called from within `PUSH_FAILURE_POINT'. */
1259 #define PUSH_FAILURE_ELT(item) \
1260 fail_stack.stack[fail_stack.avail++] = (item)
1262 /* These three POP... operations complement the three PUSH... operations.
1263 All assume that `fail_stack' is nonempty. */
1264 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1265 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1266 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1268 /* Used to omit pushing failure point id's when we're not debugging. */
1269 #ifdef DEBUG
1270 #define DEBUG_PUSH PUSH_FAILURE_INT
1271 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1272 #else
1273 #define DEBUG_PUSH(item)
1274 #define DEBUG_POP(item_addr)
1275 #endif
1278 /* Push the information about the state we will need
1279 if we ever fail back to it.
1281 Requires variables fail_stack, regstart, regend, reg_info, and
1282 num_regs be declared. GROW_FAIL_STACK requires `destination' be
1283 declared.
1285 Does `return FAILURE_CODE' if runs out of memory. */
1287 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1288 do { \
1289 char *destination; \
1290 /* Must be int, so when we don't save any registers, the arithmetic \
1291 of 0 + -1 isn't done as unsigned. */ \
1292 int this_reg; \
1294 DEBUG_STATEMENT (failure_id++); \
1295 DEBUG_STATEMENT (nfailure_points_pushed++); \
1296 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1297 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1298 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1300 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1301 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1303 /* Ensure we have enough space allocated for what we will push. */ \
1304 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1306 if (!GROW_FAIL_STACK (fail_stack)) \
1307 return failure_code; \
1309 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1310 (fail_stack).size); \
1311 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1314 /* Push the info, starting with the registers. */ \
1315 DEBUG_PRINT1 ("\n"); \
1317 if (1) \
1318 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1319 this_reg++) \
1321 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1322 DEBUG_STATEMENT (num_regs_pushed++); \
1324 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1325 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1327 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1328 PUSH_FAILURE_POINTER (regend[this_reg]); \
1330 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
1331 DEBUG_PRINT2 (" match_null=%d", \
1332 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1333 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1334 DEBUG_PRINT2 (" matched_something=%d", \
1335 MATCHED_SOMETHING (reg_info[this_reg])); \
1336 DEBUG_PRINT2 (" ever_matched=%d", \
1337 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1338 DEBUG_PRINT1 ("\n"); \
1339 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1342 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
1343 PUSH_FAILURE_INT (lowest_active_reg); \
1345 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
1346 PUSH_FAILURE_INT (highest_active_reg); \
1348 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
1349 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1350 PUSH_FAILURE_POINTER (pattern_place); \
1352 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
1353 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1354 size2); \
1355 DEBUG_PRINT1 ("'\n"); \
1356 PUSH_FAILURE_POINTER (string_place); \
1358 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1359 DEBUG_PUSH (failure_id); \
1360 } while (0)
1362 /* This is the number of items that are pushed and popped on the stack
1363 for each register. */
1364 #define NUM_REG_ITEMS 3
1366 /* Individual items aside from the registers. */
1367 #ifdef DEBUG
1368 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1369 #else
1370 #define NUM_NONREG_ITEMS 4
1371 #endif
1373 /* Estimate the size of data pushed by a typical failure stack entry.
1374 An estimate is all we need, because all we use this for
1375 is to choose a limit for how big to make the failure stack. */
1377 #define TYPICAL_FAILURE_SIZE 20
1379 /* This is how many items we actually use for a failure point.
1380 It depends on the regexp. */
1381 #define NUM_FAILURE_ITEMS \
1382 (((0 \
1383 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1384 * NUM_REG_ITEMS) \
1385 + NUM_NONREG_ITEMS)
1387 /* How many items can still be added to the stack without overflowing it. */
1388 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1391 /* Pops what PUSH_FAIL_STACK pushes.
1393 We restore into the parameters, all of which should be lvalues:
1394 STR -- the saved data position.
1395 PAT -- the saved pattern position.
1396 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1397 REGSTART, REGEND -- arrays of string positions.
1398 REG_INFO -- array of information about each subexpression.
1400 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1401 `pend', `string1', `size1', `string2', and `size2'. */
1403 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1405 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
1406 int this_reg; \
1407 const unsigned char *string_temp; \
1409 assert (!FAIL_STACK_EMPTY ()); \
1411 /* Remove failure points and point to how many regs pushed. */ \
1412 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1413 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1414 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1416 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1418 DEBUG_POP (&failure_id); \
1419 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1421 /* If the saved string location is NULL, it came from an \
1422 on_failure_keep_string_jump opcode, and we want to throw away the \
1423 saved NULL, thus retaining our current position in the string. */ \
1424 string_temp = POP_FAILURE_POINTER (); \
1425 if (string_temp != NULL) \
1426 str = (const char *) string_temp; \
1428 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
1429 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1430 DEBUG_PRINT1 ("'\n"); \
1432 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1433 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
1434 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1436 /* Restore register info. */ \
1437 high_reg = (unsigned) POP_FAILURE_INT (); \
1438 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1440 low_reg = (unsigned) POP_FAILURE_INT (); \
1441 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1443 if (1) \
1444 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1446 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1448 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1449 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
1451 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1452 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1454 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1455 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1457 else \
1459 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1461 reg_info[this_reg].word.integer = 0; \
1462 regend[this_reg] = 0; \
1463 regstart[this_reg] = 0; \
1465 highest_active_reg = high_reg; \
1468 set_regs_matched_done = 0; \
1469 DEBUG_STATEMENT (nfailure_points_popped++); \
1470 } /* POP_FAILURE_POINT */
1474 /* Structure for per-register (a.k.a. per-group) information.
1475 Other register information, such as the
1476 starting and ending positions (which are addresses), and the list of
1477 inner groups (which is a bits list) are maintained in separate
1478 variables.
1480 We are making a (strictly speaking) nonportable assumption here: that
1481 the compiler will pack our bit fields into something that fits into
1482 the type of `word', i.e., is something that fits into one item on the
1483 failure stack. */
1485 typedef union
1487 fail_stack_elt_t word;
1488 struct
1490 /* This field is one if this group can match the empty string,
1491 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1492 #define MATCH_NULL_UNSET_VALUE 3
1493 unsigned match_null_string_p : 2;
1494 unsigned is_active : 1;
1495 unsigned matched_something : 1;
1496 unsigned ever_matched_something : 1;
1497 } bits;
1498 } register_info_type;
1500 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1501 #define IS_ACTIVE(R) ((R).bits.is_active)
1502 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1503 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1506 /* Call this when have matched a real character; it sets `matched' flags
1507 for the subexpressions which we are currently inside. Also records
1508 that those subexprs have matched. */
1509 #define SET_REGS_MATCHED() \
1510 do \
1512 if (!set_regs_matched_done) \
1514 unsigned r; \
1515 set_regs_matched_done = 1; \
1516 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1518 MATCHED_SOMETHING (reg_info[r]) \
1519 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1520 = 1; \
1524 while (0)
1526 /* Registers are set to a sentinel when they haven't yet matched. */
1527 static char reg_unset_dummy;
1528 #define REG_UNSET_VALUE (&reg_unset_dummy)
1529 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1531 /* Subroutine declarations and macros for regex_compile. */
1533 static void store_op1 (), store_op2 ();
1534 static void insert_op1 (), insert_op2 ();
1535 static boolean at_begline_loc_p (), at_endline_loc_p ();
1536 static boolean group_in_compile_stack ();
1538 /* Fetch the next character in the uncompiled pattern---translating it
1539 if necessary. Also cast from a signed character in the constant
1540 string passed to us by the user to an unsigned char that we can use
1541 as an array index (in, e.g., `translate'). */
1542 #ifndef PATFETCH
1543 #define PATFETCH(c) \
1544 do {if (p == pend) return REG_EEND; \
1545 c = (unsigned char) *p++; \
1546 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \
1547 } while (0)
1548 #endif
1550 /* Fetch the next character in the uncompiled pattern, with no
1551 translation. */
1552 #define PATFETCH_RAW(c) \
1553 do {if (p == pend) return REG_EEND; \
1554 c = (unsigned char) *p++; \
1555 } while (0)
1557 /* Go backwards one character in the pattern. */
1558 #define PATUNFETCH p--
1561 /* If `translate' is non-null, return translate[D], else just D. We
1562 cast the subscript to translate because some data is declared as
1563 `char *', to avoid warnings when a string constant is passed. But
1564 when we use a character as a subscript we must make it unsigned. */
1565 #ifndef TRANSLATE
1566 #define TRANSLATE(d) \
1567 (RE_TRANSLATE_P (translate) \
1568 ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d))
1569 #endif
1572 /* Macros for outputting the compiled pattern into `buffer'. */
1574 /* If the buffer isn't allocated when it comes in, use this. */
1575 #define INIT_BUF_SIZE 32
1577 /* Make sure we have at least N more bytes of space in buffer. */
1578 #define GET_BUFFER_SPACE(n) \
1579 while (b - bufp->buffer + (n) > bufp->allocated) \
1580 EXTEND_BUFFER ()
1582 /* Make sure we have one more byte of buffer space and then add C to it. */
1583 #define BUF_PUSH(c) \
1584 do { \
1585 GET_BUFFER_SPACE (1); \
1586 *b++ = (unsigned char) (c); \
1587 } while (0)
1590 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1591 #define BUF_PUSH_2(c1, c2) \
1592 do { \
1593 GET_BUFFER_SPACE (2); \
1594 *b++ = (unsigned char) (c1); \
1595 *b++ = (unsigned char) (c2); \
1596 } while (0)
1599 /* As with BUF_PUSH_2, except for three bytes. */
1600 #define BUF_PUSH_3(c1, c2, c3) \
1601 do { \
1602 GET_BUFFER_SPACE (3); \
1603 *b++ = (unsigned char) (c1); \
1604 *b++ = (unsigned char) (c2); \
1605 *b++ = (unsigned char) (c3); \
1606 } while (0)
1609 /* Store a jump with opcode OP at LOC to location TO. We store a
1610 relative address offset by the three bytes the jump itself occupies. */
1611 #define STORE_JUMP(op, loc, to) \
1612 store_op1 (op, loc, (to) - (loc) - 3)
1614 /* Likewise, for a two-argument jump. */
1615 #define STORE_JUMP2(op, loc, to, arg) \
1616 store_op2 (op, loc, (to) - (loc) - 3, arg)
1618 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1619 #define INSERT_JUMP(op, loc, to) \
1620 insert_op1 (op, loc, (to) - (loc) - 3, b)
1622 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1623 #define INSERT_JUMP2(op, loc, to, arg) \
1624 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1627 /* This is not an arbitrary limit: the arguments which represent offsets
1628 into the pattern are two bytes long. So if 2^16 bytes turns out to
1629 be too small, many things would have to change. */
1630 #define MAX_BUF_SIZE (1L << 16)
1633 /* Extend the buffer by twice its current size via realloc and
1634 reset the pointers that pointed into the old block to point to the
1635 correct places in the new one. If extending the buffer results in it
1636 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1637 #define EXTEND_BUFFER() \
1638 do { \
1639 unsigned char *old_buffer = bufp->buffer; \
1640 if (bufp->allocated == MAX_BUF_SIZE) \
1641 return REG_ESIZE; \
1642 bufp->allocated <<= 1; \
1643 if (bufp->allocated > MAX_BUF_SIZE) \
1644 bufp->allocated = MAX_BUF_SIZE; \
1645 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1646 if (bufp->buffer == NULL) \
1647 return REG_ESPACE; \
1648 /* If the buffer moved, move all the pointers into it. */ \
1649 if (old_buffer != bufp->buffer) \
1651 b = (b - old_buffer) + bufp->buffer; \
1652 begalt = (begalt - old_buffer) + bufp->buffer; \
1653 if (fixup_alt_jump) \
1654 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1655 if (laststart) \
1656 laststart = (laststart - old_buffer) + bufp->buffer; \
1657 if (pending_exact) \
1658 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1660 } while (0)
1663 /* Since we have one byte reserved for the register number argument to
1664 {start,stop}_memory, the maximum number of groups we can report
1665 things about is what fits in that byte. */
1666 #define MAX_REGNUM 255
1668 /* But patterns can have more than `MAX_REGNUM' registers. We just
1669 ignore the excess. */
1670 typedef unsigned regnum_t;
1673 /* Macros for the compile stack. */
1675 /* Since offsets can go either forwards or backwards, this type needs to
1676 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1677 typedef int pattern_offset_t;
1679 typedef struct
1681 pattern_offset_t begalt_offset;
1682 pattern_offset_t fixup_alt_jump;
1683 pattern_offset_t inner_group_offset;
1684 pattern_offset_t laststart_offset;
1685 regnum_t regnum;
1686 } compile_stack_elt_t;
1689 typedef struct
1691 compile_stack_elt_t *stack;
1692 unsigned size;
1693 unsigned avail; /* Offset of next open position. */
1694 } compile_stack_type;
1697 #define INIT_COMPILE_STACK_SIZE 32
1699 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1700 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1702 /* The next available element. */
1703 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1706 /* Structure to manage work area for range table. */
1707 struct range_table_work_area
1709 int *table; /* actual work area. */
1710 int allocated; /* allocated size for work area in bytes. */
1711 int used; /* actually used size in words. */
1714 /* Make sure that WORK_AREA can hold more N multibyte characters. */
1715 #define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1716 do { \
1717 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1719 (work_area).allocated += 16 * sizeof (int); \
1720 if ((work_area).table) \
1721 (work_area).table \
1722 = (int *) realloc ((work_area).table, (work_area).allocated); \
1723 else \
1724 (work_area).table \
1725 = (int *) malloc ((work_area).allocated); \
1726 if ((work_area).table == 0) \
1727 FREE_STACK_RETURN (REG_ESPACE); \
1729 } while (0)
1731 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1732 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1733 do { \
1734 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1735 (work_area).table[(work_area).used++] = (range_start); \
1736 (work_area).table[(work_area).used++] = (range_end); \
1737 } while (0)
1739 /* Free allocated memory for WORK_AREA. */
1740 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1741 do { \
1742 if ((work_area).table) \
1743 free ((work_area).table); \
1744 } while (0)
1746 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0)
1747 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1748 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1751 /* Set the bit for character C in a list. */
1752 #define SET_LIST_BIT(c) \
1753 (b[((unsigned char) (c)) / BYTEWIDTH] \
1754 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1757 /* Get the next unsigned number in the uncompiled pattern. */
1758 #define GET_UNSIGNED_NUMBER(num) \
1759 { if (p != pend) \
1761 PATFETCH (c); \
1762 while (ISDIGIT (c)) \
1764 if (num < 0) \
1765 num = 0; \
1766 num = num * 10 + c - '0'; \
1767 if (p == pend) \
1768 break; \
1769 PATFETCH (c); \
1774 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1776 #define IS_CHAR_CLASS(string) \
1777 (STREQ (string, "alpha") || STREQ (string, "upper") \
1778 || STREQ (string, "lower") || STREQ (string, "digit") \
1779 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1780 || STREQ (string, "space") || STREQ (string, "print") \
1781 || STREQ (string, "punct") || STREQ (string, "graph") \
1782 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1784 #ifndef MATCH_MAY_ALLOCATE
1786 /* If we cannot allocate large objects within re_match_2_internal,
1787 we make the fail stack and register vectors global.
1788 The fail stack, we grow to the maximum size when a regexp
1789 is compiled.
1790 The register vectors, we adjust in size each time we
1791 compile a regexp, according to the number of registers it needs. */
1793 static fail_stack_type fail_stack;
1795 /* Size with which the following vectors are currently allocated.
1796 That is so we can make them bigger as needed,
1797 but never make them smaller. */
1798 static int regs_allocated_size;
1800 static const char ** regstart, ** regend;
1801 static const char ** old_regstart, ** old_regend;
1802 static const char **best_regstart, **best_regend;
1803 static register_info_type *reg_info;
1804 static const char **reg_dummy;
1805 static register_info_type *reg_info_dummy;
1807 /* Make the register vectors big enough for NUM_REGS registers,
1808 but don't make them smaller. */
1810 static
1811 regex_grow_registers (num_regs)
1812 int num_regs;
1814 if (num_regs > regs_allocated_size)
1816 RETALLOC_IF (regstart, num_regs, const char *);
1817 RETALLOC_IF (regend, num_regs, const char *);
1818 RETALLOC_IF (old_regstart, num_regs, const char *);
1819 RETALLOC_IF (old_regend, num_regs, const char *);
1820 RETALLOC_IF (best_regstart, num_regs, const char *);
1821 RETALLOC_IF (best_regend, num_regs, const char *);
1822 RETALLOC_IF (reg_info, num_regs, register_info_type);
1823 RETALLOC_IF (reg_dummy, num_regs, const char *);
1824 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1826 regs_allocated_size = num_regs;
1830 #endif /* not MATCH_MAY_ALLOCATE */
1832 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1833 Returns one of error codes defined in `regex.h', or zero for success.
1835 Assumes the `allocated' (and perhaps `buffer') and `translate'
1836 fields are set in BUFP on entry.
1838 If it succeeds, results are put in BUFP (if it returns an error, the
1839 contents of BUFP are undefined):
1840 `buffer' is the compiled pattern;
1841 `syntax' is set to SYNTAX;
1842 `used' is set to the length of the compiled pattern;
1843 `fastmap_accurate' is zero;
1844 `re_nsub' is the number of subexpressions in PATTERN;
1845 `not_bol' and `not_eol' are zero;
1847 The `fastmap' and `newline_anchor' fields are neither
1848 examined nor set. */
1850 /* Return, freeing storage we allocated. */
1851 #define FREE_STACK_RETURN(value) \
1852 do { \
1853 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1854 free (compile_stack.stack); \
1855 return value; \
1856 } while (0)
1858 static reg_errcode_t
1859 regex_compile (pattern, size, syntax, bufp)
1860 const char *pattern;
1861 int size;
1862 reg_syntax_t syntax;
1863 struct re_pattern_buffer *bufp;
1865 /* We fetch characters from PATTERN here. Even though PATTERN is
1866 `char *' (i.e., signed), we declare these variables as unsigned, so
1867 they can be reliably used as array indices. */
1868 register unsigned int c, c1;
1870 /* A random temporary spot in PATTERN. */
1871 const char *p1;
1873 /* Points to the end of the buffer, where we should append. */
1874 register unsigned char *b;
1876 /* Keeps track of unclosed groups. */
1877 compile_stack_type compile_stack;
1879 /* Points to the current (ending) position in the pattern. */
1880 #ifdef AIX
1881 /* `const' makes AIX compiler fail. */
1882 char *p = pattern;
1883 #else
1884 const char *p = pattern;
1885 #endif
1886 const char *pend = pattern + size;
1888 /* How to translate the characters in the pattern. */
1889 RE_TRANSLATE_TYPE translate = bufp->translate;
1891 /* Address of the count-byte of the most recently inserted `exactn'
1892 command. This makes it possible to tell if a new exact-match
1893 character can be added to that command or if the character requires
1894 a new `exactn' command. */
1895 unsigned char *pending_exact = 0;
1897 /* Address of start of the most recently finished expression.
1898 This tells, e.g., postfix * where to find the start of its
1899 operand. Reset at the beginning of groups and alternatives. */
1900 unsigned char *laststart = 0;
1902 /* Address of beginning of regexp, or inside of last group. */
1903 unsigned char *begalt;
1905 /* Place in the uncompiled pattern (i.e., the {) to
1906 which to go back if the interval is invalid. */
1907 const char *beg_interval;
1909 /* Address of the place where a forward jump should go to the end of
1910 the containing expression. Each alternative of an `or' -- except the
1911 last -- ends with a forward jump of this sort. */
1912 unsigned char *fixup_alt_jump = 0;
1914 /* Counts open-groups as they are encountered. Remembered for the
1915 matching close-group on the compile stack, so the same register
1916 number is put in the stop_memory as the start_memory. */
1917 regnum_t regnum = 0;
1919 /* Work area for range table of charset. */
1920 struct range_table_work_area range_table_work;
1922 #ifdef DEBUG
1923 DEBUG_PRINT1 ("\nCompiling pattern: ");
1924 if (debug)
1926 unsigned debug_count;
1928 for (debug_count = 0; debug_count < size; debug_count++)
1929 putchar (pattern[debug_count]);
1930 putchar ('\n');
1932 #endif /* DEBUG */
1934 /* Initialize the compile stack. */
1935 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1936 if (compile_stack.stack == NULL)
1937 return REG_ESPACE;
1939 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1940 compile_stack.avail = 0;
1942 range_table_work.table = 0;
1943 range_table_work.allocated = 0;
1945 /* Initialize the pattern buffer. */
1946 bufp->syntax = syntax;
1947 bufp->fastmap_accurate = 0;
1948 bufp->not_bol = bufp->not_eol = 0;
1950 /* Set `used' to zero, so that if we return an error, the pattern
1951 printer (for debugging) will think there's no pattern. We reset it
1952 at the end. */
1953 bufp->used = 0;
1955 /* Always count groups, whether or not bufp->no_sub is set. */
1956 bufp->re_nsub = 0;
1958 #ifdef emacs
1959 /* bufp->multibyte is set before regex_compile is called, so don't alter
1960 it. */
1961 #else /* not emacs */
1962 /* Nothing is recognized as a multibyte character. */
1963 bufp->multibyte = 0;
1964 #endif
1966 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1967 /* Initialize the syntax table. */
1968 init_syntax_once ();
1969 #endif
1971 if (bufp->allocated == 0)
1973 if (bufp->buffer)
1974 { /* If zero allocated, but buffer is non-null, try to realloc
1975 enough space. This loses if buffer's address is bogus, but
1976 that is the user's responsibility. */
1977 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1979 else
1980 { /* Caller did not allocate a buffer. Do it for them. */
1981 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1983 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1985 bufp->allocated = INIT_BUF_SIZE;
1988 begalt = b = bufp->buffer;
1990 /* Loop through the uncompiled pattern until we're at the end. */
1991 while (p != pend)
1993 PATFETCH (c);
1995 switch (c)
1997 case '^':
1999 if ( /* If at start of pattern, it's an operator. */
2000 p == pattern + 1
2001 /* If context independent, it's an operator. */
2002 || syntax & RE_CONTEXT_INDEP_ANCHORS
2003 /* Otherwise, depends on what's come before. */
2004 || at_begline_loc_p (pattern, p, syntax))
2005 BUF_PUSH (begline);
2006 else
2007 goto normal_char;
2009 break;
2012 case '$':
2014 if ( /* If at end of pattern, it's an operator. */
2015 p == pend
2016 /* If context independent, it's an operator. */
2017 || syntax & RE_CONTEXT_INDEP_ANCHORS
2018 /* Otherwise, depends on what's next. */
2019 || at_endline_loc_p (p, pend, syntax))
2020 BUF_PUSH (endline);
2021 else
2022 goto normal_char;
2024 break;
2027 case '+':
2028 case '?':
2029 if ((syntax & RE_BK_PLUS_QM)
2030 || (syntax & RE_LIMITED_OPS))
2031 goto normal_char;
2032 handle_plus:
2033 case '*':
2034 /* If there is no previous pattern... */
2035 if (!laststart)
2037 if (syntax & RE_CONTEXT_INVALID_OPS)
2038 FREE_STACK_RETURN (REG_BADRPT);
2039 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2040 goto normal_char;
2044 /* Are we optimizing this jump? */
2045 boolean keep_string_p = false;
2047 /* 1 means zero (many) matches is allowed. */
2048 char zero_times_ok = 0, many_times_ok = 0;
2050 /* If there is a sequence of repetition chars, collapse it
2051 down to just one (the right one). We can't combine
2052 interval operators with these because of, e.g., `a{2}*',
2053 which should only match an even number of `a's. */
2055 for (;;)
2057 zero_times_ok |= c != '+';
2058 many_times_ok |= c != '?';
2060 if (p == pend)
2061 break;
2063 PATFETCH (c);
2065 if (c == '*'
2066 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2069 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2071 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2073 PATFETCH (c1);
2074 if (!(c1 == '+' || c1 == '?'))
2076 PATUNFETCH;
2077 PATUNFETCH;
2078 break;
2081 c = c1;
2083 else
2085 PATUNFETCH;
2086 break;
2089 /* If we get here, we found another repeat character. */
2092 /* Star, etc. applied to an empty pattern is equivalent
2093 to an empty pattern. */
2094 if (!laststart)
2095 break;
2097 /* Now we know whether or not zero matches is allowed
2098 and also whether or not two or more matches is allowed. */
2099 if (many_times_ok)
2100 { /* More than one repetition is allowed, so put in at the
2101 end a backward relative jump from `b' to before the next
2102 jump we're going to put in below (which jumps from
2103 laststart to after this jump).
2105 But if we are at the `*' in the exact sequence `.*\n',
2106 insert an unconditional jump backwards to the .,
2107 instead of the beginning of the loop. This way we only
2108 push a failure point once, instead of every time
2109 through the loop. */
2110 assert (p - 1 > pattern);
2112 /* Allocate the space for the jump. */
2113 GET_BUFFER_SPACE (3);
2115 /* We know we are not at the first character of the pattern,
2116 because laststart was nonzero. And we've already
2117 incremented `p', by the way, to be the character after
2118 the `*'. Do we have to do something analogous here
2119 for null bytes, because of RE_DOT_NOT_NULL? */
2120 if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.')
2121 && zero_times_ok
2122 && p < pend
2123 && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n')
2124 && !(syntax & RE_DOT_NEWLINE))
2125 { /* We have .*\n. */
2126 STORE_JUMP (jump, b, laststart);
2127 keep_string_p = true;
2129 else
2130 /* Anything else. */
2131 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
2133 /* We've added more stuff to the buffer. */
2134 b += 3;
2137 /* On failure, jump from laststart to b + 3, which will be the
2138 end of the buffer after this jump is inserted. */
2139 GET_BUFFER_SPACE (3);
2140 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2141 : on_failure_jump,
2142 laststart, b + 3);
2143 pending_exact = 0;
2144 b += 3;
2146 if (!zero_times_ok)
2148 /* At least one repetition is required, so insert a
2149 `dummy_failure_jump' before the initial
2150 `on_failure_jump' instruction of the loop. This
2151 effects a skip over that instruction the first time
2152 we hit that loop. */
2153 GET_BUFFER_SPACE (3);
2154 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2155 b += 3;
2158 break;
2161 case '.':
2162 laststart = b;
2163 BUF_PUSH (anychar);
2164 break;
2167 case '[':
2169 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2171 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2173 /* Ensure that we have enough space to push a charset: the
2174 opcode, the length count, and the bitset; 34 bytes in all. */
2175 GET_BUFFER_SPACE (34);
2177 laststart = b;
2179 /* We test `*p == '^' twice, instead of using an if
2180 statement, so we only need one BUF_PUSH. */
2181 BUF_PUSH (*p == '^' ? charset_not : charset);
2182 if (*p == '^')
2183 p++;
2185 /* Remember the first position in the bracket expression. */
2186 p1 = p;
2188 /* Push the number of bytes in the bitmap. */
2189 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2191 /* Clear the whole map. */
2192 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
2194 /* charset_not matches newline according to a syntax bit. */
2195 if ((re_opcode_t) b[-2] == charset_not
2196 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2197 SET_LIST_BIT ('\n');
2199 /* Read in characters and ranges, setting map bits. */
2200 for (;;)
2202 int len;
2203 boolean escaped_char = false;
2205 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2207 PATFETCH (c);
2209 /* \ might escape characters inside [...] and [^...]. */
2210 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2212 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2214 PATFETCH (c);
2215 escaped_char = true;
2217 else
2219 /* Could be the end of the bracket expression. If it's
2220 not (i.e., when the bracket expression is `[]' so
2221 far), the ']' character bit gets set way below. */
2222 if (c == ']' && p != p1 + 1)
2223 break;
2226 /* If C indicates start of multibyte char, get the
2227 actual character code in C, and set the pattern
2228 pointer P to the next character boundary. */
2229 if (bufp->multibyte && BASE_LEADING_CODE_P (c))
2231 PATUNFETCH;
2232 c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2233 p += len;
2235 /* What should we do for the character which is
2236 greater than 0x7F, but not BASE_LEADING_CODE_P?
2237 XXX */
2239 /* See if we're at the beginning of a possible character
2240 class. */
2242 else if (!escaped_char &&
2243 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2245 /* Leave room for the null. */
2246 char str[CHAR_CLASS_MAX_LENGTH + 1];
2248 PATFETCH (c);
2249 c1 = 0;
2251 /* If pattern is `[[:'. */
2252 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2254 for (;;)
2256 PATFETCH (c);
2257 if (c == ':' || c == ']' || p == pend
2258 || c1 == CHAR_CLASS_MAX_LENGTH)
2259 break;
2260 str[c1++] = c;
2262 str[c1] = '\0';
2264 /* If isn't a word bracketed by `[:' and `:]':
2265 undo the ending character, the letters, and
2266 leave the leading `:' and `[' (but set bits for
2267 them). */
2268 if (c == ':' && *p == ']')
2270 int ch;
2271 boolean is_alnum = STREQ (str, "alnum");
2272 boolean is_alpha = STREQ (str, "alpha");
2273 boolean is_blank = STREQ (str, "blank");
2274 boolean is_cntrl = STREQ (str, "cntrl");
2275 boolean is_digit = STREQ (str, "digit");
2276 boolean is_graph = STREQ (str, "graph");
2277 boolean is_lower = STREQ (str, "lower");
2278 boolean is_print = STREQ (str, "print");
2279 boolean is_punct = STREQ (str, "punct");
2280 boolean is_space = STREQ (str, "space");
2281 boolean is_upper = STREQ (str, "upper");
2282 boolean is_xdigit = STREQ (str, "xdigit");
2284 if (!IS_CHAR_CLASS (str))
2285 FREE_STACK_RETURN (REG_ECTYPE);
2287 /* Throw away the ] at the end of the character
2288 class. */
2289 PATFETCH (c);
2291 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2293 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2295 int translated = TRANSLATE (ch);
2296 /* This was split into 3 if's to
2297 avoid an arbitrary limit in some compiler. */
2298 if ( (is_alnum && ISALNUM (ch))
2299 || (is_alpha && ISALPHA (ch))
2300 || (is_blank && ISBLANK (ch))
2301 || (is_cntrl && ISCNTRL (ch)))
2302 SET_LIST_BIT (translated);
2303 if ( (is_digit && ISDIGIT (ch))
2304 || (is_graph && ISGRAPH (ch))
2305 || (is_lower && ISLOWER (ch))
2306 || (is_print && ISPRINT (ch)))
2307 SET_LIST_BIT (translated);
2308 if ( (is_punct && ISPUNCT (ch))
2309 || (is_space && ISSPACE (ch))
2310 || (is_upper && ISUPPER (ch))
2311 || (is_xdigit && ISXDIGIT (ch)))
2312 SET_LIST_BIT (translated);
2315 /* Repeat the loop. */
2316 continue;
2318 else
2320 c1++;
2321 while (c1--)
2322 PATUNFETCH;
2323 SET_LIST_BIT ('[');
2325 /* Because the `:' may starts the range, we
2326 can't simply set bit and repeat the loop.
2327 Instead, just set it to C and handle below. */
2328 c = ':';
2332 if (p < pend && p[0] == '-' && p[1] != ']')
2335 /* Discard the `-'. */
2336 PATFETCH (c1);
2338 /* Fetch the character which ends the range. */
2339 PATFETCH (c1);
2340 if (bufp->multibyte && BASE_LEADING_CODE_P (c1))
2342 PATUNFETCH;
2343 c1 = STRING_CHAR_AND_LENGTH (p, pend - p, len);
2344 p += len;
2347 if (SINGLE_BYTE_CHAR_P (c)
2348 && ! SINGLE_BYTE_CHAR_P (c1))
2350 /* Handle a range such as \177-\377 in multibyte mode.
2351 Split that into two ranges,,
2352 the low one ending at 0237, and the high one
2353 starting at ...040. */
2354 int c1_base = (c1 & ~0177) | 040;
2355 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2356 c1 = 0237;
2358 else if (!SAME_CHARSET_P (c, c1))
2359 FREE_STACK_RETURN (REG_ERANGE);
2361 else
2362 /* Range from C to C. */
2363 c1 = c;
2365 /* Set the range ... */
2366 if (SINGLE_BYTE_CHAR_P (c))
2367 /* ... into bitmap. */
2369 unsigned this_char;
2370 int range_start = c, range_end = c1;
2372 /* If the start is after the end, the range is empty. */
2373 if (range_start > range_end)
2375 if (syntax & RE_NO_EMPTY_RANGES)
2376 FREE_STACK_RETURN (REG_ERANGE);
2377 /* Else, repeat the loop. */
2379 else
2381 for (this_char = range_start; this_char <= range_end;
2382 this_char++)
2383 SET_LIST_BIT (TRANSLATE (this_char));
2386 else
2387 /* ... into range table. */
2388 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2391 /* Discard any (non)matching list bytes that are all 0 at the
2392 end of the map. Decrease the map-length byte too. */
2393 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2394 b[-1]--;
2395 b += b[-1];
2397 /* Build real range table from work area. */
2398 if (RANGE_TABLE_WORK_USED (range_table_work))
2400 int i;
2401 int used = RANGE_TABLE_WORK_USED (range_table_work);
2403 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2404 bytes for COUNT and three bytes for each character. */
2405 GET_BUFFER_SPACE (2 + used * 3);
2407 /* Indicate the existence of range table. */
2408 laststart[1] |= 0x80;
2410 STORE_NUMBER_AND_INCR (b, used / 2);
2411 for (i = 0; i < used; i++)
2412 STORE_CHARACTER_AND_INCR
2413 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2416 break;
2419 case '(':
2420 if (syntax & RE_NO_BK_PARENS)
2421 goto handle_open;
2422 else
2423 goto normal_char;
2426 case ')':
2427 if (syntax & RE_NO_BK_PARENS)
2428 goto handle_close;
2429 else
2430 goto normal_char;
2433 case '\n':
2434 if (syntax & RE_NEWLINE_ALT)
2435 goto handle_alt;
2436 else
2437 goto normal_char;
2440 case '|':
2441 if (syntax & RE_NO_BK_VBAR)
2442 goto handle_alt;
2443 else
2444 goto normal_char;
2447 case '{':
2448 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2449 goto handle_interval;
2450 else
2451 goto normal_char;
2454 case '\\':
2455 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2457 /* Do not translate the character after the \, so that we can
2458 distinguish, e.g., \B from \b, even if we normally would
2459 translate, e.g., B to b. */
2460 PATFETCH_RAW (c);
2462 switch (c)
2464 case '(':
2465 if (syntax & RE_NO_BK_PARENS)
2466 goto normal_backslash;
2468 handle_open:
2469 bufp->re_nsub++;
2470 regnum++;
2472 if (COMPILE_STACK_FULL)
2474 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2475 compile_stack_elt_t);
2476 if (compile_stack.stack == NULL) return REG_ESPACE;
2478 compile_stack.size <<= 1;
2481 /* These are the values to restore when we hit end of this
2482 group. They are all relative offsets, so that if the
2483 whole pattern moves because of realloc, they will still
2484 be valid. */
2485 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2486 COMPILE_STACK_TOP.fixup_alt_jump
2487 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2488 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2489 COMPILE_STACK_TOP.regnum = regnum;
2491 /* We will eventually replace the 0 with the number of
2492 groups inner to this one. But do not push a
2493 start_memory for groups beyond the last one we can
2494 represent in the compiled pattern. */
2495 if (regnum <= MAX_REGNUM)
2497 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2498 BUF_PUSH_3 (start_memory, regnum, 0);
2501 compile_stack.avail++;
2503 fixup_alt_jump = 0;
2504 laststart = 0;
2505 begalt = b;
2506 /* If we've reached MAX_REGNUM groups, then this open
2507 won't actually generate any code, so we'll have to
2508 clear pending_exact explicitly. */
2509 pending_exact = 0;
2510 break;
2513 case ')':
2514 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2516 if (COMPILE_STACK_EMPTY)
2517 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2518 goto normal_backslash;
2519 else
2520 FREE_STACK_RETURN (REG_ERPAREN);
2522 handle_close:
2523 if (fixup_alt_jump)
2524 { /* Push a dummy failure point at the end of the
2525 alternative for a possible future
2526 `pop_failure_jump' to pop. See comments at
2527 `push_dummy_failure' in `re_match_2'. */
2528 BUF_PUSH (push_dummy_failure);
2530 /* We allocated space for this jump when we assigned
2531 to `fixup_alt_jump', in the `handle_alt' case below. */
2532 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2535 /* See similar code for backslashed left paren above. */
2536 if (COMPILE_STACK_EMPTY)
2537 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2538 goto normal_char;
2539 else
2540 FREE_STACK_RETURN (REG_ERPAREN);
2542 /* Since we just checked for an empty stack above, this
2543 ``can't happen''. */
2544 assert (compile_stack.avail != 0);
2546 /* We don't just want to restore into `regnum', because
2547 later groups should continue to be numbered higher,
2548 as in `(ab)c(de)' -- the second group is #2. */
2549 regnum_t this_group_regnum;
2551 compile_stack.avail--;
2552 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2553 fixup_alt_jump
2554 = COMPILE_STACK_TOP.fixup_alt_jump
2555 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2556 : 0;
2557 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2558 this_group_regnum = COMPILE_STACK_TOP.regnum;
2559 /* If we've reached MAX_REGNUM groups, then this open
2560 won't actually generate any code, so we'll have to
2561 clear pending_exact explicitly. */
2562 pending_exact = 0;
2564 /* We're at the end of the group, so now we know how many
2565 groups were inside this one. */
2566 if (this_group_regnum <= MAX_REGNUM)
2568 unsigned char *inner_group_loc
2569 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2571 *inner_group_loc = regnum - this_group_regnum;
2572 BUF_PUSH_3 (stop_memory, this_group_regnum,
2573 regnum - this_group_regnum);
2576 break;
2579 case '|': /* `\|'. */
2580 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2581 goto normal_backslash;
2582 handle_alt:
2583 if (syntax & RE_LIMITED_OPS)
2584 goto normal_char;
2586 /* Insert before the previous alternative a jump which
2587 jumps to this alternative if the former fails. */
2588 GET_BUFFER_SPACE (3);
2589 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2590 pending_exact = 0;
2591 b += 3;
2593 /* The alternative before this one has a jump after it
2594 which gets executed if it gets matched. Adjust that
2595 jump so it will jump to this alternative's analogous
2596 jump (put in below, which in turn will jump to the next
2597 (if any) alternative's such jump, etc.). The last such
2598 jump jumps to the correct final destination. A picture:
2599 _____ _____
2600 | | | |
2601 | v | v
2602 a | b | c
2604 If we are at `b', then fixup_alt_jump right now points to a
2605 three-byte space after `a'. We'll put in the jump, set
2606 fixup_alt_jump to right after `b', and leave behind three
2607 bytes which we'll fill in when we get to after `c'. */
2609 if (fixup_alt_jump)
2610 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2612 /* Mark and leave space for a jump after this alternative,
2613 to be filled in later either by next alternative or
2614 when know we're at the end of a series of alternatives. */
2615 fixup_alt_jump = b;
2616 GET_BUFFER_SPACE (3);
2617 b += 3;
2619 laststart = 0;
2620 begalt = b;
2621 break;
2624 case '{':
2625 /* If \{ is a literal. */
2626 if (!(syntax & RE_INTERVALS)
2627 /* If we're at `\{' and it's not the open-interval
2628 operator. */
2629 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2630 || (p - 2 == pattern && p == pend))
2631 goto normal_backslash;
2633 handle_interval:
2635 /* If got here, then the syntax allows intervals. */
2637 /* At least (most) this many matches must be made. */
2638 int lower_bound = -1, upper_bound = -1;
2640 beg_interval = p - 1;
2642 if (p == pend)
2644 if (syntax & RE_NO_BK_BRACES)
2645 goto unfetch_interval;
2646 else
2647 FREE_STACK_RETURN (REG_EBRACE);
2650 GET_UNSIGNED_NUMBER (lower_bound);
2652 if (c == ',')
2654 GET_UNSIGNED_NUMBER (upper_bound);
2655 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2657 else
2658 /* Interval such as `{1}' => match exactly once. */
2659 upper_bound = lower_bound;
2661 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2662 || lower_bound > upper_bound)
2664 if (syntax & RE_NO_BK_BRACES)
2665 goto unfetch_interval;
2666 else
2667 FREE_STACK_RETURN (REG_BADBR);
2670 if (!(syntax & RE_NO_BK_BRACES))
2672 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2674 PATFETCH (c);
2677 if (c != '}')
2679 if (syntax & RE_NO_BK_BRACES)
2680 goto unfetch_interval;
2681 else
2682 FREE_STACK_RETURN (REG_BADBR);
2685 /* We just parsed a valid interval. */
2687 /* If it's invalid to have no preceding re. */
2688 if (!laststart)
2690 if (syntax & RE_CONTEXT_INVALID_OPS)
2691 FREE_STACK_RETURN (REG_BADRPT);
2692 else if (syntax & RE_CONTEXT_INDEP_OPS)
2693 laststart = b;
2694 else
2695 goto unfetch_interval;
2698 /* If the upper bound is zero, don't want to succeed at
2699 all; jump from `laststart' to `b + 3', which will be
2700 the end of the buffer after we insert the jump. */
2701 if (upper_bound == 0)
2703 GET_BUFFER_SPACE (3);
2704 INSERT_JUMP (jump, laststart, b + 3);
2705 b += 3;
2708 /* Otherwise, we have a nontrivial interval. When
2709 we're all done, the pattern will look like:
2710 set_number_at <jump count> <upper bound>
2711 set_number_at <succeed_n count> <lower bound>
2712 succeed_n <after jump addr> <succeed_n count>
2713 <body of loop>
2714 jump_n <succeed_n addr> <jump count>
2715 (The upper bound and `jump_n' are omitted if
2716 `upper_bound' is 1, though.) */
2717 else
2718 { /* If the upper bound is > 1, we need to insert
2719 more at the end of the loop. */
2720 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2722 GET_BUFFER_SPACE (nbytes);
2724 /* Initialize lower bound of the `succeed_n', even
2725 though it will be set during matching by its
2726 attendant `set_number_at' (inserted next),
2727 because `re_compile_fastmap' needs to know.
2728 Jump to the `jump_n' we might insert below. */
2729 INSERT_JUMP2 (succeed_n, laststart,
2730 b + 5 + (upper_bound > 1) * 5,
2731 lower_bound);
2732 b += 5;
2734 /* Code to initialize the lower bound. Insert
2735 before the `succeed_n'. The `5' is the last two
2736 bytes of this `set_number_at', plus 3 bytes of
2737 the following `succeed_n'. */
2738 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2739 b += 5;
2741 if (upper_bound > 1)
2742 { /* More than one repetition is allowed, so
2743 append a backward jump to the `succeed_n'
2744 that starts this interval.
2746 When we've reached this during matching,
2747 we'll have matched the interval once, so
2748 jump back only `upper_bound - 1' times. */
2749 STORE_JUMP2 (jump_n, b, laststart + 5,
2750 upper_bound - 1);
2751 b += 5;
2753 /* The location we want to set is the second
2754 parameter of the `jump_n'; that is `b-2' as
2755 an absolute address. `laststart' will be
2756 the `set_number_at' we're about to insert;
2757 `laststart+3' the number to set, the source
2758 for the relative address. But we are
2759 inserting into the middle of the pattern --
2760 so everything is getting moved up by 5.
2761 Conclusion: (b - 2) - (laststart + 3) + 5,
2762 i.e., b - laststart.
2764 We insert this at the beginning of the loop
2765 so that if we fail during matching, we'll
2766 reinitialize the bounds. */
2767 insert_op2 (set_number_at, laststart, b - laststart,
2768 upper_bound - 1, b);
2769 b += 5;
2772 pending_exact = 0;
2773 beg_interval = NULL;
2775 break;
2777 unfetch_interval:
2778 /* If an invalid interval, match the characters as literals. */
2779 assert (beg_interval);
2780 p = beg_interval;
2781 beg_interval = NULL;
2783 /* normal_char and normal_backslash need `c'. */
2784 PATFETCH (c);
2786 if (!(syntax & RE_NO_BK_BRACES))
2788 if (p > pattern && p[-1] == '\\')
2789 goto normal_backslash;
2791 goto normal_char;
2793 #ifdef emacs
2794 /* There is no way to specify the before_dot and after_dot
2795 operators. rms says this is ok. --karl */
2796 case '=':
2797 BUF_PUSH (at_dot);
2798 break;
2800 case 's':
2801 laststart = b;
2802 PATFETCH (c);
2803 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2804 break;
2806 case 'S':
2807 laststart = b;
2808 PATFETCH (c);
2809 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2810 break;
2812 case 'c':
2813 laststart = b;
2814 PATFETCH_RAW (c);
2815 BUF_PUSH_2 (categoryspec, c);
2816 break;
2818 case 'C':
2819 laststart = b;
2820 PATFETCH_RAW (c);
2821 BUF_PUSH_2 (notcategoryspec, c);
2822 break;
2823 #endif /* emacs */
2826 case 'w':
2827 laststart = b;
2828 BUF_PUSH (wordchar);
2829 break;
2832 case 'W':
2833 laststart = b;
2834 BUF_PUSH (notwordchar);
2835 break;
2838 case '<':
2839 BUF_PUSH (wordbeg);
2840 break;
2842 case '>':
2843 BUF_PUSH (wordend);
2844 break;
2846 case 'b':
2847 BUF_PUSH (wordbound);
2848 break;
2850 case 'B':
2851 BUF_PUSH (notwordbound);
2852 break;
2854 case '`':
2855 BUF_PUSH (begbuf);
2856 break;
2858 case '\'':
2859 BUF_PUSH (endbuf);
2860 break;
2862 case '1': case '2': case '3': case '4': case '5':
2863 case '6': case '7': case '8': case '9':
2864 if (syntax & RE_NO_BK_REFS)
2865 goto normal_char;
2867 c1 = c - '0';
2869 if (c1 > regnum)
2870 FREE_STACK_RETURN (REG_ESUBREG);
2872 /* Can't back reference to a subexpression if inside of it. */
2873 if (group_in_compile_stack (compile_stack, c1))
2874 goto normal_char;
2876 laststart = b;
2877 BUF_PUSH_2 (duplicate, c1);
2878 break;
2881 case '+':
2882 case '?':
2883 if (syntax & RE_BK_PLUS_QM)
2884 goto handle_plus;
2885 else
2886 goto normal_backslash;
2888 default:
2889 normal_backslash:
2890 /* You might think it would be useful for \ to mean
2891 not to translate; but if we don't translate it
2892 it will never match anything. */
2893 c = TRANSLATE (c);
2894 goto normal_char;
2896 break;
2899 default:
2900 /* Expects the character in `c'. */
2901 normal_char:
2902 p1 = p - 1; /* P1 points the head of C. */
2903 #ifdef emacs
2904 if (bufp->multibyte)
2906 c = STRING_CHAR (p1, pend - p1);
2907 c = TRANSLATE (c);
2908 /* Set P to the next character boundary. */
2909 p += MULTIBYTE_FORM_LENGTH (p1, pend - p1) - 1;
2911 #endif
2912 /* If no exactn currently being built. */
2913 if (!pending_exact
2915 /* If last exactn not at current position. */
2916 || pending_exact + *pending_exact + 1 != b
2918 /* We have only one byte following the exactn for the count. */
2919 || *pending_exact >= (1 << BYTEWIDTH) - (p - p1)
2921 /* If followed by a repetition operator. */
2922 || (p != pend && (*p == '*' || *p == '^'))
2923 || ((syntax & RE_BK_PLUS_QM)
2924 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
2925 : p != pend && (*p == '+' || *p == '?'))
2926 || ((syntax & RE_INTERVALS)
2927 && ((syntax & RE_NO_BK_BRACES)
2928 ? p != pend && *p == '{'
2929 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
2931 /* Start building a new exactn. */
2933 laststart = b;
2935 BUF_PUSH_2 (exactn, 0);
2936 pending_exact = b - 1;
2939 #ifdef emacs
2940 if (! SINGLE_BYTE_CHAR_P (c))
2942 unsigned char work[4], *str;
2943 int i = CHAR_STRING (c, work, str);
2944 int j;
2945 for (j = 0; j < i; j++)
2947 BUF_PUSH (str[j]);
2948 (*pending_exact)++;
2951 else
2952 #endif
2954 BUF_PUSH (c);
2955 (*pending_exact)++;
2957 break;
2958 } /* switch (c) */
2959 } /* while p != pend */
2962 /* Through the pattern now. */
2964 if (fixup_alt_jump)
2965 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2967 if (!COMPILE_STACK_EMPTY)
2968 FREE_STACK_RETURN (REG_EPAREN);
2970 /* If we don't want backtracking, force success
2971 the first time we reach the end of the compiled pattern. */
2972 if (syntax & RE_NO_POSIX_BACKTRACKING)
2973 BUF_PUSH (succeed);
2975 free (compile_stack.stack);
2977 /* We have succeeded; set the length of the buffer. */
2978 bufp->used = b - bufp->buffer;
2980 #ifdef DEBUG
2981 if (debug)
2983 DEBUG_PRINT1 ("\nCompiled pattern: \n");
2984 print_compiled_pattern (bufp);
2986 #endif /* DEBUG */
2988 #ifndef MATCH_MAY_ALLOCATE
2989 /* Initialize the failure stack to the largest possible stack. This
2990 isn't necessary unless we're trying to avoid calling alloca in
2991 the search and match routines. */
2993 int num_regs = bufp->re_nsub + 1;
2995 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
2997 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
2999 #ifdef emacs
3000 if (! fail_stack.stack)
3001 fail_stack.stack
3002 = (fail_stack_elt_t *) xmalloc (fail_stack.size
3003 * sizeof (fail_stack_elt_t));
3004 else
3005 fail_stack.stack
3006 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
3007 (fail_stack.size
3008 * sizeof (fail_stack_elt_t)));
3009 #else /* not emacs */
3010 if (! fail_stack.stack)
3011 fail_stack.stack
3012 = (fail_stack_elt_t *) malloc (fail_stack.size
3013 * sizeof (fail_stack_elt_t));
3014 else
3015 fail_stack.stack
3016 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3017 (fail_stack.size
3018 * sizeof (fail_stack_elt_t)));
3019 #endif /* not emacs */
3022 regex_grow_registers (num_regs);
3024 #endif /* not MATCH_MAY_ALLOCATE */
3026 return REG_NOERROR;
3027 } /* regex_compile */
3029 /* Subroutines for `regex_compile'. */
3031 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3033 static void
3034 store_op1 (op, loc, arg)
3035 re_opcode_t op;
3036 unsigned char *loc;
3037 int arg;
3039 *loc = (unsigned char) op;
3040 STORE_NUMBER (loc + 1, arg);
3044 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3046 static void
3047 store_op2 (op, loc, arg1, arg2)
3048 re_opcode_t op;
3049 unsigned char *loc;
3050 int arg1, arg2;
3052 *loc = (unsigned char) op;
3053 STORE_NUMBER (loc + 1, arg1);
3054 STORE_NUMBER (loc + 3, arg2);
3058 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3059 for OP followed by two-byte integer parameter ARG. */
3061 static void
3062 insert_op1 (op, loc, arg, end)
3063 re_opcode_t op;
3064 unsigned char *loc;
3065 int arg;
3066 unsigned char *end;
3068 register unsigned char *pfrom = end;
3069 register unsigned char *pto = end + 3;
3071 while (pfrom != loc)
3072 *--pto = *--pfrom;
3074 store_op1 (op, loc, arg);
3078 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3080 static void
3081 insert_op2 (op, loc, arg1, arg2, end)
3082 re_opcode_t op;
3083 unsigned char *loc;
3084 int arg1, arg2;
3085 unsigned char *end;
3087 register unsigned char *pfrom = end;
3088 register unsigned char *pto = end + 5;
3090 while (pfrom != loc)
3091 *--pto = *--pfrom;
3093 store_op2 (op, loc, arg1, arg2);
3097 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3098 after an alternative or a begin-subexpression. We assume there is at
3099 least one character before the ^. */
3101 static boolean
3102 at_begline_loc_p (pattern, p, syntax)
3103 const char *pattern, *p;
3104 reg_syntax_t syntax;
3106 const char *prev = p - 2;
3107 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3109 return
3110 /* After a subexpression? */
3111 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3112 /* After an alternative? */
3113 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3117 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3118 at least one character after the $, i.e., `P < PEND'. */
3120 static boolean
3121 at_endline_loc_p (p, pend, syntax)
3122 const char *p, *pend;
3123 int syntax;
3125 const char *next = p;
3126 boolean next_backslash = *next == '\\';
3127 const char *next_next = p + 1 < pend ? p + 1 : 0;
3129 return
3130 /* Before a subexpression? */
3131 (syntax & RE_NO_BK_PARENS ? *next == ')'
3132 : next_backslash && next_next && *next_next == ')')
3133 /* Before an alternative? */
3134 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3135 : next_backslash && next_next && *next_next == '|');
3139 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3140 false if it's not. */
3142 static boolean
3143 group_in_compile_stack (compile_stack, regnum)
3144 compile_stack_type compile_stack;
3145 regnum_t regnum;
3147 int this_element;
3149 for (this_element = compile_stack.avail - 1;
3150 this_element >= 0;
3151 this_element--)
3152 if (compile_stack.stack[this_element].regnum == regnum)
3153 return true;
3155 return false;
3158 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3159 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3160 characters can start a string that matches the pattern. This fastmap
3161 is used by re_search to skip quickly over impossible starting points.
3163 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3164 area as BUFP->fastmap.
3166 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3167 the pattern buffer.
3169 Returns 0 if we succeed, -2 if an internal error. */
3172 re_compile_fastmap (bufp)
3173 struct re_pattern_buffer *bufp;
3175 int i, j, k;
3176 #ifdef MATCH_MAY_ALLOCATE
3177 fail_stack_type fail_stack;
3178 #endif
3179 #ifndef REGEX_MALLOC
3180 char *destination;
3181 #endif
3182 /* We don't push any register information onto the failure stack. */
3183 unsigned num_regs = 0;
3185 register char *fastmap = bufp->fastmap;
3186 unsigned char *pattern = bufp->buffer;
3187 unsigned long size = bufp->used;
3188 unsigned char *p = pattern;
3189 register unsigned char *pend = pattern + size;
3191 /* This holds the pointer to the failure stack, when
3192 it is allocated relocatably. */
3193 fail_stack_elt_t *failure_stack_ptr;
3195 /* Assume that each path through the pattern can be null until
3196 proven otherwise. We set this false at the bottom of switch
3197 statement, to which we get only if a particular path doesn't
3198 match the empty string. */
3199 boolean path_can_be_null = true;
3201 /* We aren't doing a `succeed_n' to begin with. */
3202 boolean succeed_n_p = false;
3204 /* If all elements for base leading-codes in fastmap is set, this
3205 flag is set true. */
3206 boolean match_any_multibyte_characters = false;
3208 /* Maximum code of simple (single byte) character. */
3209 int simple_char_max;
3211 assert (fastmap != NULL && p != NULL);
3213 INIT_FAIL_STACK ();
3214 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3215 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3216 bufp->can_be_null = 0;
3218 while (1)
3220 if (p == pend || *p == succeed)
3222 /* We have reached the (effective) end of pattern. */
3223 if (!FAIL_STACK_EMPTY ())
3225 bufp->can_be_null |= path_can_be_null;
3227 /* Reset for next path. */
3228 path_can_be_null = true;
3230 p = fail_stack.stack[--fail_stack.avail].pointer;
3232 continue;
3234 else
3235 break;
3238 /* We should never be about to go beyond the end of the pattern. */
3239 assert (p < pend);
3241 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3244 /* I guess the idea here is to simply not bother with a fastmap
3245 if a backreference is used, since it's too hard to figure out
3246 the fastmap for the corresponding group. Setting
3247 `can_be_null' stops `re_search_2' from using the fastmap, so
3248 that is all we do. */
3249 case duplicate:
3250 bufp->can_be_null = 1;
3251 goto done;
3254 /* Following are the cases which match a character. These end
3255 with `break'. */
3257 case exactn:
3258 fastmap[p[1]] = 1;
3259 break;
3262 #ifndef emacs
3263 case charset:
3264 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3265 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3266 fastmap[j] = 1;
3267 break;
3270 case charset_not:
3271 /* Chars beyond end of map must be allowed. */
3272 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3273 fastmap[j] = 1;
3275 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3276 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3277 fastmap[j] = 1;
3278 break;
3281 case wordchar:
3282 for (j = 0; j < (1 << BYTEWIDTH); j++)
3283 if (SYNTAX (j) == Sword)
3284 fastmap[j] = 1;
3285 break;
3288 case notwordchar:
3289 for (j = 0; j < (1 << BYTEWIDTH); j++)
3290 if (SYNTAX (j) != Sword)
3291 fastmap[j] = 1;
3292 break;
3293 #else /* emacs */
3294 case charset:
3295 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3296 j >= 0; j--)
3297 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3298 fastmap[j] = 1;
3300 if (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3301 && match_any_multibyte_characters == false)
3303 /* Set fastmap[I] 1 where I is a base leading code of each
3304 multibyte character in the range table. */
3305 int c, count;
3307 /* Make P points the range table. */
3308 p += CHARSET_BITMAP_SIZE (&p[-2]);
3310 /* Extract the number of ranges in range table into
3311 COUNT. */
3312 EXTRACT_NUMBER_AND_INCR (count, p);
3313 for (; count > 0; count--, p += 2 * 3) /* XXX */
3315 /* Extract the start of each range. */
3316 EXTRACT_CHARACTER (c, p);
3317 j = CHAR_CHARSET (c);
3318 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3321 break;
3324 case charset_not:
3325 /* Chars beyond end of bitmap are possible matches.
3326 All the single-byte codes can occur in multibyte buffers.
3327 So any that are not listed in the charset
3328 are possible matches, even in multibyte buffers. */
3329 simple_char_max = (1 << BYTEWIDTH);
3330 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3331 j < simple_char_max; j++)
3332 fastmap[j] = 1;
3334 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3335 j >= 0; j--)
3336 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3337 fastmap[j] = 1;
3339 if (bufp->multibyte)
3340 /* Any character set can possibly contain a character
3341 which doesn't match the specified set of characters. */
3343 set_fastmap_for_multibyte_characters:
3344 if (match_any_multibyte_characters == false)
3346 for (j = 0x80; j < 0xA0; j++) /* XXX */
3347 if (BASE_LEADING_CODE_P (j))
3348 fastmap[j] = 1;
3349 match_any_multibyte_characters = true;
3352 break;
3355 case wordchar:
3356 /* All the single-byte codes can occur in multibyte buffers,
3357 and they may have word syntax. So do consider them. */
3358 simple_char_max = (1 << BYTEWIDTH);
3359 for (j = 0; j < simple_char_max; j++)
3360 if (SYNTAX (j) == Sword)
3361 fastmap[j] = 1;
3363 if (bufp->multibyte)
3364 /* Any character set can possibly contain a character
3365 whose syntax is `Sword'. */
3366 goto set_fastmap_for_multibyte_characters;
3367 break;
3370 case notwordchar:
3371 /* All the single-byte codes can occur in multibyte buffers,
3372 and they may not have word syntax. So do consider them. */
3373 simple_char_max = (1 << BYTEWIDTH);
3374 for (j = 0; j < simple_char_max; j++)
3375 if (SYNTAX (j) != Sword)
3376 fastmap[j] = 1;
3378 if (bufp->multibyte)
3379 /* Any character set can possibly contain a character
3380 whose syntax is not `Sword'. */
3381 goto set_fastmap_for_multibyte_characters;
3382 break;
3383 #endif
3385 case anychar:
3387 int fastmap_newline = fastmap['\n'];
3389 /* `.' matches anything, except perhaps newline.
3390 Even in a multibyte buffer, it should match any
3391 conceivable byte value for the fastmap. */
3392 if (bufp->multibyte)
3393 match_any_multibyte_characters = true;
3395 simple_char_max = (1 << BYTEWIDTH);
3396 for (j = 0; j < simple_char_max; j++)
3397 fastmap[j] = 1;
3399 /* ... except perhaps newline. */
3400 if (!(bufp->syntax & RE_DOT_NEWLINE))
3401 fastmap['\n'] = fastmap_newline;
3403 /* Return if we have already set `can_be_null'; if we have,
3404 then the fastmap is irrelevant. Something's wrong here. */
3405 else if (bufp->can_be_null)
3406 goto done;
3408 /* Otherwise, have to check alternative paths. */
3409 break;
3412 #ifdef emacs
3413 case wordbound:
3414 case notwordbound:
3415 case wordbeg:
3416 case wordend:
3417 case notsyntaxspec:
3418 case syntaxspec:
3419 /* This match depends on text properties. These end with
3420 aborting optimizations. */
3421 bufp->can_be_null = 1;
3422 goto done;
3423 #if 0
3424 k = *p++;
3425 simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
3426 for (j = 0; j < simple_char_max; j++)
3427 if (SYNTAX (j) == (enum syntaxcode) k)
3428 fastmap[j] = 1;
3430 if (bufp->multibyte)
3431 /* Any character set can possibly contain a character
3432 whose syntax is K. */
3433 goto set_fastmap_for_multibyte_characters;
3434 break;
3436 case notsyntaxspec:
3437 k = *p++;
3438 simple_char_max = bufp->multibyte ? 0x80 : (1 << BYTEWIDTH);
3439 for (j = 0; j < simple_char_max; j++)
3440 if (SYNTAX (j) != (enum syntaxcode) k)
3441 fastmap[j] = 1;
3443 if (bufp->multibyte)
3444 /* Any character set can possibly contain a character
3445 whose syntax is not K. */
3446 goto set_fastmap_for_multibyte_characters;
3447 break;
3448 #endif
3451 case categoryspec:
3452 k = *p++;
3453 simple_char_max = (1 << BYTEWIDTH);
3454 for (j = 0; j < simple_char_max; j++)
3455 if (CHAR_HAS_CATEGORY (j, k))
3456 fastmap[j] = 1;
3458 if (bufp->multibyte)
3459 /* Any character set can possibly contain a character
3460 whose category is K. */
3461 goto set_fastmap_for_multibyte_characters;
3462 break;
3465 case notcategoryspec:
3466 k = *p++;
3467 simple_char_max = (1 << BYTEWIDTH);
3468 for (j = 0; j < simple_char_max; j++)
3469 if (!CHAR_HAS_CATEGORY (j, k))
3470 fastmap[j] = 1;
3472 if (bufp->multibyte)
3473 /* Any character set can possibly contain a character
3474 whose category is not K. */
3475 goto set_fastmap_for_multibyte_characters;
3476 break;
3478 /* All cases after this match the empty string. These end with
3479 `continue'. */
3482 case before_dot:
3483 case at_dot:
3484 case after_dot:
3485 continue;
3486 #endif /* emacs */
3489 case no_op:
3490 case begline:
3491 case endline:
3492 case begbuf:
3493 case endbuf:
3494 #ifndef emacs
3495 case wordbound:
3496 case notwordbound:
3497 case wordbeg:
3498 case wordend:
3499 #endif
3500 case push_dummy_failure:
3501 continue;
3504 case jump_n:
3505 case pop_failure_jump:
3506 case maybe_pop_jump:
3507 case jump:
3508 case jump_past_alt:
3509 case dummy_failure_jump:
3510 EXTRACT_NUMBER_AND_INCR (j, p);
3511 p += j;
3512 if (j > 0)
3513 continue;
3515 /* Jump backward implies we just went through the body of a
3516 loop and matched nothing. Opcode jumped to should be
3517 `on_failure_jump' or `succeed_n'. Just treat it like an
3518 ordinary jump. For a * loop, it has pushed its failure
3519 point already; if so, discard that as redundant. */
3520 if ((re_opcode_t) *p != on_failure_jump
3521 && (re_opcode_t) *p != succeed_n)
3522 continue;
3524 p++;
3525 EXTRACT_NUMBER_AND_INCR (j, p);
3526 p += j;
3528 /* If what's on the stack is where we are now, pop it. */
3529 if (!FAIL_STACK_EMPTY ()
3530 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3531 fail_stack.avail--;
3533 continue;
3536 case on_failure_jump:
3537 case on_failure_keep_string_jump:
3538 handle_on_failure_jump:
3539 EXTRACT_NUMBER_AND_INCR (j, p);
3541 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3542 end of the pattern. We don't want to push such a point,
3543 since when we restore it above, entering the switch will
3544 increment `p' past the end of the pattern. We don't need
3545 to push such a point since we obviously won't find any more
3546 fastmap entries beyond `pend'. Such a pattern can match
3547 the null string, though. */
3548 if (p + j < pend)
3550 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3552 RESET_FAIL_STACK ();
3553 return -2;
3556 else
3557 bufp->can_be_null = 1;
3559 if (succeed_n_p)
3561 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3562 succeed_n_p = false;
3565 continue;
3568 case succeed_n:
3569 /* Get to the number of times to succeed. */
3570 p += 2;
3572 /* Increment p past the n for when k != 0. */
3573 EXTRACT_NUMBER_AND_INCR (k, p);
3574 if (k == 0)
3576 p -= 4;
3577 succeed_n_p = true; /* Spaghetti code alert. */
3578 goto handle_on_failure_jump;
3580 continue;
3583 case set_number_at:
3584 p += 4;
3585 continue;
3588 case start_memory:
3589 case stop_memory:
3590 p += 2;
3591 continue;
3594 default:
3595 abort (); /* We have listed all the cases. */
3596 } /* switch *p++ */
3598 /* Getting here means we have found the possible starting
3599 characters for one path of the pattern -- and that the empty
3600 string does not match. We need not follow this path further.
3601 Instead, look at the next alternative (remembered on the
3602 stack), or quit if no more. The test at the top of the loop
3603 does these things. */
3604 path_can_be_null = false;
3605 p = pend;
3606 } /* while p */
3608 /* Set `can_be_null' for the last path (also the first path, if the
3609 pattern is empty). */
3610 bufp->can_be_null |= path_can_be_null;
3612 done:
3613 RESET_FAIL_STACK ();
3614 return 0;
3615 } /* re_compile_fastmap */
3617 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3618 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3619 this memory for recording register information. STARTS and ENDS
3620 must be allocated using the malloc library routine, and must each
3621 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3623 If NUM_REGS == 0, then subsequent matches should allocate their own
3624 register data.
3626 Unless this function is called, the first search or match using
3627 PATTERN_BUFFER will allocate its own register data, without
3628 freeing the old data. */
3630 void
3631 re_set_registers (bufp, regs, num_regs, starts, ends)
3632 struct re_pattern_buffer *bufp;
3633 struct re_registers *regs;
3634 unsigned num_regs;
3635 regoff_t *starts, *ends;
3637 if (num_regs)
3639 bufp->regs_allocated = REGS_REALLOCATE;
3640 regs->num_regs = num_regs;
3641 regs->start = starts;
3642 regs->end = ends;
3644 else
3646 bufp->regs_allocated = REGS_UNALLOCATED;
3647 regs->num_regs = 0;
3648 regs->start = regs->end = (regoff_t *) 0;
3652 /* Searching routines. */
3654 /* Like re_search_2, below, but only one string is specified, and
3655 doesn't let you say where to stop matching. */
3658 re_search (bufp, string, size, startpos, range, regs)
3659 struct re_pattern_buffer *bufp;
3660 const char *string;
3661 int size, startpos, range;
3662 struct re_registers *regs;
3664 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3665 regs, size);
3668 /* End address of virtual concatenation of string. */
3669 #define STOP_ADDR_VSTRING(P) \
3670 (((P) >= size1 ? string2 + size2 : string1 + size1))
3672 /* Address of POS in the concatenation of virtual string. */
3673 #define POS_ADDR_VSTRING(POS) \
3674 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3676 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3677 virtual concatenation of STRING1 and STRING2, starting first at index
3678 STARTPOS, then at STARTPOS + 1, and so on.
3680 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3682 RANGE is how far to scan while trying to match. RANGE = 0 means try
3683 only at STARTPOS; in general, the last start tried is STARTPOS +
3684 RANGE.
3686 In REGS, return the indices of the virtual concatenation of STRING1
3687 and STRING2 that matched the entire BUFP->buffer and its contained
3688 subexpressions.
3690 Do not consider matching one past the index STOP in the virtual
3691 concatenation of STRING1 and STRING2.
3693 We return either the position in the strings at which the match was
3694 found, -1 if no match, or -2 if error (such as failure
3695 stack overflow). */
3698 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3699 struct re_pattern_buffer *bufp;
3700 const char *string1, *string2;
3701 int size1, size2;
3702 int startpos;
3703 int range;
3704 struct re_registers *regs;
3705 int stop;
3707 int val;
3708 register char *fastmap = bufp->fastmap;
3709 register RE_TRANSLATE_TYPE translate = bufp->translate;
3710 int total_size = size1 + size2;
3711 int endpos = startpos + range;
3712 int anchored_start = 0;
3714 /* Nonzero if we have to concern multibyte character. */
3715 int multibyte = bufp->multibyte;
3717 /* Check for out-of-range STARTPOS. */
3718 if (startpos < 0 || startpos > total_size)
3719 return -1;
3721 /* Fix up RANGE if it might eventually take us outside
3722 the virtual concatenation of STRING1 and STRING2.
3723 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3724 if (endpos < 0)
3725 range = 0 - startpos;
3726 else if (endpos > total_size)
3727 range = total_size - startpos;
3729 /* If the search isn't to be a backwards one, don't waste time in a
3730 search for a pattern anchored at beginning of buffer. */
3731 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3733 if (startpos > 0)
3734 return -1;
3735 else
3736 range = 0;
3739 #ifdef emacs
3740 /* In a forward search for something that starts with \=.
3741 don't keep searching past point. */
3742 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3744 range = PT_BYTE - BEGV_BYTE - startpos;
3745 if (range < 0)
3746 return -1;
3748 #endif /* emacs */
3750 /* Update the fastmap now if not correct already. */
3751 if (fastmap && !bufp->fastmap_accurate)
3752 if (re_compile_fastmap (bufp) == -2)
3753 return -2;
3755 /* See whether the pattern is anchored. */
3756 if (bufp->buffer[0] == begline)
3757 anchored_start = 1;
3759 #ifdef emacs
3760 gl_state.object = re_match_object;
3762 int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
3763 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (startpos + adjpos);
3765 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3767 #endif
3769 /* Loop through the string, looking for a place to start matching. */
3770 for (;;)
3772 /* If the pattern is anchored,
3773 skip quickly past places we cannot match.
3774 We don't bother to treat startpos == 0 specially
3775 because that case doesn't repeat. */
3776 if (anchored_start && startpos > 0)
3778 if (! (bufp->newline_anchor
3779 && ((startpos <= size1 ? string1[startpos - 1]
3780 : string2[startpos - size1 - 1])
3781 == '\n')))
3782 goto advance;
3785 /* If a fastmap is supplied, skip quickly over characters that
3786 cannot be the start of a match. If the pattern can match the
3787 null string, however, we don't need to skip characters; we want
3788 the first null string. */
3789 if (fastmap && startpos < total_size && !bufp->can_be_null)
3791 register const char *d;
3792 register unsigned int buf_ch;
3794 d = POS_ADDR_VSTRING (startpos);
3796 if (range > 0) /* Searching forwards. */
3798 register int lim = 0;
3799 int irange = range;
3801 if (startpos < size1 && startpos + range >= size1)
3802 lim = range - (size1 - startpos);
3804 /* Written out as an if-else to avoid testing `translate'
3805 inside the loop. */
3806 if (RE_TRANSLATE_P (translate))
3808 if (multibyte)
3809 while (range > lim)
3811 int buf_charlen;
3813 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
3814 buf_charlen);
3816 buf_ch = RE_TRANSLATE (translate, buf_ch);
3817 if (buf_ch >= 0400
3818 || fastmap[buf_ch])
3819 break;
3821 range -= buf_charlen;
3822 d += buf_charlen;
3824 else
3825 while (range > lim
3826 && !fastmap[(unsigned char)
3827 RE_TRANSLATE (translate, (unsigned char) *d)])
3829 d++;
3830 range--;
3833 else
3834 while (range > lim && !fastmap[(unsigned char) *d])
3836 d++;
3837 range--;
3840 startpos += irange - range;
3842 else /* Searching backwards. */
3844 int room = (size1 == 0 || startpos >= size1
3845 ? size2 + size1 - startpos
3846 : size1 - startpos);
3848 buf_ch = STRING_CHAR (d, room);
3849 if (RE_TRANSLATE_P (translate))
3850 buf_ch = RE_TRANSLATE (translate, buf_ch);
3852 if (! (buf_ch >= 0400
3853 || fastmap[buf_ch]))
3854 goto advance;
3858 /* If can't match the null string, and that's all we have left, fail. */
3859 if (range >= 0 && startpos == total_size && fastmap
3860 && !bufp->can_be_null)
3861 return -1;
3863 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3864 startpos, regs, stop);
3865 #ifndef REGEX_MALLOC
3866 #ifdef C_ALLOCA
3867 alloca (0);
3868 #endif
3869 #endif
3871 if (val >= 0)
3872 return startpos;
3874 if (val == -2)
3875 return -2;
3877 advance:
3878 if (!range)
3879 break;
3880 else if (range > 0)
3882 /* Update STARTPOS to the next character boundary. */
3883 if (multibyte)
3885 const unsigned char *p
3886 = (const unsigned char *) POS_ADDR_VSTRING (startpos);
3887 const unsigned char *pend
3888 = (const unsigned char *) STOP_ADDR_VSTRING (startpos);
3889 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
3891 range -= len;
3892 if (range < 0)
3893 break;
3894 startpos += len;
3896 else
3898 range--;
3899 startpos++;
3902 else
3904 range++;
3905 startpos--;
3907 /* Update STARTPOS to the previous character boundary. */
3908 if (multibyte)
3910 const unsigned char *p
3911 = (const unsigned char *) POS_ADDR_VSTRING (startpos);
3912 int len = 0;
3914 /* Find the head of multibyte form. */
3915 while (!CHAR_HEAD_P (*p))
3916 p--, len++;
3918 /* Adjust it. */
3919 #if 0 /* XXX */
3920 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
3922 else
3923 #endif
3925 range += len;
3926 if (range > 0)
3927 break;
3929 startpos -= len;
3934 return -1;
3935 } /* re_search_2 */
3937 /* Declarations and macros for re_match_2. */
3939 static int bcmp_translate ();
3940 static boolean alt_match_null_string_p (),
3941 common_op_match_null_string_p (),
3942 group_match_null_string_p ();
3944 /* This converts PTR, a pointer into one of the search strings `string1'
3945 and `string2' into an offset from the beginning of that string. */
3946 #define POINTER_TO_OFFSET(ptr) \
3947 (FIRST_STRING_P (ptr) \
3948 ? ((regoff_t) ((ptr) - string1)) \
3949 : ((regoff_t) ((ptr) - string2 + size1)))
3951 /* Macros for dealing with the split strings in re_match_2. */
3953 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3955 /* Call before fetching a character with *d. This switches over to
3956 string2 if necessary. */
3957 #define PREFETCH() \
3958 while (d == dend) \
3960 /* End of string2 => fail. */ \
3961 if (dend == end_match_2) \
3962 goto fail; \
3963 /* End of string1 => advance to string2. */ \
3964 d = string2; \
3965 dend = end_match_2; \
3969 /* Test if at very beginning or at very end of the virtual concatenation
3970 of `string1' and `string2'. If only one string, it's `string2'. */
3971 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3972 #define AT_STRINGS_END(d) ((d) == end2)
3975 /* Test if D points to a character which is word-constituent. We have
3976 two special cases to check for: if past the end of string1, look at
3977 the first character in string2; and if before the beginning of
3978 string2, look at the last character in string1. */
3979 #define WORDCHAR_P(d) \
3980 (SYNTAX ((d) == end1 ? *string2 \
3981 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3982 == Sword)
3984 /* Disabled due to a compiler bug -- see comment at case wordbound */
3986 /* The comment at case wordbound is following one, but we don't use
3987 AT_WORD_BOUNDARY anymore to support multibyte form.
3989 The DEC Alpha C compiler 3.x generates incorrect code for the
3990 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
3991 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
3992 macro and introducing temporary variables works around the bug. */
3994 #if 0
3995 /* Test if the character before D and the one at D differ with respect
3996 to being word-constituent. */
3997 #define AT_WORD_BOUNDARY(d) \
3998 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3999 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4000 #endif
4002 /* Free everything we malloc. */
4003 #ifdef MATCH_MAY_ALLOCATE
4004 #define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4005 #define FREE_VARIABLES() \
4006 do { \
4007 REGEX_FREE_STACK (fail_stack.stack); \
4008 FREE_VAR (regstart); \
4009 FREE_VAR (regend); \
4010 FREE_VAR (old_regstart); \
4011 FREE_VAR (old_regend); \
4012 FREE_VAR (best_regstart); \
4013 FREE_VAR (best_regend); \
4014 FREE_VAR (reg_info); \
4015 FREE_VAR (reg_dummy); \
4016 FREE_VAR (reg_info_dummy); \
4017 } while (0)
4018 #else
4019 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4020 #endif /* not MATCH_MAY_ALLOCATE */
4022 /* These values must meet several constraints. They must not be valid
4023 register values; since we have a limit of 255 registers (because
4024 we use only one byte in the pattern for the register number), we can
4025 use numbers larger than 255. They must differ by 1, because of
4026 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4027 be larger than the value for the highest register, so we do not try
4028 to actually save any registers when none are active. */
4029 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4030 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4032 /* Matching routines. */
4034 #ifndef emacs /* Emacs never uses this. */
4035 /* re_match is like re_match_2 except it takes only a single string. */
4038 re_match (bufp, string, size, pos, regs)
4039 struct re_pattern_buffer *bufp;
4040 const char *string;
4041 int size, pos;
4042 struct re_registers *regs;
4044 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4045 pos, regs, size);
4046 #ifndef REGEX_MALLOC /* CVS */
4047 #ifdef C_ALLOCA /* CVS */
4048 alloca (0);
4049 #endif /* CVS */
4050 #endif /* CVS */
4051 return result;
4053 #endif /* not emacs */
4055 #ifdef emacs
4056 /* In Emacs, this is the string or buffer in which we
4057 are matching. It is used for looking up syntax properties. */
4058 Lisp_Object re_match_object;
4059 #endif
4061 /* re_match_2 matches the compiled pattern in BUFP against the
4062 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4063 and SIZE2, respectively). We start matching at POS, and stop
4064 matching at STOP.
4066 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4067 store offsets for the substring each group matched in REGS. See the
4068 documentation for exactly how many groups we fill.
4070 We return -1 if no match, -2 if an internal error (such as the
4071 failure stack overflowing). Otherwise, we return the length of the
4072 matched substring. */
4075 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4076 struct re_pattern_buffer *bufp;
4077 const char *string1, *string2;
4078 int size1, size2;
4079 int pos;
4080 struct re_registers *regs;
4081 int stop;
4083 int result;
4085 #ifdef emacs
4086 int charpos;
4087 int adjpos = NILP (re_match_object) || BUFFERP (re_match_object);
4088 gl_state.object = re_match_object;
4089 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos + adjpos);
4090 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4091 #endif
4093 result = re_match_2_internal (bufp, string1, size1, string2, size2,
4094 pos, regs, stop);
4095 #ifndef REGEX_MALLOC /* CVS */
4096 #ifdef C_ALLOCA /* CVS */
4097 alloca (0);
4098 #endif /* CVS */
4099 #endif /* CVS */
4100 return result;
4103 /* This is a separate function so that we can force an alloca cleanup
4104 afterwards. */
4105 static int
4106 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4107 struct re_pattern_buffer *bufp;
4108 const char *string1, *string2;
4109 int size1, size2;
4110 int pos;
4111 struct re_registers *regs;
4112 int stop;
4114 /* General temporaries. */
4115 int mcnt;
4116 unsigned char *p1;
4118 /* Just past the end of the corresponding string. */
4119 const char *end1, *end2;
4121 /* Pointers into string1 and string2, just past the last characters in
4122 each to consider matching. */
4123 const char *end_match_1, *end_match_2;
4125 /* Where we are in the data, and the end of the current string. */
4126 const char *d, *dend;
4128 /* Where we are in the pattern, and the end of the pattern. */
4129 unsigned char *p = bufp->buffer;
4130 register unsigned char *pend = p + bufp->used;
4132 /* Mark the opcode just after a start_memory, so we can test for an
4133 empty subpattern when we get to the stop_memory. */
4134 unsigned char *just_past_start_mem = 0;
4136 /* We use this to map every character in the string. */
4137 RE_TRANSLATE_TYPE translate = bufp->translate;
4139 /* Nonzero if we have to concern multibyte character. */
4140 int multibyte = bufp->multibyte;
4142 /* Failure point stack. Each place that can handle a failure further
4143 down the line pushes a failure point on this stack. It consists of
4144 restart, regend, and reg_info for all registers corresponding to
4145 the subexpressions we're currently inside, plus the number of such
4146 registers, and, finally, two char *'s. The first char * is where
4147 to resume scanning the pattern; the second one is where to resume
4148 scanning the strings. If the latter is zero, the failure point is
4149 a ``dummy''; if a failure happens and the failure point is a dummy,
4150 it gets discarded and the next next one is tried. */
4151 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4152 fail_stack_type fail_stack;
4153 #endif
4154 #ifdef DEBUG
4155 static unsigned failure_id = 0;
4156 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4157 #endif
4159 /* This holds the pointer to the failure stack, when
4160 it is allocated relocatably. */
4161 fail_stack_elt_t *failure_stack_ptr;
4163 /* We fill all the registers internally, independent of what we
4164 return, for use in backreferences. The number here includes
4165 an element for register zero. */
4166 unsigned num_regs = bufp->re_nsub + 1;
4168 /* The currently active registers. */
4169 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4170 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4172 /* Information on the contents of registers. These are pointers into
4173 the input strings; they record just what was matched (on this
4174 attempt) by a subexpression part of the pattern, that is, the
4175 regnum-th regstart pointer points to where in the pattern we began
4176 matching and the regnum-th regend points to right after where we
4177 stopped matching the regnum-th subexpression. (The zeroth register
4178 keeps track of what the whole pattern matches.) */
4179 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4180 const char **regstart, **regend;
4181 #endif
4183 /* If a group that's operated upon by a repetition operator fails to
4184 match anything, then the register for its start will need to be
4185 restored because it will have been set to wherever in the string we
4186 are when we last see its open-group operator. Similarly for a
4187 register's end. */
4188 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4189 const char **old_regstart, **old_regend;
4190 #endif
4192 /* The is_active field of reg_info helps us keep track of which (possibly
4193 nested) subexpressions we are currently in. The matched_something
4194 field of reg_info[reg_num] helps us tell whether or not we have
4195 matched any of the pattern so far this time through the reg_num-th
4196 subexpression. These two fields get reset each time through any
4197 loop their register is in. */
4198 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4199 register_info_type *reg_info;
4200 #endif
4202 /* The following record the register info as found in the above
4203 variables when we find a match better than any we've seen before.
4204 This happens as we backtrack through the failure points, which in
4205 turn happens only if we have not yet matched the entire string. */
4206 unsigned best_regs_set = false;
4207 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4208 const char **best_regstart, **best_regend;
4209 #endif
4211 /* Logically, this is `best_regend[0]'. But we don't want to have to
4212 allocate space for that if we're not allocating space for anything
4213 else (see below). Also, we never need info about register 0 for
4214 any of the other register vectors, and it seems rather a kludge to
4215 treat `best_regend' differently than the rest. So we keep track of
4216 the end of the best match so far in a separate variable. We
4217 initialize this to NULL so that when we backtrack the first time
4218 and need to test it, it's not garbage. */
4219 const char *match_end = NULL;
4221 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4222 int set_regs_matched_done = 0;
4224 /* Used when we pop values we don't care about. */
4225 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4226 const char **reg_dummy;
4227 register_info_type *reg_info_dummy;
4228 #endif
4230 #ifdef DEBUG
4231 /* Counts the total number of registers pushed. */
4232 unsigned num_regs_pushed = 0;
4233 #endif
4235 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4237 INIT_FAIL_STACK ();
4239 #ifdef MATCH_MAY_ALLOCATE
4240 /* Do not bother to initialize all the register variables if there are
4241 no groups in the pattern, as it takes a fair amount of time. If
4242 there are groups, we include space for register 0 (the whole
4243 pattern), even though we never use it, since it simplifies the
4244 array indexing. We should fix this. */
4245 if (bufp->re_nsub)
4247 regstart = REGEX_TALLOC (num_regs, const char *);
4248 regend = REGEX_TALLOC (num_regs, const char *);
4249 old_regstart = REGEX_TALLOC (num_regs, const char *);
4250 old_regend = REGEX_TALLOC (num_regs, const char *);
4251 best_regstart = REGEX_TALLOC (num_regs, const char *);
4252 best_regend = REGEX_TALLOC (num_regs, const char *);
4253 reg_info = REGEX_TALLOC (num_regs, register_info_type);
4254 reg_dummy = REGEX_TALLOC (num_regs, const char *);
4255 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
4257 if (!(regstart && regend && old_regstart && old_regend && reg_info
4258 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
4260 FREE_VARIABLES ();
4261 return -2;
4264 else
4266 /* We must initialize all our variables to NULL, so that
4267 `FREE_VARIABLES' doesn't try to free them. */
4268 regstart = regend = old_regstart = old_regend = best_regstart
4269 = best_regend = reg_dummy = NULL;
4270 reg_info = reg_info_dummy = (register_info_type *) NULL;
4272 #endif /* MATCH_MAY_ALLOCATE */
4274 /* The starting position is bogus. */
4275 if (pos < 0 || pos > size1 + size2)
4277 FREE_VARIABLES ();
4278 return -1;
4281 /* Initialize subexpression text positions to -1 to mark ones that no
4282 start_memory/stop_memory has been seen for. Also initialize the
4283 register information struct. */
4284 for (mcnt = 1; mcnt < num_regs; mcnt++)
4286 regstart[mcnt] = regend[mcnt]
4287 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4289 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
4290 IS_ACTIVE (reg_info[mcnt]) = 0;
4291 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4292 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
4295 /* We move `string1' into `string2' if the latter's empty -- but not if
4296 `string1' is null. */
4297 if (size2 == 0 && string1 != NULL)
4299 string2 = string1;
4300 size2 = size1;
4301 string1 = 0;
4302 size1 = 0;
4304 end1 = string1 + size1;
4305 end2 = string2 + size2;
4307 /* Compute where to stop matching, within the two strings. */
4308 if (stop <= size1)
4310 end_match_1 = string1 + stop;
4311 end_match_2 = string2;
4313 else
4315 end_match_1 = end1;
4316 end_match_2 = string2 + stop - size1;
4319 /* `p' scans through the pattern as `d' scans through the data.
4320 `dend' is the end of the input string that `d' points within. `d'
4321 is advanced into the following input string whenever necessary, but
4322 this happens before fetching; therefore, at the beginning of the
4323 loop, `d' can be pointing at the end of a string, but it cannot
4324 equal `string2'. */
4325 if (size1 > 0 && pos <= size1)
4327 d = string1 + pos;
4328 dend = end_match_1;
4330 else
4332 d = string2 + pos - size1;
4333 dend = end_match_2;
4336 DEBUG_PRINT1 ("The compiled pattern is: ");
4337 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4338 DEBUG_PRINT1 ("The string to match is: `");
4339 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4340 DEBUG_PRINT1 ("'\n");
4342 /* This loops over pattern commands. It exits by returning from the
4343 function if the match is complete, or it drops through if the match
4344 fails at this starting point in the input data. */
4345 for (;;)
4347 DEBUG_PRINT2 ("\n0x%x: ", p);
4349 if (p == pend)
4350 { /* End of pattern means we might have succeeded. */
4351 DEBUG_PRINT1 ("end of pattern ... ");
4353 /* If we haven't matched the entire string, and we want the
4354 longest match, try backtracking. */
4355 if (d != end_match_2)
4357 /* 1 if this match ends in the same string (string1 or string2)
4358 as the best previous match. */
4359 boolean same_str_p = (FIRST_STRING_P (match_end)
4360 == MATCHING_IN_FIRST_STRING);
4361 /* 1 if this match is the best seen so far. */
4362 boolean best_match_p;
4364 /* AIX compiler got confused when this was combined
4365 with the previous declaration. */
4366 if (same_str_p)
4367 best_match_p = d > match_end;
4368 else
4369 best_match_p = !MATCHING_IN_FIRST_STRING;
4371 DEBUG_PRINT1 ("backtracking.\n");
4373 if (!FAIL_STACK_EMPTY ())
4374 { /* More failure points to try. */
4376 /* If exceeds best match so far, save it. */
4377 if (!best_regs_set || best_match_p)
4379 best_regs_set = true;
4380 match_end = d;
4382 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4384 for (mcnt = 1; mcnt < num_regs; mcnt++)
4386 best_regstart[mcnt] = regstart[mcnt];
4387 best_regend[mcnt] = regend[mcnt];
4390 goto fail;
4393 /* If no failure points, don't restore garbage. And if
4394 last match is real best match, don't restore second
4395 best one. */
4396 else if (best_regs_set && !best_match_p)
4398 restore_best_regs:
4399 /* Restore best match. It may happen that `dend ==
4400 end_match_1' while the restored d is in string2.
4401 For example, the pattern `x.*y.*z' against the
4402 strings `x-' and `y-z-', if the two strings are
4403 not consecutive in memory. */
4404 DEBUG_PRINT1 ("Restoring best registers.\n");
4406 d = match_end;
4407 dend = ((d >= string1 && d <= end1)
4408 ? end_match_1 : end_match_2);
4410 for (mcnt = 1; mcnt < num_regs; mcnt++)
4412 regstart[mcnt] = best_regstart[mcnt];
4413 regend[mcnt] = best_regend[mcnt];
4416 } /* d != end_match_2 */
4418 succeed_label:
4419 DEBUG_PRINT1 ("Accepting match.\n");
4421 /* If caller wants register contents data back, do it. */
4422 if (regs && !bufp->no_sub)
4424 /* Have the register data arrays been allocated? */
4425 if (bufp->regs_allocated == REGS_UNALLOCATED)
4426 { /* No. So allocate them with malloc. We need one
4427 extra element beyond `num_regs' for the `-1' marker
4428 GNU code uses. */
4429 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4430 regs->start = TALLOC (regs->num_regs, regoff_t);
4431 regs->end = TALLOC (regs->num_regs, regoff_t);
4432 if (regs->start == NULL || regs->end == NULL)
4434 FREE_VARIABLES ();
4435 return -2;
4437 bufp->regs_allocated = REGS_REALLOCATE;
4439 else if (bufp->regs_allocated == REGS_REALLOCATE)
4440 { /* Yes. If we need more elements than were already
4441 allocated, reallocate them. If we need fewer, just
4442 leave it alone. */
4443 if (regs->num_regs < num_regs + 1)
4445 regs->num_regs = num_regs + 1;
4446 RETALLOC (regs->start, regs->num_regs, regoff_t);
4447 RETALLOC (regs->end, regs->num_regs, regoff_t);
4448 if (regs->start == NULL || regs->end == NULL)
4450 FREE_VARIABLES ();
4451 return -2;
4455 else
4457 /* These braces fend off a "empty body in an else-statement"
4458 warning under GCC when assert expands to nothing. */
4459 assert (bufp->regs_allocated == REGS_FIXED);
4462 /* Convert the pointer data in `regstart' and `regend' to
4463 indices. Register zero has to be set differently,
4464 since we haven't kept track of any info for it. */
4465 if (regs->num_regs > 0)
4467 regs->start[0] = pos;
4468 regs->end[0] = (MATCHING_IN_FIRST_STRING
4469 ? ((regoff_t) (d - string1))
4470 : ((regoff_t) (d - string2 + size1)));
4473 /* Go through the first `min (num_regs, regs->num_regs)'
4474 registers, since that is all we initialized. */
4475 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4477 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4478 regs->start[mcnt] = regs->end[mcnt] = -1;
4479 else
4481 regs->start[mcnt]
4482 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4483 regs->end[mcnt]
4484 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4488 /* If the regs structure we return has more elements than
4489 were in the pattern, set the extra elements to -1. If
4490 we (re)allocated the registers, this is the case,
4491 because we always allocate enough to have at least one
4492 -1 at the end. */
4493 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4494 regs->start[mcnt] = regs->end[mcnt] = -1;
4495 } /* regs && !bufp->no_sub */
4497 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4498 nfailure_points_pushed, nfailure_points_popped,
4499 nfailure_points_pushed - nfailure_points_popped);
4500 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4502 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4503 ? string1
4504 : string2 - size1);
4506 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4508 FREE_VARIABLES ();
4509 return mcnt;
4512 /* Otherwise match next pattern command. */
4513 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4515 /* Ignore these. Used to ignore the n of succeed_n's which
4516 currently have n == 0. */
4517 case no_op:
4518 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4519 break;
4521 case succeed:
4522 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4523 goto succeed_label;
4525 /* Match the next n pattern characters exactly. The following
4526 byte in the pattern defines n, and the n bytes after that
4527 are the characters to match. */
4528 case exactn:
4529 mcnt = *p++;
4530 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4532 /* This is written out as an if-else so we don't waste time
4533 testing `translate' inside the loop. */
4534 if (RE_TRANSLATE_P (translate))
4536 #ifdef emacs
4537 if (multibyte)
4540 int pat_charlen, buf_charlen;
4541 unsigned int pat_ch, buf_ch;
4543 PREFETCH ();
4544 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
4545 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4547 if (RE_TRANSLATE (translate, buf_ch)
4548 != pat_ch)
4549 goto fail;
4551 p += pat_charlen;
4552 d += buf_charlen;
4553 mcnt -= pat_charlen;
4555 while (mcnt > 0);
4556 else
4557 #endif /* not emacs */
4560 PREFETCH ();
4561 if ((unsigned char) RE_TRANSLATE (translate, (unsigned char) *d)
4562 != (unsigned char) *p++)
4563 goto fail;
4564 d++;
4566 while (--mcnt);
4568 else
4572 PREFETCH ();
4573 if (*d++ != (char) *p++) goto fail;
4575 while (--mcnt);
4577 SET_REGS_MATCHED ();
4578 break;
4581 /* Match any character except possibly a newline or a null. */
4582 case anychar:
4584 int buf_charlen;
4585 unsigned int buf_ch;
4587 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4589 PREFETCH ();
4591 #ifdef emacs
4592 if (multibyte)
4593 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4594 else
4595 #endif /* not emacs */
4597 buf_ch = (unsigned char) *d;
4598 buf_charlen = 1;
4601 buf_ch = TRANSLATE (buf_ch);
4603 if ((!(bufp->syntax & RE_DOT_NEWLINE)
4604 && buf_ch == '\n')
4605 || ((bufp->syntax & RE_DOT_NOT_NULL)
4606 && buf_ch == '\000'))
4607 goto fail;
4609 SET_REGS_MATCHED ();
4610 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4611 d += buf_charlen;
4613 break;
4616 case charset:
4617 case charset_not:
4619 register unsigned int c;
4620 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4621 int len;
4623 /* Start of actual range_table, or end of bitmap if there is no
4624 range table. */
4625 unsigned char *range_table;
4627 /* Nonzero if there is range table. */
4628 int range_table_exists;
4630 /* Number of ranges of range table. Not in bytes. */
4631 int count;
4633 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4635 PREFETCH ();
4636 c = (unsigned char) *d;
4638 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4639 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
4640 if (range_table_exists)
4641 EXTRACT_NUMBER_AND_INCR (count, range_table);
4642 else
4643 count = 0;
4645 if (multibyte && BASE_LEADING_CODE_P (c))
4646 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
4648 if (SINGLE_BYTE_CHAR_P (c))
4649 { /* Lookup bitmap. */
4650 c = TRANSLATE (c); /* The character to match. */
4651 len = 1;
4653 /* Cast to `unsigned' instead of `unsigned char' in
4654 case the bit list is a full 32 bytes long. */
4655 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
4656 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4657 not = !not;
4659 else if (range_table_exists)
4660 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
4662 p = CHARSET_RANGE_TABLE_END (range_table, count);
4664 if (!not) goto fail;
4666 SET_REGS_MATCHED ();
4667 d += len;
4668 break;
4672 /* The beginning of a group is represented by start_memory.
4673 The arguments are the register number in the next byte, and the
4674 number of groups inner to this one in the next. The text
4675 matched within the group is recorded (in the internal
4676 registers data structure) under the register number. */
4677 case start_memory:
4678 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4680 /* Find out if this group can match the empty string. */
4681 p1 = p; /* To send to group_match_null_string_p. */
4683 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4684 REG_MATCH_NULL_STRING_P (reg_info[*p])
4685 = group_match_null_string_p (&p1, pend, reg_info);
4687 /* Save the position in the string where we were the last time
4688 we were at this open-group operator in case the group is
4689 operated upon by a repetition operator, e.g., with `(a*)*b'
4690 against `ab'; then we want to ignore where we are now in
4691 the string in case this attempt to match fails. */
4692 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4693 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4694 : regstart[*p];
4695 DEBUG_PRINT2 (" old_regstart: %d\n",
4696 POINTER_TO_OFFSET (old_regstart[*p]));
4698 regstart[*p] = d;
4699 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4701 IS_ACTIVE (reg_info[*p]) = 1;
4702 MATCHED_SOMETHING (reg_info[*p]) = 0;
4704 /* Clear this whenever we change the register activity status. */
4705 set_regs_matched_done = 0;
4707 /* This is the new highest active register. */
4708 highest_active_reg = *p;
4710 /* If nothing was active before, this is the new lowest active
4711 register. */
4712 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4713 lowest_active_reg = *p;
4715 /* Move past the register number and inner group count. */
4716 p += 2;
4717 just_past_start_mem = p;
4719 break;
4722 /* The stop_memory opcode represents the end of a group. Its
4723 arguments are the same as start_memory's: the register
4724 number, and the number of inner groups. */
4725 case stop_memory:
4726 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4728 /* We need to save the string position the last time we were at
4729 this close-group operator in case the group is operated
4730 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4731 against `aba'; then we want to ignore where we are now in
4732 the string in case this attempt to match fails. */
4733 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4734 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4735 : regend[*p];
4736 DEBUG_PRINT2 (" old_regend: %d\n",
4737 POINTER_TO_OFFSET (old_regend[*p]));
4739 regend[*p] = d;
4740 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4742 /* This register isn't active anymore. */
4743 IS_ACTIVE (reg_info[*p]) = 0;
4745 /* Clear this whenever we change the register activity status. */
4746 set_regs_matched_done = 0;
4748 /* If this was the only register active, nothing is active
4749 anymore. */
4750 if (lowest_active_reg == highest_active_reg)
4752 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4753 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4755 else
4756 { /* We must scan for the new highest active register, since
4757 it isn't necessarily one less than now: consider
4758 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4759 new highest active register is 1. */
4760 unsigned char r = *p - 1;
4761 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4762 r--;
4764 /* If we end up at register zero, that means that we saved
4765 the registers as the result of an `on_failure_jump', not
4766 a `start_memory', and we jumped to past the innermost
4767 `stop_memory'. For example, in ((.)*) we save
4768 registers 1 and 2 as a result of the *, but when we pop
4769 back to the second ), we are at the stop_memory 1.
4770 Thus, nothing is active. */
4771 if (r == 0)
4773 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4774 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4776 else
4777 highest_active_reg = r;
4780 /* If just failed to match something this time around with a
4781 group that's operated on by a repetition operator, try to
4782 force exit from the ``loop'', and restore the register
4783 information for this group that we had before trying this
4784 last match. */
4785 if ((!MATCHED_SOMETHING (reg_info[*p])
4786 || just_past_start_mem == p - 1)
4787 && (p + 2) < pend)
4789 boolean is_a_jump_n = false;
4791 p1 = p + 2;
4792 mcnt = 0;
4793 switch ((re_opcode_t) *p1++)
4795 case jump_n:
4796 is_a_jump_n = true;
4797 case pop_failure_jump:
4798 case maybe_pop_jump:
4799 case jump:
4800 case dummy_failure_jump:
4801 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4802 if (is_a_jump_n)
4803 p1 += 2;
4804 break;
4806 default:
4807 /* do nothing */ ;
4809 p1 += mcnt;
4811 /* If the next operation is a jump backwards in the pattern
4812 to an on_failure_jump right before the start_memory
4813 corresponding to this stop_memory, exit from the loop
4814 by forcing a failure after pushing on the stack the
4815 on_failure_jump's jump in the pattern, and d. */
4816 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4817 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4819 /* If this group ever matched anything, then restore
4820 what its registers were before trying this last
4821 failed match, e.g., with `(a*)*b' against `ab' for
4822 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4823 against `aba' for regend[3].
4825 Also restore the registers for inner groups for,
4826 e.g., `((a*)(b*))*' against `aba' (register 3 would
4827 otherwise get trashed). */
4829 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4831 unsigned r;
4833 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4835 /* Restore this and inner groups' (if any) registers. */
4836 for (r = *p; r < *p + *(p + 1); r++)
4838 regstart[r] = old_regstart[r];
4840 /* xx why this test? */
4841 if (old_regend[r] >= regstart[r])
4842 regend[r] = old_regend[r];
4845 p1++;
4846 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4847 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4849 goto fail;
4853 /* Move past the register number and the inner group count. */
4854 p += 2;
4855 break;
4858 /* \<digit> has been turned into a `duplicate' command which is
4859 followed by the numeric value of <digit> as the register number. */
4860 case duplicate:
4862 register const char *d2, *dend2;
4863 int regno = *p++; /* Get which register to match against. */
4864 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4866 /* Can't back reference a group which we've never matched. */
4867 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4868 goto fail;
4870 /* Where in input to try to start matching. */
4871 d2 = regstart[regno];
4873 /* Where to stop matching; if both the place to start and
4874 the place to stop matching are in the same string, then
4875 set to the place to stop, otherwise, for now have to use
4876 the end of the first string. */
4878 dend2 = ((FIRST_STRING_P (regstart[regno])
4879 == FIRST_STRING_P (regend[regno]))
4880 ? regend[regno] : end_match_1);
4881 for (;;)
4883 /* If necessary, advance to next segment in register
4884 contents. */
4885 while (d2 == dend2)
4887 if (dend2 == end_match_2) break;
4888 if (dend2 == regend[regno]) break;
4890 /* End of string1 => advance to string2. */
4891 d2 = string2;
4892 dend2 = regend[regno];
4894 /* At end of register contents => success */
4895 if (d2 == dend2) break;
4897 /* If necessary, advance to next segment in data. */
4898 PREFETCH ();
4900 /* How many characters left in this segment to match. */
4901 mcnt = dend - d;
4903 /* Want how many consecutive characters we can match in
4904 one shot, so, if necessary, adjust the count. */
4905 if (mcnt > dend2 - d2)
4906 mcnt = dend2 - d2;
4908 /* Compare that many; failure if mismatch, else move
4909 past them. */
4910 if (RE_TRANSLATE_P (translate)
4911 ? bcmp_translate (d, d2, mcnt, translate)
4912 : bcmp (d, d2, mcnt))
4913 goto fail;
4914 d += mcnt, d2 += mcnt;
4916 /* Do this because we've match some characters. */
4917 SET_REGS_MATCHED ();
4920 break;
4923 /* begline matches the empty string at the beginning of the string
4924 (unless `not_bol' is set in `bufp'), and, if
4925 `newline_anchor' is set, after newlines. */
4926 case begline:
4927 DEBUG_PRINT1 ("EXECUTING begline.\n");
4929 if (AT_STRINGS_BEG (d))
4931 if (!bufp->not_bol) break;
4933 else if (d[-1] == '\n' && bufp->newline_anchor)
4935 break;
4937 /* In all other cases, we fail. */
4938 goto fail;
4941 /* endline is the dual of begline. */
4942 case endline:
4943 DEBUG_PRINT1 ("EXECUTING endline.\n");
4945 if (AT_STRINGS_END (d))
4947 if (!bufp->not_eol) break;
4950 /* We have to ``prefetch'' the next character. */
4951 else if ((d == end1 ? *string2 : *d) == '\n'
4952 && bufp->newline_anchor)
4954 break;
4956 goto fail;
4959 /* Match at the very beginning of the data. */
4960 case begbuf:
4961 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
4962 if (AT_STRINGS_BEG (d))
4963 break;
4964 goto fail;
4967 /* Match at the very end of the data. */
4968 case endbuf:
4969 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
4970 if (AT_STRINGS_END (d))
4971 break;
4972 goto fail;
4975 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
4976 pushes NULL as the value for the string on the stack. Then
4977 `pop_failure_point' will keep the current value for the
4978 string, instead of restoring it. To see why, consider
4979 matching `foo\nbar' against `.*\n'. The .* matches the foo;
4980 then the . fails against the \n. But the next thing we want
4981 to do is match the \n against the \n; if we restored the
4982 string value, we would be back at the foo.
4984 Because this is used only in specific cases, we don't need to
4985 check all the things that `on_failure_jump' does, to make
4986 sure the right things get saved on the stack. Hence we don't
4987 share its code. The only reason to push anything on the
4988 stack at all is that otherwise we would have to change
4989 `anychar's code to do something besides goto fail in this
4990 case; that seems worse than this. */
4991 case on_failure_keep_string_jump:
4992 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
4994 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4995 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
4997 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
4998 break;
5001 /* Uses of on_failure_jump:
5003 Each alternative starts with an on_failure_jump that points
5004 to the beginning of the next alternative. Each alternative
5005 except the last ends with a jump that in effect jumps past
5006 the rest of the alternatives. (They really jump to the
5007 ending jump of the following alternative, because tensioning
5008 these jumps is a hassle.)
5010 Repeats start with an on_failure_jump that points past both
5011 the repetition text and either the following jump or
5012 pop_failure_jump back to this on_failure_jump. */
5013 case on_failure_jump:
5014 on_failure:
5015 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
5017 #if defined (WINDOWSNT) && defined (emacs)
5018 QUIT;
5019 #endif
5021 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5022 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
5024 /* If this on_failure_jump comes right before a group (i.e.,
5025 the original * applied to a group), save the information
5026 for that group and all inner ones, so that if we fail back
5027 to this point, the group's information will be correct.
5028 For example, in \(a*\)*\1, we need the preceding group,
5029 and in \(zz\(a*\)b*\)\2, we need the inner group. */
5031 /* We can't use `p' to check ahead because we push
5032 a failure point to `p + mcnt' after we do this. */
5033 p1 = p;
5035 /* We need to skip no_op's before we look for the
5036 start_memory in case this on_failure_jump is happening as
5037 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
5038 against aba. */
5039 while (p1 < pend && (re_opcode_t) *p1 == no_op)
5040 p1++;
5042 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
5044 /* We have a new highest active register now. This will
5045 get reset at the start_memory we are about to get to,
5046 but we will have saved all the registers relevant to
5047 this repetition op, as described above. */
5048 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5049 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5050 lowest_active_reg = *(p1 + 1);
5053 DEBUG_PRINT1 (":\n");
5054 PUSH_FAILURE_POINT (p + mcnt, d, -2);
5055 break;
5058 /* A smart repeat ends with `maybe_pop_jump'.
5059 We change it to either `pop_failure_jump' or `jump'. */
5060 case maybe_pop_jump:
5061 #if defined (WINDOWSNT) && defined (emacs)
5062 QUIT;
5063 #endif
5064 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5065 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
5067 register unsigned char *p2 = p;
5069 /* Compare the beginning of the repeat with what in the
5070 pattern follows its end. If we can establish that there
5071 is nothing that they would both match, i.e., that we
5072 would have to backtrack because of (as in, e.g., `a*a')
5073 then we can change to pop_failure_jump, because we'll
5074 never have to backtrack.
5076 This is not true in the case of alternatives: in
5077 `(a|ab)*' we do need to backtrack to the `ab' alternative
5078 (e.g., if the string was `ab'). But instead of trying to
5079 detect that here, the alternative has put on a dummy
5080 failure point which is what we will end up popping. */
5082 /* Skip over open/close-group commands.
5083 If what follows this loop is a ...+ construct,
5084 look at what begins its body, since we will have to
5085 match at least one of that. */
5086 while (1)
5088 if (p2 + 2 < pend
5089 && ((re_opcode_t) *p2 == stop_memory
5090 || (re_opcode_t) *p2 == start_memory))
5091 p2 += 3;
5092 else if (p2 + 6 < pend
5093 && (re_opcode_t) *p2 == dummy_failure_jump)
5094 p2 += 6;
5095 else
5096 break;
5099 p1 = p + mcnt;
5100 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5101 to the `maybe_finalize_jump' of this case. Examine what
5102 follows. */
5104 /* If we're at the end of the pattern, we can change. */
5105 if (p2 == pend)
5107 /* Consider what happens when matching ":\(.*\)"
5108 against ":/". I don't really understand this code
5109 yet. */
5110 p[-3] = (unsigned char) pop_failure_jump;
5111 DEBUG_PRINT1
5112 (" End of pattern: change to `pop_failure_jump'.\n");
5115 else if ((re_opcode_t) *p2 == exactn
5116 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
5118 register unsigned int c
5119 = *p2 == (unsigned char) endline ? '\n' : p2[2];
5121 if ((re_opcode_t) p1[3] == exactn)
5123 if (!(multibyte /* && (c != '\n') */
5124 && BASE_LEADING_CODE_P (c))
5125 ? c != p1[5]
5126 : (STRING_CHAR (&p2[2], pend - &p2[2])
5127 != STRING_CHAR (&p1[5], pend - &p1[5])))
5129 p[-3] = (unsigned char) pop_failure_jump;
5130 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
5131 c, p1[5]);
5135 else if ((re_opcode_t) p1[3] == charset
5136 || (re_opcode_t) p1[3] == charset_not)
5138 int not = (re_opcode_t) p1[3] == charset_not;
5140 if (multibyte /* && (c != '\n') */
5141 && BASE_LEADING_CODE_P (c))
5142 c = STRING_CHAR (&p2[2], pend - &p2[2]);
5144 /* Test if C is listed in charset (or charset_not)
5145 at `&p1[3]'. */
5146 if (SINGLE_BYTE_CHAR_P (c))
5148 if (c < CHARSET_BITMAP_SIZE (&p1[3]) * BYTEWIDTH
5149 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5150 not = !not;
5152 else if (CHARSET_RANGE_TABLE_EXISTS_P (&p1[3]))
5153 CHARSET_LOOKUP_RANGE_TABLE (not, c, &p1[3]);
5155 /* `not' is equal to 1 if c would match, which means
5156 that we can't change to pop_failure_jump. */
5157 if (!not)
5159 p[-3] = (unsigned char) pop_failure_jump;
5160 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5164 else if ((re_opcode_t) *p2 == charset)
5166 if ((re_opcode_t) p1[3] == exactn)
5168 register unsigned int c = p1[5];
5169 int not = 0;
5171 if (multibyte && BASE_LEADING_CODE_P (c))
5172 c = STRING_CHAR (&p1[5], pend - &p1[5]);
5174 /* Test if C is listed in charset at `p2'. */
5175 if (SINGLE_BYTE_CHAR_P (c))
5177 if (c < CHARSET_BITMAP_SIZE (p2) * BYTEWIDTH
5178 && (p2[2 + c / BYTEWIDTH]
5179 & (1 << (c % BYTEWIDTH))))
5180 not = !not;
5182 else if (CHARSET_RANGE_TABLE_EXISTS_P (p2))
5183 CHARSET_LOOKUP_RANGE_TABLE (not, c, p2);
5185 if (!not)
5187 p[-3] = (unsigned char) pop_failure_jump;
5188 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5192 /* It is hard to list up all the character in charset
5193 P2 if it includes multibyte character. Give up in
5194 such case. */
5195 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
5197 /* Now, we are sure that P2 has no range table.
5198 So, for the size of bitmap in P2, `p2[1]' is
5199 enough. But P1 may have range table, so the
5200 size of bitmap table of P1 is extracted by
5201 using macro `CHARSET_BITMAP_SIZE'.
5203 Since we know that all the character listed in
5204 P2 is ASCII, it is enough to test only bitmap
5205 table of P1. */
5207 if ((re_opcode_t) p1[3] == charset_not)
5209 int idx;
5210 /* We win if the charset_not inside the loop lists
5211 every character listed in the charset after. */
5212 for (idx = 0; idx < (int) p2[1]; idx++)
5213 if (! (p2[2 + idx] == 0
5214 || (idx < CHARSET_BITMAP_SIZE (&p1[3])
5215 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
5216 break;
5218 if (idx == p2[1])
5220 p[-3] = (unsigned char) pop_failure_jump;
5221 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5224 else if ((re_opcode_t) p1[3] == charset)
5226 int idx;
5227 /* We win if the charset inside the loop
5228 has no overlap with the one after the loop. */
5229 for (idx = 0;
5230 (idx < (int) p2[1]
5231 && idx < CHARSET_BITMAP_SIZE (&p1[3]));
5232 idx++)
5233 if ((p2[2 + idx] & p1[5 + idx]) != 0)
5234 break;
5236 if (idx == p2[1]
5237 || idx == CHARSET_BITMAP_SIZE (&p1[3]))
5239 p[-3] = (unsigned char) pop_failure_jump;
5240 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
5246 p -= 2; /* Point at relative address again. */
5247 if ((re_opcode_t) p[-1] != pop_failure_jump)
5249 p[-1] = (unsigned char) jump;
5250 DEBUG_PRINT1 (" Match => jump.\n");
5251 goto unconditional_jump;
5253 /* Note fall through. */
5256 /* The end of a simple repeat has a pop_failure_jump back to
5257 its matching on_failure_jump, where the latter will push a
5258 failure point. The pop_failure_jump takes off failure
5259 points put on by this pop_failure_jump's matching
5260 on_failure_jump; we got through the pattern to here from the
5261 matching on_failure_jump, so didn't fail. */
5262 case pop_failure_jump:
5264 /* We need to pass separate storage for the lowest and
5265 highest registers, even though we don't care about the
5266 actual values. Otherwise, we will restore only one
5267 register from the stack, since lowest will == highest in
5268 `pop_failure_point'. */
5269 unsigned dummy_low_reg, dummy_high_reg;
5270 unsigned char *pdummy;
5271 const char *sdummy;
5273 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
5274 POP_FAILURE_POINT (sdummy, pdummy,
5275 dummy_low_reg, dummy_high_reg,
5276 reg_dummy, reg_dummy, reg_info_dummy);
5278 /* Note fall through. */
5281 /* Unconditionally jump (without popping any failure points). */
5282 case jump:
5283 unconditional_jump:
5284 #if defined (WINDOWSNT) && defined (emacs)
5285 QUIT;
5286 #endif
5287 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5288 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5289 p += mcnt; /* Do the jump. */
5290 DEBUG_PRINT2 ("(to 0x%x).\n", p);
5291 break;
5294 /* We need this opcode so we can detect where alternatives end
5295 in `group_match_null_string_p' et al. */
5296 case jump_past_alt:
5297 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
5298 goto unconditional_jump;
5301 /* Normally, the on_failure_jump pushes a failure point, which
5302 then gets popped at pop_failure_jump. We will end up at
5303 pop_failure_jump, also, and with a pattern of, say, `a+', we
5304 are skipping over the on_failure_jump, so we have to push
5305 something meaningless for pop_failure_jump to pop. */
5306 case dummy_failure_jump:
5307 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
5308 /* It doesn't matter what we push for the string here. What
5309 the code at `fail' tests is the value for the pattern. */
5310 PUSH_FAILURE_POINT (0, 0, -2);
5311 goto unconditional_jump;
5314 /* At the end of an alternative, we need to push a dummy failure
5315 point in case we are followed by a `pop_failure_jump', because
5316 we don't want the failure point for the alternative to be
5317 popped. For example, matching `(a|ab)*' against `aab'
5318 requires that we match the `ab' alternative. */
5319 case push_dummy_failure:
5320 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
5321 /* See comments just above at `dummy_failure_jump' about the
5322 two zeroes. */
5323 PUSH_FAILURE_POINT (0, 0, -2);
5324 break;
5326 /* Have to succeed matching what follows at least n times.
5327 After that, handle like `on_failure_jump'. */
5328 case succeed_n:
5329 EXTRACT_NUMBER (mcnt, p + 2);
5330 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5332 assert (mcnt >= 0);
5333 /* Originally, this is how many times we HAVE to succeed. */
5334 if (mcnt > 0)
5336 mcnt--;
5337 p += 2;
5338 STORE_NUMBER_AND_INCR (p, mcnt);
5339 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
5341 else if (mcnt == 0)
5343 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
5344 p[2] = (unsigned char) no_op;
5345 p[3] = (unsigned char) no_op;
5346 goto on_failure;
5348 break;
5350 case jump_n:
5351 EXTRACT_NUMBER (mcnt, p + 2);
5352 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5354 /* Originally, this is how many times we CAN jump. */
5355 if (mcnt)
5357 mcnt--;
5358 STORE_NUMBER (p + 2, mcnt);
5359 goto unconditional_jump;
5361 /* If don't have to jump any more, skip over the rest of command. */
5362 else
5363 p += 4;
5364 break;
5366 case set_number_at:
5368 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5370 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5371 p1 = p + mcnt;
5372 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5373 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
5374 STORE_NUMBER (p1, mcnt);
5375 break;
5378 case wordbound:
5379 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5381 /* We SUCCEED in one of the following cases: */
5383 /* Case 1: D is at the beginning or the end of string. */
5384 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5385 break;
5386 else
5388 /* C1 is the character before D, S1 is the syntax of C1, C2
5389 is the character at D, and S2 is the syntax of C2. */
5390 int c1, c2, s1, s2;
5391 int pos1 = PTR_TO_OFFSET (d - 1);
5392 int charpos;
5394 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5395 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5396 #ifdef emacs
5397 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
5398 UPDATE_SYNTAX_TABLE (charpos);
5399 #endif
5400 s1 = SYNTAX (c1);
5401 #ifdef emacs
5402 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5403 #endif
5404 s2 = SYNTAX (c2);
5406 if (/* Case 2: Only one of S1 and S2 is Sword. */
5407 ((s1 == Sword) != (s2 == Sword))
5408 /* Case 3: Both of S1 and S2 are Sword, and macro
5409 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5410 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5411 break;
5413 goto fail;
5415 case notwordbound:
5416 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5418 /* We FAIL in one of the following cases: */
5420 /* Case 1: D is at the beginning or the end of string. */
5421 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5422 goto fail;
5423 else
5425 /* C1 is the character before D, S1 is the syntax of C1, C2
5426 is the character at D, and S2 is the syntax of C2. */
5427 int c1, c2, s1, s2;
5428 int pos1 = PTR_TO_OFFSET (d - 1);
5429 int charpos;
5431 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5432 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5433 #ifdef emacs
5434 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
5435 UPDATE_SYNTAX_TABLE (charpos);
5436 #endif
5437 s1 = SYNTAX (c1);
5438 #ifdef emacs
5439 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5440 #endif
5441 s2 = SYNTAX (c2);
5443 if (/* Case 2: Only one of S1 and S2 is Sword. */
5444 ((s1 == Sword) != (s2 == Sword))
5445 /* Case 3: Both of S1 and S2 are Sword, and macro
5446 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5447 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5448 goto fail;
5450 break;
5452 case wordbeg:
5453 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5455 /* We FAIL in one of the following cases: */
5457 /* Case 1: D is at the end of string. */
5458 if (AT_STRINGS_END (d))
5459 goto fail;
5460 else
5462 /* C1 is the character before D, S1 is the syntax of C1, C2
5463 is the character at D, and S2 is the syntax of C2. */
5464 int c1, c2, s1, s2;
5465 int pos1 = PTR_TO_OFFSET (d);
5466 int charpos;
5468 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5469 #ifdef emacs
5470 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1);
5471 UPDATE_SYNTAX_TABLE (charpos);
5472 #endif
5473 s2 = SYNTAX (c2);
5475 /* Case 2: S2 is not Sword. */
5476 if (s2 != Sword)
5477 goto fail;
5479 /* Case 3: D is not at the beginning of string ... */
5480 if (!AT_STRINGS_BEG (d))
5482 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5483 #ifdef emacs
5484 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5485 #endif
5486 s1 = SYNTAX (c1);
5488 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5489 returns 0. */
5490 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5491 goto fail;
5494 break;
5496 case wordend:
5497 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5499 /* We FAIL in one of the following cases: */
5501 /* Case 1: D is at the beginning of string. */
5502 if (AT_STRINGS_BEG (d))
5503 goto fail;
5504 else
5506 /* C1 is the character before D, S1 is the syntax of C1, C2
5507 is the character at D, and S2 is the syntax of C2. */
5508 int c1, c2, s1, s2;
5509 int pos1 = PTR_TO_OFFSET (d);
5510 int charpos;
5512 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5513 #ifdef emacs
5514 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (pos1 - 1);
5515 UPDATE_SYNTAX_TABLE (charpos);
5516 #endif
5517 s1 = SYNTAX (c1);
5519 /* Case 2: S1 is not Sword. */
5520 if (s1 != Sword)
5521 goto fail;
5523 /* Case 3: D is not at the end of string ... */
5524 if (!AT_STRINGS_END (d))
5526 GET_CHAR_AFTER_2 (c2, d, string1, end1, string2, end2);
5527 #ifdef emacs
5528 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5529 #endif
5530 s2 = SYNTAX (c2);
5532 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
5533 returns 0. */
5534 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5535 goto fail;
5538 break;
5540 #ifdef emacs
5541 case before_dot:
5542 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5543 if (PTR_BYTE_POS ((unsigned char *) d) >= PT_BYTE)
5544 goto fail;
5545 break;
5547 case at_dot:
5548 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5549 if (PTR_BYTE_POS ((unsigned char *) d) != PT_BYTE)
5550 goto fail;
5551 break;
5553 case after_dot:
5554 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5555 if (PTR_BYTE_POS ((unsigned char *) d) <= PT_BYTE)
5556 goto fail;
5557 break;
5559 case syntaxspec:
5560 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5561 mcnt = *p++;
5562 goto matchsyntax;
5564 case wordchar:
5565 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5566 mcnt = (int) Sword;
5567 matchsyntax:
5568 PREFETCH ();
5569 #ifdef emacs
5571 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5572 UPDATE_SYNTAX_TABLE (pos1);
5574 #endif
5576 int c, len;
5578 if (multibyte)
5579 /* we must concern about multibyte form, ... */
5580 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5581 else
5582 /* everything should be handled as ASCII, even though it
5583 looks like multibyte form. */
5584 c = *d, len = 1;
5586 if (SYNTAX (c) != (enum syntaxcode) mcnt)
5587 goto fail;
5588 d += len;
5590 SET_REGS_MATCHED ();
5591 break;
5593 case notsyntaxspec:
5594 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5595 mcnt = *p++;
5596 goto matchnotsyntax;
5598 case notwordchar:
5599 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5600 mcnt = (int) Sword;
5601 matchnotsyntax:
5602 PREFETCH ();
5603 #ifdef emacs
5605 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (PTR_TO_OFFSET (d));
5606 UPDATE_SYNTAX_TABLE (pos1);
5608 #endif
5610 int c, len;
5612 if (multibyte)
5613 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5614 else
5615 c = *d, len = 1;
5617 if (SYNTAX (c) == (enum syntaxcode) mcnt)
5618 goto fail;
5619 d += len;
5621 SET_REGS_MATCHED ();
5622 break;
5624 case categoryspec:
5625 DEBUG_PRINT2 ("EXECUTING categoryspec %d.\n", *p);
5626 mcnt = *p++;
5627 PREFETCH ();
5629 int c, len;
5631 if (multibyte)
5632 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5633 else
5634 c = *d, len = 1;
5636 if (!CHAR_HAS_CATEGORY (c, mcnt))
5637 goto fail;
5638 d += len;
5640 SET_REGS_MATCHED ();
5641 break;
5643 case notcategoryspec:
5644 DEBUG_PRINT2 ("EXECUTING notcategoryspec %d.\n", *p);
5645 mcnt = *p++;
5646 PREFETCH ();
5648 int c, len;
5650 if (multibyte)
5651 c = STRING_CHAR_AND_LENGTH (d, dend - d, len);
5652 else
5653 c = *d, len = 1;
5655 if (CHAR_HAS_CATEGORY (c, mcnt))
5656 goto fail;
5657 d += len;
5659 SET_REGS_MATCHED ();
5660 break;
5662 #else /* not emacs */
5663 case wordchar:
5664 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5665 PREFETCH ();
5666 if (!WORDCHAR_P (d))
5667 goto fail;
5668 SET_REGS_MATCHED ();
5669 d++;
5670 break;
5672 case notwordchar:
5673 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5674 PREFETCH ();
5675 if (WORDCHAR_P (d))
5676 goto fail;
5677 SET_REGS_MATCHED ();
5678 d++;
5679 break;
5680 #endif /* not emacs */
5682 default:
5683 abort ();
5685 continue; /* Successfully executed one pattern command; keep going. */
5688 /* We goto here if a matching operation fails. */
5689 fail:
5690 #if defined (WINDOWSNT) && defined (emacs)
5691 QUIT;
5692 #endif
5693 if (!FAIL_STACK_EMPTY ())
5694 { /* A restart point is known. Restore to that state. */
5695 DEBUG_PRINT1 ("\nFAIL:\n");
5696 POP_FAILURE_POINT (d, p,
5697 lowest_active_reg, highest_active_reg,
5698 regstart, regend, reg_info);
5700 /* If this failure point is a dummy, try the next one. */
5701 if (!p)
5702 goto fail;
5704 /* If we failed to the end of the pattern, don't examine *p. */
5705 assert (p <= pend);
5706 if (p < pend)
5708 boolean is_a_jump_n = false;
5710 /* If failed to a backwards jump that's part of a repetition
5711 loop, need to pop this failure point and use the next one. */
5712 switch ((re_opcode_t) *p)
5714 case jump_n:
5715 is_a_jump_n = true;
5716 case maybe_pop_jump:
5717 case pop_failure_jump:
5718 case jump:
5719 p1 = p + 1;
5720 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5721 p1 += mcnt;
5723 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5724 || (!is_a_jump_n
5725 && (re_opcode_t) *p1 == on_failure_jump))
5726 goto fail;
5727 break;
5728 default:
5729 /* do nothing */ ;
5733 if (d >= string1 && d <= end1)
5734 dend = end_match_1;
5736 else
5737 break; /* Matching at this starting point really fails. */
5738 } /* for (;;) */
5740 if (best_regs_set)
5741 goto restore_best_regs;
5743 FREE_VARIABLES ();
5745 return -1; /* Failure to match. */
5746 } /* re_match_2 */
5748 /* Subroutine definitions for re_match_2. */
5751 /* We are passed P pointing to a register number after a start_memory.
5753 Return true if the pattern up to the corresponding stop_memory can
5754 match the empty string, and false otherwise.
5756 If we find the matching stop_memory, sets P to point to one past its number.
5757 Otherwise, sets P to an undefined byte less than or equal to END.
5759 We don't handle duplicates properly (yet). */
5761 static boolean
5762 group_match_null_string_p (p, end, reg_info)
5763 unsigned char **p, *end;
5764 register_info_type *reg_info;
5766 int mcnt;
5767 /* Point to after the args to the start_memory. */
5768 unsigned char *p1 = *p + 2;
5770 while (p1 < end)
5772 /* Skip over opcodes that can match nothing, and return true or
5773 false, as appropriate, when we get to one that can't, or to the
5774 matching stop_memory. */
5776 switch ((re_opcode_t) *p1)
5778 /* Could be either a loop or a series of alternatives. */
5779 case on_failure_jump:
5780 p1++;
5781 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5783 /* If the next operation is not a jump backwards in the
5784 pattern. */
5786 if (mcnt >= 0)
5788 /* Go through the on_failure_jumps of the alternatives,
5789 seeing if any of the alternatives cannot match nothing.
5790 The last alternative starts with only a jump,
5791 whereas the rest start with on_failure_jump and end
5792 with a jump, e.g., here is the pattern for `a|b|c':
5794 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5795 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5796 /exactn/1/c
5798 So, we have to first go through the first (n-1)
5799 alternatives and then deal with the last one separately. */
5802 /* Deal with the first (n-1) alternatives, which start
5803 with an on_failure_jump (see above) that jumps to right
5804 past a jump_past_alt. */
5806 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5808 /* `mcnt' holds how many bytes long the alternative
5809 is, including the ending `jump_past_alt' and
5810 its number. */
5812 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5813 reg_info))
5814 return false;
5816 /* Move to right after this alternative, including the
5817 jump_past_alt. */
5818 p1 += mcnt;
5820 /* Break if it's the beginning of an n-th alternative
5821 that doesn't begin with an on_failure_jump. */
5822 if ((re_opcode_t) *p1 != on_failure_jump)
5823 break;
5825 /* Still have to check that it's not an n-th
5826 alternative that starts with an on_failure_jump. */
5827 p1++;
5828 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5829 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5831 /* Get to the beginning of the n-th alternative. */
5832 p1 -= 3;
5833 break;
5837 /* Deal with the last alternative: go back and get number
5838 of the `jump_past_alt' just before it. `mcnt' contains
5839 the length of the alternative. */
5840 EXTRACT_NUMBER (mcnt, p1 - 2);
5842 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5843 return false;
5845 p1 += mcnt; /* Get past the n-th alternative. */
5846 } /* if mcnt > 0 */
5847 break;
5850 case stop_memory:
5851 assert (p1[1] == **p);
5852 *p = p1 + 2;
5853 return true;
5856 default:
5857 if (!common_op_match_null_string_p (&p1, end, reg_info))
5858 return false;
5860 } /* while p1 < end */
5862 return false;
5863 } /* group_match_null_string_p */
5866 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5867 It expects P to be the first byte of a single alternative and END one
5868 byte past the last. The alternative can contain groups. */
5870 static boolean
5871 alt_match_null_string_p (p, end, reg_info)
5872 unsigned char *p, *end;
5873 register_info_type *reg_info;
5875 int mcnt;
5876 unsigned char *p1 = p;
5878 while (p1 < end)
5880 /* Skip over opcodes that can match nothing, and break when we get
5881 to one that can't. */
5883 switch ((re_opcode_t) *p1)
5885 /* It's a loop. */
5886 case on_failure_jump:
5887 p1++;
5888 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5889 p1 += mcnt;
5890 break;
5892 default:
5893 if (!common_op_match_null_string_p (&p1, end, reg_info))
5894 return false;
5896 } /* while p1 < end */
5898 return true;
5899 } /* alt_match_null_string_p */
5902 /* Deals with the ops common to group_match_null_string_p and
5903 alt_match_null_string_p.
5905 Sets P to one after the op and its arguments, if any. */
5907 static boolean
5908 common_op_match_null_string_p (p, end, reg_info)
5909 unsigned char **p, *end;
5910 register_info_type *reg_info;
5912 int mcnt;
5913 boolean ret;
5914 int reg_no;
5915 unsigned char *p1 = *p;
5917 switch ((re_opcode_t) *p1++)
5919 case no_op:
5920 case begline:
5921 case endline:
5922 case begbuf:
5923 case endbuf:
5924 case wordbeg:
5925 case wordend:
5926 case wordbound:
5927 case notwordbound:
5928 #ifdef emacs
5929 case before_dot:
5930 case at_dot:
5931 case after_dot:
5932 #endif
5933 break;
5935 case start_memory:
5936 reg_no = *p1;
5937 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
5938 ret = group_match_null_string_p (&p1, end, reg_info);
5940 /* Have to set this here in case we're checking a group which
5941 contains a group and a back reference to it. */
5943 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5944 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5946 if (!ret)
5947 return false;
5948 break;
5950 /* If this is an optimized succeed_n for zero times, make the jump. */
5951 case jump:
5952 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5953 if (mcnt >= 0)
5954 p1 += mcnt;
5955 else
5956 return false;
5957 break;
5959 case succeed_n:
5960 /* Get to the number of times to succeed. */
5961 p1 += 2;
5962 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5964 if (mcnt == 0)
5966 p1 -= 4;
5967 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5968 p1 += mcnt;
5970 else
5971 return false;
5972 break;
5974 case duplicate:
5975 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5976 return false;
5977 break;
5979 case set_number_at:
5980 p1 += 4;
5982 default:
5983 /* All other opcodes mean we cannot match the empty string. */
5984 return false;
5987 *p = p1;
5988 return true;
5989 } /* common_op_match_null_string_p */
5992 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5993 bytes; nonzero otherwise. */
5995 static int
5996 bcmp_translate (s1, s2, len, translate)
5997 unsigned char *s1, *s2;
5998 register int len;
5999 RE_TRANSLATE_TYPE translate;
6001 register unsigned char *p1 = s1, *p2 = s2;
6002 unsigned char *p1_end = s1 + len;
6003 unsigned char *p2_end = s2 + len;
6005 while (p1 != p1_end && p2 != p2_end)
6007 int p1_charlen, p2_charlen;
6008 int p1_ch, p2_ch;
6010 p1_ch = STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
6011 p2_ch = STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
6013 if (RE_TRANSLATE (translate, p1_ch)
6014 != RE_TRANSLATE (translate, p2_ch))
6015 return 1;
6017 p1 += p1_charlen, p2 += p2_charlen;
6020 if (p1 != p1_end || p2 != p2_end)
6021 return 1;
6023 return 0;
6026 /* Entry points for GNU code. */
6028 /* re_compile_pattern is the GNU regular expression compiler: it
6029 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6030 Returns 0 if the pattern was valid, otherwise an error string.
6032 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6033 are set in BUFP on entry.
6035 We call regex_compile to do the actual compilation. */
6037 const char *
6038 re_compile_pattern (pattern, length, bufp)
6039 const char *pattern;
6040 int length;
6041 struct re_pattern_buffer *bufp;
6043 reg_errcode_t ret;
6045 /* GNU code is written to assume at least RE_NREGS registers will be set
6046 (and at least one extra will be -1). */
6047 bufp->regs_allocated = REGS_UNALLOCATED;
6049 /* And GNU code determines whether or not to get register information
6050 by passing null for the REGS argument to re_match, etc., not by
6051 setting no_sub. */
6052 bufp->no_sub = 0;
6054 /* Match anchors at newline. */
6055 bufp->newline_anchor = 1;
6057 ret = regex_compile (pattern, length, re_syntax_options, bufp);
6059 if (!ret)
6060 return NULL;
6061 return gettext (re_error_msgid[(int) ret]);
6064 /* Entry points compatible with 4.2 BSD regex library. We don't define
6065 them unless specifically requested. */
6067 #if defined (_REGEX_RE_COMP) || defined (_LIBC)
6069 /* BSD has one and only one pattern buffer. */
6070 static struct re_pattern_buffer re_comp_buf;
6072 char *
6073 #ifdef _LIBC
6074 /* Make these definitions weak in libc, so POSIX programs can redefine
6075 these names if they don't use our functions, and still use
6076 regcomp/regexec below without link errors. */
6077 weak_function
6078 #endif
6079 re_comp (s)
6080 const char *s;
6082 reg_errcode_t ret;
6084 if (!s)
6086 if (!re_comp_buf.buffer)
6087 return (char *) gettext ("No previous regular expression");
6088 return 0;
6091 if (!re_comp_buf.buffer)
6093 re_comp_buf.buffer = (unsigned char *) malloc (200);
6094 if (re_comp_buf.buffer == NULL)
6095 /* CVS: Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6096 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6097 re_comp_buf.allocated = 200;
6099 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6100 if (re_comp_buf.fastmap == NULL)
6101 /* CVS: Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6102 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6105 /* Since `re_exec' always passes NULL for the `regs' argument, we
6106 don't need to initialize the pattern buffer fields which affect it. */
6108 /* Match anchors at newlines. */
6109 re_comp_buf.newline_anchor = 1;
6111 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
6113 if (!ret)
6114 return NULL;
6116 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6117 return (char *) gettext (re_error_msgid[(int) ret]);
6122 #ifdef _LIBC
6123 weak_function
6124 #endif
6125 re_exec (s)
6126 const char *s;
6128 const int len = strlen (s);
6129 return
6130 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6132 #endif /* _REGEX_RE_COMP */
6134 /* POSIX.2 functions. Don't define these for Emacs. */
6136 #ifndef emacs
6138 /* regcomp takes a regular expression as a string and compiles it.
6140 PREG is a regex_t *. We do not expect any fields to be initialized,
6141 since POSIX says we shouldn't. Thus, we set
6143 `buffer' to the compiled pattern;
6144 `used' to the length of the compiled pattern;
6145 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6146 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6147 RE_SYNTAX_POSIX_BASIC;
6148 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6149 `fastmap' and `fastmap_accurate' to zero;
6150 `re_nsub' to the number of subexpressions in PATTERN.
6152 PATTERN is the address of the pattern string.
6154 CFLAGS is a series of bits which affect compilation.
6156 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6157 use POSIX basic syntax.
6159 If REG_NEWLINE is set, then . and [^...] don't match newline.
6160 Also, regexec will try a match beginning after every newline.
6162 If REG_ICASE is set, then we considers upper- and lowercase
6163 versions of letters to be equivalent when matching.
6165 If REG_NOSUB is set, then when PREG is passed to regexec, that
6166 routine will report only success or failure, and nothing about the
6167 registers.
6169 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6170 the return codes and their meanings.) */
6173 regcomp (preg, pattern, cflags)
6174 regex_t *preg;
6175 const char *pattern;
6176 int cflags;
6178 reg_errcode_t ret;
6179 unsigned syntax
6180 = (cflags & REG_EXTENDED) ?
6181 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6183 /* regex_compile will allocate the space for the compiled pattern. */
6184 preg->buffer = 0;
6185 preg->allocated = 0;
6186 preg->used = 0;
6188 /* Don't bother to use a fastmap when searching. This simplifies the
6189 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6190 characters after newlines into the fastmap. This way, we just try
6191 every character. */
6192 preg->fastmap = 0;
6194 if (cflags & REG_ICASE)
6196 unsigned i;
6198 preg->translate
6199 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6200 * sizeof (*(RE_TRANSLATE_TYPE)0));
6201 if (preg->translate == NULL)
6202 return (int) REG_ESPACE;
6204 /* Map uppercase characters to corresponding lowercase ones. */
6205 for (i = 0; i < CHAR_SET_SIZE; i++)
6206 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
6208 else
6209 preg->translate = NULL;
6211 /* If REG_NEWLINE is set, newlines are treated differently. */
6212 if (cflags & REG_NEWLINE)
6213 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6214 syntax &= ~RE_DOT_NEWLINE;
6215 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6216 /* It also changes the matching behavior. */
6217 preg->newline_anchor = 1;
6219 else
6220 preg->newline_anchor = 0;
6222 preg->no_sub = !!(cflags & REG_NOSUB);
6224 /* POSIX says a null character in the pattern terminates it, so we
6225 can use strlen here in compiling the pattern. */
6226 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
6228 /* POSIX doesn't distinguish between an unmatched open-group and an
6229 unmatched close-group: both are REG_EPAREN. */
6230 if (ret == REG_ERPAREN) ret = REG_EPAREN;
6232 return (int) ret;
6236 /* regexec searches for a given pattern, specified by PREG, in the
6237 string STRING.
6239 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6240 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6241 least NMATCH elements, and we set them to the offsets of the
6242 corresponding matched substrings.
6244 EFLAGS specifies `execution flags' which affect matching: if
6245 REG_NOTBOL is set, then ^ does not match at the beginning of the
6246 string; if REG_NOTEOL is set, then $ does not match at the end.
6248 We return 0 if we find a match and REG_NOMATCH if not. */
6251 regexec (preg, string, nmatch, pmatch, eflags)
6252 const regex_t *preg;
6253 const char *string;
6254 size_t nmatch;
6255 regmatch_t pmatch[];
6256 int eflags;
6258 int ret;
6259 struct re_registers regs;
6260 regex_t private_preg;
6261 int len = strlen (string);
6262 boolean want_reg_info = !preg->no_sub && nmatch > 0;
6264 private_preg = *preg;
6266 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6267 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6269 /* The user has told us exactly how many registers to return
6270 information about, via `nmatch'. We have to pass that on to the
6271 matching routines. */
6272 private_preg.regs_allocated = REGS_FIXED;
6274 if (want_reg_info)
6276 regs.num_regs = nmatch;
6277 regs.start = TALLOC (nmatch, regoff_t);
6278 regs.end = TALLOC (nmatch, regoff_t);
6279 if (regs.start == NULL || regs.end == NULL)
6280 return (int) REG_NOMATCH;
6283 /* Perform the searching operation. */
6284 ret = re_search (&private_preg, string, len,
6285 /* start: */ 0, /* range: */ len,
6286 want_reg_info ? &regs : (struct re_registers *) 0);
6288 /* Copy the register information to the POSIX structure. */
6289 if (want_reg_info)
6291 if (ret >= 0)
6293 unsigned r;
6295 for (r = 0; r < nmatch; r++)
6297 pmatch[r].rm_so = regs.start[r];
6298 pmatch[r].rm_eo = regs.end[r];
6302 /* If we needed the temporary register info, free the space now. */
6303 free (regs.start);
6304 free (regs.end);
6307 /* We want zero return to mean success, unlike `re_search'. */
6308 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6312 /* Returns a message corresponding to an error code, ERRCODE, returned
6313 from either regcomp or regexec. We don't use PREG here. */
6315 size_t
6316 regerror (errcode, preg, errbuf, errbuf_size)
6317 int errcode;
6318 const regex_t *preg;
6319 char *errbuf;
6320 size_t errbuf_size;
6322 const char *msg;
6323 size_t msg_size;
6325 if (errcode < 0
6326 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6327 /* Only error codes returned by the rest of the code should be passed
6328 to this routine. If we are given anything else, or if other regex
6329 code generates an invalid error code, then the program has a bug.
6330 Dump core so we can fix it. */
6331 abort ();
6333 msg = gettext (re_error_msgid[errcode]);
6335 msg_size = strlen (msg) + 1; /* Includes the null. */
6337 if (errbuf_size != 0)
6339 if (msg_size > errbuf_size)
6341 strncpy (errbuf, msg, errbuf_size - 1);
6342 errbuf[errbuf_size - 1] = 0;
6344 else
6345 strcpy (errbuf, msg);
6348 return msg_size;
6352 /* Free dynamically allocated space used by PREG. */
6354 void
6355 regfree (preg)
6356 regex_t *preg;
6358 if (preg->buffer != NULL)
6359 free (preg->buffer);
6360 preg->buffer = NULL;
6362 preg->allocated = 0;
6363 preg->used = 0;
6365 if (preg->fastmap != NULL)
6366 free (preg->fastmap);
6367 preg->fastmap = NULL;
6368 preg->fastmap_accurate = 0;
6370 if (preg->translate != NULL)
6371 free (preg->translate);
6372 preg->translate = NULL;
6375 #endif /* not emacs */