(md5_file): New function -- extracted from main.
[coreutils.git] / lib / regex.c
blob04919abe25ca97473930d6efb387762f8e582942
1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
22 /* AIX requires this to be the first thing in the file. */
23 #if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25 #endif
27 #define _GNU_SOURCE
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
33 /* We need this for `regex.h', and perhaps for the Emacs include files. */
34 #include <sys/types.h>
36 /* This is for other GNU distributions with internationalized messages. */
37 #if HAVE_LIBINTL_H || defined (_LIBC)
38 # include <libintl.h>
39 #else
40 # define gettext(msgid) (msgid)
41 #endif
43 /* The `emacs' switch turns on certain matching commands
44 that make sense only in Emacs. */
45 #ifdef emacs
47 #include "lisp.h"
48 #include "buffer.h"
49 #include "syntax.h"
51 #else /* not emacs */
53 /* If we are not linking with Emacs proper,
54 we can't use the relocating allocator
55 even if config.h says that we can. */
56 #undef REL_ALLOC
58 #if defined (STDC_HEADERS) || defined (_LIBC)
59 #include <stdlib.h>
60 #else
61 char *malloc ();
62 char *realloc ();
63 #endif
65 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
66 If nothing else has been done, use the method below. */
67 #ifdef INHIBIT_STRING_HEADER
68 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
69 #if !defined (bzero) && !defined (bcopy)
70 #undef INHIBIT_STRING_HEADER
71 #endif
72 #endif
73 #endif
75 /* This is the normal way of making sure we have a bcopy and a bzero.
76 This is used in most programs--a few other programs avoid this
77 by defining INHIBIT_STRING_HEADER. */
78 #ifndef INHIBIT_STRING_HEADER
79 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
80 #include <string.h>
81 #ifndef bcmp
82 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
83 #endif
84 #ifndef bcopy
85 #define bcopy(s, d, n) memcpy ((d), (s), (n))
86 #endif
87 #ifndef bzero
88 #define bzero(s, n) memset ((s), 0, (n))
89 #endif
90 #else
91 #include <strings.h>
92 #endif
93 #endif
95 /* Define the syntax stuff for \<, \>, etc. */
97 /* This must be nonzero for the wordchar and notwordchar pattern
98 commands in re_match_2. */
99 #ifndef Sword
100 #define Sword 1
101 #endif
103 #ifdef SWITCH_ENUM_BUG
104 #define SWITCH_ENUM_CAST(x) ((int)(x))
105 #else
106 #define SWITCH_ENUM_CAST(x) (x)
107 #endif
109 #ifdef SYNTAX_TABLE
111 extern char *re_syntax_table;
113 #else /* not SYNTAX_TABLE */
115 /* How many characters in the character set. */
116 #define CHAR_SET_SIZE 256
118 static char re_syntax_table[CHAR_SET_SIZE];
120 static void
121 init_syntax_once ()
123 register int c;
124 static int done = 0;
126 if (done)
127 return;
129 bzero (re_syntax_table, sizeof re_syntax_table);
131 for (c = 'a'; c <= 'z'; c++)
132 re_syntax_table[c] = Sword;
134 for (c = 'A'; c <= 'Z'; c++)
135 re_syntax_table[c] = Sword;
137 for (c = '0'; c <= '9'; c++)
138 re_syntax_table[c] = Sword;
140 re_syntax_table['_'] = Sword;
142 done = 1;
145 #endif /* not SYNTAX_TABLE */
147 #define SYNTAX(c) re_syntax_table[c]
149 #endif /* not emacs */
151 /* Get the interface, including the syntax bits. */
152 #include "regex.h"
154 /* isalpha etc. are used for the character classes. */
155 #include <ctype.h>
157 /* Jim Meyering writes:
159 "... Some ctype macros are valid only for character codes that
160 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
161 using /bin/cc or gcc but without giving an ansi option). So, all
162 ctype uses should be through macros like ISPRINT... If
163 STDC_HEADERS is defined, then autoconf has verified that the ctype
164 macros don't need to be guarded with references to isascii. ...
165 Defining isascii to 1 should let any compiler worth its salt
166 eliminate the && through constant folding." */
168 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
169 #define ISASCII(c) 1
170 #else
171 #define ISASCII(c) isascii(c)
172 #endif
174 #ifdef isblank
175 #define ISBLANK(c) (ISASCII (c) && isblank (c))
176 #else
177 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
178 #endif
179 #ifdef isgraph
180 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
181 #else
182 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
183 #endif
185 #define ISPRINT(c) (ISASCII (c) && isprint (c))
186 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
187 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
188 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
189 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
190 #define ISLOWER(c) (ISASCII (c) && islower (c))
191 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
192 #define ISSPACE(c) (ISASCII (c) && isspace (c))
193 #define ISUPPER(c) (ISASCII (c) && isupper (c))
194 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
196 #ifndef NULL
197 #define NULL (void *)0
198 #endif
200 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
201 since ours (we hope) works properly with all combinations of
202 machines, compilers, `char' and `unsigned char' argument types.
203 (Per Bothner suggested the basic approach.) */
204 #undef SIGN_EXTEND_CHAR
205 #if __STDC__
206 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
207 #else /* not __STDC__ */
208 /* As in Harbison and Steele. */
209 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
210 #endif
212 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
213 use `alloca' instead of `malloc'. This is because using malloc in
214 re_search* or re_match* could cause memory leaks when C-g is used in
215 Emacs; also, malloc is slower and causes storage fragmentation. On
216 the other hand, malloc is more portable, and easier to debug.
218 Because we sometimes use alloca, some routines have to be macros,
219 not functions -- `alloca'-allocated space disappears at the end of the
220 function it is called in. */
222 #ifdef REGEX_MALLOC
224 #define REGEX_ALLOCATE malloc
225 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
226 #define REGEX_FREE free
228 #else /* not REGEX_MALLOC */
230 /* Emacs already defines alloca, sometimes. */
231 #ifndef alloca
233 /* Make alloca work the best possible way. */
234 #ifdef __GNUC__
235 #define alloca __builtin_alloca
236 #else /* not __GNUC__ */
237 #if HAVE_ALLOCA_H
238 #include <alloca.h>
239 #else /* not __GNUC__ or HAVE_ALLOCA_H */
240 #ifndef _AIX /* Already did AIX, up at the top. */
241 #if defined (__STDC__) && __STDC__
242 void *alloca ();
243 #else
244 char *alloca ();
245 #endif
246 #endif /* not _AIX */
247 #endif /* not HAVE_ALLOCA_H */
248 #endif /* not __GNUC__ */
250 #endif /* not alloca */
252 #define REGEX_ALLOCATE alloca
254 /* Assumes a `char *destination' variable. */
255 #define REGEX_REALLOCATE(source, osize, nsize) \
256 (destination = (char *) alloca (nsize), \
257 bcopy (source, destination, osize), \
258 destination)
260 /* No need to do anything to free, after alloca. */
261 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
263 #endif /* not REGEX_MALLOC */
265 /* Define how to allocate the failure stack. */
267 #ifdef REL_ALLOC
268 #define REGEX_ALLOCATE_STACK(size) \
269 r_alloc (&failure_stack_ptr, (size))
270 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
271 r_re_alloc (&failure_stack_ptr, (nsize))
272 #define REGEX_FREE_STACK(ptr) \
273 r_alloc_free (&failure_stack_ptr)
275 #else /* not REL_ALLOC */
277 #ifdef REGEX_MALLOC
279 #define REGEX_ALLOCATE_STACK malloc
280 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
281 #define REGEX_FREE_STACK free
283 #else /* not REGEX_MALLOC */
285 #define REGEX_ALLOCATE_STACK alloca
287 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
288 REGEX_REALLOCATE (source, osize, nsize)
289 /* No need to explicitly free anything. */
290 #define REGEX_FREE_STACK(arg)
292 #endif /* not REGEX_MALLOC */
293 #endif /* not REL_ALLOC */
296 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
297 `string1' or just past its end. This works if PTR is NULL, which is
298 a good thing. */
299 #define FIRST_STRING_P(ptr) \
300 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
302 /* (Re)Allocate N items of type T using malloc, or fail. */
303 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
304 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
305 #define RETALLOC_IF(addr, n, t) \
306 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
307 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
309 #define BYTEWIDTH 8 /* In bits. */
311 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
313 #undef MAX
314 #undef MIN
315 #define MAX(a, b) ((a) > (b) ? (a) : (b))
316 #define MIN(a, b) ((a) < (b) ? (a) : (b))
318 typedef char boolean;
319 #define false 0
320 #define true 1
322 static int re_match_2_internal ();
324 /* These are the command codes that appear in compiled regular
325 expressions. Some opcodes are followed by argument bytes. A
326 command code can specify any interpretation whatsoever for its
327 arguments. Zero bytes may appear in the compiled regular expression. */
329 typedef enum
331 no_op = 0,
333 /* Succeed right away--no more backtracking. */
334 succeed,
336 /* Followed by one byte giving n, then by n literal bytes. */
337 exactn,
339 /* Matches any (more or less) character. */
340 anychar,
342 /* Matches any one char belonging to specified set. First
343 following byte is number of bitmap bytes. Then come bytes
344 for a bitmap saying which chars are in. Bits in each byte
345 are ordered low-bit-first. A character is in the set if its
346 bit is 1. A character too large to have a bit in the map is
347 automatically not in the set. */
348 charset,
350 /* Same parameters as charset, but match any character that is
351 not one of those specified. */
352 charset_not,
354 /* Start remembering the text that is matched, for storing in a
355 register. Followed by one byte with the register number, in
356 the range 0 to one less than the pattern buffer's re_nsub
357 field. Then followed by one byte with the number of groups
358 inner to this one. (This last has to be part of the
359 start_memory only because we need it in the on_failure_jump
360 of re_match_2.) */
361 start_memory,
363 /* Stop remembering the text that is matched and store it in a
364 memory register. Followed by one byte with the register
365 number, in the range 0 to one less than `re_nsub' in the
366 pattern buffer, and one byte with the number of inner groups,
367 just like `start_memory'. (We need the number of inner
368 groups here because we don't have any easy way of finding the
369 corresponding start_memory when we're at a stop_memory.) */
370 stop_memory,
372 /* Match a duplicate of something remembered. Followed by one
373 byte containing the register number. */
374 duplicate,
376 /* Fail unless at beginning of line. */
377 begline,
379 /* Fail unless at end of line. */
380 endline,
382 /* Succeeds if at beginning of buffer (if emacs) or at beginning
383 of string to be matched (if not). */
384 begbuf,
386 /* Analogously, for end of buffer/string. */
387 endbuf,
389 /* Followed by two byte relative address to which to jump. */
390 jump,
392 /* Same as jump, but marks the end of an alternative. */
393 jump_past_alt,
395 /* Followed by two-byte relative address of place to resume at
396 in case of failure. */
397 on_failure_jump,
399 /* Like on_failure_jump, but pushes a placeholder instead of the
400 current string position when executed. */
401 on_failure_keep_string_jump,
403 /* Throw away latest failure point and then jump to following
404 two-byte relative address. */
405 pop_failure_jump,
407 /* Change to pop_failure_jump if know won't have to backtrack to
408 match; otherwise change to jump. This is used to jump
409 back to the beginning of a repeat. If what follows this jump
410 clearly won't match what the repeat does, such that we can be
411 sure that there is no use backtracking out of repetitions
412 already matched, then we change it to a pop_failure_jump.
413 Followed by two-byte address. */
414 maybe_pop_jump,
416 /* Jump to following two-byte address, and push a dummy failure
417 point. This failure point will be thrown away if an attempt
418 is made to use it for a failure. A `+' construct makes this
419 before the first repeat. Also used as an intermediary kind
420 of jump when compiling an alternative. */
421 dummy_failure_jump,
423 /* Push a dummy failure point and continue. Used at the end of
424 alternatives. */
425 push_dummy_failure,
427 /* Followed by two-byte relative address and two-byte number n.
428 After matching N times, jump to the address upon failure. */
429 succeed_n,
431 /* Followed by two-byte relative address, and two-byte number n.
432 Jump to the address N times, then fail. */
433 jump_n,
435 /* Set the following two-byte relative address to the
436 subsequent two-byte number. The address *includes* the two
437 bytes of number. */
438 set_number_at,
440 wordchar, /* Matches any word-constituent character. */
441 notwordchar, /* Matches any char that is not a word-constituent. */
443 wordbeg, /* Succeeds if at word beginning. */
444 wordend, /* Succeeds if at word end. */
446 wordbound, /* Succeeds if at a word boundary. */
447 notwordbound /* Succeeds if not at a word boundary. */
449 #ifdef emacs
450 ,before_dot, /* Succeeds if before point. */
451 at_dot, /* Succeeds if at point. */
452 after_dot, /* Succeeds if after point. */
454 /* Matches any character whose syntax is specified. Followed by
455 a byte which contains a syntax code, e.g., Sword. */
456 syntaxspec,
458 /* Matches any character whose syntax is not that specified. */
459 notsyntaxspec
460 #endif /* emacs */
461 } re_opcode_t;
463 /* Common operations on the compiled pattern. */
465 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
467 #define STORE_NUMBER(destination, number) \
468 do { \
469 (destination)[0] = (number) & 0377; \
470 (destination)[1] = (number) >> 8; \
471 } while (0)
473 /* Same as STORE_NUMBER, except increment DESTINATION to
474 the byte after where the number is stored. Therefore, DESTINATION
475 must be an lvalue. */
477 #define STORE_NUMBER_AND_INCR(destination, number) \
478 do { \
479 STORE_NUMBER (destination, number); \
480 (destination) += 2; \
481 } while (0)
483 /* Put into DESTINATION a number stored in two contiguous bytes starting
484 at SOURCE. */
486 #define EXTRACT_NUMBER(destination, source) \
487 do { \
488 (destination) = *(source) & 0377; \
489 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
490 } while (0)
492 #ifdef DEBUG
493 static void
494 extract_number (dest, source)
495 int *dest;
496 unsigned char *source;
498 int temp = SIGN_EXTEND_CHAR (*(source + 1));
499 *dest = *source & 0377;
500 *dest += temp << 8;
503 #ifndef EXTRACT_MACROS /* To debug the macros. */
504 #undef EXTRACT_NUMBER
505 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
506 #endif /* not EXTRACT_MACROS */
508 #endif /* DEBUG */
510 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
511 SOURCE must be an lvalue. */
513 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
514 do { \
515 EXTRACT_NUMBER (destination, source); \
516 (source) += 2; \
517 } while (0)
519 #ifdef DEBUG
520 static void
521 extract_number_and_incr (destination, source)
522 int *destination;
523 unsigned char **source;
525 extract_number (destination, *source);
526 *source += 2;
529 #ifndef EXTRACT_MACROS
530 #undef EXTRACT_NUMBER_AND_INCR
531 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
532 extract_number_and_incr (&dest, &src)
533 #endif /* not EXTRACT_MACROS */
535 #endif /* DEBUG */
537 /* If DEBUG is defined, Regex prints many voluminous messages about what
538 it is doing (if the variable `debug' is nonzero). If linked with the
539 main program in `iregex.c', you can enter patterns and strings
540 interactively. And if linked with the main program in `main.c' and
541 the other test files, you can run the already-written tests. */
543 #ifdef DEBUG
545 /* We use standard I/O for debugging. */
546 #include <stdio.h>
548 /* It is useful to test things that ``must'' be true when debugging. */
549 #include <assert.h>
551 static int debug = 0;
553 #define DEBUG_STATEMENT(e) e
554 #define DEBUG_PRINT1(x) if (debug) printf (x)
555 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
556 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
557 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
558 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
559 if (debug) print_partial_compiled_pattern (s, e)
560 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
561 if (debug) print_double_string (w, s1, sz1, s2, sz2)
564 /* Print the fastmap in human-readable form. */
566 void
567 print_fastmap (fastmap)
568 char *fastmap;
570 unsigned was_a_range = 0;
571 unsigned i = 0;
573 while (i < (1 << BYTEWIDTH))
575 if (fastmap[i++])
577 was_a_range = 0;
578 putchar (i - 1);
579 while (i < (1 << BYTEWIDTH) && fastmap[i])
581 was_a_range = 1;
582 i++;
584 if (was_a_range)
586 printf ("-");
587 putchar (i - 1);
591 putchar ('\n');
595 /* Print a compiled pattern string in human-readable form, starting at
596 the START pointer into it and ending just before the pointer END. */
598 void
599 print_partial_compiled_pattern (start, end)
600 unsigned char *start;
601 unsigned char *end;
603 int mcnt, mcnt2;
604 unsigned char *p = start;
605 unsigned char *pend = end;
607 if (start == NULL)
609 printf ("(null)\n");
610 return;
613 /* Loop over pattern commands. */
614 while (p < pend)
616 printf ("%d:\t", p - start);
618 switch ((re_opcode_t) *p++)
620 case no_op:
621 printf ("/no_op");
622 break;
624 case exactn:
625 mcnt = *p++;
626 printf ("/exactn/%d", mcnt);
629 putchar ('/');
630 putchar (*p++);
632 while (--mcnt);
633 break;
635 case start_memory:
636 mcnt = *p++;
637 printf ("/start_memory/%d/%d", mcnt, *p++);
638 break;
640 case stop_memory:
641 mcnt = *p++;
642 printf ("/stop_memory/%d/%d", mcnt, *p++);
643 break;
645 case duplicate:
646 printf ("/duplicate/%d", *p++);
647 break;
649 case anychar:
650 printf ("/anychar");
651 break;
653 case charset:
654 case charset_not:
656 register int c, last = -100;
657 register int in_range = 0;
659 printf ("/charset [%s",
660 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
662 assert (p + *p < pend);
664 for (c = 0; c < 256; c++)
665 if (c / 8 < *p
666 && (p[1 + (c/8)] & (1 << (c % 8))))
668 /* Are we starting a range? */
669 if (last + 1 == c && ! in_range)
671 putchar ('-');
672 in_range = 1;
674 /* Have we broken a range? */
675 else if (last + 1 != c && in_range)
677 putchar (last);
678 in_range = 0;
681 if (! in_range)
682 putchar (c);
684 last = c;
687 if (in_range)
688 putchar (last);
690 putchar (']');
692 p += 1 + *p;
694 break;
696 case begline:
697 printf ("/begline");
698 break;
700 case endline:
701 printf ("/endline");
702 break;
704 case on_failure_jump:
705 extract_number_and_incr (&mcnt, &p);
706 printf ("/on_failure_jump to %d", p + mcnt - start);
707 break;
709 case on_failure_keep_string_jump:
710 extract_number_and_incr (&mcnt, &p);
711 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
712 break;
714 case dummy_failure_jump:
715 extract_number_and_incr (&mcnt, &p);
716 printf ("/dummy_failure_jump to %d", p + mcnt - start);
717 break;
719 case push_dummy_failure:
720 printf ("/push_dummy_failure");
721 break;
723 case maybe_pop_jump:
724 extract_number_and_incr (&mcnt, &p);
725 printf ("/maybe_pop_jump to %d", p + mcnt - start);
726 break;
728 case pop_failure_jump:
729 extract_number_and_incr (&mcnt, &p);
730 printf ("/pop_failure_jump to %d", p + mcnt - start);
731 break;
733 case jump_past_alt:
734 extract_number_and_incr (&mcnt, &p);
735 printf ("/jump_past_alt to %d", p + mcnt - start);
736 break;
738 case jump:
739 extract_number_and_incr (&mcnt, &p);
740 printf ("/jump to %d", p + mcnt - start);
741 break;
743 case succeed_n:
744 extract_number_and_incr (&mcnt, &p);
745 extract_number_and_incr (&mcnt2, &p);
746 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2);
747 break;
749 case jump_n:
750 extract_number_and_incr (&mcnt, &p);
751 extract_number_and_incr (&mcnt2, &p);
752 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2);
753 break;
755 case set_number_at:
756 extract_number_and_incr (&mcnt, &p);
757 extract_number_and_incr (&mcnt2, &p);
758 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2);
759 break;
761 case wordbound:
762 printf ("/wordbound");
763 break;
765 case notwordbound:
766 printf ("/notwordbound");
767 break;
769 case wordbeg:
770 printf ("/wordbeg");
771 break;
773 case wordend:
774 printf ("/wordend");
776 #ifdef emacs
777 case before_dot:
778 printf ("/before_dot");
779 break;
781 case at_dot:
782 printf ("/at_dot");
783 break;
785 case after_dot:
786 printf ("/after_dot");
787 break;
789 case syntaxspec:
790 printf ("/syntaxspec");
791 mcnt = *p++;
792 printf ("/%d", mcnt);
793 break;
795 case notsyntaxspec:
796 printf ("/notsyntaxspec");
797 mcnt = *p++;
798 printf ("/%d", mcnt);
799 break;
800 #endif /* emacs */
802 case wordchar:
803 printf ("/wordchar");
804 break;
806 case notwordchar:
807 printf ("/notwordchar");
808 break;
810 case begbuf:
811 printf ("/begbuf");
812 break;
814 case endbuf:
815 printf ("/endbuf");
816 break;
818 default:
819 printf ("?%d", *(p-1));
822 putchar ('\n');
825 printf ("%d:\tend of pattern.\n", p - start);
829 void
830 print_compiled_pattern (bufp)
831 struct re_pattern_buffer *bufp;
833 unsigned char *buffer = bufp->buffer;
835 print_partial_compiled_pattern (buffer, buffer + bufp->used);
836 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
838 if (bufp->fastmap_accurate && bufp->fastmap)
840 printf ("fastmap: ");
841 print_fastmap (bufp->fastmap);
844 printf ("re_nsub: %d\t", bufp->re_nsub);
845 printf ("regs_alloc: %d\t", bufp->regs_allocated);
846 printf ("can_be_null: %d\t", bufp->can_be_null);
847 printf ("newline_anchor: %d\n", bufp->newline_anchor);
848 printf ("no_sub: %d\t", bufp->no_sub);
849 printf ("not_bol: %d\t", bufp->not_bol);
850 printf ("not_eol: %d\t", bufp->not_eol);
851 printf ("syntax: %d\n", bufp->syntax);
852 /* Perhaps we should print the translate table? */
856 void
857 print_double_string (where, string1, size1, string2, size2)
858 const char *where;
859 const char *string1;
860 const char *string2;
861 int size1;
862 int size2;
864 unsigned this_char;
866 if (where == NULL)
867 printf ("(null)");
868 else
870 if (FIRST_STRING_P (where))
872 for (this_char = where - string1; this_char < size1; this_char++)
873 putchar (string1[this_char]);
875 where = string2;
878 for (this_char = where - string2; this_char < size2; this_char++)
879 putchar (string2[this_char]);
883 #else /* not DEBUG */
885 #undef assert
886 #define assert(e)
888 #define DEBUG_STATEMENT(e)
889 #define DEBUG_PRINT1(x)
890 #define DEBUG_PRINT2(x1, x2)
891 #define DEBUG_PRINT3(x1, x2, x3)
892 #define DEBUG_PRINT4(x1, x2, x3, x4)
893 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
894 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
896 #endif /* not DEBUG */
898 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
899 also be assigned to arbitrarily: each pattern buffer stores its own
900 syntax, so it can be changed between regex compilations. */
901 /* This has no initializer because initialized variables in Emacs
902 become read-only after dumping. */
903 reg_syntax_t re_syntax_options;
906 /* Specify the precise syntax of regexps for compilation. This provides
907 for compatibility for various utilities which historically have
908 different, incompatible syntaxes.
910 The argument SYNTAX is a bit mask comprised of the various bits
911 defined in regex.h. We return the old syntax. */
913 reg_syntax_t
914 re_set_syntax (syntax)
915 reg_syntax_t syntax;
917 reg_syntax_t ret = re_syntax_options;
919 re_syntax_options = syntax;
920 return ret;
923 /* This table gives an error message for each of the error codes listed
924 in regex.h. Obviously the order here has to be same as there.
925 POSIX doesn't require that we do anything for REG_NOERROR,
926 but why not be nice? */
928 static const char *re_error_msgid[] =
929 { "Success", /* REG_NOERROR */
930 "No match", /* REG_NOMATCH */
931 "Invalid regular expression", /* REG_BADPAT */
932 "Invalid collation character", /* REG_ECOLLATE */
933 "Invalid character class name", /* REG_ECTYPE */
934 "Trailing backslash", /* REG_EESCAPE */
935 "Invalid back reference", /* REG_ESUBREG */
936 "Unmatched [ or [^", /* REG_EBRACK */
937 "Unmatched ( or \\(", /* REG_EPAREN */
938 "Unmatched \\{", /* REG_EBRACE */
939 "Invalid content of \\{\\}", /* REG_BADBR */
940 "Invalid range end", /* REG_ERANGE */
941 "Memory exhausted", /* REG_ESPACE */
942 "Invalid preceding regular expression", /* REG_BADRPT */
943 "Premature end of regular expression", /* REG_EEND */
944 "Regular expression too big", /* REG_ESIZE */
945 "Unmatched ) or \\)", /* REG_ERPAREN */
948 /* Avoiding alloca during matching, to placate r_alloc. */
950 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
951 searching and matching functions should not call alloca. On some
952 systems, alloca is implemented in terms of malloc, and if we're
953 using the relocating allocator routines, then malloc could cause a
954 relocation, which might (if the strings being searched are in the
955 ralloc heap) shift the data out from underneath the regexp
956 routines.
958 Here's another reason to avoid allocation: Emacs
959 processes input from X in a signal handler; processing X input may
960 call malloc; if input arrives while a matching routine is calling
961 malloc, then we're scrod. But Emacs can't just block input while
962 calling matching routines; then we don't notice interrupts when
963 they come in. So, Emacs blocks input around all regexp calls
964 except the matching calls, which it leaves unprotected, in the
965 faith that they will not malloc. */
967 /* Normally, this is fine. */
968 #define MATCH_MAY_ALLOCATE
970 /* When using GNU C, we are not REALLY using the C alloca, no matter
971 what config.h may say. So don't take precautions for it. */
972 #ifdef __GNUC__
973 #undef C_ALLOCA
974 #endif
976 /* The match routines may not allocate if (1) they would do it with malloc
977 and (2) it's not safe for them to use malloc.
978 Note that if REL_ALLOC is defined, matching would not use malloc for the
979 failure stack, but we would still use it for the register vectors;
980 so REL_ALLOC should not affect this. */
981 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
982 #undef MATCH_MAY_ALLOCATE
983 #endif
986 /* Failure stack declarations and macros; both re_compile_fastmap and
987 re_match_2 use a failure stack. These have to be macros because of
988 REGEX_ALLOCATE_STACK. */
991 /* Number of failure points for which to initially allocate space
992 when matching. If this number is exceeded, we allocate more
993 space, so it is not a hard limit. */
994 #ifndef INIT_FAILURE_ALLOC
995 #define INIT_FAILURE_ALLOC 5
996 #endif
998 /* Roughly the maximum number of failure points on the stack. Would be
999 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1000 This is a variable only so users of regex can assign to it; we never
1001 change it ourselves. */
1002 #if defined (MATCH_MAY_ALLOCATE)
1003 int re_max_failures = 200000;
1004 #else
1005 int re_max_failures = 2000;
1006 #endif
1008 union fail_stack_elt
1010 unsigned char *pointer;
1011 int integer;
1014 typedef union fail_stack_elt fail_stack_elt_t;
1016 typedef struct
1018 fail_stack_elt_t *stack;
1019 unsigned size;
1020 unsigned avail; /* Offset of next open position. */
1021 } fail_stack_type;
1023 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1024 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1025 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1028 /* Define macros to initialize and free the failure stack.
1029 Do `return -2' if the alloc fails. */
1031 #ifdef MATCH_MAY_ALLOCATE
1032 #define INIT_FAIL_STACK() \
1033 do { \
1034 fail_stack.stack = (fail_stack_elt_t *) \
1035 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1037 if (fail_stack.stack == NULL) \
1038 return -2; \
1040 fail_stack.size = INIT_FAILURE_ALLOC; \
1041 fail_stack.avail = 0; \
1042 } while (0)
1044 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1045 #else
1046 #define INIT_FAIL_STACK() \
1047 do { \
1048 fail_stack.avail = 0; \
1049 } while (0)
1051 #define RESET_FAIL_STACK()
1052 #endif
1055 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1057 Return 1 if succeeds, and 0 if either ran out of memory
1058 allocating space for it or it was already too large.
1060 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1062 #define DOUBLE_FAIL_STACK(fail_stack) \
1063 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1064 ? 0 \
1065 : ((fail_stack).stack = (fail_stack_elt_t *) \
1066 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1067 (fail_stack).size * sizeof (fail_stack_elt_t), \
1068 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1070 (fail_stack).stack == NULL \
1071 ? 0 \
1072 : ((fail_stack).size <<= 1, \
1073 1)))
1076 /* Push pointer POINTER on FAIL_STACK.
1077 Return 1 if was able to do so and 0 if ran out of memory allocating
1078 space to do so. */
1079 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1080 ((FAIL_STACK_FULL () \
1081 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1082 ? 0 \
1083 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1086 /* Push a pointer value onto the failure stack.
1087 Assumes the variable `fail_stack'. Probably should only
1088 be called from within `PUSH_FAILURE_POINT'. */
1089 #define PUSH_FAILURE_POINTER(item) \
1090 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1092 /* This pushes an integer-valued item onto the failure stack.
1093 Assumes the variable `fail_stack'. Probably should only
1094 be called from within `PUSH_FAILURE_POINT'. */
1095 #define PUSH_FAILURE_INT(item) \
1096 fail_stack.stack[fail_stack.avail++].integer = (item)
1098 /* Push a fail_stack_elt_t value onto the failure stack.
1099 Assumes the variable `fail_stack'. Probably should only
1100 be called from within `PUSH_FAILURE_POINT'. */
1101 #define PUSH_FAILURE_ELT(item) \
1102 fail_stack.stack[fail_stack.avail++] = (item)
1104 /* These three POP... operations complement the three PUSH... operations.
1105 All assume that `fail_stack' is nonempty. */
1106 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1107 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1108 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1110 /* Used to omit pushing failure point id's when we're not debugging. */
1111 #ifdef DEBUG
1112 #define DEBUG_PUSH PUSH_FAILURE_INT
1113 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1114 #else
1115 #define DEBUG_PUSH(item)
1116 #define DEBUG_POP(item_addr)
1117 #endif
1120 /* Push the information about the state we will need
1121 if we ever fail back to it.
1123 Requires variables fail_stack, regstart, regend, reg_info, and
1124 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1125 declared.
1127 Does `return FAILURE_CODE' if runs out of memory. */
1129 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1130 do { \
1131 char *destination; \
1132 /* Must be int, so when we don't save any registers, the arithmetic \
1133 of 0 + -1 isn't done as unsigned. */ \
1134 int this_reg; \
1136 DEBUG_STATEMENT (failure_id++); \
1137 DEBUG_STATEMENT (nfailure_points_pushed++); \
1138 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1139 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1140 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1142 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1143 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1145 /* Ensure we have enough space allocated for what we will push. */ \
1146 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1148 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1149 return failure_code; \
1151 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1152 (fail_stack).size); \
1153 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1156 /* Push the info, starting with the registers. */ \
1157 DEBUG_PRINT1 ("\n"); \
1159 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1160 this_reg++) \
1162 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1163 DEBUG_STATEMENT (num_regs_pushed++); \
1165 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1166 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1168 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1169 PUSH_FAILURE_POINTER (regend[this_reg]); \
1171 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
1172 DEBUG_PRINT2 (" match_null=%d", \
1173 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1174 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1175 DEBUG_PRINT2 (" matched_something=%d", \
1176 MATCHED_SOMETHING (reg_info[this_reg])); \
1177 DEBUG_PRINT2 (" ever_matched=%d", \
1178 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1179 DEBUG_PRINT1 ("\n"); \
1180 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1183 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
1184 PUSH_FAILURE_INT (lowest_active_reg); \
1186 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
1187 PUSH_FAILURE_INT (highest_active_reg); \
1189 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
1190 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1191 PUSH_FAILURE_POINTER (pattern_place); \
1193 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
1194 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1195 size2); \
1196 DEBUG_PRINT1 ("'\n"); \
1197 PUSH_FAILURE_POINTER (string_place); \
1199 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1200 DEBUG_PUSH (failure_id); \
1201 } while (0)
1203 /* This is the number of items that are pushed and popped on the stack
1204 for each register. */
1205 #define NUM_REG_ITEMS 3
1207 /* Individual items aside from the registers. */
1208 #ifdef DEBUG
1209 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1210 #else
1211 #define NUM_NONREG_ITEMS 4
1212 #endif
1214 /* We push at most this many items on the stack. */
1215 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1217 /* We actually push this many items. */
1218 #define NUM_FAILURE_ITEMS \
1219 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1220 + NUM_NONREG_ITEMS)
1222 /* How many items can still be added to the stack without overflowing it. */
1223 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1226 /* Pops what PUSH_FAIL_STACK pushes.
1228 We restore into the parameters, all of which should be lvalues:
1229 STR -- the saved data position.
1230 PAT -- the saved pattern position.
1231 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1232 REGSTART, REGEND -- arrays of string positions.
1233 REG_INFO -- array of information about each subexpression.
1235 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1236 `pend', `string1', `size1', `string2', and `size2'. */
1238 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1240 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
1241 int this_reg; \
1242 const unsigned char *string_temp; \
1244 assert (!FAIL_STACK_EMPTY ()); \
1246 /* Remove failure points and point to how many regs pushed. */ \
1247 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1248 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1249 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1251 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1253 DEBUG_POP (&failure_id); \
1254 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1256 /* If the saved string location is NULL, it came from an \
1257 on_failure_keep_string_jump opcode, and we want to throw away the \
1258 saved NULL, thus retaining our current position in the string. */ \
1259 string_temp = POP_FAILURE_POINTER (); \
1260 if (string_temp != NULL) \
1261 str = (const char *) string_temp; \
1263 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
1264 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1265 DEBUG_PRINT1 ("'\n"); \
1267 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1268 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
1269 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1271 /* Restore register info. */ \
1272 high_reg = (unsigned) POP_FAILURE_INT (); \
1273 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1275 low_reg = (unsigned) POP_FAILURE_INT (); \
1276 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1278 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1280 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1282 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1283 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
1285 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1286 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1288 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1289 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1292 set_regs_matched_done = 0; \
1293 DEBUG_STATEMENT (nfailure_points_popped++); \
1294 } /* POP_FAILURE_POINT */
1298 /* Structure for per-register (a.k.a. per-group) information.
1299 Other register information, such as the
1300 starting and ending positions (which are addresses), and the list of
1301 inner groups (which is a bits list) are maintained in separate
1302 variables.
1304 We are making a (strictly speaking) nonportable assumption here: that
1305 the compiler will pack our bit fields into something that fits into
1306 the type of `word', i.e., is something that fits into one item on the
1307 failure stack. */
1309 typedef union
1311 fail_stack_elt_t word;
1312 struct
1314 /* This field is one if this group can match the empty string,
1315 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1316 #define MATCH_NULL_UNSET_VALUE 3
1317 unsigned match_null_string_p : 2;
1318 unsigned is_active : 1;
1319 unsigned matched_something : 1;
1320 unsigned ever_matched_something : 1;
1321 } bits;
1322 } register_info_type;
1324 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1325 #define IS_ACTIVE(R) ((R).bits.is_active)
1326 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1327 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1330 /* Call this when have matched a real character; it sets `matched' flags
1331 for the subexpressions which we are currently inside. Also records
1332 that those subexprs have matched. */
1333 #define SET_REGS_MATCHED() \
1334 do \
1336 if (!set_regs_matched_done) \
1338 unsigned r; \
1339 set_regs_matched_done = 1; \
1340 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1342 MATCHED_SOMETHING (reg_info[r]) \
1343 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1344 = 1; \
1348 while (0)
1350 /* Registers are set to a sentinel when they haven't yet matched. */
1351 static char reg_unset_dummy;
1352 #define REG_UNSET_VALUE (&reg_unset_dummy)
1353 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1355 /* Subroutine declarations and macros for regex_compile. */
1357 static void store_op1 (), store_op2 ();
1358 static void insert_op1 (), insert_op2 ();
1359 static boolean at_begline_loc_p (), at_endline_loc_p ();
1360 static boolean group_in_compile_stack ();
1361 static reg_errcode_t compile_range ();
1363 /* Fetch the next character in the uncompiled pattern---translating it
1364 if necessary. Also cast from a signed character in the constant
1365 string passed to us by the user to an unsigned char that we can use
1366 as an array index (in, e.g., `translate'). */
1367 #define PATFETCH(c) \
1368 do {if (p == pend) return REG_EEND; \
1369 c = (unsigned char) *p++; \
1370 if (translate) c = translate[c]; \
1371 } while (0)
1373 /* Fetch the next character in the uncompiled pattern, with no
1374 translation. */
1375 #define PATFETCH_RAW(c) \
1376 do {if (p == pend) return REG_EEND; \
1377 c = (unsigned char) *p++; \
1378 } while (0)
1380 /* Go backwards one character in the pattern. */
1381 #define PATUNFETCH p--
1384 /* If `translate' is non-null, return translate[D], else just D. We
1385 cast the subscript to translate because some data is declared as
1386 `char *', to avoid warnings when a string constant is passed. But
1387 when we use a character as a subscript we must make it unsigned. */
1388 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
1391 /* Macros for outputting the compiled pattern into `buffer'. */
1393 /* If the buffer isn't allocated when it comes in, use this. */
1394 #define INIT_BUF_SIZE 32
1396 /* Make sure we have at least N more bytes of space in buffer. */
1397 #define GET_BUFFER_SPACE(n) \
1398 while (b - bufp->buffer + (n) > bufp->allocated) \
1399 EXTEND_BUFFER ()
1401 /* Make sure we have one more byte of buffer space and then add C to it. */
1402 #define BUF_PUSH(c) \
1403 do { \
1404 GET_BUFFER_SPACE (1); \
1405 *b++ = (unsigned char) (c); \
1406 } while (0)
1409 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1410 #define BUF_PUSH_2(c1, c2) \
1411 do { \
1412 GET_BUFFER_SPACE (2); \
1413 *b++ = (unsigned char) (c1); \
1414 *b++ = (unsigned char) (c2); \
1415 } while (0)
1418 /* As with BUF_PUSH_2, except for three bytes. */
1419 #define BUF_PUSH_3(c1, c2, c3) \
1420 do { \
1421 GET_BUFFER_SPACE (3); \
1422 *b++ = (unsigned char) (c1); \
1423 *b++ = (unsigned char) (c2); \
1424 *b++ = (unsigned char) (c3); \
1425 } while (0)
1428 /* Store a jump with opcode OP at LOC to location TO. We store a
1429 relative address offset by the three bytes the jump itself occupies. */
1430 #define STORE_JUMP(op, loc, to) \
1431 store_op1 (op, loc, (to) - (loc) - 3)
1433 /* Likewise, for a two-argument jump. */
1434 #define STORE_JUMP2(op, loc, to, arg) \
1435 store_op2 (op, loc, (to) - (loc) - 3, arg)
1437 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1438 #define INSERT_JUMP(op, loc, to) \
1439 insert_op1 (op, loc, (to) - (loc) - 3, b)
1441 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1442 #define INSERT_JUMP2(op, loc, to, arg) \
1443 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1446 /* This is not an arbitrary limit: the arguments which represent offsets
1447 into the pattern are two bytes long. So if 2^16 bytes turns out to
1448 be too small, many things would have to change. */
1449 #define MAX_BUF_SIZE (1L << 16)
1452 /* Extend the buffer by twice its current size via realloc and
1453 reset the pointers that pointed into the old block to point to the
1454 correct places in the new one. If extending the buffer results in it
1455 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1456 #define EXTEND_BUFFER() \
1457 do { \
1458 unsigned char *old_buffer = bufp->buffer; \
1459 if (bufp->allocated == MAX_BUF_SIZE) \
1460 return REG_ESIZE; \
1461 bufp->allocated <<= 1; \
1462 if (bufp->allocated > MAX_BUF_SIZE) \
1463 bufp->allocated = MAX_BUF_SIZE; \
1464 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1465 if (bufp->buffer == NULL) \
1466 return REG_ESPACE; \
1467 /* If the buffer moved, move all the pointers into it. */ \
1468 if (old_buffer != bufp->buffer) \
1470 b = (b - old_buffer) + bufp->buffer; \
1471 begalt = (begalt - old_buffer) + bufp->buffer; \
1472 if (fixup_alt_jump) \
1473 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1474 if (laststart) \
1475 laststart = (laststart - old_buffer) + bufp->buffer; \
1476 if (pending_exact) \
1477 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1479 } while (0)
1482 /* Since we have one byte reserved for the register number argument to
1483 {start,stop}_memory, the maximum number of groups we can report
1484 things about is what fits in that byte. */
1485 #define MAX_REGNUM 255
1487 /* But patterns can have more than `MAX_REGNUM' registers. We just
1488 ignore the excess. */
1489 typedef unsigned regnum_t;
1492 /* Macros for the compile stack. */
1494 /* Since offsets can go either forwards or backwards, this type needs to
1495 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1496 typedef int pattern_offset_t;
1498 typedef struct
1500 pattern_offset_t begalt_offset;
1501 pattern_offset_t fixup_alt_jump;
1502 pattern_offset_t inner_group_offset;
1503 pattern_offset_t laststart_offset;
1504 regnum_t regnum;
1505 } compile_stack_elt_t;
1508 typedef struct
1510 compile_stack_elt_t *stack;
1511 unsigned size;
1512 unsigned avail; /* Offset of next open position. */
1513 } compile_stack_type;
1516 #define INIT_COMPILE_STACK_SIZE 32
1518 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1519 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1521 /* The next available element. */
1522 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1525 /* Set the bit for character C in a list. */
1526 #define SET_LIST_BIT(c) \
1527 (b[((unsigned char) (c)) / BYTEWIDTH] \
1528 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1531 /* Get the next unsigned number in the uncompiled pattern. */
1532 #define GET_UNSIGNED_NUMBER(num) \
1533 { if (p != pend) \
1535 PATFETCH (c); \
1536 while (ISDIGIT (c)) \
1538 if (num < 0) \
1539 num = 0; \
1540 num = num * 10 + c - '0'; \
1541 if (p == pend) \
1542 break; \
1543 PATFETCH (c); \
1548 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1550 #define IS_CHAR_CLASS(string) \
1551 (STREQ (string, "alpha") || STREQ (string, "upper") \
1552 || STREQ (string, "lower") || STREQ (string, "digit") \
1553 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1554 || STREQ (string, "space") || STREQ (string, "print") \
1555 || STREQ (string, "punct") || STREQ (string, "graph") \
1556 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1558 #ifndef MATCH_MAY_ALLOCATE
1560 /* If we cannot allocate large objects within re_match_2_internal,
1561 we make the fail stack and register vectors global.
1562 The fail stack, we grow to the maximum size when a regexp
1563 is compiled.
1564 The register vectors, we adjust in size each time we
1565 compile a regexp, according to the number of registers it needs. */
1567 static fail_stack_type fail_stack;
1569 /* Size with which the following vectors are currently allocated.
1570 That is so we can make them bigger as needed,
1571 but never make them smaller. */
1572 static int regs_allocated_size;
1574 static const char ** regstart, ** regend;
1575 static const char ** old_regstart, ** old_regend;
1576 static const char **best_regstart, **best_regend;
1577 static register_info_type *reg_info;
1578 static const char **reg_dummy;
1579 static register_info_type *reg_info_dummy;
1581 /* Make the register vectors big enough for NUM_REGS registers,
1582 but don't make them smaller. */
1584 static
1585 regex_grow_registers (num_regs)
1586 int num_regs;
1588 if (num_regs > regs_allocated_size)
1590 RETALLOC_IF (regstart, num_regs, const char *);
1591 RETALLOC_IF (regend, num_regs, const char *);
1592 RETALLOC_IF (old_regstart, num_regs, const char *);
1593 RETALLOC_IF (old_regend, num_regs, const char *);
1594 RETALLOC_IF (best_regstart, num_regs, const char *);
1595 RETALLOC_IF (best_regend, num_regs, const char *);
1596 RETALLOC_IF (reg_info, num_regs, register_info_type);
1597 RETALLOC_IF (reg_dummy, num_regs, const char *);
1598 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1600 regs_allocated_size = num_regs;
1604 #endif /* not MATCH_MAY_ALLOCATE */
1606 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1607 Returns one of error codes defined in `regex.h', or zero for success.
1609 Assumes the `allocated' (and perhaps `buffer') and `translate'
1610 fields are set in BUFP on entry.
1612 If it succeeds, results are put in BUFP (if it returns an error, the
1613 contents of BUFP are undefined):
1614 `buffer' is the compiled pattern;
1615 `syntax' is set to SYNTAX;
1616 `used' is set to the length of the compiled pattern;
1617 `fastmap_accurate' is zero;
1618 `re_nsub' is the number of subexpressions in PATTERN;
1619 `not_bol' and `not_eol' are zero;
1621 The `fastmap' and `newline_anchor' fields are neither
1622 examined nor set. */
1624 /* Return, freeing storage we allocated. */
1625 #define FREE_STACK_RETURN(value) \
1626 return (free (compile_stack.stack), value)
1628 static reg_errcode_t
1629 regex_compile (pattern, size, syntax, bufp)
1630 const char *pattern;
1631 int size;
1632 reg_syntax_t syntax;
1633 struct re_pattern_buffer *bufp;
1635 /* We fetch characters from PATTERN here. Even though PATTERN is
1636 `char *' (i.e., signed), we declare these variables as unsigned, so
1637 they can be reliably used as array indices. */
1638 register unsigned char c, c1;
1640 /* A random temporary spot in PATTERN. */
1641 const char *p1;
1643 /* Points to the end of the buffer, where we should append. */
1644 register unsigned char *b;
1646 /* Keeps track of unclosed groups. */
1647 compile_stack_type compile_stack;
1649 /* Points to the current (ending) position in the pattern. */
1650 const char *p = pattern;
1651 const char *pend = pattern + size;
1653 /* How to translate the characters in the pattern. */
1654 char *translate = bufp->translate;
1656 /* Address of the count-byte of the most recently inserted `exactn'
1657 command. This makes it possible to tell if a new exact-match
1658 character can be added to that command or if the character requires
1659 a new `exactn' command. */
1660 unsigned char *pending_exact = 0;
1662 /* Address of start of the most recently finished expression.
1663 This tells, e.g., postfix * where to find the start of its
1664 operand. Reset at the beginning of groups and alternatives. */
1665 unsigned char *laststart = 0;
1667 /* Address of beginning of regexp, or inside of last group. */
1668 unsigned char *begalt;
1670 /* Place in the uncompiled pattern (i.e., the {) to
1671 which to go back if the interval is invalid. */
1672 const char *beg_interval;
1674 /* Address of the place where a forward jump should go to the end of
1675 the containing expression. Each alternative of an `or' -- except the
1676 last -- ends with a forward jump of this sort. */
1677 unsigned char *fixup_alt_jump = 0;
1679 /* Counts open-groups as they are encountered. Remembered for the
1680 matching close-group on the compile stack, so the same register
1681 number is put in the stop_memory as the start_memory. */
1682 regnum_t regnum = 0;
1684 #ifdef DEBUG
1685 DEBUG_PRINT1 ("\nCompiling pattern: ");
1686 if (debug)
1688 unsigned debug_count;
1690 for (debug_count = 0; debug_count < size; debug_count++)
1691 putchar (pattern[debug_count]);
1692 putchar ('\n');
1694 #endif /* DEBUG */
1696 /* Initialize the compile stack. */
1697 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1698 if (compile_stack.stack == NULL)
1699 return REG_ESPACE;
1701 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1702 compile_stack.avail = 0;
1704 /* Initialize the pattern buffer. */
1705 bufp->syntax = syntax;
1706 bufp->fastmap_accurate = 0;
1707 bufp->not_bol = bufp->not_eol = 0;
1709 /* Set `used' to zero, so that if we return an error, the pattern
1710 printer (for debugging) will think there's no pattern. We reset it
1711 at the end. */
1712 bufp->used = 0;
1714 /* Always count groups, whether or not bufp->no_sub is set. */
1715 bufp->re_nsub = 0;
1717 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1718 /* Initialize the syntax table. */
1719 init_syntax_once ();
1720 #endif
1722 if (bufp->allocated == 0)
1724 if (bufp->buffer)
1725 { /* If zero allocated, but buffer is non-null, try to realloc
1726 enough space. This loses if buffer's address is bogus, but
1727 that is the user's responsibility. */
1728 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1730 else
1731 { /* Caller did not allocate a buffer. Do it for them. */
1732 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1734 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1736 bufp->allocated = INIT_BUF_SIZE;
1739 begalt = b = bufp->buffer;
1741 /* Loop through the uncompiled pattern until we're at the end. */
1742 while (p != pend)
1744 PATFETCH (c);
1746 switch (c)
1748 case '^':
1750 if ( /* If at start of pattern, it's an operator. */
1751 p == pattern + 1
1752 /* If context independent, it's an operator. */
1753 || syntax & RE_CONTEXT_INDEP_ANCHORS
1754 /* Otherwise, depends on what's come before. */
1755 || at_begline_loc_p (pattern, p, syntax))
1756 BUF_PUSH (begline);
1757 else
1758 goto normal_char;
1760 break;
1763 case '$':
1765 if ( /* If at end of pattern, it's an operator. */
1766 p == pend
1767 /* If context independent, it's an operator. */
1768 || syntax & RE_CONTEXT_INDEP_ANCHORS
1769 /* Otherwise, depends on what's next. */
1770 || at_endline_loc_p (p, pend, syntax))
1771 BUF_PUSH (endline);
1772 else
1773 goto normal_char;
1775 break;
1778 case '+':
1779 case '?':
1780 if ((syntax & RE_BK_PLUS_QM)
1781 || (syntax & RE_LIMITED_OPS))
1782 goto normal_char;
1783 handle_plus:
1784 case '*':
1785 /* If there is no previous pattern... */
1786 if (!laststart)
1788 if (syntax & RE_CONTEXT_INVALID_OPS)
1789 FREE_STACK_RETURN (REG_BADRPT);
1790 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1791 goto normal_char;
1795 /* Are we optimizing this jump? */
1796 boolean keep_string_p = false;
1798 /* 1 means zero (many) matches is allowed. */
1799 char zero_times_ok = 0, many_times_ok = 0;
1801 /* If there is a sequence of repetition chars, collapse it
1802 down to just one (the right one). We can't combine
1803 interval operators with these because of, e.g., `a{2}*',
1804 which should only match an even number of `a's. */
1806 for (;;)
1808 zero_times_ok |= c != '+';
1809 many_times_ok |= c != '?';
1811 if (p == pend)
1812 break;
1814 PATFETCH (c);
1816 if (c == '*'
1817 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1820 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1822 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
1824 PATFETCH (c1);
1825 if (!(c1 == '+' || c1 == '?'))
1827 PATUNFETCH;
1828 PATUNFETCH;
1829 break;
1832 c = c1;
1834 else
1836 PATUNFETCH;
1837 break;
1840 /* If we get here, we found another repeat character. */
1843 /* Star, etc. applied to an empty pattern is equivalent
1844 to an empty pattern. */
1845 if (!laststart)
1846 break;
1848 /* Now we know whether or not zero matches is allowed
1849 and also whether or not two or more matches is allowed. */
1850 if (many_times_ok)
1851 { /* More than one repetition is allowed, so put in at the
1852 end a backward relative jump from `b' to before the next
1853 jump we're going to put in below (which jumps from
1854 laststart to after this jump).
1856 But if we are at the `*' in the exact sequence `.*\n',
1857 insert an unconditional jump backwards to the .,
1858 instead of the beginning of the loop. This way we only
1859 push a failure point once, instead of every time
1860 through the loop. */
1861 assert (p - 1 > pattern);
1863 /* Allocate the space for the jump. */
1864 GET_BUFFER_SPACE (3);
1866 /* We know we are not at the first character of the pattern,
1867 because laststart was nonzero. And we've already
1868 incremented `p', by the way, to be the character after
1869 the `*'. Do we have to do something analogous here
1870 for null bytes, because of RE_DOT_NOT_NULL? */
1871 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1872 && zero_times_ok
1873 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1874 && !(syntax & RE_DOT_NEWLINE))
1875 { /* We have .*\n. */
1876 STORE_JUMP (jump, b, laststart);
1877 keep_string_p = true;
1879 else
1880 /* Anything else. */
1881 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1883 /* We've added more stuff to the buffer. */
1884 b += 3;
1887 /* On failure, jump from laststart to b + 3, which will be the
1888 end of the buffer after this jump is inserted. */
1889 GET_BUFFER_SPACE (3);
1890 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1891 : on_failure_jump,
1892 laststart, b + 3);
1893 pending_exact = 0;
1894 b += 3;
1896 if (!zero_times_ok)
1898 /* At least one repetition is required, so insert a
1899 `dummy_failure_jump' before the initial
1900 `on_failure_jump' instruction of the loop. This
1901 effects a skip over that instruction the first time
1902 we hit that loop. */
1903 GET_BUFFER_SPACE (3);
1904 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1905 b += 3;
1908 break;
1911 case '.':
1912 laststart = b;
1913 BUF_PUSH (anychar);
1914 break;
1917 case '[':
1919 boolean had_char_class = false;
1921 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
1923 /* Ensure that we have enough space to push a charset: the
1924 opcode, the length count, and the bitset; 34 bytes in all. */
1925 GET_BUFFER_SPACE (34);
1927 laststart = b;
1929 /* We test `*p == '^' twice, instead of using an if
1930 statement, so we only need one BUF_PUSH. */
1931 BUF_PUSH (*p == '^' ? charset_not : charset);
1932 if (*p == '^')
1933 p++;
1935 /* Remember the first position in the bracket expression. */
1936 p1 = p;
1938 /* Push the number of bytes in the bitmap. */
1939 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1941 /* Clear the whole map. */
1942 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1944 /* charset_not matches newline according to a syntax bit. */
1945 if ((re_opcode_t) b[-2] == charset_not
1946 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1947 SET_LIST_BIT ('\n');
1949 /* Read in characters and ranges, setting map bits. */
1950 for (;;)
1952 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
1954 PATFETCH (c);
1956 /* \ might escape characters inside [...] and [^...]. */
1957 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1959 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
1961 PATFETCH (c1);
1962 SET_LIST_BIT (c1);
1963 continue;
1966 /* Could be the end of the bracket expression. If it's
1967 not (i.e., when the bracket expression is `[]' so
1968 far), the ']' character bit gets set way below. */
1969 if (c == ']' && p != p1 + 1)
1970 break;
1972 /* Look ahead to see if it's a range when the last thing
1973 was a character class. */
1974 if (had_char_class && c == '-' && *p != ']')
1975 FREE_STACK_RETURN (REG_ERANGE);
1977 /* Look ahead to see if it's a range when the last thing
1978 was a character: if this is a hyphen not at the
1979 beginning or the end of a list, then it's the range
1980 operator. */
1981 if (c == '-'
1982 && !(p - 2 >= pattern && p[-2] == '[')
1983 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1984 && *p != ']')
1986 reg_errcode_t ret
1987 = compile_range (&p, pend, translate, syntax, b);
1988 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
1991 else if (p[0] == '-' && p[1] != ']')
1992 { /* This handles ranges made up of characters only. */
1993 reg_errcode_t ret;
1995 /* Move past the `-'. */
1996 PATFETCH (c1);
1998 ret = compile_range (&p, pend, translate, syntax, b);
1999 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2002 /* See if we're at the beginning of a possible character
2003 class. */
2005 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2006 { /* Leave room for the null. */
2007 char str[CHAR_CLASS_MAX_LENGTH + 1];
2009 PATFETCH (c);
2010 c1 = 0;
2012 /* If pattern is `[[:'. */
2013 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2015 for (;;)
2017 PATFETCH (c);
2018 if (c == ':' || c == ']' || p == pend
2019 || c1 == CHAR_CLASS_MAX_LENGTH)
2020 break;
2021 str[c1++] = c;
2023 str[c1] = '\0';
2025 /* If isn't a word bracketed by `[:' and:`]':
2026 undo the ending character, the letters, and leave
2027 the leading `:' and `[' (but set bits for them). */
2028 if (c == ':' && *p == ']')
2030 int ch;
2031 boolean is_alnum = STREQ (str, "alnum");
2032 boolean is_alpha = STREQ (str, "alpha");
2033 boolean is_blank = STREQ (str, "blank");
2034 boolean is_cntrl = STREQ (str, "cntrl");
2035 boolean is_digit = STREQ (str, "digit");
2036 boolean is_graph = STREQ (str, "graph");
2037 boolean is_lower = STREQ (str, "lower");
2038 boolean is_print = STREQ (str, "print");
2039 boolean is_punct = STREQ (str, "punct");
2040 boolean is_space = STREQ (str, "space");
2041 boolean is_upper = STREQ (str, "upper");
2042 boolean is_xdigit = STREQ (str, "xdigit");
2044 if (!IS_CHAR_CLASS (str))
2045 FREE_STACK_RETURN (REG_ECTYPE);
2047 /* Throw away the ] at the end of the character
2048 class. */
2049 PATFETCH (c);
2051 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2053 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2055 /* This was split into 3 if's to
2056 avoid an arbitrary limit in some compiler. */
2057 if ( (is_alnum && ISALNUM (ch))
2058 || (is_alpha && ISALPHA (ch))
2059 || (is_blank && ISBLANK (ch))
2060 || (is_cntrl && ISCNTRL (ch)))
2061 SET_LIST_BIT (ch);
2062 if ( (is_digit && ISDIGIT (ch))
2063 || (is_graph && ISGRAPH (ch))
2064 || (is_lower && ISLOWER (ch))
2065 || (is_print && ISPRINT (ch)))
2066 SET_LIST_BIT (ch);
2067 if ( (is_punct && ISPUNCT (ch))
2068 || (is_space && ISSPACE (ch))
2069 || (is_upper && ISUPPER (ch))
2070 || (is_xdigit && ISXDIGIT (ch)))
2071 SET_LIST_BIT (ch);
2073 had_char_class = true;
2075 else
2077 c1++;
2078 while (c1--)
2079 PATUNFETCH;
2080 SET_LIST_BIT ('[');
2081 SET_LIST_BIT (':');
2082 had_char_class = false;
2085 else
2087 had_char_class = false;
2088 SET_LIST_BIT (c);
2092 /* Discard any (non)matching list bytes that are all 0 at the
2093 end of the map. Decrease the map-length byte too. */
2094 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2095 b[-1]--;
2096 b += b[-1];
2098 break;
2101 case '(':
2102 if (syntax & RE_NO_BK_PARENS)
2103 goto handle_open;
2104 else
2105 goto normal_char;
2108 case ')':
2109 if (syntax & RE_NO_BK_PARENS)
2110 goto handle_close;
2111 else
2112 goto normal_char;
2115 case '\n':
2116 if (syntax & RE_NEWLINE_ALT)
2117 goto handle_alt;
2118 else
2119 goto normal_char;
2122 case '|':
2123 if (syntax & RE_NO_BK_VBAR)
2124 goto handle_alt;
2125 else
2126 goto normal_char;
2129 case '{':
2130 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2131 goto handle_interval;
2132 else
2133 goto normal_char;
2136 case '\\':
2137 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2139 /* Do not translate the character after the \, so that we can
2140 distinguish, e.g., \B from \b, even if we normally would
2141 translate, e.g., B to b. */
2142 PATFETCH_RAW (c);
2144 switch (c)
2146 case '(':
2147 if (syntax & RE_NO_BK_PARENS)
2148 goto normal_backslash;
2150 handle_open:
2151 bufp->re_nsub++;
2152 regnum++;
2154 if (COMPILE_STACK_FULL)
2156 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2157 compile_stack_elt_t);
2158 if (compile_stack.stack == NULL) return REG_ESPACE;
2160 compile_stack.size <<= 1;
2163 /* These are the values to restore when we hit end of this
2164 group. They are all relative offsets, so that if the
2165 whole pattern moves because of realloc, they will still
2166 be valid. */
2167 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2168 COMPILE_STACK_TOP.fixup_alt_jump
2169 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2170 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2171 COMPILE_STACK_TOP.regnum = regnum;
2173 /* We will eventually replace the 0 with the number of
2174 groups inner to this one. But do not push a
2175 start_memory for groups beyond the last one we can
2176 represent in the compiled pattern. */
2177 if (regnum <= MAX_REGNUM)
2179 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2180 BUF_PUSH_3 (start_memory, regnum, 0);
2183 compile_stack.avail++;
2185 fixup_alt_jump = 0;
2186 laststart = 0;
2187 begalt = b;
2188 /* If we've reached MAX_REGNUM groups, then this open
2189 won't actually generate any code, so we'll have to
2190 clear pending_exact explicitly. */
2191 pending_exact = 0;
2192 break;
2195 case ')':
2196 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2198 if (COMPILE_STACK_EMPTY)
2199 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2200 goto normal_backslash;
2201 else
2202 FREE_STACK_RETURN (REG_ERPAREN);
2204 handle_close:
2205 if (fixup_alt_jump)
2206 { /* Push a dummy failure point at the end of the
2207 alternative for a possible future
2208 `pop_failure_jump' to pop. See comments at
2209 `push_dummy_failure' in `re_match_2'. */
2210 BUF_PUSH (push_dummy_failure);
2212 /* We allocated space for this jump when we assigned
2213 to `fixup_alt_jump', in the `handle_alt' case below. */
2214 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2217 /* See similar code for backslashed left paren above. */
2218 if (COMPILE_STACK_EMPTY)
2219 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2220 goto normal_char;
2221 else
2222 FREE_STACK_RETURN (REG_ERPAREN);
2224 /* Since we just checked for an empty stack above, this
2225 ``can't happen''. */
2226 assert (compile_stack.avail != 0);
2228 /* We don't just want to restore into `regnum', because
2229 later groups should continue to be numbered higher,
2230 as in `(ab)c(de)' -- the second group is #2. */
2231 regnum_t this_group_regnum;
2233 compile_stack.avail--;
2234 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2235 fixup_alt_jump
2236 = COMPILE_STACK_TOP.fixup_alt_jump
2237 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2238 : 0;
2239 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2240 this_group_regnum = COMPILE_STACK_TOP.regnum;
2241 /* If we've reached MAX_REGNUM groups, then this open
2242 won't actually generate any code, so we'll have to
2243 clear pending_exact explicitly. */
2244 pending_exact = 0;
2246 /* We're at the end of the group, so now we know how many
2247 groups were inside this one. */
2248 if (this_group_regnum <= MAX_REGNUM)
2250 unsigned char *inner_group_loc
2251 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2253 *inner_group_loc = regnum - this_group_regnum;
2254 BUF_PUSH_3 (stop_memory, this_group_regnum,
2255 regnum - this_group_regnum);
2258 break;
2261 case '|': /* `\|'. */
2262 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2263 goto normal_backslash;
2264 handle_alt:
2265 if (syntax & RE_LIMITED_OPS)
2266 goto normal_char;
2268 /* Insert before the previous alternative a jump which
2269 jumps to this alternative if the former fails. */
2270 GET_BUFFER_SPACE (3);
2271 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2272 pending_exact = 0;
2273 b += 3;
2275 /* The alternative before this one has a jump after it
2276 which gets executed if it gets matched. Adjust that
2277 jump so it will jump to this alternative's analogous
2278 jump (put in below, which in turn will jump to the next
2279 (if any) alternative's such jump, etc.). The last such
2280 jump jumps to the correct final destination. A picture:
2281 _____ _____
2282 | | | |
2283 | v | v
2284 a | b | c
2286 If we are at `b', then fixup_alt_jump right now points to a
2287 three-byte space after `a'. We'll put in the jump, set
2288 fixup_alt_jump to right after `b', and leave behind three
2289 bytes which we'll fill in when we get to after `c'. */
2291 if (fixup_alt_jump)
2292 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2294 /* Mark and leave space for a jump after this alternative,
2295 to be filled in later either by next alternative or
2296 when know we're at the end of a series of alternatives. */
2297 fixup_alt_jump = b;
2298 GET_BUFFER_SPACE (3);
2299 b += 3;
2301 laststart = 0;
2302 begalt = b;
2303 break;
2306 case '{':
2307 /* If \{ is a literal. */
2308 if (!(syntax & RE_INTERVALS)
2309 /* If we're at `\{' and it's not the open-interval
2310 operator. */
2311 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2312 || (p - 2 == pattern && p == pend))
2313 goto normal_backslash;
2315 handle_interval:
2317 /* If got here, then the syntax allows intervals. */
2319 /* At least (most) this many matches must be made. */
2320 int lower_bound = -1, upper_bound = -1;
2322 beg_interval = p - 1;
2324 if (p == pend)
2326 if (syntax & RE_NO_BK_BRACES)
2327 goto unfetch_interval;
2328 else
2329 FREE_STACK_RETURN (REG_EBRACE);
2332 GET_UNSIGNED_NUMBER (lower_bound);
2334 if (c == ',')
2336 GET_UNSIGNED_NUMBER (upper_bound);
2337 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2339 else
2340 /* Interval such as `{1}' => match exactly once. */
2341 upper_bound = lower_bound;
2343 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2344 || lower_bound > upper_bound)
2346 if (syntax & RE_NO_BK_BRACES)
2347 goto unfetch_interval;
2348 else
2349 FREE_STACK_RETURN (REG_BADBR);
2352 if (!(syntax & RE_NO_BK_BRACES))
2354 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2356 PATFETCH (c);
2359 if (c != '}')
2361 if (syntax & RE_NO_BK_BRACES)
2362 goto unfetch_interval;
2363 else
2364 FREE_STACK_RETURN (REG_BADBR);
2367 /* We just parsed a valid interval. */
2369 /* If it's invalid to have no preceding re. */
2370 if (!laststart)
2372 if (syntax & RE_CONTEXT_INVALID_OPS)
2373 FREE_STACK_RETURN (REG_BADRPT);
2374 else if (syntax & RE_CONTEXT_INDEP_OPS)
2375 laststart = b;
2376 else
2377 goto unfetch_interval;
2380 /* If the upper bound is zero, don't want to succeed at
2381 all; jump from `laststart' to `b + 3', which will be
2382 the end of the buffer after we insert the jump. */
2383 if (upper_bound == 0)
2385 GET_BUFFER_SPACE (3);
2386 INSERT_JUMP (jump, laststart, b + 3);
2387 b += 3;
2390 /* Otherwise, we have a nontrivial interval. When
2391 we're all done, the pattern will look like:
2392 set_number_at <jump count> <upper bound>
2393 set_number_at <succeed_n count> <lower bound>
2394 succeed_n <after jump addr> <succeed_n count>
2395 <body of loop>
2396 jump_n <succeed_n addr> <jump count>
2397 (The upper bound and `jump_n' are omitted if
2398 `upper_bound' is 1, though.) */
2399 else
2400 { /* If the upper bound is > 1, we need to insert
2401 more at the end of the loop. */
2402 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2404 GET_BUFFER_SPACE (nbytes);
2406 /* Initialize lower bound of the `succeed_n', even
2407 though it will be set during matching by its
2408 attendant `set_number_at' (inserted next),
2409 because `re_compile_fastmap' needs to know.
2410 Jump to the `jump_n' we might insert below. */
2411 INSERT_JUMP2 (succeed_n, laststart,
2412 b + 5 + (upper_bound > 1) * 5,
2413 lower_bound);
2414 b += 5;
2416 /* Code to initialize the lower bound. Insert
2417 before the `succeed_n'. The `5' is the last two
2418 bytes of this `set_number_at', plus 3 bytes of
2419 the following `succeed_n'. */
2420 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2421 b += 5;
2423 if (upper_bound > 1)
2424 { /* More than one repetition is allowed, so
2425 append a backward jump to the `succeed_n'
2426 that starts this interval.
2428 When we've reached this during matching,
2429 we'll have matched the interval once, so
2430 jump back only `upper_bound - 1' times. */
2431 STORE_JUMP2 (jump_n, b, laststart + 5,
2432 upper_bound - 1);
2433 b += 5;
2435 /* The location we want to set is the second
2436 parameter of the `jump_n'; that is `b-2' as
2437 an absolute address. `laststart' will be
2438 the `set_number_at' we're about to insert;
2439 `laststart+3' the number to set, the source
2440 for the relative address. But we are
2441 inserting into the middle of the pattern --
2442 so everything is getting moved up by 5.
2443 Conclusion: (b - 2) - (laststart + 3) + 5,
2444 i.e., b - laststart.
2446 We insert this at the beginning of the loop
2447 so that if we fail during matching, we'll
2448 reinitialize the bounds. */
2449 insert_op2 (set_number_at, laststart, b - laststart,
2450 upper_bound - 1, b);
2451 b += 5;
2454 pending_exact = 0;
2455 beg_interval = NULL;
2457 break;
2459 unfetch_interval:
2460 /* If an invalid interval, match the characters as literals. */
2461 assert (beg_interval);
2462 p = beg_interval;
2463 beg_interval = NULL;
2465 /* normal_char and normal_backslash need `c'. */
2466 PATFETCH (c);
2468 if (!(syntax & RE_NO_BK_BRACES))
2470 if (p > pattern && p[-1] == '\\')
2471 goto normal_backslash;
2473 goto normal_char;
2475 #ifdef emacs
2476 /* There is no way to specify the before_dot and after_dot
2477 operators. rms says this is ok. --karl */
2478 case '=':
2479 BUF_PUSH (at_dot);
2480 break;
2482 case 's':
2483 laststart = b;
2484 PATFETCH (c);
2485 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2486 break;
2488 case 'S':
2489 laststart = b;
2490 PATFETCH (c);
2491 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2492 break;
2493 #endif /* emacs */
2496 case 'w':
2497 laststart = b;
2498 BUF_PUSH (wordchar);
2499 break;
2502 case 'W':
2503 laststart = b;
2504 BUF_PUSH (notwordchar);
2505 break;
2508 case '<':
2509 BUF_PUSH (wordbeg);
2510 break;
2512 case '>':
2513 BUF_PUSH (wordend);
2514 break;
2516 case 'b':
2517 BUF_PUSH (wordbound);
2518 break;
2520 case 'B':
2521 BUF_PUSH (notwordbound);
2522 break;
2524 case '`':
2525 BUF_PUSH (begbuf);
2526 break;
2528 case '\'':
2529 BUF_PUSH (endbuf);
2530 break;
2532 case '1': case '2': case '3': case '4': case '5':
2533 case '6': case '7': case '8': case '9':
2534 if (syntax & RE_NO_BK_REFS)
2535 goto normal_char;
2537 c1 = c - '0';
2539 if (c1 > regnum)
2540 FREE_STACK_RETURN (REG_ESUBREG);
2542 /* Can't back reference to a subexpression if inside of it. */
2543 if (group_in_compile_stack (compile_stack, c1))
2544 goto normal_char;
2546 laststart = b;
2547 BUF_PUSH_2 (duplicate, c1);
2548 break;
2551 case '+':
2552 case '?':
2553 if (syntax & RE_BK_PLUS_QM)
2554 goto handle_plus;
2555 else
2556 goto normal_backslash;
2558 default:
2559 normal_backslash:
2560 /* You might think it would be useful for \ to mean
2561 not to translate; but if we don't translate it
2562 it will never match anything. */
2563 c = TRANSLATE (c);
2564 goto normal_char;
2566 break;
2569 default:
2570 /* Expects the character in `c'. */
2571 normal_char:
2572 /* If no exactn currently being built. */
2573 if (!pending_exact
2575 /* If last exactn not at current position. */
2576 || pending_exact + *pending_exact + 1 != b
2578 /* We have only one byte following the exactn for the count. */
2579 || *pending_exact == (1 << BYTEWIDTH) - 1
2581 /* If followed by a repetition operator. */
2582 || *p == '*' || *p == '^'
2583 || ((syntax & RE_BK_PLUS_QM)
2584 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2585 : (*p == '+' || *p == '?'))
2586 || ((syntax & RE_INTERVALS)
2587 && ((syntax & RE_NO_BK_BRACES)
2588 ? *p == '{'
2589 : (p[0] == '\\' && p[1] == '{'))))
2591 /* Start building a new exactn. */
2593 laststart = b;
2595 BUF_PUSH_2 (exactn, 0);
2596 pending_exact = b - 1;
2599 BUF_PUSH (c);
2600 (*pending_exact)++;
2601 break;
2602 } /* switch (c) */
2603 } /* while p != pend */
2606 /* Through the pattern now. */
2608 if (fixup_alt_jump)
2609 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2611 if (!COMPILE_STACK_EMPTY)
2612 FREE_STACK_RETURN (REG_EPAREN);
2614 /* If we don't want backtracking, force success
2615 the first time we reach the end of the compiled pattern. */
2616 if (syntax & RE_NO_POSIX_BACKTRACKING)
2617 BUF_PUSH (succeed);
2619 free (compile_stack.stack);
2621 /* We have succeeded; set the length of the buffer. */
2622 bufp->used = b - bufp->buffer;
2624 #ifdef DEBUG
2625 if (debug)
2627 DEBUG_PRINT1 ("\nCompiled pattern: \n");
2628 print_compiled_pattern (bufp);
2630 #endif /* DEBUG */
2632 #ifndef MATCH_MAY_ALLOCATE
2633 /* Initialize the failure stack to the largest possible stack. This
2634 isn't necessary unless we're trying to avoid calling alloca in
2635 the search and match routines. */
2637 int num_regs = bufp->re_nsub + 1;
2639 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
2640 is strictly greater than re_max_failures, the largest possible stack
2641 is 2 * re_max_failures failure points. */
2642 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
2644 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
2646 #ifdef emacs
2647 if (! fail_stack.stack)
2648 fail_stack.stack
2649 = (fail_stack_elt_t *) xmalloc (fail_stack.size
2650 * sizeof (fail_stack_elt_t));
2651 else
2652 fail_stack.stack
2653 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
2654 (fail_stack.size
2655 * sizeof (fail_stack_elt_t)));
2656 #else /* not emacs */
2657 if (! fail_stack.stack)
2658 fail_stack.stack
2659 = (fail_stack_elt_t *) malloc (fail_stack.size
2660 * sizeof (fail_stack_elt_t));
2661 else
2662 fail_stack.stack
2663 = (fail_stack_elt_t *) realloc (fail_stack.stack,
2664 (fail_stack.size
2665 * sizeof (fail_stack_elt_t)));
2666 #endif /* not emacs */
2669 regex_grow_registers (num_regs);
2671 #endif /* not MATCH_MAY_ALLOCATE */
2673 return REG_NOERROR;
2674 } /* regex_compile */
2676 /* Subroutines for `regex_compile'. */
2678 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2680 static void
2681 store_op1 (op, loc, arg)
2682 re_opcode_t op;
2683 unsigned char *loc;
2684 int arg;
2686 *loc = (unsigned char) op;
2687 STORE_NUMBER (loc + 1, arg);
2691 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2693 static void
2694 store_op2 (op, loc, arg1, arg2)
2695 re_opcode_t op;
2696 unsigned char *loc;
2697 int arg1, arg2;
2699 *loc = (unsigned char) op;
2700 STORE_NUMBER (loc + 1, arg1);
2701 STORE_NUMBER (loc + 3, arg2);
2705 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2706 for OP followed by two-byte integer parameter ARG. */
2708 static void
2709 insert_op1 (op, loc, arg, end)
2710 re_opcode_t op;
2711 unsigned char *loc;
2712 int arg;
2713 unsigned char *end;
2715 register unsigned char *pfrom = end;
2716 register unsigned char *pto = end + 3;
2718 while (pfrom != loc)
2719 *--pto = *--pfrom;
2721 store_op1 (op, loc, arg);
2725 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2727 static void
2728 insert_op2 (op, loc, arg1, arg2, end)
2729 re_opcode_t op;
2730 unsigned char *loc;
2731 int arg1, arg2;
2732 unsigned char *end;
2734 register unsigned char *pfrom = end;
2735 register unsigned char *pto = end + 5;
2737 while (pfrom != loc)
2738 *--pto = *--pfrom;
2740 store_op2 (op, loc, arg1, arg2);
2744 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2745 after an alternative or a begin-subexpression. We assume there is at
2746 least one character before the ^. */
2748 static boolean
2749 at_begline_loc_p (pattern, p, syntax)
2750 const char *pattern, *p;
2751 reg_syntax_t syntax;
2753 const char *prev = p - 2;
2754 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2756 return
2757 /* After a subexpression? */
2758 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2759 /* After an alternative? */
2760 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2764 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2765 at least one character after the $, i.e., `P < PEND'. */
2767 static boolean
2768 at_endline_loc_p (p, pend, syntax)
2769 const char *p, *pend;
2770 int syntax;
2772 const char *next = p;
2773 boolean next_backslash = *next == '\\';
2774 const char *next_next = p + 1 < pend ? p + 1 : 0;
2776 return
2777 /* Before a subexpression? */
2778 (syntax & RE_NO_BK_PARENS ? *next == ')'
2779 : next_backslash && next_next && *next_next == ')')
2780 /* Before an alternative? */
2781 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2782 : next_backslash && next_next && *next_next == '|');
2786 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2787 false if it's not. */
2789 static boolean
2790 group_in_compile_stack (compile_stack, regnum)
2791 compile_stack_type compile_stack;
2792 regnum_t regnum;
2794 int this_element;
2796 for (this_element = compile_stack.avail - 1;
2797 this_element >= 0;
2798 this_element--)
2799 if (compile_stack.stack[this_element].regnum == regnum)
2800 return true;
2802 return false;
2806 /* Read the ending character of a range (in a bracket expression) from the
2807 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2808 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2809 Then we set the translation of all bits between the starting and
2810 ending characters (inclusive) in the compiled pattern B.
2812 Return an error code.
2814 We use these short variable names so we can use the same macros as
2815 `regex_compile' itself. */
2817 static reg_errcode_t
2818 compile_range (p_ptr, pend, translate, syntax, b)
2819 const char **p_ptr, *pend;
2820 char *translate;
2821 reg_syntax_t syntax;
2822 unsigned char *b;
2824 unsigned this_char;
2826 const char *p = *p_ptr;
2827 int range_start, range_end;
2829 if (p == pend)
2830 return REG_ERANGE;
2832 /* Even though the pattern is a signed `char *', we need to fetch
2833 with unsigned char *'s; if the high bit of the pattern character
2834 is set, the range endpoints will be negative if we fetch using a
2835 signed char *.
2837 We also want to fetch the endpoints without translating them; the
2838 appropriate translation is done in the bit-setting loop below. */
2839 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
2840 range_start = ((const unsigned char *) p)[-2];
2841 range_end = ((const unsigned char *) p)[0];
2843 /* Have to increment the pointer into the pattern string, so the
2844 caller isn't still at the ending character. */
2845 (*p_ptr)++;
2847 /* If the start is after the end, the range is empty. */
2848 if (range_start > range_end)
2849 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2851 /* Here we see why `this_char' has to be larger than an `unsigned
2852 char' -- the range is inclusive, so if `range_end' == 0xff
2853 (assuming 8-bit characters), we would otherwise go into an infinite
2854 loop, since all characters <= 0xff. */
2855 for (this_char = range_start; this_char <= range_end; this_char++)
2857 SET_LIST_BIT (TRANSLATE (this_char));
2860 return REG_NOERROR;
2863 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2864 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2865 characters can start a string that matches the pattern. This fastmap
2866 is used by re_search to skip quickly over impossible starting points.
2868 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2869 area as BUFP->fastmap.
2871 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2872 the pattern buffer.
2874 Returns 0 if we succeed, -2 if an internal error. */
2877 re_compile_fastmap (bufp)
2878 struct re_pattern_buffer *bufp;
2880 int j, k;
2881 #ifdef MATCH_MAY_ALLOCATE
2882 fail_stack_type fail_stack;
2883 #endif
2884 #ifndef REGEX_MALLOC
2885 char *destination;
2886 #endif
2887 /* We don't push any register information onto the failure stack. */
2888 unsigned num_regs = 0;
2890 register char *fastmap = bufp->fastmap;
2891 unsigned char *pattern = bufp->buffer;
2892 unsigned long size = bufp->used;
2893 unsigned char *p = pattern;
2894 register unsigned char *pend = pattern + size;
2896 /* This holds the pointer to the failure stack, when
2897 it is allocated relocatably. */
2898 #ifdef REL_ALLOC
2899 fail_stack_elt_t *failure_stack_ptr;
2900 #endif
2902 /* Assume that each path through the pattern can be null until
2903 proven otherwise. We set this false at the bottom of switch
2904 statement, to which we get only if a particular path doesn't
2905 match the empty string. */
2906 boolean path_can_be_null = true;
2908 /* We aren't doing a `succeed_n' to begin with. */
2909 boolean succeed_n_p = false;
2911 assert (fastmap != NULL && p != NULL);
2913 INIT_FAIL_STACK ();
2914 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2915 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2916 bufp->can_be_null = 0;
2918 while (1)
2920 if (p == pend || *p == succeed)
2922 /* We have reached the (effective) end of pattern. */
2923 if (!FAIL_STACK_EMPTY ())
2925 bufp->can_be_null |= path_can_be_null;
2927 /* Reset for next path. */
2928 path_can_be_null = true;
2930 p = fail_stack.stack[--fail_stack.avail].pointer;
2932 continue;
2934 else
2935 break;
2938 /* We should never be about to go beyond the end of the pattern. */
2939 assert (p < pend);
2941 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
2944 /* I guess the idea here is to simply not bother with a fastmap
2945 if a backreference is used, since it's too hard to figure out
2946 the fastmap for the corresponding group. Setting
2947 `can_be_null' stops `re_search_2' from using the fastmap, so
2948 that is all we do. */
2949 case duplicate:
2950 bufp->can_be_null = 1;
2951 goto done;
2954 /* Following are the cases which match a character. These end
2955 with `break'. */
2957 case exactn:
2958 fastmap[p[1]] = 1;
2959 break;
2962 case charset:
2963 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2964 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2965 fastmap[j] = 1;
2966 break;
2969 case charset_not:
2970 /* Chars beyond end of map must be allowed. */
2971 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2972 fastmap[j] = 1;
2974 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2975 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2976 fastmap[j] = 1;
2977 break;
2980 case wordchar:
2981 for (j = 0; j < (1 << BYTEWIDTH); j++)
2982 if (SYNTAX (j) == Sword)
2983 fastmap[j] = 1;
2984 break;
2987 case notwordchar:
2988 for (j = 0; j < (1 << BYTEWIDTH); j++)
2989 if (SYNTAX (j) != Sword)
2990 fastmap[j] = 1;
2991 break;
2994 case anychar:
2996 int fastmap_newline = fastmap['\n'];
2998 /* `.' matches anything ... */
2999 for (j = 0; j < (1 << BYTEWIDTH); j++)
3000 fastmap[j] = 1;
3002 /* ... except perhaps newline. */
3003 if (!(bufp->syntax & RE_DOT_NEWLINE))
3004 fastmap['\n'] = fastmap_newline;
3006 /* Return if we have already set `can_be_null'; if we have,
3007 then the fastmap is irrelevant. Something's wrong here. */
3008 else if (bufp->can_be_null)
3009 goto done;
3011 /* Otherwise, have to check alternative paths. */
3012 break;
3015 #ifdef emacs
3016 case syntaxspec:
3017 k = *p++;
3018 for (j = 0; j < (1 << BYTEWIDTH); j++)
3019 if (SYNTAX (j) == (enum syntaxcode) k)
3020 fastmap[j] = 1;
3021 break;
3024 case notsyntaxspec:
3025 k = *p++;
3026 for (j = 0; j < (1 << BYTEWIDTH); j++)
3027 if (SYNTAX (j) != (enum syntaxcode) k)
3028 fastmap[j] = 1;
3029 break;
3032 /* All cases after this match the empty string. These end with
3033 `continue'. */
3036 case before_dot:
3037 case at_dot:
3038 case after_dot:
3039 continue;
3040 #endif /* not emacs */
3043 case no_op:
3044 case begline:
3045 case endline:
3046 case begbuf:
3047 case endbuf:
3048 case wordbound:
3049 case notwordbound:
3050 case wordbeg:
3051 case wordend:
3052 case push_dummy_failure:
3053 continue;
3056 case jump_n:
3057 case pop_failure_jump:
3058 case maybe_pop_jump:
3059 case jump:
3060 case jump_past_alt:
3061 case dummy_failure_jump:
3062 EXTRACT_NUMBER_AND_INCR (j, p);
3063 p += j;
3064 if (j > 0)
3065 continue;
3067 /* Jump backward implies we just went through the body of a
3068 loop and matched nothing. Opcode jumped to should be
3069 `on_failure_jump' or `succeed_n'. Just treat it like an
3070 ordinary jump. For a * loop, it has pushed its failure
3071 point already; if so, discard that as redundant. */
3072 if ((re_opcode_t) *p != on_failure_jump
3073 && (re_opcode_t) *p != succeed_n)
3074 continue;
3076 p++;
3077 EXTRACT_NUMBER_AND_INCR (j, p);
3078 p += j;
3080 /* If what's on the stack is where we are now, pop it. */
3081 if (!FAIL_STACK_EMPTY ()
3082 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3083 fail_stack.avail--;
3085 continue;
3088 case on_failure_jump:
3089 case on_failure_keep_string_jump:
3090 handle_on_failure_jump:
3091 EXTRACT_NUMBER_AND_INCR (j, p);
3093 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3094 end of the pattern. We don't want to push such a point,
3095 since when we restore it above, entering the switch will
3096 increment `p' past the end of the pattern. We don't need
3097 to push such a point since we obviously won't find any more
3098 fastmap entries beyond `pend'. Such a pattern can match
3099 the null string, though. */
3100 if (p + j < pend)
3102 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3104 RESET_FAIL_STACK ();
3105 return -2;
3108 else
3109 bufp->can_be_null = 1;
3111 if (succeed_n_p)
3113 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3114 succeed_n_p = false;
3117 continue;
3120 case succeed_n:
3121 /* Get to the number of times to succeed. */
3122 p += 2;
3124 /* Increment p past the n for when k != 0. */
3125 EXTRACT_NUMBER_AND_INCR (k, p);
3126 if (k == 0)
3128 p -= 4;
3129 succeed_n_p = true; /* Spaghetti code alert. */
3130 goto handle_on_failure_jump;
3132 continue;
3135 case set_number_at:
3136 p += 4;
3137 continue;
3140 case start_memory:
3141 case stop_memory:
3142 p += 2;
3143 continue;
3146 default:
3147 abort (); /* We have listed all the cases. */
3148 } /* switch *p++ */
3150 /* Getting here means we have found the possible starting
3151 characters for one path of the pattern -- and that the empty
3152 string does not match. We need not follow this path further.
3153 Instead, look at the next alternative (remembered on the
3154 stack), or quit if no more. The test at the top of the loop
3155 does these things. */
3156 path_can_be_null = false;
3157 p = pend;
3158 } /* while p */
3160 /* Set `can_be_null' for the last path (also the first path, if the
3161 pattern is empty). */
3162 bufp->can_be_null |= path_can_be_null;
3164 done:
3165 RESET_FAIL_STACK ();
3166 return 0;
3167 } /* re_compile_fastmap */
3169 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3170 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3171 this memory for recording register information. STARTS and ENDS
3172 must be allocated using the malloc library routine, and must each
3173 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3175 If NUM_REGS == 0, then subsequent matches should allocate their own
3176 register data.
3178 Unless this function is called, the first search or match using
3179 PATTERN_BUFFER will allocate its own register data, without
3180 freeing the old data. */
3182 void
3183 re_set_registers (bufp, regs, num_regs, starts, ends)
3184 struct re_pattern_buffer *bufp;
3185 struct re_registers *regs;
3186 unsigned num_regs;
3187 regoff_t *starts, *ends;
3189 if (num_regs)
3191 bufp->regs_allocated = REGS_REALLOCATE;
3192 regs->num_regs = num_regs;
3193 regs->start = starts;
3194 regs->end = ends;
3196 else
3198 bufp->regs_allocated = REGS_UNALLOCATED;
3199 regs->num_regs = 0;
3200 regs->start = regs->end = (regoff_t *) 0;
3204 /* Searching routines. */
3206 /* Like re_search_2, below, but only one string is specified, and
3207 doesn't let you say where to stop matching. */
3210 re_search (bufp, string, size, startpos, range, regs)
3211 struct re_pattern_buffer *bufp;
3212 const char *string;
3213 int size, startpos, range;
3214 struct re_registers *regs;
3216 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3217 regs, size);
3221 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3222 virtual concatenation of STRING1 and STRING2, starting first at index
3223 STARTPOS, then at STARTPOS + 1, and so on.
3225 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3227 RANGE is how far to scan while trying to match. RANGE = 0 means try
3228 only at STARTPOS; in general, the last start tried is STARTPOS +
3229 RANGE.
3231 In REGS, return the indices of the virtual concatenation of STRING1
3232 and STRING2 that matched the entire BUFP->buffer and its contained
3233 subexpressions.
3235 Do not consider matching one past the index STOP in the virtual
3236 concatenation of STRING1 and STRING2.
3238 We return either the position in the strings at which the match was
3239 found, -1 if no match, or -2 if error (such as failure
3240 stack overflow). */
3243 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3244 struct re_pattern_buffer *bufp;
3245 const char *string1, *string2;
3246 int size1, size2;
3247 int startpos;
3248 int range;
3249 struct re_registers *regs;
3250 int stop;
3252 int val;
3253 register char *fastmap = bufp->fastmap;
3254 register char *translate = bufp->translate;
3255 int total_size = size1 + size2;
3256 int endpos = startpos + range;
3258 /* Check for out-of-range STARTPOS. */
3259 if (startpos < 0 || startpos > total_size)
3260 return -1;
3262 /* Fix up RANGE if it might eventually take us outside
3263 the virtual concatenation of STRING1 and STRING2. */
3264 if (endpos < -1)
3265 range = -1 - startpos;
3266 else if (endpos > total_size)
3267 range = total_size - startpos;
3269 /* If the search isn't to be a backwards one, don't waste time in a
3270 search for a pattern that must be anchored. */
3271 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3273 if (startpos > 0)
3274 return -1;
3275 else
3276 range = 1;
3279 /* Update the fastmap now if not correct already. */
3280 if (fastmap && !bufp->fastmap_accurate)
3281 if (re_compile_fastmap (bufp) == -2)
3282 return -2;
3284 /* Loop through the string, looking for a place to start matching. */
3285 for (;;)
3287 /* If a fastmap is supplied, skip quickly over characters that
3288 cannot be the start of a match. If the pattern can match the
3289 null string, however, we don't need to skip characters; we want
3290 the first null string. */
3291 if (fastmap && startpos < total_size && !bufp->can_be_null)
3293 if (range > 0) /* Searching forwards. */
3295 register const char *d;
3296 register int lim = 0;
3297 int irange = range;
3299 if (startpos < size1 && startpos + range >= size1)
3300 lim = range - (size1 - startpos);
3302 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
3304 /* Written out as an if-else to avoid testing `translate'
3305 inside the loop. */
3306 if (translate)
3307 while (range > lim
3308 && !fastmap[(unsigned char)
3309 translate[(unsigned char) *d++]])
3310 range--;
3311 else
3312 while (range > lim && !fastmap[(unsigned char) *d++])
3313 range--;
3315 startpos += irange - range;
3317 else /* Searching backwards. */
3319 register char c = (size1 == 0 || startpos >= size1
3320 ? string2[startpos - size1]
3321 : string1[startpos]);
3323 if (!fastmap[(unsigned char) TRANSLATE (c)])
3324 goto advance;
3328 /* If can't match the null string, and that's all we have left, fail. */
3329 if (range >= 0 && startpos == total_size && fastmap
3330 && !bufp->can_be_null)
3331 return -1;
3333 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3334 startpos, regs, stop);
3335 #ifndef REGEX_MALLOC
3336 #ifdef C_ALLOCA
3337 alloca (0);
3338 #endif
3339 #endif
3341 if (val >= 0)
3342 return startpos;
3344 if (val == -2)
3345 return -2;
3347 advance:
3348 if (!range)
3349 break;
3350 else if (range > 0)
3352 range--;
3353 startpos++;
3355 else
3357 range++;
3358 startpos--;
3361 return -1;
3362 } /* re_search_2 */
3364 /* Declarations and macros for re_match_2. */
3366 static int bcmp_translate ();
3367 static boolean alt_match_null_string_p (),
3368 common_op_match_null_string_p (),
3369 group_match_null_string_p ();
3371 /* This converts PTR, a pointer into one of the search strings `string1'
3372 and `string2' into an offset from the beginning of that string. */
3373 #define POINTER_TO_OFFSET(ptr) \
3374 (FIRST_STRING_P (ptr) \
3375 ? ((regoff_t) ((ptr) - string1)) \
3376 : ((regoff_t) ((ptr) - string2 + size1)))
3378 /* Macros for dealing with the split strings in re_match_2. */
3380 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3382 /* Call before fetching a character with *d. This switches over to
3383 string2 if necessary. */
3384 #define PREFETCH() \
3385 while (d == dend) \
3387 /* End of string2 => fail. */ \
3388 if (dend == end_match_2) \
3389 goto fail; \
3390 /* End of string1 => advance to string2. */ \
3391 d = string2; \
3392 dend = end_match_2; \
3396 /* Test if at very beginning or at very end of the virtual concatenation
3397 of `string1' and `string2'. If only one string, it's `string2'. */
3398 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3399 #define AT_STRINGS_END(d) ((d) == end2)
3402 /* Test if D points to a character which is word-constituent. We have
3403 two special cases to check for: if past the end of string1, look at
3404 the first character in string2; and if before the beginning of
3405 string2, look at the last character in string1. */
3406 #define WORDCHAR_P(d) \
3407 (SYNTAX ((d) == end1 ? *string2 \
3408 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3409 == Sword)
3411 /* Test if the character before D and the one at D differ with respect
3412 to being word-constituent. */
3413 #define AT_WORD_BOUNDARY(d) \
3414 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3415 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3418 /* Free everything we malloc. */
3419 #ifdef MATCH_MAY_ALLOCATE
3420 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
3421 #define FREE_VARIABLES() \
3422 do { \
3423 REGEX_FREE_STACK (fail_stack.stack); \
3424 FREE_VAR (regstart); \
3425 FREE_VAR (regend); \
3426 FREE_VAR (old_regstart); \
3427 FREE_VAR (old_regend); \
3428 FREE_VAR (best_regstart); \
3429 FREE_VAR (best_regend); \
3430 FREE_VAR (reg_info); \
3431 FREE_VAR (reg_dummy); \
3432 FREE_VAR (reg_info_dummy); \
3433 } while (0)
3434 #else
3435 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
3436 #endif /* not MATCH_MAY_ALLOCATE */
3438 /* These values must meet several constraints. They must not be valid
3439 register values; since we have a limit of 255 registers (because
3440 we use only one byte in the pattern for the register number), we can
3441 use numbers larger than 255. They must differ by 1, because of
3442 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3443 be larger than the value for the highest register, so we do not try
3444 to actually save any registers when none are active. */
3445 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3446 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3448 /* Matching routines. */
3450 #ifndef emacs /* Emacs never uses this. */
3451 /* re_match is like re_match_2 except it takes only a single string. */
3454 re_match (bufp, string, size, pos, regs)
3455 struct re_pattern_buffer *bufp;
3456 const char *string;
3457 int size, pos;
3458 struct re_registers *regs;
3460 int result = re_match_2_internal (bufp, NULL, 0, string, size,
3461 pos, regs, size);
3462 alloca (0);
3463 return result;
3465 #endif /* not emacs */
3468 /* re_match_2 matches the compiled pattern in BUFP against the
3469 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3470 and SIZE2, respectively). We start matching at POS, and stop
3471 matching at STOP.
3473 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3474 store offsets for the substring each group matched in REGS. See the
3475 documentation for exactly how many groups we fill.
3477 We return -1 if no match, -2 if an internal error (such as the
3478 failure stack overflowing). Otherwise, we return the length of the
3479 matched substring. */
3482 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3483 struct re_pattern_buffer *bufp;
3484 const char *string1, *string2;
3485 int size1, size2;
3486 int pos;
3487 struct re_registers *regs;
3488 int stop;
3490 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
3491 pos, regs, stop);
3492 alloca (0);
3493 return result;
3496 /* This is a separate function so that we can force an alloca cleanup
3497 afterwards. */
3498 static int
3499 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
3500 struct re_pattern_buffer *bufp;
3501 const char *string1, *string2;
3502 int size1, size2;
3503 int pos;
3504 struct re_registers *regs;
3505 int stop;
3507 /* General temporaries. */
3508 int mcnt;
3509 unsigned char *p1;
3511 /* Just past the end of the corresponding string. */
3512 const char *end1, *end2;
3514 /* Pointers into string1 and string2, just past the last characters in
3515 each to consider matching. */
3516 const char *end_match_1, *end_match_2;
3518 /* Where we are in the data, and the end of the current string. */
3519 const char *d, *dend;
3521 /* Where we are in the pattern, and the end of the pattern. */
3522 unsigned char *p = bufp->buffer;
3523 register unsigned char *pend = p + bufp->used;
3525 /* Mark the opcode just after a start_memory, so we can test for an
3526 empty subpattern when we get to the stop_memory. */
3527 unsigned char *just_past_start_mem = 0;
3529 /* We use this to map every character in the string. */
3530 char *translate = bufp->translate;
3532 /* Failure point stack. Each place that can handle a failure further
3533 down the line pushes a failure point on this stack. It consists of
3534 restart, regend, and reg_info for all registers corresponding to
3535 the subexpressions we're currently inside, plus the number of such
3536 registers, and, finally, two char *'s. The first char * is where
3537 to resume scanning the pattern; the second one is where to resume
3538 scanning the strings. If the latter is zero, the failure point is
3539 a ``dummy''; if a failure happens and the failure point is a dummy,
3540 it gets discarded and the next next one is tried. */
3541 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3542 fail_stack_type fail_stack;
3543 #endif
3544 #ifdef DEBUG
3545 static unsigned failure_id = 0;
3546 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3547 #endif
3549 /* This holds the pointer to the failure stack, when
3550 it is allocated relocatably. */
3551 #ifdef REL_ALLOC
3552 fail_stack_elt_t *failure_stack_ptr;
3553 #endif
3555 /* We fill all the registers internally, independent of what we
3556 return, for use in backreferences. The number here includes
3557 an element for register zero. */
3558 unsigned num_regs = bufp->re_nsub + 1;
3560 /* The currently active registers. */
3561 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3562 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3564 /* Information on the contents of registers. These are pointers into
3565 the input strings; they record just what was matched (on this
3566 attempt) by a subexpression part of the pattern, that is, the
3567 regnum-th regstart pointer points to where in the pattern we began
3568 matching and the regnum-th regend points to right after where we
3569 stopped matching the regnum-th subexpression. (The zeroth register
3570 keeps track of what the whole pattern matches.) */
3571 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3572 const char **regstart, **regend;
3573 #endif
3575 /* If a group that's operated upon by a repetition operator fails to
3576 match anything, then the register for its start will need to be
3577 restored because it will have been set to wherever in the string we
3578 are when we last see its open-group operator. Similarly for a
3579 register's end. */
3580 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3581 const char **old_regstart, **old_regend;
3582 #endif
3584 /* The is_active field of reg_info helps us keep track of which (possibly
3585 nested) subexpressions we are currently in. The matched_something
3586 field of reg_info[reg_num] helps us tell whether or not we have
3587 matched any of the pattern so far this time through the reg_num-th
3588 subexpression. These two fields get reset each time through any
3589 loop their register is in. */
3590 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3591 register_info_type *reg_info;
3592 #endif
3594 /* The following record the register info as found in the above
3595 variables when we find a match better than any we've seen before.
3596 This happens as we backtrack through the failure points, which in
3597 turn happens only if we have not yet matched the entire string. */
3598 unsigned best_regs_set = false;
3599 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3600 const char **best_regstart, **best_regend;
3601 #endif
3603 /* Logically, this is `best_regend[0]'. But we don't want to have to
3604 allocate space for that if we're not allocating space for anything
3605 else (see below). Also, we never need info about register 0 for
3606 any of the other register vectors, and it seems rather a kludge to
3607 treat `best_regend' differently than the rest. So we keep track of
3608 the end of the best match so far in a separate variable. We
3609 initialize this to NULL so that when we backtrack the first time
3610 and need to test it, it's not garbage. */
3611 const char *match_end = NULL;
3613 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
3614 int set_regs_matched_done = 0;
3616 /* Used when we pop values we don't care about. */
3617 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3618 const char **reg_dummy;
3619 register_info_type *reg_info_dummy;
3620 #endif
3622 #ifdef DEBUG
3623 /* Counts the total number of registers pushed. */
3624 unsigned num_regs_pushed = 0;
3625 #endif
3627 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3629 INIT_FAIL_STACK ();
3631 #ifdef MATCH_MAY_ALLOCATE
3632 /* Do not bother to initialize all the register variables if there are
3633 no groups in the pattern, as it takes a fair amount of time. If
3634 there are groups, we include space for register 0 (the whole
3635 pattern), even though we never use it, since it simplifies the
3636 array indexing. We should fix this. */
3637 if (bufp->re_nsub)
3639 regstart = REGEX_TALLOC (num_regs, const char *);
3640 regend = REGEX_TALLOC (num_regs, const char *);
3641 old_regstart = REGEX_TALLOC (num_regs, const char *);
3642 old_regend = REGEX_TALLOC (num_regs, const char *);
3643 best_regstart = REGEX_TALLOC (num_regs, const char *);
3644 best_regend = REGEX_TALLOC (num_regs, const char *);
3645 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3646 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3647 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3649 if (!(regstart && regend && old_regstart && old_regend && reg_info
3650 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3652 FREE_VARIABLES ();
3653 return -2;
3656 else
3658 /* We must initialize all our variables to NULL, so that
3659 `FREE_VARIABLES' doesn't try to free them. */
3660 regstart = regend = old_regstart = old_regend = best_regstart
3661 = best_regend = reg_dummy = NULL;
3662 reg_info = reg_info_dummy = (register_info_type *) NULL;
3664 #endif /* MATCH_MAY_ALLOCATE */
3666 /* The starting position is bogus. */
3667 if (pos < 0 || pos > size1 + size2)
3669 FREE_VARIABLES ();
3670 return -1;
3673 /* Initialize subexpression text positions to -1 to mark ones that no
3674 start_memory/stop_memory has been seen for. Also initialize the
3675 register information struct. */
3676 for (mcnt = 1; mcnt < num_regs; mcnt++)
3678 regstart[mcnt] = regend[mcnt]
3679 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3681 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3682 IS_ACTIVE (reg_info[mcnt]) = 0;
3683 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3684 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3687 /* We move `string1' into `string2' if the latter's empty -- but not if
3688 `string1' is null. */
3689 if (size2 == 0 && string1 != NULL)
3691 string2 = string1;
3692 size2 = size1;
3693 string1 = 0;
3694 size1 = 0;
3696 end1 = string1 + size1;
3697 end2 = string2 + size2;
3699 /* Compute where to stop matching, within the two strings. */
3700 if (stop <= size1)
3702 end_match_1 = string1 + stop;
3703 end_match_2 = string2;
3705 else
3707 end_match_1 = end1;
3708 end_match_2 = string2 + stop - size1;
3711 /* `p' scans through the pattern as `d' scans through the data.
3712 `dend' is the end of the input string that `d' points within. `d'
3713 is advanced into the following input string whenever necessary, but
3714 this happens before fetching; therefore, at the beginning of the
3715 loop, `d' can be pointing at the end of a string, but it cannot
3716 equal `string2'. */
3717 if (size1 > 0 && pos <= size1)
3719 d = string1 + pos;
3720 dend = end_match_1;
3722 else
3724 d = string2 + pos - size1;
3725 dend = end_match_2;
3728 DEBUG_PRINT1 ("The compiled pattern is: ");
3729 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3730 DEBUG_PRINT1 ("The string to match is: `");
3731 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3732 DEBUG_PRINT1 ("'\n");
3734 /* This loops over pattern commands. It exits by returning from the
3735 function if the match is complete, or it drops through if the match
3736 fails at this starting point in the input data. */
3737 for (;;)
3739 DEBUG_PRINT2 ("\n0x%x: ", p);
3741 if (p == pend)
3742 { /* End of pattern means we might have succeeded. */
3743 DEBUG_PRINT1 ("end of pattern ... ");
3745 /* If we haven't matched the entire string, and we want the
3746 longest match, try backtracking. */
3747 if (d != end_match_2)
3749 /* 1 if this match ends in the same string (string1 or string2)
3750 as the best previous match. */
3751 boolean same_str_p = (FIRST_STRING_P (match_end)
3752 == MATCHING_IN_FIRST_STRING);
3753 /* 1 if this match is the best seen so far. */
3754 boolean best_match_p;
3756 /* AIX compiler got confused when this was combined
3757 with the previous declaration. */
3758 if (same_str_p)
3759 best_match_p = d > match_end;
3760 else
3761 best_match_p = !MATCHING_IN_FIRST_STRING;
3763 DEBUG_PRINT1 ("backtracking.\n");
3765 if (!FAIL_STACK_EMPTY ())
3766 { /* More failure points to try. */
3768 /* If exceeds best match so far, save it. */
3769 if (!best_regs_set || best_match_p)
3771 best_regs_set = true;
3772 match_end = d;
3774 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3776 for (mcnt = 1; mcnt < num_regs; mcnt++)
3778 best_regstart[mcnt] = regstart[mcnt];
3779 best_regend[mcnt] = regend[mcnt];
3782 goto fail;
3785 /* If no failure points, don't restore garbage. And if
3786 last match is real best match, don't restore second
3787 best one. */
3788 else if (best_regs_set && !best_match_p)
3790 restore_best_regs:
3791 /* Restore best match. It may happen that `dend ==
3792 end_match_1' while the restored d is in string2.
3793 For example, the pattern `x.*y.*z' against the
3794 strings `x-' and `y-z-', if the two strings are
3795 not consecutive in memory. */
3796 DEBUG_PRINT1 ("Restoring best registers.\n");
3798 d = match_end;
3799 dend = ((d >= string1 && d <= end1)
3800 ? end_match_1 : end_match_2);
3802 for (mcnt = 1; mcnt < num_regs; mcnt++)
3804 regstart[mcnt] = best_regstart[mcnt];
3805 regend[mcnt] = best_regend[mcnt];
3808 } /* d != end_match_2 */
3810 succeed_label:
3811 DEBUG_PRINT1 ("Accepting match.\n");
3813 /* If caller wants register contents data back, do it. */
3814 if (regs && !bufp->no_sub)
3816 /* Have the register data arrays been allocated? */
3817 if (bufp->regs_allocated == REGS_UNALLOCATED)
3818 { /* No. So allocate them with malloc. We need one
3819 extra element beyond `num_regs' for the `-1' marker
3820 GNU code uses. */
3821 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3822 regs->start = TALLOC (regs->num_regs, regoff_t);
3823 regs->end = TALLOC (regs->num_regs, regoff_t);
3824 if (regs->start == NULL || regs->end == NULL)
3826 FREE_VARIABLES ();
3827 return -2;
3829 bufp->regs_allocated = REGS_REALLOCATE;
3831 else if (bufp->regs_allocated == REGS_REALLOCATE)
3832 { /* Yes. If we need more elements than were already
3833 allocated, reallocate them. If we need fewer, just
3834 leave it alone. */
3835 if (regs->num_regs < num_regs + 1)
3837 regs->num_regs = num_regs + 1;
3838 RETALLOC (regs->start, regs->num_regs, regoff_t);
3839 RETALLOC (regs->end, regs->num_regs, regoff_t);
3840 if (regs->start == NULL || regs->end == NULL)
3842 FREE_VARIABLES ();
3843 return -2;
3847 else
3849 /* These braces fend off a "empty body in an else-statement"
3850 warning under GCC when assert expands to nothing. */
3851 assert (bufp->regs_allocated == REGS_FIXED);
3854 /* Convert the pointer data in `regstart' and `regend' to
3855 indices. Register zero has to be set differently,
3856 since we haven't kept track of any info for it. */
3857 if (regs->num_regs > 0)
3859 regs->start[0] = pos;
3860 regs->end[0] = (MATCHING_IN_FIRST_STRING
3861 ? ((regoff_t) (d - string1))
3862 : ((regoff_t) (d - string2 + size1)));
3865 /* Go through the first `min (num_regs, regs->num_regs)'
3866 registers, since that is all we initialized. */
3867 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3869 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3870 regs->start[mcnt] = regs->end[mcnt] = -1;
3871 else
3873 regs->start[mcnt]
3874 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
3875 regs->end[mcnt]
3876 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
3880 /* If the regs structure we return has more elements than
3881 were in the pattern, set the extra elements to -1. If
3882 we (re)allocated the registers, this is the case,
3883 because we always allocate enough to have at least one
3884 -1 at the end. */
3885 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3886 regs->start[mcnt] = regs->end[mcnt] = -1;
3887 } /* regs && !bufp->no_sub */
3889 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3890 nfailure_points_pushed, nfailure_points_popped,
3891 nfailure_points_pushed - nfailure_points_popped);
3892 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3894 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3895 ? string1
3896 : string2 - size1);
3898 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3900 FREE_VARIABLES ();
3901 return mcnt;
3904 /* Otherwise match next pattern command. */
3905 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3907 /* Ignore these. Used to ignore the n of succeed_n's which
3908 currently have n == 0. */
3909 case no_op:
3910 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3911 break;
3913 case succeed:
3914 DEBUG_PRINT1 ("EXECUTING succeed.\n");
3915 goto succeed_label;
3917 /* Match the next n pattern characters exactly. The following
3918 byte in the pattern defines n, and the n bytes after that
3919 are the characters to match. */
3920 case exactn:
3921 mcnt = *p++;
3922 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3924 /* This is written out as an if-else so we don't waste time
3925 testing `translate' inside the loop. */
3926 if (translate)
3930 PREFETCH ();
3931 if (translate[(unsigned char) *d++] != (char) *p++)
3932 goto fail;
3934 while (--mcnt);
3936 else
3940 PREFETCH ();
3941 if (*d++ != (char) *p++) goto fail;
3943 while (--mcnt);
3945 SET_REGS_MATCHED ();
3946 break;
3949 /* Match any character except possibly a newline or a null. */
3950 case anychar:
3951 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3953 PREFETCH ();
3955 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3956 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3957 goto fail;
3959 SET_REGS_MATCHED ();
3960 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3961 d++;
3962 break;
3965 case charset:
3966 case charset_not:
3968 register unsigned char c;
3969 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3971 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3973 PREFETCH ();
3974 c = TRANSLATE (*d); /* The character to match. */
3976 /* Cast to `unsigned' instead of `unsigned char' in case the
3977 bit list is a full 32 bytes long. */
3978 if (c < (unsigned) (*p * BYTEWIDTH)
3979 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3980 not = !not;
3982 p += 1 + *p;
3984 if (!not) goto fail;
3986 SET_REGS_MATCHED ();
3987 d++;
3988 break;
3992 /* The beginning of a group is represented by start_memory.
3993 The arguments are the register number in the next byte, and the
3994 number of groups inner to this one in the next. The text
3995 matched within the group is recorded (in the internal
3996 registers data structure) under the register number. */
3997 case start_memory:
3998 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4000 /* Find out if this group can match the empty string. */
4001 p1 = p; /* To send to group_match_null_string_p. */
4003 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4004 REG_MATCH_NULL_STRING_P (reg_info[*p])
4005 = group_match_null_string_p (&p1, pend, reg_info);
4007 /* Save the position in the string where we were the last time
4008 we were at this open-group operator in case the group is
4009 operated upon by a repetition operator, e.g., with `(a*)*b'
4010 against `ab'; then we want to ignore where we are now in
4011 the string in case this attempt to match fails. */
4012 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4013 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4014 : regstart[*p];
4015 DEBUG_PRINT2 (" old_regstart: %d\n",
4016 POINTER_TO_OFFSET (old_regstart[*p]));
4018 regstart[*p] = d;
4019 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4021 IS_ACTIVE (reg_info[*p]) = 1;
4022 MATCHED_SOMETHING (reg_info[*p]) = 0;
4024 /* Clear this whenever we change the register activity status. */
4025 set_regs_matched_done = 0;
4027 /* This is the new highest active register. */
4028 highest_active_reg = *p;
4030 /* If nothing was active before, this is the new lowest active
4031 register. */
4032 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4033 lowest_active_reg = *p;
4035 /* Move past the register number and inner group count. */
4036 p += 2;
4037 just_past_start_mem = p;
4039 break;
4042 /* The stop_memory opcode represents the end of a group. Its
4043 arguments are the same as start_memory's: the register
4044 number, and the number of inner groups. */
4045 case stop_memory:
4046 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4048 /* We need to save the string position the last time we were at
4049 this close-group operator in case the group is operated
4050 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4051 against `aba'; then we want to ignore where we are now in
4052 the string in case this attempt to match fails. */
4053 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4054 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4055 : regend[*p];
4056 DEBUG_PRINT2 (" old_regend: %d\n",
4057 POINTER_TO_OFFSET (old_regend[*p]));
4059 regend[*p] = d;
4060 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4062 /* This register isn't active anymore. */
4063 IS_ACTIVE (reg_info[*p]) = 0;
4065 /* Clear this whenever we change the register activity status. */
4066 set_regs_matched_done = 0;
4068 /* If this was the only register active, nothing is active
4069 anymore. */
4070 if (lowest_active_reg == highest_active_reg)
4072 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4073 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4075 else
4076 { /* We must scan for the new highest active register, since
4077 it isn't necessarily one less than now: consider
4078 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4079 new highest active register is 1. */
4080 unsigned char r = *p - 1;
4081 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4082 r--;
4084 /* If we end up at register zero, that means that we saved
4085 the registers as the result of an `on_failure_jump', not
4086 a `start_memory', and we jumped to past the innermost
4087 `stop_memory'. For example, in ((.)*) we save
4088 registers 1 and 2 as a result of the *, but when we pop
4089 back to the second ), we are at the stop_memory 1.
4090 Thus, nothing is active. */
4091 if (r == 0)
4093 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4094 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4096 else
4097 highest_active_reg = r;
4100 /* If just failed to match something this time around with a
4101 group that's operated on by a repetition operator, try to
4102 force exit from the ``loop'', and restore the register
4103 information for this group that we had before trying this
4104 last match. */
4105 if ((!MATCHED_SOMETHING (reg_info[*p])
4106 || just_past_start_mem == p - 1)
4107 && (p + 2) < pend)
4109 boolean is_a_jump_n = false;
4111 p1 = p + 2;
4112 mcnt = 0;
4113 switch ((re_opcode_t) *p1++)
4115 case jump_n:
4116 is_a_jump_n = true;
4117 case pop_failure_jump:
4118 case maybe_pop_jump:
4119 case jump:
4120 case dummy_failure_jump:
4121 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4122 if (is_a_jump_n)
4123 p1 += 2;
4124 break;
4126 default:
4127 /* do nothing */ ;
4129 p1 += mcnt;
4131 /* If the next operation is a jump backwards in the pattern
4132 to an on_failure_jump right before the start_memory
4133 corresponding to this stop_memory, exit from the loop
4134 by forcing a failure after pushing on the stack the
4135 on_failure_jump's jump in the pattern, and d. */
4136 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4137 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4139 /* If this group ever matched anything, then restore
4140 what its registers were before trying this last
4141 failed match, e.g., with `(a*)*b' against `ab' for
4142 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4143 against `aba' for regend[3].
4145 Also restore the registers for inner groups for,
4146 e.g., `((a*)(b*))*' against `aba' (register 3 would
4147 otherwise get trashed). */
4149 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4151 unsigned r;
4153 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4155 /* Restore this and inner groups' (if any) registers. */
4156 for (r = *p; r < *p + *(p + 1); r++)
4158 regstart[r] = old_regstart[r];
4160 /* xx why this test? */
4161 if (old_regend[r] >= regstart[r])
4162 regend[r] = old_regend[r];
4165 p1++;
4166 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4167 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4169 goto fail;
4173 /* Move past the register number and the inner group count. */
4174 p += 2;
4175 break;
4178 /* \<digit> has been turned into a `duplicate' command which is
4179 followed by the numeric value of <digit> as the register number. */
4180 case duplicate:
4182 register const char *d2, *dend2;
4183 int regno = *p++; /* Get which register to match against. */
4184 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4186 /* Can't back reference a group which we've never matched. */
4187 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4188 goto fail;
4190 /* Where in input to try to start matching. */
4191 d2 = regstart[regno];
4193 /* Where to stop matching; if both the place to start and
4194 the place to stop matching are in the same string, then
4195 set to the place to stop, otherwise, for now have to use
4196 the end of the first string. */
4198 dend2 = ((FIRST_STRING_P (regstart[regno])
4199 == FIRST_STRING_P (regend[regno]))
4200 ? regend[regno] : end_match_1);
4201 for (;;)
4203 /* If necessary, advance to next segment in register
4204 contents. */
4205 while (d2 == dend2)
4207 if (dend2 == end_match_2) break;
4208 if (dend2 == regend[regno]) break;
4210 /* End of string1 => advance to string2. */
4211 d2 = string2;
4212 dend2 = regend[regno];
4214 /* At end of register contents => success */
4215 if (d2 == dend2) break;
4217 /* If necessary, advance to next segment in data. */
4218 PREFETCH ();
4220 /* How many characters left in this segment to match. */
4221 mcnt = dend - d;
4223 /* Want how many consecutive characters we can match in
4224 one shot, so, if necessary, adjust the count. */
4225 if (mcnt > dend2 - d2)
4226 mcnt = dend2 - d2;
4228 /* Compare that many; failure if mismatch, else move
4229 past them. */
4230 if (translate
4231 ? bcmp_translate (d, d2, mcnt, translate)
4232 : bcmp (d, d2, mcnt))
4233 goto fail;
4234 d += mcnt, d2 += mcnt;
4236 /* Do this because we've match some characters. */
4237 SET_REGS_MATCHED ();
4240 break;
4243 /* begline matches the empty string at the beginning of the string
4244 (unless `not_bol' is set in `bufp'), and, if
4245 `newline_anchor' is set, after newlines. */
4246 case begline:
4247 DEBUG_PRINT1 ("EXECUTING begline.\n");
4249 if (AT_STRINGS_BEG (d))
4251 if (!bufp->not_bol) break;
4253 else if (d[-1] == '\n' && bufp->newline_anchor)
4255 break;
4257 /* In all other cases, we fail. */
4258 goto fail;
4261 /* endline is the dual of begline. */
4262 case endline:
4263 DEBUG_PRINT1 ("EXECUTING endline.\n");
4265 if (AT_STRINGS_END (d))
4267 if (!bufp->not_eol) break;
4270 /* We have to ``prefetch'' the next character. */
4271 else if ((d == end1 ? *string2 : *d) == '\n'
4272 && bufp->newline_anchor)
4274 break;
4276 goto fail;
4279 /* Match at the very beginning of the data. */
4280 case begbuf:
4281 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
4282 if (AT_STRINGS_BEG (d))
4283 break;
4284 goto fail;
4287 /* Match at the very end of the data. */
4288 case endbuf:
4289 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
4290 if (AT_STRINGS_END (d))
4291 break;
4292 goto fail;
4295 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
4296 pushes NULL as the value for the string on the stack. Then
4297 `pop_failure_point' will keep the current value for the
4298 string, instead of restoring it. To see why, consider
4299 matching `foo\nbar' against `.*\n'. The .* matches the foo;
4300 then the . fails against the \n. But the next thing we want
4301 to do is match the \n against the \n; if we restored the
4302 string value, we would be back at the foo.
4304 Because this is used only in specific cases, we don't need to
4305 check all the things that `on_failure_jump' does, to make
4306 sure the right things get saved on the stack. Hence we don't
4307 share its code. The only reason to push anything on the
4308 stack at all is that otherwise we would have to change
4309 `anychar's code to do something besides goto fail in this
4310 case; that seems worse than this. */
4311 case on_failure_keep_string_jump:
4312 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
4314 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4315 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
4317 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
4318 break;
4321 /* Uses of on_failure_jump:
4323 Each alternative starts with an on_failure_jump that points
4324 to the beginning of the next alternative. Each alternative
4325 except the last ends with a jump that in effect jumps past
4326 the rest of the alternatives. (They really jump to the
4327 ending jump of the following alternative, because tensioning
4328 these jumps is a hassle.)
4330 Repeats start with an on_failure_jump that points past both
4331 the repetition text and either the following jump or
4332 pop_failure_jump back to this on_failure_jump. */
4333 case on_failure_jump:
4334 on_failure:
4335 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
4337 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4338 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
4340 /* If this on_failure_jump comes right before a group (i.e.,
4341 the original * applied to a group), save the information
4342 for that group and all inner ones, so that if we fail back
4343 to this point, the group's information will be correct.
4344 For example, in \(a*\)*\1, we need the preceding group,
4345 and in \(\(a*\)b*\)\2, we need the inner group. */
4347 /* We can't use `p' to check ahead because we push
4348 a failure point to `p + mcnt' after we do this. */
4349 p1 = p;
4351 /* We need to skip no_op's before we look for the
4352 start_memory in case this on_failure_jump is happening as
4353 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
4354 against aba. */
4355 while (p1 < pend && (re_opcode_t) *p1 == no_op)
4356 p1++;
4358 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
4360 /* We have a new highest active register now. This will
4361 get reset at the start_memory we are about to get to,
4362 but we will have saved all the registers relevant to
4363 this repetition op, as described above. */
4364 highest_active_reg = *(p1 + 1) + *(p1 + 2);
4365 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4366 lowest_active_reg = *(p1 + 1);
4369 DEBUG_PRINT1 (":\n");
4370 PUSH_FAILURE_POINT (p + mcnt, d, -2);
4371 break;
4374 /* A smart repeat ends with `maybe_pop_jump'.
4375 We change it to either `pop_failure_jump' or `jump'. */
4376 case maybe_pop_jump:
4377 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4378 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4380 register unsigned char *p2 = p;
4382 /* Compare the beginning of the repeat with what in the
4383 pattern follows its end. If we can establish that there
4384 is nothing that they would both match, i.e., that we
4385 would have to backtrack because of (as in, e.g., `a*a')
4386 then we can change to pop_failure_jump, because we'll
4387 never have to backtrack.
4389 This is not true in the case of alternatives: in
4390 `(a|ab)*' we do need to backtrack to the `ab' alternative
4391 (e.g., if the string was `ab'). But instead of trying to
4392 detect that here, the alternative has put on a dummy
4393 failure point which is what we will end up popping. */
4395 /* Skip over open/close-group commands.
4396 If what follows this loop is a ...+ construct,
4397 look at what begins its body, since we will have to
4398 match at least one of that. */
4399 while (1)
4401 if (p2 + 2 < pend
4402 && ((re_opcode_t) *p2 == stop_memory
4403 || (re_opcode_t) *p2 == start_memory))
4404 p2 += 3;
4405 else if (p2 + 6 < pend
4406 && (re_opcode_t) *p2 == dummy_failure_jump)
4407 p2 += 6;
4408 else
4409 break;
4412 p1 = p + mcnt;
4413 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4414 to the `maybe_finalize_jump' of this case. Examine what
4415 follows. */
4417 /* If we're at the end of the pattern, we can change. */
4418 if (p2 == pend)
4420 /* Consider what happens when matching ":\(.*\)"
4421 against ":/". I don't really understand this code
4422 yet. */
4423 p[-3] = (unsigned char) pop_failure_jump;
4424 DEBUG_PRINT1
4425 (" End of pattern: change to `pop_failure_jump'.\n");
4428 else if ((re_opcode_t) *p2 == exactn
4429 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4431 register unsigned char c
4432 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4434 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4436 p[-3] = (unsigned char) pop_failure_jump;
4437 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4438 c, p1[5]);
4441 else if ((re_opcode_t) p1[3] == charset
4442 || (re_opcode_t) p1[3] == charset_not)
4444 int not = (re_opcode_t) p1[3] == charset_not;
4446 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4447 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4448 not = !not;
4450 /* `not' is equal to 1 if c would match, which means
4451 that we can't change to pop_failure_jump. */
4452 if (!not)
4454 p[-3] = (unsigned char) pop_failure_jump;
4455 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4459 else if ((re_opcode_t) *p2 == charset)
4461 #ifdef DEBUG
4462 register unsigned char c
4463 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4464 #endif
4466 if ((re_opcode_t) p1[3] == exactn
4467 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
4468 && (p2[1 + p1[4] / BYTEWIDTH]
4469 & (1 << (p1[4] % BYTEWIDTH)))))
4471 p[-3] = (unsigned char) pop_failure_jump;
4472 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4473 c, p1[5]);
4476 else if ((re_opcode_t) p1[3] == charset_not)
4478 int idx;
4479 /* We win if the charset_not inside the loop
4480 lists every character listed in the charset after. */
4481 for (idx = 0; idx < (int) p2[1]; idx++)
4482 if (! (p2[2 + idx] == 0
4483 || (idx < (int) p1[4]
4484 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
4485 break;
4487 if (idx == p2[1])
4489 p[-3] = (unsigned char) pop_failure_jump;
4490 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4493 else if ((re_opcode_t) p1[3] == charset)
4495 int idx;
4496 /* We win if the charset inside the loop
4497 has no overlap with the one after the loop. */
4498 for (idx = 0;
4499 idx < (int) p2[1] && idx < (int) p1[4];
4500 idx++)
4501 if ((p2[2 + idx] & p1[5 + idx]) != 0)
4502 break;
4504 if (idx == p2[1] || idx == p1[4])
4506 p[-3] = (unsigned char) pop_failure_jump;
4507 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4512 p -= 2; /* Point at relative address again. */
4513 if ((re_opcode_t) p[-1] != pop_failure_jump)
4515 p[-1] = (unsigned char) jump;
4516 DEBUG_PRINT1 (" Match => jump.\n");
4517 goto unconditional_jump;
4519 /* Note fall through. */
4522 /* The end of a simple repeat has a pop_failure_jump back to
4523 its matching on_failure_jump, where the latter will push a
4524 failure point. The pop_failure_jump takes off failure
4525 points put on by this pop_failure_jump's matching
4526 on_failure_jump; we got through the pattern to here from the
4527 matching on_failure_jump, so didn't fail. */
4528 case pop_failure_jump:
4530 /* We need to pass separate storage for the lowest and
4531 highest registers, even though we don't care about the
4532 actual values. Otherwise, we will restore only one
4533 register from the stack, since lowest will == highest in
4534 `pop_failure_point'. */
4535 unsigned dummy_low_reg, dummy_high_reg;
4536 unsigned char *pdummy;
4537 const char *sdummy;
4539 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4540 POP_FAILURE_POINT (sdummy, pdummy,
4541 dummy_low_reg, dummy_high_reg,
4542 reg_dummy, reg_dummy, reg_info_dummy);
4544 /* Note fall through. */
4547 /* Unconditionally jump (without popping any failure points). */
4548 case jump:
4549 unconditional_jump:
4550 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4551 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4552 p += mcnt; /* Do the jump. */
4553 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4554 break;
4557 /* We need this opcode so we can detect where alternatives end
4558 in `group_match_null_string_p' et al. */
4559 case jump_past_alt:
4560 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4561 goto unconditional_jump;
4564 /* Normally, the on_failure_jump pushes a failure point, which
4565 then gets popped at pop_failure_jump. We will end up at
4566 pop_failure_jump, also, and with a pattern of, say, `a+', we
4567 are skipping over the on_failure_jump, so we have to push
4568 something meaningless for pop_failure_jump to pop. */
4569 case dummy_failure_jump:
4570 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4571 /* It doesn't matter what we push for the string here. What
4572 the code at `fail' tests is the value for the pattern. */
4573 PUSH_FAILURE_POINT (0, 0, -2);
4574 goto unconditional_jump;
4577 /* At the end of an alternative, we need to push a dummy failure
4578 point in case we are followed by a `pop_failure_jump', because
4579 we don't want the failure point for the alternative to be
4580 popped. For example, matching `(a|ab)*' against `aab'
4581 requires that we match the `ab' alternative. */
4582 case push_dummy_failure:
4583 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4584 /* See comments just above at `dummy_failure_jump' about the
4585 two zeroes. */
4586 PUSH_FAILURE_POINT (0, 0, -2);
4587 break;
4589 /* Have to succeed matching what follows at least n times.
4590 After that, handle like `on_failure_jump'. */
4591 case succeed_n:
4592 EXTRACT_NUMBER (mcnt, p + 2);
4593 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4595 assert (mcnt >= 0);
4596 /* Originally, this is how many times we HAVE to succeed. */
4597 if (mcnt > 0)
4599 mcnt--;
4600 p += 2;
4601 STORE_NUMBER_AND_INCR (p, mcnt);
4602 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4604 else if (mcnt == 0)
4606 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4607 p[2] = (unsigned char) no_op;
4608 p[3] = (unsigned char) no_op;
4609 goto on_failure;
4611 break;
4613 case jump_n:
4614 EXTRACT_NUMBER (mcnt, p + 2);
4615 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4617 /* Originally, this is how many times we CAN jump. */
4618 if (mcnt)
4620 mcnt--;
4621 STORE_NUMBER (p + 2, mcnt);
4622 goto unconditional_jump;
4624 /* If don't have to jump any more, skip over the rest of command. */
4625 else
4626 p += 4;
4627 break;
4629 case set_number_at:
4631 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4633 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4634 p1 = p + mcnt;
4635 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4636 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4637 STORE_NUMBER (p1, mcnt);
4638 break;
4641 case wordbound:
4642 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4643 if (AT_WORD_BOUNDARY (d))
4644 break;
4645 goto fail;
4647 case notwordbound:
4648 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4649 if (AT_WORD_BOUNDARY (d))
4650 goto fail;
4651 break;
4653 case wordbeg:
4654 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4655 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4656 break;
4657 goto fail;
4659 case wordend:
4660 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4661 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4662 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4663 break;
4664 goto fail;
4666 #ifdef emacs
4667 case before_dot:
4668 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4669 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4670 goto fail;
4671 break;
4673 case at_dot:
4674 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4675 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4676 goto fail;
4677 break;
4679 case after_dot:
4680 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4681 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4682 goto fail;
4683 break;
4684 #if 0 /* not emacs19 */
4685 case at_dot:
4686 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4687 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4688 goto fail;
4689 break;
4690 #endif /* not emacs19 */
4692 case syntaxspec:
4693 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4694 mcnt = *p++;
4695 goto matchsyntax;
4697 case wordchar:
4698 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4699 mcnt = (int) Sword;
4700 matchsyntax:
4701 PREFETCH ();
4702 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
4703 d++;
4704 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
4705 goto fail;
4706 SET_REGS_MATCHED ();
4707 break;
4709 case notsyntaxspec:
4710 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4711 mcnt = *p++;
4712 goto matchnotsyntax;
4714 case notwordchar:
4715 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4716 mcnt = (int) Sword;
4717 matchnotsyntax:
4718 PREFETCH ();
4719 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
4720 d++;
4721 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
4722 goto fail;
4723 SET_REGS_MATCHED ();
4724 break;
4726 #else /* not emacs */
4727 case wordchar:
4728 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4729 PREFETCH ();
4730 if (!WORDCHAR_P (d))
4731 goto fail;
4732 SET_REGS_MATCHED ();
4733 d++;
4734 break;
4736 case notwordchar:
4737 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4738 PREFETCH ();
4739 if (WORDCHAR_P (d))
4740 goto fail;
4741 SET_REGS_MATCHED ();
4742 d++;
4743 break;
4744 #endif /* not emacs */
4746 default:
4747 abort ();
4749 continue; /* Successfully executed one pattern command; keep going. */
4752 /* We goto here if a matching operation fails. */
4753 fail:
4754 if (!FAIL_STACK_EMPTY ())
4755 { /* A restart point is known. Restore to that state. */
4756 DEBUG_PRINT1 ("\nFAIL:\n");
4757 POP_FAILURE_POINT (d, p,
4758 lowest_active_reg, highest_active_reg,
4759 regstart, regend, reg_info);
4761 /* If this failure point is a dummy, try the next one. */
4762 if (!p)
4763 goto fail;
4765 /* If we failed to the end of the pattern, don't examine *p. */
4766 assert (p <= pend);
4767 if (p < pend)
4769 boolean is_a_jump_n = false;
4771 /* If failed to a backwards jump that's part of a repetition
4772 loop, need to pop this failure point and use the next one. */
4773 switch ((re_opcode_t) *p)
4775 case jump_n:
4776 is_a_jump_n = true;
4777 case maybe_pop_jump:
4778 case pop_failure_jump:
4779 case jump:
4780 p1 = p + 1;
4781 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4782 p1 += mcnt;
4784 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4785 || (!is_a_jump_n
4786 && (re_opcode_t) *p1 == on_failure_jump))
4787 goto fail;
4788 break;
4789 default:
4790 /* do nothing */ ;
4794 if (d >= string1 && d <= end1)
4795 dend = end_match_1;
4797 else
4798 break; /* Matching at this starting point really fails. */
4799 } /* for (;;) */
4801 if (best_regs_set)
4802 goto restore_best_regs;
4804 FREE_VARIABLES ();
4806 return -1; /* Failure to match. */
4807 } /* re_match_2 */
4809 /* Subroutine definitions for re_match_2. */
4812 /* We are passed P pointing to a register number after a start_memory.
4814 Return true if the pattern up to the corresponding stop_memory can
4815 match the empty string, and false otherwise.
4817 If we find the matching stop_memory, sets P to point to one past its number.
4818 Otherwise, sets P to an undefined byte less than or equal to END.
4820 We don't handle duplicates properly (yet). */
4822 static boolean
4823 group_match_null_string_p (p, end, reg_info)
4824 unsigned char **p, *end;
4825 register_info_type *reg_info;
4827 int mcnt;
4828 /* Point to after the args to the start_memory. */
4829 unsigned char *p1 = *p + 2;
4831 while (p1 < end)
4833 /* Skip over opcodes that can match nothing, and return true or
4834 false, as appropriate, when we get to one that can't, or to the
4835 matching stop_memory. */
4837 switch ((re_opcode_t) *p1)
4839 /* Could be either a loop or a series of alternatives. */
4840 case on_failure_jump:
4841 p1++;
4842 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4844 /* If the next operation is not a jump backwards in the
4845 pattern. */
4847 if (mcnt >= 0)
4849 /* Go through the on_failure_jumps of the alternatives,
4850 seeing if any of the alternatives cannot match nothing.
4851 The last alternative starts with only a jump,
4852 whereas the rest start with on_failure_jump and end
4853 with a jump, e.g., here is the pattern for `a|b|c':
4855 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4856 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4857 /exactn/1/c
4859 So, we have to first go through the first (n-1)
4860 alternatives and then deal with the last one separately. */
4863 /* Deal with the first (n-1) alternatives, which start
4864 with an on_failure_jump (see above) that jumps to right
4865 past a jump_past_alt. */
4867 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4869 /* `mcnt' holds how many bytes long the alternative
4870 is, including the ending `jump_past_alt' and
4871 its number. */
4873 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4874 reg_info))
4875 return false;
4877 /* Move to right after this alternative, including the
4878 jump_past_alt. */
4879 p1 += mcnt;
4881 /* Break if it's the beginning of an n-th alternative
4882 that doesn't begin with an on_failure_jump. */
4883 if ((re_opcode_t) *p1 != on_failure_jump)
4884 break;
4886 /* Still have to check that it's not an n-th
4887 alternative that starts with an on_failure_jump. */
4888 p1++;
4889 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4890 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4892 /* Get to the beginning of the n-th alternative. */
4893 p1 -= 3;
4894 break;
4898 /* Deal with the last alternative: go back and get number
4899 of the `jump_past_alt' just before it. `mcnt' contains
4900 the length of the alternative. */
4901 EXTRACT_NUMBER (mcnt, p1 - 2);
4903 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4904 return false;
4906 p1 += mcnt; /* Get past the n-th alternative. */
4907 } /* if mcnt > 0 */
4908 break;
4911 case stop_memory:
4912 assert (p1[1] == **p);
4913 *p = p1 + 2;
4914 return true;
4917 default:
4918 if (!common_op_match_null_string_p (&p1, end, reg_info))
4919 return false;
4921 } /* while p1 < end */
4923 return false;
4924 } /* group_match_null_string_p */
4927 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4928 It expects P to be the first byte of a single alternative and END one
4929 byte past the last. The alternative can contain groups. */
4931 static boolean
4932 alt_match_null_string_p (p, end, reg_info)
4933 unsigned char *p, *end;
4934 register_info_type *reg_info;
4936 int mcnt;
4937 unsigned char *p1 = p;
4939 while (p1 < end)
4941 /* Skip over opcodes that can match nothing, and break when we get
4942 to one that can't. */
4944 switch ((re_opcode_t) *p1)
4946 /* It's a loop. */
4947 case on_failure_jump:
4948 p1++;
4949 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4950 p1 += mcnt;
4951 break;
4953 default:
4954 if (!common_op_match_null_string_p (&p1, end, reg_info))
4955 return false;
4957 } /* while p1 < end */
4959 return true;
4960 } /* alt_match_null_string_p */
4963 /* Deals with the ops common to group_match_null_string_p and
4964 alt_match_null_string_p.
4966 Sets P to one after the op and its arguments, if any. */
4968 static boolean
4969 common_op_match_null_string_p (p, end, reg_info)
4970 unsigned char **p, *end;
4971 register_info_type *reg_info;
4973 int mcnt;
4974 boolean ret;
4975 int reg_no;
4976 unsigned char *p1 = *p;
4978 switch ((re_opcode_t) *p1++)
4980 case no_op:
4981 case begline:
4982 case endline:
4983 case begbuf:
4984 case endbuf:
4985 case wordbeg:
4986 case wordend:
4987 case wordbound:
4988 case notwordbound:
4989 #ifdef emacs
4990 case before_dot:
4991 case at_dot:
4992 case after_dot:
4993 #endif
4994 break;
4996 case start_memory:
4997 reg_no = *p1;
4998 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4999 ret = group_match_null_string_p (&p1, end, reg_info);
5001 /* Have to set this here in case we're checking a group which
5002 contains a group and a back reference to it. */
5004 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5005 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5007 if (!ret)
5008 return false;
5009 break;
5011 /* If this is an optimized succeed_n for zero times, make the jump. */
5012 case jump:
5013 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5014 if (mcnt >= 0)
5015 p1 += mcnt;
5016 else
5017 return false;
5018 break;
5020 case succeed_n:
5021 /* Get to the number of times to succeed. */
5022 p1 += 2;
5023 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5025 if (mcnt == 0)
5027 p1 -= 4;
5028 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5029 p1 += mcnt;
5031 else
5032 return false;
5033 break;
5035 case duplicate:
5036 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5037 return false;
5038 break;
5040 case set_number_at:
5041 p1 += 4;
5043 default:
5044 /* All other opcodes mean we cannot match the empty string. */
5045 return false;
5048 *p = p1;
5049 return true;
5050 } /* common_op_match_null_string_p */
5053 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5054 bytes; nonzero otherwise. */
5056 static int
5057 bcmp_translate (s1, s2, len, translate)
5058 unsigned char *s1, *s2;
5059 register int len;
5060 char *translate;
5062 register unsigned char *p1 = s1, *p2 = s2;
5063 while (len)
5065 if (translate[*p1++] != translate[*p2++]) return 1;
5066 len--;
5068 return 0;
5071 /* Entry points for GNU code. */
5073 /* re_compile_pattern is the GNU regular expression compiler: it
5074 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5075 Returns 0 if the pattern was valid, otherwise an error string.
5077 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5078 are set in BUFP on entry.
5080 We call regex_compile to do the actual compilation. */
5082 const char *
5083 re_compile_pattern (pattern, length, bufp)
5084 const char *pattern;
5085 int length;
5086 struct re_pattern_buffer *bufp;
5088 reg_errcode_t ret;
5090 /* GNU code is written to assume at least RE_NREGS registers will be set
5091 (and at least one extra will be -1). */
5092 bufp->regs_allocated = REGS_UNALLOCATED;
5094 /* And GNU code determines whether or not to get register information
5095 by passing null for the REGS argument to re_match, etc., not by
5096 setting no_sub. */
5097 bufp->no_sub = 0;
5099 /* Match anchors at newline. */
5100 bufp->newline_anchor = 1;
5102 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5104 if (!ret)
5105 return NULL;
5106 return gettext (re_error_msgid[(int) ret]);
5109 /* Entry points compatible with 4.2 BSD regex library. We don't define
5110 them unless specifically requested. */
5112 #ifdef _REGEX_RE_COMP
5114 /* BSD has one and only one pattern buffer. */
5115 static struct re_pattern_buffer re_comp_buf;
5117 char *
5118 re_comp (s)
5119 const char *s;
5121 reg_errcode_t ret;
5123 if (!s)
5125 if (!re_comp_buf.buffer)
5126 return gettext ("No previous regular expression");
5127 return 0;
5130 if (!re_comp_buf.buffer)
5132 re_comp_buf.buffer = (unsigned char *) malloc (200);
5133 if (re_comp_buf.buffer == NULL)
5134 return gettext (re_error_msgid[(int) REG_ESPACE]);
5135 re_comp_buf.allocated = 200;
5137 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5138 if (re_comp_buf.fastmap == NULL)
5139 return gettext (re_error_msgid[(int) REG_ESPACE]);
5142 /* Since `re_exec' always passes NULL for the `regs' argument, we
5143 don't need to initialize the pattern buffer fields which affect it. */
5145 /* Match anchors at newlines. */
5146 re_comp_buf.newline_anchor = 1;
5148 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5150 if (!ret)
5151 return NULL;
5153 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5154 return (char *) gettext (re_error_msgid[(int) ret]);
5159 re_exec (s)
5160 const char *s;
5162 const int len = strlen (s);
5163 return
5164 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5166 #endif /* _REGEX_RE_COMP */
5168 /* POSIX.2 functions. Don't define these for Emacs. */
5170 #ifndef emacs
5172 /* regcomp takes a regular expression as a string and compiles it.
5174 PREG is a regex_t *. We do not expect any fields to be initialized,
5175 since POSIX says we shouldn't. Thus, we set
5177 `buffer' to the compiled pattern;
5178 `used' to the length of the compiled pattern;
5179 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5180 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5181 RE_SYNTAX_POSIX_BASIC;
5182 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5183 `fastmap' and `fastmap_accurate' to zero;
5184 `re_nsub' to the number of subexpressions in PATTERN.
5186 PATTERN is the address of the pattern string.
5188 CFLAGS is a series of bits which affect compilation.
5190 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5191 use POSIX basic syntax.
5193 If REG_NEWLINE is set, then . and [^...] don't match newline.
5194 Also, regexec will try a match beginning after every newline.
5196 If REG_ICASE is set, then we considers upper- and lowercase
5197 versions of letters to be equivalent when matching.
5199 If REG_NOSUB is set, then when PREG is passed to regexec, that
5200 routine will report only success or failure, and nothing about the
5201 registers.
5203 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5204 the return codes and their meanings.) */
5207 regcomp (preg, pattern, cflags)
5208 regex_t *preg;
5209 const char *pattern;
5210 int cflags;
5212 reg_errcode_t ret;
5213 unsigned syntax
5214 = (cflags & REG_EXTENDED) ?
5215 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5217 /* regex_compile will allocate the space for the compiled pattern. */
5218 preg->buffer = 0;
5219 preg->allocated = 0;
5220 preg->used = 0;
5222 /* Don't bother to use a fastmap when searching. This simplifies the
5223 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5224 characters after newlines into the fastmap. This way, we just try
5225 every character. */
5226 preg->fastmap = 0;
5228 if (cflags & REG_ICASE)
5230 unsigned i;
5232 preg->translate = (char *) malloc (CHAR_SET_SIZE);
5233 if (preg->translate == NULL)
5234 return (int) REG_ESPACE;
5236 /* Map uppercase characters to corresponding lowercase ones. */
5237 for (i = 0; i < CHAR_SET_SIZE; i++)
5238 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
5240 else
5241 preg->translate = NULL;
5243 /* If REG_NEWLINE is set, newlines are treated differently. */
5244 if (cflags & REG_NEWLINE)
5245 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5246 syntax &= ~RE_DOT_NEWLINE;
5247 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
5248 /* It also changes the matching behavior. */
5249 preg->newline_anchor = 1;
5251 else
5252 preg->newline_anchor = 0;
5254 preg->no_sub = !!(cflags & REG_NOSUB);
5256 /* POSIX says a null character in the pattern terminates it, so we
5257 can use strlen here in compiling the pattern. */
5258 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5260 /* POSIX doesn't distinguish between an unmatched open-group and an
5261 unmatched close-group: both are REG_EPAREN. */
5262 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5264 return (int) ret;
5268 /* regexec searches for a given pattern, specified by PREG, in the
5269 string STRING.
5271 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5272 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5273 least NMATCH elements, and we set them to the offsets of the
5274 corresponding matched substrings.
5276 EFLAGS specifies `execution flags' which affect matching: if
5277 REG_NOTBOL is set, then ^ does not match at the beginning of the
5278 string; if REG_NOTEOL is set, then $ does not match at the end.
5280 We return 0 if we find a match and REG_NOMATCH if not. */
5283 regexec (preg, string, nmatch, pmatch, eflags)
5284 const regex_t *preg;
5285 const char *string;
5286 size_t nmatch;
5287 regmatch_t pmatch[];
5288 int eflags;
5290 int ret;
5291 struct re_registers regs;
5292 regex_t private_preg;
5293 int len = strlen (string);
5294 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5296 private_preg = *preg;
5298 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5299 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5301 /* The user has told us exactly how many registers to return
5302 information about, via `nmatch'. We have to pass that on to the
5303 matching routines. */
5304 private_preg.regs_allocated = REGS_FIXED;
5306 if (want_reg_info)
5308 regs.num_regs = nmatch;
5309 regs.start = TALLOC (nmatch, regoff_t);
5310 regs.end = TALLOC (nmatch, regoff_t);
5311 if (regs.start == NULL || regs.end == NULL)
5312 return (int) REG_NOMATCH;
5315 /* Perform the searching operation. */
5316 ret = re_search (&private_preg, string, len,
5317 /* start: */ 0, /* range: */ len,
5318 want_reg_info ? &regs : (struct re_registers *) 0);
5320 /* Copy the register information to the POSIX structure. */
5321 if (want_reg_info)
5323 if (ret >= 0)
5325 unsigned r;
5327 for (r = 0; r < nmatch; r++)
5329 pmatch[r].rm_so = regs.start[r];
5330 pmatch[r].rm_eo = regs.end[r];
5334 /* If we needed the temporary register info, free the space now. */
5335 free (regs.start);
5336 free (regs.end);
5339 /* We want zero return to mean success, unlike `re_search'. */
5340 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5344 /* Returns a message corresponding to an error code, ERRCODE, returned
5345 from either regcomp or regexec. We don't use PREG here. */
5347 size_t
5348 regerror (errcode, preg, errbuf, errbuf_size)
5349 int errcode;
5350 const regex_t *preg;
5351 char *errbuf;
5352 size_t errbuf_size;
5354 const char *msg;
5355 size_t msg_size;
5357 if (errcode < 0
5358 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5359 /* Only error codes returned by the rest of the code should be passed
5360 to this routine. If we are given anything else, or if other regex
5361 code generates an invalid error code, then the program has a bug.
5362 Dump core so we can fix it. */
5363 abort ();
5365 msg = gettext (re_error_msgid[errcode]);
5367 msg_size = strlen (msg) + 1; /* Includes the null. */
5369 if (errbuf_size != 0)
5371 if (msg_size > errbuf_size)
5373 strncpy (errbuf, msg, errbuf_size - 1);
5374 errbuf[errbuf_size - 1] = 0;
5376 else
5377 strcpy (errbuf, msg);
5380 return msg_size;
5384 /* Free dynamically allocated space used by PREG. */
5386 void
5387 regfree (preg)
5388 regex_t *preg;
5390 if (preg->buffer != NULL)
5391 free (preg->buffer);
5392 preg->buffer = NULL;
5394 preg->allocated = 0;
5395 preg->used = 0;
5397 if (preg->fastmap != NULL)
5398 free (preg->fastmap);
5399 preg->fastmap = NULL;
5400 preg->fastmap_accurate = 0;
5402 if (preg->translate != NULL)
5403 free (preg->translate);
5404 preg->translate = NULL;
5407 #endif /* not emacs */
5410 Local variables:
5411 make-backup-files: t
5412 version-control: t
5413 trim-versions-without-asking: nil
5414 End: