1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
19 /* To test, compile with -Dtest. This Dtestable feature turns this into
20 a self-contained program which reads a pattern, describes how it
21 compiles, then reads a string and searches for it.
23 On the other hand, if you compile with both -Dtest and -Dcanned you
24 can run some tests we've already thought of. */
26 /* AIX requires the alloca decl to be the first thing in the file. */
28 #define alloca __builtin_alloca
43 /* The `emacs' switch turns on certain special matching commands
44 that make sense only in emacs. */
53 #if defined (USG) || defined (STDC_HEADERS)
56 #define bcopy(s,d,n) memcpy((d),(s),(n))
57 #define bcmp(s1,s2,n) memcmp((s1),(s2),(n))
58 #define bzero(s,n) memset((s),0,(n))
69 /* Define the syntax stuff, so we can do the \<, \>, etc. */
71 /* This must be nonzero for the wordchar and notwordchar pattern
72 commands in re_match_2. */
77 #define SYNTAX(c) re_syntax_table[c]
82 char *re_syntax_table
;
84 #else /* not SYNTAX_TABLE */
86 static char re_syntax_table
[256];
98 bzero (re_syntax_table
, sizeof re_syntax_table
);
100 for (c
= 'a'; c
<= 'z'; c
++)
101 re_syntax_table
[c
] = Sword
;
103 for (c
= 'A'; c
<= 'Z'; c
++)
104 re_syntax_table
[c
] = Sword
;
106 for (c
= '0'; c
<= '9'; c
++)
107 re_syntax_table
[c
] = Sword
;
112 #endif /* SYNTAX_TABLE */
115 /* We write fatal error messages on standard error. */
118 /* isalpha(3) etc. are used for the character classes. */
120 /* Sequents are missing isgraph. */
122 #define isgraph(c) (isprint((c)) && !isspace((c)))
125 /* Get the interface, including the syntax bits. */
129 /* These are the command codes that appear in compiled regular
130 expressions, one per byte. Some command codes are followed by
131 argument bytes. A command code can specify any interpretation
132 whatsoever for its arguments. Zero-bytes may appear in the compiled
135 The value of `exactn' is needed in search.c (search_buffer) in emacs.
136 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
137 `exactn' we use here must also be 1. */
142 exactn
=1, /* Followed by one byte giving n, then by n literal bytes. */
143 begline
, /* Fail unless at beginning of line. */
144 endline
, /* Fail unless at end of line. */
145 jump
, /* Followed by two bytes giving relative address to jump to. */
146 on_failure_jump
, /* Followed by two bytes giving relative address of
147 place to resume at in case of failure. */
148 finalize_jump
, /* Throw away latest failure point and then jump to
150 maybe_finalize_jump
, /* Like jump but finalize if safe to do so.
151 This is used to jump back to the beginning
152 of a repeat. If the command that follows
153 this jump is clearly incompatible with the
154 one at the beginning of the repeat, such that
155 we can be sure that there is no use backtracking
156 out of repetitions already completed,
158 dummy_failure_jump
, /* Jump, and push a dummy failure point. This
159 failure point will be thrown away if an attempt
160 is made to use it for a failure. A + construct
161 makes this before the first repeat. Also
162 use it as an intermediary kind of jump when
163 compiling an or construct. */
164 succeed_n
, /* Used like on_failure_jump except has to succeed n times;
165 then gets turned into an on_failure_jump. The relative
166 address following it is useless until then. The
167 address is followed by two bytes containing n. */
168 jump_n
, /* Similar to jump, but jump n times only; also the relative
169 address following is in turn followed by yet two more bytes
171 set_number_at
, /* Set the following relative location to the
172 subsequent number. */
173 anychar
, /* Matches any (more or less) one character. */
174 charset
, /* Matches any one char belonging to specified set.
175 First following byte is number of bitmap bytes.
176 Then come bytes for a bitmap saying which chars are in.
177 Bits in each byte are ordered low-bit-first.
178 A character is in the set if its bit is 1.
179 A character too large to have a bit in the map
180 is automatically not in the set. */
181 charset_not
, /* Same parameters as charset, but match any character
182 that is not one of those specified. */
183 start_memory
, /* Start remembering the text that is matched, for
184 storing in a memory register. Followed by one
185 byte containing the register number. Register numbers
186 must be in the range 0 through RE_NREGS. */
187 stop_memory
, /* Stop remembering the text that is matched
188 and store it in a memory register. Followed by
189 one byte containing the register number. Register
190 numbers must be in the range 0 through RE_NREGS. */
191 duplicate
, /* Match a duplicate of something remembered.
192 Followed by one byte containing the index of the memory
194 before_dot
, /* Succeeds if before point. */
195 at_dot
, /* Succeeds if at point. */
196 after_dot
, /* Succeeds if after point. */
197 begbuf
, /* Succeeds if at beginning of buffer. */
198 endbuf
, /* Succeeds if at end of buffer. */
199 wordchar
, /* Matches any word-constituent character. */
200 notwordchar
, /* Matches any char that is not a word-constituent. */
201 wordbeg
, /* Succeeds if at word beginning. */
202 wordend
, /* Succeeds if at word end. */
203 wordbound
, /* Succeeds if at a word boundary. */
204 notwordbound
,/* Succeeds if not at a word boundary. */
205 syntaxspec
, /* Matches any character whose syntax is specified.
206 followed by a byte which contains a syntax code,
208 notsyntaxspec
/* Matches any character whose syntax differs from
213 /* Number of failure points to allocate space for initially,
214 when matching. If this number is exceeded, more space is allocated,
215 so it is not a hard limit. */
222 #define SIGN_EXTEND_CHAR(c) ((c)>(char)127?(c)-256:(c)) /* for IBM RT */
224 #ifndef SIGN_EXTEND_CHAR
225 #define SIGN_EXTEND_CHAR(x) (x)
229 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
230 #define STORE_NUMBER(destination, number) \
231 { (destination)[0] = (number) & 0377; \
232 (destination)[1] = (number) >> 8; }
234 /* Same as STORE_NUMBER, except increment the destination pointer to
235 the byte after where the number is stored. Watch out that values for
236 DESTINATION such as p + 1 won't work, whereas p will. */
237 #define STORE_NUMBER_AND_INCR(destination, number) \
238 { STORE_NUMBER(destination, number); \
239 (destination) += 2; }
242 /* Put into DESTINATION a number stored in two contingous bytes starting
244 #define EXTRACT_NUMBER(destination, source) \
245 { (destination) = *(source) & 0377; \
246 (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; }
248 /* Same as EXTRACT_NUMBER, except increment the pointer for source to
249 point to second byte of SOURCE. Note that SOURCE has to be a value
250 such as p, not, e.g., p + 1. */
251 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
252 { EXTRACT_NUMBER (destination, source); \
256 /* Specify the precise syntax of regexps for compilation. This provides
257 for compatibility for various utilities which historically have
258 different, incompatible syntaxes.
260 The argument SYNTAX is a bit-mask comprised of the various bits
261 defined in regex.h. */
264 re_set_syntax (syntax
)
269 ret
= obscure_syntax
;
270 obscure_syntax
= syntax
;
274 /* Set by re_set_syntax to the current regexp syntax to recognize. */
275 int obscure_syntax
= 0;
279 /* Macros for re_compile_pattern, which is found below these definitions. */
281 #define CHAR_CLASS_MAX_LENGTH 6
283 /* Fetch the next character in the uncompiled pattern, translating it if
285 #define PATFETCH(c) \
286 {if (p == pend) goto end_of_pattern; \
287 c = * (unsigned char *) p++; \
288 if (translate) c = translate[c]; }
290 /* Fetch the next character in the uncompiled pattern, with no
292 #define PATFETCH_RAW(c) \
293 {if (p == pend) goto end_of_pattern; \
294 c = * (unsigned char *) p++; }
296 #define PATUNFETCH p--
299 /* If the buffer isn't allocated when it comes in, use this. */
300 #define INIT_BUF_SIZE 28
302 /* Make sure we have at least N more bytes of space in buffer. */
303 #define GET_BUFFER_SPACE(n) \
305 while (b - bufp->buffer + (n) >= bufp->allocated) \
309 /* Make sure we have one more byte of buffer space and then add CH to it. */
310 #define BUFPUSH(ch) \
312 GET_BUFFER_SPACE (1); \
313 *b++ = (char) (ch); \
316 /* Extend the buffer by twice its current size via reallociation and
317 reset the pointers that pointed into the old allocation to point to
318 the correct places in the new allocation. If extending the buffer
319 results in it being larger than 1 << 16, then flag memory exhausted. */
320 #define EXTEND_BUFFER \
321 { char *old_buffer = bufp->buffer; \
322 if (bufp->allocated == (1L<<16)) goto too_big; \
323 bufp->allocated *= 2; \
324 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
325 bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated); \
326 if (bufp->buffer == 0) \
327 goto memory_exhausted; \
328 b = (b - old_buffer) + bufp->buffer; \
330 fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \
332 laststart = (laststart - old_buffer) + bufp->buffer; \
333 begalt = (begalt - old_buffer) + bufp->buffer; \
335 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
338 /* Set the bit for character C in a character set list. */
339 #define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
341 /* Get the next unsigned number in the uncompiled pattern. */
342 #define GET_UNSIGNED_NUMBER(num) \
346 while (isdigit (c)) \
350 num = num * 10 + c - '0'; \
358 /* Subroutines for re_compile_pattern. */
359 static void store_jump (), insert_jump (), store_jump_n (),
360 insert_jump_n (), insert_op_2 ();
363 /* re_compile_pattern takes a regular-expression string
364 and converts it into a buffer full of byte commands for matching.
366 PATTERN is the address of the pattern string
367 SIZE is the length of it.
368 BUFP is a struct re_pattern_buffer * which points to the info
369 on where to store the byte commands.
370 This structure contains a char * which points to the
371 actual space, which should have been obtained with malloc.
372 re_compile_pattern may use realloc to grow the buffer space.
374 The number of bytes of commands can be found out by looking in
375 the `struct re_pattern_buffer' that bufp pointed to, after
376 re_compile_pattern returns. */
379 re_compile_pattern (pattern
, size
, bufp
)
382 struct re_pattern_buffer
*bufp
;
384 register char *b
= bufp
->buffer
;
385 register char *p
= pattern
;
386 char *pend
= pattern
+ size
;
387 register unsigned c
, c1
;
389 unsigned char *translate
= (unsigned char *) bufp
->translate
;
391 /* Address of the count-byte of the most recently inserted `exactn'
392 command. This makes it possible to tell whether a new exact-match
393 character can be added to that command or requires a new `exactn'
396 char *pending_exact
= 0;
398 /* Address of the place where a forward-jump should go to the end of
399 the containing expression. Each alternative of an `or', except the
400 last, ends with a forward-jump of this sort. */
402 char *fixup_jump
= 0;
404 /* Address of start of the most recently finished expression.
405 This tells postfix * where to find the start of its operand. */
409 /* In processing a repeat, 1 means zero matches is allowed. */
413 /* In processing a repeat, 1 means many matches is allowed. */
417 /* Address of beginning of regexp, or inside of last \(. */
421 /* In processing an interval, at least this many matches must be made. */
424 /* In processing an interval, at most this many matches can be made. */
427 /* Place in pattern (i.e., the {) to which to go back if the interval
429 char *beg_interval
= 0;
431 /* Stack of information saved by \( and restored by \).
432 Four stack elements are pushed by each \(:
433 First, the value of b.
434 Second, the value of fixup_jump.
435 Third, the value of regnum.
436 Fourth, the value of begalt. */
439 int *stackp
= stackb
;
440 int *stacke
= stackb
+ 40;
443 /* Counts \('s as they are encountered. Remembered for the matching \),
444 where it becomes the register number to put in the stop_memory
449 bufp
->fastmap_accurate
= 0;
453 /* Initialize the syntax table. */
458 if (bufp
->allocated
== 0)
460 bufp
->allocated
= INIT_BUF_SIZE
;
462 /* EXTEND_BUFFER loses when bufp->allocated is 0. */
463 bufp
->buffer
= (char *) realloc (bufp
->buffer
, INIT_BUF_SIZE
);
465 /* Caller did not allocate a buffer. Do it for them. */
466 bufp
->buffer
= (char *) malloc (INIT_BUF_SIZE
);
467 if (!bufp
->buffer
) goto memory_exhausted
;
468 begalt
= b
= bufp
->buffer
;
480 /* When testing what follows the $,
481 look past the \-constructs that don't consume anything. */
482 if (! (obscure_syntax
& RE_CONTEXT_INDEP_OPS
))
485 if (*p1
== '\\' && p1
+ 1 != pend
486 && (p1
[1] == '<' || p1
[1] == '>'
487 || p1
[1] == '`' || p1
[1] == '\''
491 || p1
[1] == 'b' || p1
[1] == 'B'))
496 if (obscure_syntax
& RE_TIGHT_VBAR
)
498 if (! (obscure_syntax
& RE_CONTEXT_INDEP_OPS
) && p1
!= pend
)
500 /* Make operand of last vbar end before this `$'. */
502 store_jump (fixup_jump
, jump
, b
);
507 /* $ means succeed if at end of line, but only in special contexts.
508 If validly in the middle of a pattern, it is a normal character. */
510 if ((obscure_syntax
& RE_CONTEXTUAL_INVALID_OPS
) && p1
!= pend
)
511 goto invalid_pattern
;
512 if (p1
== pend
|| *p1
== '\n'
513 || (obscure_syntax
& RE_CONTEXT_INDEP_OPS
)
514 || (obscure_syntax
& RE_NO_BK_PARENS
516 : *p1
== '\\' && p1
[1] == ')')
517 || (obscure_syntax
& RE_NO_BK_VBAR
519 : *p1
== '\\' && p1
[1] == '|'))
527 /* ^ means succeed if at beg of line, but only if no preceding
530 if ((obscure_syntax
& RE_CONTEXTUAL_INVALID_OPS
) && laststart
)
531 goto invalid_pattern
;
532 if (laststart
&& p
- 2 >= pattern
&& p
[-2] != '\n'
533 && !(obscure_syntax
& RE_CONTEXT_INDEP_OPS
))
535 if (obscure_syntax
& RE_TIGHT_VBAR
)
538 && ! (obscure_syntax
& RE_CONTEXT_INDEP_OPS
))
549 if ((obscure_syntax
& RE_BK_PLUS_QM
)
550 || (obscure_syntax
& RE_LIMITED_OPS
))
554 /* If there is no previous pattern, char not special. */
557 if (obscure_syntax
& RE_CONTEXTUAL_INVALID_OPS
)
558 goto invalid_pattern
;
559 else if (! (obscure_syntax
& RE_CONTEXT_INDEP_OPS
))
562 /* If there is a sequence of repetition chars,
563 collapse it down to just one. */
568 zero_times_ok
|= c
!= '+';
569 many_times_ok
|= c
!= '?';
575 else if (!(obscure_syntax
& RE_BK_PLUS_QM
)
576 && (c
== '+' || c
== '?'))
578 else if ((obscure_syntax
& RE_BK_PLUS_QM
)
583 if (!(c1
== '+' || c1
== '?'))
598 /* Star, etc. applied to an empty pattern is equivalent
599 to an empty pattern. */
603 /* Now we know whether or not zero matches is allowed
604 and also whether or not two or more matches is allowed. */
607 /* If more than one repetition is allowed, put in at the
608 end a backward relative jump from b to before the next
609 jump we're going to put in below (which jumps from
610 laststart to after this jump). */
611 GET_BUFFER_SPACE (3);
612 store_jump (b
, maybe_finalize_jump
, laststart
- 3);
613 b
+= 3; /* Because store_jump put stuff here. */
615 /* On failure, jump from laststart to b + 3, which will be the
616 end of the buffer after this jump is inserted. */
617 GET_BUFFER_SPACE (3);
618 insert_jump (on_failure_jump
, laststart
, b
+ 3, b
);
623 /* At least one repetition is required, so insert a
624 dummy-failure before the initial on-failure-jump
625 instruction of the loop. This effects a skip over that
626 instruction the first time we hit that loop. */
627 GET_BUFFER_SPACE (6);
628 insert_jump (dummy_failure_jump
, laststart
, laststart
+ 6, b
);
640 goto invalid_pattern
;
641 while (b
- bufp
->buffer
642 > bufp
->allocated
- 3 - (1 << BYTEWIDTH
) / BYTEWIDTH
)
648 BUFPUSH (charset_not
);
655 BUFPUSH ((1 << BYTEWIDTH
) / BYTEWIDTH
);
656 /* Clear the whole map */
657 bzero (b
, (1 << BYTEWIDTH
) / BYTEWIDTH
);
659 if ((obscure_syntax
& RE_HAT_NOT_NEWLINE
) && b
[-2] == charset_not
)
663 /* Read in characters and ranges, setting map bits. */
666 /* Don't translate while fetching, in case it's a range bound.
667 When we set the bit for the character, we translate it. */
670 /* If set, \ escapes characters when inside [...]. */
671 if ((obscure_syntax
& RE_AWK_CLASS_HACK
) && c
== '\\')
681 /* If this is an empty bracket expression. */
682 if ((obscure_syntax
& RE_NO_EMPTY_BRACKETS
)
684 goto invalid_pattern
;
687 /* Stop if this isn't merely a ] inside a bracket
688 expression, but rather the end of a bracket
693 if (p
[0] == '-' && p
[1] != ']')
696 /* Don't translate the range bounds while fetching them. */
699 if ((obscure_syntax
& RE_NO_EMPTY_RANGES
) && c
> c1
)
700 goto invalid_pattern
;
702 if ((obscure_syntax
& RE_NO_HYPHEN_RANGE_END
)
703 && c1
== '-' && *p
!= ']')
704 goto invalid_pattern
;
708 /* Translate each char that's in the range. */
710 SET_LIST_BIT (translate
[c
]);
716 else if ((obscure_syntax
& RE_CHAR_CLASSES
)
717 && c
== '[' && p
[0] == ':')
719 /* Longest valid character class word has six characters. */
720 char str
[CHAR_CLASS_MAX_LENGTH
];
723 /* If no ] at end. */
725 goto invalid_pattern
;
728 /* Don't translate the ``character class'' characters. */
730 if (c
== ':' || c
== ']' || p
== pend
731 || c1
== CHAR_CLASS_MAX_LENGTH
)
737 || c
== ']' /* End of the bracket expression. */
740 || (strcmp (str
, "alpha") != 0
741 && strcmp (str
, "upper") != 0
742 && strcmp (str
, "lower") != 0
743 && strcmp (str
, "digit") != 0
744 && strcmp (str
, "alnum") != 0
745 && strcmp (str
, "xdigit") != 0
746 && strcmp (str
, "space") != 0
747 && strcmp (str
, "print") != 0
748 && strcmp (str
, "punct") != 0
749 && strcmp (str
, "graph") != 0
750 && strcmp (str
, "cntrl") != 0))
752 /* Undo the ending character, the letters, and leave
753 the leading : and [ (but set bits for them). */
762 /* The ] at the end of the character class. */
765 goto invalid_pattern
;
766 for (c
= 0; c
< (1 << BYTEWIDTH
); c
++)
768 if ((strcmp (str
, "alpha") == 0 && isalpha (c
))
769 || (strcmp (str
, "upper") == 0 && isupper (c
))
770 || (strcmp (str
, "lower") == 0 && islower (c
))
771 || (strcmp (str
, "digit") == 0 && isdigit (c
))
772 || (strcmp (str
, "alnum") == 0 && isalnum (c
))
773 || (strcmp (str
, "xdigit") == 0 && isxdigit (c
))
774 || (strcmp (str
, "space") == 0 && isspace (c
))
775 || (strcmp (str
, "print") == 0 && isprint (c
))
776 || (strcmp (str
, "punct") == 0 && ispunct (c
))
777 || (strcmp (str
, "graph") == 0 && isgraph (c
))
778 || (strcmp (str
, "cntrl") == 0 && iscntrl (c
)))
784 SET_LIST_BIT (translate
[c
]);
789 /* Discard any character set/class bitmap bytes that are all
790 0 at the end of the map. Decrement the map-length byte too. */
791 while ((int) b
[-1] > 0 && b
[b
[-1] - 1] == 0)
797 if (! (obscure_syntax
& RE_NO_BK_PARENS
))
803 if (! (obscure_syntax
& RE_NO_BK_PARENS
))
809 if (! (obscure_syntax
& RE_NEWLINE_OR
))
815 if ((obscure_syntax
& RE_CONTEXTUAL_INVALID_OPS
)
816 && (! laststart
|| p
== pend
))
817 goto invalid_pattern
;
818 else if (! (obscure_syntax
& RE_NO_BK_VBAR
))
824 if (! ((obscure_syntax
& RE_NO_BK_CURLY_BRACES
)
825 && (obscure_syntax
& RE_INTERVALS
)))
828 goto handle_interval
;
831 if (p
== pend
) goto invalid_pattern
;
836 if (obscure_syntax
& RE_NO_BK_PARENS
)
839 if (stackp
== stacke
) goto nesting_too_deep
;
841 /* Laststart should point to the start_memory that we are about
842 to push (unless the pattern has RE_NREGS or more ('s). */
843 *stackp
++ = b
- bufp
->buffer
;
844 if (regnum
< RE_NREGS
)
846 BUFPUSH (start_memory
);
849 *stackp
++ = fixup_jump
? fixup_jump
- bufp
->buffer
+ 1 : 0;
850 *stackp
++ = regnum
++;
851 *stackp
++ = begalt
- bufp
->buffer
;
858 if (obscure_syntax
& RE_NO_BK_PARENS
)
861 if (stackp
== stackb
) goto unmatched_close
;
862 begalt
= *--stackp
+ bufp
->buffer
;
864 store_jump (fixup_jump
, jump
, b
);
865 if (stackp
[-1] < RE_NREGS
)
867 BUFPUSH (stop_memory
);
868 BUFPUSH (stackp
[-1]);
871 fixup_jump
= *stackp
? *stackp
+ bufp
->buffer
- 1 : 0;
872 laststart
= *--stackp
+ bufp
->buffer
;
876 if ((obscure_syntax
& RE_LIMITED_OPS
)
877 || (obscure_syntax
& RE_NO_BK_VBAR
))
880 if (obscure_syntax
& RE_LIMITED_OPS
)
882 /* Insert before the previous alternative a jump which
883 jumps to this alternative if the former fails. */
884 GET_BUFFER_SPACE (6);
885 insert_jump (on_failure_jump
, begalt
, b
+ 6, b
);
888 /* The alternative before the previous alternative has a
889 jump after it which gets executed if it gets matched.
890 Adjust that jump so it will jump to the previous
891 alternative's analogous jump (put in below, which in
892 turn will jump to the next (if any) alternative's such
893 jump, etc.). The last such jump jumps to the correct
894 final destination. */
896 store_jump (fixup_jump
, jump
, b
);
898 /* Leave space for a jump after previous alternative---to be
908 if (! (obscure_syntax
& RE_INTERVALS
)
909 /* Let \{ be a literal. */
910 || ((obscure_syntax
& RE_INTERVALS
)
911 && (obscure_syntax
& RE_NO_BK_CURLY_BRACES
))
912 /* If it's the string "\{". */
913 || (p
- 2 == pattern
&& p
== pend
))
916 beg_interval
= p
- 1; /* The {. */
917 /* If there is no previous pattern, this isn't an interval. */
920 if (obscure_syntax
& RE_CONTEXTUAL_INVALID_OPS
)
921 goto invalid_pattern
;
925 /* It also isn't an interval if not preceded by an re
926 matching a single character or subexpression, or if
927 the current type of intervals can't handle back
928 references and the previous thing is a back reference. */
929 if (! (*laststart
== anychar
930 || *laststart
== charset
931 || *laststart
== charset_not
932 || *laststart
== start_memory
933 || (*laststart
== exactn
&& laststart
[1] == 1)
934 || (! (obscure_syntax
& RE_NO_BK_REFS
)
935 && *laststart
== duplicate
)))
937 if (obscure_syntax
& RE_NO_BK_CURLY_BRACES
)
940 /* Posix extended syntax is handled in previous
941 statement; this is for Posix basic syntax. */
942 if (obscure_syntax
& RE_INTERVALS
)
943 goto invalid_pattern
;
947 lower_bound
= -1; /* So can see if are set. */
949 GET_UNSIGNED_NUMBER (lower_bound
);
952 GET_UNSIGNED_NUMBER (upper_bound
);
954 upper_bound
= RE_DUP_MAX
;
957 upper_bound
= lower_bound
;
958 if (! (obscure_syntax
& RE_NO_BK_CURLY_BRACES
))
961 goto invalid_pattern
;
964 if (c
!= '}' || lower_bound
< 0 || upper_bound
> RE_DUP_MAX
965 || lower_bound
> upper_bound
966 || ((obscure_syntax
& RE_NO_BK_CURLY_BRACES
)
967 && p
!= pend
&& *p
== '{'))
969 if (obscure_syntax
& RE_NO_BK_CURLY_BRACES
)
970 goto unfetch_interval
;
972 goto invalid_pattern
;
975 /* If upper_bound is zero, don't want to succeed at all;
976 jump from laststart to b + 3, which will be the end of
977 the buffer after this jump is inserted. */
979 if (upper_bound
== 0)
981 GET_BUFFER_SPACE (3);
982 insert_jump (jump
, laststart
, b
+ 3, b
);
986 /* Otherwise, after lower_bound number of succeeds, jump
987 to after the jump_n which will be inserted at the end
988 of the buffer, and insert that jump_n. */
990 { /* Set to 5 if only one repetition is allowed and
991 hence no jump_n is inserted at the current end of
992 the buffer; then only space for the succeed_n is
993 needed. Otherwise, need space for both the
994 succeed_n and the jump_n. */
996 unsigned slots_needed
= upper_bound
== 1 ? 5 : 10;
998 GET_BUFFER_SPACE (slots_needed
);
999 /* Initialize the succeed_n to n, even though it will
1000 be set by its attendant set_number_at, because
1001 re_compile_fastmap will need to know it. Jump to
1002 what the end of buffer will be after inserting
1003 this succeed_n and possibly appending a jump_n. */
1004 insert_jump_n (succeed_n
, laststart
, b
+ slots_needed
,
1006 b
+= 5; /* Just increment for the succeed_n here. */
1008 /* More than one repetition is allowed, so put in at
1009 the end of the buffer a backward jump from b to the
1010 succeed_n we put in above. By the time we've gotten
1011 to this jump when matching, we'll have matched once
1012 already, so jump back only upper_bound - 1 times. */
1014 if (upper_bound
> 1)
1016 store_jump_n (b
, jump_n
, laststart
, upper_bound
- 1);
1018 /* When hit this when matching, reset the
1019 preceding jump_n's n to upper_bound - 1. */
1020 BUFPUSH (set_number_at
);
1021 GET_BUFFER_SPACE (2);
1022 STORE_NUMBER_AND_INCR (b
, -5);
1023 STORE_NUMBER_AND_INCR (b
, upper_bound
- 1);
1025 /* When hit this when matching, set the succeed_n's n. */
1026 GET_BUFFER_SPACE (5);
1027 insert_op_2 (set_number_at
, laststart
, b
, 5, lower_bound
);
1036 /* If an invalid interval, match the characters as literals. */
1042 "regex: no interval beginning to which to backtrack.\n");
1047 PATFETCH (c
); /* normal_char expects char in `c'. */
1058 BUFPUSH (syntaxspec
);
1060 BUFPUSH (syntax_spec_code
[c
]);
1065 BUFPUSH (notsyntaxspec
);
1067 BUFPUSH (syntax_spec_code
[c
]);
1078 BUFPUSH (notwordchar
);
1090 BUFPUSH (wordbound
);
1094 BUFPUSH (notwordbound
);
1114 if (obscure_syntax
& RE_NO_BK_REFS
)
1119 if (obscure_syntax
& RE_NO_EMPTY_BK_REF
)
1120 goto invalid_pattern
;
1124 /* Can't back reference to a subexpression if inside of it. */
1125 for (stackt
= stackp
- 2; stackt
> stackb
; stackt
-= 4)
1129 BUFPUSH (duplicate
);
1135 if (obscure_syntax
& RE_BK_PLUS_QM
)
1143 /* You might think it would be useful for \ to mean
1144 not to translate; but if we don't translate it
1145 it will never match anything. */
1146 if (translate
) c
= translate
[c
];
1152 normal_char
: /* Expects the character in `c'. */
1153 if (!pending_exact
|| pending_exact
+ *pending_exact
+ 1 != b
1154 || *pending_exact
== 0177 || *p
== '*' || *p
== '^'
1155 || ((obscure_syntax
& RE_BK_PLUS_QM
)
1156 ? *p
== '\\' && (p
[1] == '+' || p
[1] == '?')
1157 : (*p
== '+' || *p
== '?'))
1158 || ((obscure_syntax
& RE_INTERVALS
)
1159 && ((obscure_syntax
& RE_NO_BK_CURLY_BRACES
)
1161 : (p
[0] == '\\' && p
[1] == '{'))))
1174 store_jump (fixup_jump
, jump
, b
);
1176 if (stackp
!= stackb
) goto unmatched_open
;
1178 bufp
->used
= b
- bufp
->buffer
;
1182 return "Invalid regular expression";
1185 return "Unmatched \\(";
1188 return "Unmatched \\)";
1191 return "Premature end of regular expression";
1194 return "Nesting too deep";
1197 return "Regular expression too big";
1200 return "Memory exhausted";
1204 /* Store a jump of the form <OPCODE> <relative address>.
1205 Store in the location FROM a jump operation to jump to relative
1206 address FROM - TO. OPCODE is the opcode to store. */
1209 store_jump (from
, opcode
, to
)
1214 STORE_NUMBER(from
+ 1, to
- (from
+ 3));
1218 /* Open up space before char FROM, and insert there a jump to TO.
1219 CURRENT_END gives the end of the storage not in use, so we know
1220 how much data to copy up. OP is the opcode of the jump to insert.
1222 If you call this function, you must zero out pending_exact. */
1225 insert_jump (op
, from
, to
, current_end
)
1227 char *from
, *to
, *current_end
;
1229 register char *pfrom
= current_end
; /* Copy from here... */
1230 register char *pto
= current_end
+ 3; /* ...to here. */
1232 while (pfrom
!= from
)
1234 store_jump (from
, op
, to
);
1238 /* Store a jump of the form <opcode> <relative address> <n> .
1240 Store in the location FROM a jump operation to jump to relative
1241 address FROM - TO. OPCODE is the opcode to store, N is a number the
1242 jump uses, say, to decide how many times to jump.
1244 If you call this function, you must zero out pending_exact. */
1247 store_jump_n (from
, opcode
, to
, n
)
1253 STORE_NUMBER (from
+ 1, to
- (from
+ 3));
1254 STORE_NUMBER (from
+ 3, n
);
1258 /* Similar to insert_jump, but handles a jump which needs an extra
1259 number to handle minimum and maximum cases. Open up space at
1260 location FROM, and insert there a jump to TO. CURRENT_END gives the
1261 end of the storage in use, so we know how much data to copy up. OP is
1262 the opcode of the jump to insert.
1264 If you call this function, you must zero out pending_exact. */
1267 insert_jump_n (op
, from
, to
, current_end
, n
)
1269 char *from
, *to
, *current_end
;
1272 register char *pfrom
= current_end
; /* Copy from here... */
1273 register char *pto
= current_end
+ 5; /* ...to here. */
1275 while (pfrom
!= from
)
1277 store_jump_n (from
, op
, to
, n
);
1281 /* Open up space at location THERE, and insert operation OP followed by
1282 NUM_1 and NUM_2. CURRENT_END gives the end of the storage in use, so
1283 we know how much data to copy up.
1285 If you call this function, you must zero out pending_exact. */
1288 insert_op_2 (op
, there
, current_end
, num_1
, num_2
)
1290 char *there
, *current_end
;
1293 register char *pfrom
= current_end
; /* Copy from here... */
1294 register char *pto
= current_end
+ 5; /* ...to here. */
1296 while (pfrom
!= there
)
1300 STORE_NUMBER (there
+ 1, num_1
);
1301 STORE_NUMBER (there
+ 3, num_2
);
1306 /* Given a pattern, compute a fastmap from it. The fastmap records
1307 which of the (1 << BYTEWIDTH) possible characters can start a string
1308 that matches the pattern. This fastmap is used by re_search to skip
1309 quickly over totally implausible text.
1311 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
1312 area as bufp->fastmap.
1313 The other components of bufp describe the pattern to be used. */
1316 re_compile_fastmap (bufp
)
1317 struct re_pattern_buffer
*bufp
;
1319 unsigned char *pattern
= (unsigned char *) bufp
->buffer
;
1320 int size
= bufp
->used
;
1321 register char *fastmap
= bufp
->fastmap
;
1322 register unsigned char *p
= pattern
;
1323 register unsigned char *pend
= pattern
+ size
;
1325 unsigned char *translate
= (unsigned char *) bufp
->translate
;
1327 unsigned char *stackb
[NFAILURES
];
1328 unsigned char **stackp
= stackb
;
1330 unsigned is_a_succeed_n
;
1332 bzero (fastmap
, (1 << BYTEWIDTH
));
1333 bufp
->fastmap_accurate
= 1;
1334 bufp
->can_be_null
= 0;
1341 bufp
->can_be_null
= 1;
1344 #ifdef SWITCH_ENUM_BUG
1345 switch ((int) ((enum regexpcode
) *p
++))
1347 switch ((enum regexpcode
) *p
++)
1352 fastmap
[translate
[p
[1]]] = 1;
1371 fastmap
[translate
['\n']] = 1;
1375 if (bufp
->can_be_null
!= 1)
1376 bufp
->can_be_null
= 2;
1381 case maybe_finalize_jump
:
1383 case dummy_failure_jump
:
1384 EXTRACT_NUMBER_AND_INCR (j
, p
);
1388 /* Jump backward reached implies we just went through
1389 the body of a loop and matched nothing.
1390 Opcode jumped to should be an on_failure_jump.
1391 Just treat it like an ordinary jump.
1392 For a * loop, it has pushed its failure point already;
1393 If so, discard that as redundant. */
1395 if ((enum regexpcode
) *p
!= on_failure_jump
1396 && (enum regexpcode
) *p
!= succeed_n
)
1399 EXTRACT_NUMBER_AND_INCR (j
, p
);
1401 if (stackp
!= stackb
&& *stackp
== p
)
1405 case on_failure_jump
:
1406 handle_on_failure_jump
:
1407 EXTRACT_NUMBER_AND_INCR (j
, p
);
1410 EXTRACT_NUMBER_AND_INCR (k
, p
); /* Skip the n. */
1415 /* Get to the number of times to succeed. */
1417 /* Increment p past the n for when k != 0. */
1418 EXTRACT_NUMBER_AND_INCR (k
, p
);
1422 goto handle_on_failure_jump
;
1436 bufp
->can_be_null
= 1;
1439 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
1442 if (bufp
->can_be_null
)
1444 /* Don't return; check the alternative paths
1445 so we can set can_be_null if appropriate. */
1449 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
1450 if (SYNTAX (j
) == Sword
)
1455 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
1456 if (SYNTAX (j
) != Sword
)
1463 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
1464 if (SYNTAX (j
) == (enum syntaxcode
) k
)
1470 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
1471 if (SYNTAX (j
) != (enum syntaxcode
) k
)
1474 #endif /* not emacs */
1477 for (j
= *p
++ * BYTEWIDTH
- 1; j
>= 0; j
--)
1478 if (p
[j
/ BYTEWIDTH
] & (1 << (j
% BYTEWIDTH
)))
1481 fastmap
[translate
[j
]] = 1;
1488 /* Chars beyond end of map must be allowed */
1489 for (j
= *p
* BYTEWIDTH
; j
< (1 << BYTEWIDTH
); j
++)
1491 fastmap
[translate
[j
]] = 1;
1495 for (j
= *p
++ * BYTEWIDTH
- 1; j
>= 0; j
--)
1496 if (!(p
[j
/ BYTEWIDTH
] & (1 << (j
% BYTEWIDTH
))))
1499 fastmap
[translate
[j
]] = 1;
1506 /* Get here means we have successfully found the possible starting
1507 characters of one path of the pattern. We need not follow this
1508 path any farther. Instead, look at the next alternative
1509 remembered in the stack. */
1510 if (stackp
!= stackb
)
1519 /* Like re_search_2, below, but only one string is specified, and
1520 doesn't let you say where to stop matching. */
1523 re_search (pbufp
, string
, size
, startpos
, range
, regs
)
1524 struct re_pattern_buffer
*pbufp
;
1526 int size
, startpos
, range
;
1527 struct re_registers
*regs
;
1529 return re_search_2 (pbufp
, (char *) 0, 0, string
, size
, startpos
, range
,
1534 /* Using the compiled pattern in PBUFP->buffer, first tries to match the
1535 virtual concatenation of STRING1 and STRING2, starting first at index
1536 STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of
1537 places to try before giving up. If RANGE is negative, it searches
1538 backwards, i.e., the starting positions tried are STARTPOS, STARTPOS
1539 - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively.
1540 In REGS, return the indices of the virtual concatenation of STRING1
1541 and STRING2 that matched the entire PBUFP->buffer and its contained
1542 subexpressions. Do not consider matching one past the index MSTOP in
1543 the virtual concatenation of STRING1 and STRING2.
1545 The value returned is the position in the strings at which the match
1546 was found, or -1 if no match was found, or -2 if error (such as
1547 failure stack overflow). */
1550 re_search_2 (pbufp
, string1
, size1
, string2
, size2
, startpos
, range
,
1552 struct re_pattern_buffer
*pbufp
;
1553 char *string1
, *string2
;
1557 struct re_registers
*regs
;
1560 register char *fastmap
= pbufp
->fastmap
;
1561 register unsigned char *translate
= (unsigned char *) pbufp
->translate
;
1562 int total_size
= size1
+ size2
;
1563 int endpos
= startpos
+ range
;
1566 /* Check for out-of-range starting position. */
1567 if (startpos
< 0 || startpos
> total_size
)
1570 /* Fix up range if it would eventually take startpos outside of the
1571 virtual concatenation of string1 and string2. */
1573 range
= -1 - startpos
;
1574 else if (endpos
> total_size
)
1575 range
= total_size
- startpos
;
1577 /* Update the fastmap now if not correct already. */
1578 if (fastmap
&& !pbufp
->fastmap_accurate
)
1579 re_compile_fastmap (pbufp
);
1581 /* If the search isn't to be a backwards one, don't waste time in a
1582 long search for a pattern that says it is anchored. */
1583 if (pbufp
->used
> 0 && (enum regexpcode
) pbufp
->buffer
[0] == begbuf
1594 /* If a fastmap is supplied, skip quickly over characters that
1595 cannot possibly be the start of a match. Note, however, that
1596 if the pattern can possibly match the null string, we must
1597 test it at each starting point so that we take the first null
1600 if (fastmap
&& startpos
< total_size
&& pbufp
->can_be_null
!= 1)
1602 if (range
> 0) /* Searching forwards. */
1604 register int lim
= 0;
1605 register unsigned char *p
;
1607 if (startpos
< size1
&& startpos
+ range
>= size1
)
1608 lim
= range
- (size1
- startpos
);
1610 p
= ((unsigned char *)
1611 &(startpos
>= size1
? string2
- size1
: string1
)[startpos
]);
1613 while (range
> lim
&& !fastmap
[translate
1617 startpos
+= irange
- range
;
1619 else /* Searching backwards. */
1621 register unsigned char c
;
1623 if (string1
== 0 || startpos
>= size1
)
1624 c
= string2
[startpos
- size1
];
1626 c
= string1
[startpos
];
1629 if (translate
? !fastmap
[translate
[c
]] : !fastmap
[c
])
1634 if (range
>= 0 && startpos
== total_size
1635 && fastmap
&& pbufp
->can_be_null
== 0)
1638 val
= re_match_2 (pbufp
, string1
, size1
, string2
, size2
, startpos
,
1647 #endif /* C_ALLOCA */
1668 #ifndef emacs /* emacs never uses this. */
1670 re_match (pbufp
, string
, size
, pos
, regs
)
1671 struct re_pattern_buffer
*pbufp
;
1674 struct re_registers
*regs
;
1676 return re_match_2 (pbufp
, (char *) 0, 0, string
, size
, pos
, regs
, size
);
1678 #endif /* not emacs */
1681 /* The following are used for re_match_2, defined below: */
1683 /* Roughly the maximum number of failure points on the stack. Would be
1684 exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */
1686 int re_max_failures
= 2000;
1688 /* Routine used by re_match_2. */
1689 static int bcmp_translate ();
1692 /* Structure and accessing macros used in re_match_2: */
1694 struct register_info
1696 unsigned is_active
: 1;
1697 unsigned matched_something
: 1;
1700 #define IS_ACTIVE(R) ((R).is_active)
1701 #define MATCHED_SOMETHING(R) ((R).matched_something)
1704 /* Macros used by re_match_2: */
1707 /* I.e., regstart, regend, and reg_info. */
1709 #define NUM_REG_ITEMS 3
1711 /* We push at most this many things on the stack whenever we
1712 fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
1713 arguments to the PUSH_FAILURE_POINT macro. */
1715 #define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2)
1718 /* We push this many things on the stack whenever we fail. */
1720 #define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + 2)
1723 /* This pushes most of the information about the current state we will want
1724 if we ever fail back to it. */
1726 #define PUSH_FAILURE_POINT(pattern_place, string_place) \
1728 short last_used_reg, this_reg; \
1730 /* Find out how many registers are active or have been matched. \
1731 (Aside from register zero, which is only set at the end.) */ \
1732 for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\
1733 if (regstart[last_used_reg] != (unsigned char *) -1) \
1736 if (stacke - stackp < NUM_FAILURE_ITEMS) \
1738 unsigned char **stackx; \
1739 unsigned int len = stacke - stackb; \
1740 if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
1743 /* Roughly double the size of the stack. */ \
1744 stackx = (unsigned char **) alloca (2 * len \
1745 * sizeof (unsigned char *));\
1746 /* Only copy what is in use. */ \
1747 bcopy (stackb, stackx, len * sizeof (char *)); \
1748 stackp = stackx + (stackp - stackb); \
1750 stacke = stackb + 2 * len; \
1753 /* Now push the info for each of those registers. */ \
1754 for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \
1756 *stackp++ = regstart[this_reg]; \
1757 *stackp++ = regend[this_reg]; \
1758 *stackp++ = (unsigned char *) ®_info[this_reg]; \
1761 /* Push how many registers we saved. */ \
1762 *stackp++ = (unsigned char *) last_used_reg; \
1764 *stackp++ = pattern_place; \
1765 *stackp++ = string_place; \
1769 /* This pops what PUSH_FAILURE_POINT pushes. */
1771 #define POP_FAILURE_POINT() \
1774 stackp -= 2; /* Remove failure points. */ \
1775 temp = (int) *--stackp; /* How many regs pushed. */ \
1776 temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
1777 stackp -= temp; /* Remove the register info. */ \
1781 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
1783 /* Is true if there is a first string and if PTR is pointing anywhere
1784 inside it or just past the end. */
1786 #define IS_IN_FIRST_STRING(ptr) \
1787 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
1789 /* Call before fetching a character with *d. This switches over to
1790 string2 if necessary. */
1795 /* end of string2 => fail. */ \
1796 if (dend == end_match_2) \
1798 /* end of string1 => advance to string2. */ \
1800 dend = end_match_2; \
1804 /* Call this when have matched something; it sets `matched' flags for the
1805 registers corresponding to the subexpressions of which we currently
1807 #define SET_REGS_MATCHED \
1808 { unsigned this_reg; \
1809 for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \
1811 if (IS_ACTIVE(reg_info[this_reg])) \
1812 MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
1814 MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
1818 /* Test if at very beginning or at very end of the virtual concatenation
1819 of string1 and string2. If there is only one string, we've put it in
1822 #define AT_STRINGS_BEG (d == (size1 ? string1 : string2) || !size2)
1823 #define AT_STRINGS_END (d == end2)
1825 #define AT_WORD_BOUNDARY \
1826 (AT_STRINGS_BEG || AT_STRINGS_END || IS_A_LETTER (d - 1) != IS_A_LETTER (d))
1828 /* We have two special cases to check for:
1829 1) if we're past the end of string1, we have to look at the first
1830 character in string2;
1831 2) if we're before the beginning of string2, we have to look at the
1832 last character in string1; we assume there is a string1, so use
1833 this in conjunction with AT_STRINGS_BEG. */
1834 #define IS_A_LETTER(d) \
1835 (SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\
1839 /* Match the pattern described by PBUFP against the virtual
1840 concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2,
1841 respectively. Start the match at index POS in the virtual
1842 concatenation of STRING1 and STRING2. In REGS, return the indices of
1843 the virtual concatenation of STRING1 and STRING2 that matched the
1844 entire PBUFP->buffer and its contained subexpressions. Do not
1845 consider matching one past the index MSTOP in the virtual
1846 concatenation of STRING1 and STRING2.
1848 If pbufp->fastmap is nonzero, then it had better be up to date.
1850 The reason that the data to match are specified as two components
1851 which are to be regarded as concatenated is so this function can be
1852 used directly on the contents of an Emacs buffer.
1854 -1 is returned if there is no match. -2 is returned if there is an
1855 error (such as match stack overflow). Otherwise the value is the
1856 length of the substring which was matched. */
1859 re_match_2 (pbufp
, string1_arg
, size1
, string2_arg
, size2
, pos
, regs
, mstop
)
1860 struct re_pattern_buffer
*pbufp
;
1861 char *string1_arg
, *string2_arg
;
1864 struct re_registers
*regs
;
1867 register unsigned char *p
= (unsigned char *) pbufp
->buffer
;
1869 /* Pointer to beyond end of buffer. */
1870 register unsigned char *pend
= p
+ pbufp
->used
;
1872 unsigned char *string1
= (unsigned char *) string1_arg
;
1873 unsigned char *string2
= (unsigned char *) string2_arg
;
1874 unsigned char *end1
; /* Just past end of first string. */
1875 unsigned char *end2
; /* Just past end of second string. */
1877 /* Pointers into string1 and string2, just past the last characters in
1878 each to consider matching. */
1879 unsigned char *end_match_1
, *end_match_2
;
1881 register unsigned char *d
, *dend
;
1882 register int mcnt
; /* Multipurpose. */
1883 unsigned char *translate
= (unsigned char *) pbufp
->translate
;
1884 unsigned is_a_jump_n
= 0;
1886 /* Failure point stack. Each place that can handle a failure further
1887 down the line pushes a failure point on this stack. It consists of
1888 restart, regend, and reg_info for all registers corresponding to the
1889 subexpressions we're currently inside, plus the number of such
1890 registers, and, finally, two char *'s. The first char * is where to
1891 resume scanning the pattern; the second one is where to resume
1892 scanning the strings. If the latter is zero, the failure point is a
1893 ``dummy''; if a failure happens and the failure point is a dummy, it
1894 gets discarded and the next next one is tried. */
1896 unsigned char *initial_stack
[MAX_NUM_FAILURE_ITEMS
* NFAILURES
];
1897 unsigned char **stackb
= initial_stack
;
1898 unsigned char **stackp
= stackb
;
1899 unsigned char **stacke
= &stackb
[MAX_NUM_FAILURE_ITEMS
* NFAILURES
];
1902 /* Information on the contents of registers. These are pointers into
1903 the input strings; they record just what was matched (on this
1904 attempt) by a subexpression part of the pattern, that is, the
1905 regnum-th regstart pointer points to where in the pattern we began
1906 matching and the regnum-th regend points to right after where we
1907 stopped matching the regnum-th subexpression. (The zeroth register
1908 keeps track of what the whole pattern matches.) */
1910 unsigned char *regstart
[RE_NREGS
];
1911 unsigned char *regend
[RE_NREGS
];
1913 /* The is_active field of reg_info helps us keep track of which (possibly
1914 nested) subexpressions we are currently in. The matched_something
1915 field of reg_info[reg_num] helps us tell whether or not we have
1916 matched any of the pattern so far this time through the reg_num-th
1917 subexpression. These two fields get reset each time through any
1918 loop their register is in. */
1920 struct register_info reg_info
[RE_NREGS
];
1923 /* The following record the register info as found in the above
1924 variables when we find a match better than any we've seen before.
1925 This happens as we backtrack through the failure points, which in
1926 turn happens only if we have not yet matched the entire string. */
1928 unsigned best_regs_set
= 0;
1929 unsigned char *best_regstart
[RE_NREGS
];
1930 unsigned char *best_regend
[RE_NREGS
];
1932 /* Initialize subexpression text positions to -1 to mark ones that no
1933 \( or ( and \) or ) has been seen for. Also set all registers to
1934 inactive and mark them as not having matched anything or ever
1936 for (mcnt
= 0; mcnt
< RE_NREGS
; mcnt
++)
1938 regstart
[mcnt
] = regend
[mcnt
] = (unsigned char *) -1;
1939 IS_ACTIVE (reg_info
[mcnt
]) = 0;
1940 MATCHED_SOMETHING (reg_info
[mcnt
]) = 0;
1944 for (mcnt
= 0; mcnt
< RE_NREGS
; mcnt
++)
1945 regs
->start
[mcnt
] = regs
->end
[mcnt
] = -1;
1947 /* Set up pointers to ends of strings.
1948 Don't allow the second string to be empty unless both are empty. */
1956 end1
= string1
+ size1
;
1957 end2
= string2
+ size2
;
1959 /* Compute where to stop matching, within the two strings. */
1962 end_match_1
= string1
+ mstop
;
1963 end_match_2
= string2
;
1968 end_match_2
= string2
+ mstop
- size1
;
1971 /* `p' scans through the pattern as `d' scans through the data. `dend'
1972 is the end of the input string that `d' points within. `d' is
1973 advanced into the following input string whenever necessary, but
1974 this happens before fetching; therefore, at the beginning of the
1975 loop, `d' can be pointing at the end of a string, but it cannot
1978 if (size1
!= 0 && pos
<= size1
)
1979 d
= string1
+ pos
, dend
= end_match_1
;
1981 d
= string2
+ pos
- size1
, dend
= end_match_2
;
1984 /* This loops over pattern commands. It exits by returning from the
1985 function if match is complete, or it drops through if match fails
1986 at this starting point in the input data. */
1991 /* End of pattern means we might have succeeded. */
1994 /* If not end of string, try backtracking. Otherwise done. */
1995 if (d
!= end_match_2
)
1997 if (stackp
!= stackb
)
1999 /* More failure points to try. */
2001 unsigned in_same_string
=
2002 IS_IN_FIRST_STRING (best_regend
[0])
2003 == MATCHING_IN_FIRST_STRING
;
2005 /* If exceeds best match so far, save it. */
2007 || (in_same_string
&& d
> best_regend
[0])
2008 || (! in_same_string
&& ! MATCHING_IN_FIRST_STRING
))
2011 best_regend
[0] = d
; /* Never use regstart[0]. */
2013 for (mcnt
= 1; mcnt
< RE_NREGS
; mcnt
++)
2015 best_regstart
[mcnt
] = regstart
[mcnt
];
2016 best_regend
[mcnt
] = regend
[mcnt
];
2021 /* If no failure points, don't restore garbage. */
2022 else if (best_regs_set
)
2025 /* Restore best match. */
2028 for (mcnt
= 0; mcnt
< RE_NREGS
; mcnt
++)
2030 regstart
[mcnt
] = best_regstart
[mcnt
];
2031 regend
[mcnt
] = best_regend
[mcnt
];
2036 /* If caller wants register contents data back, convert it
2040 regs
->start
[0] = pos
;
2041 if (MATCHING_IN_FIRST_STRING
)
2042 regs
->end
[0] = d
- string1
;
2044 regs
->end
[0] = d
- string2
+ size1
;
2045 for (mcnt
= 1; mcnt
< RE_NREGS
; mcnt
++)
2047 if (regend
[mcnt
] == (unsigned char *) -1)
2049 regs
->start
[mcnt
] = -1;
2050 regs
->end
[mcnt
] = -1;
2053 if (IS_IN_FIRST_STRING (regstart
[mcnt
]))
2054 regs
->start
[mcnt
] = regstart
[mcnt
] - string1
;
2056 regs
->start
[mcnt
] = regstart
[mcnt
] - string2
+ size1
;
2058 if (IS_IN_FIRST_STRING (regend
[mcnt
]))
2059 regs
->end
[mcnt
] = regend
[mcnt
] - string1
;
2061 regs
->end
[mcnt
] = regend
[mcnt
] - string2
+ size1
;
2064 return d
- pos
- (MATCHING_IN_FIRST_STRING
2069 /* Otherwise match next pattern command. */
2070 #ifdef SWITCH_ENUM_BUG
2071 switch ((int) ((enum regexpcode
) *p
++))
2073 switch ((enum regexpcode
) *p
++)
2077 /* \( [or `(', as appropriate] is represented by start_memory,
2078 \) by stop_memory. Both of those commands are followed by
2079 a register number in the next byte. The text matched
2080 within the \( and \) is recorded under that number. */
2083 IS_ACTIVE (reg_info
[*p
]) = 1;
2084 MATCHED_SOMETHING (reg_info
[*p
]) = 0;
2090 IS_ACTIVE (reg_info
[*p
]) = 0;
2092 /* If just failed to match something this time around with a sub-
2093 expression that's in a loop, try to force exit from the loop. */
2094 if ((! MATCHED_SOMETHING (reg_info
[*p
])
2095 || (enum regexpcode
) p
[-3] == start_memory
)
2098 register unsigned char *p2
= p
+ 1;
2105 case maybe_finalize_jump
:
2107 case dummy_failure_jump
:
2108 EXTRACT_NUMBER_AND_INCR (mcnt
, p2
);
2115 /* If the next operation is a jump backwards in the pattern
2116 to an on_failure_jump, exit from the loop by forcing a
2117 failure after pushing on the stack the on_failure_jump's
2118 jump in the pattern, and d. */
2119 if (mcnt
< 0 && (enum regexpcode
) *p2
++ == on_failure_jump
)
2121 EXTRACT_NUMBER_AND_INCR (mcnt
, p2
);
2122 PUSH_FAILURE_POINT (p2
+ mcnt
, d
);
2129 /* \<digit> has been turned into a `duplicate' command which is
2130 followed by the numeric value of <digit> as the register number. */
2133 int regno
= *p
++; /* Get which register to match against */
2134 register unsigned char *d2
, *dend2
;
2136 /* Where in input to try to start matching. */
2137 d2
= regstart
[regno
];
2139 /* Where to stop matching; if both the place to start and
2140 the place to stop matching are in the same string, then
2141 set to the place to stop, otherwise, for now have to use
2142 the end of the first string. */
2144 dend2
= ((IS_IN_FIRST_STRING (regstart
[regno
])
2145 == IS_IN_FIRST_STRING (regend
[regno
]))
2146 ? regend
[regno
] : end_match_1
);
2149 /* If necessary, advance to next segment in register
2153 if (dend2
== end_match_2
) break;
2154 if (dend2
== regend
[regno
]) break;
2155 d2
= string2
, dend2
= regend
[regno
]; /* end of string1 => advance to string2. */
2157 /* At end of register contents => success */
2158 if (d2
== dend2
) break;
2160 /* If necessary, advance to next segment in data. */
2163 /* How many characters left in this segment to match. */
2166 /* Want how many consecutive characters we can match in
2167 one shot, so, if necessary, adjust the count. */
2168 if (mcnt
> dend2
- d2
)
2171 /* Compare that many; failure if mismatch, else move
2174 ? bcmp_translate (d
, d2
, mcnt
, translate
)
2175 : bcmp (d
, d2
, mcnt
))
2177 d
+= mcnt
, d2
+= mcnt
;
2183 PREFETCH
; /* Fetch a data character. */
2184 /* Match anything but a newline, maybe even a null. */
2185 if ((translate
? translate
[*d
] : *d
) == '\n'
2186 || ((obscure_syntax
& RE_DOT_NOT_NULL
)
2187 && (translate
? translate
[*d
] : *d
) == '\000'))
2196 int not = 0; /* Nonzero for charset_not. */
2198 if (*(p
- 1) == (unsigned char) charset_not
)
2201 PREFETCH
; /* Fetch a data character. */
2208 if (c
< *p
* BYTEWIDTH
2209 && p
[1 + c
/ BYTEWIDTH
] & (1 << (c
% BYTEWIDTH
)))
2214 if (!not) goto fail
;
2221 if ((size1
!= 0 && d
== string1
)
2222 || (size1
== 0 && size2
!= 0 && d
== string2
)
2223 || (d
&& d
[-1] == '\n')
2224 || (size1
== 0 && size2
== 0))
2231 || (d
== end1
? (size2
== 0 || *string2
== '\n') : *d
== '\n'))
2235 /* `or' constructs are handled by starting each alternative with
2236 an on_failure_jump that points to the start of the next
2237 alternative. Each alternative except the last ends with a
2238 jump to the joining point. (Actually, each jump except for
2239 the last one really jumps to the following jump, because
2240 tensioning the jumps is a hassle.) */
2242 /* The start of a stupid repeat has an on_failure_jump that points
2243 past the end of the repeat text. This makes a failure point so
2244 that on failure to match a repetition, matching restarts past
2245 as many repetitions have been found with no way to fail and
2246 look for another one. */
2248 /* A smart repeat is similar but loops back to the on_failure_jump
2249 so that each repetition makes another failure point. */
2251 case on_failure_jump
:
2253 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
2254 PUSH_FAILURE_POINT (p
+ mcnt
, d
);
2257 /* The end of a smart repeat has a maybe_finalize_jump back.
2258 Change it either to a finalize_jump or an ordinary jump. */
2259 case maybe_finalize_jump
:
2260 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
2262 register unsigned char *p2
= p
;
2263 /* Compare what follows with the beginning of the repeat.
2264 If we can establish that there is nothing that they would
2265 both match, we can change to finalize_jump. */
2266 while (p2
+ 1 != pend
2267 && (*p2
== (unsigned char) stop_memory
2268 || *p2
== (unsigned char) start_memory
))
2269 p2
+= 2; /* Skip over reg number. */
2271 p
[-3] = (unsigned char) finalize_jump
;
2272 else if (*p2
== (unsigned char) exactn
2273 || *p2
== (unsigned char) endline
)
2275 register int c
= *p2
== (unsigned char) endline
? '\n' : p2
[2];
2276 register unsigned char *p1
= p
+ mcnt
;
2277 /* p1[0] ... p1[2] are an on_failure_jump.
2278 Examine what follows that. */
2279 if (p1
[3] == (unsigned char) exactn
&& p1
[5] != c
)
2280 p
[-3] = (unsigned char) finalize_jump
;
2281 else if (p1
[3] == (unsigned char) charset
2282 || p1
[3] == (unsigned char) charset_not
)
2284 int not = p1
[3] == (unsigned char) charset_not
;
2285 if (c
< p1
[4] * BYTEWIDTH
2286 && p1
[5 + c
/ BYTEWIDTH
] & (1 << (c
% BYTEWIDTH
)))
2288 /* `not' is 1 if c would match. */
2289 /* That means it is not safe to finalize. */
2291 p
[-3] = (unsigned char) finalize_jump
;
2295 p
-= 2; /* Point at relative address again. */
2296 if (p
[-1] != (unsigned char) finalize_jump
)
2298 p
[-1] = (unsigned char) jump
;
2301 /* Note fall through. */
2303 /* The end of a stupid repeat has a finalize_jump back to the
2304 start, where another failure point will be made which will
2305 point to after all the repetitions found so far. */
2307 /* Take off failure points put on by matching on_failure_jump
2308 because didn't fail. Also remove the register information
2309 put on by the on_failure_jump. */
2311 POP_FAILURE_POINT ();
2312 /* Note fall through. */
2314 /* Jump without taking off any failure points. */
2317 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
2321 case dummy_failure_jump
:
2322 /* Normally, the on_failure_jump pushes a failure point, which
2323 then gets popped at finalize_jump. We will end up at
2324 finalize_jump, also, and with a pattern of, say, `a+', we
2325 are skipping over the on_failure_jump, so we have to push
2326 something meaningless for finalize_jump to pop. */
2327 PUSH_FAILURE_POINT (0, 0);
2331 /* Have to succeed matching what follows at least n times. Then
2332 just handle like an on_failure_jump. */
2334 EXTRACT_NUMBER (mcnt
, p
+ 2);
2335 /* Originally, this is how many times we HAVE to succeed. */
2340 STORE_NUMBER_AND_INCR (p
, mcnt
);
2350 fprintf (stderr
, "regex: the succeed_n's n is not set.\n");
2356 EXTRACT_NUMBER (mcnt
, p
+ 2);
2357 /* Originally, this is how many times we CAN jump. */
2361 STORE_NUMBER(p
+ 2, mcnt
);
2362 goto nofinalize
; /* Do the jump without taking off
2363 any failure points. */
2365 /* If don't have to jump any more, skip over the rest of command. */
2372 register unsigned char *p1
;
2374 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
2376 EXTRACT_NUMBER_AND_INCR (mcnt
, p
);
2377 STORE_NUMBER (p1
, mcnt
);
2381 /* Ignore these. Used to ignore the n of succeed_n's which
2382 currently have n == 0. */
2387 if (AT_WORD_BOUNDARY
)
2392 if (AT_WORD_BOUNDARY
)
2397 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
2398 if (IS_A_LETTER (d
) && (AT_STRINGS_BEG
|| !IS_A_LETTER (d
- 1)))
2403 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
2404 if (!AT_STRINGS_BEG
&& IS_A_LETTER (d
- 1)
2405 && (!IS_A_LETTER (d
) || AT_STRINGS_END
))
2411 if (PTR_CHAR_POS (d
) >= point
)
2416 if (PTR_CHAR_POS (d
) != point
)
2421 if (PTR_CHAR_POS (d
) <= point
)
2433 if (SYNTAX (*d
++) != (enum syntaxcode
) mcnt
) goto fail
;
2439 goto matchnotsyntax
;
2445 if (SYNTAX (*d
++) == (enum syntaxcode
) mcnt
) goto fail
;
2449 #else /* not emacs */
2453 if (!IS_A_LETTER (d
))
2460 if (IS_A_LETTER (d
))
2465 #endif /* not emacs */
2478 /* Match the next few pattern characters exactly.
2479 mcnt is how many characters to match. */
2481 /* This is written out as an if-else so we don't waste time
2482 testing `translate' inside the loop. */
2488 if (translate
[*d
++] != *p
++) goto fail
;
2497 if (*d
++ != *p
++) goto fail
;
2504 continue; /* Successfully executed one pattern command; keep going. */
2506 /* Jump here if any matching operation fails. */
2508 if (stackp
!= stackb
)
2509 /* A restart point is known. Restart there and pop it. */
2511 short last_used_reg
, this_reg
;
2513 /* If this failure point is from a dummy_failure_point, just
2517 POP_FAILURE_POINT ();
2523 if (d
>= string1
&& d
<= end1
)
2525 /* Restore register info. */
2526 last_used_reg
= (short) *--stackp
;
2528 /* Make the ones that weren't saved -1 or 0 again. */
2529 for (this_reg
= RE_NREGS
- 1; this_reg
> last_used_reg
; this_reg
--)
2531 regend
[this_reg
] = (unsigned char *) -1;
2532 regstart
[this_reg
] = (unsigned char *) -1;
2533 IS_ACTIVE (reg_info
[this_reg
]) = 0;
2534 MATCHED_SOMETHING (reg_info
[this_reg
]) = 0;
2537 /* And restore the rest from the stack. */
2538 for ( ; this_reg
> 0; this_reg
--)
2540 reg_info
[this_reg
] = *(struct register_info
*) *--stackp
;
2541 regend
[this_reg
] = *--stackp
;
2542 regstart
[this_reg
] = *--stackp
;
2546 break; /* Matching at this starting point really fails. */
2550 goto restore_best_regs
;
2551 return -1; /* Failure to match. */
2556 bcmp_translate (s1
, s2
, len
, translate
)
2557 unsigned char *s1
, *s2
;
2559 unsigned char *translate
;
2561 register unsigned char *p1
= s1
, *p2
= s2
;
2564 if (translate
[*p1
++] != translate
[*p2
++]) return 1;
2572 /* Entry points compatible with 4.2 BSD regex library. */
2576 static struct re_pattern_buffer re_comp_buf
;
2584 if (!re_comp_buf
.buffer
)
2585 return "No previous regular expression";
2589 if (!re_comp_buf
.buffer
)
2591 if (!(re_comp_buf
.buffer
= (char *) malloc (200)))
2592 return "Memory exhausted";
2593 re_comp_buf
.allocated
= 200;
2594 if (!(re_comp_buf
.fastmap
= (char *) malloc (1 << BYTEWIDTH
)))
2595 return "Memory exhausted";
2597 return re_compile_pattern (s
, strlen (s
), &re_comp_buf
);
2604 int len
= strlen (s
);
2605 return 0 <= re_search (&re_comp_buf
, s
, len
, 0, len
,
2606 (struct re_registers
*) 0);
2608 #endif /* not emacs */
2616 /* Indexed by a character, gives the upper case equivalent of the
2620 { 000, 001, 002, 003, 004, 005, 006, 007,
2621 010, 011, 012, 013, 014, 015, 016, 017,
2622 020, 021, 022, 023, 024, 025, 026, 027,
2623 030, 031, 032, 033, 034, 035, 036, 037,
2624 040, 041, 042, 043, 044, 045, 046, 047,
2625 050, 051, 052, 053, 054, 055, 056, 057,
2626 060, 061, 062, 063, 064, 065, 066, 067,
2627 070, 071, 072, 073, 074, 075, 076, 077,
2628 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2629 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2630 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2631 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
2632 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2633 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2634 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2635 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
2636 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
2637 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
2638 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
2639 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
2640 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
2641 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
2642 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
2643 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
2644 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
2645 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
2646 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
2647 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
2648 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
2649 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
2650 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
2651 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
2658 typedef enum { extended_test
, basic_test
} test_type
;
2660 /* Use this to run the tests we've thought of. */
2665 test_type t
= extended_test
;
2667 if (t
== basic_test
)
2669 printf ("Running basic tests:\n\n");
2670 test_posix_basic ();
2672 else if (t
== extended_test
)
2674 printf ("Running extended tests:\n\n");
2675 test_posix_extended ();
2679 #else /* not canned */
2681 /* Use this to run interactive tests. */
2689 struct re_pattern_buffer buf
;
2692 char fastmap
[(1 << BYTEWIDTH
)];
2694 /* Allow a command argument to specify the style of syntax. */
2696 obscure_syntax
= atoi (argv
[1]);
2699 buf
.buffer
= (char *) malloc (buf
.allocated
);
2700 buf
.fastmap
= fastmap
;
2701 buf
.translate
= upcase
;
2709 re_compile_pattern (pat
, strlen(pat
), &buf
);
2711 for (i
= 0; i
< buf
.used
; i
++)
2712 printchar (buf
.buffer
[i
]);
2716 printf ("%d allocated, %d used.\n", buf
.allocated
, buf
.used
);
2718 re_compile_fastmap (&buf
);
2719 printf ("Allowed by fastmap: ");
2720 for (i
= 0; i
< (1 << BYTEWIDTH
); i
++)
2721 if (fastmap
[i
]) printchar (i
);
2725 gets (pat
); /* Now read the string to match against */
2727 i
= re_match (&buf
, pat
, strlen (pat
), 0, 0);
2728 printf ("Match value %d.\n", i
);
2737 struct re_pattern_buffer
*bufp
;
2741 printf ("buf is :\n----------------\n");
2742 for (i
= 0; i
< bufp
->used
; i
++)
2743 printchar (bufp
->buffer
[i
]);
2745 printf ("\n%d allocated, %d used.\n", bufp
->allocated
, bufp
->used
);
2747 printf ("Allowed by fastmap: ");
2748 for (i
= 0; i
< (1 << BYTEWIDTH
); i
++)
2749 if (bufp
->fastmap
[i
])
2751 printf ("\nAllowed by translate: ");
2752 if (bufp
->translate
)
2753 for (i
= 0; i
< (1 << BYTEWIDTH
); i
++)
2754 if (bufp
->translate
[i
])
2756 printf ("\nfastmap is%s accurate\n", bufp
->fastmap_accurate
? "" : "n't");
2757 printf ("can %s be null\n----------", bufp
->can_be_null
? "" : "not");
2764 if (c
< 040 || c
>= 0177)
2767 putchar (((c
>> 6) & 3) + '0');
2768 putchar (((c
>> 3) & 7) + '0');
2769 putchar ((c
& 7) + '0');