Fix mdoc(7)/man(7) mix up.
[netbsd-mini2440.git] / usr.bin / sed / regex.c
bloba24d05cec870919abc06ebe2e11b57d720903e63
1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989-90 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
19 /* To test, compile with -Dtest. This Dtestable feature turns this into
20 a self-contained program which reads a pattern, describes how it
21 compiles, then reads a string and searches for it.
23 On the other hand, if you compile with both -Dtest and -Dcanned you
24 can run some tests we've already thought of. */
26 /* AIX requires the alloca decl to be the first thing in the file. */
27 #ifdef __GNUC__
28 #define alloca __builtin_alloca
29 #else
30 #ifdef sparc
31 #include <alloca.h>
32 #else
33 #ifdef _AIX
34 #pragma alloca
35 #else
36 char *alloca ();
37 #endif
38 #endif
39 #endif
41 #ifdef emacs
43 /* The `emacs' switch turns on certain special matching commands
44 that make sense only in emacs. */
46 #include "config.h"
47 #include "lisp.h"
48 #include "buffer.h"
49 #include "syntax.h"
51 #else /* not emacs */
53 #if defined (USG) || defined (STDC_HEADERS)
54 #ifndef BSTRING
55 #include <string.h>
56 #define bcopy(s,d,n) memcpy((d),(s),(n))
57 #define bcmp(s1,s2,n) memcmp((s1),(s2),(n))
58 #define bzero(s,n) memset((s),0,(n))
59 #endif
60 #endif
62 #ifdef STDC_HEADERS
63 #include <stdlib.h>
64 #else
65 char *malloc ();
66 char *realloc ();
67 #endif
69 /* Define the syntax stuff, so we can do the \<, \>, etc. */
71 /* This must be nonzero for the wordchar and notwordchar pattern
72 commands in re_match_2. */
73 #ifndef Sword
74 #define Sword 1
75 #endif
77 #define SYNTAX(c) re_syntax_table[c]
80 #ifdef SYNTAX_TABLE
82 char *re_syntax_table;
84 #else /* not SYNTAX_TABLE */
86 static char re_syntax_table[256];
89 static void
90 init_syntax_once ()
92 register int c;
93 static int done = 0;
95 if (done)
96 return;
98 bzero (re_syntax_table, sizeof re_syntax_table);
100 for (c = 'a'; c <= 'z'; c++)
101 re_syntax_table[c] = Sword;
103 for (c = 'A'; c <= 'Z'; c++)
104 re_syntax_table[c] = Sword;
106 for (c = '0'; c <= '9'; c++)
107 re_syntax_table[c] = Sword;
109 done = 1;
112 #endif /* SYNTAX_TABLE */
113 #endif /* emacs */
115 /* We write fatal error messages on standard error. */
116 #include <stdio.h>
118 /* isalpha(3) etc. are used for the character classes. */
119 #include <ctype.h>
120 /* Sequents are missing isgraph. */
121 #ifndef isgraph
122 #define isgraph(c) (isprint((c)) && !isspace((c)))
123 #endif
125 /* Get the interface, including the syntax bits. */
126 #include "regex.h"
129 /* These are the command codes that appear in compiled regular
130 expressions, one per byte. Some command codes are followed by
131 argument bytes. A command code can specify any interpretation
132 whatsoever for its arguments. Zero-bytes may appear in the compiled
133 regular expression.
135 The value of `exactn' is needed in search.c (search_buffer) in emacs.
136 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
137 `exactn' we use here must also be 1. */
139 enum regexpcode
141 unused=0,
142 exactn=1, /* Followed by one byte giving n, then by n literal bytes. */
143 begline, /* Fail unless at beginning of line. */
144 endline, /* Fail unless at end of line. */
145 jump, /* Followed by two bytes giving relative address to jump to. */
146 on_failure_jump, /* Followed by two bytes giving relative address of
147 place to resume at in case of failure. */
148 finalize_jump, /* Throw away latest failure point and then jump to
149 address. */
150 maybe_finalize_jump, /* Like jump but finalize if safe to do so.
151 This is used to jump back to the beginning
152 of a repeat. If the command that follows
153 this jump is clearly incompatible with the
154 one at the beginning of the repeat, such that
155 we can be sure that there is no use backtracking
156 out of repetitions already completed,
157 then we finalize. */
158 dummy_failure_jump, /* Jump, and push a dummy failure point. This
159 failure point will be thrown away if an attempt
160 is made to use it for a failure. A + construct
161 makes this before the first repeat. Also
162 use it as an intermediary kind of jump when
163 compiling an or construct. */
164 succeed_n, /* Used like on_failure_jump except has to succeed n times;
165 then gets turned into an on_failure_jump. The relative
166 address following it is useless until then. The
167 address is followed by two bytes containing n. */
168 jump_n, /* Similar to jump, but jump n times only; also the relative
169 address following is in turn followed by yet two more bytes
170 containing n. */
171 set_number_at, /* Set the following relative location to the
172 subsequent number. */
173 anychar, /* Matches any (more or less) one character. */
174 charset, /* Matches any one char belonging to specified set.
175 First following byte is number of bitmap bytes.
176 Then come bytes for a bitmap saying which chars are in.
177 Bits in each byte are ordered low-bit-first.
178 A character is in the set if its bit is 1.
179 A character too large to have a bit in the map
180 is automatically not in the set. */
181 charset_not, /* Same parameters as charset, but match any character
182 that is not one of those specified. */
183 start_memory, /* Start remembering the text that is matched, for
184 storing in a memory register. Followed by one
185 byte containing the register number. Register numbers
186 must be in the range 0 through RE_NREGS. */
187 stop_memory, /* Stop remembering the text that is matched
188 and store it in a memory register. Followed by
189 one byte containing the register number. Register
190 numbers must be in the range 0 through RE_NREGS. */
191 duplicate, /* Match a duplicate of something remembered.
192 Followed by one byte containing the index of the memory
193 register. */
194 before_dot, /* Succeeds if before point. */
195 at_dot, /* Succeeds if at point. */
196 after_dot, /* Succeeds if after point. */
197 begbuf, /* Succeeds if at beginning of buffer. */
198 endbuf, /* Succeeds if at end of buffer. */
199 wordchar, /* Matches any word-constituent character. */
200 notwordchar, /* Matches any char that is not a word-constituent. */
201 wordbeg, /* Succeeds if at word beginning. */
202 wordend, /* Succeeds if at word end. */
203 wordbound, /* Succeeds if at a word boundary. */
204 notwordbound,/* Succeeds if not at a word boundary. */
205 syntaxspec, /* Matches any character whose syntax is specified.
206 followed by a byte which contains a syntax code,
207 e.g., Sword. */
208 notsyntaxspec /* Matches any character whose syntax differs from
209 that specified. */
213 /* Number of failure points to allocate space for initially,
214 when matching. If this number is exceeded, more space is allocated,
215 so it is not a hard limit. */
217 #ifndef NFAILURES
218 #define NFAILURES 80
219 #endif
221 #ifdef CHAR_UNSIGNED
222 #define SIGN_EXTEND_CHAR(c) ((c)>(char)127?(c)-256:(c)) /* for IBM RT */
223 #endif
224 #ifndef SIGN_EXTEND_CHAR
225 #define SIGN_EXTEND_CHAR(x) (x)
226 #endif
229 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
230 #define STORE_NUMBER(destination, number) \
231 { (destination)[0] = (number) & 0377; \
232 (destination)[1] = (number) >> 8; }
234 /* Same as STORE_NUMBER, except increment the destination pointer to
235 the byte after where the number is stored. Watch out that values for
236 DESTINATION such as p + 1 won't work, whereas p will. */
237 #define STORE_NUMBER_AND_INCR(destination, number) \
238 { STORE_NUMBER(destination, number); \
239 (destination) += 2; }
242 /* Put into DESTINATION a number stored in two contingous bytes starting
243 at SOURCE. */
244 #define EXTRACT_NUMBER(destination, source) \
245 { (destination) = *(source) & 0377; \
246 (destination) += SIGN_EXTEND_CHAR (*(char *)((source) + 1)) << 8; }
248 /* Same as EXTRACT_NUMBER, except increment the pointer for source to
249 point to second byte of SOURCE. Note that SOURCE has to be a value
250 such as p, not, e.g., p + 1. */
251 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
252 { EXTRACT_NUMBER (destination, source); \
253 (source) += 2; }
256 /* Specify the precise syntax of regexps for compilation. This provides
257 for compatibility for various utilities which historically have
258 different, incompatible syntaxes.
260 The argument SYNTAX is a bit-mask comprised of the various bits
261 defined in regex.h. */
264 re_set_syntax (syntax)
265 int syntax;
267 int ret;
269 ret = obscure_syntax;
270 obscure_syntax = syntax;
271 return ret;
274 /* Set by re_set_syntax to the current regexp syntax to recognize. */
275 int obscure_syntax = 0;
279 /* Macros for re_compile_pattern, which is found below these definitions. */
281 #define CHAR_CLASS_MAX_LENGTH 6
283 /* Fetch the next character in the uncompiled pattern, translating it if
284 necessary. */
285 #define PATFETCH(c) \
286 {if (p == pend) goto end_of_pattern; \
287 c = * (unsigned char *) p++; \
288 if (translate) c = translate[c]; }
290 /* Fetch the next character in the uncompiled pattern, with no
291 translation. */
292 #define PATFETCH_RAW(c) \
293 {if (p == pend) goto end_of_pattern; \
294 c = * (unsigned char *) p++; }
296 #define PATUNFETCH p--
299 /* If the buffer isn't allocated when it comes in, use this. */
300 #define INIT_BUF_SIZE 28
302 /* Make sure we have at least N more bytes of space in buffer. */
303 #define GET_BUFFER_SPACE(n) \
305 while (b - bufp->buffer + (n) >= bufp->allocated) \
306 EXTEND_BUFFER; \
309 /* Make sure we have one more byte of buffer space and then add CH to it. */
310 #define BUFPUSH(ch) \
312 GET_BUFFER_SPACE (1); \
313 *b++ = (char) (ch); \
316 /* Extend the buffer by twice its current size via reallociation and
317 reset the pointers that pointed into the old allocation to point to
318 the correct places in the new allocation. If extending the buffer
319 results in it being larger than 1 << 16, then flag memory exhausted. */
320 #define EXTEND_BUFFER \
321 { char *old_buffer = bufp->buffer; \
322 if (bufp->allocated == (1L<<16)) goto too_big; \
323 bufp->allocated *= 2; \
324 if (bufp->allocated > (1L<<16)) bufp->allocated = (1L<<16); \
325 bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated); \
326 if (bufp->buffer == 0) \
327 goto memory_exhausted; \
328 b = (b - old_buffer) + bufp->buffer; \
329 if (fixup_jump) \
330 fixup_jump = (fixup_jump - old_buffer) + bufp->buffer; \
331 if (laststart) \
332 laststart = (laststart - old_buffer) + bufp->buffer; \
333 begalt = (begalt - old_buffer) + bufp->buffer; \
334 if (pending_exact) \
335 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
338 /* Set the bit for character C in a character set list. */
339 #define SET_LIST_BIT(c) (b[(c) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
341 /* Get the next unsigned number in the uncompiled pattern. */
342 #define GET_UNSIGNED_NUMBER(num) \
343 { if (p != pend) \
345 PATFETCH (c); \
346 while (isdigit (c)) \
348 if (num < 0) \
349 num = 0; \
350 num = num * 10 + c - '0'; \
351 if (p == pend) \
352 break; \
353 PATFETCH (c); \
358 /* Subroutines for re_compile_pattern. */
359 static void store_jump (), insert_jump (), store_jump_n (),
360 insert_jump_n (), insert_op_2 ();
363 /* re_compile_pattern takes a regular-expression string
364 and converts it into a buffer full of byte commands for matching.
366 PATTERN is the address of the pattern string
367 SIZE is the length of it.
368 BUFP is a struct re_pattern_buffer * which points to the info
369 on where to store the byte commands.
370 This structure contains a char * which points to the
371 actual space, which should have been obtained with malloc.
372 re_compile_pattern may use realloc to grow the buffer space.
374 The number of bytes of commands can be found out by looking in
375 the `struct re_pattern_buffer' that bufp pointed to, after
376 re_compile_pattern returns. */
378 char *
379 re_compile_pattern (pattern, size, bufp)
380 char *pattern;
381 int size;
382 struct re_pattern_buffer *bufp;
384 register char *b = bufp->buffer;
385 register char *p = pattern;
386 char *pend = pattern + size;
387 register unsigned c, c1;
388 char *p1;
389 unsigned char *translate = (unsigned char *) bufp->translate;
391 /* Address of the count-byte of the most recently inserted `exactn'
392 command. This makes it possible to tell whether a new exact-match
393 character can be added to that command or requires a new `exactn'
394 command. */
396 char *pending_exact = 0;
398 /* Address of the place where a forward-jump should go to the end of
399 the containing expression. Each alternative of an `or', except the
400 last, ends with a forward-jump of this sort. */
402 char *fixup_jump = 0;
404 /* Address of start of the most recently finished expression.
405 This tells postfix * where to find the start of its operand. */
407 char *laststart = 0;
409 /* In processing a repeat, 1 means zero matches is allowed. */
411 char zero_times_ok;
413 /* In processing a repeat, 1 means many matches is allowed. */
415 char many_times_ok;
417 /* Address of beginning of regexp, or inside of last \(. */
419 char *begalt = b;
421 /* In processing an interval, at least this many matches must be made. */
422 int lower_bound;
424 /* In processing an interval, at most this many matches can be made. */
425 int upper_bound;
427 /* Place in pattern (i.e., the {) to which to go back if the interval
428 is invalid. */
429 char *beg_interval = 0;
431 /* Stack of information saved by \( and restored by \).
432 Four stack elements are pushed by each \(:
433 First, the value of b.
434 Second, the value of fixup_jump.
435 Third, the value of regnum.
436 Fourth, the value of begalt. */
438 int stackb[40];
439 int *stackp = stackb;
440 int *stacke = stackb + 40;
441 int *stackt;
443 /* Counts \('s as they are encountered. Remembered for the matching \),
444 where it becomes the register number to put in the stop_memory
445 command. */
447 int regnum = 1;
449 bufp->fastmap_accurate = 0;
451 #ifndef emacs
452 #ifndef SYNTAX_TABLE
453 /* Initialize the syntax table. */
454 init_syntax_once();
455 #endif
456 #endif
458 if (bufp->allocated == 0)
460 bufp->allocated = INIT_BUF_SIZE;
461 if (bufp->buffer)
462 /* EXTEND_BUFFER loses when bufp->allocated is 0. */
463 bufp->buffer = (char *) realloc (bufp->buffer, INIT_BUF_SIZE);
464 else
465 /* Caller did not allocate a buffer. Do it for them. */
466 bufp->buffer = (char *) malloc (INIT_BUF_SIZE);
467 if (!bufp->buffer) goto memory_exhausted;
468 begalt = b = bufp->buffer;
471 while (p != pend)
473 PATFETCH (c);
475 switch (c)
477 case '$':
479 char *p1 = p;
480 /* When testing what follows the $,
481 look past the \-constructs that don't consume anything. */
482 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
483 while (p1 != pend)
485 if (*p1 == '\\' && p1 + 1 != pend
486 && (p1[1] == '<' || p1[1] == '>'
487 || p1[1] == '`' || p1[1] == '\''
488 #ifdef emacs
489 || p1[1] == '='
490 #endif
491 || p1[1] == 'b' || p1[1] == 'B'))
492 p1 += 2;
493 else
494 break;
496 if (obscure_syntax & RE_TIGHT_VBAR)
498 if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS) && p1 != pend)
499 goto normal_char;
500 /* Make operand of last vbar end before this `$'. */
501 if (fixup_jump)
502 store_jump (fixup_jump, jump, b);
503 fixup_jump = 0;
504 BUFPUSH (endline);
505 break;
507 /* $ means succeed if at end of line, but only in special contexts.
508 If validly in the middle of a pattern, it is a normal character. */
510 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && p1 != pend)
511 goto invalid_pattern;
512 if (p1 == pend || *p1 == '\n'
513 || (obscure_syntax & RE_CONTEXT_INDEP_OPS)
514 || (obscure_syntax & RE_NO_BK_PARENS
515 ? *p1 == ')'
516 : *p1 == '\\' && p1[1] == ')')
517 || (obscure_syntax & RE_NO_BK_VBAR
518 ? *p1 == '|'
519 : *p1 == '\\' && p1[1] == '|'))
521 BUFPUSH (endline);
522 break;
524 goto normal_char;
526 case '^':
527 /* ^ means succeed if at beg of line, but only if no preceding
528 pattern. */
530 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS) && laststart)
531 goto invalid_pattern;
532 if (laststart && p - 2 >= pattern && p[-2] != '\n'
533 && !(obscure_syntax & RE_CONTEXT_INDEP_OPS))
534 goto normal_char;
535 if (obscure_syntax & RE_TIGHT_VBAR)
537 if (p != pattern + 1
538 && ! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
539 goto normal_char;
540 BUFPUSH (begline);
541 begalt = b;
543 else
544 BUFPUSH (begline);
545 break;
547 case '+':
548 case '?':
549 if ((obscure_syntax & RE_BK_PLUS_QM)
550 || (obscure_syntax & RE_LIMITED_OPS))
551 goto normal_char;
552 handle_plus:
553 case '*':
554 /* If there is no previous pattern, char not special. */
555 if (!laststart)
557 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
558 goto invalid_pattern;
559 else if (! (obscure_syntax & RE_CONTEXT_INDEP_OPS))
560 goto normal_char;
562 /* If there is a sequence of repetition chars,
563 collapse it down to just one. */
564 zero_times_ok = 0;
565 many_times_ok = 0;
566 while (1)
568 zero_times_ok |= c != '+';
569 many_times_ok |= c != '?';
570 if (p == pend)
571 break;
572 PATFETCH (c);
573 if (c == '*')
575 else if (!(obscure_syntax & RE_BK_PLUS_QM)
576 && (c == '+' || c == '?'))
578 else if ((obscure_syntax & RE_BK_PLUS_QM)
579 && c == '\\')
581 int c1;
582 PATFETCH (c1);
583 if (!(c1 == '+' || c1 == '?'))
585 PATUNFETCH;
586 PATUNFETCH;
587 break;
589 c = c1;
591 else
593 PATUNFETCH;
594 break;
598 /* Star, etc. applied to an empty pattern is equivalent
599 to an empty pattern. */
600 if (!laststart)
601 break;
603 /* Now we know whether or not zero matches is allowed
604 and also whether or not two or more matches is allowed. */
605 if (many_times_ok)
607 /* If more than one repetition is allowed, put in at the
608 end a backward relative jump from b to before the next
609 jump we're going to put in below (which jumps from
610 laststart to after this jump). */
611 GET_BUFFER_SPACE (3);
612 store_jump (b, maybe_finalize_jump, laststart - 3);
613 b += 3; /* Because store_jump put stuff here. */
615 /* On failure, jump from laststart to b + 3, which will be the
616 end of the buffer after this jump is inserted. */
617 GET_BUFFER_SPACE (3);
618 insert_jump (on_failure_jump, laststart, b + 3, b);
619 pending_exact = 0;
620 b += 3;
621 if (!zero_times_ok)
623 /* At least one repetition is required, so insert a
624 dummy-failure before the initial on-failure-jump
625 instruction of the loop. This effects a skip over that
626 instruction the first time we hit that loop. */
627 GET_BUFFER_SPACE (6);
628 insert_jump (dummy_failure_jump, laststart, laststart + 6, b);
629 b += 3;
631 break;
633 case '.':
634 laststart = b;
635 BUFPUSH (anychar);
636 break;
638 case '[':
639 if (p == pend)
640 goto invalid_pattern;
641 while (b - bufp->buffer
642 > bufp->allocated - 3 - (1 << BYTEWIDTH) / BYTEWIDTH)
643 EXTEND_BUFFER;
645 laststart = b;
646 if (*p == '^')
648 BUFPUSH (charset_not);
649 p++;
651 else
652 BUFPUSH (charset);
653 p1 = p;
655 BUFPUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
656 /* Clear the whole map */
657 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
659 if ((obscure_syntax & RE_HAT_NOT_NEWLINE) && b[-2] == charset_not)
660 SET_LIST_BIT ('\n');
663 /* Read in characters and ranges, setting map bits. */
664 while (1)
666 /* Don't translate while fetching, in case it's a range bound.
667 When we set the bit for the character, we translate it. */
668 PATFETCH_RAW (c);
670 /* If set, \ escapes characters when inside [...]. */
671 if ((obscure_syntax & RE_AWK_CLASS_HACK) && c == '\\')
673 PATFETCH(c1);
674 SET_LIST_BIT (c1);
675 continue;
677 if (c == ']')
679 if (p == p1 + 1)
681 /* If this is an empty bracket expression. */
682 if ((obscure_syntax & RE_NO_EMPTY_BRACKETS)
683 && p == pend)
684 goto invalid_pattern;
686 else
687 /* Stop if this isn't merely a ] inside a bracket
688 expression, but rather the end of a bracket
689 expression. */
690 break;
692 /* Get a range. */
693 if (p[0] == '-' && p[1] != ']')
695 PATFETCH (c1);
696 /* Don't translate the range bounds while fetching them. */
697 PATFETCH_RAW (c1);
699 if ((obscure_syntax & RE_NO_EMPTY_RANGES) && c > c1)
700 goto invalid_pattern;
702 if ((obscure_syntax & RE_NO_HYPHEN_RANGE_END)
703 && c1 == '-' && *p != ']')
704 goto invalid_pattern;
706 while (c <= c1)
708 /* Translate each char that's in the range. */
709 if (translate)
710 SET_LIST_BIT (translate[c]);
711 else
712 SET_LIST_BIT (c);
713 c++;
716 else if ((obscure_syntax & RE_CHAR_CLASSES)
717 && c == '[' && p[0] == ':')
719 /* Longest valid character class word has six characters. */
720 char str[CHAR_CLASS_MAX_LENGTH];
721 PATFETCH (c);
722 c1 = 0;
723 /* If no ] at end. */
724 if (p == pend)
725 goto invalid_pattern;
726 while (1)
728 /* Don't translate the ``character class'' characters. */
729 PATFETCH_RAW (c);
730 if (c == ':' || c == ']' || p == pend
731 || c1 == CHAR_CLASS_MAX_LENGTH)
732 break;
733 str[c1++] = c;
735 str[c1] = '\0';
736 if (p == pend
737 || c == ']' /* End of the bracket expression. */
738 || p[0] != ']'
739 || p + 1 == pend
740 || (strcmp (str, "alpha") != 0
741 && strcmp (str, "upper") != 0
742 && strcmp (str, "lower") != 0
743 && strcmp (str, "digit") != 0
744 && strcmp (str, "alnum") != 0
745 && strcmp (str, "xdigit") != 0
746 && strcmp (str, "space") != 0
747 && strcmp (str, "print") != 0
748 && strcmp (str, "punct") != 0
749 && strcmp (str, "graph") != 0
750 && strcmp (str, "cntrl") != 0))
752 /* Undo the ending character, the letters, and leave
753 the leading : and [ (but set bits for them). */
754 c1++;
755 while (c1--)
756 PATUNFETCH;
757 SET_LIST_BIT ('[');
758 SET_LIST_BIT (':');
760 else
762 /* The ] at the end of the character class. */
763 PATFETCH (c);
764 if (c != ']')
765 goto invalid_pattern;
766 for (c = 0; c < (1 << BYTEWIDTH); c++)
768 if ((strcmp (str, "alpha") == 0 && isalpha (c))
769 || (strcmp (str, "upper") == 0 && isupper (c))
770 || (strcmp (str, "lower") == 0 && islower (c))
771 || (strcmp (str, "digit") == 0 && isdigit (c))
772 || (strcmp (str, "alnum") == 0 && isalnum (c))
773 || (strcmp (str, "xdigit") == 0 && isxdigit (c))
774 || (strcmp (str, "space") == 0 && isspace (c))
775 || (strcmp (str, "print") == 0 && isprint (c))
776 || (strcmp (str, "punct") == 0 && ispunct (c))
777 || (strcmp (str, "graph") == 0 && isgraph (c))
778 || (strcmp (str, "cntrl") == 0 && iscntrl (c)))
779 SET_LIST_BIT (c);
783 else if (translate)
784 SET_LIST_BIT (translate[c]);
785 else
786 SET_LIST_BIT (c);
789 /* Discard any character set/class bitmap bytes that are all
790 0 at the end of the map. Decrement the map-length byte too. */
791 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
792 b[-1]--;
793 b += b[-1];
794 break;
796 case '(':
797 if (! (obscure_syntax & RE_NO_BK_PARENS))
798 goto normal_char;
799 else
800 goto handle_open;
802 case ')':
803 if (! (obscure_syntax & RE_NO_BK_PARENS))
804 goto normal_char;
805 else
806 goto handle_close;
808 case '\n':
809 if (! (obscure_syntax & RE_NEWLINE_OR))
810 goto normal_char;
811 else
812 goto handle_bar;
814 case '|':
815 if ((obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
816 && (! laststart || p == pend))
817 goto invalid_pattern;
818 else if (! (obscure_syntax & RE_NO_BK_VBAR))
819 goto normal_char;
820 else
821 goto handle_bar;
823 case '{':
824 if (! ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
825 && (obscure_syntax & RE_INTERVALS)))
826 goto normal_char;
827 else
828 goto handle_interval;
830 case '\\':
831 if (p == pend) goto invalid_pattern;
832 PATFETCH_RAW (c);
833 switch (c)
835 case '(':
836 if (obscure_syntax & RE_NO_BK_PARENS)
837 goto normal_backsl;
838 handle_open:
839 if (stackp == stacke) goto nesting_too_deep;
841 /* Laststart should point to the start_memory that we are about
842 to push (unless the pattern has RE_NREGS or more ('s). */
843 *stackp++ = b - bufp->buffer;
844 if (regnum < RE_NREGS)
846 BUFPUSH (start_memory);
847 BUFPUSH (regnum);
849 *stackp++ = fixup_jump ? fixup_jump - bufp->buffer + 1 : 0;
850 *stackp++ = regnum++;
851 *stackp++ = begalt - bufp->buffer;
852 fixup_jump = 0;
853 laststart = 0;
854 begalt = b;
855 break;
857 case ')':
858 if (obscure_syntax & RE_NO_BK_PARENS)
859 goto normal_backsl;
860 handle_close:
861 if (stackp == stackb) goto unmatched_close;
862 begalt = *--stackp + bufp->buffer;
863 if (fixup_jump)
864 store_jump (fixup_jump, jump, b);
865 if (stackp[-1] < RE_NREGS)
867 BUFPUSH (stop_memory);
868 BUFPUSH (stackp[-1]);
870 stackp -= 2;
871 fixup_jump = *stackp ? *stackp + bufp->buffer - 1 : 0;
872 laststart = *--stackp + bufp->buffer;
873 break;
875 case '|':
876 if ((obscure_syntax & RE_LIMITED_OPS)
877 || (obscure_syntax & RE_NO_BK_VBAR))
878 goto normal_backsl;
879 handle_bar:
880 if (obscure_syntax & RE_LIMITED_OPS)
881 goto normal_char;
882 /* Insert before the previous alternative a jump which
883 jumps to this alternative if the former fails. */
884 GET_BUFFER_SPACE (6);
885 insert_jump (on_failure_jump, begalt, b + 6, b);
886 pending_exact = 0;
887 b += 3;
888 /* The alternative before the previous alternative has a
889 jump after it which gets executed if it gets matched.
890 Adjust that jump so it will jump to the previous
891 alternative's analogous jump (put in below, which in
892 turn will jump to the next (if any) alternative's such
893 jump, etc.). The last such jump jumps to the correct
894 final destination. */
895 if (fixup_jump)
896 store_jump (fixup_jump, jump, b);
898 /* Leave space for a jump after previous alternative---to be
899 filled in later. */
900 fixup_jump = b;
901 b += 3;
903 laststart = 0;
904 begalt = b;
905 break;
907 case '{':
908 if (! (obscure_syntax & RE_INTERVALS)
909 /* Let \{ be a literal. */
910 || ((obscure_syntax & RE_INTERVALS)
911 && (obscure_syntax & RE_NO_BK_CURLY_BRACES))
912 /* If it's the string "\{". */
913 || (p - 2 == pattern && p == pend))
914 goto normal_backsl;
915 handle_interval:
916 beg_interval = p - 1; /* The {. */
917 /* If there is no previous pattern, this isn't an interval. */
918 if (!laststart)
920 if (obscure_syntax & RE_CONTEXTUAL_INVALID_OPS)
921 goto invalid_pattern;
922 else
923 goto normal_backsl;
925 /* It also isn't an interval if not preceded by an re
926 matching a single character or subexpression, or if
927 the current type of intervals can't handle back
928 references and the previous thing is a back reference. */
929 if (! (*laststart == anychar
930 || *laststart == charset
931 || *laststart == charset_not
932 || *laststart == start_memory
933 || (*laststart == exactn && laststart[1] == 1)
934 || (! (obscure_syntax & RE_NO_BK_REFS)
935 && *laststart == duplicate)))
937 if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
938 goto normal_char;
940 /* Posix extended syntax is handled in previous
941 statement; this is for Posix basic syntax. */
942 if (obscure_syntax & RE_INTERVALS)
943 goto invalid_pattern;
945 goto normal_backsl;
947 lower_bound = -1; /* So can see if are set. */
948 upper_bound = -1;
949 GET_UNSIGNED_NUMBER (lower_bound);
950 if (c == ',')
952 GET_UNSIGNED_NUMBER (upper_bound);
953 if (upper_bound < 0)
954 upper_bound = RE_DUP_MAX;
956 if (upper_bound < 0)
957 upper_bound = lower_bound;
958 if (! (obscure_syntax & RE_NO_BK_CURLY_BRACES))
960 if (c != '\\')
961 goto invalid_pattern;
962 PATFETCH (c);
964 if (c != '}' || lower_bound < 0 || upper_bound > RE_DUP_MAX
965 || lower_bound > upper_bound
966 || ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
967 && p != pend && *p == '{'))
969 if (obscure_syntax & RE_NO_BK_CURLY_BRACES)
970 goto unfetch_interval;
971 else
972 goto invalid_pattern;
975 /* If upper_bound is zero, don't want to succeed at all;
976 jump from laststart to b + 3, which will be the end of
977 the buffer after this jump is inserted. */
979 if (upper_bound == 0)
981 GET_BUFFER_SPACE (3);
982 insert_jump (jump, laststart, b + 3, b);
983 b += 3;
986 /* Otherwise, after lower_bound number of succeeds, jump
987 to after the jump_n which will be inserted at the end
988 of the buffer, and insert that jump_n. */
989 else
990 { /* Set to 5 if only one repetition is allowed and
991 hence no jump_n is inserted at the current end of
992 the buffer; then only space for the succeed_n is
993 needed. Otherwise, need space for both the
994 succeed_n and the jump_n. */
996 unsigned slots_needed = upper_bound == 1 ? 5 : 10;
998 GET_BUFFER_SPACE (slots_needed);
999 /* Initialize the succeed_n to n, even though it will
1000 be set by its attendant set_number_at, because
1001 re_compile_fastmap will need to know it. Jump to
1002 what the end of buffer will be after inserting
1003 this succeed_n and possibly appending a jump_n. */
1004 insert_jump_n (succeed_n, laststart, b + slots_needed,
1005 b, lower_bound);
1006 b += 5; /* Just increment for the succeed_n here. */
1008 /* More than one repetition is allowed, so put in at
1009 the end of the buffer a backward jump from b to the
1010 succeed_n we put in above. By the time we've gotten
1011 to this jump when matching, we'll have matched once
1012 already, so jump back only upper_bound - 1 times. */
1014 if (upper_bound > 1)
1016 store_jump_n (b, jump_n, laststart, upper_bound - 1);
1017 b += 5;
1018 /* When hit this when matching, reset the
1019 preceding jump_n's n to upper_bound - 1. */
1020 BUFPUSH (set_number_at);
1021 GET_BUFFER_SPACE (2);
1022 STORE_NUMBER_AND_INCR (b, -5);
1023 STORE_NUMBER_AND_INCR (b, upper_bound - 1);
1025 /* When hit this when matching, set the succeed_n's n. */
1026 GET_BUFFER_SPACE (5);
1027 insert_op_2 (set_number_at, laststart, b, 5, lower_bound);
1028 b += 5;
1030 pending_exact = 0;
1031 beg_interval = 0;
1032 break;
1035 unfetch_interval:
1036 /* If an invalid interval, match the characters as literals. */
1037 if (beg_interval)
1038 p = beg_interval;
1039 else
1041 fprintf (stderr,
1042 "regex: no interval beginning to which to backtrack.\n");
1043 exit (1);
1046 beg_interval = 0;
1047 PATFETCH (c); /* normal_char expects char in `c'. */
1048 goto normal_char;
1049 break;
1051 #ifdef emacs
1052 case '=':
1053 BUFPUSH (at_dot);
1054 break;
1056 case 's':
1057 laststart = b;
1058 BUFPUSH (syntaxspec);
1059 PATFETCH (c);
1060 BUFPUSH (syntax_spec_code[c]);
1061 break;
1063 case 'S':
1064 laststart = b;
1065 BUFPUSH (notsyntaxspec);
1066 PATFETCH (c);
1067 BUFPUSH (syntax_spec_code[c]);
1068 break;
1069 #endif /* emacs */
1071 case 'w':
1072 laststart = b;
1073 BUFPUSH (wordchar);
1074 break;
1076 case 'W':
1077 laststart = b;
1078 BUFPUSH (notwordchar);
1079 break;
1081 case '<':
1082 BUFPUSH (wordbeg);
1083 break;
1085 case '>':
1086 BUFPUSH (wordend);
1087 break;
1089 case 'b':
1090 BUFPUSH (wordbound);
1091 break;
1093 case 'B':
1094 BUFPUSH (notwordbound);
1095 break;
1097 case '`':
1098 BUFPUSH (begbuf);
1099 break;
1101 case '\'':
1102 BUFPUSH (endbuf);
1103 break;
1105 case '1':
1106 case '2':
1107 case '3':
1108 case '4':
1109 case '5':
1110 case '6':
1111 case '7':
1112 case '8':
1113 case '9':
1114 if (obscure_syntax & RE_NO_BK_REFS)
1115 goto normal_char;
1116 c1 = c - '0';
1117 if (c1 >= regnum)
1119 if (obscure_syntax & RE_NO_EMPTY_BK_REF)
1120 goto invalid_pattern;
1121 else
1122 goto normal_char;
1124 /* Can't back reference to a subexpression if inside of it. */
1125 for (stackt = stackp - 2; stackt > stackb; stackt -= 4)
1126 if (*stackt == c1)
1127 goto normal_char;
1128 laststart = b;
1129 BUFPUSH (duplicate);
1130 BUFPUSH (c1);
1131 break;
1133 case '+':
1134 case '?':
1135 if (obscure_syntax & RE_BK_PLUS_QM)
1136 goto handle_plus;
1137 else
1138 goto normal_backsl;
1139 break;
1141 default:
1142 normal_backsl:
1143 /* You might think it would be useful for \ to mean
1144 not to translate; but if we don't translate it
1145 it will never match anything. */
1146 if (translate) c = translate[c];
1147 goto normal_char;
1149 break;
1151 default:
1152 normal_char: /* Expects the character in `c'. */
1153 if (!pending_exact || pending_exact + *pending_exact + 1 != b
1154 || *pending_exact == 0177 || *p == '*' || *p == '^'
1155 || ((obscure_syntax & RE_BK_PLUS_QM)
1156 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
1157 : (*p == '+' || *p == '?'))
1158 || ((obscure_syntax & RE_INTERVALS)
1159 && ((obscure_syntax & RE_NO_BK_CURLY_BRACES)
1160 ? *p == '{'
1161 : (p[0] == '\\' && p[1] == '{'))))
1163 laststart = b;
1164 BUFPUSH (exactn);
1165 pending_exact = b;
1166 BUFPUSH (0);
1168 BUFPUSH (c);
1169 (*pending_exact)++;
1173 if (fixup_jump)
1174 store_jump (fixup_jump, jump, b);
1176 if (stackp != stackb) goto unmatched_open;
1178 bufp->used = b - bufp->buffer;
1179 return 0;
1181 invalid_pattern:
1182 return "Invalid regular expression";
1184 unmatched_open:
1185 return "Unmatched \\(";
1187 unmatched_close:
1188 return "Unmatched \\)";
1190 end_of_pattern:
1191 return "Premature end of regular expression";
1193 nesting_too_deep:
1194 return "Nesting too deep";
1196 too_big:
1197 return "Regular expression too big";
1199 memory_exhausted:
1200 return "Memory exhausted";
1204 /* Store a jump of the form <OPCODE> <relative address>.
1205 Store in the location FROM a jump operation to jump to relative
1206 address FROM - TO. OPCODE is the opcode to store. */
1208 static void
1209 store_jump (from, opcode, to)
1210 char *from, *to;
1211 char opcode;
1213 from[0] = opcode;
1214 STORE_NUMBER(from + 1, to - (from + 3));
1218 /* Open up space before char FROM, and insert there a jump to TO.
1219 CURRENT_END gives the end of the storage not in use, so we know
1220 how much data to copy up. OP is the opcode of the jump to insert.
1222 If you call this function, you must zero out pending_exact. */
1224 static void
1225 insert_jump (op, from, to, current_end)
1226 char op;
1227 char *from, *to, *current_end;
1229 register char *pfrom = current_end; /* Copy from here... */
1230 register char *pto = current_end + 3; /* ...to here. */
1232 while (pfrom != from)
1233 *--pto = *--pfrom;
1234 store_jump (from, op, to);
1238 /* Store a jump of the form <opcode> <relative address> <n> .
1240 Store in the location FROM a jump operation to jump to relative
1241 address FROM - TO. OPCODE is the opcode to store, N is a number the
1242 jump uses, say, to decide how many times to jump.
1244 If you call this function, you must zero out pending_exact. */
1246 static void
1247 store_jump_n (from, opcode, to, n)
1248 char *from, *to;
1249 char opcode;
1250 unsigned n;
1252 from[0] = opcode;
1253 STORE_NUMBER (from + 1, to - (from + 3));
1254 STORE_NUMBER (from + 3, n);
1258 /* Similar to insert_jump, but handles a jump which needs an extra
1259 number to handle minimum and maximum cases. Open up space at
1260 location FROM, and insert there a jump to TO. CURRENT_END gives the
1261 end of the storage in use, so we know how much data to copy up. OP is
1262 the opcode of the jump to insert.
1264 If you call this function, you must zero out pending_exact. */
1266 static void
1267 insert_jump_n (op, from, to, current_end, n)
1268 char op;
1269 char *from, *to, *current_end;
1270 unsigned n;
1272 register char *pfrom = current_end; /* Copy from here... */
1273 register char *pto = current_end + 5; /* ...to here. */
1275 while (pfrom != from)
1276 *--pto = *--pfrom;
1277 store_jump_n (from, op, to, n);
1281 /* Open up space at location THERE, and insert operation OP followed by
1282 NUM_1 and NUM_2. CURRENT_END gives the end of the storage in use, so
1283 we know how much data to copy up.
1285 If you call this function, you must zero out pending_exact. */
1287 static void
1288 insert_op_2 (op, there, current_end, num_1, num_2)
1289 char op;
1290 char *there, *current_end;
1291 int num_1, num_2;
1293 register char *pfrom = current_end; /* Copy from here... */
1294 register char *pto = current_end + 5; /* ...to here. */
1296 while (pfrom != there)
1297 *--pto = *--pfrom;
1299 there[0] = op;
1300 STORE_NUMBER (there + 1, num_1);
1301 STORE_NUMBER (there + 3, num_2);
1306 /* Given a pattern, compute a fastmap from it. The fastmap records
1307 which of the (1 << BYTEWIDTH) possible characters can start a string
1308 that matches the pattern. This fastmap is used by re_search to skip
1309 quickly over totally implausible text.
1311 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
1312 area as bufp->fastmap.
1313 The other components of bufp describe the pattern to be used. */
1315 void
1316 re_compile_fastmap (bufp)
1317 struct re_pattern_buffer *bufp;
1319 unsigned char *pattern = (unsigned char *) bufp->buffer;
1320 int size = bufp->used;
1321 register char *fastmap = bufp->fastmap;
1322 register unsigned char *p = pattern;
1323 register unsigned char *pend = pattern + size;
1324 register int j, k;
1325 unsigned char *translate = (unsigned char *) bufp->translate;
1327 unsigned char *stackb[NFAILURES];
1328 unsigned char **stackp = stackb;
1330 unsigned is_a_succeed_n;
1332 bzero (fastmap, (1 << BYTEWIDTH));
1333 bufp->fastmap_accurate = 1;
1334 bufp->can_be_null = 0;
1336 while (p)
1338 is_a_succeed_n = 0;
1339 if (p == pend)
1341 bufp->can_be_null = 1;
1342 break;
1344 #ifdef SWITCH_ENUM_BUG
1345 switch ((int) ((enum regexpcode) *p++))
1346 #else
1347 switch ((enum regexpcode) *p++)
1348 #endif
1350 case exactn:
1351 if (translate)
1352 fastmap[translate[p[1]]] = 1;
1353 else
1354 fastmap[p[1]] = 1;
1355 break;
1357 case begline:
1358 case before_dot:
1359 case at_dot:
1360 case after_dot:
1361 case begbuf:
1362 case endbuf:
1363 case wordbound:
1364 case notwordbound:
1365 case wordbeg:
1366 case wordend:
1367 continue;
1369 case endline:
1370 if (translate)
1371 fastmap[translate['\n']] = 1;
1372 else
1373 fastmap['\n'] = 1;
1375 if (bufp->can_be_null != 1)
1376 bufp->can_be_null = 2;
1377 break;
1379 case jump_n:
1380 case finalize_jump:
1381 case maybe_finalize_jump:
1382 case jump:
1383 case dummy_failure_jump:
1384 EXTRACT_NUMBER_AND_INCR (j, p);
1385 p += j;
1386 if (j > 0)
1387 continue;
1388 /* Jump backward reached implies we just went through
1389 the body of a loop and matched nothing.
1390 Opcode jumped to should be an on_failure_jump.
1391 Just treat it like an ordinary jump.
1392 For a * loop, it has pushed its failure point already;
1393 If so, discard that as redundant. */
1395 if ((enum regexpcode) *p != on_failure_jump
1396 && (enum regexpcode) *p != succeed_n)
1397 continue;
1398 p++;
1399 EXTRACT_NUMBER_AND_INCR (j, p);
1400 p += j;
1401 if (stackp != stackb && *stackp == p)
1402 stackp--;
1403 continue;
1405 case on_failure_jump:
1406 handle_on_failure_jump:
1407 EXTRACT_NUMBER_AND_INCR (j, p);
1408 *++stackp = p + j;
1409 if (is_a_succeed_n)
1410 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
1411 continue;
1413 case succeed_n:
1414 is_a_succeed_n = 1;
1415 /* Get to the number of times to succeed. */
1416 p += 2;
1417 /* Increment p past the n for when k != 0. */
1418 EXTRACT_NUMBER_AND_INCR (k, p);
1419 if (k == 0)
1421 p -= 4;
1422 goto handle_on_failure_jump;
1424 continue;
1426 case set_number_at:
1427 p += 4;
1428 continue;
1430 case start_memory:
1431 case stop_memory:
1432 p++;
1433 continue;
1435 case duplicate:
1436 bufp->can_be_null = 1;
1437 fastmap['\n'] = 1;
1438 case anychar:
1439 for (j = 0; j < (1 << BYTEWIDTH); j++)
1440 if (j != '\n')
1441 fastmap[j] = 1;
1442 if (bufp->can_be_null)
1443 return;
1444 /* Don't return; check the alternative paths
1445 so we can set can_be_null if appropriate. */
1446 break;
1448 case wordchar:
1449 for (j = 0; j < (1 << BYTEWIDTH); j++)
1450 if (SYNTAX (j) == Sword)
1451 fastmap[j] = 1;
1452 break;
1454 case notwordchar:
1455 for (j = 0; j < (1 << BYTEWIDTH); j++)
1456 if (SYNTAX (j) != Sword)
1457 fastmap[j] = 1;
1458 break;
1460 #ifdef emacs
1461 case syntaxspec:
1462 k = *p++;
1463 for (j = 0; j < (1 << BYTEWIDTH); j++)
1464 if (SYNTAX (j) == (enum syntaxcode) k)
1465 fastmap[j] = 1;
1466 break;
1468 case notsyntaxspec:
1469 k = *p++;
1470 for (j = 0; j < (1 << BYTEWIDTH); j++)
1471 if (SYNTAX (j) != (enum syntaxcode) k)
1472 fastmap[j] = 1;
1473 break;
1474 #endif /* not emacs */
1476 case charset:
1477 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
1478 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
1480 if (translate)
1481 fastmap[translate[j]] = 1;
1482 else
1483 fastmap[j] = 1;
1485 break;
1487 case charset_not:
1488 /* Chars beyond end of map must be allowed */
1489 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
1490 if (translate)
1491 fastmap[translate[j]] = 1;
1492 else
1493 fastmap[j] = 1;
1495 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
1496 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
1498 if (translate)
1499 fastmap[translate[j]] = 1;
1500 else
1501 fastmap[j] = 1;
1503 break;
1506 /* Get here means we have successfully found the possible starting
1507 characters of one path of the pattern. We need not follow this
1508 path any farther. Instead, look at the next alternative
1509 remembered in the stack. */
1510 if (stackp != stackb)
1511 p = *stackp--;
1512 else
1513 break;
1519 /* Like re_search_2, below, but only one string is specified, and
1520 doesn't let you say where to stop matching. */
1523 re_search (pbufp, string, size, startpos, range, regs)
1524 struct re_pattern_buffer *pbufp;
1525 char *string;
1526 int size, startpos, range;
1527 struct re_registers *regs;
1529 return re_search_2 (pbufp, (char *) 0, 0, string, size, startpos, range,
1530 regs, size);
1534 /* Using the compiled pattern in PBUFP->buffer, first tries to match the
1535 virtual concatenation of STRING1 and STRING2, starting first at index
1536 STARTPOS, then at STARTPOS + 1, and so on. RANGE is the number of
1537 places to try before giving up. If RANGE is negative, it searches
1538 backwards, i.e., the starting positions tried are STARTPOS, STARTPOS
1539 - 1, etc. STRING1 and STRING2 are of SIZE1 and SIZE2, respectively.
1540 In REGS, return the indices of the virtual concatenation of STRING1
1541 and STRING2 that matched the entire PBUFP->buffer and its contained
1542 subexpressions. Do not consider matching one past the index MSTOP in
1543 the virtual concatenation of STRING1 and STRING2.
1545 The value returned is the position in the strings at which the match
1546 was found, or -1 if no match was found, or -2 if error (such as
1547 failure stack overflow). */
1550 re_search_2 (pbufp, string1, size1, string2, size2, startpos, range,
1551 regs, mstop)
1552 struct re_pattern_buffer *pbufp;
1553 char *string1, *string2;
1554 int size1, size2;
1555 int startpos;
1556 register int range;
1557 struct re_registers *regs;
1558 int mstop;
1560 register char *fastmap = pbufp->fastmap;
1561 register unsigned char *translate = (unsigned char *) pbufp->translate;
1562 int total_size = size1 + size2;
1563 int endpos = startpos + range;
1564 int val;
1566 /* Check for out-of-range starting position. */
1567 if (startpos < 0 || startpos > total_size)
1568 return -1;
1570 /* Fix up range if it would eventually take startpos outside of the
1571 virtual concatenation of string1 and string2. */
1572 if (endpos < -1)
1573 range = -1 - startpos;
1574 else if (endpos > total_size)
1575 range = total_size - startpos;
1577 /* Update the fastmap now if not correct already. */
1578 if (fastmap && !pbufp->fastmap_accurate)
1579 re_compile_fastmap (pbufp);
1581 /* If the search isn't to be a backwards one, don't waste time in a
1582 long search for a pattern that says it is anchored. */
1583 if (pbufp->used > 0 && (enum regexpcode) pbufp->buffer[0] == begbuf
1584 && range > 0)
1586 if (startpos > 0)
1587 return -1;
1588 else
1589 range = 1;
1592 while (1)
1594 /* If a fastmap is supplied, skip quickly over characters that
1595 cannot possibly be the start of a match. Note, however, that
1596 if the pattern can possibly match the null string, we must
1597 test it at each starting point so that we take the first null
1598 string we get. */
1600 if (fastmap && startpos < total_size && pbufp->can_be_null != 1)
1602 if (range > 0) /* Searching forwards. */
1604 register int lim = 0;
1605 register unsigned char *p;
1606 int irange = range;
1607 if (startpos < size1 && startpos + range >= size1)
1608 lim = range - (size1 - startpos);
1610 p = ((unsigned char *)
1611 &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
1613 while (range > lim && !fastmap[translate
1614 ? translate[*p++]
1615 : *p++])
1616 range--;
1617 startpos += irange - range;
1619 else /* Searching backwards. */
1621 register unsigned char c;
1623 if (string1 == 0 || startpos >= size1)
1624 c = string2[startpos - size1];
1625 else
1626 c = string1[startpos];
1628 c &= 0xff;
1629 if (translate ? !fastmap[translate[c]] : !fastmap[c])
1630 goto advance;
1634 if (range >= 0 && startpos == total_size
1635 && fastmap && pbufp->can_be_null == 0)
1636 return -1;
1638 val = re_match_2 (pbufp, string1, size1, string2, size2, startpos,
1639 regs, mstop);
1640 if (val >= 0)
1641 return startpos;
1642 if (val == -2)
1643 return -2;
1645 #ifdef C_ALLOCA
1646 alloca (0);
1647 #endif /* C_ALLOCA */
1649 advance:
1650 if (!range)
1651 break;
1652 else if (range > 0)
1654 range--;
1655 startpos++;
1657 else
1659 range++;
1660 startpos--;
1663 return -1;
1668 #ifndef emacs /* emacs never uses this. */
1670 re_match (pbufp, string, size, pos, regs)
1671 struct re_pattern_buffer *pbufp;
1672 char *string;
1673 int size, pos;
1674 struct re_registers *regs;
1676 return re_match_2 (pbufp, (char *) 0, 0, string, size, pos, regs, size);
1678 #endif /* not emacs */
1681 /* The following are used for re_match_2, defined below: */
1683 /* Roughly the maximum number of failure points on the stack. Would be
1684 exactly that if always pushed MAX_NUM_FAILURE_ITEMS each time we failed. */
1686 int re_max_failures = 2000;
1688 /* Routine used by re_match_2. */
1689 static int bcmp_translate ();
1692 /* Structure and accessing macros used in re_match_2: */
1694 struct register_info
1696 unsigned is_active : 1;
1697 unsigned matched_something : 1;
1700 #define IS_ACTIVE(R) ((R).is_active)
1701 #define MATCHED_SOMETHING(R) ((R).matched_something)
1704 /* Macros used by re_match_2: */
1707 /* I.e., regstart, regend, and reg_info. */
1709 #define NUM_REG_ITEMS 3
1711 /* We push at most this many things on the stack whenever we
1712 fail. The `+ 2' refers to PATTERN_PLACE and STRING_PLACE, which are
1713 arguments to the PUSH_FAILURE_POINT macro. */
1715 #define MAX_NUM_FAILURE_ITEMS (RE_NREGS * NUM_REG_ITEMS + 2)
1718 /* We push this many things on the stack whenever we fail. */
1720 #define NUM_FAILURE_ITEMS (last_used_reg * NUM_REG_ITEMS + 2)
1723 /* This pushes most of the information about the current state we will want
1724 if we ever fail back to it. */
1726 #define PUSH_FAILURE_POINT(pattern_place, string_place) \
1728 short last_used_reg, this_reg; \
1730 /* Find out how many registers are active or have been matched. \
1731 (Aside from register zero, which is only set at the end.) */ \
1732 for (last_used_reg = RE_NREGS - 1; last_used_reg > 0; last_used_reg--)\
1733 if (regstart[last_used_reg] != (unsigned char *) -1) \
1734 break; \
1736 if (stacke - stackp < NUM_FAILURE_ITEMS) \
1738 unsigned char **stackx; \
1739 unsigned int len = stacke - stackb; \
1740 if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
1741 return -2; \
1743 /* Roughly double the size of the stack. */ \
1744 stackx = (unsigned char **) alloca (2 * len \
1745 * sizeof (unsigned char *));\
1746 /* Only copy what is in use. */ \
1747 bcopy (stackb, stackx, len * sizeof (char *)); \
1748 stackp = stackx + (stackp - stackb); \
1749 stackb = stackx; \
1750 stacke = stackb + 2 * len; \
1753 /* Now push the info for each of those registers. */ \
1754 for (this_reg = 1; this_reg <= last_used_reg; this_reg++) \
1756 *stackp++ = regstart[this_reg]; \
1757 *stackp++ = regend[this_reg]; \
1758 *stackp++ = (unsigned char *) &reg_info[this_reg]; \
1761 /* Push how many registers we saved. */ \
1762 *stackp++ = (unsigned char *) last_used_reg; \
1764 *stackp++ = pattern_place; \
1765 *stackp++ = string_place; \
1769 /* This pops what PUSH_FAILURE_POINT pushes. */
1771 #define POP_FAILURE_POINT() \
1773 int temp; \
1774 stackp -= 2; /* Remove failure points. */ \
1775 temp = (int) *--stackp; /* How many regs pushed. */ \
1776 temp *= NUM_REG_ITEMS; /* How much to take off the stack. */ \
1777 stackp -= temp; /* Remove the register info. */ \
1781 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
1783 /* Is true if there is a first string and if PTR is pointing anywhere
1784 inside it or just past the end. */
1786 #define IS_IN_FIRST_STRING(ptr) \
1787 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
1789 /* Call before fetching a character with *d. This switches over to
1790 string2 if necessary. */
1792 #define PREFETCH \
1793 while (d == dend) \
1795 /* end of string2 => fail. */ \
1796 if (dend == end_match_2) \
1797 goto fail; \
1798 /* end of string1 => advance to string2. */ \
1799 d = string2; \
1800 dend = end_match_2; \
1804 /* Call this when have matched something; it sets `matched' flags for the
1805 registers corresponding to the subexpressions of which we currently
1806 are inside. */
1807 #define SET_REGS_MATCHED \
1808 { unsigned this_reg; \
1809 for (this_reg = 0; this_reg < RE_NREGS; this_reg++) \
1811 if (IS_ACTIVE(reg_info[this_reg])) \
1812 MATCHED_SOMETHING(reg_info[this_reg]) = 1; \
1813 else \
1814 MATCHED_SOMETHING(reg_info[this_reg]) = 0; \
1818 /* Test if at very beginning or at very end of the virtual concatenation
1819 of string1 and string2. If there is only one string, we've put it in
1820 string2. */
1822 #define AT_STRINGS_BEG (d == (size1 ? string1 : string2) || !size2)
1823 #define AT_STRINGS_END (d == end2)
1825 #define AT_WORD_BOUNDARY \
1826 (AT_STRINGS_BEG || AT_STRINGS_END || IS_A_LETTER (d - 1) != IS_A_LETTER (d))
1828 /* We have two special cases to check for:
1829 1) if we're past the end of string1, we have to look at the first
1830 character in string2;
1831 2) if we're before the beginning of string2, we have to look at the
1832 last character in string1; we assume there is a string1, so use
1833 this in conjunction with AT_STRINGS_BEG. */
1834 #define IS_A_LETTER(d) \
1835 (SYNTAX ((d) == end1 ? *string2 : (d) == string2 - 1 ? *(end1 - 1) : *(d))\
1836 == Sword)
1839 /* Match the pattern described by PBUFP against the virtual
1840 concatenation of STRING1 and STRING2, which are of SIZE1 and SIZE2,
1841 respectively. Start the match at index POS in the virtual
1842 concatenation of STRING1 and STRING2. In REGS, return the indices of
1843 the virtual concatenation of STRING1 and STRING2 that matched the
1844 entire PBUFP->buffer and its contained subexpressions. Do not
1845 consider matching one past the index MSTOP in the virtual
1846 concatenation of STRING1 and STRING2.
1848 If pbufp->fastmap is nonzero, then it had better be up to date.
1850 The reason that the data to match are specified as two components
1851 which are to be regarded as concatenated is so this function can be
1852 used directly on the contents of an Emacs buffer.
1854 -1 is returned if there is no match. -2 is returned if there is an
1855 error (such as match stack overflow). Otherwise the value is the
1856 length of the substring which was matched. */
1859 re_match_2 (pbufp, string1_arg, size1, string2_arg, size2, pos, regs, mstop)
1860 struct re_pattern_buffer *pbufp;
1861 char *string1_arg, *string2_arg;
1862 int size1, size2;
1863 int pos;
1864 struct re_registers *regs;
1865 int mstop;
1867 register unsigned char *p = (unsigned char *) pbufp->buffer;
1869 /* Pointer to beyond end of buffer. */
1870 register unsigned char *pend = p + pbufp->used;
1872 unsigned char *string1 = (unsigned char *) string1_arg;
1873 unsigned char *string2 = (unsigned char *) string2_arg;
1874 unsigned char *end1; /* Just past end of first string. */
1875 unsigned char *end2; /* Just past end of second string. */
1877 /* Pointers into string1 and string2, just past the last characters in
1878 each to consider matching. */
1879 unsigned char *end_match_1, *end_match_2;
1881 register unsigned char *d, *dend;
1882 register int mcnt; /* Multipurpose. */
1883 unsigned char *translate = (unsigned char *) pbufp->translate;
1884 unsigned is_a_jump_n = 0;
1886 /* Failure point stack. Each place that can handle a failure further
1887 down the line pushes a failure point on this stack. It consists of
1888 restart, regend, and reg_info for all registers corresponding to the
1889 subexpressions we're currently inside, plus the number of such
1890 registers, and, finally, two char *'s. The first char * is where to
1891 resume scanning the pattern; the second one is where to resume
1892 scanning the strings. If the latter is zero, the failure point is a
1893 ``dummy''; if a failure happens and the failure point is a dummy, it
1894 gets discarded and the next next one is tried. */
1896 unsigned char *initial_stack[MAX_NUM_FAILURE_ITEMS * NFAILURES];
1897 unsigned char **stackb = initial_stack;
1898 unsigned char **stackp = stackb;
1899 unsigned char **stacke = &stackb[MAX_NUM_FAILURE_ITEMS * NFAILURES];
1902 /* Information on the contents of registers. These are pointers into
1903 the input strings; they record just what was matched (on this
1904 attempt) by a subexpression part of the pattern, that is, the
1905 regnum-th regstart pointer points to where in the pattern we began
1906 matching and the regnum-th regend points to right after where we
1907 stopped matching the regnum-th subexpression. (The zeroth register
1908 keeps track of what the whole pattern matches.) */
1910 unsigned char *regstart[RE_NREGS];
1911 unsigned char *regend[RE_NREGS];
1913 /* The is_active field of reg_info helps us keep track of which (possibly
1914 nested) subexpressions we are currently in. The matched_something
1915 field of reg_info[reg_num] helps us tell whether or not we have
1916 matched any of the pattern so far this time through the reg_num-th
1917 subexpression. These two fields get reset each time through any
1918 loop their register is in. */
1920 struct register_info reg_info[RE_NREGS];
1923 /* The following record the register info as found in the above
1924 variables when we find a match better than any we've seen before.
1925 This happens as we backtrack through the failure points, which in
1926 turn happens only if we have not yet matched the entire string. */
1928 unsigned best_regs_set = 0;
1929 unsigned char *best_regstart[RE_NREGS];
1930 unsigned char *best_regend[RE_NREGS];
1932 /* Initialize subexpression text positions to -1 to mark ones that no
1933 \( or ( and \) or ) has been seen for. Also set all registers to
1934 inactive and mark them as not having matched anything or ever
1935 failed. */
1936 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
1938 regstart[mcnt] = regend[mcnt] = (unsigned char *) -1;
1939 IS_ACTIVE (reg_info[mcnt]) = 0;
1940 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
1943 if (regs)
1944 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
1945 regs->start[mcnt] = regs->end[mcnt] = -1;
1947 /* Set up pointers to ends of strings.
1948 Don't allow the second string to be empty unless both are empty. */
1949 if (size2 == 0)
1951 string2 = string1;
1952 size2 = size1;
1953 string1 = 0;
1954 size1 = 0;
1956 end1 = string1 + size1;
1957 end2 = string2 + size2;
1959 /* Compute where to stop matching, within the two strings. */
1960 if (mstop <= size1)
1962 end_match_1 = string1 + mstop;
1963 end_match_2 = string2;
1965 else
1967 end_match_1 = end1;
1968 end_match_2 = string2 + mstop - size1;
1971 /* `p' scans through the pattern as `d' scans through the data. `dend'
1972 is the end of the input string that `d' points within. `d' is
1973 advanced into the following input string whenever necessary, but
1974 this happens before fetching; therefore, at the beginning of the
1975 loop, `d' can be pointing at the end of a string, but it cannot
1976 equal string2. */
1978 if (size1 != 0 && pos <= size1)
1979 d = string1 + pos, dend = end_match_1;
1980 else
1981 d = string2 + pos - size1, dend = end_match_2;
1984 /* This loops over pattern commands. It exits by returning from the
1985 function if match is complete, or it drops through if match fails
1986 at this starting point in the input data. */
1988 while (1)
1990 is_a_jump_n = 0;
1991 /* End of pattern means we might have succeeded. */
1992 if (p == pend)
1994 /* If not end of string, try backtracking. Otherwise done. */
1995 if (d != end_match_2)
1997 if (stackp != stackb)
1999 /* More failure points to try. */
2001 unsigned in_same_string =
2002 IS_IN_FIRST_STRING (best_regend[0])
2003 == MATCHING_IN_FIRST_STRING;
2005 /* If exceeds best match so far, save it. */
2006 if (! best_regs_set
2007 || (in_same_string && d > best_regend[0])
2008 || (! in_same_string && ! MATCHING_IN_FIRST_STRING))
2010 best_regs_set = 1;
2011 best_regend[0] = d; /* Never use regstart[0]. */
2013 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
2015 best_regstart[mcnt] = regstart[mcnt];
2016 best_regend[mcnt] = regend[mcnt];
2019 goto fail;
2021 /* If no failure points, don't restore garbage. */
2022 else if (best_regs_set)
2024 restore_best_regs:
2025 /* Restore best match. */
2026 d = best_regend[0];
2028 for (mcnt = 0; mcnt < RE_NREGS; mcnt++)
2030 regstart[mcnt] = best_regstart[mcnt];
2031 regend[mcnt] = best_regend[mcnt];
2036 /* If caller wants register contents data back, convert it
2037 to indices. */
2038 if (regs)
2040 regs->start[0] = pos;
2041 if (MATCHING_IN_FIRST_STRING)
2042 regs->end[0] = d - string1;
2043 else
2044 regs->end[0] = d - string2 + size1;
2045 for (mcnt = 1; mcnt < RE_NREGS; mcnt++)
2047 if (regend[mcnt] == (unsigned char *) -1)
2049 regs->start[mcnt] = -1;
2050 regs->end[mcnt] = -1;
2051 continue;
2053 if (IS_IN_FIRST_STRING (regstart[mcnt]))
2054 regs->start[mcnt] = regstart[mcnt] - string1;
2055 else
2056 regs->start[mcnt] = regstart[mcnt] - string2 + size1;
2058 if (IS_IN_FIRST_STRING (regend[mcnt]))
2059 regs->end[mcnt] = regend[mcnt] - string1;
2060 else
2061 regs->end[mcnt] = regend[mcnt] - string2 + size1;
2064 return d - pos - (MATCHING_IN_FIRST_STRING
2065 ? string1
2066 : string2 - size1);
2069 /* Otherwise match next pattern command. */
2070 #ifdef SWITCH_ENUM_BUG
2071 switch ((int) ((enum regexpcode) *p++))
2072 #else
2073 switch ((enum regexpcode) *p++)
2074 #endif
2077 /* \( [or `(', as appropriate] is represented by start_memory,
2078 \) by stop_memory. Both of those commands are followed by
2079 a register number in the next byte. The text matched
2080 within the \( and \) is recorded under that number. */
2081 case start_memory:
2082 regstart[*p] = d;
2083 IS_ACTIVE (reg_info[*p]) = 1;
2084 MATCHED_SOMETHING (reg_info[*p]) = 0;
2085 p++;
2086 break;
2088 case stop_memory:
2089 regend[*p] = d;
2090 IS_ACTIVE (reg_info[*p]) = 0;
2092 /* If just failed to match something this time around with a sub-
2093 expression that's in a loop, try to force exit from the loop. */
2094 if ((! MATCHED_SOMETHING (reg_info[*p])
2095 || (enum regexpcode) p[-3] == start_memory)
2096 && (p + 1) != pend)
2098 register unsigned char *p2 = p + 1;
2099 mcnt = 0;
2100 switch (*p2++)
2102 case jump_n:
2103 is_a_jump_n = 1;
2104 case finalize_jump:
2105 case maybe_finalize_jump:
2106 case jump:
2107 case dummy_failure_jump:
2108 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
2109 if (is_a_jump_n)
2110 p2 += 2;
2111 break;
2113 p2 += mcnt;
2115 /* If the next operation is a jump backwards in the pattern
2116 to an on_failure_jump, exit from the loop by forcing a
2117 failure after pushing on the stack the on_failure_jump's
2118 jump in the pattern, and d. */
2119 if (mcnt < 0 && (enum regexpcode) *p2++ == on_failure_jump)
2121 EXTRACT_NUMBER_AND_INCR (mcnt, p2);
2122 PUSH_FAILURE_POINT (p2 + mcnt, d);
2123 goto fail;
2126 p++;
2127 break;
2129 /* \<digit> has been turned into a `duplicate' command which is
2130 followed by the numeric value of <digit> as the register number. */
2131 case duplicate:
2133 int regno = *p++; /* Get which register to match against */
2134 register unsigned char *d2, *dend2;
2136 /* Where in input to try to start matching. */
2137 d2 = regstart[regno];
2139 /* Where to stop matching; if both the place to start and
2140 the place to stop matching are in the same string, then
2141 set to the place to stop, otherwise, for now have to use
2142 the end of the first string. */
2144 dend2 = ((IS_IN_FIRST_STRING (regstart[regno])
2145 == IS_IN_FIRST_STRING (regend[regno]))
2146 ? regend[regno] : end_match_1);
2147 while (1)
2149 /* If necessary, advance to next segment in register
2150 contents. */
2151 while (d2 == dend2)
2153 if (dend2 == end_match_2) break;
2154 if (dend2 == regend[regno]) break;
2155 d2 = string2, dend2 = regend[regno]; /* end of string1 => advance to string2. */
2157 /* At end of register contents => success */
2158 if (d2 == dend2) break;
2160 /* If necessary, advance to next segment in data. */
2161 PREFETCH;
2163 /* How many characters left in this segment to match. */
2164 mcnt = dend - d;
2166 /* Want how many consecutive characters we can match in
2167 one shot, so, if necessary, adjust the count. */
2168 if (mcnt > dend2 - d2)
2169 mcnt = dend2 - d2;
2171 /* Compare that many; failure if mismatch, else move
2172 past them. */
2173 if (translate
2174 ? bcmp_translate (d, d2, mcnt, translate)
2175 : bcmp (d, d2, mcnt))
2176 goto fail;
2177 d += mcnt, d2 += mcnt;
2180 break;
2182 case anychar:
2183 PREFETCH; /* Fetch a data character. */
2184 /* Match anything but a newline, maybe even a null. */
2185 if ((translate ? translate[*d] : *d) == '\n'
2186 || ((obscure_syntax & RE_DOT_NOT_NULL)
2187 && (translate ? translate[*d] : *d) == '\000'))
2188 goto fail;
2189 SET_REGS_MATCHED;
2190 d++;
2191 break;
2193 case charset:
2194 case charset_not:
2196 int not = 0; /* Nonzero for charset_not. */
2197 register int c;
2198 if (*(p - 1) == (unsigned char) charset_not)
2199 not = 1;
2201 PREFETCH; /* Fetch a data character. */
2203 if (translate)
2204 c = translate[*d];
2205 else
2206 c = *d;
2208 if (c < *p * BYTEWIDTH
2209 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2210 not = !not;
2212 p += 1 + *p;
2214 if (!not) goto fail;
2215 SET_REGS_MATCHED;
2216 d++;
2217 break;
2220 case begline:
2221 if ((size1 != 0 && d == string1)
2222 || (size1 == 0 && size2 != 0 && d == string2)
2223 || (d && d[-1] == '\n')
2224 || (size1 == 0 && size2 == 0))
2225 break;
2226 else
2227 goto fail;
2229 case endline:
2230 if (d == end2
2231 || (d == end1 ? (size2 == 0 || *string2 == '\n') : *d == '\n'))
2232 break;
2233 goto fail;
2235 /* `or' constructs are handled by starting each alternative with
2236 an on_failure_jump that points to the start of the next
2237 alternative. Each alternative except the last ends with a
2238 jump to the joining point. (Actually, each jump except for
2239 the last one really jumps to the following jump, because
2240 tensioning the jumps is a hassle.) */
2242 /* The start of a stupid repeat has an on_failure_jump that points
2243 past the end of the repeat text. This makes a failure point so
2244 that on failure to match a repetition, matching restarts past
2245 as many repetitions have been found with no way to fail and
2246 look for another one. */
2248 /* A smart repeat is similar but loops back to the on_failure_jump
2249 so that each repetition makes another failure point. */
2251 case on_failure_jump:
2252 on_failure:
2253 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2254 PUSH_FAILURE_POINT (p + mcnt, d);
2255 break;
2257 /* The end of a smart repeat has a maybe_finalize_jump back.
2258 Change it either to a finalize_jump or an ordinary jump. */
2259 case maybe_finalize_jump:
2260 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2262 register unsigned char *p2 = p;
2263 /* Compare what follows with the beginning of the repeat.
2264 If we can establish that there is nothing that they would
2265 both match, we can change to finalize_jump. */
2266 while (p2 + 1 != pend
2267 && (*p2 == (unsigned char) stop_memory
2268 || *p2 == (unsigned char) start_memory))
2269 p2 += 2; /* Skip over reg number. */
2270 if (p2 == pend)
2271 p[-3] = (unsigned char) finalize_jump;
2272 else if (*p2 == (unsigned char) exactn
2273 || *p2 == (unsigned char) endline)
2275 register int c = *p2 == (unsigned char) endline ? '\n' : p2[2];
2276 register unsigned char *p1 = p + mcnt;
2277 /* p1[0] ... p1[2] are an on_failure_jump.
2278 Examine what follows that. */
2279 if (p1[3] == (unsigned char) exactn && p1[5] != c)
2280 p[-3] = (unsigned char) finalize_jump;
2281 else if (p1[3] == (unsigned char) charset
2282 || p1[3] == (unsigned char) charset_not)
2284 int not = p1[3] == (unsigned char) charset_not;
2285 if (c < p1[4] * BYTEWIDTH
2286 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2287 not = !not;
2288 /* `not' is 1 if c would match. */
2289 /* That means it is not safe to finalize. */
2290 if (!not)
2291 p[-3] = (unsigned char) finalize_jump;
2295 p -= 2; /* Point at relative address again. */
2296 if (p[-1] != (unsigned char) finalize_jump)
2298 p[-1] = (unsigned char) jump;
2299 goto nofinalize;
2301 /* Note fall through. */
2303 /* The end of a stupid repeat has a finalize_jump back to the
2304 start, where another failure point will be made which will
2305 point to after all the repetitions found so far. */
2307 /* Take off failure points put on by matching on_failure_jump
2308 because didn't fail. Also remove the register information
2309 put on by the on_failure_jump. */
2310 case finalize_jump:
2311 POP_FAILURE_POINT ();
2312 /* Note fall through. */
2314 /* Jump without taking off any failure points. */
2315 case jump:
2316 nofinalize:
2317 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2318 p += mcnt;
2319 break;
2321 case dummy_failure_jump:
2322 /* Normally, the on_failure_jump pushes a failure point, which
2323 then gets popped at finalize_jump. We will end up at
2324 finalize_jump, also, and with a pattern of, say, `a+', we
2325 are skipping over the on_failure_jump, so we have to push
2326 something meaningless for finalize_jump to pop. */
2327 PUSH_FAILURE_POINT (0, 0);
2328 goto nofinalize;
2331 /* Have to succeed matching what follows at least n times. Then
2332 just handle like an on_failure_jump. */
2333 case succeed_n:
2334 EXTRACT_NUMBER (mcnt, p + 2);
2335 /* Originally, this is how many times we HAVE to succeed. */
2336 if (mcnt)
2338 mcnt--;
2339 p += 2;
2340 STORE_NUMBER_AND_INCR (p, mcnt);
2342 else if (mcnt == 0)
2344 p[2] = unused;
2345 p[3] = unused;
2346 goto on_failure;
2348 else
2350 fprintf (stderr, "regex: the succeed_n's n is not set.\n");
2351 exit (1);
2353 break;
2355 case jump_n:
2356 EXTRACT_NUMBER (mcnt, p + 2);
2357 /* Originally, this is how many times we CAN jump. */
2358 if (mcnt)
2360 mcnt--;
2361 STORE_NUMBER(p + 2, mcnt);
2362 goto nofinalize; /* Do the jump without taking off
2363 any failure points. */
2365 /* If don't have to jump any more, skip over the rest of command. */
2366 else
2367 p += 4;
2368 break;
2370 case set_number_at:
2372 register unsigned char *p1;
2374 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2375 p1 = p + mcnt;
2376 EXTRACT_NUMBER_AND_INCR (mcnt, p);
2377 STORE_NUMBER (p1, mcnt);
2378 break;
2381 /* Ignore these. Used to ignore the n of succeed_n's which
2382 currently have n == 0. */
2383 case unused:
2384 break;
2386 case wordbound:
2387 if (AT_WORD_BOUNDARY)
2388 break;
2389 goto fail;
2391 case notwordbound:
2392 if (AT_WORD_BOUNDARY)
2393 goto fail;
2394 break;
2396 case wordbeg:
2397 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
2398 if (IS_A_LETTER (d) && (AT_STRINGS_BEG || !IS_A_LETTER (d - 1)))
2399 break;
2400 goto fail;
2402 case wordend:
2403 /* Have to check if AT_STRINGS_BEG before looking at d - 1. */
2404 if (!AT_STRINGS_BEG && IS_A_LETTER (d - 1)
2405 && (!IS_A_LETTER (d) || AT_STRINGS_END))
2406 break;
2407 goto fail;
2409 #ifdef emacs
2410 case before_dot:
2411 if (PTR_CHAR_POS (d) >= point)
2412 goto fail;
2413 break;
2415 case at_dot:
2416 if (PTR_CHAR_POS (d) != point)
2417 goto fail;
2418 break;
2420 case after_dot:
2421 if (PTR_CHAR_POS (d) <= point)
2422 goto fail;
2423 break;
2425 case wordchar:
2426 mcnt = (int) Sword;
2427 goto matchsyntax;
2429 case syntaxspec:
2430 mcnt = *p++;
2431 matchsyntax:
2432 PREFETCH;
2433 if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail;
2434 SET_REGS_MATCHED;
2435 break;
2437 case notwordchar:
2438 mcnt = (int) Sword;
2439 goto matchnotsyntax;
2441 case notsyntaxspec:
2442 mcnt = *p++;
2443 matchnotsyntax:
2444 PREFETCH;
2445 if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail;
2446 SET_REGS_MATCHED;
2447 break;
2449 #else /* not emacs */
2451 case wordchar:
2452 PREFETCH;
2453 if (!IS_A_LETTER (d))
2454 goto fail;
2455 SET_REGS_MATCHED;
2456 break;
2458 case notwordchar:
2459 PREFETCH;
2460 if (IS_A_LETTER (d))
2461 goto fail;
2462 SET_REGS_MATCHED;
2463 break;
2465 #endif /* not emacs */
2467 case begbuf:
2468 if (AT_STRINGS_BEG)
2469 break;
2470 goto fail;
2472 case endbuf:
2473 if (AT_STRINGS_END)
2474 break;
2475 goto fail;
2477 case exactn:
2478 /* Match the next few pattern characters exactly.
2479 mcnt is how many characters to match. */
2480 mcnt = *p++;
2481 /* This is written out as an if-else so we don't waste time
2482 testing `translate' inside the loop. */
2483 if (translate)
2487 PREFETCH;
2488 if (translate[*d++] != *p++) goto fail;
2490 while (--mcnt);
2492 else
2496 PREFETCH;
2497 if (*d++ != *p++) goto fail;
2499 while (--mcnt);
2501 SET_REGS_MATCHED;
2502 break;
2504 continue; /* Successfully executed one pattern command; keep going. */
2506 /* Jump here if any matching operation fails. */
2507 fail:
2508 if (stackp != stackb)
2509 /* A restart point is known. Restart there and pop it. */
2511 short last_used_reg, this_reg;
2513 /* If this failure point is from a dummy_failure_point, just
2514 skip it. */
2515 if (!stackp[-2])
2517 POP_FAILURE_POINT ();
2518 goto fail;
2521 d = *--stackp;
2522 p = *--stackp;
2523 if (d >= string1 && d <= end1)
2524 dend = end_match_1;
2525 /* Restore register info. */
2526 last_used_reg = (short) *--stackp;
2528 /* Make the ones that weren't saved -1 or 0 again. */
2529 for (this_reg = RE_NREGS - 1; this_reg > last_used_reg; this_reg--)
2531 regend[this_reg] = (unsigned char *) -1;
2532 regstart[this_reg] = (unsigned char *) -1;
2533 IS_ACTIVE (reg_info[this_reg]) = 0;
2534 MATCHED_SOMETHING (reg_info[this_reg]) = 0;
2537 /* And restore the rest from the stack. */
2538 for ( ; this_reg > 0; this_reg--)
2540 reg_info[this_reg] = *(struct register_info *) *--stackp;
2541 regend[this_reg] = *--stackp;
2542 regstart[this_reg] = *--stackp;
2545 else
2546 break; /* Matching at this starting point really fails. */
2549 if (best_regs_set)
2550 goto restore_best_regs;
2551 return -1; /* Failure to match. */
2555 static int
2556 bcmp_translate (s1, s2, len, translate)
2557 unsigned char *s1, *s2;
2558 register int len;
2559 unsigned char *translate;
2561 register unsigned char *p1 = s1, *p2 = s2;
2562 while (len)
2564 if (translate [*p1++] != translate [*p2++]) return 1;
2565 len--;
2567 return 0;
2572 /* Entry points compatible with 4.2 BSD regex library. */
2574 #ifndef emacs
2576 static struct re_pattern_buffer re_comp_buf;
2578 char *
2579 re_comp (s)
2580 char *s;
2582 if (!s)
2584 if (!re_comp_buf.buffer)
2585 return "No previous regular expression";
2586 return 0;
2589 if (!re_comp_buf.buffer)
2591 if (!(re_comp_buf.buffer = (char *) malloc (200)))
2592 return "Memory exhausted";
2593 re_comp_buf.allocated = 200;
2594 if (!(re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH)))
2595 return "Memory exhausted";
2597 return re_compile_pattern (s, strlen (s), &re_comp_buf);
2601 re_exec (s)
2602 char *s;
2604 int len = strlen (s);
2605 return 0 <= re_search (&re_comp_buf, s, len, 0, len,
2606 (struct re_registers *) 0);
2608 #endif /* not emacs */
2612 #ifdef test
2614 #include <stdio.h>
2616 /* Indexed by a character, gives the upper case equivalent of the
2617 character. */
2619 char upcase[0400] =
2620 { 000, 001, 002, 003, 004, 005, 006, 007,
2621 010, 011, 012, 013, 014, 015, 016, 017,
2622 020, 021, 022, 023, 024, 025, 026, 027,
2623 030, 031, 032, 033, 034, 035, 036, 037,
2624 040, 041, 042, 043, 044, 045, 046, 047,
2625 050, 051, 052, 053, 054, 055, 056, 057,
2626 060, 061, 062, 063, 064, 065, 066, 067,
2627 070, 071, 072, 073, 074, 075, 076, 077,
2628 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2629 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2630 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2631 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
2632 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
2633 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
2634 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
2635 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
2636 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
2637 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
2638 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
2639 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
2640 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
2641 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
2642 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
2643 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
2644 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
2645 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
2646 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
2647 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
2648 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
2649 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
2650 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
2651 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
2654 #ifdef canned
2656 #include "tests.h"
2658 typedef enum { extended_test, basic_test } test_type;
2660 /* Use this to run the tests we've thought of. */
2662 void
2663 main ()
2665 test_type t = extended_test;
2667 if (t == basic_test)
2669 printf ("Running basic tests:\n\n");
2670 test_posix_basic ();
2672 else if (t == extended_test)
2674 printf ("Running extended tests:\n\n");
2675 test_posix_extended ();
2679 #else /* not canned */
2681 /* Use this to run interactive tests. */
2683 void
2684 main (argc, argv)
2685 int argc;
2686 char **argv;
2688 char pat[80];
2689 struct re_pattern_buffer buf;
2690 int i;
2691 char c;
2692 char fastmap[(1 << BYTEWIDTH)];
2694 /* Allow a command argument to specify the style of syntax. */
2695 if (argc > 1)
2696 obscure_syntax = atoi (argv[1]);
2698 buf.allocated = 40;
2699 buf.buffer = (char *) malloc (buf.allocated);
2700 buf.fastmap = fastmap;
2701 buf.translate = upcase;
2703 while (1)
2705 gets (pat);
2707 if (*pat)
2709 re_compile_pattern (pat, strlen(pat), &buf);
2711 for (i = 0; i < buf.used; i++)
2712 printchar (buf.buffer[i]);
2714 putchar ('\n');
2716 printf ("%d allocated, %d used.\n", buf.allocated, buf.used);
2718 re_compile_fastmap (&buf);
2719 printf ("Allowed by fastmap: ");
2720 for (i = 0; i < (1 << BYTEWIDTH); i++)
2721 if (fastmap[i]) printchar (i);
2722 putchar ('\n');
2725 gets (pat); /* Now read the string to match against */
2727 i = re_match (&buf, pat, strlen (pat), 0, 0);
2728 printf ("Match value %d.\n", i);
2732 #endif
2735 #ifdef NOTDEF
2736 print_buf (bufp)
2737 struct re_pattern_buffer *bufp;
2739 int i;
2741 printf ("buf is :\n----------------\n");
2742 for (i = 0; i < bufp->used; i++)
2743 printchar (bufp->buffer[i]);
2745 printf ("\n%d allocated, %d used.\n", bufp->allocated, bufp->used);
2747 printf ("Allowed by fastmap: ");
2748 for (i = 0; i < (1 << BYTEWIDTH); i++)
2749 if (bufp->fastmap[i])
2750 printchar (i);
2751 printf ("\nAllowed by translate: ");
2752 if (bufp->translate)
2753 for (i = 0; i < (1 << BYTEWIDTH); i++)
2754 if (bufp->translate[i])
2755 printchar (i);
2756 printf ("\nfastmap is%s accurate\n", bufp->fastmap_accurate ? "" : "n't");
2757 printf ("can %s be null\n----------", bufp->can_be_null ? "" : "not");
2759 #endif /* NOTDEF */
2761 printchar (c)
2762 char c;
2764 if (c < 040 || c >= 0177)
2766 putchar ('\\');
2767 putchar (((c >> 6) & 3) + '0');
2768 putchar (((c >> 3) & 7) + '0');
2769 putchar ((c & 7) + '0');
2771 else
2772 putchar (c);
2775 error (string)
2776 char *string;
2778 puts (string);
2779 exit (1);
2781 #endif /* test */