Remove building with NOCRYPTO option
[minix.git] / minix / commands / cawf / regexp.c
blobeaeccee27cc872c2e9125b9613bc545d3f503e6b
1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
21 * Beware that some of this code is subtly aware of the way operator
22 * precedence is structured in regular expressions. Serious changes in
23 * regular-expression syntax might require a total rethink.
25 #include <stdio.h>
26 #ifdef UNIX
27 #ifdef USG
28 #include <string.h>
29 #else
30 #include <strings.h>
31 #endif
32 #else
33 #include <string.h>
34 #endif
35 #include "regexp.h"
36 #include "regmagic.h"
37 #include "proto.h"
40 * The "internal use only" fields in regexp.h are present to pass info from
41 * compile to execute that permits the execute phase to run lots faster on
42 * simple cases. They are:
44 * regstart char that must begin a match; '\0' if none obvious
45 * reganch is the match anchored (at beginning-of-line only)?
46 * regmust string (pointer into program) that match must include, or NULL
47 * regmlen length of regmust string
49 * Regstart and reganch permit very fast decisions on suitable starting points
50 * for a match, cutting down the work a lot. Regmust permits fast rejection
51 * of lines that cannot possibly match. The regmust tests are costly enough
52 * that regcomp() supplies a regmust only if the r.e. contains something
53 * potentially expensive (at present, the only such thing detected is * or +
54 * at the start of the r.e., which can involve a lot of backup). Regmlen is
55 * supplied because the test in regexec() needs it and regcomp() is computing
56 * it anyway.
60 * Structure for regexp "program". This is essentially a linear encoding
61 * of a nondeterministic finite-state machine (aka syntax charts or
62 * "railroad normal form" in parsing technology). Each node is an opcode
63 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
64 * all nodes except BRANCH implement concatenation; a "next" pointer with
65 * a BRANCH on both ends of it is connecting two alternatives. (Here we
66 * have one of the subtle syntax dependencies: an individual BRANCH (as
67 * opposed to a collection of them) is never concatenated with anything
68 * because of operator precedence.) The operand of some types of node is
69 * a literal string; for others, it is a node leading into a sub-FSM. In
70 * particular, the operand of a BRANCH node is the first node of the branch.
71 * (NB this is *not* a tree structure: the tail of the branch connects
72 * to the thing following the set of BRANCHes.) The opcodes are:
75 /* definition number opnd? meaning */
76 #define END 0 /* no End of program. */
77 #define BOL 1 /* no Match "" at beginning of line. */
78 #define EOL 2 /* no Match "" at end of line. */
79 #define ANY 3 /* no Match any one character. */
80 #define ANYOF 4 /* str Match any character in this string. */
81 #define ANYBUT 5 /* str Match any character not in this string. */
82 #define BRANCH 6 /* node Match this alternative, or the next... */
83 #define BACK 7 /* no Match "", "next" ptr points backward. */
84 #define EXACTLY 8 /* str Match this string. */
85 #define NOTHING 9 /* no Match empty string. */
86 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
87 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
88 #define OPEN 20 /* no Mark this point in input as start of #n. */
89 /* OPEN+1 is number 1, etc. */
90 #define CLOSE 30 /* no Analogous to OPEN. */
93 * Opcode notes:
95 * BRANCH The set of branches constituting a single choice are hooked
96 * together with their "next" pointers, since precedence prevents
97 * anything being concatenated to any individual branch. The
98 * "next" pointer of the last BRANCH in a choice points to the
99 * thing following the whole choice. This is also where the
100 * final "next" pointer of each individual branch points; each
101 * branch starts with the operand node of a BRANCH node.
103 * BACK Normal "next" pointers all implicitly point forward; BACK
104 * exists to make loop structures possible.
106 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
107 * BRANCH structures using BACK. Simple cases (one character
108 * per match) are implemented with STAR and PLUS for speed
109 * and to minimize recursive plunges.
111 * OPEN,CLOSE ...are numbered at compile time.
115 * A node is one char of opcode followed by two chars of "next" pointer.
116 * "Next" pointers are stored as two 8-bit pieces, high order first. The
117 * value is a positive offset from the opcode of the node containing it.
118 * An operand, if any, simply follows the node. (Note that much of the
119 * code generation knows about this implicit relationship.)
121 * Using two bytes for the "next" pointer is vast overkill for most things,
122 * but allows patterns to get big without disasters.
124 #define OP(p) (*(p))
125 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
126 #define OPERAND(p) ((p) + 3)
129 * See regmagic.h for one further detail of program structure.
134 * Utility definitions.
136 #ifndef CHARBITS
137 #define UCHARAT(p) ((int)*(unsigned char *)(p))
138 #else
139 #define UCHARAT(p) ((int)*(p)&CHARBITS)
140 #endif
142 #define FAIL(m) { regerror(m); return(NULL); }
143 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
144 #define META "^$.[()|?+*\\"
147 * Flags to be passed up and down.
149 #define HASWIDTH 01 /* Known never to match null string. */
150 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
151 #define SPSTART 04 /* Starts with * or +. */
152 #define WORST 0 /* Worst case. */
153 #define STATIC
155 * Global work variables for regcomp().
158 STATIC char *regparse; /* Input-scan pointer. */
159 STATIC int regnpar; /* () count. */
160 STATIC unsigned char regdummy;
161 STATIC unsigned char *regcode; /* Code-emit pointer; &regdummy = don't. */
162 STATIC long regsize; /* Code size. */
165 * Forward declarations for regcomp()'s friends.
167 STATIC unsigned char *reg(int paren, int *flagp );
168 STATIC unsigned char *regbranch(int *flagp );
169 STATIC unsigned char *regpiece(int *flagp );
170 STATIC unsigned char *regatom(int *flagp );
171 STATIC unsigned char *regnode(int op );
172 STATIC void regc(int b );
173 STATIC void reginsert(int op, unsigned char *opnd );
174 STATIC void regtail(unsigned char *p, unsigned char *val );
175 STATIC void regoptail(unsigned char *p, unsigned char *val );
176 STATIC int regtry(regexp *prog, unsigned char *string );
177 STATIC int regmatch(unsigned char *prog );
178 STATIC int regrepeat(unsigned char *p );
179 STATIC unsigned char *regnext(unsigned char *p );
180 STATIC unsigned char *regprop(unsigned char *op );
182 #ifdef STRCSPN
183 STATIC int strcspn(unsigned char *s1, unsigned char *s2 );
184 #endif
187 - regcomp - compile a regular expression into internal code
189 * We can't allocate space until we know how big the compiled form will be,
190 * but we can't compile it (and thus know how big it is) until we've got a
191 * place to put the code. So we cheat: we compile it twice, once with code
192 * generation turned off and size counting turned on, and once "for real".
193 * This also means that we don't allocate space until we are sure that the
194 * thing really will compile successfully, and we never have to move the
195 * code and thus invalidate pointers into it. (Note that it has to be in
196 * one piece because free() must be able to free it all.)
198 * Beware that the optimization-preparation code in here knows about some
199 * of the structure of the compiled regexp.
201 regexp *regcomp(char *exp) {
202 register regexp *r;
203 register unsigned char *scan;
204 register unsigned char *longest;
205 register int len;
206 int flags;
208 if (exp == NULL)
209 FAIL("NULL argument");
211 /* First pass: determine size, legality. */
212 regparse = exp;
213 regnpar = 1;
214 regsize = 0L;
215 regcode = &regdummy;
216 regc(MAGIC);
217 if (reg(0, &flags) == NULL)
218 return(NULL);
220 /* Small enough for pointer-storage convention? */
221 if (regsize >= 32767L) /* Probably could be 65535L. */
222 FAIL("regexp too big");
224 /* Allocate space. */
225 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
226 if (r == NULL)
227 FAIL("out of space");
229 /* Second pass: emit code. */
230 regparse = exp;
231 regnpar = 1;
232 regcode = r->program;
233 regc(MAGIC);
234 if (reg(0, &flags) == NULL) {
235 free(r);
236 return(NULL);
239 /* Dig out information for optimizations. */
240 r->regstart = '\0'; /* Worst-case defaults. */
241 r->reganch = 0;
242 r->regmust = NULL;
243 r->regmlen = 0;
244 scan = r->program+1; /* First BRANCH. */
245 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
246 scan = OPERAND(scan);
248 /* Starting-point info. */
249 if (OP(scan) == EXACTLY)
250 r->regstart = *OPERAND(scan);
251 else if (OP(scan) == BOL)
252 r->reganch++;
255 * If there's something expensive in the r.e., find the
256 * longest literal string that must appear and make it the
257 * regmust. Resolve ties in favor of later strings, since
258 * the regstart check works with the beginning of the r.e.
259 * and avoiding duplication strengthens checking. Not a
260 * strong reason, but sufficient in the absence of others.
262 if (flags&SPSTART) {
263 longest = NULL;
264 len = 0;
265 for (; scan != NULL; scan = regnext(scan))
266 if (OP(scan) == EXACTLY
267 && strlen((char *)OPERAND(scan)) >= len) {
268 longest = OPERAND(scan);
269 len = strlen((char *)OPERAND(scan));
271 r->regmust = longest;
272 r->regmlen = len;
276 return(r);
280 - reg - regular expression, i.e. main body or parenthesized thing
282 * Caller must absorb opening parenthesis.
284 * Combining parenthesis handling with the base level of regular expression
285 * is a trifle forced, but the need to tie the tails of the branches to what
286 * follows makes it hard to avoid.
288 STATIC unsigned char *reg(int paren, int *flagp) {
289 /* Parenthesized? paren */
290 register unsigned char *ret;
291 register unsigned char *br;
292 register unsigned char *ender;
293 register int parno;
294 int flags;
296 *flagp = HASWIDTH; /* Tentatively. */
298 /* Make an OPEN node, if parenthesized. */
299 if (paren) {
300 if (regnpar >= NSUBEXP)
301 FAIL("too many ()");
302 parno = regnpar;
303 regnpar++;
304 ret = regnode(OPEN+parno);
305 } else
306 ret = NULL;
308 /* Pick up the branches, linking them together. */
309 br = regbranch(&flags);
310 if (br == NULL)
311 return(NULL);
312 if (ret != NULL)
313 regtail(ret, br); /* OPEN -> first. */
314 else
315 ret = br;
316 if (!(flags&HASWIDTH))
317 *flagp &= ~HASWIDTH;
318 *flagp |= flags&SPSTART;
319 while (*regparse == '|') {
320 regparse++;
321 br = regbranch(&flags);
322 if (br == NULL)
323 return(NULL);
324 regtail(ret, br); /* BRANCH -> BRANCH. */
325 if (!(flags&HASWIDTH))
326 *flagp &= ~HASWIDTH;
327 *flagp |= flags&SPSTART;
330 /* Make a closing node, and hook it on the end. */
331 ender = regnode((paren) ? CLOSE+parno : END);
332 regtail(ret, ender);
334 /* Hook the tails of the branches to the closing node. */
335 for (br = ret; br != NULL; br = regnext(br))
336 regoptail(br, ender);
338 /* Check for proper termination. */
339 if (paren && *regparse++ != ')') {
340 FAIL("unmatched ()");
341 } else if (!paren && *regparse != '\0') {
342 if (*regparse == ')') {
343 FAIL("unmatched ()");
344 } else
345 FAIL("junk on end"); /* "Can't happen". */
346 /* NOTREACHED */
349 return(ret);
353 - regbranch - one alternative of an | operator
355 * Implements the concatenation operator.
357 STATIC unsigned char *regbranch(int *flagp) {
358 register unsigned char *ret;
359 register unsigned char *chain;
360 register unsigned char *latest;
361 int flags;
363 *flagp = WORST; /* Tentatively. */
365 ret = regnode(BRANCH);
366 chain = NULL;
367 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
368 latest = regpiece(&flags);
369 if (latest == NULL)
370 return(NULL);
371 *flagp |= flags&HASWIDTH;
372 if (chain == NULL) /* First piece. */
373 *flagp |= flags&SPSTART;
374 else
375 regtail(chain, latest);
376 chain = latest;
378 if (chain == NULL) /* Loop ran zero times. */
379 (void) regnode(NOTHING);
381 return(ret);
385 - regpiece - something followed by possible [*+?]
387 * Note that the branching code sequences used for ? and the general cases
388 * of * and + are somewhat optimized: they use the same NOTHING node as
389 * both the endmarker for their branch list and the body of the last branch.
390 * It might seem that this node could be dispensed with entirely, but the
391 * endmarker role is not redundant.
393 STATIC unsigned char *regpiece(int *flagp) {
394 register unsigned char *ret;
395 register unsigned char op;
396 register unsigned char *next;
397 int flags;
399 ret = regatom(&flags);
400 if (ret == NULL)
401 return(NULL);
403 op = *regparse;
404 if (!ISMULT(op)) {
405 *flagp = flags;
406 return(ret);
409 if (!(flags&HASWIDTH) && op != '?')
410 FAIL("*+ operand could be empty");
411 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
413 if (op == '*' && (flags&SIMPLE))
414 reginsert(STAR, ret);
415 else if (op == '*') {
416 /* Emit x* as (x&|), where & means "self". */
417 reginsert(BRANCH, ret); /* Either x */
418 regoptail(ret, regnode(BACK)); /* and loop */
419 regoptail(ret, ret); /* back */
420 regtail(ret, regnode(BRANCH)); /* or */
421 regtail(ret, regnode(NOTHING)); /* null. */
422 } else if (op == '+' && (flags&SIMPLE))
423 reginsert(PLUS, ret);
424 else if (op == '+') {
425 /* Emit x+ as x(&|), where & means "self". */
426 next = regnode(BRANCH); /* Either */
427 regtail(ret, next);
428 regtail(regnode(BACK), ret); /* loop back */
429 regtail(next, regnode(BRANCH)); /* or */
430 regtail(ret, regnode(NOTHING)); /* null. */
431 } else if (op == '?') {
432 /* Emit x? as (x|) */
433 reginsert(BRANCH, ret); /* Either x */
434 regtail(ret, regnode(BRANCH)); /* or */
435 next = regnode(NOTHING); /* null. */
436 regtail(ret, next);
437 regoptail(ret, next);
439 regparse++;
440 if (ISMULT(*regparse))
441 FAIL("nested *?+");
443 return(ret);
447 - regatom - the lowest level
449 * Optimization: gobbles an entire sequence of ordinary characters so that
450 * it can turn them into a single node, which is smaller to store and
451 * faster to run. Backslashed characters are exceptions, each becoming a
452 * separate node; the code is simpler that way and it's not worth fixing.
454 STATIC unsigned char *regatom(int *flagp) {
455 register unsigned char *ret;
456 int flags;
458 *flagp = WORST; /* Tentatively. */
460 switch (*regparse++) {
461 case '^':
462 ret = regnode(BOL);
463 break;
464 case '$':
465 ret = regnode(EOL);
466 break;
467 case '.':
468 ret = regnode(ANY);
469 *flagp |= HASWIDTH|SIMPLE;
470 break;
471 case '[': {
472 register int class;
473 register int classend;
475 if (*regparse == '^') { /* Complement of range. */
476 ret = regnode(ANYBUT);
477 regparse++;
478 } else
479 ret = regnode(ANYOF);
480 if (*regparse == ']' || *regparse == '-')
481 regc(*regparse++);
482 while (*regparse != '\0' && *regparse != ']') {
483 if (*regparse == '-') {
484 regparse++;
485 if (*regparse == ']' || *regparse == '\0')
486 regc('-');
487 else {
488 class = UCHARAT(regparse-2)+1;
489 classend = UCHARAT(regparse);
490 if (class > classend+1)
491 FAIL("invalid [] range");
492 for (; class <= classend; class++)
493 regc(class);
494 regparse++;
496 } else
497 regc(*regparse++);
499 regc('\0');
500 if (*regparse != ']')
501 FAIL("unmatched []");
502 regparse++;
503 *flagp |= HASWIDTH|SIMPLE;
505 break;
506 case '(':
507 ret = reg(1, &flags);
508 if (ret == NULL)
509 return(NULL);
510 *flagp |= flags&(HASWIDTH|SPSTART);
511 break;
512 case '\0':
513 case '|':
514 case ')':
515 FAIL("internal urp"); /* Supposed to be caught earlier. */
516 break;
517 case '?':
518 case '+':
519 case '*':
520 FAIL("?+* follows nothing");
521 break;
522 case '\\':
523 if (*regparse == '\0')
524 FAIL("trailing \\");
525 ret = regnode(EXACTLY);
526 regc(*regparse++);
527 regc('\0');
528 *flagp |= HASWIDTH|SIMPLE;
529 break;
530 default: {
531 register int len;
532 register unsigned char ender;
534 regparse--;
535 #ifdef STRCSPN
536 len = strcspn(regparse, (unsigned char *)META);
537 #else
538 len = strcspn((char *)regparse, META);
539 #endif
540 if (len <= 0)
541 FAIL("internal disaster");
542 ender = *(regparse+len);
543 if (len > 1 && ISMULT(ender))
544 len--; /* Back off clear of ?+* operand. */
545 *flagp |= HASWIDTH;
546 if (len == 1)
547 *flagp |= SIMPLE;
548 ret = regnode(EXACTLY);
549 while (len > 0) {
550 regc(*regparse++);
551 len--;
553 regc('\0');
555 break;
558 return(ret);
562 - regnode - emit a node
564 STATIC unsigned char *regnode(int iop) {
565 /* Return value is location */
566 register unsigned char *ret;
567 register unsigned char *ptr;
568 unsigned char op;
570 op = (unsigned char) iop;
571 ret = regcode;
572 if (ret == &regdummy) {
573 regsize += 3;
574 return(ret);
577 ptr = ret;
578 *ptr++ = op;
579 *ptr++ = '\0'; /* Null "next" pointer. */
580 *ptr++ = '\0';
581 regcode = ptr;
583 return(ret);
587 - regc - emit (if appropriate) a byte of code
589 STATIC void regc(int ib) {
590 unsigned char b;
592 b = (unsigned char) ib;
593 if (regcode != &regdummy)
594 *regcode++ = b;
595 else
596 regsize++;
600 - reginsert - insert an operator in front of already-emitted operand
602 * Means relocating the operand.
604 STATIC void reginsert(int iop, unsigned char *opnd) {
605 register unsigned char *src;
606 register unsigned char *dst;
607 register unsigned char *place;
608 unsigned char op;
610 op = (unsigned char) iop;
611 if (regcode == &regdummy) {
612 regsize += 3;
613 return;
616 src = regcode;
617 regcode += 3;
618 dst = regcode;
619 while (src > opnd)
620 *--dst = *--src;
622 place = opnd; /* Op node, where operand used to be. */
623 *place++ = op;
624 *place++ = '\0';
625 *place++ = '\0';
629 - regtail - set the next-pointer at the end of a node chain
631 STATIC void regtail(unsigned char *p, unsigned char *val) {
632 register unsigned char *scan;
633 register unsigned char *temp;
634 register int offset;
636 if (p == &regdummy)
637 return;
639 /* Find last node. */
640 scan = p;
641 for (;;) {
642 temp = regnext(scan);
643 if (temp == NULL)
644 break;
645 scan = temp;
648 if (OP(scan) == BACK)
649 offset = scan - val;
650 else
651 offset = val - scan;
652 *(scan+1) = (offset>>8)&0377;
653 *(scan+2) = offset&0377;
657 - regoptail - regtail on operand of first argument; nop if operandless
659 STATIC void regoptail(unsigned char *p, unsigned char *val) {
660 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
661 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
662 return;
663 regtail(OPERAND(p), val);
667 * regexec and friends
671 * Global work variables for regexec().
673 STATIC unsigned char *reginput; /* String-input pointer. */
674 STATIC unsigned char *regbol; /* Beginning of input, for ^ check. */
675 STATIC unsigned char **regstartp; /* Pointer to startp array. */
676 STATIC unsigned char **regendp; /* Ditto for endp. */
678 #ifdef DEBUG
679 int regnarrate = 0;
680 void regdump(void);
681 STATIC unsigned char *regprop(void);
682 #endif
685 - regexec - match a regexp against a string
688 regexec(register regexp *prog, register unsigned char *string) {
689 register unsigned char *s;
690 #ifndef STDLIB
691 extern char *strchr();
692 #endif
694 /* Be paranoid... */
695 if (prog == NULL || string == NULL) {
696 regerror("NULL parameter");
697 return(0);
700 /* Check validity of program. */
701 if (UCHARAT(prog->program) != MAGIC) {
702 regerror("corrupted program");
703 return(0);
706 /* If there is a "must appear" string, look for it. */
707 if (prog->regmust != NULL) {
708 s = string;
709 while ((s = (unsigned char *)strchr((char *)s,prog->regmust[0]))
710 != NULL) {
711 if (strncmp((char *)s, (char *)prog->regmust,
712 prog->regmlen)
713 == 0)
714 break; /* Found it. */
715 s++;
717 if (s == NULL) /* Not present. */
718 return(0);
721 /* Mark beginning of line for ^ . */
722 regbol = string;
724 /* Simplest case: anchored match need be tried only once. */
725 if (prog->reganch)
726 return(regtry(prog, string));
728 /* Messy cases: unanchored match. */
729 s = string;
730 if (prog->regstart != '\0')
731 /* We know what char it must start with. */
732 while ((s = (unsigned char *)strchr((char *)s, prog->regstart))
733 != NULL) {
734 if (regtry(prog, s))
735 return(1);
736 s++;
738 else
739 /* We don't -- general case. */
740 do {
741 if (regtry(prog, s))
742 return(1);
743 } while (*s++ != '\0');
745 /* Failure. */
746 return(0);
750 - regtry - try match at specific point
752 STATIC int regtry(regexp *prog, unsigned char *string) {
753 /* Return value 0 failure, 1 success */
754 register int i;
755 register unsigned char **sp;
756 register unsigned char **ep;
758 reginput = string;
759 regstartp = prog->startp;
760 regendp = prog->endp;
762 sp = prog->startp;
763 ep = prog->endp;
764 for (i = NSUBEXP; i > 0; i--) {
765 *sp++ = NULL;
766 *ep++ = NULL;
768 if (regmatch(prog->program + 1)) {
769 prog->startp[0] = string;
770 prog->endp[0] = reginput;
771 return(1);
772 } else
773 return(0);
777 - regmatch - main matching routine
779 * Conceptually the strategy is simple: check to see whether the current
780 * node matches, call self recursively to see whether the rest matches,
781 * and then act accordingly. In practice we make some effort to avoid
782 * recursion, in particular by going through "ordinary" nodes (that don't
783 * need to know whether the rest of the match failed) by a loop instead of
784 * by recursion.
786 STATIC int regmatch( unsigned char *prog) {
787 /* Return value 0 failure 1 success */
788 register unsigned char *scan; /* Current node. */
789 unsigned char *next; /* Next node. */
790 #ifndef STDLIB
791 extern char *strchr();
792 #endif
794 scan = prog;
795 #ifdef DEBUG
796 if (scan != NULL && regnarrate)
797 fprintf(stderr, "%s(\n", regprop(scan));
798 #endif
799 while (scan != NULL) {
800 #ifdef DEBUG
801 if (regnarrate)
802 fprintf(stderr, "%s...\n", regprop(scan));
803 #endif
804 next = regnext(scan);
806 switch (OP(scan)) {
807 case BOL:
808 if (reginput != regbol)
809 return(0);
810 break;
811 case EOL:
812 if (*reginput != '\0')
813 return(0);
814 break;
815 case ANY:
816 if (*reginput == '\0')
817 return(0);
818 reginput++;
819 break;
820 case EXACTLY: {
821 register int len;
822 register unsigned char *opnd;
824 opnd = OPERAND(scan);
825 /* Inline the first character, for speed. */
826 if (*opnd != *reginput)
827 return(0);
828 len = strlen((char *)opnd);
829 if (len > 1 && strncmp((char *)opnd,
830 (char *)reginput, len)
831 != 0)
832 return(0);
833 reginput += len;
835 break;
836 case ANYOF:
837 if (*reginput == '\0'
838 || strchr((char *)OPERAND(scan), *reginput) == NULL)
839 return(0);
840 reginput++;
841 break;
842 case ANYBUT:
843 if (*reginput == '\0'
844 || strchr((char *)OPERAND(scan), *reginput) != NULL)
845 return(0);
846 reginput++;
847 break;
848 case NOTHING:
849 break;
850 case BACK:
851 break;
852 case OPEN+1:
853 case OPEN+2:
854 case OPEN+3:
855 case OPEN+4:
856 case OPEN+5:
857 case OPEN+6:
858 case OPEN+7:
859 case OPEN+8:
860 case OPEN+9: {
861 register int no;
862 register unsigned char *save;
864 no = OP(scan) - OPEN;
865 save = reginput;
867 if (regmatch(next)) {
869 * Don't set startp if some later
870 * invocation of the same parentheses
871 * already has.
873 if (regstartp[no] == NULL)
874 regstartp[no] = save;
875 return(1);
876 } else
877 return(0);
879 break;
880 case CLOSE+1:
881 case CLOSE+2:
882 case CLOSE+3:
883 case CLOSE+4:
884 case CLOSE+5:
885 case CLOSE+6:
886 case CLOSE+7:
887 case CLOSE+8:
888 case CLOSE+9: {
889 register int no;
890 register unsigned char *save;
892 no = OP(scan) - CLOSE;
893 save = reginput;
895 if (regmatch(next)) {
897 * Don't set endp if some later
898 * invocation of the same parentheses
899 * already has.
901 if (regendp[no] == NULL)
902 regendp[no] = save;
903 return(1);
904 } else
905 return(0);
907 break;
908 case BRANCH: {
909 register unsigned char *save;
911 if (OP(next) != BRANCH) /* No choice. */
912 next = OPERAND(scan); /* Avoid recursion. */
913 else {
914 do {
915 save = reginput;
916 if (regmatch(OPERAND(scan)))
917 return(1);
918 reginput = save;
919 scan = regnext(scan);
920 } while (scan != NULL && OP(scan) == BRANCH);
921 return(0);
922 /* NOTREACHED */
925 break;
926 case STAR:
927 case PLUS: {
928 register unsigned char nextch;
929 register int no;
930 register unsigned char *save;
931 register int min;
934 * Lookahead to avoid useless match attempts
935 * when we know what character comes next.
937 nextch = '\0';
938 if (OP(next) == EXACTLY)
939 nextch = *OPERAND(next);
940 min = (OP(scan) == STAR) ? 0 : 1;
941 save = reginput;
942 no = regrepeat(OPERAND(scan));
943 while (no >= min) {
944 /* If it could work, try it. */
945 if (nextch == '\0' || *reginput == nextch)
946 if (regmatch(next))
947 return(1);
948 /* Couldn't or didn't -- back up. */
949 no--;
950 reginput = save + no;
952 return(0);
954 break;
955 case END:
956 return(1); /* Success! */
957 break;
958 default:
959 regerror("memory corruption");
960 return(0);
961 break;
964 scan = next;
968 * We get here only if there's trouble -- normally "case END" is
969 * the terminating point.
971 regerror("corrupted pointers");
972 return(0);
976 - regrepeat - repeatedly match something simple, report how many
978 STATIC int regrepeat(unsigned char *p) {
979 register int count = 0;
980 register unsigned char *scan;
981 register unsigned char *opnd;
983 scan = reginput;
984 opnd = OPERAND(p);
985 switch (OP(p)) {
986 case ANY:
987 count = strlen((char *)scan);
988 scan += count;
989 break;
990 case EXACTLY:
991 while (*opnd == *scan) {
992 count++;
993 scan++;
995 break;
996 case ANYOF:
997 while (*scan != '\0' && strchr((char *)opnd, *scan) != NULL) {
998 count++;
999 scan++;
1001 break;
1002 case ANYBUT:
1003 while (*scan != '\0' && strchr((char *)opnd, *scan) == NULL) {
1004 count++;
1005 scan++;
1007 break;
1008 default: /* Oh dear. Called inappropriately. */
1009 regerror("internal foulup");
1010 count = 0; /* Best compromise. */
1011 break;
1013 reginput = scan;
1015 return(count);
1019 - regnext - dig the "next" pointer out of a node
1021 STATIC unsigned char *regnext(register unsigned char *p) {
1022 register int offset;
1024 if (p == &regdummy)
1025 return(NULL);
1027 offset = NEXT(p);
1028 if (offset == 0)
1029 return(NULL);
1031 if (OP(p) == BACK)
1032 return(p-offset);
1033 else
1034 return(p+offset);
1037 #ifdef DEBUG
1039 STATIC unsigned char *regprop(void);
1042 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1044 void regdump(regexp r) {
1045 register unsigned char *s;
1046 register unsigned char op = EXACTLY; /* Arbitrary non-END op. */
1047 register unsigned char *next;
1048 extern char *strchr();
1051 s = r->program + 1;
1052 while (op != END) { /* While that wasn't END last time... */
1053 op = OP(s);
1054 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1055 next = regnext(s);
1056 if (next == NULL) /* Next ptr. */
1057 printf("(0)");
1058 else
1059 printf("(%d)", (s-r->program)+(next-s));
1060 s += 3;
1061 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1062 /* Literal string, where present. */
1063 while (*s != '\0') {
1064 putchar(*s);
1065 s++;
1067 s++;
1069 putchar('\n');
1072 /* Header fields of interest. */
1073 if (r->regstart != '\0')
1074 printf("start `%c' ", r->regstart);
1075 if (r->reganch)
1076 printf("anchored ");
1077 if (r->regmust != NULL)
1078 printf("must have \"%s\"", r->regmust);
1079 printf("\n");
1083 - regprop - printable representation of opcode
1085 STATIC unsigned char regprop_buf[50];
1087 STATIC unsigned char *regprop(unsigned char *op) {
1088 register unsigned char *p;
1090 (void) strcpy(regprop_buf, ":");
1092 switch (OP(op)) {
1093 case BOL:
1094 p = "BOL";
1095 break;
1096 case EOL:
1097 p = "EOL";
1098 break;
1099 case ANY:
1100 p = "ANY";
1101 break;
1102 case ANYOF:
1103 p = "ANYOF";
1104 break;
1105 case ANYBUT:
1106 p = "ANYBUT";
1107 break;
1108 case BRANCH:
1109 p = "BRANCH";
1110 break;
1111 case EXACTLY:
1112 p = "EXACTLY";
1113 break;
1114 case NOTHING:
1115 p = "NOTHING";
1116 break;
1117 case BACK:
1118 p = "BACK";
1119 break;
1120 case END:
1121 p = "END";
1122 break;
1123 case OPEN+1:
1124 case OPEN+2:
1125 case OPEN+3:
1126 case OPEN+4:
1127 case OPEN+5:
1128 case OPEN+6:
1129 case OPEN+7:
1130 case OPEN+8:
1131 case OPEN+9:
1132 sprintf(regprop_buf+strlen(regprop_buf), "OPEN%d", OP(op)-OPEN);
1133 p = NULL;
1134 break;
1135 case CLOSE+1:
1136 case CLOSE+2:
1137 case CLOSE+3:
1138 case CLOSE+4:
1139 case CLOSE+5:
1140 case CLOSE+6:
1141 case CLOSE+7:
1142 case CLOSE+8:
1143 case CLOSE+9:
1144 sprintf(regprop_buf+strlen(regprop_buf), "CLOSE%d", OP(op)-CLOSE);
1145 p = NULL;
1146 break;
1147 case STAR:
1148 p = "STAR";
1149 break;
1150 case PLUS:
1151 p = "PLUS";
1152 break;
1153 default:
1154 regerror("corrupted opcode");
1155 break;
1157 if (p != NULL)
1158 (void) strcat(regprop_buf, p);
1159 return(regprop_buf);
1161 #endif
1164 * The following is provided for those people who do not have strcspn() in
1165 * their C libraries. They should get off their butts and do something
1166 * about it; at least one public-domain implementation of those (highly
1167 * useful) string routines has been published on Usenet.
1169 #ifdef STRCSPN
1171 * strcspn - find length of initial segment of s1 consisting entirely
1172 * of characters not from s2
1175 STATIC int strcspn(unsigned char *s1, unsigned char *s2) {
1176 register unsigned char *scan1;
1177 register unsigned char *scan2;
1178 register int count;
1180 count = 0;
1181 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1182 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1183 if (*scan1 == *scan2++)
1184 return(count);
1185 count++;
1187 return(count);
1189 #endif