lib/libcompat/regexp/regexp.c

   1 /*
   2  * regcomp and regexec -- regsub and regerror are elsewhere
   3  *
   4  *      Copyright (c) 1986 by University of Toronto.
   5  *      Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *      Permission is granted to anyone to use this software for any
   8  *      purpose on any computer system, and to redistribute it freely,
   9  *      subject to the following restrictions:
  10  *
  11  *      1. The author is not responsible for the consequences of use of
  12  *              this software, no matter how awful, even if they arise
  13  *              from defects in it.
  14  *
  15  *      2. The origin of this software must not be misrepresented, either
  16  *              by explicit claim or by omission.
  17  *
  18  *      3. Altered versions must be plainly marked as such, and must not
  19  *              be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  30  *
  31  * Beware that some of this code is subtly aware of the way operator
  32  * precedence is structured in regular expressions.  Serious changes in
  33  * regular-expression syntax might require a total rethink.
  34  */
  35
  36 #include <sys/cdefs.h>
  37 #if defined(LIBC_SCCS) && !defined(lint)
  38 __RCSID("$NetBSD: regexp.c,v 1.17 2006/04/08 22:05:36 christos Exp $");
  39 #endif /* LIBC_SCCS and not lint */
  40
  41 #include <ctype.h>
  42 #include <regexp.h>
  43 #include <stdio.h>
  44 #include <stdlib.h>
  45 #include <string.h>
  46 #include "regmagic.h"
  47
  48 /*
  49  * The "internal use only" fields in regexp.h are present to pass info from
  50  * compile to execute that permits the execute phase to run lots faster on
  51  * simple cases.  They are:
  52  *
  53  * regstart     char that must begin a match; '\0' if none obvious
  54  * reganch      is the match anchored (at beginning-of-line only)?
  55  * regmust      string (pointer into program) that match must include, or NULL
  56  * regmlen      length of regmust string
  57  *
  58  * Regstart and reganch permit very fast decisions on suitable starting points
  59  * for a match, cutting down the work a lot.  Regmust permits fast rejection
  60  * of lines that cannot possibly match.  The regmust tests are costly enough
  61  * that regcomp() supplies a regmust only if the r.e. contains something
  62  * potentially expensive (at present, the only such thing detected is * or +
  63  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
  64  * supplied because the test in regexec() needs it and regcomp() is computing
  65  * it anyway.
  66  */
  67
  68 /*
  69  * Structure for regexp "program".  This is essentially a linear encoding
  70  * of a nondeterministic finite-state machine (aka syntax charts or
  71  * "railroad normal form" in parsing technology).  Each node is an opcode
  72  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  73  * all nodes except BRANCH implement concatenation; a "next" pointer with
  74  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  75  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  76  * opposed to a collection of them) is never concatenated with anything
  77  * because of operator precedence.)  The operand of some types of node is
  78  * a literal string; for others, it is a node leading into a sub-FSM.  In
  79  * particular, the operand of a BRANCH node is the first node of the branch.
  80  * (NB this is *not* a tree structure:  the tail of the branch connects
  81  * to the thing following the set of BRANCHes.)  The opcodes are:
  82  */
  83
  84 /* definition   number  opnd?   meaning */
  85 #define END     0       /* no   End of program. */
  86 #define BOL     1       /* no   Match "" at beginning of line. */
  87 #define EOL     2       /* no   Match "" at end of line. */
  88 #define ANY     3       /* no   Match any one character. */
  89 #define ANYOF   4       /* str  Match any character in this string. */
  90 #define ANYBUT  5       /* str  Match any character not in this string. */
  91 #define BRANCH  6       /* node Match this alternative, or the next... */
  92 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  93 #define EXACTLY 8       /* str  Match this string. */
  94 #define NOTHING 9       /* no   Match empty string. */
  95 #define STAR    10      /* node Match this (simple) thing 0 or more times. */
  96 #define PLUS    11      /* node Match this (simple) thing 1 or more times. */
  97 #define WORDA   12      /* no   Match "" at wordchar, where prev is nonword */
  98 #define WORDZ   13      /* no   Match "" at nonwordchar, where prev is word */
  99 #define OPEN    20      /* no   Mark this point in input as start of #n. */
 100                         /*      OPEN+1 is number 1, etc. */
 101 #define CLOSE   30      /* no   Analogous to OPEN. */
 102
 103 /*
 104  * Opcode notes:
 105  *
 106  * BRANCH       The set of branches constituting a single choice are hooked
 107  *              together with their "next" pointers, since precedence prevents
 108  *              anything being concatenated to any individual branch.  The
 109  *              "next" pointer of the last BRANCH in a choice points to the
 110  *              thing following the whole choice.  This is also where the
 111  *              final "next" pointer of each individual branch points; each
 112  *              branch starts with the operand node of a BRANCH node.
 113  *
 114  * BACK         Normal "next" pointers all implicitly point forward; BACK
 115  *              exists to make loop structures possible.
 116  *
 117  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 118  *              BRANCH structures using BACK.  Simple cases (one character
 119  *              per match) are implemented with STAR and PLUS for speed
 120  *              and to minimize recursive plunges.
 121  *
 122  * OPEN,CLOSE   ...are numbered at compile time.
 123  */
 124
 125 /*
 126  * A node is one char of opcode followed by two chars of "next" pointer.
 127  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 128  * value is a positive offset from the opcode of the node containing it.
 129  * An operand, if any, simply follows the node.  (Note that much of the
 130  * code generation knows about this implicit relationship.)
 131  *
 132  * Using two bytes for the "next" pointer is vast overkill for most things,
 133  * but allows patterns to get big without disasters.
 134  */
 135 #define OP(p)   (*(p))
 136 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
 137 #define OPERAND(p)      ((p) + 3)
 138
 139 /*
 140  * See regmagic.h for one further detail of program structure.
 141  */
 142
 143
 144 /*
 145  * Utility definitions.
 146  */
 147 #ifndef CHARBITS
 148 #define UCHARAT(p)      ((int)*(unsigned char *)(p))
 149 #else
 150 #define UCHARAT(p)      ((int)*(p)&CHARBITS)
 151 #endif
 152
 153 #define FAIL(m) { regerror(m); return(NULL); }
 154 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?')
 155
 156 /*
 157  * Flags to be passed up and down.
 158  */
 159 #define HASWIDTH        01      /* Known never to match null string. */
 160 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
 161 #define SPSTART         04      /* Starts with * or +. */
 162 #define WORST           0       /* Worst case. */
 163
 164 /*
 165  * Global work variables for regcomp().
 166  */
 167 static char *regparse;          /* Input-scan pointer. */
 168 static int regnpar;             /* () count. */
 169 static char regdummy;
 170 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
 171 static long regsize;            /* Code size. */
 172
 173 /*
 174  * Forward declarations for regcomp()'s friends.
 175  */
 176 #ifndef STATIC
 177 #define STATIC  static
 178 #endif
 179 STATIC char *reg __P((int, int *));
 180 STATIC char *regbranch __P((int *));
 181 STATIC char *regpiece __P((int *));
 182 STATIC char *regatom __P((int *));
 183 STATIC char *regnode __P((int));
 184 STATIC char *regnext __P((char *));
 185 STATIC void regc __P((int));
 186 STATIC void reginsert __P((int, char *));
 187 STATIC void regtail __P((char *, char *));
 188 STATIC void regoptail __P((char *, char *));
 189 #ifdef STRCSPN
 190 STATIC int strcspn __P((char *, char *));
 191 #endif
 192
 193 /*
 194  - regcomp - compile a regular expression into internal code
 195  *
 196  * We can't allocate space until we know how big the compiled form will be,
 197  * but we can't compile it (and thus know how big it is) until we've got a
 198  * place to put the code.  So we cheat:  we compile it twice, once with code
 199  * generation turned off and size counting turned on, and once "for real".
 200  * This also means that we don't allocate space until we are sure that the
 201  * thing really will compile successfully, and we never have to move the
 202  * code and thus invalidate pointers into it.  (Note that it has to be in
 203  * one piece because free() must be able to free it all.)
 204  *
 205  * Beware that the optimization-preparation code in here knows about some
 206  * of the structure of the compiled regexp.
 207  */
 208 regexp *
 209 __compat_regcomp(expn)
 210 const char *expn;
 211 {
 212         regexp *r;
 213         char *scan;
 214         char *longest;
 215         int len;
 216         int flags;
 217
 218         if (expn == NULL)
 219                 FAIL("NULL argument");
 220
 221         /* First pass: determine size, legality. */
 222 #ifdef notdef
 223         if (expn[0] == '.' && expn[1] == '*') expn += 2;  /* aid grep */
 224 #endif
 225         /* LINTED const castaway */
 226         regparse = (char *)expn;
 227         regnpar = 1;
 228         regsize = 0L;
 229         regcode = &regdummy;
 230         regc(MAGIC);
 231         if (reg(0, &flags) == NULL)
 232                 return(NULL);
 233
 234         /* Small enough for pointer-storage convention? */
 235         if (regsize >= 32767L)          /* Probably could be 65535L. */
 236                 FAIL("regexp too big");
 237
 238         /* Allocate space. */
 239         r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
 240         if (r == NULL)
 241                 FAIL("out of space");
 242
 243         /* Second pass: emit code. */
 244         /* LINTED const castaway */
 245         regparse = (char *)expn;
 246         regnpar = 1;
 247         regcode = r->program;
 248         regc(MAGIC);
 249         if (reg(0, &flags) == NULL)
 250                 return(NULL);
 251
 252         /* Dig out information for optimizations. */
 253         r->regstart = '\0';     /* Worst-case defaults. */
 254         r->reganch = 0;
 255         r->regmust = NULL;
 256         r->regmlen = 0;
 257         scan = r->program+1;                    /* First BRANCH. */
 258         if (OP(regnext(scan)) == END) {         /* Only one top-level choice. */
 259                 scan = OPERAND(scan);
 260
 261                 /* Starting-point info. */
 262                 if (OP(scan) == EXACTLY)
 263                         r->regstart = *OPERAND(scan);
 264                 else if (OP(scan) == BOL)
 265                         r->reganch++;
 266
 267                 /*
 268                  * If there's something expensive in the r.e., find the
 269                  * longest literal string that must appear and make it the
 270                  * regmust.  Resolve ties in favor of later strings, since
 271                  * the regstart check works with the beginning of the r.e.
 272                  * and avoiding duplication strengthens checking.  Not a
 273                  * strong reason, but sufficient in the absence of others.
 274                  */
 275                 if (flags&SPSTART) {
 276                         longest = NULL;
 277                         len = 0;
 278                         for (; scan != NULL; scan = regnext(scan))
 279                                 if (OP(scan) == EXACTLY && (int) strlen(OPERAND(scan)) >= len) {
 280                                         longest = OPERAND(scan);
 281                                         len = strlen(OPERAND(scan));
 282                                 }
 283                         r->regmust = longest;
 284                         r->regmlen = len;
 285                 }
 286         }
 287
 288         return(r);
 289 }
 290
 291 /*
 292  - reg - regular expression, i.e. main body or parenthesized thing
 293  *
 294  * Caller must absorb opening parenthesis.
 295  *
 296  * Combining parenthesis handling with the base level of regular expression
 297  * is a trifle forced, but the need to tie the tails of the branches to what
 298  * follows makes it hard to avoid.
 299  */
 300 static char *
 301 reg(paren, flagp)
 302 int paren;                      /* Parenthesized? */
 303 int *flagp;
 304 {
 305         char *ret;
 306         char *br;
 307         char *ender;
 308         int parno = 0;
 309         int flags;
 310
 311         *flagp = HASWIDTH;      /* Tentatively. */
 312
 313         /* Make an OPEN node, if parenthesized. */
 314         if (paren) {
 315                 if (regnpar >= NSUBEXP)
 316                         FAIL("too many ()");
 317                 parno = regnpar;
 318                 regnpar++;
 319                 ret = regnode(OPEN+parno);
 320         } else
 321                 ret = NULL;
 322
 323         /* Pick up the branches, linking them together. */
 324         br = regbranch(&flags);
 325         if (br == NULL)
 326                 return(NULL);
 327         if (ret != NULL)
 328                 regtail(ret, br);       /* OPEN -> first. */
 329         else
 330                 ret = br;
 331         if (!(flags&HASWIDTH))
 332                 *flagp &= ~HASWIDTH;
 333         *flagp |= flags&SPSTART;
 334         while (*regparse == '|' || *regparse == '\n') {
 335                 regparse++;
 336                 br = regbranch(&flags);
 337                 if (br == NULL)
 338                         return(NULL);
 339                 regtail(ret, br);       /* BRANCH -> BRANCH. */
 340                 if (!(flags&HASWIDTH))
 341                         *flagp &= ~HASWIDTH;
 342                 *flagp |= flags&SPSTART;
 343         }
 344
 345         /* Make a closing node, and hook it on the end. */
 346         ender = regnode((paren) ? CLOSE+parno : END);
 347         regtail(ret, ender);
 348
 349         /* Hook the tails of the branches to the closing node. */
 350         for (br = ret; br != NULL; br = regnext(br))
 351                 regoptail(br, ender);
 352
 353         /* Check for proper termination. */
 354         if (paren && *regparse++ != ')') {
 355                 FAIL("unmatched ()");
 356         } else if (!paren && *regparse != '\0') {
 357                 if (*regparse == ')') {
 358                         FAIL("unmatched ()");
 359                 } else
 360                         FAIL("junk on end");    /* "Can't happen". */
 361                 /* NOTREACHED */
 362         }
 363
 364         return(ret);
 365 }
 366
 367 /*
 368  - regbranch - one alternative of an | operator
 369  *
 370  * Implements the concatenation operator.
 371  */
 372 static char *
 373 regbranch(flagp)
 374 int *flagp;
 375 {
 376         char *ret;
 377         char *chain;
 378         char *latest;
 379         int flags;
 380
 381         *flagp = WORST;         /* Tentatively. */
 382
 383         ret = regnode(BRANCH);
 384         chain = NULL;
 385         while (*regparse != '\0' && *regparse != ')' &&
 386                *regparse != '\n' && *regparse != '|') {
 387                 latest = regpiece(&flags);
 388                 if (latest == NULL)
 389                         return(NULL);
 390                 *flagp |= flags&HASWIDTH;
 391                 if (chain == NULL)      /* First piece. */
 392                         *flagp |= flags&SPSTART;
 393                 else
 394                         regtail(chain, latest);
 395                 chain = latest;
 396         }
 397         if (chain == NULL)      /* Loop ran zero times. */
 398                 (void) regnode(NOTHING);
 399
 400         return(ret);
 401 }
 402
 403 /*
 404  - regpiece - something followed by possible [*+?]
 405  *
 406  * Note that the branching code sequences used for ? and the general cases
 407  * of * and + are somewhat optimized:  they use the same NOTHING node as
 408  * both the endmarker for their branch list and the body of the last branch.
 409  * It might seem that this node could be dispensed with entirely, but the
 410  * endmarker role is not redundant.
 411  */
 412 static char *
 413 regpiece(flagp)
 414 int *flagp;
 415 {
 416         char *ret;
 417         char op;
 418         char *next;
 419         int flags;
 420
 421         ret = regatom(&flags);
 422         if (ret == NULL)
 423                 return(NULL);
 424
 425         op = *regparse;
 426         if (!ISMULT(op)) {
 427                 *flagp = flags;
 428                 return(ret);
 429         }
 430
 431         if (!(flags&HASWIDTH) && op != '?')
 432                 FAIL("*+ operand could be empty");
 433         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
 434
 435         if (op == '*' && (flags&SIMPLE))
 436                 reginsert(STAR, ret);
 437         else if (op == '*') {
 438                 /* Emit x* as (x&|), where & means "self". */
 439                 reginsert(BRANCH, ret);                 /* Either x */
 440                 regoptail(ret, regnode(BACK));          /* and loop */
 441                 regoptail(ret, ret);                    /* back */
 442                 regtail(ret, regnode(BRANCH));          /* or */
 443                 regtail(ret, regnode(NOTHING));         /* null. */
 444         } else if (op == '+' && (flags&SIMPLE))
 445                 reginsert(PLUS, ret);
 446         else if (op == '+') {
 447                 /* Emit x+ as x(&|), where & means "self". */
 448                 next = regnode(BRANCH);                 /* Either */
 449                 regtail(ret, next);
 450                 regtail(regnode(BACK), ret);            /* loop back */
 451                 regtail(next, regnode(BRANCH));         /* or */
 452                 regtail(ret, regnode(NOTHING));         /* null. */
 453         } else if (op == '?') {
 454                 /* Emit x? as (x|) */
 455                 reginsert(BRANCH, ret);                 /* Either x */
 456                 regtail(ret, regnode(BRANCH));          /* or */
 457                 next = regnode(NOTHING);                /* null. */
 458                 regtail(ret, next);
 459                 regoptail(ret, next);
 460         }
 461         regparse++;
 462         if (ISMULT(*regparse))
 463                 FAIL("nested *?+");
 464
 465         return(ret);
 466 }
 467
 468 /*
 469  - regatom - the lowest level
 470  *
 471  * Optimization:  gobbles an entire sequence of ordinary characters so that
 472  * it can turn them into a single node, which is smaller to store and
 473  * faster to run.  Backslashed characters are exceptions, each becoming a
 474  * separate node; the code is simpler that way and it's not worth fixing.
 475  */
 476 static char *
 477 regatom(flagp)
 478 int *flagp;
 479 {
 480         char *ret;
 481         int flags;
 482
 483         *flagp = WORST;         /* Tentatively. */
 484
 485         switch (*regparse++) {
 486         /* FIXME: these chars only have meaning at beg/end of pat? */
 487         case '^':
 488                 ret = regnode(BOL);
 489                 break;
 490         case '$':
 491                 ret = regnode(EOL);
 492                 break;
 493         case '.':
 494                 ret = regnode(ANY);
 495                 *flagp |= HASWIDTH|SIMPLE;
 496                 break;
 497         case '[': {
 498                         int class;
 499                         int classend;
 500
 501                         if (*regparse == '^') { /* Complement of range. */
 502                                 ret = regnode(ANYBUT);
 503                                 regparse++;
 504                         } else
 505                                 ret = regnode(ANYOF);
 506                         if (*regparse == ']' || *regparse == '-')
 507                                 regc(*regparse++);
 508                         while (*regparse != '\0' && *regparse != ']') {
 509                                 if (*regparse == '-') {
 510                                         regparse++;
 511                                         if (*regparse == ']' || *regparse == '\0')
 512                                                 regc('-');
 513                                         else {
 514                                                 class = UCHARAT(regparse-2)+1;
 515                                                 classend = UCHARAT(regparse);
 516                                                 if (class > classend+1)
 517                                                         FAIL("invalid [] range");
 518                                                 for (; class <= classend; class++)
 519                                                         regc(class);
 520                                                 regparse++;
 521                                         }
 522                                 } else
 523                                         regc(*regparse++);
 524                         }
 525                         regc('\0');
 526                         if (*regparse != ']')
 527                                 FAIL("unmatched []");
 528                         regparse++;
 529                         *flagp |= HASWIDTH|SIMPLE;
 530                 }
 531                 break;
 532         case '(':
 533                 ret = reg(1, &flags);
 534                 if (ret == NULL)
 535                         return(NULL);
 536                 *flagp |= flags&(HASWIDTH|SPSTART);
 537                 break;
 538         case '\0':
 539         case '|':
 540         case '\n':
 541         case ')':
 542                 FAIL("internal urp");   /* Supposed to be caught earlier. */
 543         case '?':
 544         case '+':
 545         case '*':
 546                 FAIL("?+* follows nothing");
 547         case '\\':
 548                 switch (*regparse++) {
 549                 case '\0':
 550                         FAIL("trailing \\");
 551                 case '<':
 552                         ret = regnode(WORDA);
 553                         break;
 554                 case '>':
 555                         ret = regnode(WORDZ);
 556                         break;
 557                 /* FIXME: Someday handle \1, \2, ... */
 558                 default:
 559                         /* Handle general quoted chars in exact-match routine */
 560                         goto de_fault;
 561                 }
 562                 break;
 563         de_fault:
 564                 /*FALLTHROUGH*/
 565         default:
 566                 /*
 567                  * Encode a string of characters to be matched exactly.
 568                  *
 569                  * This is a bit tricky due to quoted chars and due to
 570                  * '*', '+', and '?' taking the SINGLE char previous
 571                  * as their operand.
 572                  *
 573                  * On entry, the char at regparse[-1] is going to go
 574                  * into the string, no matter what it is.  (It could be
 575                  * following a \ if we are entered from the '\' case.)
 576                  *
 577                  * Basic idea is to pick up a good char in  ch  and
 578                  * examine the next char.  If it's *+? then we twiddle.
 579                  * If it's \ then we frozzle.  If it's other magic char
 580                  * we push  ch  and terminate the string.  If none of the
 581                  * above, we push  ch  on the string and go around again.
 582                  *
 583                  *  regprev  is used to remember where "the current char"
 584                  * starts in the string, if due to a *+? we need to back
 585                  * up and put the current char in a separate, 1-char, string.
 586                  * When  regprev  is NULL,  ch  is the only char in the
 587                  * string; this is used in *+? handling, and in setting
 588                  * flags |= SIMPLE at the end.
 589                  */
 590                 {
 591                         char *regprev;
 592                         char ch;
 593
 594                         regparse--;                     /* Look at cur char */
 595                         ret = regnode(EXACTLY);
 596                         for ( regprev = 0 ; ; ) {
 597                                 ch = *regparse++;       /* Get current char */
 598                                 switch (*regparse) {    /* look at next one */
 599
 600                                 default:
 601                                         regc(ch);       /* Add cur to string */
 602                                         break;
 603
 604                                 case '.': case '[': case '(':
 605                                 case ')': case '|': case '\n':
 606                                 case '$': case '^':
 607                                 case '\0':
 608                                 /* FIXME, $ and ^ should not always be magic */
 609                                 magic:
 610                                         regc(ch);       /* dump cur char */
 611                                         goto done;      /* and we are done */
 612
 613                                 case '?': case '+': case '*':
 614                                         if (!regprev)   /* If just ch in str, */
 615                                                 goto magic;     /* use it */
 616                                         /* End mult-char string one early */
 617                                         regparse = regprev; /* Back up parse */
 618                                         goto done;
 619
 620                                 case '\\':
 621                                         regc(ch);       /* Cur char OK */
 622                                         switch (regparse[1]){ /* Look after \ */
 623                                         case '\0':
 624                                         case '<':
 625                                         case '>':
 626                                         /* FIXME: Someday handle \1, \2, ... */
 627                                                 goto done; /* Not quoted */
 628                                         default:
 629                                                 /* Backup point is \, scan                                                       * point is after it. */
 630                                                 regprev = regparse;
 631                                                 regparse++;
 632                                                 continue;       /* NOT break; */
 633                                         }
 634                                 }
 635                                 regprev = regparse;     /* Set backup point */
 636                         }
 637                 done:
 638                         regc('\0');
 639                         *flagp |= HASWIDTH;
 640                         if (!regprev)           /* One char? */
 641                                 *flagp |= SIMPLE;
 642                 }
 643                 break;
 644         }
 645
 646         return(ret);
 647 }
 648
 649 /*
 650  - regnode - emit a node
 651  */
 652 static char *                   /* Location. */
 653 regnode(op)
 654 int op;
 655 {
 656         char *ret;
 657         char *ptr;
 658
 659         ret = regcode;
 660         if (ret == &regdummy) {
 661                 regsize += 3;
 662                 return(ret);
 663         }
 664
 665         ptr = ret;
 666         *ptr++ = op;
 667         *ptr++ = '\0';          /* Null "next" pointer. */
 668         *ptr++ = '\0';
 669         regcode = ptr;
 670
 671         return(ret);
 672 }
 673
 674 /*
 675  - regc - emit (if appropriate) a byte of code
 676  */
 677 static void
 678 regc(b)
 679 int b;
 680 {
 681         if (regcode != &regdummy)
 682                 *regcode++ = b;
 683         else
 684                 regsize++;
 685 }
 686
 687 /*
 688  - reginsert - insert an operator in front of already-emitted operand
 689  *
 690  * Means relocating the operand.
 691  */
 692 static void
 693 reginsert(op, opnd)
 694 int op;
 695 char *opnd;
 696 {
 697         char *src;
 698         char *dst;
 699         char *place;
 700
 701         if (regcode == &regdummy) {
 702                 regsize += 3;
 703                 return;
 704         }
 705
 706         src = regcode;
 707         regcode += 3;
 708         dst = regcode;
 709         while (src > opnd)
 710                 *--dst = *--src;
 711
 712         place = opnd;           /* Op node, where operand used to be. */
 713         *place++ = op;
 714         *place++ = '\0';
 715         *place++ = '\0';
 716 }
 717
 718 /*
 719  - regtail - set the next-pointer at the end of a node chain
 720  */
 721 static void
 722 regtail(p, val)
 723 char *p;
 724 char *val;
 725 {
 726         char *scan;
 727         char *temp;
 728         int offset;
 729
 730         if (p == &regdummy)
 731                 return;
 732
 733         /* Find last node. */
 734         scan = p;
 735         for (;;) {
 736                 temp = regnext(scan);
 737                 if (temp == NULL)
 738                         break;
 739                 scan = temp;
 740         }
 741
 742         if (OP(scan) == BACK)
 743                 offset = scan - val;
 744         else
 745                 offset = val - scan;
 746         *(scan+1) = ((unsigned int)offset>>8)&0377;
 747         *(scan+2) = offset&0377;
 748 }
 749
 750 /*
 751  - regoptail - regtail on operand of first argument; nop if operandless
 752  */
 753 static void
 754 regoptail(p, val)
 755 char *p;
 756 char *val;
 757 {
 758         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
 759         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
 760                 return;
 761         regtail(OPERAND(p), val);
 762 }
 763
 764 /*
 765  * regexec and friends
 766  */
 767
 768 /*
 769  * Global work variables for regexec().
 770  */
 771 static char *reginput;          /* String-input pointer. */
 772 static char *regbol;            /* Beginning of input, for ^ check. */
 773 static char **regstartp;        /* Pointer to startp array. */
 774 static char **regendp;          /* Ditto for endp. */
 775
 776 /*
 777  * Forwards.
 778  */
 779 STATIC int regtry __P((const regexp *, const char *));
 780 STATIC int regmatch __P((char *));
 781 STATIC int regrepeat __P((char *));
 782
 783 #ifdef DEBUG
 784 int regnarrate = 0;
 785 void regdump __P((regexp *));
 786 STATIC char *regprop __P((char *));
 787 #endif
 788
 789 /*
 790  - regexec - match a regexp against a string
 791  */
 792 int
 793 __compat_regexec(prog, string)
 794 const regexp *prog;
 795 const char *string;
 796 {
 797         char *s;
 798
 799         /* Be paranoid... */
 800         if (prog == NULL || string == NULL) {
 801                 regerror("NULL parameter");
 802                 return(0);
 803         }
 804
 805         /* Check validity of program. */
 806         if (UCHARAT(prog->program) != MAGIC) {
 807                 regerror("corrupted program");
 808                 return(0);
 809         }
 810
 811         /* If there is a "must appear" string, look for it. */
 812         if (prog->regmust != NULL) {
 813                 /* LINTED const castaway */
 814                 s = (char *)string;
 815                 while ((s = strchr(s, prog->regmust[0])) != NULL) {
 816                         if (strncmp(s, prog->regmust,
 817                             (size_t)prog->regmlen) == 0)
 818                                 break;  /* Found it. */
 819                         s++;
 820                 }
 821                 if (s == NULL)  /* Not present. */
 822                         return(0);
 823         }
 824
 825         /* Mark beginning of line for ^ . */
 826         /* LINTED const castaway */
 827         regbol = (char *)string;
 828
 829         /* Simplest case:  anchored match need be tried only once. */
 830         if (prog->reganch)
 831                 return(regtry(prog, string));
 832
 833         /* Messy cases:  unanchored match. */
 834         /* LINTED const castaway */
 835         s = (char *)string;
 836         if (prog->regstart != '\0')
 837                 /* We know what char it must start with. */
 838                 while ((s = strchr(s, prog->regstart)) != NULL) {
 839                         if (regtry(prog, s))
 840                                 return(1);
 841                         s++;
 842                 }
 843         else
 844                 /* We don't -- general case. */
 845                 do {
 846                         if (regtry(prog, s))
 847                                 return(1);
 848                 } while (*s++ != '\0');
 849
 850         /* Failure. */
 851         return(0);
 852 }
 853
 854 /*
 855  - regtry - try match at specific point
 856  */
 857 static int                      /* 0 failure, 1 success */
 858 regtry(prog, string)
 859 const regexp *prog;
 860 const char *string;
 861 {
 862         int i;
 863         char **sp;
 864         char **ep;
 865
 866         /* LINTED const castaway */
 867         reginput = (char *)string;                              /* XXX */
 868         regstartp = (char **)prog->startp;                      /* XXX */
 869         regendp = (char **)prog->endp;                          /* XXX */
 870
 871         sp = (char **)prog->startp;                             /* XXX */
 872         ep = (char **)prog->endp;                               /* XXX */
 873         for (i = NSUBEXP; i > 0; i--) {
 874                 *sp++ = NULL;
 875                 *ep++ = NULL;
 876         }
 877         if (regmatch((char *)prog->program + 1)) {              /* XXX */
 878                 /* LINTED const castaway */
 879                 ((regexp *)prog)->startp[0] = (char *)string;   /* XXX */
 880                 /* LINTED const castaway */
 881                 ((regexp *)prog)->endp[0] = reginput;           /* XXX */
 882                 return(1);
 883         } else
 884                 return(0);
 885 }
 886
 887 /*
 888  - regmatch - main matching routine
 889  *
 890  * Conceptually the strategy is simple:  check to see whether the current
 891  * node matches, call self recursively to see whether the rest matches,
 892  * and then act accordingly.  In practice we make some effort to avoid
 893  * recursion, in particular by going through "ordinary" nodes (that don't
 894  * need to know whether the rest of the match failed) by a loop instead of
 895  * by recursion.
 896  */
 897 static int                      /* 0 failure, 1 success */
 898 regmatch(prog)
 899 char *prog;
 900 {
 901         char *scan;     /* Current node. */
 902         char *next;             /* Next node. */
 903
 904         scan = prog;
 905 #ifdef DEBUG
 906         if (scan != NULL && regnarrate)
 907                 fprintf(stderr, "%s(\n", regprop(scan));
 908 #endif
 909         while (scan != NULL) {
 910 #ifdef DEBUG
 911                 if (regnarrate)
 912                         fprintf(stderr, "%s...\n", regprop(scan));
 913 #endif
 914                 next = regnext(scan);
 915
 916                 switch (OP(scan)) {
 917                 case BOL:
 918                         if (reginput != regbol)
 919                                 return(0);
 920                         break;
 921                 case EOL:
 922                         if (*reginput != '\0')
 923                                 return(0);
 924                         break;
 925                 case WORDA:
 926                         /* Must be looking at a letter, digit, or _ */
 927                         if ((!isalnum(UCHARAT(reginput))) && *reginput != '_')
 928                                 return(0);
 929                         /* Prev must be BOL or nonword */
 930                         if (reginput > regbol &&
 931                             (isalnum(UCHARAT(reginput - 1))
 932                              || reginput[-1] == '_'))
 933                                 return(0);
 934                         break;
 935                 case WORDZ:
 936                         /* Must be looking at non letter, digit, or _ */
 937                         if (isalnum(UCHARAT(reginput)) || *reginput == '_')
 938                                 return(0);
 939                         /* We don't care what the previous char was */
 940                         break;
 941                 case ANY:
 942                         if (*reginput == '\0')
 943                                 return(0);
 944                         reginput++;
 945                         break;
 946                 case EXACTLY: {
 947                                 int len;
 948                                 char *opnd;
 949
 950                                 opnd = OPERAND(scan);
 951                                 /* Inline the first character, for speed. */
 952                                 if (*opnd != *reginput)
 953                                         return(0);
 954                                 len = strlen(opnd);
 955                                 if (len > 1 && strncmp(opnd, reginput,
 956                                     (size_t)len) != 0)
 957                                         return(0);
 958                                 reginput += len;
 959                         }
 960                         break;
 961                 case ANYOF:
 962                         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
 963                                 return(0);
 964                         reginput++;
 965                         break;
 966                 case ANYBUT:
 967                         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
 968                                 return(0);
 969                         reginput++;
 970                         break;
 971                 case NOTHING:
 972                         break;
 973                 case BACK:
 974                         break;
 975                 case OPEN+1:
 976                 case OPEN+2:
 977                 case OPEN+3:
 978                 case OPEN+4:
 979                 case OPEN+5:
 980                 case OPEN+6:
 981                 case OPEN+7:
 982                 case OPEN+8:
 983                 case OPEN+9: {
 984                                 int no;
 985                                 char *save;
 986
 987                                 no = OP(scan) - OPEN;
 988                                 save = reginput;
 989
 990                                 if (regmatch(next)) {
 991                                         /*
 992                                          * Don't set startp if some later
 993                                          * invocation of the same parentheses
 994                                          * already has.
 995                                          */
 996                                         if (regstartp[no] == NULL)
 997                                                 regstartp[no] = save;
 998                                         return(1);
 999                                 } else
1000                                         return(0);
1001                         }
1002                 case CLOSE+1:
1003                 case CLOSE+2:
1004                 case CLOSE+3:
1005                 case CLOSE+4:
1006                 case CLOSE+5:
1007                 case CLOSE+6:
1008                 case CLOSE+7:
1009                 case CLOSE+8:
1010                 case CLOSE+9: {
1011                                 int no;
1012                                 char *save;
1013
1014                                 no = OP(scan) - CLOSE;
1015                                 save = reginput;
1016
1017                                 if (regmatch(next)) {
1018                                         /*
1019                                          * Don't set endp if some later
1020                                          * invocation of the same parentheses
1021                                          * already has.
1022                                          */
1023                                         if (regendp[no] == NULL)
1024                                                 regendp[no] = save;
1025                                         return(1);
1026                                 } else
1027                                         return(0);
1028                         }
1029                 case BRANCH: {
1030                                 char *save;
1031
1032                                 if (OP(next) != BRANCH)         /* No choice. */
1033                                         next = OPERAND(scan);   /* Avoid recursion. */
1034                                 else {
1035                                         do {
1036                                                 save = reginput;
1037                                                 if (regmatch(OPERAND(scan)))
1038                                                         return(1);
1039                                                 reginput = save;
1040                                                 scan = regnext(scan);
1041                                         } while (scan != NULL && OP(scan) == BRANCH);
1042                                         return(0);
1043                                         /* NOTREACHED */
1044                                 }
1045                         }
1046                         break;
1047                 case STAR:
1048                 case PLUS: {
1049                                 char nextch;
1050                                 int no;
1051                                 char *save;
1052                                 int min;
1053
1054                                 /*
1055                                  * Lookahead to avoid useless match attempts
1056                                  * when we know what character comes next.
1057                                  */
1058                                 nextch = '\0';
1059                                 if (OP(next) == EXACTLY)
1060                                         nextch = *OPERAND(next);
1061                                 min = (OP(scan) == STAR) ? 0 : 1;
1062                                 save = reginput;
1063                                 no = regrepeat(OPERAND(scan));
1064                                 while (no >= min) {
1065                                         /* If it could work, try it. */
1066                                         if (nextch == '\0' || *reginput == nextch)
1067                                                 if (regmatch(next))
1068                                                         return(1);
1069                                         /* Couldn't or didn't -- back up. */
1070                                         no--;
1071                                         reginput = save + no;
1072                                 }
1073                                 return(0);
1074                         }
1075                 case END:
1076                         return(1);      /* Success! */
1077                 default:
1078                         regerror("memory corruption");
1079                         return(0);
1080                 }
1081
1082                 scan = next;
1083         }
1084
1085         /*
1086          * We get here only if there's trouble -- normally "case END" is
1087          * the terminating point.
1088          */
1089         regerror("corrupted pointers");
1090         return(0);
1091 }
1092
1093 /*
1094  - regrepeat - repeatedly match something simple, report how many
1095  */
1096 static int
1097 regrepeat(p)
1098 char *p;
1099 {
1100         int count = 0;
1101         char *scan;
1102         char *opnd;
1103
1104         scan = reginput;
1105         opnd = OPERAND(p);
1106         switch (OP(p)) {
1107         case ANY:
1108                 count = strlen(scan);
1109                 scan += count;
1110                 break;
1111         case EXACTLY:
1112                 while (*opnd == *scan) {
1113                         count++;
1114                         scan++;
1115                 }
1116                 break;
1117         case ANYOF:
1118                 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1119                         count++;
1120                         scan++;
1121                 }
1122                 break;
1123         case ANYBUT:
1124                 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1125                         count++;
1126                         scan++;
1127                 }
1128                 break;
1129         default:                /* Oh dear.  Called inappropriately. */
1130                 regerror("internal foulup");
1131                 count = 0;      /* Best compromise. */
1132                 break;
1133         }
1134         reginput = scan;
1135
1136         return(count);
1137 }
1138
1139 /*
1140  - regnext - dig the "next" pointer out of a node
1141  */
1142 static char *
1143 regnext(p)
1144 char *p;
1145 {
1146         int offset;
1147
1148         if (p == &regdummy)
1149                 return(NULL);
1150
1151         offset = NEXT(p);
1152         if (offset == 0)
1153                 return(NULL);
1154
1155         if (OP(p) == BACK)
1156                 return(p-offset);
1157         else
1158                 return(p+offset);
1159 }
1160
1161 #ifdef DEBUG
1162
1163 /*
1164  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1165  */
1166 void
1167 regdump(r)
1168 regexp *r;
1169 {
1170         char *s;
1171         char op = EXACTLY;      /* Arbitrary non-END op. */
1172         char *next;
1173
1174
1175         s = r->program + 1;
1176         while (op != END) {     /* While that wasn't END last time... */
1177                 op = OP(s);
1178                 printf("%2td%s", s-r->program, regprop(s));     /* Where, what. */
1179                 next = regnext(s);
1180                 if (next == NULL)               /* Next ptr. */
1181                         printf("(0)");
1182                 else
1183                         printf("(%td)", (s-r->program)+(next-s));
1184                 s += 3;
1185                 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1186                         /* Literal string, where present. */
1187                         while (*s != '\0') {
1188                                 putchar(*s);
1189                                 s++;
1190                         }
1191                         s++;
1192                 }
1193                 putchar('\n');
1194         }
1195
1196         /* Header fields of interest. */
1197         if (r->regstart != '\0')
1198                 printf("start `%c' ", r->regstart);
1199         if (r->reganch)
1200                 printf("anchored ");
1201         if (r->regmust != NULL)
1202                 printf("must have \"%s\"", r->regmust);
1203         printf("\n");
1204 }
1205
1206 /*
1207  - regprop - printable representation of opcode
1208  */
1209 static char *
1210 regprop(op)
1211 char *op;
1212 {
1213         char *p;
1214         static char buf[50];
1215
1216         (void)strncpy(buf, ":", sizeof(buf) - 1);
1217
1218         switch (OP(op)) {
1219         case BOL:
1220                 p = "BOL";
1221                 break;
1222         case EOL:
1223                 p = "EOL";
1224                 break;
1225         case ANY:
1226                 p = "ANY";
1227                 break;
1228         case ANYOF:
1229                 p = "ANYOF";
1230                 break;
1231         case ANYBUT:
1232                 p = "ANYBUT";
1233                 break;
1234         case BRANCH:
1235                 p = "BRANCH";
1236                 break;
1237         case EXACTLY:
1238                 p = "EXACTLY";
1239                 break;
1240         case NOTHING:
1241                 p = "NOTHING";
1242                 break;
1243         case BACK:
1244                 p = "BACK";
1245                 break;
1246         case END:
1247                 p = "END";
1248                 break;
1249         case OPEN+1:
1250         case OPEN+2:
1251         case OPEN+3:
1252         case OPEN+4:
1253         case OPEN+5:
1254         case OPEN+6:
1255         case OPEN+7:
1256         case OPEN+8:
1257         case OPEN+9:
1258                 (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1259                     "OPEN%d", OP(op)-OPEN);
1260                 p = NULL;
1261                 break;
1262         case CLOSE+1:
1263         case CLOSE+2:
1264         case CLOSE+3:
1265         case CLOSE+4:
1266         case CLOSE+5:
1267         case CLOSE+6:
1268         case CLOSE+7:
1269         case CLOSE+8:
1270         case CLOSE+9:
1271                 (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1272                     "CLOSE%d", OP(op)-CLOSE);
1273                 p = NULL;
1274                 break;
1275         case STAR:
1276                 p = "STAR";
1277                 break;
1278         case PLUS:
1279                 p = "PLUS";
1280                 break;
1281         case WORDA:
1282                 p = "WORDA";
1283                 break;
1284         case WORDZ:
1285                 p = "WORDZ";
1286                 break;
1287         default:
1288                 p = NULL;
1289                 regerror("corrupted opcode");
1290                 break;
1291         }
1292         if (p != NULL)
1293                 (void)strncat(buf, p, sizeof(buf) - strlen(buf) - 1);
1294         return(buf);
1295 }
1296 #endif
1297
1298 /*
1299  * The following is provided for those people who do not have strcspn() in
1300  * their C libraries.  They should get off their butts and do something
1301  * about it; at least one public-domain implementation of those (highly
1302  * useful) string routines has been published on Usenet.
1303  */
1304 #ifdef STRCSPN
1305 /*
1306  * strcspn - find length of initial segment of s1 consisting entirely
1307  * of characters not from s2
1308  */
1309
1310 static int
1311 strcspn(s1, s2)
1312 char *s1;
1313 char *s2;
1314 {
1315         char *scan1;
1316         char *scan2;
1317         int count;
1318
1319         count = 0;
1320         for (scan1 = s1; *scan1 != '\0'; scan1++) {
1321                 for (scan2 = s2; *scan2 != '\0';)       /* ++ moved down. */
1322                         if (*scan1 == *scan2++)
1323                                 return(count);
1324                 count++;
1325         }
1326         return(count);
1327 }
1328 #endif