release/src/router/usbmodeswitch/jim/jimregexp.c

   1 /*
   2  * regcomp and regexec -- regsub and regerror are elsewhere
   3  *
   4  *      Copyright (c) 1986 by University of Toronto.
   5  *      Written by Henry Spencer.  Not derived from licensed software.
   6  *
   7  *      Permission is granted to anyone to use this software for any
   8  *      purpose on any computer system, and to redistribute it freely,
   9  *      subject to the following restrictions:
  10  *
  11  *      1. The author is not responsible for the consequences of use of
  12  *              this software, no matter how awful, even if they arise
  13  *              from defects in it.
  14  *
  15  *      2. The origin of this software must not be misrepresented, either
  16  *              by explicit claim or by omission.
  17  *
  18  *      3. Altered versions must be plainly marked as such, and must not
  19  *              be misrepresented as being the original software.
  20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
  22  *** to assist in implementing egrep.
  23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
  25  *** as in BSD grep and ex.
  26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
  27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
  28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
  29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
  30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
  32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
  33  *** to avoid conflicting with the system regexp.h.  Const, bless its
  34  *** soul, was removed so it can compile everywhere.  The declaration
  35  *** of strchr() was in conflict on AIX, so it was removed (as it is
  36  *** happily defined in string.h).
  37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
  39  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
  40  *** seiwald@perforce.com, on 05 November 2002, to const string literals.
  41  *
  42  *   THIS IS AN ALTERED VERSION.  It was altered by Steve Bennett <steveb@workware.net.au>
  43  *   on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
  44  *   This includes counted repetitions, UTF-8 support, character classes,
  45  *   shorthand character classes, increased number of parentheses to 100,
  46  *   backslash escape sequences. It also removes \n as an alternative to |.
  47  *
  48  * Beware that some of this code is subtly aware of the way operator
  49  * precedence is structured in regular expressions.  Serious changes in
  50  * regular-expression syntax might require a total rethink.
  51  */
  52 #include <stdio.h>
  53 #include <ctype.h>
  54 #include <stdlib.h>
  55 #include <string.h>
  56
  57 #include "jim.h"
  58 #include "jimautoconf.h"
  59 #include "jimregexp.h"
  60 #include "utf8.h"
  61
  62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
  63
  64 /*
  65  * Structure for regexp "program".  This is essentially a linear encoding
  66  * of a nondeterministic finite-state machine (aka syntax charts or
  67  * "railroad normal form" in parsing technology).  Each node is an opcode
  68  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
  69  * all nodes except BRANCH implement concatenation; a "next" pointer with
  70  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
  71  * have one of the subtle syntax dependencies:  an individual BRANCH (as
  72  * opposed to a collection of them) is never concatenated with anything
  73  * because of operator precedence.)  The operand of some types of node is
  74  * a literal string; for others, it is a node leading into a sub-FSM.  In
  75  * particular, the operand of a BRANCH node is the first node of the branch.
  76  * (NB this is *not* a tree structure:  the tail of the branch connects
  77  * to the thing following the set of BRANCHes.)  The opcodes are:
  78  */
  79
  80 /* This *MUST* be less than (255-20)/2=117 */
  81 #define REG_MAX_PAREN 100
  82
  83 /* definition   number  opnd?   meaning */
  84 #define END     0       /* no   End of program. */
  85 #define BOL     1       /* no   Match "" at beginning of line. */
  86 #define EOL     2       /* no   Match "" at end of line. */
  87 #define ANY     3       /* no   Match any one character. */
  88 #define ANYOF   4       /* str  Match any character in this string. */
  89 #define ANYBUT  5       /* str  Match any character not in this string. */
  90 #define BRANCH  6       /* node Match this alternative, or the next... */
  91 #define BACK    7       /* no   Match "", "next" ptr points backward. */
  92 #define EXACTLY 8       /* str  Match this string. */
  93 #define NOTHING 9       /* no   Match empty string. */
  94 #define REP             10      /* max,min      Match this (simple) thing [min,max] times. */
  95 #define REPMIN  11      /* max,min      Match this (simple) thing [min,max] times, mininal match. */
  96 #define REPX    12      /* max,min      Match this (complex) thing [min,max] times. */
  97 #define REPXMIN 13      /* max,min      Match this (complex) thing [min,max] times, minimal match. */
  98
  99 #define WORDA   15      /* no   Match "" at wordchar, where prev is nonword */
 100 #define WORDZ   16      /* no   Match "" at nonwordchar, where prev is word */
 101 #define OPEN    20      /* no   Mark this point in input as start of #n. */
 102                         /*      OPEN+1 is number 1, etc. */
 103 #define CLOSE   (OPEN+REG_MAX_PAREN)    /* no   Analogous to OPEN. */
 104 #define CLOSE_END       (CLOSE+REG_MAX_PAREN)
 105
 106 /*
 107  * The first byte of the regexp internal "program" is actually this magic
 108  * number; the start node begins in the second byte.
 109  */
 110 #define REG_MAGIC       0xFADED00D
 111
 112 /*
 113  * Opcode notes:
 114  *
 115  * BRANCH       The set of branches constituting a single choice are hooked
 116  *              together with their "next" pointers, since precedence prevents
 117  *              anything being concatenated to any individual branch.  The
 118  *              "next" pointer of the last BRANCH in a choice points to the
 119  *              thing following the whole choice.  This is also where the
 120  *              final "next" pointer of each individual branch points; each
 121  *              branch starts with the operand node of a BRANCH node.
 122  *
 123  * BACK         Normal "next" pointers all implicitly point forward; BACK
 124  *              exists to make loop structures possible.
 125  *
 126  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 127  *              BRANCH structures using BACK.  Simple cases (one character
 128  *              per match) are implemented with STAR and PLUS for speed
 129  *              and to minimize recursive plunges.
 130  *
 131  * OPEN,CLOSE   ...are numbered at compile time.
 132  */
 133
 134 /*
 135  * A node is one char of opcode followed by two chars of "next" pointer.
 136  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 137  * value is a positive offset from the opcode of the node containing it.
 138  * An operand, if any, simply follows the node.  (Note that much of the
 139  * code generation knows about this implicit relationship.)
 140  *
 141  * Using two bytes for the "next" pointer is vast overkill for most things,
 142  * but allows patterns to get big without disasters.
 143  */
 144 #define OP(preg, p)     (preg->program[p])
 145 #define NEXT(preg, p)   (preg->program[p + 1])
 146 #define OPERAND(p)      ((p) + 2)
 147
 148 /*
 149  * See regmagic.h for one further detail of program structure.
 150  */
 151
 152
 153 /*
 154  * Utility definitions.
 155  */
 156
 157 #define FAIL(R,M)       { (R)->err = (M); return (M); }
 158 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
 159 #define META    "^$.[()|?{+*"
 160
 161 /*
 162  * Flags to be passed up and down.
 163  */
 164 #define HASWIDTH        01      /* Known never to match null string. */
 165 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
 166 #define SPSTART         04      /* Starts with * or +. */
 167 #define WORST           0       /* Worst case. */
 168
 169 #define MAX_REP_COUNT 1000000
 170
 171 /*
 172  * Forward declarations for regcomp()'s friends.
 173  */
 174 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
 175 static int regpiece(regex_t *preg, int *flagp );
 176 static int regbranch(regex_t *preg, int *flagp );
 177 static int regatom(regex_t *preg, int *flagp );
 178 static int regnode(regex_t *preg, int op );
 179 static int regnext(regex_t *preg, int p );
 180 static void regc(regex_t *preg, int b );
 181 static int reginsert(regex_t *preg, int op, int size, int opnd );
 182 static void regtail_(regex_t *preg, int p, int val, int line );
 183 static void regoptail(regex_t *preg, int p, int val );
 184 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
 185
 186 static int reg_range_find(const int *string, int c);
 187 static const char *str_find(const char *string, int c, int nocase);
 188 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
 189
 190 /*#define DEBUG*/
 191 #ifdef DEBUG
 192 int regnarrate = 0;
 193 static void regdump(regex_t *preg);
 194 static const char *regprop( int op );
 195 #endif
 196
 197
 198 /**
 199  * Returns the length of the null-terminated integer sequence.
 200  */
 201 static int str_int_len(const int *seq)
 202 {
 203         int n = 0;
 204         while (*seq++) {
 205                 n++;
 206         }
 207         return n;
 208 }
 209
 210 /*
 211  - regcomp - compile a regular expression into internal code
 212  *
 213  * We can't allocate space until we know how big the compiled form will be,
 214  * but we can't compile it (and thus know how big it is) until we've got a
 215  * place to put the code.  So we cheat:  we compile it twice, once with code
 216  * generation turned off and size counting turned on, and once "for real".
 217  * This also means that we don't allocate space until we are sure that the
 218  * thing really will compile successfully, and we never have to move the
 219  * code and thus invalidate pointers into it.  (Note that it has to be in
 220  * one piece because free() must be able to free it all.)
 221  *
 222  * Beware that the optimization-preparation code in here knows about some
 223  * of the structure of the compiled regexp.
 224  */
 225 int regcomp(regex_t *preg, const char *exp, int cflags)
 226 {
 227         int scan;
 228         int longest;
 229         unsigned len;
 230         int flags;
 231
 232 #ifdef DEBUG
 233         fprintf(stderr, "Compiling: '%s'\n", exp);
 234 #endif
 235         memset(preg, 0, sizeof(*preg));
 236
 237         if (exp == NULL)
 238                 FAIL(preg, REG_ERR_NULL_ARGUMENT);
 239
 240         /* First pass: determine size, legality. */
 241         preg->cflags = cflags;
 242         preg->regparse = exp;
 243         /* XXX: For now, start unallocated */
 244         preg->program = NULL;
 245         preg->proglen = 0;
 246
 247 #if 1
 248         /* Allocate space. */
 249         preg->proglen = (strlen(exp) + 1) * 5;
 250         preg->program = malloc(preg->proglen * sizeof(int));
 251         if (preg->program == NULL)
 252                 FAIL(preg, REG_ERR_NOMEM);
 253 #endif
 254
 255         /* Note that since we store a magic value as the first item in the program,
 256          * program offsets will never be 0
 257          */
 258         regc(preg, REG_MAGIC);
 259         if (reg(preg, 0, &flags) == 0) {
 260                 return preg->err;
 261         }
 262
 263         /* Small enough for pointer-storage convention? */
 264         if (preg->re_nsub >= REG_MAX_PAREN)             /* Probably could be 65535L. */
 265                 FAIL(preg,REG_ERR_TOO_BIG);
 266
 267         /* Dig out information for optimizations. */
 268         preg->regstart = 0;     /* Worst-case defaults. */
 269         preg->reganch = 0;
 270         preg->regmust = 0;
 271         preg->regmlen = 0;
 272         scan = 1;                       /* First BRANCH. */
 273         if (OP(preg, regnext(preg, scan)) == END) {             /* Only one top-level choice. */
 274                 scan = OPERAND(scan);
 275
 276                 /* Starting-point info. */
 277                 if (OP(preg, scan) == EXACTLY) {
 278                         preg->regstart = preg->program[OPERAND(scan)];
 279                 }
 280                 else if (OP(preg, scan) == BOL)
 281                         preg->reganch++;
 282
 283                 /*
 284                  * If there's something expensive in the r.e., find the
 285                  * longest literal string that must appear and make it the
 286                  * regmust.  Resolve ties in favor of later strings, since
 287                  * the regstart check works with the beginning of the r.e.
 288                  * and avoiding duplication strengthens checking.  Not a
 289                  * strong reason, but sufficient in the absence of others.
 290                  */
 291                 if (flags&SPSTART) {
 292                         longest = 0;
 293                         len = 0;
 294                         for (; scan != 0; scan = regnext(preg, scan)) {
 295                                 if (OP(preg, scan) == EXACTLY) {
 296                                         int plen = str_int_len(preg->program + OPERAND(scan));
 297                                         if (plen >= len) {
 298                                                 longest = OPERAND(scan);
 299                                                 len = plen;
 300                                         }
 301                                 }
 302                         }
 303                         preg->regmust = longest;
 304                         preg->regmlen = len;
 305                 }
 306         }
 307
 308 #ifdef DEBUG
 309         regdump(preg);
 310 #endif
 311
 312         return 0;
 313 }
 314
 315 /*
 316  - reg - regular expression, i.e. main body or parenthesized thing
 317  *
 318  * Caller must absorb opening parenthesis.
 319  *
 320  * Combining parenthesis handling with the base level of regular expression
 321  * is a trifle forced, but the need to tie the tails of the branches to what
 322  * follows makes it hard to avoid.
 323  */
 324 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
 325 {
 326         int ret;
 327         int br;
 328         int ender;
 329         int parno = 0;
 330         int flags;
 331
 332         *flagp = HASWIDTH;      /* Tentatively. */
 333
 334         /* Make an OPEN node, if parenthesized. */
 335         if (paren) {
 336                 parno = ++preg->re_nsub;
 337                 ret = regnode(preg, OPEN+parno);
 338         } else
 339                 ret = 0;
 340
 341         /* Pick up the branches, linking them together. */
 342         br = regbranch(preg, &flags);
 343         if (br == 0)
 344                 return 0;
 345         if (ret != 0)
 346                 regtail(preg, ret, br); /* OPEN -> first. */
 347         else
 348                 ret = br;
 349         if (!(flags&HASWIDTH))
 350                 *flagp &= ~HASWIDTH;
 351         *flagp |= flags&SPSTART;
 352         while (*preg->regparse == '|') {
 353                 preg->regparse++;
 354                 br = regbranch(preg, &flags);
 355                 if (br == 0)
 356                         return 0;
 357                 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
 358                 if (!(flags&HASWIDTH))
 359                         *flagp &= ~HASWIDTH;
 360                 *flagp |= flags&SPSTART;
 361         }
 362
 363         /* Make a closing node, and hook it on the end. */
 364         ender = regnode(preg, (paren) ? CLOSE+parno : END);
 365         regtail(preg, ret, ender);
 366
 367         /* Hook the tails of the branches to the closing node. */
 368         for (br = ret; br != 0; br = regnext(preg, br))
 369                 regoptail(preg, br, ender);
 370
 371         /* Check for proper termination. */
 372         if (paren && *preg->regparse++ != ')') {
 373                 preg->err = REG_ERR_UNMATCHED_PAREN;
 374                 return 0;
 375         } else if (!paren && *preg->regparse != '\0') {
 376                 if (*preg->regparse == ')') {
 377                         preg->err = REG_ERR_UNMATCHED_PAREN;
 378                         return 0;
 379                 } else {
 380                         preg->err = REG_ERR_JUNK_ON_END;
 381                         return 0;
 382                 }
 383         }
 384
 385         return(ret);
 386 }
 387
 388 /*
 389  - regbranch - one alternative of an | operator
 390  *
 391  * Implements the concatenation operator.
 392  */
 393 static int regbranch(regex_t *preg, int *flagp )
 394 {
 395         int ret;
 396         int chain;
 397         int latest;
 398         int flags;
 399
 400         *flagp = WORST;         /* Tentatively. */
 401
 402         ret = regnode(preg, BRANCH);
 403         chain = 0;
 404         while (*preg->regparse != '\0' && *preg->regparse != ')' &&
 405                *preg->regparse != '|') {
 406                 latest = regpiece(preg, &flags);
 407                 if (latest == 0)
 408                         return 0;
 409                 *flagp |= flags&HASWIDTH;
 410                 if (chain == 0) {/* First piece. */
 411                         *flagp |= flags&SPSTART;
 412                 }
 413                 else {
 414                         regtail(preg, chain, latest);
 415                 }
 416                 chain = latest;
 417         }
 418         if (chain == 0) /* Loop ran zero times. */
 419                 (void) regnode(preg, NOTHING);
 420
 421         return(ret);
 422 }
 423
 424 /*
 425  - regpiece - something followed by possible [*+?]
 426  *
 427  * Note that the branching code sequences used for ? and the general cases
 428  * of * and + are somewhat optimized:  they use the same NOTHING node as
 429  * both the endmarker for their branch list and the body of the last branch.
 430  * It might seem that this node could be dispensed with entirely, but the
 431  * endmarker role is not redundant.
 432  */
 433 static int regpiece(regex_t *preg, int *flagp)
 434 {
 435         int ret;
 436         char op;
 437         int next;
 438         int flags;
 439         int chain = 0;
 440         int min;
 441         int max;
 442
 443         ret = regatom(preg, &flags);
 444         if (ret == 0)
 445                 return 0;
 446
 447         op = *preg->regparse;
 448         if (!ISMULT(op)) {
 449                 *flagp = flags;
 450                 return(ret);
 451         }
 452
 453         if (!(flags&HASWIDTH) && op != '?') {
 454                 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
 455                 return 0;
 456         }
 457
 458         /* Handle braces (counted repetition) by expansion */
 459         if (op == '{') {
 460                 char *end;
 461
 462                 min = strtoul(preg->regparse + 1, &end, 10);
 463                 if (end == preg->regparse + 1) {
 464                         preg->err = REG_ERR_BAD_COUNT;
 465                         return 0;
 466                 }
 467                 if (*end == '}') {
 468                         max = min;
 469                 }
 470                 else {
 471                         preg->regparse = end;
 472                         max = strtoul(preg->regparse + 1, &end, 10);
 473                         if (*end != '}') {
 474                                 preg->err = REG_ERR_UNMATCHED_BRACES;
 475                                 return 0;
 476                         }
 477                 }
 478                 if (end == preg->regparse + 1) {
 479                         max = MAX_REP_COUNT;
 480                 }
 481                 else if (max < min || max >= 100) {
 482                         preg->err = REG_ERR_BAD_COUNT;
 483                         return 0;
 484                 }
 485                 if (min >= 100) {
 486                         preg->err = REG_ERR_BAD_COUNT;
 487                         return 0;
 488                 }
 489
 490                 preg->regparse = strchr(preg->regparse, '}');
 491         }
 492         else {
 493                 min = (op == '+');
 494                 max = (op == '?' ? 1 : MAX_REP_COUNT);
 495         }
 496
 497         if (preg->regparse[1] == '?') {
 498                 preg->regparse++;
 499                 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
 500         }
 501         else {
 502                 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
 503         }
 504         preg->program[ret + 2] = max;
 505         preg->program[ret + 3] = min;
 506         preg->program[ret + 4] = 0;
 507
 508         *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
 509
 510         if (!(flags & SIMPLE)) {
 511                 int back = regnode(preg, BACK);
 512                 regtail(preg, back, ret);
 513                 regtail(preg, next, back);
 514         }
 515
 516         preg->regparse++;
 517         if (ISMULT(*preg->regparse)) {
 518                 preg->err = REG_ERR_NESTED_COUNT;
 519                 return 0;
 520         }
 521
 522         return chain ? chain : ret;
 523 }
 524
 525 /**
 526  * Add all characters in the inclusive range between lower and upper.
 527  *
 528  * Handles a swapped range (upper < lower).
 529  */
 530 static void reg_addrange(regex_t *preg, int lower, int upper)
 531 {
 532         if (lower > upper) {
 533                 reg_addrange(preg, upper, lower);
 534         }
 535         /* Add a range as length, start */
 536         regc(preg, upper - lower + 1);
 537         regc(preg, lower);
 538 }
 539
 540 /**
 541  * Add a null-terminated literal string as a set of ranges.
 542  */
 543 static void reg_addrange_str(regex_t *preg, const char *str)
 544 {
 545         while (*str) {
 546                 reg_addrange(preg, *str, *str);
 547                 str++;
 548         }
 549 }
 550
 551 /**
 552  * Extracts the next unicode char from utf8.
 553  *
 554  * If 'upper' is set, converts the char to uppercase.
 555  */
 556 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
 557 {
 558         int l = utf8_tounicode(s, uc);
 559         if (upper) {
 560                 *uc = utf8_upper(*uc);
 561         }
 562         return l;
 563 }
 564
 565 /**
 566  * Converts a hex digit to decimal.
 567  *
 568  * Returns -1 for an invalid hex digit.
 569  */
 570 static int hexdigitval(int c)
 571 {
 572         if (c >= '0' && c <= '9')
 573                 return c - '0';
 574         if (c >= 'a' && c <= 'f')
 575                 return c - 'a' + 10;
 576         if (c >= 'A' && c <= 'F')
 577                 return c - 'A' + 10;
 578         return -1;
 579 }
 580
 581 /**
 582  * Parses up to 'n' hex digits at 's' and stores the result in *uc.
 583  *
 584  * Returns the number of hex digits parsed.
 585  * If there are no hex digits, returns 0 and stores nothing.
 586  */
 587 static int parse_hex(const char *s, int n, int *uc)
 588 {
 589         int val = 0;
 590         int k;
 591
 592         for (k = 0; k < n; k++) {
 593                 int c = hexdigitval(*s++);
 594                 if (c == -1) {
 595                         break;
 596                 }
 597                 val = (val << 4) | c;
 598         }
 599         if (k) {
 600                 *uc = val;
 601         }
 602         return k;
 603 }
 604
 605 /**
 606  * Call for chars after a backlash to decode the escape sequence.
 607  *
 608  * Stores the result in *ch.
 609  *
 610  * Returns the number of bytes consumed.
 611  */
 612 static int reg_decode_escape(const char *s, int *ch)
 613 {
 614         int n;
 615         const char *s0 = s;
 616
 617         *ch = *s++;
 618
 619         switch (*ch) {
 620                 case 'b': *ch = '\b'; break;
 621                 case 'e': *ch = 27; break;
 622                 case 'f': *ch = '\f'; break;
 623                 case 'n': *ch = '\n'; break;
 624                 case 'r': *ch = '\r'; break;
 625                 case 't': *ch = '\t'; break;
 626                 case 'v': *ch = '\v'; break;
 627                 case 'u':
 628                         if ((n = parse_hex(s, 4, ch)) > 0) {
 629                                 s += n;
 630                         }
 631                         break;
 632                 case 'x':
 633                         if ((n = parse_hex(s, 2, ch)) > 0) {
 634                                 s += n;
 635                         }
 636                         break;
 637                 case '\0':
 638                         s--;
 639                         *ch = '\\';
 640                         break;
 641         }
 642         return s - s0;
 643 }
 644
 645 /*
 646  - regatom - the lowest level
 647  *
 648  * Optimization:  gobbles an entire sequence of ordinary characters so that
 649  * it can turn them into a single node, which is smaller to store and
 650  * faster to run.  Backslashed characters are exceptions, each becoming a
 651  * separate node; the code is simpler that way and it's not worth fixing.
 652  */
 653 static int regatom(regex_t *preg, int *flagp)
 654 {
 655         int ret;
 656         int flags;
 657         int nocase = (preg->cflags & REG_ICASE);
 658
 659         int ch;
 660         int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
 661
 662         *flagp = WORST;         /* Tentatively. */
 663
 664         preg->regparse += n;
 665         switch (ch) {
 666         /* FIXME: these chars only have meaning at beg/end of pat? */
 667         case '^':
 668                 ret = regnode(preg, BOL);
 669                 break;
 670         case '$':
 671                 ret = regnode(preg, EOL);
 672                 break;
 673         case '.':
 674                 ret = regnode(preg, ANY);
 675                 *flagp |= HASWIDTH|SIMPLE;
 676                 break;
 677         case '[': {
 678                         const char *pattern = preg->regparse;
 679
 680                         if (*pattern == '^') {  /* Complement of range. */
 681                                 ret = regnode(preg, ANYBUT);
 682                                 pattern++;
 683                         } else
 684                                 ret = regnode(preg, ANYOF);
 685
 686                         /* Special case. If the first char is ']' or '-', it is part of the set */
 687                         if (*pattern == ']' || *pattern == '-') {
 688                                 reg_addrange(preg, *pattern, *pattern);
 689                                 pattern++;
 690                         }
 691
 692                         while (*pattern && *pattern != ']') {
 693                                 /* Is this a range? a-z */
 694                                 int start;
 695                                 int end;
 696
 697                                 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
 698                                 if (start == '\\') {
 699                                         pattern += reg_decode_escape(pattern, &start);
 700                                         if (start == 0) {
 701                                                 preg->err = REG_ERR_NULL_CHAR;
 702                                                 return 0;
 703                                         }
 704                                 }
 705                                 if (pattern[0] == '-' && pattern[1]) {
 706                                         /* skip '-' */
 707                                         pattern += utf8_tounicode(pattern, &end);
 708                                         pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
 709                                         if (end == '\\') {
 710                                                 pattern += reg_decode_escape(pattern, &end);
 711                                                 if (end == 0) {
 712                                                         preg->err = REG_ERR_NULL_CHAR;
 713                                                         return 0;
 714                                                 }
 715                                         }
 716
 717                                         reg_addrange(preg, start, end);
 718                                         continue;
 719                                 }
 720                                 if (start == '[') {
 721                                         if (strncmp(pattern, ":alpha:]", 8) == 0) {
 722                                                 if ((preg->cflags & REG_ICASE) == 0) {
 723                                                         reg_addrange(preg, 'a', 'z');
 724                                                 }
 725                                                 reg_addrange(preg, 'A', 'Z');
 726                                                 pattern += 8;
 727                                                 continue;
 728                                         }
 729                                         if (strncmp(pattern, ":alnum:]", 8) == 0) {
 730                                                 if ((preg->cflags & REG_ICASE) == 0) {
 731                                                         reg_addrange(preg, 'a', 'z');
 732                                                 }
 733                                                 reg_addrange(preg, 'A', 'Z');
 734                                                 reg_addrange(preg, '0', '9');
 735                                                 pattern += 8;
 736                                                 continue;
 737                                         }
 738                                         if (strncmp(pattern, ":space:]", 8) == 0) {
 739                                                 reg_addrange_str(preg, " \t\r\n\f\v");
 740                                                 pattern += 8;
 741                                                 continue;
 742                                         }
 743                                 }
 744                                 /* Not a range, so just add the char */
 745                                 reg_addrange(preg, start, start);
 746                         }
 747                         regc(preg, '\0');
 748
 749                         if (*pattern) {
 750                                 pattern++;
 751                         }
 752                         preg->regparse = pattern;
 753
 754                         *flagp |= HASWIDTH|SIMPLE;
 755                 }
 756                 break;
 757         case '(':
 758                 ret = reg(preg, 1, &flags);
 759                 if (ret == 0)
 760                         return 0;
 761                 *flagp |= flags&(HASWIDTH|SPSTART);
 762                 break;
 763         case '\0':
 764         case '|':
 765         case ')':
 766                 preg->err = REG_ERR_INTERNAL;
 767                 return 0;       /* Supposed to be caught earlier. */
 768         case '?':
 769         case '+':
 770         case '*':
 771         case '{':
 772                 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
 773                 return 0;
 774         case '\\':
 775                 switch (*preg->regparse++) {
 776                 case '\0':
 777                         preg->err = REG_ERR_TRAILING_BACKSLASH;
 778                         return 0;
 779                 case '<':
 780                 case 'm':
 781                         ret = regnode(preg, WORDA);
 782                         break;
 783                 case '>':
 784                 case 'M':
 785                         ret = regnode(preg, WORDZ);
 786                         break;
 787                 case 'd':
 788                         ret = regnode(preg, ANYOF);
 789                         reg_addrange(preg, '0', '9');
 790                         regc(preg, '\0');
 791                         *flagp |= HASWIDTH|SIMPLE;
 792                         break;
 793                 case 'w':
 794                         ret = regnode(preg, ANYOF);
 795                         if ((preg->cflags & REG_ICASE) == 0) {
 796                                 reg_addrange(preg, 'a', 'z');
 797                         }
 798                         reg_addrange(preg, 'A', 'Z');
 799                         reg_addrange(preg, '0', '9');
 800                         reg_addrange(preg, '_', '_');
 801                         regc(preg, '\0');
 802                         *flagp |= HASWIDTH|SIMPLE;
 803                         break;
 804                 case 's':
 805                         ret = regnode(preg, ANYOF);
 806                         reg_addrange_str(preg," \t\r\n\f\v");
 807                         regc(preg, '\0');
 808                         *flagp |= HASWIDTH|SIMPLE;
 809                         break;
 810                 /* FIXME: Someday handle \1, \2, ... */
 811                 default:
 812                         /* Handle general quoted chars in exact-match routine */
 813                         /* Back up to include the backslash */
 814                         preg->regparse--;
 815                         goto de_fault;
 816                 }
 817                 break;
 818         de_fault:
 819         default: {
 820                         /*
 821                          * Encode a string of characters to be matched exactly.
 822                          */
 823                         int added = 0;
 824
 825                         /* Back up to pick up the first char of interest */
 826                         preg->regparse -= n;
 827
 828                         ret = regnode(preg, EXACTLY);
 829
 830                         /* Note that a META operator such as ? or * consumes the
 831                          * preceding char.
 832                          * Thus we must be careful to look ahead by 2 and add the
 833                          * last char as it's own EXACTLY if necessary
 834                          */
 835
 836                         /* Until end of string or a META char is reached */
 837                         while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
 838                                 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
 839                                 if (ch == '\\' && preg->regparse[n]) {
 840                                         /* Non-trailing backslash.
 841                                          * Is this a special escape, or a regular escape?
 842                                          */
 843                                         if (strchr("<>mMwds", preg->regparse[n])) {
 844                                                 /* A special escape. All done with EXACTLY */
 845                                                 break;
 846                                         }
 847                                         /* Decode it. Note that we add the length for the escape
 848                                          * sequence to the length for the backlash so we can skip
 849                                          * the entire sequence, or not as required.
 850                                          */
 851                                         n += reg_decode_escape(preg->regparse + n, &ch);
 852                                         if (ch == 0) {
 853                                                 preg->err = REG_ERR_NULL_CHAR;
 854                                                 return 0;
 855                                         }
 856                                 }
 857
 858                                 /* Now we have one char 'ch' of length 'n'.
 859                                  * Check to see if the following char is a MULT
 860                                  */
 861
 862                                 if (ISMULT(preg->regparse[n])) {
 863                                         /* Yes. But do we already have some EXACTLY chars? */
 864                                         if (added) {
 865                                                 /* Yes, so return what we have and pick up the current char next time around */
 866                                                 break;
 867                                         }
 868                                         /* No, so add this single char and finish */
 869                                         regc(preg, ch);
 870                                         added++;
 871                                         preg->regparse += n;
 872                                         break;
 873                                 }
 874
 875                                 /* No, so just add this char normally */
 876                                 regc(preg, ch);
 877                                 added++;
 878                                 preg->regparse += n;
 879                         }
 880                         regc(preg, '\0');
 881
 882                         *flagp |= HASWIDTH;
 883                         if (added == 1)
 884                                 *flagp |= SIMPLE;
 885                         break;
 886                 }
 887                 break;
 888         }
 889
 890         return(ret);
 891 }
 892
 893 static void reg_grow(regex_t *preg, int n)
 894 {
 895         if (preg->p + n >= preg->proglen) {
 896                 preg->proglen = (preg->p + n) * 2;
 897                 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
 898         }
 899 }
 900
 901 /*
 902  - regnode - emit a node
 903  */
 904 /* Location. */
 905 static int regnode(regex_t *preg, int op)
 906 {
 907         reg_grow(preg, 2);
 908
 909         preg->program[preg->p++] = op;
 910         preg->program[preg->p++] = 0;
 911
 912         /* Return the start of the node */
 913         return preg->p - 2;
 914 }
 915
 916 /*
 917  - regc - emit (if appropriate) a byte of code
 918  */
 919 static void regc(regex_t *preg, int b )
 920 {
 921         reg_grow(preg, 1);
 922         preg->program[preg->p++] = b;
 923 }
 924
 925 /*
 926  - reginsert - insert an operator in front of already-emitted operand
 927  *
 928  * Means relocating the operand.
 929  * Returns the new location of the original operand.
 930  */
 931 static int reginsert(regex_t *preg, int op, int size, int opnd )
 932 {
 933         reg_grow(preg, size);
 934
 935         /* Move everything from opnd up */
 936         memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
 937         /* Zero out the new space */
 938         memset(preg->program + opnd, 0, sizeof(int) * size);
 939
 940         preg->program[opnd] = op;
 941
 942         preg->p += size;
 943
 944         return opnd + size;
 945 }
 946
 947 /*
 948  - regtail - set the next-pointer at the end of a node chain
 949  */
 950 static void regtail_(regex_t *preg, int p, int val, int line )
 951 {
 952         int scan;
 953         int temp;
 954         int offset;
 955
 956         /* Find last node. */
 957         scan = p;
 958         for (;;) {
 959                 temp = regnext(preg, scan);
 960                 if (temp == 0)
 961                         break;
 962                 scan = temp;
 963         }
 964
 965         if (OP(preg, scan) == BACK)
 966                 offset = scan - val;
 967         else
 968                 offset = val - scan;
 969
 970         preg->program[scan + 1] = offset;
 971 }
 972
 973 /*
 974  - regoptail - regtail on operand of first argument; nop if operandless
 975  */
 976
 977 static void regoptail(regex_t *preg, int p, int val )
 978 {
 979         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
 980         if (p != 0 && OP(preg, p) == BRANCH) {
 981                 regtail(preg, OPERAND(p), val);
 982         }
 983 }
 984
 985 /*
 986  * regexec and friends
 987  */
 988
 989 /*
 990  * Forwards.
 991  */
 992 static int regtry(regex_t *preg, const char *string );
 993 static int regmatch(regex_t *preg, int prog);
 994 static int regrepeat(regex_t *preg, int p, int max);
 995
 996 /*
 997  - regexec - match a regexp against a string
 998  */
 999 int regexec(regex_t  *preg,  const  char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1000 {
1001         const char *s;
1002         int scan;
1003
1004         /* Be paranoid... */
1005         if (preg == NULL || preg->program == NULL || string == NULL) {
1006                 return REG_ERR_NULL_ARGUMENT;
1007         }
1008
1009         /* Check validity of program. */
1010         if (*preg->program != REG_MAGIC) {
1011                 return REG_ERR_CORRUPTED;
1012         }
1013
1014 #ifdef DEBUG
1015         fprintf(stderr, "regexec: %s\n", string);
1016         regdump(preg);
1017 #endif
1018
1019         preg->eflags = eflags;
1020         preg->pmatch = pmatch;
1021         preg->nmatch = nmatch;
1022         preg->start = string;   /* All offsets are computed from here */
1023
1024         /* Must clear out the embedded repeat counts */
1025         for (scan = OPERAND(1); scan != 0; scan = regnext(preg, scan)) {
1026                 switch (OP(preg, scan)) {
1027                 case REP:
1028                 case REPMIN:
1029                 case REPX:
1030                 case REPXMIN:
1031                         preg->program[scan + 4] = 0;
1032                         break;
1033                 }
1034         }
1035
1036         /* If there is a "must appear" string, look for it. */
1037         if (preg->regmust != 0) {
1038                 s = string;
1039                 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1040                         if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1041                                 break;
1042                         }
1043                         s++;
1044                 }
1045                 if (s == NULL)  /* Not present. */
1046                         return REG_NOMATCH;
1047         }
1048
1049         /* Mark beginning of line for ^ . */
1050         preg->regbol = string;
1051
1052         /* Simplest case:  anchored match need be tried only once (maybe per line). */
1053         if (preg->reganch) {
1054                 if (eflags & REG_NOTBOL) {
1055                         /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1056                         goto nextline;
1057                 }
1058                 while (1) {
1059                         int ret = regtry(preg, string);
1060                         if (ret) {
1061                                 return REG_NOERROR;
1062                         }
1063                         if (*string) {
1064 nextline:
1065                                 if (preg->cflags & REG_NEWLINE) {
1066                                         /* Try the next anchor? */
1067                                         string = strchr(string, '\n');
1068                                         if (string) {
1069                                                 preg->regbol = ++string;
1070                                                 continue;
1071                                         }
1072                                 }
1073                         }
1074                         return REG_NOMATCH;
1075                 }
1076         }
1077
1078         /* Messy cases:  unanchored match. */
1079         s = string;
1080         if (preg->regstart != '\0') {
1081                 /* We know what char it must start with. */
1082                 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1083                         if (regtry(preg, s))
1084                                 return REG_NOERROR;
1085                         s++;
1086                 }
1087         }
1088         else
1089                 /* We don't -- general case. */
1090                 while (1) {
1091                         if (regtry(preg, s))
1092                                 return REG_NOERROR;
1093                         if (*s == '\0') {
1094                                 break;
1095                         }
1096                         s += utf8_charlen(*s);
1097                 }
1098
1099         /* Failure. */
1100         return REG_NOMATCH;
1101 }
1102
1103 /*
1104  - regtry - try match at specific point
1105  */
1106                         /* 0 failure, 1 success */
1107 static int regtry( regex_t *preg, const char *string )
1108 {
1109         int i;
1110
1111         preg->reginput = string;
1112
1113         for (i = 0; i < preg->nmatch; i++) {
1114                 preg->pmatch[i].rm_so = -1;
1115                 preg->pmatch[i].rm_eo = -1;
1116         }
1117         if (regmatch(preg, 1)) {
1118                 preg->pmatch[0].rm_so = string - preg->start;
1119                 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1120                 return(1);
1121         } else
1122                 return(0);
1123 }
1124
1125 /**
1126  * Returns bytes matched if 'pattern' is a prefix of 'string'.
1127  *
1128  * If 'nocase' is non-zero, does a case-insensitive match.
1129  *
1130  * Returns -1 on not found.
1131  */
1132 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1133 {
1134         const char *s = string;
1135         while (proglen && *s) {
1136                 int ch;
1137                 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1138                 if (ch != *prog) {
1139                         return -1;
1140                 }
1141                 prog++;
1142                 s += n;
1143                 proglen--;
1144         }
1145         if (proglen == 0) {
1146                 return s - string;
1147         }
1148         return -1;
1149 }
1150
1151 /**
1152  * Searchs for 'c' in the range 'range'.
1153  *
1154  * Returns 1 if found, or 0 if not.
1155  */
1156 static int reg_range_find(const int *range, int c)
1157 {
1158         while (*range) {
1159                 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1160                 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1161                         return 1;
1162                 }
1163                 range += 2;
1164         }
1165         return 0;
1166 }
1167
1168 /**
1169  * Search for the character 'c' in the utf-8 string 'string'.
1170  *
1171  * If 'nocase' is set, the 'string' is assumed to be uppercase
1172  * and 'c' is converted to uppercase before matching.
1173  *
1174  * Returns the byte position in the string where the 'c' was found, or
1175  * NULL if not found.
1176  */
1177 static const char *str_find(const char *string, int c, int nocase)
1178 {
1179         if (nocase) {
1180                 /* The "string" should already be converted to uppercase */
1181                 c = utf8_upper(c);
1182         }
1183         while (*string) {
1184                 int ch;
1185                 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1186                 if (c == ch) {
1187                         return string;
1188                 }
1189                 string += n;
1190         }
1191         return NULL;
1192 }
1193
1194 /**
1195  * Returns true if 'ch' is an end-of-line char.
1196  *
1197  * In REG_NEWLINE mode, \n is considered EOL in
1198  * addition to \0
1199  */
1200 static int reg_iseol(regex_t *preg, int ch)
1201 {
1202         if (preg->cflags & REG_NEWLINE) {
1203                 return ch == '\0' || ch == '\n';
1204         }
1205         else {
1206                 return ch == '\0';
1207         }
1208 }
1209
1210 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1211 {
1212         int nextch = '\0';
1213         const char *save;
1214         int no;
1215         int c;
1216
1217         int max = preg->program[scan + 2];
1218         int min = preg->program[scan + 3];
1219         int next = regnext(preg, scan);
1220
1221         /*
1222          * Lookahead to avoid useless match attempts
1223          * when we know what character comes next.
1224          */
1225         if (OP(preg, next) == EXACTLY) {
1226                 nextch = preg->program[OPERAND(next)];
1227         }
1228         save = preg->reginput;
1229         no = regrepeat(preg, scan + 5, max);
1230         if (no < min) {
1231                 return 0;
1232         }
1233         if (matchmin) {
1234                 /* from min up to no */
1235                 max = no;
1236                 no = min;
1237         }
1238         /* else from no down to min */
1239         while (1) {
1240                 if (matchmin) {
1241                         if (no > max) {
1242                                 break;
1243                         }
1244                 }
1245                 else {
1246                         if (no < min) {
1247                                 break;
1248                         }
1249                 }
1250                 preg->reginput = save + utf8_index(save, no);
1251                 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1252                 /* If it could work, try it. */
1253                 if (reg_iseol(preg, nextch) || c == nextch) {
1254                         if (regmatch(preg, next)) {
1255                                 return(1);
1256                         }
1257                 }
1258                 if (matchmin) {
1259                         /* Couldn't or didn't, add one more */
1260                         no++;
1261                 }
1262                 else {
1263                         /* Couldn't or didn't -- back up. */
1264                         no--;
1265                 }
1266         }
1267         return(0);
1268 }
1269
1270 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1271 {
1272         int *scanpt = preg->program + scan;
1273
1274         int max = scanpt[2];
1275         int min = scanpt[3];
1276
1277         /* Have we reached min? */
1278         if (scanpt[4] < min) {
1279                 /* No, so get another one */
1280                 scanpt[4]++;
1281                 if (regmatch(preg, scan + 5)) {
1282                         return 1;
1283                 }
1284                 scanpt[4]--;
1285                 return 0;
1286         }
1287         if (scanpt[4] > max) {
1288                 return 0;
1289         }
1290
1291         if (matchmin) {
1292                 /* minimal, so try other branch first */
1293                 if (regmatch(preg, regnext(preg, scan))) {
1294                         return 1;
1295                 }
1296                 /* No, so try one more */
1297                 scanpt[4]++;
1298                 if (regmatch(preg, scan + 5)) {
1299                         return 1;
1300                 }
1301                 scanpt[4]--;
1302                 return 0;
1303         }
1304         /* maximal, so try this branch again */
1305         if (scanpt[4] < max) {
1306                 scanpt[4]++;
1307                 if (regmatch(preg, scan + 5)) {
1308                         return 1;
1309                 }
1310                 scanpt[4]--;
1311         }
1312         /* At this point we are at max with no match. Try the other branch */
1313         return regmatch(preg, regnext(preg, scan));
1314 }
1315
1316 /*
1317  - regmatch - main matching routine
1318  *
1319  * Conceptually the strategy is simple:  check to see whether the current
1320  * node matches, call self recursively to see whether the rest matches,
1321  * and then act accordingly.  In practice we make some effort to avoid
1322  * recursion, in particular by going through "ordinary" nodes (that don't
1323  * need to know whether the rest of the match failed) by a loop instead of
1324  * by recursion.
1325  */
1326 /* 0 failure, 1 success */
1327 static int regmatch(regex_t *preg, int prog)
1328 {
1329         int scan;       /* Current node. */
1330         int next;               /* Next node. */
1331
1332         scan = prog;
1333
1334 #ifdef DEBUG
1335         if (scan != 0 && regnarrate)
1336                 fprintf(stderr, "%s(\n", regprop(scan));
1337 #endif
1338         while (scan != 0) {
1339                 int n;
1340                 int c;
1341 #ifdef DEBUG
1342                 if (regnarrate) {
1343                         fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1344                 }
1345 #endif
1346                 next = regnext(preg, scan);
1347                 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1348
1349                 switch (OP(preg, scan)) {
1350                 case BOL:
1351                         if (preg->reginput != preg->regbol)
1352                                 return(0);
1353                         break;
1354                 case EOL:
1355                         if (!reg_iseol(preg, c)) {
1356                                 return(0);
1357                         }
1358                         break;
1359                 case WORDA:
1360                         /* Must be looking at a letter, digit, or _ */
1361                         if ((!isalnum(UCHAR(c))) && c != '_')
1362                                 return(0);
1363                         /* Prev must be BOL or nonword */
1364                         if (preg->reginput > preg->regbol &&
1365                                 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1366                                 return(0);
1367                         break;
1368                 case WORDZ:
1369                         /* Can't match at BOL */
1370                         if (preg->reginput > preg->regbol) {
1371                                 /* Current must be EOL or nonword */
1372                                 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1373                                         c = preg->reginput[-1];
1374                                         /* Previous must be word */
1375                                         if (isalnum(UCHAR(c)) || c == '_') {
1376                                                 break;
1377                                         }
1378                                 }
1379                         }
1380                         /* No */
1381                         return(0);
1382
1383                 case ANY:
1384                         if (reg_iseol(preg, c))
1385                                 return 0;
1386                         preg->reginput += n;
1387                         break;
1388                 case EXACTLY: {
1389                                 int opnd;
1390                                 int len;
1391                                 int slen;
1392
1393                                 opnd = OPERAND(scan);
1394                                 len = str_int_len(preg->program + opnd);
1395
1396                                 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1397                                 if (slen < 0) {
1398                                         return(0);
1399                                 }
1400                                 preg->reginput += slen;
1401                         }
1402                         break;
1403                 case ANYOF:
1404                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1405                                 return(0);
1406                         }
1407                         preg->reginput += n;
1408                         break;
1409                 case ANYBUT:
1410                         if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1411                                 return(0);
1412                         }
1413                         preg->reginput += n;
1414                         break;
1415                 case NOTHING:
1416                         break;
1417                 case BACK:
1418                         break;
1419                 case BRANCH: {
1420                                 const char *save;
1421
1422                                 if (OP(preg, next) != BRANCH)           /* No choice. */
1423                                         next = OPERAND(scan);   /* Avoid recursion. */
1424                                 else {
1425                                         do {
1426                                                 save = preg->reginput;
1427                                                 if (regmatch(preg, OPERAND(scan))) {
1428                                                         return(1);
1429                                                 }
1430                                                 preg->reginput = save;
1431                                                 scan = regnext(preg, scan);
1432                                         } while (scan != 0 && OP(preg, scan) == BRANCH);
1433                                         return(0);
1434                                         /* NOTREACHED */
1435                                 }
1436                         }
1437                         break;
1438                 case REP:
1439                 case REPMIN:
1440                         return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1441
1442                 case REPX:
1443                 case REPXMIN:
1444                         return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1445
1446                 case END:
1447                         return(1);      /* Success! */
1448                         break;
1449                 default:
1450                         if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1451                                 const char *save;
1452
1453                                 save = preg->reginput;
1454
1455                                 if (regmatch(preg, next)) {
1456                                         int no;
1457                                         /*
1458                                          * Don't set startp if some later
1459                                          * invocation of the same parentheses
1460                                          * already has.
1461                                          */
1462                                         if (OP(preg, scan) < CLOSE) {
1463                                                 no = OP(preg, scan) - OPEN;
1464                                                 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1465                                                         preg->pmatch[no].rm_so = save - preg->start;
1466                                                 }
1467                                         }
1468                                         else {
1469                                                 no = OP(preg, scan) - CLOSE;
1470                                                 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1471                                                         preg->pmatch[no].rm_eo = save - preg->start;
1472                                                 }
1473                                         }
1474                                         return(1);
1475                                 } else
1476                                         return(0);
1477                         }
1478                         return REG_ERR_INTERNAL;
1479                 }
1480
1481                 scan = next;
1482         }
1483
1484         /*
1485          * We get here only if there's trouble -- normally "case END" is
1486          * the terminating point.
1487          */
1488         return REG_ERR_INTERNAL;
1489 }
1490
1491 /*
1492  - regrepeat - repeatedly match something simple, report how many
1493  */
1494 static int regrepeat(regex_t *preg, int p, int max)
1495 {
1496         int count = 0;
1497         const char *scan;
1498         int opnd;
1499         int ch;
1500         int n;
1501
1502         scan = preg->reginput;
1503         opnd = OPERAND(p);
1504         switch (OP(preg, p)) {
1505         case ANY:
1506                 /* No need to handle utf8 specially here */
1507                 while (!reg_iseol(preg, *scan) && count < max) {
1508                         count++;
1509                         scan++;
1510                 }
1511                 break;
1512         case EXACTLY:
1513                 while (count < max) {
1514                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1515                         if (preg->program[opnd] != ch) {
1516                                 break;
1517                         }
1518                         count++;
1519                         scan += n;
1520                 }
1521                 break;
1522         case ANYOF:
1523                 while (count < max) {
1524                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1525                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1526                                 break;
1527                         }
1528                         count++;
1529                         scan += n;
1530                 }
1531                 break;
1532         case ANYBUT:
1533                 while (count < max) {
1534                         n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1535                         if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1536                                 break;
1537                         }
1538                         count++;
1539                         scan += n;
1540                 }
1541                 break;
1542         default:                /* Oh dear.  Called inappropriately. */
1543                 preg->err = REG_ERR_INTERNAL;
1544                 count = 0;      /* Best compromise. */
1545                 break;
1546         }
1547         preg->reginput = scan;
1548
1549         return(count);
1550 }
1551
1552 /*
1553  - regnext - dig the "next" pointer out of a node
1554  */
1555 static int regnext(regex_t *preg, int p )
1556 {
1557         int offset;
1558
1559         offset = NEXT(preg, p);
1560
1561         if (offset == 0)
1562                 return 0;
1563
1564         if (OP(preg, p) == BACK)
1565                 return(p-offset);
1566         else
1567                 return(p+offset);
1568 }
1569
1570 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1571
1572 /*
1573  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1574  */
1575 static void regdump(regex_t *preg)
1576 {
1577         int s;
1578         int op = EXACTLY;       /* Arbitrary non-END op. */
1579         int next;
1580         char buf[4];
1581
1582         int i;
1583         for (i = 1; i < preg->p; i++) {
1584                 printf("%02x ", preg->program[i]);
1585                 if (i % 16 == 15) {
1586                         printf("\n");
1587                 }
1588         }
1589         printf("\n");
1590
1591         s = 1;
1592         while (op != END && s < preg->p) {      /* While that wasn't END last time... */
1593                 op = OP(preg, s);
1594                 printf("%3d: %s", s, regprop(op));      /* Where, what. */
1595                 next = regnext(preg, s);
1596                 if (next == 0)          /* Next ptr. */
1597                         printf("(0)");
1598                 else
1599                         printf("(%d)", next);
1600                 s += 2;
1601                 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1602                         int max = preg->program[s];
1603                         int min = preg->program[s + 1];
1604                         if (max == 65535) {
1605                                 printf("{%d,*}", min);
1606                         }
1607                         else {
1608                                 printf("{%d,%d}", min, max);
1609                         }
1610                         printf(" %d", preg->program[s + 2]);
1611                         s += 3;
1612                 }
1613                 else if (op == ANYOF || op == ANYBUT) {
1614                         /* set of ranges */
1615
1616                         while (preg->program[s]) {
1617                                 int len = preg->program[s++];
1618                                 int first = preg->program[s++];
1619                                 buf[utf8_fromunicode(buf, first)] = 0;
1620                                 printf("%s", buf);
1621                                 if (len > 1) {
1622                                         buf[utf8_fromunicode(buf, first + len - 1)] = 0;
1623                                         printf("-%s", buf);
1624                                 }
1625                         }
1626                         s++;
1627                 }
1628                 else if (op == EXACTLY) {
1629                         /* Literal string, where present. */
1630
1631                         while (preg->program[s]) {
1632                                 buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1633                                 printf("%s", buf);
1634                                 s++;
1635                         }
1636                         s++;
1637                 }
1638                 putchar('\n');
1639         }
1640
1641         if (op == END) {
1642                 /* Header fields of interest. */
1643                 if (preg->regstart) {
1644                         buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1645                         printf("start '%s' ", buf);
1646                 }
1647                 if (preg->reganch)
1648                         printf("anchored ");
1649                 if (preg->regmust != 0) {
1650                         int i;
1651                         printf("must have:");
1652                         for (i = 0; i < preg->regmlen; i++) {
1653                                 putchar(preg->program[preg->regmust + i]);
1654                         }
1655                         putchar('\n');
1656                 }
1657         }
1658         printf("\n");
1659 }
1660
1661 /*
1662  - regprop - printable representation of opcode
1663  */
1664 static const char *regprop( int op )
1665 {
1666         static char buf[50];
1667
1668         switch (op) {
1669         case BOL:
1670                 return "BOL";
1671         case EOL:
1672                 return "EOL";
1673         case ANY:
1674                 return "ANY";
1675         case ANYOF:
1676                 return "ANYOF";
1677         case ANYBUT:
1678                 return "ANYBUT";
1679         case BRANCH:
1680                 return "BRANCH";
1681         case EXACTLY:
1682                 return "EXACTLY";
1683         case NOTHING:
1684                 return "NOTHING";
1685         case BACK:
1686                 return "BACK";
1687         case END:
1688                 return "END";
1689         case REP:
1690                 return "REP";
1691         case REPMIN:
1692                 return "REPMIN";
1693         case REPX:
1694                 return "REPX";
1695         case REPXMIN:
1696                 return "REPXMIN";
1697         case WORDA:
1698                 return "WORDA";
1699         case WORDZ:
1700                 return "WORDZ";
1701         default:
1702                 if (op >= OPEN && op < CLOSE) {
1703                         snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1704                 }
1705                 else if (op >= CLOSE && op < CLOSE_END) {
1706                         snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1707                 }
1708                 else {
1709                         snprintf(buf, sizeof(buf), "?%d?\n", op);
1710                 }
1711                 return(buf);
1712         }
1713 }
1714 #endif /* JIM_BOOTSTRAP */
1715
1716 size_t regerror(int errcode, const regex_t *preg, char *errbuf,  size_t errbuf_size)
1717 {
1718         static const char *error_strings[] = {
1719                 "success",
1720                 "no match",
1721                 "bad pattern",
1722                 "null argument",
1723                 "unknown error",
1724                 "too big",
1725                 "out of memory",
1726                 "too many ()",
1727                 "parentheses () not balanced",
1728                 "braces {} not balanced",
1729                 "invalid repetition count(s)",
1730                 "extra characters",
1731                 "*+ of empty atom",
1732                 "nested count",
1733                 "internal error",
1734                 "count follows nothing",
1735                 "trailing backslash",
1736                 "corrupted program",
1737                 "contains null char",
1738         };
1739         const char *err;
1740
1741         if (errcode < 0 || errcode >= REG_ERR_NUM) {
1742                 err = "Bad error code";
1743         }
1744         else {
1745                 err = error_strings[errcode];
1746         }
1747
1748         return snprintf(errbuf, errbuf_size, "%s", err);
1749 }
1750
1751 void regfree(regex_t *preg)
1752 {
1753         free(preg->program);
1754 }
1755
1756 #endif