src/kits/tracker/RegExp.cpp

   1 /*
   2 Open Tracker License
   3
   4 Terms and Conditions
   5
   6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
   7
   8 Permission is hereby granted, free of charge, to any person obtaining a copy of
   9 this software and associated documentation files (the "Software"), to deal in
  10 the Software without restriction, including without limitation the rights to
  11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  12 of the Software, and to permit persons to whom the Software is furnished to do
  13 so, subject to the following conditions:
  14
  15 The above copyright notice and this permission notice applies to all licensees
  16 and shall be included in all copies or substantial portions of the Software.
  17
  18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
  20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
  23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  24
  25 Except as contained in this notice, the name of Be Incorporated shall not be
  26 used in advertising or otherwise to promote the sale, use or other dealings in
  27 this Software without prior written authorization from Be Incorporated.
  28
  29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
  30 of Be Incorporated in the United States and other countries. Other brand product
  31 names are registered trademarks or trademarks of their respective holders.
  32 All rights reserved.
  33 */
  34
  35 // This code is based on regexp.c, v.1.3 by Henry Spencer:
  36
  37 // @(#)regexp.c 1.3 of 18 April 87
  38 //
  39 //      Copyright (c) 1986 by University of Toronto.
  40 //      Written by Henry Spencer.  Not derived from licensed software.
  41 //
  42 //      Permission is granted to anyone to use this software for any
  43 //      purpose on any computer system, and to redistribute it freely,
  44 //      subject to the following restrictions:
  45 //
  46 //      1. The author is not responsible for the consequences of use of
  47 //              this software, no matter how awful, even if they arise
  48 //              from defects in it.
  49 //
  50 //      2. The origin of this software must not be misrepresented, either
  51 //              by explicit claim or by omission.
  52 //
  53 //      3. Altered versions must be plainly marked as such, and must not
  54 //              be misrepresented as being the original software.
  55 //
  56 // Beware that some of this code is subtly aware of the way operator
  57 // precedence is structured in regular expressions.  Serious changes in
  58 // regular-expression syntax might require a total rethink.
  59 //
  60
  61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
  62 // project (www.opentracker.org), Jul 11, 2000.
  63
  64
  65 #include <stdlib.h>
  66 #include <stdio.h>
  67 #include <string.h>
  68
  69 #include <Catalog.h>
  70 #include <Errors.h>
  71
  72 #include "RegExp.h"
  73
  74
  75 #undef B_TRANSLATION_CONTEXT
  76 #define B_TRANSLATION_CONTEXT "libtracker"
  77
  78
  79 // The first byte of the regexp internal "program" is actually this magic
  80 // number; the start node begins in the second byte.
  81
  82 const uint8 kRegExpMagic = 0234;
  83
  84 // The "internal use only" fields in RegExp.h are present to pass info from
  85 // compile to execute that permits the execute phase to run lots faster on
  86 // simple cases.  They are:
  87 //
  88 // regstart     char that must begin a match; '\0' if none obvious
  89 // reganch      is the match anchored (at beginning-of-line only)?
  90 // regmust      string (pointer into program) that match must include, or NULL
  91 // regmlen      length of regmust string
  92 //
  93 // Regstart and reganch permit very fast decisions on suitable starting points
  94 // for a match, cutting down the work a lot.  Regmust permits fast rejection
  95 // of lines that cannot possibly match.  The regmust tests are costly enough
  96 // that Compile() supplies a regmust only if the r.e. contains something
  97 // potentially expensive (at present, the only such thing detected is * or +
  98 // at the start of the r.e., which can involve a lot of backup). Regmlen is
  99 // supplied because the test in RunMatcher() needs it and Compile() is
 100 // computing it anyway.
 101 //
 102 //
 103 //
 104 // Structure for regexp "program".  This is essentially a linear encoding
 105 // of a nondeterministic finite-state machine (aka syntax charts or
 106 // "railroad normal form" in parsing technology).  Each node is an opcode
 107 // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 108 // all nodes except kRegExpBranch implement concatenation; a "next" pointer
 109 // with a kRegExpBranch on both ends of it is connecting two alternatives.
 110 // (Here we have one of the subtle syntax dependencies:  an individual
 111 // kRegExpBranch (as opposed to a collection of them) is never concatenated
 112 // with anything because of operator precedence.)  The operand of some types
 113 // of node is a literal string; for others, it is a node leading into a
 114 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first
 115 // node of the branch. (NB this is* not* a tree structure:  the tail of the
 116 // branch connects to the thing following the set of kRegExpBranches).
 117 // The opcodes are:
 118 //
 119
 120 // definition   number  opnd?   meaning
 121 enum {
 122         kRegExpEnd = 0,         // no   End of program.
 123         kRegExpBol = 1,         // no   Match "" at beginning of line.
 124         kRegExpEol = 2,         // no   Match "" at end of line.
 125         kRegExpAny = 3,         // no   Match any one character.
 126         kRegExpAnyOf = 4,       // str  Match any character in this string.
 127         kRegExpAnyBut = 5,      // str  Match any character not in this string.
 128         kRegExpBranch = 6,      // node Match this alternative, or the next...
 129         kRegExpBack = 7,        // no   Match "", "next" ptr points backward.
 130         kRegExpExactly = 8,     // str  Match this string.
 131         kRegExpNothing = 9,     // no   Match empty string.
 132         kRegExpStar = 10,       // node Match this (simple) thing 0 or more times.
 133         kRegExpPlus = 11,       // node Match this (simple) thing 1 or more times.
 134         kRegExpOpen     = 20,   // no   Mark this point in input as start of #n.
 135                                                         //      kRegExpOpen + 1 is number 1, etc.
 136         kRegExpClose = 30       // no   Analogous to kRegExpOpen.
 137 };
 138
 139 //
 140 // Opcode notes:
 141 //
 142 // kRegExpBranch        The set of branches constituting a single choice are
 143 //              hooked together with their "next" pointers, since precedence prevents
 144 //              anything being concatenated to any individual branch.  The
 145 //              "next" pointer of the last kRegExpBranch in a choice points to the
 146 //              thing following the whole choice.  This is also where the
 147 //              final "next" pointer of each individual branch points; each
 148 //              branch starts with the operand node of a kRegExpBranch node.
 149 //
 150 // kRegExpBack          Normal "next" pointers all implicitly point forward;
 151 //              kRegExpBack exists to make loop structures possible.
 152 //
 153 // kRegExpStar,kRegExpPlus      '?', and complex '*' and '+', are implemented as
 154 //              circular kRegExpBranch structures using kRegExpBack.  Simple cases
 155 //              (one character per match) are implemented with kRegExpStar and
 156 //              kRegExpPlus for speed and to minimize recursive plunges.
 157 //
 158 // kRegExpOpen,kRegExpClose     ...are numbered at compile time.
 159 //
 160 //
 161 //
 162 // A node is one char of opcode followed by two chars of "next" pointer.
 163 // "Next" pointers are stored as two 8-bit pieces, high order first.  The
 164 // value is a positive offset from the opcode of the node containing it.
 165 // An operand, if any, simply follows the node.  (Note that much of the
 166 // code generation knows about this implicit relationship.)
 167 //
 168 // Using two bytes for the "next" pointer is vast overkill for most things,
 169 // but allows patterns to get big without disasters.
 170 //
 171
 172 const char* kMeta = "^$.[()|?+*\\";
 173 const int32 kMaxSize = 32767L;
 174         // Probably could be 65535L.
 175
 176
 177 // Flags to be passed up and down:
 178 enum {
 179         kHasWidth =     01,     // Known never to match null string.
 180         kSimple = 02,   // Simple enough to be kRegExpStar/kRegExpPlus operand.
 181         kSPStart = 04,  // Starts with * or +.
 182         kWorst = 0      // Worst case.
 183 };
 184
 185
 186 const char* kRegExpErrorStringArray[] = {
 187         B_TRANSLATE_MARK("Unmatched parenthesis."),
 188         B_TRANSLATE_MARK("Expression too long."),
 189         B_TRANSLATE_MARK("Too many parenthesis."),
 190         B_TRANSLATE_MARK("Junk on end."),
 191         B_TRANSLATE_MARK("*+? operand may be empty."),
 192         B_TRANSLATE_MARK("Nested *?+."),
 193         B_TRANSLATE_MARK("Invalid bracket range."),
 194         B_TRANSLATE_MARK("Unmatched brackets."),
 195         B_TRANSLATE_MARK("Internal error."),
 196         B_TRANSLATE_MARK("?+* follows nothing."),
 197         B_TRANSLATE_MARK("Trailing \\."),
 198         B_TRANSLATE_MARK("Corrupted expression."),
 199         B_TRANSLATE_MARK("Memory corruption."),
 200         B_TRANSLATE_MARK("Corrupted pointers."),
 201         B_TRANSLATE_MARK("Corrupted opcode.")
 202 };
 203
 204
 205 #ifdef DEBUG
 206 int32 regnarrate = 0;
 207 #endif
 208
 209
 210 //      #pragma mark - RegExp
 211
 212
 213 RegExp::RegExp()
 214         :
 215         fError(B_OK),
 216         fRegExp(NULL),
 217         fInputScanPointer(NULL),
 218         fParenthesisCount(0),
 219         fDummy('\0'),
 220         fCodeEmitPointer(NULL),
 221         fCodeSize(0),
 222         fStringInputPointer(NULL),
 223         fRegBol(NULL),
 224         fStartPArrayPointer(NULL),
 225         fEndPArrayPointer(NULL)
 226 {
 227 }
 228
 229
 230 RegExp::RegExp(const char* pattern)
 231         :
 232         fError(B_OK),
 233         fRegExp(NULL)
 234 {
 235         fRegExp = Compile(pattern);
 236 }
 237
 238
 239 RegExp::RegExp(const BString& pattern)
 240         :
 241         fError(B_OK),
 242         fRegExp(NULL)
 243 {
 244         fRegExp = Compile(pattern.String());
 245 }
 246
 247
 248 RegExp::~RegExp()
 249 {
 250         free(fRegExp);
 251 }
 252
 253
 254 status_t
 255 RegExp::InitCheck() const
 256 {
 257         return fError;
 258 }
 259
 260
 261 status_t
 262 RegExp::SetTo(const char* pattern)
 263 {
 264         fError = B_OK;
 265         free(fRegExp);
 266         fRegExp = Compile(pattern);
 267         return fError;
 268 }
 269
 270
 271 status_t
 272 RegExp::SetTo(const BString& pattern)
 273 {
 274         fError = B_OK;
 275         free(fRegExp);
 276         fRegExp = Compile(pattern.String());
 277         return fError;
 278 }
 279
 280
 281 bool
 282 RegExp::Matches(const char* string) const
 283 {
 284         if (fRegExp == NULL || string == NULL)
 285                 return false;
 286
 287         return RunMatcher(fRegExp, string) == 1;
 288 }
 289
 290
 291 bool
 292 RegExp::Matches(const BString& string) const
 293 {
 294         if (fRegExp == NULL)
 295                 return false;
 296
 297         return RunMatcher(fRegExp, string.String()) == 1;
 298 }
 299
 300
 301 //
 302 // - Compile - compile a regular expression into internal code
 303 //
 304 // We can't allocate space until we know how big the compiled form will be,
 305 // but we can't compile it (and thus know how big it is) until we've got a
 306 // place to put the code.  So we cheat:  we compile it twice, once with code
 307 // generation turned off and size counting turned on, and once "for real".
 308 // This also means that we don't allocate space until we are sure that the
 309 // thing really will compile successfully, and we never have to move the
 310 // code and thus invalidate pointers into it.  (Note that it has to be in
 311 // one piece because free() must be able to free it all.)
 312 //
 313 // Beware that the optimization-preparation code in here knows about some
 314 // of the structure of the compiled regexp.
 315 regexp*
 316 RegExp::Compile(const char* exp)
 317 {
 318         regexp* r;
 319         const char* scan;
 320         const char* longest;
 321         int32 len;
 322         int32 flags;
 323
 324         if (exp == NULL) {
 325                 SetError(B_BAD_VALUE);
 326                 return NULL;
 327         }
 328
 329         // First pass: determine size, legality.
 330         fInputScanPointer = exp;
 331         fParenthesisCount = 1;
 332         fCodeSize = 0L;
 333         fCodeEmitPointer = &fDummy;
 334         Char(kRegExpMagic);
 335         if (Reg(0, &flags) == NULL)
 336                 return NULL;
 337
 338         // Small enough for pointer-storage convention?
 339         if (fCodeSize >= kMaxSize) {
 340                 SetError(REGEXP_TOO_BIG);
 341                 return NULL;
 342         }
 343
 344         r = (regexp*)malloc(sizeof(regexp) + fCodeSize);
 345                 // Allocate space
 346
 347         if (r == NULL) {
 348                 SetError(B_NO_MEMORY);
 349                 return NULL;
 350         }
 351
 352         // Second pass: emit code.
 353         fInputScanPointer = exp;
 354         fParenthesisCount = 1;
 355         fCodeEmitPointer = r->program;
 356         Char(kRegExpMagic);
 357         if (Reg(0, &flags) == NULL) {
 358                 free(r);
 359                 return NULL;
 360         }
 361
 362         // Dig out information for optimizations.
 363         r->regstart = '\0';
 364                 // Worst-case defaults.
 365         r->reganch = 0;
 366         r->regmust = NULL;
 367         r->regmlen = 0;
 368         scan = r->program + 1;
 369                 // First kRegExpBranch.
 370         if (*Next((char*)scan) == kRegExpEnd) {
 371                 // Only one top-level choice.
 372                 scan = Operand(scan);
 373
 374                 // Starting-point info.
 375                 if (*scan == kRegExpExactly)
 376                         r->regstart = *Operand(scan);
 377                 else if (*scan == kRegExpBol)
 378                         r->reganch++;
 379
 380                 // If there's something expensive in the r.e., find the
 381                 // longest literal string that must appear and make it the
 382                 // regmust.  Resolve ties in favor of later strings, since
 383                 // the regstart check works with the beginning of the r.e.
 384                 // and avoiding duplication strengthens checking.  Not a
 385                 // strong reason, but sufficient in the absence of others.
 386                 if ((flags & kSPStart) != 0) {
 387                         longest = NULL;
 388                         len = 0;
 389                         for (; scan != NULL; scan = Next((char*)scan)) {
 390                                 if (*scan == kRegExpExactly
 391                                         && (int32)strlen(Operand(scan)) >= len) {
 392                                         longest = Operand(scan);
 393                                         len = (int32)strlen(Operand(scan));
 394                                 }
 395                         }
 396                         r->regmust = longest;
 397                         r->regmlen = len;
 398                 }
 399         }
 400
 401         return r;
 402 }
 403
 404
 405 regexp*
 406 RegExp::Expression() const
 407 {
 408         return fRegExp;
 409 }
 410
 411
 412 const char*
 413 RegExp::ErrorString() const
 414 {
 415         if (fError >= REGEXP_UNMATCHED_PARENTHESIS
 416                 && fError <= REGEXP_CORRUPTED_OPCODE) {
 417                 return B_TRANSLATE_NOCOLLECT(
 418                         kRegExpErrorStringArray[fError - B_ERRORS_END]);
 419         }
 420
 421         return strerror(fError);
 422 }
 423
 424
 425 void
 426 RegExp::SetError(status_t error) const
 427 {
 428         fError = error;
 429 }
 430
 431
 432 //
 433 // - Reg - regular expression, i.e. main body or parenthesized thing
 434 //
 435 // Caller must absorb opening parenthesis.
 436 //
 437 // Combining parenthesis handling with the base level of regular expression
 438 // is a trifle forced, but the need to tie the tails of the branches to what
 439 // follows makes it hard to avoid.
 440 //
 441 char*
 442 RegExp::Reg(int32 paren, int32* flagp)
 443 {
 444         char* ret;
 445         char* br;
 446         char* ender;
 447         int32 parno = 0;
 448         int32 flags;
 449
 450         *flagp = kHasWidth;
 451                 // Tentatively.
 452
 453         // Make an kRegExpOpen node, if parenthesized.
 454         if (paren) {
 455                 if (fParenthesisCount >= kSubExpressionMax) {
 456                         SetError(REGEXP_TOO_MANY_PARENTHESIS);
 457                         return NULL;
 458                 }
 459                 parno = fParenthesisCount;
 460                 fParenthesisCount++;
 461                 ret = Node((char)(kRegExpOpen + parno));
 462         } else
 463                 ret = NULL;
 464
 465         // Pick up the branches, linking them together.
 466         br = Branch(&flags);
 467         if (br == NULL)
 468                 return NULL;
 469
 470         if (ret != NULL) {
 471                 Tail(ret, br);
 472                         // kRegExpOpen -> first
 473         } else
 474                 ret = br;
 475
 476         if (!(flags & kHasWidth))
 477                 *flagp &= ~kHasWidth;
 478
 479         *flagp |= flags&kSPStart;
 480         while (*fInputScanPointer == '|') {
 481                 fInputScanPointer++;
 482                 br = Branch(&flags);
 483                 if (br == NULL)
 484                         return NULL;
 485
 486                 Tail(ret, br);
 487                         // kRegExpBranch -> kRegExpBranch.
 488                 if (!(flags & kHasWidth))
 489                         *flagp &= ~kHasWidth;
 490
 491                 *flagp |= flags&kSPStart;
 492         }
 493
 494         // Make a closing node, and hook it on the end.
 495         ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
 496         Tail(ret, ender);
 497
 498         // Hook the tails of the branches to the closing node.
 499         for (br = ret; br != NULL; br = Next(br))
 500                 OpTail(br, ender);
 501
 502         // Check for proper termination.
 503         if (paren && *fInputScanPointer++ != ')') {
 504                 SetError(REGEXP_UNMATCHED_PARENTHESIS);
 505                 return NULL;
 506         } else if (!paren && *fInputScanPointer != '\0') {
 507                 if (*fInputScanPointer == ')') {
 508                         SetError(REGEXP_UNMATCHED_PARENTHESIS);
 509                         return NULL;
 510                 } else {
 511                         SetError(REGEXP_JUNK_ON_END);
 512                         return NULL;    //  "Can't happen".
 513                 }
 514                 // NOTREACHED
 515         }
 516
 517         return ret;
 518 }
 519
 520
 521 //
 522 // - Branch - one alternative of an | operator
 523 //
 524 // Implements the concatenation operator.
 525 //
 526 char*
 527 RegExp::Branch(int32* flagp)
 528 {
 529         char* ret;
 530         char* chain;
 531         char* latest;
 532         int32 flags;
 533
 534         *flagp = kWorst;
 535                 // Tentatively.
 536
 537         ret = Node(kRegExpBranch);
 538         chain = NULL;
 539         while (*fInputScanPointer != '\0'
 540                 && *fInputScanPointer != '|'
 541                 && *fInputScanPointer != ')') {
 542                 latest = Piece(&flags);
 543                 if (latest == NULL)
 544                         return NULL;
 545
 546                 *flagp |= flags & kHasWidth;
 547                 if (chain == NULL) {
 548                         // First piece.
 549                         *flagp |= flags & kSPStart;
 550                 } else
 551                         Tail(chain, latest);
 552
 553                 chain = latest;
 554         }
 555
 556         if (chain == NULL) {
 557                 // Loop ran zero times.
 558                 Node(kRegExpNothing);
 559         }
 560
 561         return ret;
 562 }
 563
 564
 565 //
 566 // - Piece - something followed by possible [*+?]
 567 //
 568 // Note that the branching code sequences used for ? and the general cases
 569 // of * and + are somewhat optimized:  they use the same kRegExpNothing node
 570 // as both the endmarker for their branch list and the body of the last
 571 // branch. It might seem that this node could be dispensed with entirely,
 572 // but the endmarker role is not redundant.
 573 //
 574 char*
 575 RegExp::Piece(int32* flagp)
 576 {
 577         char* ret;
 578         char op;
 579         char* next;
 580         int32 flags;
 581
 582         ret = Atom(&flags);
 583         if (ret == NULL)
 584                 return NULL;
 585
 586         op = *fInputScanPointer;
 587         if (!IsMult(op)) {
 588                 *flagp = flags;
 589                 return ret;
 590         }
 591
 592         if (!(flags & kHasWidth) && op != '?') {
 593                 SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
 594                 return NULL;
 595         }
 596         *flagp = op != '+' ? kWorst | kSPStart :  kWorst | kHasWidth;
 597
 598         if (op == '*' && (flags & kSimple))
 599                 Insert(kRegExpStar, ret);
 600         else if (op == '*') {
 601                 // Emit x* as (x&|), where & means "self".
 602                 Insert(kRegExpBranch, ret);
 603                         // Either x
 604                 OpTail(ret, Node(kRegExpBack));
 605                         // and loop
 606                 OpTail(ret, ret);
 607                         // back
 608                 Tail(ret, Node(kRegExpBranch));
 609                         // or
 610                 Tail(ret, Node(kRegExpNothing));
 611                         // null.
 612         } else if (op == '+' && (flags & kSimple))
 613                 Insert(kRegExpPlus, ret);
 614         else if (op == '+') {
 615                 // Emit x+ as x(&|), where & means "self".
 616                 next = Node(kRegExpBranch);
 617                         // Either
 618                 Tail(ret, next);
 619                 Tail(Node(kRegExpBack), ret);
 620                         // loop back
 621                 Tail(next, Node(kRegExpBranch));
 622                         // or
 623                 Tail(ret, Node(kRegExpNothing));
 624                         // null.
 625         } else if (op == '?') {
 626                 // Emit x? as (x|)
 627                 Insert(kRegExpBranch, ret);
 628                         // Either x
 629                 Tail(ret, Node(kRegExpBranch));
 630                         // or
 631                 next = Node(kRegExpNothing);
 632                         // null.
 633                 Tail(ret, next);
 634                 OpTail(ret, next);
 635         }
 636
 637         fInputScanPointer++;
 638         if (IsMult(*fInputScanPointer)) {
 639                 SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
 640                 return NULL;
 641         }
 642
 643         return ret;
 644 }
 645
 646
 647 //
 648 // - Atom - the lowest level
 649 //
 650 // Optimization:  gobbles an entire sequence of ordinary characters so that
 651 // it can turn them into a single node, which is smaller to store and
 652 // faster to run.  Backslashed characters are exceptions, each becoming a
 653 // separate node; the code is simpler that way and it's not worth fixing.
 654 //
 655 char*
 656 RegExp::Atom(int32* flagp)
 657 {
 658         char* ret;
 659         int32 flags;
 660
 661         *flagp = kWorst;
 662                 // tentatively
 663
 664         switch (*fInputScanPointer++) {
 665                 case '^':
 666                         ret = Node(kRegExpBol);
 667                         break;
 668
 669                 case '$':
 670                         ret = Node(kRegExpEol);
 671                         break;
 672
 673                 case '.':
 674                         ret = Node(kRegExpAny);
 675                         *flagp |= kHasWidth|kSimple;
 676                         break;
 677
 678                 case '[':
 679                 {
 680                         int32 cclass;
 681                         int32 classend;
 682
 683                         if (*fInputScanPointer == '^') {
 684                                 // complement of range
 685                                 ret = Node(kRegExpAnyBut);
 686                                 fInputScanPointer++;
 687                         } else
 688                                 ret = Node(kRegExpAnyOf);
 689                         if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
 690                                 Char(*fInputScanPointer++);
 691                         while (*fInputScanPointer != '\0'
 692                                 && *fInputScanPointer != ']') {
 693                                 if (*fInputScanPointer == '-') {
 694                                         fInputScanPointer++;
 695                                         if (*fInputScanPointer == ']'
 696                                                 || *fInputScanPointer == '\0') {
 697                                                 Char('-');
 698                                         } else {
 699                                                 cclass = UCharAt(fInputScanPointer - 2) + 1;
 700                                                 classend = UCharAt(fInputScanPointer);
 701                                                 if (cclass > classend + 1) {
 702                                                         SetError(REGEXP_INVALID_BRACKET_RANGE);
 703                                                         return NULL;
 704                                                 }
 705                                                 for (; cclass <= classend; cclass++)
 706                                                         Char((char)cclass);
 707                                                 fInputScanPointer++;
 708                                         }
 709                                 } else
 710                                         Char(*fInputScanPointer++);
 711                         }
 712                         Char('\0');
 713                         if (*fInputScanPointer != ']') {
 714                                 SetError(REGEXP_UNMATCHED_BRACKET);
 715                                 return NULL;
 716                         }
 717                         fInputScanPointer++;
 718                         *flagp |= kHasWidth | kSimple;
 719                         break;
 720                 }
 721
 722                 case '(':
 723                         ret = Reg(1, &flags);
 724                         if (ret == NULL)
 725                                 return NULL;
 726                         *flagp |= flags & (kHasWidth | kSPStart);
 727                         break;
 728
 729                 case '\0':
 730                 case '|':
 731                 case ')':
 732                         SetError(REGEXP_INTERNAL_ERROR);
 733                         return NULL;
 734                                 //  supposed to be caught earlier
 735
 736                 case '?':
 737                 case '+':
 738                 case '*':
 739                         SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
 740                         return NULL;
 741
 742                 case '\\':
 743                         if (*fInputScanPointer == '\0') {
 744                                 SetError(REGEXP_TRAILING_BACKSLASH);
 745                                 return NULL;
 746                         }
 747                         ret = Node(kRegExpExactly);
 748                         Char(*fInputScanPointer++);
 749                         Char('\0');
 750                         *flagp |= kHasWidth|kSimple;
 751                         break;
 752
 753                 default:
 754                 {
 755                         int32 len;
 756                         char ender;
 757
 758                         fInputScanPointer--;
 759                         len = (int32)strcspn(fInputScanPointer, kMeta);
 760                         if (len <= 0) {
 761                                 SetError(REGEXP_INTERNAL_ERROR);
 762                                 return NULL;
 763                         }
 764
 765                         ender = *(fInputScanPointer + len);
 766                         if (len > 1 && IsMult(ender)) {
 767                                 // Back off clear of ?+* operand.
 768                                 len--;
 769                         }
 770
 771                         *flagp |= kHasWidth;
 772                         if (len == 1)
 773                                 *flagp |= kSimple;
 774
 775                         ret = Node(kRegExpExactly);
 776                         while (len > 0) {
 777                                 Char(*fInputScanPointer++);
 778                                 len--;
 779                         }
 780
 781                         Char('\0');
 782                         break;
 783                 }
 784         }
 785
 786         return ret;
 787 }
 788
 789
 790 //
 791 // - Node - emit a node
 792 //
 793 // Returns location.
 794 //
 795 char*
 796 RegExp::Node(char op)
 797 {
 798         char* ret;
 799         char* ptr;
 800
 801         ret = fCodeEmitPointer;
 802         if (ret == &fDummy) {
 803                 fCodeSize += 3;
 804                 return ret;
 805         }
 806
 807         ptr = ret;
 808         *ptr++ = op;
 809         *ptr++ = '\0';
 810                 // Null "next" pointer.
 811         *ptr++ = '\0';
 812         fCodeEmitPointer = ptr;
 813
 814         return ret;
 815 }
 816
 817
 818 //
 819 // - Char - emit (if appropriate) a byte of code
 820 //
 821 void
 822 RegExp::Char(char b)
 823 {
 824         if (fCodeEmitPointer != &fDummy)
 825                 *fCodeEmitPointer++ = b;
 826         else
 827                 fCodeSize++;
 828 }
 829
 830
 831 //
 832 // - Insert - insert an operator in front of already-emitted operand
 833 //
 834 // Means relocating the operand.
 835 //
 836 void
 837 RegExp::Insert(char op, char* opnd)
 838 {
 839         char* src;
 840         char* dst;
 841         char* place;
 842
 843         if (fCodeEmitPointer == &fDummy) {
 844                 fCodeSize += 3;
 845                 return;
 846         }
 847
 848         src = fCodeEmitPointer;
 849         fCodeEmitPointer += 3;
 850         dst = fCodeEmitPointer;
 851         while (src > opnd)
 852                 *--dst = *--src;
 853
 854         place = opnd;
 855                 // Op node, where operand used to be.
 856         *place++ = op;
 857         *place++ = '\0';
 858         *place++ = '\0';
 859 }
 860
 861
 862 //
 863 // - Tail - set the next-pointer at the end of a node chain
 864 //
 865 void
 866 RegExp::Tail(char* p, char* val)
 867 {
 868         char* scan;
 869         char* temp;
 870         int32 offset;
 871
 872         if (p == &fDummy)
 873                 return;
 874
 875         // Find last node.
 876         scan = p;
 877         for (;;) {
 878                 temp = Next(scan);
 879                 if (temp == NULL)
 880                         break;
 881                 scan = temp;
 882         }
 883
 884         if (scan[0] == kRegExpBack)
 885                 offset = scan - val;
 886         else
 887                 offset = val - scan;
 888
 889         scan[1] = (char)((offset >> 8) & 0377);
 890         scan[2] = (char)(offset & 0377);
 891 }
 892
 893
 894 //
 895 // - OpTail - Tail on operand of first argument; nop if operandless
 896 //
 897 void
 898 RegExp::OpTail(char* p, char* val)
 899 {
 900         // "Operandless" and "op != kRegExpBranch" are synonymous in practice.
 901         if (p == NULL || p == &fDummy || *p != kRegExpBranch)
 902                 return;
 903
 904         Tail(Operand(p), val);
 905 }
 906
 907 //
 908 // RunMatcher and friends
 909 //
 910
 911
 912 //
 913 // - RunMatcher - match a regexp against a string
 914 //
 915 int32
 916 RegExp::RunMatcher(regexp* prog, const char* string) const
 917 {
 918         const char* s;
 919
 920         // be paranoid...
 921         if (prog == NULL || string == NULL) {
 922                 SetError(B_BAD_VALUE);
 923                 return 0;
 924         }
 925
 926         // check validity of program
 927         if (UCharAt(prog->program) != kRegExpMagic) {
 928                 SetError(REGEXP_CORRUPTED_PROGRAM);
 929                 return 0;
 930         }
 931
 932         // if there is a "must appear" string, look for it
 933         if (prog->regmust != NULL) {
 934                 s = string;
 935                 while ((s = strchr(s, prog->regmust[0])) != NULL) {
 936                         if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) {
 937                                 // found it
 938                                 break;
 939                         }
 940                         s++;
 941                 }
 942                 if (s == NULL) {
 943                         // not present
 944                         return 0;
 945                 }
 946         }
 947
 948         // mark beginning of line for ^
 949         fRegBol = string;
 950
 951         // simplest case:  anchored match need be tried only once
 952         if (prog->reganch)
 953                 return Try(prog, (char*)string);
 954
 955         // messy cases: unanchored match
 956         s = string;
 957         if (prog->regstart != '\0') {
 958                 // we know what char it must start with
 959                 while ((s = strchr(s, prog->regstart)) != NULL) {
 960                         if (Try(prog, (char*)s))
 961                                 return 1;
 962                         s++;
 963                 }
 964         } else {
 965                 // we don't -- general case.
 966                 do {
 967                         if (Try(prog, (char*)s))
 968                                 return 1;
 969                 } while (*s++ != '\0');
 970         }
 971
 972         // failure
 973         return 0;
 974 }
 975
 976
 977 //
 978 // - Try - try match at specific point
 979 //
 980 // Returns 0 on failure, 1 on success.
 981 //
 982 int32
 983 RegExp::Try(regexp* prog, const char* string) const
 984 {
 985         int32 i;
 986         const char **sp;
 987         const char **ep;
 988
 989         fStringInputPointer = string;
 990         fStartPArrayPointer = prog->startp;
 991         fEndPArrayPointer = prog->endp;
 992
 993         sp = prog->startp;
 994         ep = prog->endp;
 995         for (i = kSubExpressionMax; i > 0; i--) {
 996                 *sp++ = NULL;
 997                 *ep++ = NULL;
 998         }
 999         if (Match(prog->program + 1)) {
1000                 prog->startp[0] = string;
1001                 prog->endp[0] = fStringInputPointer;
1002                 return 1;
1003         } else
1004                 return 0;
1005 }
1006
1007
1008 //
1009 // - Match - main matching routine
1010 //
1011 // Conceptually the strategy is simple:  check to see whether the current
1012 // node matches, call self recursively to see whether the rest matches,
1013 // and then act accordingly.  In practice we make some effort to avoid
1014 // recursion, in particular by going through "ordinary" nodes (that don't
1015 // need to know whether the rest of the match failed) by a loop instead of
1016 // by recursion.
1017 //
1018 // Returns 0 on failure, 1 on success.
1019 //
1020 int32
1021 RegExp::Match(const char* prog) const
1022 {
1023         const char* scan;       // Current node.
1024         const char* next;               // Next node.
1025
1026         scan = prog;
1027 #ifdef DEBUG
1028         if (scan != NULL && regnarrate)
1029                 fprintf(stderr, "%s(\n", Prop(scan));
1030 #endif
1031         while (scan != NULL) {
1032 #ifdef DEBUG
1033                 if (regnarrate)
1034                         fprintf(stderr, "%s...\n", Prop(scan));
1035 #endif
1036                 next = Next(scan);
1037
1038                 switch (*scan) {
1039                         case kRegExpBol:
1040                                 if (fStringInputPointer != fRegBol)
1041                                         return 0;
1042                                 break;
1043
1044                         case kRegExpEol:
1045                                 if (*fStringInputPointer != '\0')
1046                                         return 0;
1047                                 break;
1048
1049                         case kRegExpAny:
1050                                 if (*fStringInputPointer == '\0')
1051                                         return 0;
1052                                 fStringInputPointer++;
1053                                 break;
1054
1055                         case kRegExpExactly:
1056                         {
1057                                 const char* opnd = Operand(scan);
1058                                 // Inline the first character, for speed.
1059                                 if (*opnd != *fStringInputPointer)
1060                                         return 0;
1061
1062                                 uint32 len = strlen(opnd);
1063                                 if (len > 1
1064                                         && strncmp(opnd, fStringInputPointer, len) != 0) {
1065                                         return 0;
1066                                 }
1067
1068                                 fStringInputPointer += len;
1069                         }
1070                         break;
1071
1072                         case kRegExpAnyOf:
1073                                 if (*fStringInputPointer == '\0'
1074                                         || strchr(Operand(scan), *fStringInputPointer) == NULL) {
1075                                         return 0;
1076                                 }
1077                                 fStringInputPointer++;
1078                                 break;
1079
1080                         case kRegExpAnyBut:
1081                                 if (*fStringInputPointer == '\0'
1082                                         || strchr(Operand(scan), *fStringInputPointer) != NULL) {
1083                                         return 0;
1084                                 }
1085                                 fStringInputPointer++;
1086                                 break;
1087
1088                         case kRegExpNothing:
1089                                 break;
1090
1091                         case kRegExpBack:
1092                                 break;
1093
1094                         case kRegExpOpen + 1:
1095                         case kRegExpOpen + 2:
1096                         case kRegExpOpen + 3:
1097                         case kRegExpOpen + 4:
1098                         case kRegExpOpen + 5:
1099                         case kRegExpOpen + 6:
1100                         case kRegExpOpen + 7:
1101                         case kRegExpOpen + 8:
1102                         case kRegExpOpen + 9:
1103                         {
1104                                 int32 no;
1105                                 const char* save;
1106
1107                                 no = *scan - kRegExpOpen;
1108                                 save = fStringInputPointer;
1109
1110                                 if (Match(next)) {
1111                                         //
1112                                         // Don't set startp if some later
1113                                         // invocation of the same parentheses
1114                                         // already has.
1115                                         //
1116                                         if (fStartPArrayPointer[no] == NULL)
1117                                                 fStartPArrayPointer[no] = save;
1118                                         return 1;
1119                                 } else
1120                                         return 0;
1121                         }
1122                         break;
1123
1124                         case kRegExpClose + 1:
1125                         case kRegExpClose + 2:
1126                         case kRegExpClose + 3:
1127                         case kRegExpClose + 4:
1128                         case kRegExpClose + 5:
1129                         case kRegExpClose + 6:
1130                         case kRegExpClose + 7:
1131                         case kRegExpClose + 8:
1132                         case kRegExpClose + 9:
1133                         {
1134                                 int32 no;
1135                                 const char* save;
1136
1137                                 no = *scan - kRegExpClose;
1138                                 save = fStringInputPointer;
1139
1140                                 if (Match(next)) {
1141                                         //
1142                                         // Don't set endp if some later
1143                                         // invocation of the same parentheses
1144                                         // already has.
1145                                         //
1146                                         if (fEndPArrayPointer[no] == NULL)
1147                                                 fEndPArrayPointer[no] = save;
1148                                         return 1;
1149                                 } else
1150                                         return 0;
1151                         }
1152                         break;
1153
1154                         case kRegExpBranch:
1155                         {
1156                                 const char* save;
1157
1158                                 if (*next != kRegExpBranch) {
1159                                         // no choice
1160                                         next = Operand(scan);
1161                                                 // avoid recursion
1162                                 } else {
1163                                         do {
1164                                                 save = fStringInputPointer;
1165                                                 if (Match(Operand(scan)))
1166                                                         return 1;
1167                                                 fStringInputPointer = save;
1168                                                 scan = Next(scan);
1169                                         } while (scan != NULL && *scan == kRegExpBranch);
1170                                         return 0;
1171                                         // NOTREACHED
1172                                 }
1173                         }
1174                         break;
1175
1176                         case kRegExpStar:
1177                         case kRegExpPlus:
1178                                 {
1179                                         char nextch;
1180                                         int32 no;
1181                                         const char* save;
1182                                         int32 min;
1183
1184                                         //
1185                                         // Lookahead to avoid useless match attempts
1186                                         // when we know what character comes next.
1187                                         //
1188                                         nextch = '\0';
1189                                         if (*next == kRegExpExactly)
1190                                                 nextch = *Operand(next);
1191                                         min = (*scan == kRegExpStar) ? 0 : 1;
1192                                         save = fStringInputPointer;
1193                                         no = Repeat(Operand(scan));
1194                                         while (no >= min) {
1195                                                 // if it could work, try it
1196                                                 if (nextch == '\0' || *fStringInputPointer == nextch) {
1197                                                         if (Match(next))
1198                                                                 return 1;
1199                                                 }
1200                                                 // couldn't or didn't, back up
1201                                                 no--;
1202                                                 fStringInputPointer = save + no;
1203                                         }
1204                                         return 0;
1205                                 }
1206                                 break;
1207
1208                         case kRegExpEnd:
1209                                 return 1;
1210                                         // Success!
1211
1212                         default:
1213                                 SetError(REGEXP_MEMORY_CORRUPTION);
1214                                 return 0;
1215                         }
1216
1217                 scan = next;
1218         }
1219
1220         //
1221         // We get here only if there's trouble -- normally "case kRegExpEnd" is
1222         // the terminating point.
1223         //
1224         SetError(REGEXP_CORRUPTED_POINTERS);
1225
1226         return 0;
1227 }
1228
1229
1230 //
1231 // - Repeat - repeatedly match something simple, report how many
1232 //
1233 int32
1234 RegExp::Repeat(const char* p) const
1235 {
1236         int32 count = 0;
1237         const char* scan;
1238         const char* opnd;
1239
1240         scan = fStringInputPointer;
1241         opnd = Operand(p);
1242         switch (*p) {
1243                 case kRegExpAny:
1244                         count = (int32)strlen(scan);
1245                         scan += count;
1246                         break;
1247
1248                 case kRegExpExactly:
1249                         while (*opnd == *scan) {
1250                                 count++;
1251                                 scan++;
1252                         }
1253                         break;
1254
1255                 case kRegExpAnyOf:
1256                         while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1257                                 count++;
1258                                 scan++;
1259                         }
1260                         break;
1261
1262                 case kRegExpAnyBut:
1263                         while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1264                                 count++;
1265                                 scan++;
1266                         }
1267                         break;
1268
1269                 default:
1270                         // oh dear, called inappropriately
1271                         SetError(REGEXP_INTERNAL_ERROR);
1272                         count = 0;
1273                                 // best compromise
1274                         break;
1275         }
1276         fStringInputPointer = scan;
1277
1278         return count;
1279 }
1280
1281
1282 //
1283 // - Next - dig the "next" pointer out of a node
1284 //
1285 char*
1286 RegExp::Next(char* p)
1287 {
1288         int32 offset;
1289
1290         if (p == &fDummy)
1291                 return NULL;
1292
1293         offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1294         if (offset == 0)
1295                 return NULL;
1296
1297         if (*p == kRegExpBack)
1298                 return p - offset;
1299         else
1300                 return p + offset;
1301 }
1302
1303
1304 const char*
1305 RegExp::Next(const char* p) const
1306 {
1307         int32 offset;
1308
1309         if (p == &fDummy)
1310                 return NULL;
1311
1312         offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1313         if (offset == 0)
1314                 return NULL;
1315
1316         if (*p == kRegExpBack)
1317                 return p - offset;
1318         else
1319                 return p + offset;
1320 }
1321
1322
1323 inline int32
1324 RegExp::UCharAt(const char* p) const
1325 {
1326         return (int32)*(unsigned char *)p;
1327 }
1328
1329
1330 inline char*
1331 RegExp::Operand(char* p) const
1332 {
1333         return p + 3;
1334 }
1335
1336
1337 inline const char*
1338 RegExp::Operand(const char* p) const
1339 {
1340         return p + 3;
1341 }
1342
1343
1344 inline bool
1345 RegExp::IsMult(char c) const
1346 {
1347         return c == '*' || c == '+' || c == '?';
1348 }
1349
1350
1351 #ifdef DEBUG
1352
1353
1354 //
1355 // - Dump - dump a regexp onto stdout in vaguely comprehensible form
1356 //
1357 void
1358 RegExp::Dump()
1359 {
1360         const char* s;
1361         char op = kRegExpExactly;
1362                 // Arbitrary non-kRegExpEnd op.
1363         const char* next;
1364
1365         s = fRegExp->program + 1;
1366         while (op != kRegExpEnd) {
1367                 // While that wasn't kRegExpEnd last time...
1368                 op = *s;
1369                 printf("%2ld%s", s - fRegExp->program, Prop(s));
1370                         // Where, what.
1371                 next = Next(s);
1372                 if (next == NULL) {
1373                         // next ptr
1374                         printf("(0)");
1375                 } else
1376                         printf("(%ld)", (s - fRegExp->program) + (next - s));
1377
1378                 s += 3;
1379                 if (op == kRegExpAnyOf || op == kRegExpAnyBut
1380                         || op == kRegExpExactly) {
1381                         // literal string, where present
1382                         while (*s != '\0') {
1383                                 putchar(*s);
1384                                 s++;
1385                         }
1386                         s++;
1387                 }
1388                 putchar('\n');
1389         }
1390
1391         // header fields of interest
1392         if (fRegExp->regstart != '\0')
1393                 printf("start `%c' ", fRegExp->regstart);
1394
1395         if (fRegExp->reganch)
1396                 printf("anchored ");
1397
1398         if (fRegExp->regmust != NULL)
1399                 printf("must have \"%s\"", fRegExp->regmust);
1400
1401         printf("\n");
1402 }
1403
1404
1405 //
1406 // - Prop - printable representation of opcode
1407 //
1408 char*
1409 RegExp::Prop(const char* op) const
1410 {
1411         const char* p = NULL;
1412         static char buf[50];
1413
1414         strcpy(buf, ":");
1415
1416         switch (*op) {
1417                 case kRegExpBol:
1418                         p = "kRegExpBol";
1419                         break;
1420
1421                 case kRegExpEol:
1422                         p = "kRegExpEol";
1423                         break;
1424
1425                 case kRegExpAny:
1426                         p = "kRegExpAny";
1427                         break;
1428
1429                 case kRegExpAnyOf:
1430                         p = "kRegExpAnyOf";
1431                         break;
1432
1433                 case kRegExpAnyBut:
1434                         p = "kRegExpAnyBut";
1435                         break;
1436
1437                 case kRegExpBranch:
1438                         p = "kRegExpBranch";
1439                         break;
1440
1441                 case kRegExpExactly:
1442                         p = "kRegExpExactly";
1443                         break;
1444
1445                 case kRegExpNothing:
1446                         p = "kRegExpNothing";
1447                         break;
1448
1449                 case kRegExpBack:
1450                         p = "kRegExpBack";
1451                         break;
1452
1453                 case kRegExpEnd:
1454                         p = "kRegExpEnd";
1455                         break;
1456
1457                 case kRegExpOpen + 1:
1458                 case kRegExpOpen + 2:
1459                 case kRegExpOpen + 3:
1460                 case kRegExpOpen + 4:
1461                 case kRegExpOpen + 5:
1462                 case kRegExpOpen + 6:
1463                 case kRegExpOpen + 7:
1464                 case kRegExpOpen + 8:
1465                 case kRegExpOpen + 9:
1466                         sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
1467                         p = NULL;
1468                         break;
1469
1470                 case kRegExpClose + 1:
1471                 case kRegExpClose + 2:
1472                 case kRegExpClose + 3:
1473                 case kRegExpClose + 4:
1474                 case kRegExpClose + 5:
1475                 case kRegExpClose + 6:
1476                 case kRegExpClose + 7:
1477                 case kRegExpClose + 8:
1478                 case kRegExpClose + 9:
1479                         sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
1480                         p = NULL;
1481                         break;
1482
1483                 case kRegExpStar:
1484                         p = "kRegExpStar";
1485                         break;
1486
1487                 case kRegExpPlus:
1488                         p = "kRegExpPlus";
1489                         break;
1490
1491                 default:
1492                         RegExpError("corrupted opcode");
1493                         break;
1494         }
1495
1496         if (p != NULL)
1497                 strcat(buf, p);
1498
1499         return buf;
1500 }
1501
1502
1503 void
1504 RegExp::RegExpError(const char*) const
1505 {
1506         // does nothing now, perhaps it should printf?
1507 }
1508
1509 #endif  // DEBUG