vfs: check userland buffers before reading them.
[haiku.git] / src / kits / tracker / RegExp.cpp
blobfcf980bc8f1696c73da629a7419a07e4cb71b121
1 /*
2 Open Tracker License
4 Terms and Conditions
6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
32 All rights reserved.
35 // This code is based on regexp.c, v.1.3 by Henry Spencer:
37 // @(#)regexp.c 1.3 of 18 April 87
39 // Copyright (c) 1986 by University of Toronto.
40 // Written by Henry Spencer. Not derived from licensed software.
42 // Permission is granted to anyone to use this software for any
43 // purpose on any computer system, and to redistribute it freely,
44 // subject to the following restrictions:
46 // 1. The author is not responsible for the consequences of use of
47 // this software, no matter how awful, even if they arise
48 // from defects in it.
50 // 2. The origin of this software must not be misrepresented, either
51 // by explicit claim or by omission.
53 // 3. Altered versions must be plainly marked as such, and must not
54 // be misrepresented as being the original software.
56 // Beware that some of this code is subtly aware of the way operator
57 // precedence is structured in regular expressions. Serious changes in
58 // regular-expression syntax might require a total rethink.
61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
62 // project (www.opentracker.org), Jul 11, 2000.
65 #include <stdlib.h>
66 #include <stdio.h>
67 #include <string.h>
69 #include <Catalog.h>
70 #include <Errors.h>
72 #include "RegExp.h"
75 #undef B_TRANSLATION_CONTEXT
76 #define B_TRANSLATION_CONTEXT "libtracker"
79 // The first byte of the regexp internal "program" is actually this magic
80 // number; the start node begins in the second byte.
82 const uint8 kRegExpMagic = 0234;
84 // The "internal use only" fields in RegExp.h are present to pass info from
85 // compile to execute that permits the execute phase to run lots faster on
86 // simple cases. They are:
88 // regstart char that must begin a match; '\0' if none obvious
89 // reganch is the match anchored (at beginning-of-line only)?
90 // regmust string (pointer into program) that match must include, or NULL
91 // regmlen length of regmust string
93 // Regstart and reganch permit very fast decisions on suitable starting points
94 // for a match, cutting down the work a lot. Regmust permits fast rejection
95 // of lines that cannot possibly match. The regmust tests are costly enough
96 // that Compile() supplies a regmust only if the r.e. contains something
97 // potentially expensive (at present, the only such thing detected is * or +
98 // at the start of the r.e., which can involve a lot of backup). Regmlen is
99 // supplied because the test in RunMatcher() needs it and Compile() is
100 // computing it anyway.
104 // Structure for regexp "program". This is essentially a linear encoding
105 // of a nondeterministic finite-state machine (aka syntax charts or
106 // "railroad normal form" in parsing technology). Each node is an opcode
107 // plus a "next" pointer, possibly plus an operand. "Next" pointers of
108 // all nodes except kRegExpBranch implement concatenation; a "next" pointer
109 // with a kRegExpBranch on both ends of it is connecting two alternatives.
110 // (Here we have one of the subtle syntax dependencies: an individual
111 // kRegExpBranch (as opposed to a collection of them) is never concatenated
112 // with anything because of operator precedence.) The operand of some types
113 // of node is a literal string; for others, it is a node leading into a
114 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first
115 // node of the branch. (NB this is* not* a tree structure: the tail of the
116 // branch connects to the thing following the set of kRegExpBranches).
117 // The opcodes are:
120 // definition number opnd? meaning
121 enum {
122 kRegExpEnd = 0, // no End of program.
123 kRegExpBol = 1, // no Match "" at beginning of line.
124 kRegExpEol = 2, // no Match "" at end of line.
125 kRegExpAny = 3, // no Match any one character.
126 kRegExpAnyOf = 4, // str Match any character in this string.
127 kRegExpAnyBut = 5, // str Match any character not in this string.
128 kRegExpBranch = 6, // node Match this alternative, or the next...
129 kRegExpBack = 7, // no Match "", "next" ptr points backward.
130 kRegExpExactly = 8, // str Match this string.
131 kRegExpNothing = 9, // no Match empty string.
132 kRegExpStar = 10, // node Match this (simple) thing 0 or more times.
133 kRegExpPlus = 11, // node Match this (simple) thing 1 or more times.
134 kRegExpOpen = 20, // no Mark this point in input as start of #n.
135 // kRegExpOpen + 1 is number 1, etc.
136 kRegExpClose = 30 // no Analogous to kRegExpOpen.
140 // Opcode notes:
142 // kRegExpBranch The set of branches constituting a single choice are
143 // hooked together with their "next" pointers, since precedence prevents
144 // anything being concatenated to any individual branch. The
145 // "next" pointer of the last kRegExpBranch in a choice points to the
146 // thing following the whole choice. This is also where the
147 // final "next" pointer of each individual branch points; each
148 // branch starts with the operand node of a kRegExpBranch node.
150 // kRegExpBack Normal "next" pointers all implicitly point forward;
151 // kRegExpBack exists to make loop structures possible.
153 // kRegExpStar,kRegExpPlus '?', and complex '*' and '+', are implemented as
154 // circular kRegExpBranch structures using kRegExpBack. Simple cases
155 // (one character per match) are implemented with kRegExpStar and
156 // kRegExpPlus for speed and to minimize recursive plunges.
158 // kRegExpOpen,kRegExpClose ...are numbered at compile time.
162 // A node is one char of opcode followed by two chars of "next" pointer.
163 // "Next" pointers are stored as two 8-bit pieces, high order first. The
164 // value is a positive offset from the opcode of the node containing it.
165 // An operand, if any, simply follows the node. (Note that much of the
166 // code generation knows about this implicit relationship.)
168 // Using two bytes for the "next" pointer is vast overkill for most things,
169 // but allows patterns to get big without disasters.
172 const char* kMeta = "^$.[()|?+*\\";
173 const int32 kMaxSize = 32767L;
174 // Probably could be 65535L.
177 // Flags to be passed up and down:
178 enum {
179 kHasWidth = 01, // Known never to match null string.
180 kSimple = 02, // Simple enough to be kRegExpStar/kRegExpPlus operand.
181 kSPStart = 04, // Starts with * or +.
182 kWorst = 0 // Worst case.
186 const char* kRegExpErrorStringArray[] = {
187 B_TRANSLATE_MARK("Unmatched parenthesis."),
188 B_TRANSLATE_MARK("Expression too long."),
189 B_TRANSLATE_MARK("Too many parenthesis."),
190 B_TRANSLATE_MARK("Junk on end."),
191 B_TRANSLATE_MARK("*+? operand may be empty."),
192 B_TRANSLATE_MARK("Nested *?+."),
193 B_TRANSLATE_MARK("Invalid bracket range."),
194 B_TRANSLATE_MARK("Unmatched brackets."),
195 B_TRANSLATE_MARK("Internal error."),
196 B_TRANSLATE_MARK("?+* follows nothing."),
197 B_TRANSLATE_MARK("Trailing \\."),
198 B_TRANSLATE_MARK("Corrupted expression."),
199 B_TRANSLATE_MARK("Memory corruption."),
200 B_TRANSLATE_MARK("Corrupted pointers."),
201 B_TRANSLATE_MARK("Corrupted opcode.")
205 #ifdef DEBUG
206 int32 regnarrate = 0;
207 #endif
210 // #pragma mark - RegExp
213 RegExp::RegExp()
215 fError(B_OK),
216 fRegExp(NULL),
217 fInputScanPointer(NULL),
218 fParenthesisCount(0),
219 fDummy('\0'),
220 fCodeEmitPointer(NULL),
221 fCodeSize(0),
222 fStringInputPointer(NULL),
223 fRegBol(NULL),
224 fStartPArrayPointer(NULL),
225 fEndPArrayPointer(NULL)
230 RegExp::RegExp(const char* pattern)
232 fError(B_OK),
233 fRegExp(NULL)
235 fRegExp = Compile(pattern);
239 RegExp::RegExp(const BString& pattern)
241 fError(B_OK),
242 fRegExp(NULL)
244 fRegExp = Compile(pattern.String());
248 RegExp::~RegExp()
250 free(fRegExp);
254 status_t
255 RegExp::InitCheck() const
257 return fError;
261 status_t
262 RegExp::SetTo(const char* pattern)
264 fError = B_OK;
265 free(fRegExp);
266 fRegExp = Compile(pattern);
267 return fError;
271 status_t
272 RegExp::SetTo(const BString& pattern)
274 fError = B_OK;
275 free(fRegExp);
276 fRegExp = Compile(pattern.String());
277 return fError;
281 bool
282 RegExp::Matches(const char* string) const
284 if (fRegExp == NULL || string == NULL)
285 return false;
287 return RunMatcher(fRegExp, string) == 1;
291 bool
292 RegExp::Matches(const BString& string) const
294 if (fRegExp == NULL)
295 return false;
297 return RunMatcher(fRegExp, string.String()) == 1;
302 // - Compile - compile a regular expression into internal code
304 // We can't allocate space until we know how big the compiled form will be,
305 // but we can't compile it (and thus know how big it is) until we've got a
306 // place to put the code. So we cheat: we compile it twice, once with code
307 // generation turned off and size counting turned on, and once "for real".
308 // This also means that we don't allocate space until we are sure that the
309 // thing really will compile successfully, and we never have to move the
310 // code and thus invalidate pointers into it. (Note that it has to be in
311 // one piece because free() must be able to free it all.)
313 // Beware that the optimization-preparation code in here knows about some
314 // of the structure of the compiled regexp.
315 regexp*
316 RegExp::Compile(const char* exp)
318 regexp* r;
319 const char* scan;
320 const char* longest;
321 int32 len;
322 int32 flags;
324 if (exp == NULL) {
325 SetError(B_BAD_VALUE);
326 return NULL;
329 // First pass: determine size, legality.
330 fInputScanPointer = exp;
331 fParenthesisCount = 1;
332 fCodeSize = 0L;
333 fCodeEmitPointer = &fDummy;
334 Char(kRegExpMagic);
335 if (Reg(0, &flags) == NULL)
336 return NULL;
338 // Small enough for pointer-storage convention?
339 if (fCodeSize >= kMaxSize) {
340 SetError(REGEXP_TOO_BIG);
341 return NULL;
344 r = (regexp*)malloc(sizeof(regexp) + fCodeSize);
345 // Allocate space
347 if (r == NULL) {
348 SetError(B_NO_MEMORY);
349 return NULL;
352 // Second pass: emit code.
353 fInputScanPointer = exp;
354 fParenthesisCount = 1;
355 fCodeEmitPointer = r->program;
356 Char(kRegExpMagic);
357 if (Reg(0, &flags) == NULL) {
358 free(r);
359 return NULL;
362 // Dig out information for optimizations.
363 r->regstart = '\0';
364 // Worst-case defaults.
365 r->reganch = 0;
366 r->regmust = NULL;
367 r->regmlen = 0;
368 scan = r->program + 1;
369 // First kRegExpBranch.
370 if (*Next((char*)scan) == kRegExpEnd) {
371 // Only one top-level choice.
372 scan = Operand(scan);
374 // Starting-point info.
375 if (*scan == kRegExpExactly)
376 r->regstart = *Operand(scan);
377 else if (*scan == kRegExpBol)
378 r->reganch++;
380 // If there's something expensive in the r.e., find the
381 // longest literal string that must appear and make it the
382 // regmust. Resolve ties in favor of later strings, since
383 // the regstart check works with the beginning of the r.e.
384 // and avoiding duplication strengthens checking. Not a
385 // strong reason, but sufficient in the absence of others.
386 if ((flags & kSPStart) != 0) {
387 longest = NULL;
388 len = 0;
389 for (; scan != NULL; scan = Next((char*)scan)) {
390 if (*scan == kRegExpExactly
391 && (int32)strlen(Operand(scan)) >= len) {
392 longest = Operand(scan);
393 len = (int32)strlen(Operand(scan));
396 r->regmust = longest;
397 r->regmlen = len;
401 return r;
405 regexp*
406 RegExp::Expression() const
408 return fRegExp;
412 const char*
413 RegExp::ErrorString() const
415 if (fError >= REGEXP_UNMATCHED_PARENTHESIS
416 && fError <= REGEXP_CORRUPTED_OPCODE) {
417 return B_TRANSLATE_NOCOLLECT(
418 kRegExpErrorStringArray[fError - B_ERRORS_END]);
421 return strerror(fError);
425 void
426 RegExp::SetError(status_t error) const
428 fError = error;
433 // - Reg - regular expression, i.e. main body or parenthesized thing
435 // Caller must absorb opening parenthesis.
437 // Combining parenthesis handling with the base level of regular expression
438 // is a trifle forced, but the need to tie the tails of the branches to what
439 // follows makes it hard to avoid.
441 char*
442 RegExp::Reg(int32 paren, int32* flagp)
444 char* ret;
445 char* br;
446 char* ender;
447 int32 parno = 0;
448 int32 flags;
450 *flagp = kHasWidth;
451 // Tentatively.
453 // Make an kRegExpOpen node, if parenthesized.
454 if (paren) {
455 if (fParenthesisCount >= kSubExpressionMax) {
456 SetError(REGEXP_TOO_MANY_PARENTHESIS);
457 return NULL;
459 parno = fParenthesisCount;
460 fParenthesisCount++;
461 ret = Node((char)(kRegExpOpen + parno));
462 } else
463 ret = NULL;
465 // Pick up the branches, linking them together.
466 br = Branch(&flags);
467 if (br == NULL)
468 return NULL;
470 if (ret != NULL) {
471 Tail(ret, br);
472 // kRegExpOpen -> first
473 } else
474 ret = br;
476 if (!(flags & kHasWidth))
477 *flagp &= ~kHasWidth;
479 *flagp |= flags&kSPStart;
480 while (*fInputScanPointer == '|') {
481 fInputScanPointer++;
482 br = Branch(&flags);
483 if (br == NULL)
484 return NULL;
486 Tail(ret, br);
487 // kRegExpBranch -> kRegExpBranch.
488 if (!(flags & kHasWidth))
489 *flagp &= ~kHasWidth;
491 *flagp |= flags&kSPStart;
494 // Make a closing node, and hook it on the end.
495 ender = Node(paren ? (char)(kRegExpClose + parno) : (char)kRegExpEnd);
496 Tail(ret, ender);
498 // Hook the tails of the branches to the closing node.
499 for (br = ret; br != NULL; br = Next(br))
500 OpTail(br, ender);
502 // Check for proper termination.
503 if (paren && *fInputScanPointer++ != ')') {
504 SetError(REGEXP_UNMATCHED_PARENTHESIS);
505 return NULL;
506 } else if (!paren && *fInputScanPointer != '\0') {
507 if (*fInputScanPointer == ')') {
508 SetError(REGEXP_UNMATCHED_PARENTHESIS);
509 return NULL;
510 } else {
511 SetError(REGEXP_JUNK_ON_END);
512 return NULL; // "Can't happen".
514 // NOTREACHED
517 return ret;
522 // - Branch - one alternative of an | operator
524 // Implements the concatenation operator.
526 char*
527 RegExp::Branch(int32* flagp)
529 char* ret;
530 char* chain;
531 char* latest;
532 int32 flags;
534 *flagp = kWorst;
535 // Tentatively.
537 ret = Node(kRegExpBranch);
538 chain = NULL;
539 while (*fInputScanPointer != '\0'
540 && *fInputScanPointer != '|'
541 && *fInputScanPointer != ')') {
542 latest = Piece(&flags);
543 if (latest == NULL)
544 return NULL;
546 *flagp |= flags & kHasWidth;
547 if (chain == NULL) {
548 // First piece.
549 *flagp |= flags & kSPStart;
550 } else
551 Tail(chain, latest);
553 chain = latest;
556 if (chain == NULL) {
557 // Loop ran zero times.
558 Node(kRegExpNothing);
561 return ret;
566 // - Piece - something followed by possible [*+?]
568 // Note that the branching code sequences used for ? and the general cases
569 // of * and + are somewhat optimized: they use the same kRegExpNothing node
570 // as both the endmarker for their branch list and the body of the last
571 // branch. It might seem that this node could be dispensed with entirely,
572 // but the endmarker role is not redundant.
574 char*
575 RegExp::Piece(int32* flagp)
577 char* ret;
578 char op;
579 char* next;
580 int32 flags;
582 ret = Atom(&flags);
583 if (ret == NULL)
584 return NULL;
586 op = *fInputScanPointer;
587 if (!IsMult(op)) {
588 *flagp = flags;
589 return ret;
592 if (!(flags & kHasWidth) && op != '?') {
593 SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY);
594 return NULL;
596 *flagp = op != '+' ? kWorst | kSPStart : kWorst | kHasWidth;
598 if (op == '*' && (flags & kSimple))
599 Insert(kRegExpStar, ret);
600 else if (op == '*') {
601 // Emit x* as (x&|), where & means "self".
602 Insert(kRegExpBranch, ret);
603 // Either x
604 OpTail(ret, Node(kRegExpBack));
605 // and loop
606 OpTail(ret, ret);
607 // back
608 Tail(ret, Node(kRegExpBranch));
609 // or
610 Tail(ret, Node(kRegExpNothing));
611 // null.
612 } else if (op == '+' && (flags & kSimple))
613 Insert(kRegExpPlus, ret);
614 else if (op == '+') {
615 // Emit x+ as x(&|), where & means "self".
616 next = Node(kRegExpBranch);
617 // Either
618 Tail(ret, next);
619 Tail(Node(kRegExpBack), ret);
620 // loop back
621 Tail(next, Node(kRegExpBranch));
622 // or
623 Tail(ret, Node(kRegExpNothing));
624 // null.
625 } else if (op == '?') {
626 // Emit x? as (x|)
627 Insert(kRegExpBranch, ret);
628 // Either x
629 Tail(ret, Node(kRegExpBranch));
630 // or
631 next = Node(kRegExpNothing);
632 // null.
633 Tail(ret, next);
634 OpTail(ret, next);
637 fInputScanPointer++;
638 if (IsMult(*fInputScanPointer)) {
639 SetError(REGEXP_NESTED_STAR_QUESTION_PLUS);
640 return NULL;
643 return ret;
648 // - Atom - the lowest level
650 // Optimization: gobbles an entire sequence of ordinary characters so that
651 // it can turn them into a single node, which is smaller to store and
652 // faster to run. Backslashed characters are exceptions, each becoming a
653 // separate node; the code is simpler that way and it's not worth fixing.
655 char*
656 RegExp::Atom(int32* flagp)
658 char* ret;
659 int32 flags;
661 *flagp = kWorst;
662 // tentatively
664 switch (*fInputScanPointer++) {
665 case '^':
666 ret = Node(kRegExpBol);
667 break;
669 case '$':
670 ret = Node(kRegExpEol);
671 break;
673 case '.':
674 ret = Node(kRegExpAny);
675 *flagp |= kHasWidth|kSimple;
676 break;
678 case '[':
680 int32 cclass;
681 int32 classend;
683 if (*fInputScanPointer == '^') {
684 // complement of range
685 ret = Node(kRegExpAnyBut);
686 fInputScanPointer++;
687 } else
688 ret = Node(kRegExpAnyOf);
689 if (*fInputScanPointer == ']' || *fInputScanPointer == '-')
690 Char(*fInputScanPointer++);
691 while (*fInputScanPointer != '\0'
692 && *fInputScanPointer != ']') {
693 if (*fInputScanPointer == '-') {
694 fInputScanPointer++;
695 if (*fInputScanPointer == ']'
696 || *fInputScanPointer == '\0') {
697 Char('-');
698 } else {
699 cclass = UCharAt(fInputScanPointer - 2) + 1;
700 classend = UCharAt(fInputScanPointer);
701 if (cclass > classend + 1) {
702 SetError(REGEXP_INVALID_BRACKET_RANGE);
703 return NULL;
705 for (; cclass <= classend; cclass++)
706 Char((char)cclass);
707 fInputScanPointer++;
709 } else
710 Char(*fInputScanPointer++);
712 Char('\0');
713 if (*fInputScanPointer != ']') {
714 SetError(REGEXP_UNMATCHED_BRACKET);
715 return NULL;
717 fInputScanPointer++;
718 *flagp |= kHasWidth | kSimple;
719 break;
722 case '(':
723 ret = Reg(1, &flags);
724 if (ret == NULL)
725 return NULL;
726 *flagp |= flags & (kHasWidth | kSPStart);
727 break;
729 case '\0':
730 case '|':
731 case ')':
732 SetError(REGEXP_INTERNAL_ERROR);
733 return NULL;
734 // supposed to be caught earlier
736 case '?':
737 case '+':
738 case '*':
739 SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING);
740 return NULL;
742 case '\\':
743 if (*fInputScanPointer == '\0') {
744 SetError(REGEXP_TRAILING_BACKSLASH);
745 return NULL;
747 ret = Node(kRegExpExactly);
748 Char(*fInputScanPointer++);
749 Char('\0');
750 *flagp |= kHasWidth|kSimple;
751 break;
753 default:
755 int32 len;
756 char ender;
758 fInputScanPointer--;
759 len = (int32)strcspn(fInputScanPointer, kMeta);
760 if (len <= 0) {
761 SetError(REGEXP_INTERNAL_ERROR);
762 return NULL;
765 ender = *(fInputScanPointer + len);
766 if (len > 1 && IsMult(ender)) {
767 // Back off clear of ?+* operand.
768 len--;
771 *flagp |= kHasWidth;
772 if (len == 1)
773 *flagp |= kSimple;
775 ret = Node(kRegExpExactly);
776 while (len > 0) {
777 Char(*fInputScanPointer++);
778 len--;
781 Char('\0');
782 break;
786 return ret;
791 // - Node - emit a node
793 // Returns location.
795 char*
796 RegExp::Node(char op)
798 char* ret;
799 char* ptr;
801 ret = fCodeEmitPointer;
802 if (ret == &fDummy) {
803 fCodeSize += 3;
804 return ret;
807 ptr = ret;
808 *ptr++ = op;
809 *ptr++ = '\0';
810 // Null "next" pointer.
811 *ptr++ = '\0';
812 fCodeEmitPointer = ptr;
814 return ret;
819 // - Char - emit (if appropriate) a byte of code
821 void
822 RegExp::Char(char b)
824 if (fCodeEmitPointer != &fDummy)
825 *fCodeEmitPointer++ = b;
826 else
827 fCodeSize++;
832 // - Insert - insert an operator in front of already-emitted operand
834 // Means relocating the operand.
836 void
837 RegExp::Insert(char op, char* opnd)
839 char* src;
840 char* dst;
841 char* place;
843 if (fCodeEmitPointer == &fDummy) {
844 fCodeSize += 3;
845 return;
848 src = fCodeEmitPointer;
849 fCodeEmitPointer += 3;
850 dst = fCodeEmitPointer;
851 while (src > opnd)
852 *--dst = *--src;
854 place = opnd;
855 // Op node, where operand used to be.
856 *place++ = op;
857 *place++ = '\0';
858 *place++ = '\0';
863 // - Tail - set the next-pointer at the end of a node chain
865 void
866 RegExp::Tail(char* p, char* val)
868 char* scan;
869 char* temp;
870 int32 offset;
872 if (p == &fDummy)
873 return;
875 // Find last node.
876 scan = p;
877 for (;;) {
878 temp = Next(scan);
879 if (temp == NULL)
880 break;
881 scan = temp;
884 if (scan[0] == kRegExpBack)
885 offset = scan - val;
886 else
887 offset = val - scan;
889 scan[1] = (char)((offset >> 8) & 0377);
890 scan[2] = (char)(offset & 0377);
895 // - OpTail - Tail on operand of first argument; nop if operandless
897 void
898 RegExp::OpTail(char* p, char* val)
900 // "Operandless" and "op != kRegExpBranch" are synonymous in practice.
901 if (p == NULL || p == &fDummy || *p != kRegExpBranch)
902 return;
904 Tail(Operand(p), val);
908 // RunMatcher and friends
913 // - RunMatcher - match a regexp against a string
915 int32
916 RegExp::RunMatcher(regexp* prog, const char* string) const
918 const char* s;
920 // be paranoid...
921 if (prog == NULL || string == NULL) {
922 SetError(B_BAD_VALUE);
923 return 0;
926 // check validity of program
927 if (UCharAt(prog->program) != kRegExpMagic) {
928 SetError(REGEXP_CORRUPTED_PROGRAM);
929 return 0;
932 // if there is a "must appear" string, look for it
933 if (prog->regmust != NULL) {
934 s = string;
935 while ((s = strchr(s, prog->regmust[0])) != NULL) {
936 if (strncmp(s, prog->regmust, (size_t)prog->regmlen) == 0) {
937 // found it
938 break;
940 s++;
942 if (s == NULL) {
943 // not present
944 return 0;
948 // mark beginning of line for ^
949 fRegBol = string;
951 // simplest case: anchored match need be tried only once
952 if (prog->reganch)
953 return Try(prog, (char*)string);
955 // messy cases: unanchored match
956 s = string;
957 if (prog->regstart != '\0') {
958 // we know what char it must start with
959 while ((s = strchr(s, prog->regstart)) != NULL) {
960 if (Try(prog, (char*)s))
961 return 1;
962 s++;
964 } else {
965 // we don't -- general case.
966 do {
967 if (Try(prog, (char*)s))
968 return 1;
969 } while (*s++ != '\0');
972 // failure
973 return 0;
978 // - Try - try match at specific point
980 // Returns 0 on failure, 1 on success.
982 int32
983 RegExp::Try(regexp* prog, const char* string) const
985 int32 i;
986 const char **sp;
987 const char **ep;
989 fStringInputPointer = string;
990 fStartPArrayPointer = prog->startp;
991 fEndPArrayPointer = prog->endp;
993 sp = prog->startp;
994 ep = prog->endp;
995 for (i = kSubExpressionMax; i > 0; i--) {
996 *sp++ = NULL;
997 *ep++ = NULL;
999 if (Match(prog->program + 1)) {
1000 prog->startp[0] = string;
1001 prog->endp[0] = fStringInputPointer;
1002 return 1;
1003 } else
1004 return 0;
1009 // - Match - main matching routine
1011 // Conceptually the strategy is simple: check to see whether the current
1012 // node matches, call self recursively to see whether the rest matches,
1013 // and then act accordingly. In practice we make some effort to avoid
1014 // recursion, in particular by going through "ordinary" nodes (that don't
1015 // need to know whether the rest of the match failed) by a loop instead of
1016 // by recursion.
1018 // Returns 0 on failure, 1 on success.
1020 int32
1021 RegExp::Match(const char* prog) const
1023 const char* scan; // Current node.
1024 const char* next; // Next node.
1026 scan = prog;
1027 #ifdef DEBUG
1028 if (scan != NULL && regnarrate)
1029 fprintf(stderr, "%s(\n", Prop(scan));
1030 #endif
1031 while (scan != NULL) {
1032 #ifdef DEBUG
1033 if (regnarrate)
1034 fprintf(stderr, "%s...\n", Prop(scan));
1035 #endif
1036 next = Next(scan);
1038 switch (*scan) {
1039 case kRegExpBol:
1040 if (fStringInputPointer != fRegBol)
1041 return 0;
1042 break;
1044 case kRegExpEol:
1045 if (*fStringInputPointer != '\0')
1046 return 0;
1047 break;
1049 case kRegExpAny:
1050 if (*fStringInputPointer == '\0')
1051 return 0;
1052 fStringInputPointer++;
1053 break;
1055 case kRegExpExactly:
1057 const char* opnd = Operand(scan);
1058 // Inline the first character, for speed.
1059 if (*opnd != *fStringInputPointer)
1060 return 0;
1062 uint32 len = strlen(opnd);
1063 if (len > 1
1064 && strncmp(opnd, fStringInputPointer, len) != 0) {
1065 return 0;
1068 fStringInputPointer += len;
1070 break;
1072 case kRegExpAnyOf:
1073 if (*fStringInputPointer == '\0'
1074 || strchr(Operand(scan), *fStringInputPointer) == NULL) {
1075 return 0;
1077 fStringInputPointer++;
1078 break;
1080 case kRegExpAnyBut:
1081 if (*fStringInputPointer == '\0'
1082 || strchr(Operand(scan), *fStringInputPointer) != NULL) {
1083 return 0;
1085 fStringInputPointer++;
1086 break;
1088 case kRegExpNothing:
1089 break;
1091 case kRegExpBack:
1092 break;
1094 case kRegExpOpen + 1:
1095 case kRegExpOpen + 2:
1096 case kRegExpOpen + 3:
1097 case kRegExpOpen + 4:
1098 case kRegExpOpen + 5:
1099 case kRegExpOpen + 6:
1100 case kRegExpOpen + 7:
1101 case kRegExpOpen + 8:
1102 case kRegExpOpen + 9:
1104 int32 no;
1105 const char* save;
1107 no = *scan - kRegExpOpen;
1108 save = fStringInputPointer;
1110 if (Match(next)) {
1112 // Don't set startp if some later
1113 // invocation of the same parentheses
1114 // already has.
1116 if (fStartPArrayPointer[no] == NULL)
1117 fStartPArrayPointer[no] = save;
1118 return 1;
1119 } else
1120 return 0;
1122 break;
1124 case kRegExpClose + 1:
1125 case kRegExpClose + 2:
1126 case kRegExpClose + 3:
1127 case kRegExpClose + 4:
1128 case kRegExpClose + 5:
1129 case kRegExpClose + 6:
1130 case kRegExpClose + 7:
1131 case kRegExpClose + 8:
1132 case kRegExpClose + 9:
1134 int32 no;
1135 const char* save;
1137 no = *scan - kRegExpClose;
1138 save = fStringInputPointer;
1140 if (Match(next)) {
1142 // Don't set endp if some later
1143 // invocation of the same parentheses
1144 // already has.
1146 if (fEndPArrayPointer[no] == NULL)
1147 fEndPArrayPointer[no] = save;
1148 return 1;
1149 } else
1150 return 0;
1152 break;
1154 case kRegExpBranch:
1156 const char* save;
1158 if (*next != kRegExpBranch) {
1159 // no choice
1160 next = Operand(scan);
1161 // avoid recursion
1162 } else {
1163 do {
1164 save = fStringInputPointer;
1165 if (Match(Operand(scan)))
1166 return 1;
1167 fStringInputPointer = save;
1168 scan = Next(scan);
1169 } while (scan != NULL && *scan == kRegExpBranch);
1170 return 0;
1171 // NOTREACHED
1174 break;
1176 case kRegExpStar:
1177 case kRegExpPlus:
1179 char nextch;
1180 int32 no;
1181 const char* save;
1182 int32 min;
1185 // Lookahead to avoid useless match attempts
1186 // when we know what character comes next.
1188 nextch = '\0';
1189 if (*next == kRegExpExactly)
1190 nextch = *Operand(next);
1191 min = (*scan == kRegExpStar) ? 0 : 1;
1192 save = fStringInputPointer;
1193 no = Repeat(Operand(scan));
1194 while (no >= min) {
1195 // if it could work, try it
1196 if (nextch == '\0' || *fStringInputPointer == nextch) {
1197 if (Match(next))
1198 return 1;
1200 // couldn't or didn't, back up
1201 no--;
1202 fStringInputPointer = save + no;
1204 return 0;
1206 break;
1208 case kRegExpEnd:
1209 return 1;
1210 // Success!
1212 default:
1213 SetError(REGEXP_MEMORY_CORRUPTION);
1214 return 0;
1217 scan = next;
1221 // We get here only if there's trouble -- normally "case kRegExpEnd" is
1222 // the terminating point.
1224 SetError(REGEXP_CORRUPTED_POINTERS);
1226 return 0;
1231 // - Repeat - repeatedly match something simple, report how many
1233 int32
1234 RegExp::Repeat(const char* p) const
1236 int32 count = 0;
1237 const char* scan;
1238 const char* opnd;
1240 scan = fStringInputPointer;
1241 opnd = Operand(p);
1242 switch (*p) {
1243 case kRegExpAny:
1244 count = (int32)strlen(scan);
1245 scan += count;
1246 break;
1248 case kRegExpExactly:
1249 while (*opnd == *scan) {
1250 count++;
1251 scan++;
1253 break;
1255 case kRegExpAnyOf:
1256 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1257 count++;
1258 scan++;
1260 break;
1262 case kRegExpAnyBut:
1263 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1264 count++;
1265 scan++;
1267 break;
1269 default:
1270 // oh dear, called inappropriately
1271 SetError(REGEXP_INTERNAL_ERROR);
1272 count = 0;
1273 // best compromise
1274 break;
1276 fStringInputPointer = scan;
1278 return count;
1283 // - Next - dig the "next" pointer out of a node
1285 char*
1286 RegExp::Next(char* p)
1288 int32 offset;
1290 if (p == &fDummy)
1291 return NULL;
1293 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1294 if (offset == 0)
1295 return NULL;
1297 if (*p == kRegExpBack)
1298 return p - offset;
1299 else
1300 return p + offset;
1304 const char*
1305 RegExp::Next(const char* p) const
1307 int32 offset;
1309 if (p == &fDummy)
1310 return NULL;
1312 offset = ((*(p + 1) & 0377) << 8) + (*(p + 2) & 0377);
1313 if (offset == 0)
1314 return NULL;
1316 if (*p == kRegExpBack)
1317 return p - offset;
1318 else
1319 return p + offset;
1323 inline int32
1324 RegExp::UCharAt(const char* p) const
1326 return (int32)*(unsigned char *)p;
1330 inline char*
1331 RegExp::Operand(char* p) const
1333 return p + 3;
1337 inline const char*
1338 RegExp::Operand(const char* p) const
1340 return p + 3;
1344 inline bool
1345 RegExp::IsMult(char c) const
1347 return c == '*' || c == '+' || c == '?';
1351 #ifdef DEBUG
1355 // - Dump - dump a regexp onto stdout in vaguely comprehensible form
1357 void
1358 RegExp::Dump()
1360 const char* s;
1361 char op = kRegExpExactly;
1362 // Arbitrary non-kRegExpEnd op.
1363 const char* next;
1365 s = fRegExp->program + 1;
1366 while (op != kRegExpEnd) {
1367 // While that wasn't kRegExpEnd last time...
1368 op = *s;
1369 printf("%2ld%s", s - fRegExp->program, Prop(s));
1370 // Where, what.
1371 next = Next(s);
1372 if (next == NULL) {
1373 // next ptr
1374 printf("(0)");
1375 } else
1376 printf("(%ld)", (s - fRegExp->program) + (next - s));
1378 s += 3;
1379 if (op == kRegExpAnyOf || op == kRegExpAnyBut
1380 || op == kRegExpExactly) {
1381 // literal string, where present
1382 while (*s != '\0') {
1383 putchar(*s);
1384 s++;
1386 s++;
1388 putchar('\n');
1391 // header fields of interest
1392 if (fRegExp->regstart != '\0')
1393 printf("start `%c' ", fRegExp->regstart);
1395 if (fRegExp->reganch)
1396 printf("anchored ");
1398 if (fRegExp->regmust != NULL)
1399 printf("must have \"%s\"", fRegExp->regmust);
1401 printf("\n");
1406 // - Prop - printable representation of opcode
1408 char*
1409 RegExp::Prop(const char* op) const
1411 const char* p = NULL;
1412 static char buf[50];
1414 strcpy(buf, ":");
1416 switch (*op) {
1417 case kRegExpBol:
1418 p = "kRegExpBol";
1419 break;
1421 case kRegExpEol:
1422 p = "kRegExpEol";
1423 break;
1425 case kRegExpAny:
1426 p = "kRegExpAny";
1427 break;
1429 case kRegExpAnyOf:
1430 p = "kRegExpAnyOf";
1431 break;
1433 case kRegExpAnyBut:
1434 p = "kRegExpAnyBut";
1435 break;
1437 case kRegExpBranch:
1438 p = "kRegExpBranch";
1439 break;
1441 case kRegExpExactly:
1442 p = "kRegExpExactly";
1443 break;
1445 case kRegExpNothing:
1446 p = "kRegExpNothing";
1447 break;
1449 case kRegExpBack:
1450 p = "kRegExpBack";
1451 break;
1453 case kRegExpEnd:
1454 p = "kRegExpEnd";
1455 break;
1457 case kRegExpOpen + 1:
1458 case kRegExpOpen + 2:
1459 case kRegExpOpen + 3:
1460 case kRegExpOpen + 4:
1461 case kRegExpOpen + 5:
1462 case kRegExpOpen + 6:
1463 case kRegExpOpen + 7:
1464 case kRegExpOpen + 8:
1465 case kRegExpOpen + 9:
1466 sprintf(buf + strlen(buf), "kRegExpOpen%d", *op - kRegExpOpen);
1467 p = NULL;
1468 break;
1470 case kRegExpClose + 1:
1471 case kRegExpClose + 2:
1472 case kRegExpClose + 3:
1473 case kRegExpClose + 4:
1474 case kRegExpClose + 5:
1475 case kRegExpClose + 6:
1476 case kRegExpClose + 7:
1477 case kRegExpClose + 8:
1478 case kRegExpClose + 9:
1479 sprintf(buf + strlen(buf), "kRegExpClose%d", *op - kRegExpClose);
1480 p = NULL;
1481 break;
1483 case kRegExpStar:
1484 p = "kRegExpStar";
1485 break;
1487 case kRegExpPlus:
1488 p = "kRegExpPlus";
1489 break;
1491 default:
1492 RegExpError("corrupted opcode");
1493 break;
1496 if (p != NULL)
1497 strcat(buf, p);
1499 return buf;
1503 void
1504 RegExp::RegExpError(const char*) const
1506 // does nothing now, perhaps it should printf?
1509 #endif // DEBUG