6 Copyright (c) 1991-2000, Be Incorporated. All rights reserved.
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
29 Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
35 // This code is based on regexp.c, v.1.3 by Henry Spencer:
37 // @(#)regexp.c 1.3 of 18 April 87
39 // Copyright (c) 1986 by University of Toronto.
40 // Written by Henry Spencer. Not derived from licensed software.
42 // Permission is granted to anyone to use this software for any
43 // purpose on any computer system, and to redistribute it freely,
44 // subject to the following restrictions:
46 // 1. The author is not responsible for the consequences of use of
47 // this software, no matter how awful, even if they arise
48 // from defects in it.
50 // 2. The origin of this software must not be misrepresented, either
51 // by explicit claim or by omission.
53 // 3. Altered versions must be plainly marked as such, and must not
54 // be misrepresented as being the original software.
56 // Beware that some of this code is subtly aware of the way operator
57 // precedence is structured in regular expressions. Serious changes in
58 // regular-expression syntax might require a total rethink.
61 // ALTERED VERSION: Adapted to ANSI C and C++ for the OpenTracker
62 // project (www.opentracker.org), Jul 11, 2000.
75 #undef B_TRANSLATION_CONTEXT
76 #define B_TRANSLATION_CONTEXT "libtracker"
79 // The first byte of the regexp internal "program" is actually this magic
80 // number; the start node begins in the second byte.
82 const uint8 kRegExpMagic
= 0234;
84 // The "internal use only" fields in RegExp.h are present to pass info from
85 // compile to execute that permits the execute phase to run lots faster on
86 // simple cases. They are:
88 // regstart char that must begin a match; '\0' if none obvious
89 // reganch is the match anchored (at beginning-of-line only)?
90 // regmust string (pointer into program) that match must include, or NULL
91 // regmlen length of regmust string
93 // Regstart and reganch permit very fast decisions on suitable starting points
94 // for a match, cutting down the work a lot. Regmust permits fast rejection
95 // of lines that cannot possibly match. The regmust tests are costly enough
96 // that Compile() supplies a regmust only if the r.e. contains something
97 // potentially expensive (at present, the only such thing detected is * or +
98 // at the start of the r.e., which can involve a lot of backup). Regmlen is
99 // supplied because the test in RunMatcher() needs it and Compile() is
100 // computing it anyway.
104 // Structure for regexp "program". This is essentially a linear encoding
105 // of a nondeterministic finite-state machine (aka syntax charts or
106 // "railroad normal form" in parsing technology). Each node is an opcode
107 // plus a "next" pointer, possibly plus an operand. "Next" pointers of
108 // all nodes except kRegExpBranch implement concatenation; a "next" pointer
109 // with a kRegExpBranch on both ends of it is connecting two alternatives.
110 // (Here we have one of the subtle syntax dependencies: an individual
111 // kRegExpBranch (as opposed to a collection of them) is never concatenated
112 // with anything because of operator precedence.) The operand of some types
113 // of node is a literal string; for others, it is a node leading into a
114 // sub-FSM. In particular, the operand of a kRegExpBranch node is the first
115 // node of the branch. (NB this is* not* a tree structure: the tail of the
116 // branch connects to the thing following the set of kRegExpBranches).
120 // definition number opnd? meaning
122 kRegExpEnd
= 0, // no End of program.
123 kRegExpBol
= 1, // no Match "" at beginning of line.
124 kRegExpEol
= 2, // no Match "" at end of line.
125 kRegExpAny
= 3, // no Match any one character.
126 kRegExpAnyOf
= 4, // str Match any character in this string.
127 kRegExpAnyBut
= 5, // str Match any character not in this string.
128 kRegExpBranch
= 6, // node Match this alternative, or the next...
129 kRegExpBack
= 7, // no Match "", "next" ptr points backward.
130 kRegExpExactly
= 8, // str Match this string.
131 kRegExpNothing
= 9, // no Match empty string.
132 kRegExpStar
= 10, // node Match this (simple) thing 0 or more times.
133 kRegExpPlus
= 11, // node Match this (simple) thing 1 or more times.
134 kRegExpOpen
= 20, // no Mark this point in input as start of #n.
135 // kRegExpOpen + 1 is number 1, etc.
136 kRegExpClose
= 30 // no Analogous to kRegExpOpen.
142 // kRegExpBranch The set of branches constituting a single choice are
143 // hooked together with their "next" pointers, since precedence prevents
144 // anything being concatenated to any individual branch. The
145 // "next" pointer of the last kRegExpBranch in a choice points to the
146 // thing following the whole choice. This is also where the
147 // final "next" pointer of each individual branch points; each
148 // branch starts with the operand node of a kRegExpBranch node.
150 // kRegExpBack Normal "next" pointers all implicitly point forward;
151 // kRegExpBack exists to make loop structures possible.
153 // kRegExpStar,kRegExpPlus '?', and complex '*' and '+', are implemented as
154 // circular kRegExpBranch structures using kRegExpBack. Simple cases
155 // (one character per match) are implemented with kRegExpStar and
156 // kRegExpPlus for speed and to minimize recursive plunges.
158 // kRegExpOpen,kRegExpClose ...are numbered at compile time.
162 // A node is one char of opcode followed by two chars of "next" pointer.
163 // "Next" pointers are stored as two 8-bit pieces, high order first. The
164 // value is a positive offset from the opcode of the node containing it.
165 // An operand, if any, simply follows the node. (Note that much of the
166 // code generation knows about this implicit relationship.)
168 // Using two bytes for the "next" pointer is vast overkill for most things,
169 // but allows patterns to get big without disasters.
172 const char* kMeta
= "^$.[()|?+*\\";
173 const int32 kMaxSize
= 32767L;
174 // Probably could be 65535L.
177 // Flags to be passed up and down:
179 kHasWidth
= 01, // Known never to match null string.
180 kSimple
= 02, // Simple enough to be kRegExpStar/kRegExpPlus operand.
181 kSPStart
= 04, // Starts with * or +.
182 kWorst
= 0 // Worst case.
186 const char* kRegExpErrorStringArray
[] = {
187 B_TRANSLATE_MARK("Unmatched parenthesis."),
188 B_TRANSLATE_MARK("Expression too long."),
189 B_TRANSLATE_MARK("Too many parenthesis."),
190 B_TRANSLATE_MARK("Junk on end."),
191 B_TRANSLATE_MARK("*+? operand may be empty."),
192 B_TRANSLATE_MARK("Nested *?+."),
193 B_TRANSLATE_MARK("Invalid bracket range."),
194 B_TRANSLATE_MARK("Unmatched brackets."),
195 B_TRANSLATE_MARK("Internal error."),
196 B_TRANSLATE_MARK("?+* follows nothing."),
197 B_TRANSLATE_MARK("Trailing \\."),
198 B_TRANSLATE_MARK("Corrupted expression."),
199 B_TRANSLATE_MARK("Memory corruption."),
200 B_TRANSLATE_MARK("Corrupted pointers."),
201 B_TRANSLATE_MARK("Corrupted opcode.")
206 int32 regnarrate
= 0;
210 // #pragma mark - RegExp
217 fInputScanPointer(NULL
),
218 fParenthesisCount(0),
220 fCodeEmitPointer(NULL
),
222 fStringInputPointer(NULL
),
224 fStartPArrayPointer(NULL
),
225 fEndPArrayPointer(NULL
)
230 RegExp::RegExp(const char* pattern
)
235 fRegExp
= Compile(pattern
);
239 RegExp::RegExp(const BString
& pattern
)
244 fRegExp
= Compile(pattern
.String());
255 RegExp::InitCheck() const
262 RegExp::SetTo(const char* pattern
)
266 fRegExp
= Compile(pattern
);
272 RegExp::SetTo(const BString
& pattern
)
276 fRegExp
= Compile(pattern
.String());
282 RegExp::Matches(const char* string
) const
284 if (fRegExp
== NULL
|| string
== NULL
)
287 return RunMatcher(fRegExp
, string
) == 1;
292 RegExp::Matches(const BString
& string
) const
297 return RunMatcher(fRegExp
, string
.String()) == 1;
302 // - Compile - compile a regular expression into internal code
304 // We can't allocate space until we know how big the compiled form will be,
305 // but we can't compile it (and thus know how big it is) until we've got a
306 // place to put the code. So we cheat: we compile it twice, once with code
307 // generation turned off and size counting turned on, and once "for real".
308 // This also means that we don't allocate space until we are sure that the
309 // thing really will compile successfully, and we never have to move the
310 // code and thus invalidate pointers into it. (Note that it has to be in
311 // one piece because free() must be able to free it all.)
313 // Beware that the optimization-preparation code in here knows about some
314 // of the structure of the compiled regexp.
316 RegExp::Compile(const char* exp
)
325 SetError(B_BAD_VALUE
);
329 // First pass: determine size, legality.
330 fInputScanPointer
= exp
;
331 fParenthesisCount
= 1;
333 fCodeEmitPointer
= &fDummy
;
335 if (Reg(0, &flags
) == NULL
)
338 // Small enough for pointer-storage convention?
339 if (fCodeSize
>= kMaxSize
) {
340 SetError(REGEXP_TOO_BIG
);
344 r
= (regexp
*)malloc(sizeof(regexp
) + fCodeSize
);
348 SetError(B_NO_MEMORY
);
352 // Second pass: emit code.
353 fInputScanPointer
= exp
;
354 fParenthesisCount
= 1;
355 fCodeEmitPointer
= r
->program
;
357 if (Reg(0, &flags
) == NULL
) {
362 // Dig out information for optimizations.
364 // Worst-case defaults.
368 scan
= r
->program
+ 1;
369 // First kRegExpBranch.
370 if (*Next((char*)scan
) == kRegExpEnd
) {
371 // Only one top-level choice.
372 scan
= Operand(scan
);
374 // Starting-point info.
375 if (*scan
== kRegExpExactly
)
376 r
->regstart
= *Operand(scan
);
377 else if (*scan
== kRegExpBol
)
380 // If there's something expensive in the r.e., find the
381 // longest literal string that must appear and make it the
382 // regmust. Resolve ties in favor of later strings, since
383 // the regstart check works with the beginning of the r.e.
384 // and avoiding duplication strengthens checking. Not a
385 // strong reason, but sufficient in the absence of others.
386 if ((flags
& kSPStart
) != 0) {
389 for (; scan
!= NULL
; scan
= Next((char*)scan
)) {
390 if (*scan
== kRegExpExactly
391 && (int32
)strlen(Operand(scan
)) >= len
) {
392 longest
= Operand(scan
);
393 len
= (int32
)strlen(Operand(scan
));
396 r
->regmust
= longest
;
406 RegExp::Expression() const
413 RegExp::ErrorString() const
415 if (fError
>= REGEXP_UNMATCHED_PARENTHESIS
416 && fError
<= REGEXP_CORRUPTED_OPCODE
) {
417 return B_TRANSLATE_NOCOLLECT(
418 kRegExpErrorStringArray
[fError
- B_ERRORS_END
]);
421 return strerror(fError
);
426 RegExp::SetError(status_t error
) const
433 // - Reg - regular expression, i.e. main body or parenthesized thing
435 // Caller must absorb opening parenthesis.
437 // Combining parenthesis handling with the base level of regular expression
438 // is a trifle forced, but the need to tie the tails of the branches to what
439 // follows makes it hard to avoid.
442 RegExp::Reg(int32 paren
, int32
* flagp
)
453 // Make an kRegExpOpen node, if parenthesized.
455 if (fParenthesisCount
>= kSubExpressionMax
) {
456 SetError(REGEXP_TOO_MANY_PARENTHESIS
);
459 parno
= fParenthesisCount
;
461 ret
= Node((char)(kRegExpOpen
+ parno
));
465 // Pick up the branches, linking them together.
472 // kRegExpOpen -> first
476 if (!(flags
& kHasWidth
))
477 *flagp
&= ~kHasWidth
;
479 *flagp
|= flags
&kSPStart
;
480 while (*fInputScanPointer
== '|') {
487 // kRegExpBranch -> kRegExpBranch.
488 if (!(flags
& kHasWidth
))
489 *flagp
&= ~kHasWidth
;
491 *flagp
|= flags
&kSPStart
;
494 // Make a closing node, and hook it on the end.
495 ender
= Node(paren
? (char)(kRegExpClose
+ parno
) : (char)kRegExpEnd
);
498 // Hook the tails of the branches to the closing node.
499 for (br
= ret
; br
!= NULL
; br
= Next(br
))
502 // Check for proper termination.
503 if (paren
&& *fInputScanPointer
++ != ')') {
504 SetError(REGEXP_UNMATCHED_PARENTHESIS
);
506 } else if (!paren
&& *fInputScanPointer
!= '\0') {
507 if (*fInputScanPointer
== ')') {
508 SetError(REGEXP_UNMATCHED_PARENTHESIS
);
511 SetError(REGEXP_JUNK_ON_END
);
512 return NULL
; // "Can't happen".
522 // - Branch - one alternative of an | operator
524 // Implements the concatenation operator.
527 RegExp::Branch(int32
* flagp
)
537 ret
= Node(kRegExpBranch
);
539 while (*fInputScanPointer
!= '\0'
540 && *fInputScanPointer
!= '|'
541 && *fInputScanPointer
!= ')') {
542 latest
= Piece(&flags
);
546 *flagp
|= flags
& kHasWidth
;
549 *flagp
|= flags
& kSPStart
;
557 // Loop ran zero times.
558 Node(kRegExpNothing
);
566 // - Piece - something followed by possible [*+?]
568 // Note that the branching code sequences used for ? and the general cases
569 // of * and + are somewhat optimized: they use the same kRegExpNothing node
570 // as both the endmarker for their branch list and the body of the last
571 // branch. It might seem that this node could be dispensed with entirely,
572 // but the endmarker role is not redundant.
575 RegExp::Piece(int32
* flagp
)
586 op
= *fInputScanPointer
;
592 if (!(flags
& kHasWidth
) && op
!= '?') {
593 SetError(REGEXP_STAR_PLUS_OPERAND_EMPTY
);
596 *flagp
= op
!= '+' ? kWorst
| kSPStart
: kWorst
| kHasWidth
;
598 if (op
== '*' && (flags
& kSimple
))
599 Insert(kRegExpStar
, ret
);
600 else if (op
== '*') {
601 // Emit x* as (x&|), where & means "self".
602 Insert(kRegExpBranch
, ret
);
604 OpTail(ret
, Node(kRegExpBack
));
608 Tail(ret
, Node(kRegExpBranch
));
610 Tail(ret
, Node(kRegExpNothing
));
612 } else if (op
== '+' && (flags
& kSimple
))
613 Insert(kRegExpPlus
, ret
);
614 else if (op
== '+') {
615 // Emit x+ as x(&|), where & means "self".
616 next
= Node(kRegExpBranch
);
619 Tail(Node(kRegExpBack
), ret
);
621 Tail(next
, Node(kRegExpBranch
));
623 Tail(ret
, Node(kRegExpNothing
));
625 } else if (op
== '?') {
627 Insert(kRegExpBranch
, ret
);
629 Tail(ret
, Node(kRegExpBranch
));
631 next
= Node(kRegExpNothing
);
638 if (IsMult(*fInputScanPointer
)) {
639 SetError(REGEXP_NESTED_STAR_QUESTION_PLUS
);
648 // - Atom - the lowest level
650 // Optimization: gobbles an entire sequence of ordinary characters so that
651 // it can turn them into a single node, which is smaller to store and
652 // faster to run. Backslashed characters are exceptions, each becoming a
653 // separate node; the code is simpler that way and it's not worth fixing.
656 RegExp::Atom(int32
* flagp
)
664 switch (*fInputScanPointer
++) {
666 ret
= Node(kRegExpBol
);
670 ret
= Node(kRegExpEol
);
674 ret
= Node(kRegExpAny
);
675 *flagp
|= kHasWidth
|kSimple
;
683 if (*fInputScanPointer
== '^') {
684 // complement of range
685 ret
= Node(kRegExpAnyBut
);
688 ret
= Node(kRegExpAnyOf
);
689 if (*fInputScanPointer
== ']' || *fInputScanPointer
== '-')
690 Char(*fInputScanPointer
++);
691 while (*fInputScanPointer
!= '\0'
692 && *fInputScanPointer
!= ']') {
693 if (*fInputScanPointer
== '-') {
695 if (*fInputScanPointer
== ']'
696 || *fInputScanPointer
== '\0') {
699 cclass
= UCharAt(fInputScanPointer
- 2) + 1;
700 classend
= UCharAt(fInputScanPointer
);
701 if (cclass
> classend
+ 1) {
702 SetError(REGEXP_INVALID_BRACKET_RANGE
);
705 for (; cclass
<= classend
; cclass
++)
710 Char(*fInputScanPointer
++);
713 if (*fInputScanPointer
!= ']') {
714 SetError(REGEXP_UNMATCHED_BRACKET
);
718 *flagp
|= kHasWidth
| kSimple
;
723 ret
= Reg(1, &flags
);
726 *flagp
|= flags
& (kHasWidth
| kSPStart
);
732 SetError(REGEXP_INTERNAL_ERROR
);
734 // supposed to be caught earlier
739 SetError(REGEXP_QUESTION_PLUS_STAR_FOLLOWS_NOTHING
);
743 if (*fInputScanPointer
== '\0') {
744 SetError(REGEXP_TRAILING_BACKSLASH
);
747 ret
= Node(kRegExpExactly
);
748 Char(*fInputScanPointer
++);
750 *flagp
|= kHasWidth
|kSimple
;
759 len
= (int32
)strcspn(fInputScanPointer
, kMeta
);
761 SetError(REGEXP_INTERNAL_ERROR
);
765 ender
= *(fInputScanPointer
+ len
);
766 if (len
> 1 && IsMult(ender
)) {
767 // Back off clear of ?+* operand.
775 ret
= Node(kRegExpExactly
);
777 Char(*fInputScanPointer
++);
791 // - Node - emit a node
796 RegExp::Node(char op
)
801 ret
= fCodeEmitPointer
;
802 if (ret
== &fDummy
) {
810 // Null "next" pointer.
812 fCodeEmitPointer
= ptr
;
819 // - Char - emit (if appropriate) a byte of code
824 if (fCodeEmitPointer
!= &fDummy
)
825 *fCodeEmitPointer
++ = b
;
832 // - Insert - insert an operator in front of already-emitted operand
834 // Means relocating the operand.
837 RegExp::Insert(char op
, char* opnd
)
843 if (fCodeEmitPointer
== &fDummy
) {
848 src
= fCodeEmitPointer
;
849 fCodeEmitPointer
+= 3;
850 dst
= fCodeEmitPointer
;
855 // Op node, where operand used to be.
863 // - Tail - set the next-pointer at the end of a node chain
866 RegExp::Tail(char* p
, char* val
)
884 if (scan
[0] == kRegExpBack
)
889 scan
[1] = (char)((offset
>> 8) & 0377);
890 scan
[2] = (char)(offset
& 0377);
895 // - OpTail - Tail on operand of first argument; nop if operandless
898 RegExp::OpTail(char* p
, char* val
)
900 // "Operandless" and "op != kRegExpBranch" are synonymous in practice.
901 if (p
== NULL
|| p
== &fDummy
|| *p
!= kRegExpBranch
)
904 Tail(Operand(p
), val
);
908 // RunMatcher and friends
913 // - RunMatcher - match a regexp against a string
916 RegExp::RunMatcher(regexp
* prog
, const char* string
) const
921 if (prog
== NULL
|| string
== NULL
) {
922 SetError(B_BAD_VALUE
);
926 // check validity of program
927 if (UCharAt(prog
->program
) != kRegExpMagic
) {
928 SetError(REGEXP_CORRUPTED_PROGRAM
);
932 // if there is a "must appear" string, look for it
933 if (prog
->regmust
!= NULL
) {
935 while ((s
= strchr(s
, prog
->regmust
[0])) != NULL
) {
936 if (strncmp(s
, prog
->regmust
, (size_t)prog
->regmlen
) == 0) {
948 // mark beginning of line for ^
951 // simplest case: anchored match need be tried only once
953 return Try(prog
, (char*)string
);
955 // messy cases: unanchored match
957 if (prog
->regstart
!= '\0') {
958 // we know what char it must start with
959 while ((s
= strchr(s
, prog
->regstart
)) != NULL
) {
960 if (Try(prog
, (char*)s
))
965 // we don't -- general case.
967 if (Try(prog
, (char*)s
))
969 } while (*s
++ != '\0');
978 // - Try - try match at specific point
980 // Returns 0 on failure, 1 on success.
983 RegExp::Try(regexp
* prog
, const char* string
) const
989 fStringInputPointer
= string
;
990 fStartPArrayPointer
= prog
->startp
;
991 fEndPArrayPointer
= prog
->endp
;
995 for (i
= kSubExpressionMax
; i
> 0; i
--) {
999 if (Match(prog
->program
+ 1)) {
1000 prog
->startp
[0] = string
;
1001 prog
->endp
[0] = fStringInputPointer
;
1009 // - Match - main matching routine
1011 // Conceptually the strategy is simple: check to see whether the current
1012 // node matches, call self recursively to see whether the rest matches,
1013 // and then act accordingly. In practice we make some effort to avoid
1014 // recursion, in particular by going through "ordinary" nodes (that don't
1015 // need to know whether the rest of the match failed) by a loop instead of
1018 // Returns 0 on failure, 1 on success.
1021 RegExp::Match(const char* prog
) const
1023 const char* scan
; // Current node.
1024 const char* next
; // Next node.
1028 if (scan
!= NULL
&& regnarrate
)
1029 fprintf(stderr
, "%s(\n", Prop(scan
));
1031 while (scan
!= NULL
) {
1034 fprintf(stderr
, "%s...\n", Prop(scan
));
1040 if (fStringInputPointer
!= fRegBol
)
1045 if (*fStringInputPointer
!= '\0')
1050 if (*fStringInputPointer
== '\0')
1052 fStringInputPointer
++;
1055 case kRegExpExactly
:
1057 const char* opnd
= Operand(scan
);
1058 // Inline the first character, for speed.
1059 if (*opnd
!= *fStringInputPointer
)
1062 uint32 len
= strlen(opnd
);
1064 && strncmp(opnd
, fStringInputPointer
, len
) != 0) {
1068 fStringInputPointer
+= len
;
1073 if (*fStringInputPointer
== '\0'
1074 || strchr(Operand(scan
), *fStringInputPointer
) == NULL
) {
1077 fStringInputPointer
++;
1081 if (*fStringInputPointer
== '\0'
1082 || strchr(Operand(scan
), *fStringInputPointer
) != NULL
) {
1085 fStringInputPointer
++;
1088 case kRegExpNothing
:
1094 case kRegExpOpen
+ 1:
1095 case kRegExpOpen
+ 2:
1096 case kRegExpOpen
+ 3:
1097 case kRegExpOpen
+ 4:
1098 case kRegExpOpen
+ 5:
1099 case kRegExpOpen
+ 6:
1100 case kRegExpOpen
+ 7:
1101 case kRegExpOpen
+ 8:
1102 case kRegExpOpen
+ 9:
1107 no
= *scan
- kRegExpOpen
;
1108 save
= fStringInputPointer
;
1112 // Don't set startp if some later
1113 // invocation of the same parentheses
1116 if (fStartPArrayPointer
[no
] == NULL
)
1117 fStartPArrayPointer
[no
] = save
;
1124 case kRegExpClose
+ 1:
1125 case kRegExpClose
+ 2:
1126 case kRegExpClose
+ 3:
1127 case kRegExpClose
+ 4:
1128 case kRegExpClose
+ 5:
1129 case kRegExpClose
+ 6:
1130 case kRegExpClose
+ 7:
1131 case kRegExpClose
+ 8:
1132 case kRegExpClose
+ 9:
1137 no
= *scan
- kRegExpClose
;
1138 save
= fStringInputPointer
;
1142 // Don't set endp if some later
1143 // invocation of the same parentheses
1146 if (fEndPArrayPointer
[no
] == NULL
)
1147 fEndPArrayPointer
[no
] = save
;
1158 if (*next
!= kRegExpBranch
) {
1160 next
= Operand(scan
);
1164 save
= fStringInputPointer
;
1165 if (Match(Operand(scan
)))
1167 fStringInputPointer
= save
;
1169 } while (scan
!= NULL
&& *scan
== kRegExpBranch
);
1185 // Lookahead to avoid useless match attempts
1186 // when we know what character comes next.
1189 if (*next
== kRegExpExactly
)
1190 nextch
= *Operand(next
);
1191 min
= (*scan
== kRegExpStar
) ? 0 : 1;
1192 save
= fStringInputPointer
;
1193 no
= Repeat(Operand(scan
));
1195 // if it could work, try it
1196 if (nextch
== '\0' || *fStringInputPointer
== nextch
) {
1200 // couldn't or didn't, back up
1202 fStringInputPointer
= save
+ no
;
1213 SetError(REGEXP_MEMORY_CORRUPTION
);
1221 // We get here only if there's trouble -- normally "case kRegExpEnd" is
1222 // the terminating point.
1224 SetError(REGEXP_CORRUPTED_POINTERS
);
1231 // - Repeat - repeatedly match something simple, report how many
1234 RegExp::Repeat(const char* p
) const
1240 scan
= fStringInputPointer
;
1244 count
= (int32
)strlen(scan
);
1248 case kRegExpExactly
:
1249 while (*opnd
== *scan
) {
1256 while (*scan
!= '\0' && strchr(opnd
, *scan
) != NULL
) {
1263 while (*scan
!= '\0' && strchr(opnd
, *scan
) == NULL
) {
1270 // oh dear, called inappropriately
1271 SetError(REGEXP_INTERNAL_ERROR
);
1276 fStringInputPointer
= scan
;
1283 // - Next - dig the "next" pointer out of a node
1286 RegExp::Next(char* p
)
1293 offset
= ((*(p
+ 1) & 0377) << 8) + (*(p
+ 2) & 0377);
1297 if (*p
== kRegExpBack
)
1305 RegExp::Next(const char* p
) const
1312 offset
= ((*(p
+ 1) & 0377) << 8) + (*(p
+ 2) & 0377);
1316 if (*p
== kRegExpBack
)
1324 RegExp::UCharAt(const char* p
) const
1326 return (int32
)*(unsigned char *)p
;
1331 RegExp::Operand(char* p
) const
1338 RegExp::Operand(const char* p
) const
1345 RegExp::IsMult(char c
) const
1347 return c
== '*' || c
== '+' || c
== '?';
1355 // - Dump - dump a regexp onto stdout in vaguely comprehensible form
1361 char op
= kRegExpExactly
;
1362 // Arbitrary non-kRegExpEnd op.
1365 s
= fRegExp
->program
+ 1;
1366 while (op
!= kRegExpEnd
) {
1367 // While that wasn't kRegExpEnd last time...
1369 printf("%2ld%s", s
- fRegExp
->program
, Prop(s
));
1376 printf("(%ld)", (s
- fRegExp
->program
) + (next
- s
));
1379 if (op
== kRegExpAnyOf
|| op
== kRegExpAnyBut
1380 || op
== kRegExpExactly
) {
1381 // literal string, where present
1382 while (*s
!= '\0') {
1391 // header fields of interest
1392 if (fRegExp
->regstart
!= '\0')
1393 printf("start `%c' ", fRegExp
->regstart
);
1395 if (fRegExp
->reganch
)
1396 printf("anchored ");
1398 if (fRegExp
->regmust
!= NULL
)
1399 printf("must have \"%s\"", fRegExp
->regmust
);
1406 // - Prop - printable representation of opcode
1409 RegExp::Prop(const char* op
) const
1411 const char* p
= NULL
;
1412 static char buf
[50];
1434 p
= "kRegExpAnyBut";
1438 p
= "kRegExpBranch";
1441 case kRegExpExactly
:
1442 p
= "kRegExpExactly";
1445 case kRegExpNothing
:
1446 p
= "kRegExpNothing";
1457 case kRegExpOpen
+ 1:
1458 case kRegExpOpen
+ 2:
1459 case kRegExpOpen
+ 3:
1460 case kRegExpOpen
+ 4:
1461 case kRegExpOpen
+ 5:
1462 case kRegExpOpen
+ 6:
1463 case kRegExpOpen
+ 7:
1464 case kRegExpOpen
+ 8:
1465 case kRegExpOpen
+ 9:
1466 sprintf(buf
+ strlen(buf
), "kRegExpOpen%d", *op
- kRegExpOpen
);
1470 case kRegExpClose
+ 1:
1471 case kRegExpClose
+ 2:
1472 case kRegExpClose
+ 3:
1473 case kRegExpClose
+ 4:
1474 case kRegExpClose
+ 5:
1475 case kRegExpClose
+ 6:
1476 case kRegExpClose
+ 7:
1477 case kRegExpClose
+ 8:
1478 case kRegExpClose
+ 9:
1479 sprintf(buf
+ strlen(buf
), "kRegExpClose%d", *op
- kRegExpClose
);
1492 RegExpError("corrupted opcode");
1504 RegExp::RegExpError(const char*) const
1506 // does nothing now, perhaps it should printf?