FIX: stupid pb fixed (close to being medieval'ed by The Ken)
[cmake.git] / Source / cmRegularExpression.cxx
blob917f28533cd1a58faccff91e08438f1f2dfea022
1 /*=========================================================================
3 Program: Insight Segmentation & Registration Toolkit
4 Module: $RCSfile: cmRegularExpression.cxx,v $
5 Language: C++
6 Date: $Date: 2002-03-13 15:25:10 $
7 Version: $Revision: 1.6 $
9 Copyright (c) 2002 Insight Consortium. All rights reserved.
10 See ITKCopyright.txt or http://www.itk.org/HTML/Copyright.htm for details.
12 This software is distributed WITHOUT ANY WARRANTY; without even
13 the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 PURPOSE. See the above copyright notices for more information.
16 =========================================================================*/
18 // Copyright (C) 1991 Texas Instruments Incorporated.
20 // Permission is granted to any individual or institution to use, copy, modify,
21 // and distribute this software, provided that this complete copyright and
22 // permission notice is maintained, intact, in all copies and supporting
23 // documentation.
25 // Texas Instruments Incorporated provides this software "as is" without
26 // express or implied warranty.
29 // Created: MNF 06/13/89 Initial Design and Implementation
30 // Updated: LGO 08/09/89 Inherit from Generic
31 // Updated: MBN 09/07/89 Added conditional exception handling
32 // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
33 // Updated: DLS 03/22/91 New lite version
36 #include "cmRegularExpression.h" // Include class specification
37 #include "cmStandardIncludes.h"
38 #include <stdio.h>
40 // cmRegularExpression -- Copies the given regular expression.
41 cmRegularExpression::cmRegularExpression (const cmRegularExpression& rxp) {
42 int ind;
43 this->progsize = rxp.progsize; // Copy regular expression size
44 this->program = new char[this->progsize]; // Allocate storage
45 for(ind=this->progsize; ind-- != 0;) // Copy regular expresion
46 this->program[ind] = rxp.program[ind];
47 this->startp[0] = rxp.startp[0]; // Copy pointers into last
48 this->endp[0] = rxp.endp[0]; // Successful "find" operation
49 this->regmust = rxp.regmust; // Copy field
50 if (rxp.regmust != NULL) {
51 char* dum = rxp.program;
52 ind = 0;
53 while (dum != rxp.regmust) {
54 ++dum;
55 ++ind;
57 this->regmust = this->program + ind;
59 this->regstart = rxp.regstart; // Copy starting index
60 this->reganch = rxp.reganch; // Copy remaining private data
61 this->regmlen = rxp.regmlen; // Copy remaining private data
64 // operator== -- Returns true if two regular expressions have the same
65 // compiled program for pattern matching.
66 bool cmRegularExpression::operator== (const cmRegularExpression& rxp) const {
67 if (this != &rxp) { // Same address?
68 int ind = this->progsize; // Get regular expression size
69 if (ind != rxp.progsize) // If different size regexp
70 return false; // Return failure
71 while(ind-- != 0) // Else while still characters
72 if(this->program[ind] != rxp.program[ind]) // If regexp are different
73 return false; // Return failure
75 return true; // Else same, return success
79 // deep_equal -- Returns true if have the same compiled regular expressions
80 // and the same start and end pointers.
82 bool cmRegularExpression::deep_equal (const cmRegularExpression& rxp) const {
83 int ind = this->progsize; // Get regular expression size
84 if (ind != rxp.progsize) // If different size regexp
85 return false; // Return failure
86 while(ind-- != 0) // Else while still characters
87 if(this->program[ind] != rxp.program[ind]) // If regexp are different
88 return false; // Return failure
89 return (this->startp[0] == rxp.startp[0] && // Else if same start/end ptrs,
90 this->endp[0] == rxp.endp[0]); // Return true
93 // The remaining code in this file is derived from the regular expression code
94 // whose copyright statement appears below. It has been changed to work
95 // with the class concepts of C++ and COOL.
98 * compile and find
100 * Copyright (c) 1986 by University of Toronto.
101 * Written by Henry Spencer. Not derived from licensed software.
103 * Permission is granted to anyone to use this software for any
104 * purpose on any computer system, and to redistribute it freely,
105 * subject to the following restrictions:
107 * 1. The author is not responsible for the consequences of use of
108 * this software, no matter how awful, even if they arise
109 * from defects in it.
111 * 2. The origin of this software must not be misrepresented, either
112 * by explicit claim or by omission.
114 * 3. Altered versions must be plainly marked as such, and must not
115 * be misrepresented as being the original software.
117 * Beware that some of this code is subtly aware of the way operator
118 * precedence is structured in regular expressions. Serious changes in
119 * regular-expression syntax might require a total rethink.
123 * The "internal use only" fields in regexp.h are present to pass info from
124 * compile to execute that permits the execute phase to run lots faster on
125 * simple cases. They are:
127 * regstart char that must begin a match; '\0' if none obvious
128 * reganch is the match anchored (at beginning-of-line only)?
129 * regmust string (pointer into program) that match must include, or NULL
130 * regmlen length of regmust string
132 * Regstart and reganch permit very fast decisions on suitable starting points
133 * for a match, cutting down the work a lot. Regmust permits fast rejection
134 * of lines that cannot possibly match. The regmust tests are costly enough
135 * that compile() supplies a regmust only if the r.e. contains something
136 * potentially expensive (at present, the only such thing detected is * or +
137 * at the start of the r.e., which can involve a lot of backup). Regmlen is
138 * supplied because the test in find() needs it and compile() is computing
139 * it anyway.
143 * Structure for regexp "program". This is essentially a linear encoding
144 * of a nondeterministic finite-state machine (aka syntax charts or
145 * "railroad normal form" in parsing technology). Each node is an opcode
146 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
147 * all nodes except BRANCH implement concatenation; a "next" pointer with
148 * a BRANCH on both ends of it is connecting two alternatives. (Here we
149 * have one of the subtle syntax dependencies: an individual BRANCH (as
150 * opposed to a collection of them) is never concatenated with anything
151 * because of operator precedence.) The operand of some types of node is
152 * a literal string; for others, it is a node leading into a sub-FSM. In
153 * particular, the operand of a BRANCH node is the first node of the branch.
154 * (NB this is *not* a tree structure: the tail of the branch connects
155 * to the thing following the set of BRANCHes.) The opcodes are:
158 // definition number opnd? meaning
159 #define END 0 // no End of program.
160 #define BOL 1 // no Match "" at beginning of line.
161 #define EOL 2 // no Match "" at end of line.
162 #define ANY 3 // no Match any one character.
163 #define ANYOF 4 // str Match any character in this string.
164 #define ANYBUT 5 // str Match any character not in this
165 // string.
166 #define BRANCH 6 // node Match this alternative, or the
167 // next...
168 #define BACK 7 // no Match "", "next" ptr points backward.
169 #define EXACTLY 8 // str Match this string.
170 #define NOTHING 9 // no Match empty string.
171 #define STAR 10 // node Match this (simple) thing 0 or more
172 // times.
173 #define PLUS 11 // node Match this (simple) thing 1 or more
174 // times.
175 #define OPEN 20 // no Mark this point in input as start of
176 // #n.
177 // OPEN+1 is number 1, etc.
178 #define CLOSE 30 // no Analogous to OPEN.
181 * Opcode notes:
183 * BRANCH The set of branches constituting a single choice are hooked
184 * together with their "next" pointers, since precedence prevents
185 * anything being concatenated to any individual branch. The
186 * "next" pointer of the last BRANCH in a choice points to the
187 * thing following the whole choice. This is also where the
188 * final "next" pointer of each individual branch points; each
189 * branch starts with the operand node of a BRANCH node.
191 * BACK Normal "next" pointers all implicitly point forward; BACK
192 * exists to make loop structures possible.
194 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
195 * BRANCH structures using BACK. Simple cases (one character
196 * per match) are implemented with STAR and PLUS for speed
197 * and to minimize recursive plunges.
199 * OPEN,CLOSE ...are numbered at compile time.
203 * A node is one char of opcode followed by two chars of "next" pointer.
204 * "Next" pointers are stored as two 8-bit pieces, high order first. The
205 * value is a positive offset from the opcode of the node containing it.
206 * An operand, if any, simply follows the node. (Note that much of the
207 * code generation knows about this implicit relationship.)
209 * Using two bytes for the "next" pointer is vast overkill for most things,
210 * but allows patterns to get big without disasters.
213 #define OP(p) (*(p))
214 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
215 #define OPERAND(p) ((p) + 3)
217 const unsigned char MAGIC = 0234;
219 * Utility definitions.
222 #define UCHARAT(p) ((const unsigned char*)(p))[0]
225 #define FAIL(m) { regerror(m); return(NULL); }
226 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
227 #define META "^$.[()|?+*\\"
231 * Flags to be passed up and down.
233 #define HASWIDTH 01 // Known never to match null string.
234 #define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
235 #define SPSTART 04 // Starts with * or +.
236 #define WORST 0 // Worst case.
240 /////////////////////////////////////////////////////////////////////////
242 // COMPILE AND ASSOCIATED FUNCTIONS
244 /////////////////////////////////////////////////////////////////////////
248 * Global work variables for compile().
250 static const char* regparse; // Input-scan pointer.
251 static int regnpar; // () count.
252 static char regdummy;
253 static char* regcode; // Code-emit pointer; &regdummy = don't.
254 static long regsize; // Code size.
257 * Forward declarations for compile()'s friends.
259 // #ifndef static
260 // #define static static
261 // #endif
262 static char* reg (int, int*);
263 static char* regbranch (int*);
264 static char* regpiece (int*);
265 static char* regatom (int*);
266 static char* regnode (char);
267 static const char* regnext (register const char*);
268 static char* regnext (register char*);
269 static void regc (unsigned char);
270 static void reginsert (char, char*);
271 static void regtail (char*, const char*);
272 static void regoptail (char*, const char*);
274 #ifdef STRCSPN
275 static int strcspn ();
276 #endif
281 * We can't allocate space until we know how big the compiled form will be,
282 * but we can't compile it (and thus know how big it is) until we've got a
283 * place to put the code. So we cheat: we compile it twice, once with code
284 * generation turned off and size counting turned on, and once "for real".
285 * This also means that we don't allocate space until we are sure that the
286 * thing really will compile successfully, and we never have to move the
287 * code and thus invalidate pointers into it. (Note that it has to be in
288 * one piece because free() must be able to free it all.)
290 * Beware that the optimization-preparation code in here knows about some
291 * of the structure of the compiled regexp.
295 // compile -- compile a regular expression into internal code
296 // for later pattern matching.
298 void cmRegularExpression::compile (const char* exp) {
299 register const char* scan;
300 register const char* longest;
301 register unsigned long len;
302 int flags;
304 if (exp == NULL) {
305 //RAISE Error, SYM(cmRegularExpression), SYM(No_Expr),
306 printf ("cmRegularExpression::compile(): No expression supplied.\n");
307 return;
310 // First pass: determine size, legality.
311 regparse = exp;
312 regnpar = 1;
313 regsize = 0L;
314 regcode = &regdummy;
315 regc(MAGIC);
316 if(!reg(0, &flags))
318 printf ("cmRegularExpression::compile(): Error in compile.\n");
319 return;
321 this->startp[0] = this->endp[0] = this->searchstring = NULL;
323 // Small enough for pointer-storage convention?
324 if (regsize >= 32767L) { // Probably could be 65535L.
325 //RAISE Error, SYM(cmRegularExpression), SYM(Expr_Too_Big),
326 printf ("cmRegularExpression::compile(): Expression too big.\n");
327 return;
330 // Allocate space.
331 //#ifndef WIN32
332 if (this->program != NULL) delete [] this->program;
333 //#endif
334 this->program = new char[regsize];
335 this->progsize = (int) regsize;
337 if (this->program == NULL) {
338 //RAISE Error, SYM(cmRegularExpression), SYM(Out_Of_Memory),
339 printf ("cmRegularExpression::compile(): Out of memory.\n");
340 return;
343 // Second pass: emit code.
344 regparse = exp;
345 regnpar = 1;
346 regcode = this->program;
347 regc(MAGIC);
348 reg(0, &flags);
350 // Dig out information for optimizations.
351 this->regstart = '\0'; // Worst-case defaults.
352 this->reganch = 0;
353 this->regmust = NULL;
354 this->regmlen = 0;
355 scan = this->program + 1; // First BRANCH.
356 if (OP(regnext(scan)) == END) { // Only one top-level choice.
357 scan = OPERAND(scan);
359 // Starting-point info.
360 if (OP(scan) == EXACTLY)
361 this->regstart = *OPERAND(scan);
362 else if (OP(scan) == BOL)
363 this->reganch++;
366 // If there's something expensive in the r.e., find the longest
367 // literal string that must appear and make it the regmust. Resolve
368 // ties in favor of later strings, since the regstart check works
369 // with the beginning of the r.e. and avoiding duplication
370 // strengthens checking. Not a strong reason, but sufficient in the
371 // absence of others.
373 if (flags & SPSTART) {
374 longest = NULL;
375 len = 0;
376 for (; scan != NULL; scan = regnext(scan))
377 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
378 longest = OPERAND(scan);
379 len = int(strlen(OPERAND(scan)));
381 this->regmust = longest;
382 this->regmlen = len;
389 - reg - regular expression, i.e. main body or parenthesized thing
391 * Caller must absorb opening parenthesis.
393 * Combining parenthesis handling with the base level of regular expression
394 * is a trifle forced, but the need to tie the tails of the branches to what
395 * follows makes it hard to avoid.
397 static char* reg (int paren, int *flagp) {
398 register char* ret;
399 register char* br;
400 register char* ender;
401 register int parno =0;
402 int flags;
404 *flagp = HASWIDTH; // Tentatively.
406 // Make an OPEN node, if parenthesized.
407 if (paren) {
408 if (regnpar >= NSUBEXP) {
409 //RAISE Error, SYM(cmRegularExpression), SYM(Too_Many_Parens),
410 printf ("cmRegularExpression::compile(): Too many parentheses.\n");
411 return 0;
413 parno = regnpar;
414 regnpar++;
415 ret = regnode(OPEN + parno);
417 else
418 ret = NULL;
420 // Pick up the branches, linking them together.
421 br = regbranch(&flags);
422 if (br == NULL)
423 return (NULL);
424 if (ret != NULL)
425 regtail(ret, br); // OPEN -> first.
426 else
427 ret = br;
428 if (!(flags & HASWIDTH))
429 *flagp &= ~HASWIDTH;
430 *flagp |= flags & SPSTART;
431 while (*regparse == '|') {
432 regparse++;
433 br = regbranch(&flags);
434 if (br == NULL)
435 return (NULL);
436 regtail(ret, br); // BRANCH -> BRANCH.
437 if (!(flags & HASWIDTH))
438 *flagp &= ~HASWIDTH;
439 *flagp |= flags & SPSTART;
442 // Make a closing node, and hook it on the end.
443 ender = regnode((paren) ? CLOSE + parno : END);
444 regtail(ret, ender);
446 // Hook the tails of the branches to the closing node.
447 for (br = ret; br != NULL; br = regnext(br))
448 regoptail(br, ender);
450 // Check for proper termination.
451 if (paren && *regparse++ != ')') {
452 //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
453 printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
454 return 0;
456 else if (!paren && *regparse != '\0') {
457 if (*regparse == ')') {
458 //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Parens),
459 printf ("cmRegularExpression::compile(): Unmatched parentheses.\n");
460 return 0;
462 else {
463 //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
464 printf ("cmRegularExpression::compile(): Internal error.\n");
465 return 0;
467 // NOTREACHED
469 return (ret);
474 - regbranch - one alternative of an | operator
476 * Implements the concatenation operator.
478 static char* regbranch (int *flagp) {
479 register char* ret;
480 register char* chain;
481 register char* latest;
482 int flags;
484 *flagp = WORST; // Tentatively.
486 ret = regnode(BRANCH);
487 chain = NULL;
488 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
489 latest = regpiece(&flags);
490 if (latest == NULL)
491 return (NULL);
492 *flagp |= flags & HASWIDTH;
493 if (chain == NULL) // First piece.
494 *flagp |= flags & SPSTART;
495 else
496 regtail(chain, latest);
497 chain = latest;
499 if (chain == NULL) // Loop ran zero times.
500 regnode(NOTHING);
502 return (ret);
507 - regpiece - something followed by possible [*+?]
509 * Note that the branching code sequences used for ? and the general cases
510 * of * and + are somewhat optimized: they use the same NOTHING node as
511 * both the endmarker for their branch list and the body of the last branch.
512 * It might seem that this node could be dispensed with entirely, but the
513 * endmarker role is not redundant.
515 static char* regpiece (int *flagp) {
516 register char* ret;
517 register char op;
518 register char* next;
519 int flags;
521 ret = regatom(&flags);
522 if (ret == NULL)
523 return (NULL);
525 op = *regparse;
526 if (!ISMULT(op)) {
527 *flagp = flags;
528 return (ret);
531 if (!(flags & HASWIDTH) && op != '?') {
532 //RAISE Error, SYM(cmRegularExpression), SYM(Empty_Operand),
533 printf ("cmRegularExpression::compile() : *+ operand could be empty.\n");
534 return 0;
536 *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
538 if (op == '*' && (flags & SIMPLE))
539 reginsert(STAR, ret);
540 else if (op == '*') {
541 // Emit x* as (x&|), where & means "self".
542 reginsert(BRANCH, ret); // Either x
543 regoptail(ret, regnode(BACK)); // and loop
544 regoptail(ret, ret); // back
545 regtail(ret, regnode(BRANCH)); // or
546 regtail(ret, regnode(NOTHING)); // null.
548 else if (op == '+' && (flags & SIMPLE))
549 reginsert(PLUS, ret);
550 else if (op == '+') {
551 // Emit x+ as x(&|), where & means "self".
552 next = regnode(BRANCH); // Either
553 regtail(ret, next);
554 regtail(regnode(BACK), ret); // loop back
555 regtail(next, regnode(BRANCH)); // or
556 regtail(ret, regnode(NOTHING)); // null.
558 else if (op == '?') {
559 // Emit x? as (x|)
560 reginsert(BRANCH, ret); // Either x
561 regtail(ret, regnode(BRANCH)); // or
562 next = regnode(NOTHING);// null.
563 regtail(ret, next);
564 regoptail(ret, next);
566 regparse++;
567 if (ISMULT(*regparse)) {
568 //RAISE Error, SYM(cmRegularExpression), SYM(Nested_Operand),
569 printf ("cmRegularExpression::compile(): Nested *?+.\n");
570 return 0;
572 return (ret);
577 - regatom - the lowest level
579 * Optimization: gobbles an entire sequence of ordinary characters so that
580 * it can turn them into a single node, which is smaller to store and
581 * faster to run. Backslashed characters are exceptions, each becoming a
582 * separate node; the code is simpler that way and it's not worth fixing.
584 static char* regatom (int *flagp) {
585 register char* ret;
586 int flags;
588 *flagp = WORST; // Tentatively.
590 switch (*regparse++) {
591 case '^':
592 ret = regnode(BOL);
593 break;
594 case '$':
595 ret = regnode(EOL);
596 break;
597 case '.':
598 ret = regnode(ANY);
599 *flagp |= HASWIDTH | SIMPLE;
600 break;
601 case '[':{
602 register int rxpclass;
603 register int rxpclassend;
605 if (*regparse == '^') { // Complement of range.
606 ret = regnode(ANYBUT);
607 regparse++;
609 else
610 ret = regnode(ANYOF);
611 if (*regparse == ']' || *regparse == '-')
612 regc(*regparse++);
613 while (*regparse != '\0' && *regparse != ']') {
614 if (*regparse == '-') {
615 regparse++;
616 if (*regparse == ']' || *regparse == '\0')
617 regc('-');
618 else {
619 rxpclass = UCHARAT(regparse - 2) + 1;
620 rxpclassend = UCHARAT(regparse);
621 if (rxpclass > rxpclassend + 1) {
622 //RAISE Error, SYM(cmRegularExpression), SYM(Invalid_Range),
623 printf ("cmRegularExpression::compile(): Invalid range in [].\n");
624 return 0;
626 for (; rxpclass <= rxpclassend; rxpclass++)
627 regc(rxpclass);
628 regparse++;
631 else
632 regc(*regparse++);
634 regc('\0');
635 if (*regparse != ']') {
636 //RAISE Error, SYM(cmRegularExpression), SYM(Unmatched_Bracket),
637 printf ("cmRegularExpression::compile(): Unmatched [].\n");
638 return 0;
640 regparse++;
641 *flagp |= HASWIDTH | SIMPLE;
643 break;
644 case '(':
645 ret = reg(1, &flags);
646 if (ret == NULL)
647 return (NULL);
648 *flagp |= flags & (HASWIDTH | SPSTART);
649 break;
650 case '\0':
651 case '|':
652 case ')':
653 //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
654 printf ("cmRegularExpression::compile(): Internal error.\n"); // Never here
655 return 0;
656 case '?':
657 case '+':
658 case '*':
659 //RAISE Error, SYM(cmRegularExpression), SYM(No_Operand),
660 printf ("cmRegularExpression::compile(): ?+* follows nothing.\n");
661 return 0;
662 case '\\':
663 if (*regparse == '\0') {
664 //RAISE Error, SYM(cmRegularExpression), SYM(Trailing_Backslash),
665 printf ("cmRegularExpression::compile(): Trailing backslash.\n");
666 return 0;
668 ret = regnode(EXACTLY);
669 regc(*regparse++);
670 regc('\0');
671 *flagp |= HASWIDTH | SIMPLE;
672 break;
673 default:{
674 register int len;
675 register char ender;
677 regparse--;
678 len = int(strcspn(regparse, META));
679 if (len <= 0) {
680 //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
681 printf ("cmRegularExpression::compile(): Internal error.\n");
682 return 0;
684 ender = *(regparse + len);
685 if (len > 1 && ISMULT(ender))
686 len--; // Back off clear of ?+* operand.
687 *flagp |= HASWIDTH;
688 if (len == 1)
689 *flagp |= SIMPLE;
690 ret = regnode(EXACTLY);
691 while (len > 0) {
692 regc(*regparse++);
693 len--;
695 regc('\0');
697 break;
699 return (ret);
704 - regnode - emit a node
705 Location.
707 static char* regnode (char op) {
708 register char* ret;
709 register char* ptr;
711 ret = regcode;
712 if (ret == &regdummy) {
713 regsize += 3;
714 return (ret);
717 ptr = ret;
718 *ptr++ = op;
719 *ptr++ = '\0'; // Null "next" pointer.
720 *ptr++ = '\0';
721 regcode = ptr;
723 return (ret);
728 - regc - emit (if appropriate) a byte of code
730 static void regc (unsigned char b) {
731 if (regcode != &regdummy)
732 *regcode++ = b;
733 else
734 regsize++;
739 - reginsert - insert an operator in front of already-emitted operand
741 * Means relocating the operand.
743 static void reginsert (char op, char* opnd) {
744 register char* src;
745 register char* dst;
746 register char* place;
748 if (regcode == &regdummy) {
749 regsize += 3;
750 return;
753 src = regcode;
754 regcode += 3;
755 dst = regcode;
756 while (src > opnd)
757 *--dst = *--src;
759 place = opnd; // Op node, where operand used to be.
760 *place++ = op;
761 *place++ = '\0';
762 *place++ = '\0';
767 - regtail - set the next-pointer at the end of a node chain
769 static void regtail (char* p, const char* val) {
770 register char* scan;
771 register char* temp;
772 register int offset;
774 if (p == &regdummy)
775 return;
777 // Find last node.
778 scan = p;
779 for (;;) {
780 temp = regnext(scan);
781 if (temp == NULL)
782 break;
783 scan = temp;
786 if (OP(scan) == BACK)
787 offset = int(scan - val);
788 else
789 offset = int(val - scan);
790 *(scan + 1) = (offset >> 8) & 0377;
791 *(scan + 2) = offset & 0377;
796 - regoptail - regtail on operand of first argument; nop if operandless
798 static void regoptail (char* p, const char* val) {
799 // "Operandless" and "op != BRANCH" are synonymous in practice.
800 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
801 return;
802 regtail(OPERAND(p), val);
807 ////////////////////////////////////////////////////////////////////////
809 // find and friends
811 ////////////////////////////////////////////////////////////////////////
815 * Global work variables for find().
817 static const char* reginput; // String-input pointer.
818 static const char* regbol; // Beginning of input, for ^ check.
819 static const char* *regstartp; // Pointer to startp array.
820 static const char* *regendp; // Ditto for endp.
823 * Forwards.
825 static int regtry (const char*, const char* *,
826 const char* *, const char*);
827 static int regmatch (const char*);
828 static int regrepeat (const char*);
830 #ifdef DEBUG
831 int regnarrate = 0;
832 void regdump ();
833 static char* regprop ();
834 #endif
836 bool cmRegularExpression::find (std::string const& s)
838 return find(s.c_str());
843 // find -- Matches the regular expression to the given string.
844 // Returns true if found, and sets start and end indexes accordingly.
846 bool cmRegularExpression::find (const char* string) {
847 register const char* s;
849 this->searchstring = string;
851 // Check validity of program.
852 if (!this->program || UCHARAT(this->program) != MAGIC) {
853 //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
854 printf ("cmRegularExpression::find(): Compiled regular expression corrupted.\n");
855 return 0;
858 // If there is a "must appear" string, look for it.
859 if (this->regmust != NULL) {
860 s = string;
861 while ((s = strchr(s, this->regmust[0])) != NULL) {
862 if (strncmp(s, this->regmust, this->regmlen) == 0)
863 break; // Found it.
864 s++;
866 if (s == NULL) // Not present.
867 return (0);
870 // Mark beginning of line for ^ .
871 regbol = string;
873 // Simplest case: anchored match need be tried only once.
874 if (this->reganch)
875 return (regtry(string, this->startp, this->endp, this->program) != 0);
877 // Messy cases: unanchored match.
878 s = string;
879 if (this->regstart != '\0')
880 // We know what char it must start with.
881 while ((s = strchr(s, this->regstart)) != NULL) {
882 if (regtry(s, this->startp, this->endp, this->program))
883 return (1);
884 s++;
887 else
888 // We don't -- general case.
889 do {
890 if (regtry(s, this->startp, this->endp, this->program))
891 return (1);
892 } while (*s++ != '\0');
894 // Failure.
895 return (0);
900 - regtry - try match at specific point
901 0 failure, 1 success
903 static int regtry (const char* string, const char* *start,
904 const char* *end, const char* prog) {
905 register int i;
906 register const char* *sp1;
907 register const char* *ep;
909 reginput = string;
910 regstartp = start;
911 regendp = end;
913 sp1 = start;
914 ep = end;
915 for (i = NSUBEXP; i > 0; i--) {
916 *sp1++ = NULL;
917 *ep++ = NULL;
919 if (regmatch(prog + 1)) {
920 start[0] = string;
921 end[0] = reginput;
922 return (1);
924 else
925 return (0);
930 - regmatch - main matching routine
932 * Conceptually the strategy is simple: check to see whether the current
933 * node matches, call self recursively to see whether the rest matches,
934 * and then act accordingly. In practice we make some effort to avoid
935 * recursion, in particular by going through "ordinary" nodes (that don't
936 * need to know whether the rest of the match failed) by a loop instead of
937 * by recursion.
938 * 0 failure, 1 success
940 static int regmatch (const char* prog) {
941 register const char* scan; // Current node.
942 const char* next; // Next node.
944 scan = prog;
946 while (scan != NULL) {
948 next = regnext(scan);
950 switch (OP(scan)) {
951 case BOL:
952 if (reginput != regbol)
953 return (0);
954 break;
955 case EOL:
956 if (*reginput != '\0')
957 return (0);
958 break;
959 case ANY:
960 if (*reginput == '\0')
961 return (0);
962 reginput++;
963 break;
964 case EXACTLY:{
965 register int len;
966 register const char* opnd;
968 opnd = OPERAND(scan);
969 // Inline the first character, for speed.
970 if (*opnd != *reginput)
971 return (0);
972 len = int(strlen(opnd));
973 if (len > 1 && strncmp(opnd, reginput, len) != 0)
974 return (0);
975 reginput += len;
977 break;
978 case ANYOF:
979 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
980 return (0);
981 reginput++;
982 break;
983 case ANYBUT:
984 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
985 return (0);
986 reginput++;
987 break;
988 case NOTHING:
989 break;
990 case BACK:
991 break;
992 case OPEN + 1:
993 case OPEN + 2:
994 case OPEN + 3:
995 case OPEN + 4:
996 case OPEN + 5:
997 case OPEN + 6:
998 case OPEN + 7:
999 case OPEN + 8:
1000 case OPEN + 9:{
1001 register int no;
1002 register const char* save;
1004 no = OP(scan) - OPEN;
1005 save = reginput;
1007 if (regmatch(next)) {
1010 // Don't set startp if some later invocation of the
1011 // same parentheses already has.
1013 if (regstartp[no] == NULL)
1014 regstartp[no] = save;
1015 return (1);
1017 else
1018 return (0);
1020 // break;
1021 case CLOSE + 1:
1022 case CLOSE + 2:
1023 case CLOSE + 3:
1024 case CLOSE + 4:
1025 case CLOSE + 5:
1026 case CLOSE + 6:
1027 case CLOSE + 7:
1028 case CLOSE + 8:
1029 case CLOSE + 9:{
1030 register int no;
1031 register const char* save;
1033 no = OP(scan) - CLOSE;
1034 save = reginput;
1036 if (regmatch(next)) {
1039 // Don't set endp if some later invocation of the
1040 // same parentheses already has.
1042 if (regendp[no] == NULL)
1043 regendp[no] = save;
1044 return (1);
1046 else
1047 return (0);
1049 // break;
1050 case BRANCH:{
1052 register const char* save;
1054 if (OP(next) != BRANCH) // No choice.
1055 next = OPERAND(scan); // Avoid recursion.
1056 else {
1057 do {
1058 save = reginput;
1059 if (regmatch(OPERAND(scan)))
1060 return (1);
1061 reginput = save;
1062 scan = regnext(scan);
1063 } while (scan != NULL && OP(scan) == BRANCH);
1064 return (0);
1065 // NOTREACHED
1068 break;
1069 case STAR:
1070 case PLUS:{
1071 register char nextch;
1072 register int no;
1073 register const char* save;
1074 register int min_no;
1077 // Lookahead to avoid useless match attempts when we know
1078 // what character comes next.
1080 nextch = '\0';
1081 if (OP(next) == EXACTLY)
1082 nextch = *OPERAND(next);
1083 min_no = (OP(scan) == STAR) ? 0 : 1;
1084 save = reginput;
1085 no = regrepeat(OPERAND(scan));
1086 while (no >= min_no) {
1087 // If it could work, try it.
1088 if (nextch == '\0' || *reginput == nextch)
1089 if (regmatch(next))
1090 return (1);
1091 // Couldn't or didn't -- back up.
1092 no--;
1093 reginput = save + no;
1095 return (0);
1097 // break;
1098 case END:
1099 return (1); // Success!
1101 default:
1102 //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
1103 printf ("cmRegularExpression::find(): Internal error -- memory corrupted.\n");
1104 return 0;
1106 scan = next;
1110 // We get here only if there's trouble -- normally "case END" is the
1111 // terminating point.
1113 //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
1114 printf ("cmRegularExpression::find(): Internal error -- corrupted pointers.\n");
1115 return (0);
1120 - regrepeat - repeatedly match something simple, report how many
1122 static int regrepeat (const char* p) {
1123 register int count = 0;
1124 register const char* scan;
1125 register const char* opnd;
1127 scan = reginput;
1128 opnd = OPERAND(p);
1129 switch (OP(p)) {
1130 case ANY:
1131 count = int(strlen(scan));
1132 scan += count;
1133 break;
1134 case EXACTLY:
1135 while (*opnd == *scan) {
1136 count++;
1137 scan++;
1139 break;
1140 case ANYOF:
1141 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1142 count++;
1143 scan++;
1145 break;
1146 case ANYBUT:
1147 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1148 count++;
1149 scan++;
1151 break;
1152 default: // Oh dear. Called inappropriately.
1153 //RAISE Error, SYM(cmRegularExpression), SYM(Internal_Error),
1154 printf ("cm RegularExpression::find(): Internal error.\n");
1155 return 0;
1157 reginput = scan;
1158 return (count);
1163 - regnext - dig the "next" pointer out of a node
1165 static const char* regnext (register const char* p) {
1166 register int offset;
1168 if (p == &regdummy)
1169 return (NULL);
1171 offset = NEXT(p);
1172 if (offset == 0)
1173 return (NULL);
1175 if (OP(p) == BACK)
1176 return (p - offset);
1177 else
1178 return (p + offset);
1182 static char* regnext (register char* p) {
1183 register int offset;
1185 if (p == &regdummy)
1186 return (NULL);
1188 offset = NEXT(p);
1189 if (offset == 0)
1190 return (NULL);
1192 if (OP(p) == BACK)
1193 return (p - offset);
1194 else
1195 return (p + offset);