2 * regcomp and regexec -- regsub and regerror are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences. It also removes \n as an alternative to |.
48 * Beware that some of this code is subtly aware of the way operator
49 * precedence is structured in regular expressions. Serious changes in
50 * regular-expression syntax might require a total rethink.
58 #include "jimautoconf.h"
59 #include "jimregexp.h"
62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
65 * Structure for regexp "program". This is essentially a linear encoding
66 * of a nondeterministic finite-state machine (aka syntax charts or
67 * "railroad normal form" in parsing technology). Each node is an opcode
68 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
69 * all nodes except BRANCH implement concatenation; a "next" pointer with
70 * a BRANCH on both ends of it is connecting two alternatives. (Here we
71 * have one of the subtle syntax dependencies: an individual BRANCH (as
72 * opposed to a collection of them) is never concatenated with anything
73 * because of operator precedence.) The operand of some types of node is
74 * a literal string; for others, it is a node leading into a sub-FSM. In
75 * particular, the operand of a BRANCH node is the first node of the branch.
76 * (NB this is *not* a tree structure: the tail of the branch connects
77 * to the thing following the set of BRANCHes.) The opcodes are:
80 /* This *MUST* be less than (255-20)/2=117 */
81 #define REG_MAX_PAREN 100
83 /* definition number opnd? meaning */
84 #define END 0 /* no End of program. */
85 #define BOL 1 /* no Match "" at beginning of line. */
86 #define EOL 2 /* no Match "" at end of line. */
87 #define ANY 3 /* no Match any one character. */
88 #define ANYOF 4 /* str Match any character in this string. */
89 #define ANYBUT 5 /* str Match any character not in this string. */
90 #define BRANCH 6 /* node Match this alternative, or the next... */
91 #define BACK 7 /* no Match "", "next" ptr points backward. */
92 #define EXACTLY 8 /* str Match this string. */
93 #define NOTHING 9 /* no Match empty string. */
94 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
95 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, mininal match. */
96 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
97 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
99 #define WORDA 15 /* no Match "" at wordchar, where prev is nonword */
100 #define WORDZ 16 /* no Match "" at nonwordchar, where prev is word */
101 #define OPEN 20 /* no Mark this point in input as start of #n. */
102 /* OPEN+1 is number 1, etc. */
103 #define CLOSE (OPEN+REG_MAX_PAREN) /* no Analogous to OPEN. */
104 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
107 * The first byte of the regexp internal "program" is actually this magic
108 * number; the start node begins in the second byte.
110 #define REG_MAGIC 0xFADED00D
115 * BRANCH The set of branches constituting a single choice are hooked
116 * together with their "next" pointers, since precedence prevents
117 * anything being concatenated to any individual branch. The
118 * "next" pointer of the last BRANCH in a choice points to the
119 * thing following the whole choice. This is also where the
120 * final "next" pointer of each individual branch points; each
121 * branch starts with the operand node of a BRANCH node.
123 * BACK Normal "next" pointers all implicitly point forward; BACK
124 * exists to make loop structures possible.
126 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
127 * BRANCH structures using BACK. Simple cases (one character
128 * per match) are implemented with STAR and PLUS for speed
129 * and to minimize recursive plunges.
131 * OPEN,CLOSE ...are numbered at compile time.
135 * A node is one char of opcode followed by two chars of "next" pointer.
136 * "Next" pointers are stored as two 8-bit pieces, high order first. The
137 * value is a positive offset from the opcode of the node containing it.
138 * An operand, if any, simply follows the node. (Note that much of the
139 * code generation knows about this implicit relationship.)
141 * Using two bytes for the "next" pointer is vast overkill for most things,
142 * but allows patterns to get big without disasters.
144 #define OP(preg, p) (preg->program[p])
145 #define NEXT(preg, p) (preg->program[p + 1])
146 #define OPERAND(p) ((p) + 2)
149 * See regmagic.h for one further detail of program structure.
154 * Utility definitions.
157 #define FAIL(R,M) { (R)->err = (M); return (M); }
158 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
159 #define META "^$.[()|?{+*"
162 * Flags to be passed up and down.
164 #define HASWIDTH 01 /* Known never to match null string. */
165 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 04 /* Starts with * or +. */
167 #define WORST 0 /* Worst case. */
169 #define MAX_REP_COUNT 1000000
172 * Forward declarations for regcomp()'s friends.
174 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
);
175 static int regpiece(regex_t
*preg
, int *flagp
);
176 static int regbranch(regex_t
*preg
, int *flagp
);
177 static int regatom(regex_t
*preg
, int *flagp
);
178 static int regnode(regex_t
*preg
, int op
);
179 static int regnext(regex_t
*preg
, int p
);
180 static void regc(regex_t
*preg
, int b
);
181 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
);
182 static void regtail_(regex_t
*preg
, int p
, int val
, int line
);
183 static void regoptail(regex_t
*preg
, int p
, int val
);
184 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
186 static int reg_range_find(const int *string
, int c
);
187 static const char *str_find(const char *string
, int c
, int nocase
);
188 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
);
193 static void regdump(regex_t
*preg
);
194 static const char *regprop( int op
);
199 * Returns the length of the null-terminated integer sequence.
201 static int str_int_len(const int *seq
)
211 - regcomp - compile a regular expression into internal code
213 * We can't allocate space until we know how big the compiled form will be,
214 * but we can't compile it (and thus know how big it is) until we've got a
215 * place to put the code. So we cheat: we compile it twice, once with code
216 * generation turned off and size counting turned on, and once "for real".
217 * This also means that we don't allocate space until we are sure that the
218 * thing really will compile successfully, and we never have to move the
219 * code and thus invalidate pointers into it. (Note that it has to be in
220 * one piece because free() must be able to free it all.)
222 * Beware that the optimization-preparation code in here knows about some
223 * of the structure of the compiled regexp.
225 int regcomp(regex_t
*preg
, const char *exp
, int cflags
)
233 fprintf(stderr
, "Compiling: '%s'\n", exp
);
235 memset(preg
, 0, sizeof(*preg
));
238 FAIL(preg
, REG_ERR_NULL_ARGUMENT
);
240 /* First pass: determine size, legality. */
241 preg
->cflags
= cflags
;
242 preg
->regparse
= exp
;
243 /* XXX: For now, start unallocated */
244 preg
->program
= NULL
;
248 /* Allocate space. */
249 preg
->proglen
= (strlen(exp
) + 1) * 5;
250 preg
->program
= malloc(preg
->proglen
* sizeof(int));
251 if (preg
->program
== NULL
)
252 FAIL(preg
, REG_ERR_NOMEM
);
255 /* Note that since we store a magic value as the first item in the program,
256 * program offsets will never be 0
258 regc(preg
, REG_MAGIC
);
259 if (reg(preg
, 0, &flags
) == 0) {
263 /* Small enough for pointer-storage convention? */
264 if (preg
->re_nsub
>= REG_MAX_PAREN
) /* Probably could be 65535L. */
265 FAIL(preg
,REG_ERR_TOO_BIG
);
267 /* Dig out information for optimizations. */
268 preg
->regstart
= 0; /* Worst-case defaults. */
272 scan
= 1; /* First BRANCH. */
273 if (OP(preg
, regnext(preg
, scan
)) == END
) { /* Only one top-level choice. */
274 scan
= OPERAND(scan
);
276 /* Starting-point info. */
277 if (OP(preg
, scan
) == EXACTLY
) {
278 preg
->regstart
= preg
->program
[OPERAND(scan
)];
280 else if (OP(preg
, scan
) == BOL
)
284 * If there's something expensive in the r.e., find the
285 * longest literal string that must appear and make it the
286 * regmust. Resolve ties in favor of later strings, since
287 * the regstart check works with the beginning of the r.e.
288 * and avoiding duplication strengthens checking. Not a
289 * strong reason, but sufficient in the absence of others.
294 for (; scan
!= 0; scan
= regnext(preg
, scan
)) {
295 if (OP(preg
, scan
) == EXACTLY
) {
296 int plen
= str_int_len(preg
->program
+ OPERAND(scan
));
298 longest
= OPERAND(scan
);
303 preg
->regmust
= longest
;
316 - reg - regular expression, i.e. main body or parenthesized thing
318 * Caller must absorb opening parenthesis.
320 * Combining parenthesis handling with the base level of regular expression
321 * is a trifle forced, but the need to tie the tails of the branches to what
322 * follows makes it hard to avoid.
324 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
)
332 *flagp
= HASWIDTH
; /* Tentatively. */
334 /* Make an OPEN node, if parenthesized. */
336 parno
= ++preg
->re_nsub
;
337 ret
= regnode(preg
, OPEN
+parno
);
341 /* Pick up the branches, linking them together. */
342 br
= regbranch(preg
, &flags
);
346 regtail(preg
, ret
, br
); /* OPEN -> first. */
349 if (!(flags
&HASWIDTH
))
351 *flagp
|= flags
&SPSTART
;
352 while (*preg
->regparse
== '|') {
354 br
= regbranch(preg
, &flags
);
357 regtail(preg
, ret
, br
); /* BRANCH -> BRANCH. */
358 if (!(flags
&HASWIDTH
))
360 *flagp
|= flags
&SPSTART
;
363 /* Make a closing node, and hook it on the end. */
364 ender
= regnode(preg
, (paren
) ? CLOSE
+parno
: END
);
365 regtail(preg
, ret
, ender
);
367 /* Hook the tails of the branches to the closing node. */
368 for (br
= ret
; br
!= 0; br
= regnext(preg
, br
))
369 regoptail(preg
, br
, ender
);
371 /* Check for proper termination. */
372 if (paren
&& *preg
->regparse
++ != ')') {
373 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
375 } else if (!paren
&& *preg
->regparse
!= '\0') {
376 if (*preg
->regparse
== ')') {
377 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
380 preg
->err
= REG_ERR_JUNK_ON_END
;
389 - regbranch - one alternative of an | operator
391 * Implements the concatenation operator.
393 static int regbranch(regex_t
*preg
, int *flagp
)
400 *flagp
= WORST
; /* Tentatively. */
402 ret
= regnode(preg
, BRANCH
);
404 while (*preg
->regparse
!= '\0' && *preg
->regparse
!= ')' &&
405 *preg
->regparse
!= '|') {
406 latest
= regpiece(preg
, &flags
);
409 *flagp
|= flags
&HASWIDTH
;
410 if (chain
== 0) {/* First piece. */
411 *flagp
|= flags
&SPSTART
;
414 regtail(preg
, chain
, latest
);
418 if (chain
== 0) /* Loop ran zero times. */
419 (void) regnode(preg
, NOTHING
);
425 - regpiece - something followed by possible [*+?]
427 * Note that the branching code sequences used for ? and the general cases
428 * of * and + are somewhat optimized: they use the same NOTHING node as
429 * both the endmarker for their branch list and the body of the last branch.
430 * It might seem that this node could be dispensed with entirely, but the
431 * endmarker role is not redundant.
433 static int regpiece(regex_t
*preg
, int *flagp
)
443 ret
= regatom(preg
, &flags
);
447 op
= *preg
->regparse
;
453 if (!(flags
&HASWIDTH
) && op
!= '?') {
454 preg
->err
= REG_ERR_OPERAND_COULD_BE_EMPTY
;
458 /* Handle braces (counted repetition) by expansion */
462 min
= strtoul(preg
->regparse
+ 1, &end
, 10);
463 if (end
== preg
->regparse
+ 1) {
464 preg
->err
= REG_ERR_BAD_COUNT
;
471 preg
->regparse
= end
;
472 max
= strtoul(preg
->regparse
+ 1, &end
, 10);
474 preg
->err
= REG_ERR_UNMATCHED_BRACES
;
478 if (end
== preg
->regparse
+ 1) {
481 else if (max
< min
|| max
>= 100) {
482 preg
->err
= REG_ERR_BAD_COUNT
;
486 preg
->err
= REG_ERR_BAD_COUNT
;
490 preg
->regparse
= strchr(preg
->regparse
, '}');
494 max
= (op
== '?' ? 1 : MAX_REP_COUNT
);
497 if (preg
->regparse
[1] == '?') {
499 next
= reginsert(preg
, flags
& SIMPLE
? REPMIN
: REPXMIN
, 5, ret
);
502 next
= reginsert(preg
, flags
& SIMPLE
? REP
: REPX
, 5, ret
);
504 preg
->program
[ret
+ 2] = max
;
505 preg
->program
[ret
+ 3] = min
;
506 preg
->program
[ret
+ 4] = 0;
508 *flagp
= (min
) ? (WORST
|HASWIDTH
) : (WORST
|SPSTART
);
510 if (!(flags
& SIMPLE
)) {
511 int back
= regnode(preg
, BACK
);
512 regtail(preg
, back
, ret
);
513 regtail(preg
, next
, back
);
517 if (ISMULT(*preg
->regparse
)) {
518 preg
->err
= REG_ERR_NESTED_COUNT
;
522 return chain
? chain
: ret
;
526 * Add all characters in the inclusive range between lower and upper.
528 * Handles a swapped range (upper < lower).
530 static void reg_addrange(regex_t
*preg
, int lower
, int upper
)
533 reg_addrange(preg
, upper
, lower
);
535 /* Add a range as length, start */
536 regc(preg
, upper
- lower
+ 1);
541 * Add a null-terminated literal string as a set of ranges.
543 static void reg_addrange_str(regex_t
*preg
, const char *str
)
546 reg_addrange(preg
, *str
, *str
);
552 * Extracts the next unicode char from utf8.
554 * If 'upper' is set, converts the char to uppercase.
556 static int reg_utf8_tounicode_case(const char *s
, int *uc
, int upper
)
558 int l
= utf8_tounicode(s
, uc
);
560 *uc
= utf8_upper(*uc
);
566 * Converts a hex digit to decimal.
568 * Returns -1 for an invalid hex digit.
570 static int hexdigitval(int c
)
572 if (c
>= '0' && c
<= '9')
574 if (c
>= 'a' && c
<= 'f')
576 if (c
>= 'A' && c
<= 'F')
582 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
584 * Returns the number of hex digits parsed.
585 * If there are no hex digits, returns 0 and stores nothing.
587 static int parse_hex(const char *s
, int n
, int *uc
)
592 for (k
= 0; k
< n
; k
++) {
593 int c
= hexdigitval(*s
++);
597 val
= (val
<< 4) | c
;
606 * Call for chars after a backlash to decode the escape sequence.
608 * Stores the result in *ch.
610 * Returns the number of bytes consumed.
612 static int reg_decode_escape(const char *s
, int *ch
)
620 case 'b': *ch
= '\b'; break;
621 case 'e': *ch
= 27; break;
622 case 'f': *ch
= '\f'; break;
623 case 'n': *ch
= '\n'; break;
624 case 'r': *ch
= '\r'; break;
625 case 't': *ch
= '\t'; break;
626 case 'v': *ch
= '\v'; break;
628 if ((n
= parse_hex(s
, 4, ch
)) > 0) {
633 if ((n
= parse_hex(s
, 2, ch
)) > 0) {
646 - regatom - the lowest level
648 * Optimization: gobbles an entire sequence of ordinary characters so that
649 * it can turn them into a single node, which is smaller to store and
650 * faster to run. Backslashed characters are exceptions, each becoming a
651 * separate node; the code is simpler that way and it's not worth fixing.
653 static int regatom(regex_t
*preg
, int *flagp
)
657 int nocase
= (preg
->cflags
& REG_ICASE
);
660 int n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, nocase
);
662 *flagp
= WORST
; /* Tentatively. */
666 /* FIXME: these chars only have meaning at beg/end of pat? */
668 ret
= regnode(preg
, BOL
);
671 ret
= regnode(preg
, EOL
);
674 ret
= regnode(preg
, ANY
);
675 *flagp
|= HASWIDTH
|SIMPLE
;
678 const char *pattern
= preg
->regparse
;
680 if (*pattern
== '^') { /* Complement of range. */
681 ret
= regnode(preg
, ANYBUT
);
684 ret
= regnode(preg
, ANYOF
);
686 /* Special case. If the first char is ']' or '-', it is part of the set */
687 if (*pattern
== ']' || *pattern
== '-') {
688 reg_addrange(preg
, *pattern
, *pattern
);
692 while (*pattern
&& *pattern
!= ']') {
693 /* Is this a range? a-z */
697 pattern
+= reg_utf8_tounicode_case(pattern
, &start
, nocase
);
699 pattern
+= reg_decode_escape(pattern
, &start
);
701 preg
->err
= REG_ERR_NULL_CHAR
;
705 if (pattern
[0] == '-' && pattern
[1]) {
707 pattern
+= utf8_tounicode(pattern
, &end
);
708 pattern
+= reg_utf8_tounicode_case(pattern
, &end
, nocase
);
710 pattern
+= reg_decode_escape(pattern
, &end
);
712 preg
->err
= REG_ERR_NULL_CHAR
;
717 reg_addrange(preg
, start
, end
);
721 if (strncmp(pattern
, ":alpha:]", 8) == 0) {
722 if ((preg
->cflags
& REG_ICASE
) == 0) {
723 reg_addrange(preg
, 'a', 'z');
725 reg_addrange(preg
, 'A', 'Z');
729 if (strncmp(pattern
, ":alnum:]", 8) == 0) {
730 if ((preg
->cflags
& REG_ICASE
) == 0) {
731 reg_addrange(preg
, 'a', 'z');
733 reg_addrange(preg
, 'A', 'Z');
734 reg_addrange(preg
, '0', '9');
738 if (strncmp(pattern
, ":space:]", 8) == 0) {
739 reg_addrange_str(preg
, " \t\r\n\f\v");
744 /* Not a range, so just add the char */
745 reg_addrange(preg
, start
, start
);
752 preg
->regparse
= pattern
;
754 *flagp
|= HASWIDTH
|SIMPLE
;
758 ret
= reg(preg
, 1, &flags
);
761 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
766 preg
->err
= REG_ERR_INTERNAL
;
767 return 0; /* Supposed to be caught earlier. */
772 preg
->err
= REG_ERR_COUNT_FOLLOWS_NOTHING
;
775 switch (*preg
->regparse
++) {
777 preg
->err
= REG_ERR_TRAILING_BACKSLASH
;
781 ret
= regnode(preg
, WORDA
);
785 ret
= regnode(preg
, WORDZ
);
788 ret
= regnode(preg
, ANYOF
);
789 reg_addrange(preg
, '0', '9');
791 *flagp
|= HASWIDTH
|SIMPLE
;
794 ret
= regnode(preg
, ANYOF
);
795 if ((preg
->cflags
& REG_ICASE
) == 0) {
796 reg_addrange(preg
, 'a', 'z');
798 reg_addrange(preg
, 'A', 'Z');
799 reg_addrange(preg
, '0', '9');
800 reg_addrange(preg
, '_', '_');
802 *flagp
|= HASWIDTH
|SIMPLE
;
805 ret
= regnode(preg
, ANYOF
);
806 reg_addrange_str(preg
," \t\r\n\f\v");
808 *flagp
|= HASWIDTH
|SIMPLE
;
810 /* FIXME: Someday handle \1, \2, ... */
812 /* Handle general quoted chars in exact-match routine */
813 /* Back up to include the backslash */
821 * Encode a string of characters to be matched exactly.
825 /* Back up to pick up the first char of interest */
828 ret
= regnode(preg
, EXACTLY
);
830 /* Note that a META operator such as ? or * consumes the
832 * Thus we must be careful to look ahead by 2 and add the
833 * last char as it's own EXACTLY if necessary
836 /* Until end of string or a META char is reached */
837 while (*preg
->regparse
&& strchr(META
, *preg
->regparse
) == NULL
) {
838 n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, (preg
->cflags
& REG_ICASE
));
839 if (ch
== '\\' && preg
->regparse
[n
]) {
840 /* Non-trailing backslash.
841 * Is this a special escape, or a regular escape?
843 if (strchr("<>mMwds", preg
->regparse
[n
])) {
844 /* A special escape. All done with EXACTLY */
847 /* Decode it. Note that we add the length for the escape
848 * sequence to the length for the backlash so we can skip
849 * the entire sequence, or not as required.
851 n
+= reg_decode_escape(preg
->regparse
+ n
, &ch
);
853 preg
->err
= REG_ERR_NULL_CHAR
;
858 /* Now we have one char 'ch' of length 'n'.
859 * Check to see if the following char is a MULT
862 if (ISMULT(preg
->regparse
[n
])) {
863 /* Yes. But do we already have some EXACTLY chars? */
865 /* Yes, so return what we have and pick up the current char next time around */
868 /* No, so add this single char and finish */
875 /* No, so just add this char normally */
893 static void reg_grow(regex_t
*preg
, int n
)
895 if (preg
->p
+ n
>= preg
->proglen
) {
896 preg
->proglen
= (preg
->p
+ n
) * 2;
897 preg
->program
= realloc(preg
->program
, preg
->proglen
* sizeof(int));
902 - regnode - emit a node
905 static int regnode(regex_t
*preg
, int op
)
909 preg
->program
[preg
->p
++] = op
;
910 preg
->program
[preg
->p
++] = 0;
912 /* Return the start of the node */
917 - regc - emit (if appropriate) a byte of code
919 static void regc(regex_t
*preg
, int b
)
922 preg
->program
[preg
->p
++] = b
;
926 - reginsert - insert an operator in front of already-emitted operand
928 * Means relocating the operand.
929 * Returns the new location of the original operand.
931 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
)
933 reg_grow(preg
, size
);
935 /* Move everything from opnd up */
936 memmove(preg
->program
+ opnd
+ size
, preg
->program
+ opnd
, sizeof(int) * (preg
->p
- opnd
));
937 /* Zero out the new space */
938 memset(preg
->program
+ opnd
, 0, sizeof(int) * size
);
940 preg
->program
[opnd
] = op
;
948 - regtail - set the next-pointer at the end of a node chain
950 static void regtail_(regex_t
*preg
, int p
, int val
, int line
)
956 /* Find last node. */
959 temp
= regnext(preg
, scan
);
965 if (OP(preg
, scan
) == BACK
)
970 preg
->program
[scan
+ 1] = offset
;
974 - regoptail - regtail on operand of first argument; nop if operandless
977 static void regoptail(regex_t
*preg
, int p
, int val
)
979 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
980 if (p
!= 0 && OP(preg
, p
) == BRANCH
) {
981 regtail(preg
, OPERAND(p
), val
);
986 * regexec and friends
992 static int regtry(regex_t
*preg
, const char *string
);
993 static int regmatch(regex_t
*preg
, int prog
);
994 static int regrepeat(regex_t
*preg
, int p
, int max
);
997 - regexec - match a regexp against a string
999 int regexec(regex_t
*preg
, const char *string
, size_t nmatch
, regmatch_t pmatch
[], int eflags
)
1004 /* Be paranoid... */
1005 if (preg
== NULL
|| preg
->program
== NULL
|| string
== NULL
) {
1006 return REG_ERR_NULL_ARGUMENT
;
1009 /* Check validity of program. */
1010 if (*preg
->program
!= REG_MAGIC
) {
1011 return REG_ERR_CORRUPTED
;
1015 fprintf(stderr
, "regexec: %s\n", string
);
1019 preg
->eflags
= eflags
;
1020 preg
->pmatch
= pmatch
;
1021 preg
->nmatch
= nmatch
;
1022 preg
->start
= string
; /* All offsets are computed from here */
1024 /* Must clear out the embedded repeat counts */
1025 for (scan
= OPERAND(1); scan
!= 0; scan
= regnext(preg
, scan
)) {
1026 switch (OP(preg
, scan
)) {
1031 preg
->program
[scan
+ 4] = 0;
1036 /* If there is a "must appear" string, look for it. */
1037 if (preg
->regmust
!= 0) {
1039 while ((s
= str_find(s
, preg
->program
[preg
->regmust
], preg
->cflags
& REG_ICASE
)) != NULL
) {
1040 if (prefix_cmp(preg
->program
+ preg
->regmust
, preg
->regmlen
, s
, preg
->cflags
& REG_ICASE
) >= 0) {
1045 if (s
== NULL
) /* Not present. */
1049 /* Mark beginning of line for ^ . */
1050 preg
->regbol
= string
;
1052 /* Simplest case: anchored match need be tried only once (maybe per line). */
1053 if (preg
->reganch
) {
1054 if (eflags
& REG_NOTBOL
) {
1055 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1059 int ret
= regtry(preg
, string
);
1065 if (preg
->cflags
& REG_NEWLINE
) {
1066 /* Try the next anchor? */
1067 string
= strchr(string
, '\n');
1069 preg
->regbol
= ++string
;
1078 /* Messy cases: unanchored match. */
1080 if (preg
->regstart
!= '\0') {
1081 /* We know what char it must start with. */
1082 while ((s
= str_find(s
, preg
->regstart
, preg
->cflags
& REG_ICASE
)) != NULL
) {
1083 if (regtry(preg
, s
))
1089 /* We don't -- general case. */
1091 if (regtry(preg
, s
))
1096 s
+= utf8_charlen(*s
);
1104 - regtry - try match at specific point
1106 /* 0 failure, 1 success */
1107 static int regtry( regex_t
*preg
, const char *string
)
1111 preg
->reginput
= string
;
1113 for (i
= 0; i
< preg
->nmatch
; i
++) {
1114 preg
->pmatch
[i
].rm_so
= -1;
1115 preg
->pmatch
[i
].rm_eo
= -1;
1117 if (regmatch(preg
, 1)) {
1118 preg
->pmatch
[0].rm_so
= string
- preg
->start
;
1119 preg
->pmatch
[0].rm_eo
= preg
->reginput
- preg
->start
;
1126 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1128 * If 'nocase' is non-zero, does a case-insensitive match.
1130 * Returns -1 on not found.
1132 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
)
1134 const char *s
= string
;
1135 while (proglen
&& *s
) {
1137 int n
= reg_utf8_tounicode_case(s
, &ch
, nocase
);
1152 * Searchs for 'c' in the range 'range'.
1154 * Returns 1 if found, or 0 if not.
1156 static int reg_range_find(const int *range
, int c
)
1159 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1160 if (c
>= range
[1] && c
<= (range
[0] + range
[1] - 1)) {
1169 * Search for the character 'c' in the utf-8 string 'string'.
1171 * If 'nocase' is set, the 'string' is assumed to be uppercase
1172 * and 'c' is converted to uppercase before matching.
1174 * Returns the byte position in the string where the 'c' was found, or
1175 * NULL if not found.
1177 static const char *str_find(const char *string
, int c
, int nocase
)
1180 /* The "string" should already be converted to uppercase */
1185 int n
= reg_utf8_tounicode_case(string
, &ch
, nocase
);
1195 * Returns true if 'ch' is an end-of-line char.
1197 * In REG_NEWLINE mode, \n is considered EOL in
1200 static int reg_iseol(regex_t
*preg
, int ch
)
1202 if (preg
->cflags
& REG_NEWLINE
) {
1203 return ch
== '\0' || ch
== '\n';
1210 static int regmatchsimplerepeat(regex_t
*preg
, int scan
, int matchmin
)
1217 int max
= preg
->program
[scan
+ 2];
1218 int min
= preg
->program
[scan
+ 3];
1219 int next
= regnext(preg
, scan
);
1222 * Lookahead to avoid useless match attempts
1223 * when we know what character comes next.
1225 if (OP(preg
, next
) == EXACTLY
) {
1226 nextch
= preg
->program
[OPERAND(next
)];
1228 save
= preg
->reginput
;
1229 no
= regrepeat(preg
, scan
+ 5, max
);
1234 /* from min up to no */
1238 /* else from no down to min */
1250 preg
->reginput
= save
+ utf8_index(save
, no
);
1251 reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1252 /* If it could work, try it. */
1253 if (reg_iseol(preg
, nextch
) || c
== nextch
) {
1254 if (regmatch(preg
, next
)) {
1259 /* Couldn't or didn't, add one more */
1263 /* Couldn't or didn't -- back up. */
1270 static int regmatchrepeat(regex_t
*preg
, int scan
, int matchmin
)
1272 int *scanpt
= preg
->program
+ scan
;
1274 int max
= scanpt
[2];
1275 int min
= scanpt
[3];
1277 /* Have we reached min? */
1278 if (scanpt
[4] < min
) {
1279 /* No, so get another one */
1281 if (regmatch(preg
, scan
+ 5)) {
1287 if (scanpt
[4] > max
) {
1292 /* minimal, so try other branch first */
1293 if (regmatch(preg
, regnext(preg
, scan
))) {
1296 /* No, so try one more */
1298 if (regmatch(preg
, scan
+ 5)) {
1304 /* maximal, so try this branch again */
1305 if (scanpt
[4] < max
) {
1307 if (regmatch(preg
, scan
+ 5)) {
1312 /* At this point we are at max with no match. Try the other branch */
1313 return regmatch(preg
, regnext(preg
, scan
));
1317 - regmatch - main matching routine
1319 * Conceptually the strategy is simple: check to see whether the current
1320 * node matches, call self recursively to see whether the rest matches,
1321 * and then act accordingly. In practice we make some effort to avoid
1322 * recursion, in particular by going through "ordinary" nodes (that don't
1323 * need to know whether the rest of the match failed) by a loop instead of
1326 /* 0 failure, 1 success */
1327 static int regmatch(regex_t
*preg
, int prog
)
1329 int scan
; /* Current node. */
1330 int next
; /* Next node. */
1335 if (scan
!= 0 && regnarrate
)
1336 fprintf(stderr
, "%s(\n", regprop(scan
));
1343 fprintf(stderr
, "%3d: %s...\n", scan
, regprop(OP(preg
, scan
))); /* Where, what. */
1346 next
= regnext(preg
, scan
);
1347 n
= reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1349 switch (OP(preg
, scan
)) {
1351 if (preg
->reginput
!= preg
->regbol
)
1355 if (!reg_iseol(preg
, c
)) {
1360 /* Must be looking at a letter, digit, or _ */
1361 if ((!isalnum(UCHAR(c
))) && c
!= '_')
1363 /* Prev must be BOL or nonword */
1364 if (preg
->reginput
> preg
->regbol
&&
1365 (isalnum(UCHAR(preg
->reginput
[-1])) || preg
->reginput
[-1] == '_'))
1369 /* Can't match at BOL */
1370 if (preg
->reginput
> preg
->regbol
) {
1371 /* Current must be EOL or nonword */
1372 if (reg_iseol(preg
, c
) || !isalnum(UCHAR(c
)) || c
!= '_') {
1373 c
= preg
->reginput
[-1];
1374 /* Previous must be word */
1375 if (isalnum(UCHAR(c
)) || c
== '_') {
1384 if (reg_iseol(preg
, c
))
1386 preg
->reginput
+= n
;
1393 opnd
= OPERAND(scan
);
1394 len
= str_int_len(preg
->program
+ opnd
);
1396 slen
= prefix_cmp(preg
->program
+ opnd
, len
, preg
->reginput
, preg
->cflags
& REG_ICASE
);
1400 preg
->reginput
+= slen
;
1404 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) == 0) {
1407 preg
->reginput
+= n
;
1410 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) != 0) {
1413 preg
->reginput
+= n
;
1422 if (OP(preg
, next
) != BRANCH
) /* No choice. */
1423 next
= OPERAND(scan
); /* Avoid recursion. */
1426 save
= preg
->reginput
;
1427 if (regmatch(preg
, OPERAND(scan
))) {
1430 preg
->reginput
= save
;
1431 scan
= regnext(preg
, scan
);
1432 } while (scan
!= 0 && OP(preg
, scan
) == BRANCH
);
1440 return regmatchsimplerepeat(preg
, scan
, OP(preg
, scan
) == REPMIN
);
1444 return regmatchrepeat(preg
, scan
, OP(preg
, scan
) == REPXMIN
);
1447 return(1); /* Success! */
1450 if (OP(preg
, scan
) >= OPEN
+1 && OP(preg
, scan
) < CLOSE_END
) {
1453 save
= preg
->reginput
;
1455 if (regmatch(preg
, next
)) {
1458 * Don't set startp if some later
1459 * invocation of the same parentheses
1462 if (OP(preg
, scan
) < CLOSE
) {
1463 no
= OP(preg
, scan
) - OPEN
;
1464 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_so
== -1) {
1465 preg
->pmatch
[no
].rm_so
= save
- preg
->start
;
1469 no
= OP(preg
, scan
) - CLOSE
;
1470 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_eo
== -1) {
1471 preg
->pmatch
[no
].rm_eo
= save
- preg
->start
;
1478 return REG_ERR_INTERNAL
;
1485 * We get here only if there's trouble -- normally "case END" is
1486 * the terminating point.
1488 return REG_ERR_INTERNAL
;
1492 - regrepeat - repeatedly match something simple, report how many
1494 static int regrepeat(regex_t
*preg
, int p
, int max
)
1502 scan
= preg
->reginput
;
1504 switch (OP(preg
, p
)) {
1506 /* No need to handle utf8 specially here */
1507 while (!reg_iseol(preg
, *scan
) && count
< max
) {
1513 while (count
< max
) {
1514 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1515 if (preg
->program
[opnd
] != ch
) {
1523 while (count
< max
) {
1524 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1525 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) == 0) {
1533 while (count
< max
) {
1534 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1535 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) != 0) {
1542 default: /* Oh dear. Called inappropriately. */
1543 preg
->err
= REG_ERR_INTERNAL
;
1544 count
= 0; /* Best compromise. */
1547 preg
->reginput
= scan
;
1553 - regnext - dig the "next" pointer out of a node
1555 static int regnext(regex_t
*preg
, int p
)
1559 offset
= NEXT(preg
, p
);
1564 if (OP(preg
, p
) == BACK
)
1570 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1573 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1575 static void regdump(regex_t
*preg
)
1578 int op
= EXACTLY
; /* Arbitrary non-END op. */
1583 for (i
= 1; i
< preg
->p
; i
++) {
1584 printf("%02x ", preg
->program
[i
]);
1592 while (op
!= END
&& s
< preg
->p
) { /* While that wasn't END last time... */
1594 printf("%3d: %s", s
, regprop(op
)); /* Where, what. */
1595 next
= regnext(preg
, s
);
1596 if (next
== 0) /* Next ptr. */
1599 printf("(%d)", next
);
1601 if (op
== REP
|| op
== REPMIN
|| op
== REPX
|| op
== REPXMIN
) {
1602 int max
= preg
->program
[s
];
1603 int min
= preg
->program
[s
+ 1];
1605 printf("{%d,*}", min
);
1608 printf("{%d,%d}", min
, max
);
1610 printf(" %d", preg
->program
[s
+ 2]);
1613 else if (op
== ANYOF
|| op
== ANYBUT
) {
1616 while (preg
->program
[s
]) {
1617 int len
= preg
->program
[s
++];
1618 int first
= preg
->program
[s
++];
1619 buf
[utf8_fromunicode(buf
, first
)] = 0;
1622 buf
[utf8_fromunicode(buf
, first
+ len
- 1)] = 0;
1628 else if (op
== EXACTLY
) {
1629 /* Literal string, where present. */
1631 while (preg
->program
[s
]) {
1632 buf
[utf8_fromunicode(buf
, preg
->program
[s
])] = 0;
1642 /* Header fields of interest. */
1643 if (preg
->regstart
) {
1644 buf
[utf8_fromunicode(buf
, preg
->regstart
)] = 0;
1645 printf("start '%s' ", buf
);
1648 printf("anchored ");
1649 if (preg
->regmust
!= 0) {
1651 printf("must have:");
1652 for (i
= 0; i
< preg
->regmlen
; i
++) {
1653 putchar(preg
->program
[preg
->regmust
+ i
]);
1662 - regprop - printable representation of opcode
1664 static const char *regprop( int op
)
1666 static char buf
[50];
1702 if (op
>= OPEN
&& op
< CLOSE
) {
1703 snprintf(buf
, sizeof(buf
), "OPEN%d", op
-OPEN
);
1705 else if (op
>= CLOSE
&& op
< CLOSE_END
) {
1706 snprintf(buf
, sizeof(buf
), "CLOSE%d", op
-CLOSE
);
1709 snprintf(buf
, sizeof(buf
), "?%d?\n", op
);
1714 #endif /* JIM_BOOTSTRAP */
1716 size_t regerror(int errcode
, const regex_t
*preg
, char *errbuf
, size_t errbuf_size
)
1718 static const char *error_strings
[] = {
1727 "parentheses () not balanced",
1728 "braces {} not balanced",
1729 "invalid repetition count(s)",
1734 "count follows nothing",
1735 "trailing backslash",
1736 "corrupted program",
1737 "contains null char",
1741 if (errcode
< 0 || errcode
>= REG_ERR_NUM
) {
1742 err
= "Bad error code";
1745 err
= error_strings
[errcode
];
1748 return snprintf(errbuf
, errbuf_size
, "%s", err
);
1751 void regfree(regex_t
*preg
)
1753 free(preg
->program
);