3 /* This file contains the code that compiles regular expressions and executes
4 * them. It supports the same syntax and features as vi's regular expression
5 * code. Specifically, the meta characters are:
6 * ^ matches the beginning of a line
7 * $ matches the end of a line
8 * \< matches the beginning of a word
9 * \> matches the end of a word
10 * . matches any single character
11 * [] matches any character in a character class
12 * \( delimits the start of a subexpression
13 * \) delimits the end of a subexpression
14 * * repeats the preceding 0 or more times
15 * NOTE: You cannot follow a \) with a *.
17 * The physical structure of a compiled RE is as follows:
18 * - First, there is a one-byte value that says how many character classes
19 * are used in this regular expression
20 * - Next, each character class is stored as a bitmap that is 256 bits
22 * - A mixture of literal characters and compiled meta characters follows.
23 * This begins with M_BEGIN(0) and ends with M_END(0). All meta chars
24 * are stored as a \n followed by a one-byte code, so they take up two
25 * bytes apiece. Literal characters take up one byte apiece. \n can't
26 * be used as a literal character.
28 * If NO_MAGIC is defined, then a different set of functions is used instead.
29 * That right, this file contains TWO versions of the code.
40 static char *previous
; /* the previous regexp, used when null regexp is given */
44 /* THE REAL REGEXP PACKAGE IS USED UNLESS "NO_MAGIC" IS DEFINED */
46 /* These are used to classify or recognize meta-characters */
48 #define BASE_META(m) ((m) - 256)
49 #define INT_META(c) ((c) + 256)
50 #define IS_META(m) ((m) >= 256)
51 #define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9))
52 #define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9))
53 #define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9))
54 #define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_RANGE)
55 #define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m))
56 #define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s)
58 /* These are the internal codes used for each type of meta-character */
59 #define M_BEGLINE 256 /* internal code for ^ */
60 #define M_ENDLINE 257 /* internal code for $ */
61 #define M_BEGWORD 258 /* internal code for \< */
62 #define M_ENDWORD 259 /* internal code for \> */
63 #define M_ANY 260 /* internal code for . */
64 #define M_SPLAT 261 /* internal code for * */
65 #define M_PLUS 262 /* internal code for \+ */
66 #define M_QMARK 263 /* internal code for \? */
67 #define M_RANGE 264 /* internal code for \{ */
68 #define M_CLASS(n) (265+(n)) /* internal code for [] */
69 #define M_START(n) (275+(n)) /* internal code for \( */
70 #define M_END(n) (285+(n)) /* internal code for \) */
72 /* These are used during compilation */
73 static int class_cnt
; /* used to assign class IDs */
74 static int start_cnt
; /* used to assign start IDs */
75 static int end_stk
[NSUBEXP
];/* used to assign end IDs */
77 static char *retext
; /* points to the text being compiled */
79 /* error-handling stuff */
81 #define FAIL(why) regerror(why); longjmp(errorhandler, 1)
87 /* This function builds a bitmap for a particular class */
88 static char *makeclass(text
, bmap
)
89 REG
char *text
; /* start of the class */
90 REG
char *bmap
; /* the bitmap */
97 for (i
= 0; bmap
&& i
< 32; i
++)
102 /* see if we're going to complement this class */
109 /* add in the characters */
110 while (*text
&& *text
!= ']')
112 /* is this a span of characters? */
113 if (text
[1] == '-' && text
[2])
115 /* spans can't be backwards */
116 if (text
[0] > text
[2])
118 FAIL("Backwards span in []");
121 /* add each character in the span to the bitmap */
122 for (i
= text
[0]; bmap
&& i
<= text
[2]; i
++)
124 bmap
[i
>> 3] |= (1 << (i
& 7));
127 /* move past this span */
132 /* add this single character to the span */
136 bmap
[i
>> 3] |= (1 << (i
& 7));
141 /* make sure the closing ] is missing */
147 /* if we're supposed to complement this class, then do so */
148 if (complement
&& bmap
)
150 for (i
= 0; i
< 32; i
++)
162 /* This function gets the next character or meta character from a string.
163 * The pointer is incremented by 1, or by 2 for \-quoted characters. For [],
164 * a bitmap is generated via makeclass() (if re is given), and the
165 * character-class text is skipped.
167 static int gettoken(sptr
, re
)
188 if (start_cnt
>= NSUBEXP
)
190 FAIL("Too many \\(s");
192 end_stk
[end_sp
++] = start_cnt
;
193 return M_START(start_cnt
++);
198 FAIL("Mismatched \\)");
200 return M_END(end_stk
[--end_sp
]);
203 return (*o_magic
? c
: M_SPLAT
);
206 return (*o_magic
? c
: M_ANY
);
226 if (*sptr
== retext
+ 1)
246 /* make sure we don't have too many classes */
249 FAIL("Too many []s");
252 /* process the character list for this class */
255 /* generate the bitmap for this class */
256 *sptr
= makeclass(*sptr
, re
->program
+ 1 + 32 * class_cnt
);
260 /* skip to end of the class */
261 *sptr
= makeclass(*sptr
, (char *)0);
263 return M_CLASS(class_cnt
++);
269 else /* unquoted nomagic */
274 if (*sptr
== retext
+ 1)
297 /* This function calculates the number of bytes that will be needed for a
298 * compiled RE. Its argument is the uncompiled version. It is not clever
299 * about catching syntax errors; that is done in a later pass.
301 static unsigned calcsize(text
)
312 while ((token
= gettoken(&text
, (regexp
*)0)) != 0)
319 else if (token
== M_RANGE
)
322 while ((token
= gettoken(&text
, (regexp
*)0)) != 0
332 else if (IS_META(token
))
347 /* This function compiles a regexp. */
364 /* prepare for error handling */
366 if (setjmp(errorhandler
))
375 /* if an empty regexp string was given, use the previous one */
380 FAIL("No previous RE");
384 else /* non-empty regexp given, so remember it */
388 previous
= (char *)malloc((unsigned)(strlen(exp
) + 1));
390 strcpy(previous
, exp
);
393 /* allocate memory */
398 size
= calcsize(exp
) + sizeof(regexp
) + 10; /* !!! 10 bytes for slop */
400 re
= ((regexp
*)0) + size
;
402 re
= (regexp
*)malloc((unsigned)size
);
406 FAIL("Not enough memory for this RE");
410 build
= &re
->program
[1 + 32 * class_cnt
];
411 re
->program
[0] = class_cnt
;
412 for (token
= 0; token
< NSUBEXP
; token
++)
414 re
->startp
[token
] = re
->endp
[token
] = (char *)0;
424 for (token
= M_START(0), peek
= gettoken(&exp
, re
);
426 token
= peek
, peek
= gettoken(&exp
, re
))
428 /* special processing for the closure operator */
429 if (IS_CLOSURE(peek
))
431 /* detect misuse of closure operator */
434 FAIL("Closure operator follows nothing");
436 else if (IS_META(token
) && token
!= M_ANY
&& !IS_CLASS(token
))
438 FAIL("Closure operators can only follow a normal character or . or []");
442 /* if \{ \} then read the range */
446 for (digit
= gettoken(&exp
, re
);
447 !IS_META(digit
) && isdigit(digit
);
448 digit
= gettoken(&exp
, re
))
450 from
= from
* 10 + digit
- '0';
456 else if (digit
== ',')
459 for (digit
= gettoken(&exp
, re
);
460 !IS_META(digit
) && isdigit(digit
);
461 digit
= gettoken(&exp
, re
))
463 to
= to
* 10 + digit
- '0';
472 FAIL("Bad characters after \\{");
474 else if (to
< from
|| to
== 0 || from
>= 255)
476 FAIL("Invalid range for \\{ \\}");
487 /* it is okay -- make it prefix instead of postfix */
488 ADD_META(build
, peek
);
493 *build
++ = (to
< 255 ? to
: 255);
498 /* take care of "needfirst" - is this the first char? */
499 if (needfirst
&& peek
== M_PLUS
&& !IS_META(token
))
505 /* we used "peek" -- need to refill it */
506 peek
= gettoken(&exp
, re
);
507 if (IS_CLOSURE(peek
))
509 FAIL("* or \\+ or \\? doubled up");
512 else if (!IS_META(token
))
514 /* normal char is NOT argument of closure */
522 else if (token
== M_ANY
|| IS_CLASS(token
))
524 /* . or [] is NOT argument of closure */
529 /* the "token" character is not closure -- process it normally */
530 if (token
== M_BEGLINE
)
532 /* set the BOL flag instead of storing M_BEGLINE */
535 else if (IS_META(token
))
537 ADD_META(build
, token
);
545 /* end it with a \) which MUST MATCH the opening \( */
546 ADD_META(build
, M_END(0));
549 FAIL("Not enough \\)s");
557 /*---------------------------------------------------------------------------*/
560 /* This function checks for a match between a character and a token which is
561 * known to represent a single character. It returns 0 if they match, or
564 int match1(re
, ch
, token
)
571 /* the end of a line can't match any RE of width 1 */
578 else if (IS_CLASS(token
))
580 if (re
->program
[1 + 32 * (token
- M_CLASS(0)) + (ch
>> 3)] & (1 << (ch
& 7)))
583 else if (ch
== token
|| *o_ignorecase
&& tolower(ch
) == tolower(token
))
592 /* This function checks characters up to and including the next closure, at
593 * which point it does a recursive call to check the rest of it. This function
594 * returns 0 if everything matches, or 1 if something doesn't match.
596 int match(re
, str
, prog
, here
)
597 regexp
*re
; /* the regular expression */
598 char *str
; /* the string */
599 REG
char *prog
; /* a portion of re->program, an compiled RE */
600 REG
char *here
; /* a portion of str, the string to compare it to */
602 REG
int token
; /* the roken pointed to by prog */
603 REG
int nmatched
;/* counter, used during closure matching */
604 REG
int closure
;/* the token denoting the type of closure */
605 int from
; /* minimum number of matches in closure */
606 int to
; /* maximum number of matches in closure */
608 for (token
= GET_META(prog
); !IS_CLOSURE(token
); prog
++, token
= GET_META(prog
))
612 /*case M_BEGLINE: can't happen; re->bol is used instead */
620 (here
[-1] == '_' || isalnum(here
[-1])))
625 if (here
[0] == '_' || isalnum(here
[0]))
639 re
->startp
[token
- M_START(0)] = (char *)here
;
652 re
->endp
[token
- M_END(0)] = (char *)here
;
653 if (token
== M_END(0))
659 default: /* literal, M_CLASS(n), or M_ANY */
660 if (match1(re
, *here
, token
) != 0)
668 /* step 1: see what we have to match against, and move "prog" to point
669 * to the remainder of the compiled RE.
677 to
= strlen(str
); /* infinity */
682 to
= strlen(str
); /* infinity */
692 from
= UCHAR(*prog
++);
696 to
= strlen(str
); /* infinity */
701 token
= GET_META(prog
);
704 /* step 2: see how many times we can match that token against the string */
706 nmatched
< to
&& *here
&& match1(re
, *here
, token
) == 0;
711 /* step 3: try to match the remainder, and back off if it doesn't */
712 while (nmatched
>= from
&& match(re
, str
, prog
, here
) != 0)
718 /* so how did it work out? */
719 if (nmatched
>= from
)
726 /* This function searches through a string for text that matches an RE. */
727 int regexec(re
, str
, bol
)
728 regexp
*re
; /* the compiled regexp to search for */
729 char *str
; /* the string to search through */
730 int bol
; /* boolean: does str start at the beginning of a line? */
732 char *prog
; /* the entry point of re->program */
733 int len
; /* length of the string */
736 /* if must start at the beginning of a line, and this isn't, then fail */
743 prog
= re
->program
+ 1 + 32 * re
->program
[0];
745 /* search for the RE in the string */
748 /* must occur at BOL */
750 && match1(re
, *(char *)str
, re
->first
))/* wrong first letter? */
751 || len
< re
->minlen
/* not long enough? */
752 || match(re
, (char *)str
, prog
, str
)) /* doesn't match? */
753 return 0; /* THEN FAIL! */
756 else if (!*o_ignorecase
)
758 /* can occur anywhere in the line, noignorecase */
759 for (here
= (char *)str
;
760 (re
->first
&& re
->first
!= *here
)
761 || match(re
, (char *)str
, prog
, here
);
764 if (len
< re
->minlen
)
771 /* can occur anywhere in the line, ignorecase */
772 for (here
= (char *)str
;
773 (re
->first
&& match1(re
, *here
, (int)re
->first
))
774 || match(re
, (char *)str
, prog
, here
);
777 if (len
< re
->minlen
)
782 /* if we didn't fail, then we must have succeeded */
786 /*============================================================================*/
797 /* allocate a big enough regexp structure */
801 re
= (regexp
*)malloc((unsigned)(strlen(exp
) + 1 + sizeof(struct regexp
)));
805 regerror("Could not malloc a regexp structure");
809 /* initialize all fields of the structure */
810 for (i
= 0; i
< NSUBEXP
; i
++)
812 re
->startp
[i
] = re
->endp
[i
] = (char *)0;
818 /* copy the string into it, translating ^ and $ as needed */
819 for (src
= exp
, dest
= re
->program
+ 1; *src
; src
++)
855 regerror("extra \\ at end of regular expression");
870 /* This "helper" function checks for a match at a given location. It returns
871 * 1 if it matches, 0 if it doesn't match here but might match later on in the
872 * string, or -1 if it could not possibly match
874 static int reghelp(prog
, string
, bolflag
)
882 /* if ^, then require bolflag */
883 if ((prog
->bol
& 1) && !bolflag
)
888 /* if it matches, then it will start here */
889 prog
->startp
[0] = string
;
891 /* compare, possibly ignoring case */
894 for (scan
= &prog
->program
[1]; *scan
; scan
++, string
++)
895 if (tolower(*scan
) != tolower(*string
))
896 return *string
? 0 : -1;
900 for (scan
= &prog
->program
[1]; *scan
; scan
++, string
++)
901 if (*scan
!= *string
)
902 return *string
? 0 : -1;
905 /* if $, then require string to end here, too */
906 if ((prog
->bol
& 2) && *string
)
911 /* if we get to here, it matches */
912 prog
->endp
[0] = string
;
918 int regexec(prog
, string
, bolflag
)
925 /* keep trying to match it */
926 for (rc
= reghelp(prog
, string
, bolflag
); rc
== 0; rc
= reghelp(prog
, string
, 0))