1 /* $NetBSD: regexp.c,v 1.10 2006/05/01 05:14:22 christos Exp $ */
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 #include <sys/cdefs.h>
35 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\
36 The Regents of the University of California. All rights reserved.");
41 static char sccsid
[] = "@(#)regexp.c 8.1 (Berkeley) 6/6/93";
43 __RCSID("$NetBSD: regexp.c,v 1.10 2006/05/01 05:14:22 christos Exp $");
56 static void expconv
__P((void));
58 boolean x_escaped
; /* true if we are currently x_escaped */
59 char *x_start
; /* start of string */
60 boolean l_onecase
; /* true if upper and lower equivalent */
62 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
64 /* STRNCMP - like strncmp except that we convert the
65 * first string to lower case before comparing
66 * if l_onecase is set.
76 if (*s2
- makelower(*s1
))
77 return (*s2
- makelower(*s1
));
96 /* The following routine converts an irregular expression to
99 * Either meta symbols (\a \d or \p) or character strings or
100 * operations ( alternation or perenthesizing ) can be
101 * specified. Each starts with a descriptor byte. The descriptor
102 * byte has STR set for strings, META set for meta symbols
103 * and OPER set for operations.
104 * The descriptor byte can also have the OPT bit set if the object
105 * defined is optional. Also ALT can be set to indicate an alternation.
107 * For metasymbols the byte following the descriptor byte identities
108 * the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
109 * strings the byte after the descriptor is a character count for
112 * meta symbols := descriptor
115 * strings := descriptor
119 * operatins := descriptor
125 * handy macros for accessing parts of match blocks
127 #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
128 #define MNEXT(A) (A+2) /* character following a metasymbol block */
130 #define OSYM(A) (*(A+1)) /* symbol in an operation block */
131 #define OCNT(A) (*(A+2)) /* character count */
132 #define ONEXT(A) (A+3) /* next character after the operation */
133 #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
135 #define SCNT(A) (*(A+1)) /* byte count of a string */
136 #define SSTR(A) (A+2) /* address of the string */
137 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
140 * bit flags in the descriptor
148 static char *ccre
; /* pointer to current position in converted exp*/
149 static char *ure
; /* pointer current position in unconverted exp */
153 char *re
; /* unconverted irregular expression */
155 char *cre
; /* pointer to converted regular expression */
157 /* allocate room for the converted expression */
162 cre
= malloc(4 * strlen(re
) + 3);
166 /* start the conversion with a \a */
171 /* start the conversion (its recursive) */
180 char *cs
; /* pointer to current symbol in converted exp */
181 char c
; /* character being processed */
182 char *acs
; /* pinter to last alternate */
185 /* let the conversion begin */
188 while (*ure
!= NIL
) {
189 switch (c
= *ure
++) {
192 switch (c
= *ure
++) {
194 /* escaped characters are just characters */
196 if (cs
== NIL
|| (*cs
& STR
) == 0) {
206 /* normal(?) metacharacters */
211 if (acs
!= NIL
&& acs
!= cs
) {
214 OCNT(acs
) = ccre
- acs
;
227 /* just put the symbol in */
230 if (acs
!= NIL
&& acs
!= cs
) {
233 OCNT(acs
) = ccre
- acs
;
244 /* mark the last match sequence as optional */
250 /* recurse and define a subexpression */
252 if (acs
!= NIL
&& acs
!= cs
) {
255 OCNT(acs
) = ccre
- acs
;
265 OCNT(cs
) = ccre
- cs
; /* offset to next symbol */
268 /* reurn from a recursion */
273 OCNT(acs
) = ccre
- acs
;
284 /* mark the last match sequence as having an alternate */
285 /* the third byte will contain an offset to jump over the */
286 /* alternate match in case the first did not fail */
288 if (acs
!= NIL
&& acs
!= cs
)
289 OCNT(ccre
) = ccre
- acs
; /* make a back pointer */
298 acs
= cs
; /* remember that the pointer is to be filles */
301 /* if its not a metasymbol just build a scharacter string */
303 if (cs
== NIL
|| (*cs
& STR
) == 0) {
317 OCNT(acs
) = ccre
- acs
;
324 /* end of convertre */
328 * The following routine recognises an irregular expresion
329 * with the following special characters:
331 * \? - means last match was optional
332 * \a - matches any number of characters
333 * \d - matches any number of spaces and tabs
334 * \p - matches any number of alphanumeric
336 * characters matched will be copied into
337 * the area pointed to by 'name'.
339 * \( \) - grouping used mostly for alternation and
342 * The irregular expression must be translated to internal form
343 * prior to calling this routine
345 * The value returned is the pointer to the first non \a
350 expmatch(s
, re
, mstring
)
351 char *s
; /* string to check for a match in */
352 char *re
; /* a converted irregular expression */
353 char *mstring
; /* where to put whatever matches a \p */
355 char *cs
; /* the current symbol */
356 char *ptr
,*s1
; /* temporary pointer */
357 boolean matched
; /* a temporary boolean */
359 /* initial conditions */
365 /* loop till expression string is exhausted (or at least pretty tired) */
367 switch (*cs
& (OPER
| STR
| META
)) {
369 /* try to match a string */
371 matched
= !STRNCMP (s
, SSTR(cs
), SCNT(cs
));
374 /* hoorah it matches */
377 } else if (*cs
& ALT
) {
379 /* alternation, skip to next expression */
381 } else if (*cs
& OPT
) {
383 /* the match is optional */
385 matched
= 1; /* indicate a successful match */
388 /* no match, error return */
393 /* an operator, do something fancy */
397 /* this is an alternation */
401 /* last thing in the alternation was a match, skip ahead */
405 /* no match, keep trying */
409 /* this is a grouping, recurse */
411 ptr
= expmatch(s
, ONEXT(cs
), mstring
);
414 /* the subexpression matched */
417 } else if (*cs
& ALT
) {
419 /* alternation, skip to next expression */
421 } else if (*cs
& OPT
) {
423 /* the match is optional */
424 matched
= 1; /* indicate a successful match */
427 /* no match, error return */
435 /* try to match a metasymbol */
439 /* try to match anything and remember what was matched */
442 * This is really the same as trying the match the
443 * remaining parts of the expression to any subset
448 ptr
= expmatch(s1
, MNEXT(cs
), mstring
);
449 if (ptr
!= NIL
&& s1
!= s
) {
451 /* we have a match, remember the match */
452 strncpy(mstring
, s
, s1
- s
);
453 mstring
[s1
- s
] = '\0';
455 } else if (ptr
!= NIL
&& (*cs
& OPT
)) {
457 /* it was aoptional so no match is ok */
459 } else if (ptr
!= NIL
) {
461 /* not optional and we still matched */
464 if (!isalnum((unsigned char)*s1
) && *s1
!= '_')
467 x_escaped
= x_escaped
? FALSE
: TRUE
;
473 /* try to match anything */
476 * This is really the same as trying the match the
477 * remaining parts of the expression to any subset
482 ptr
= expmatch(s1
, MNEXT(cs
), mstring
);
483 if (ptr
!= NIL
&& s1
!= s
) {
485 /* we have a match */
487 } else if (ptr
!= NIL
&& (*cs
& OPT
)) {
489 /* it was aoptional so no match is ok */
491 } else if (ptr
!= NIL
) {
493 /* not optional and we still matched */
497 x_escaped
= x_escaped
? FALSE
: TRUE
;
503 /* fail if we are currently x_escaped */
510 /* match any number of tabs and spaces */
513 while (*s
== ' ' || *s
== '\t')
515 if (s
!= ptr
|| s
== x_start
) {
517 /* match, be happy */
520 } else if (*s
== '\n' || *s
== '\0') {
522 /* match, be happy */
525 } else if (*cs
& ALT
) {
527 /* try the next part */
530 } else if (*cs
& OPT
) {
537 /* no match, error return */
541 /* check for end of line */
543 if (*s
== '\0' || *s
== '\n') {
545 /* match, be happy */
549 } else if (*cs
& ALT
) {
551 /* try the next part */
554 } else if (*cs
& OPT
) {
561 /* no match, error return */
565 /* check for start of line */
569 /* match, be happy */
572 } else if (*cs
& ALT
) {
574 /* try the next part */
577 } else if (*cs
& OPT
) {
584 /* no match, error return */
588 /* end of a subexpression, return success */