4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 1987 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #pragma ident "%Z%%M% %I% %E% SMI"
30 * routines to do regular expression matching
36 * ... returns 0 if the string s was compiled successfully,
37 * a pointer to an error message otherwise.
38 * If passed 0 or a null string returns without changing
39 * the currently compiled re (see note 11 below).
43 * ... returns 1 if the string s matches the last compiled regular
45 * 0 if the string s failed to match the last compiled
46 * regular expression, and
47 * -1 if the compiled regular expression was invalid
48 * (indicating an internal error).
50 * The strings passed to both re_comp and re_exec may have trailing or
51 * embedded newline characters; they are terminated by nulls.
53 * The identity of the author of these routines is lost in antiquity;
54 * this is essentially the same as the re code in the original V6 ed.
56 * The regular expressions recognized are described below. This description
57 * is essentially the same as that for ed.
59 * A regular expression specifies a set of strings of characters.
60 * A member of this set of strings is said to be matched by
61 * the regular expression. In the following specification for
62 * regular expressions the word `character' means any character but NUL.
64 * 1. Any character except a special character matches itself.
65 * Special characters are the regular expression delimiter plus
66 * \ [ . and sometimes ^ * $.
67 * 2. A . matches any character.
68 * 3. A \ followed by any character except a digit or ( )
69 * matches that character.
70 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
71 * character in (or not in) s. In s, \ has no special meaning,
72 * and ] may only appear as the first letter. A substring
73 * a-b, with a and b in ascending ASCII order, stands for
74 * the inclusive range of ASCII characters.
75 * 5. A regular expression of form 1-4 followed by * matches a
76 * sequence of 0 or more matches of the regular expression.
77 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
78 * matches what x matches.
79 * 7. A \ followed by a digit n matches a copy of the string that the
80 * bracketed regular expression beginning with the nth \( matched.
81 * 8. A regular expression of form 1-8, x, followed by a regular
82 * expression of form 1-7, y matches a match for x followed by
83 * a match for y, with the x match being as long as possible
84 * while still permitting a y match.
85 * 9. A regular expression of form 1-8 preceded by ^ (or followed
86 * by $), is constrained to matches that begin at the left
87 * (or end at the right) end of a line.
88 * 10. A regular expression of form 1-9 picks out the longest among
89 * the leftmost matches in a line.
90 * 11. An empty regular expression stands for a copy of the last
91 * regular expression encountered.
112 static struct re_globals
{
114 char *_braslist
[NBRA
], *_braelist
[NBRA
];
117 #define expbuf (_re->_expbuf)
118 #define braslist (_re->_braslist)
119 #define braelist (_re->_braelist)
120 #define circf (_re->_circf)
122 static int advance(char *, char *);
123 static int backref(int, char *);
124 static int cclass(char *, char, int);
127 * compile the regular expression argument into a dfa
133 struct re_globals
*_re
= re_globals
;
135 int cclcnt
, numbra
= 0;
138 char *bracketp
= &bracket
[0];
139 char *retoolong
= "Regular expression too long";
142 _re
= (struct re_globals
*)calloc(1, sizeof (*_re
));
144 return ("Out of memory");
149 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
151 if (sp
== 0 || *sp
== '\0') {
153 return("No previous regular expression");
163 if (ep
>= &expbuf
[ESIZE
])
165 if ((c
= *sp
++) == '\0') {
166 if (bracketp
!= bracket
)
167 comerr("unmatched \\(");
181 if (lastep
== 0 || *lastep
== CBRA
|| *lastep
== CKET
)
196 if ((c
= *sp
++) == '^') {
203 if (c
== '-' && ep
[-1] != 0) {
204 if ((c
= *sp
++) == ']') {
213 if (ep
>= &expbuf
[ESIZE
])
219 if (ep
>= &expbuf
[ESIZE
])
221 } while ((c
= *sp
++) != ']');
226 if ((c
= *sp
++) == '(') {
228 comerr("too many \\(\\) pairs");
229 *bracketp
++ = numbra
;
235 if (bracketp
<= bracket
)
236 comerr("unmatched \\)");
241 if (c
>= '1' && c
< ('1' + NBRA
)) {
259 * match the argument string against the compiled re
264 struct re_globals
*_re
= re_globals
;
272 for (c
= 0; c
< NBRA
; c
++) {
277 return((advance(p1
, p2
)));
279 * fast check for first character
286 if (rv
= advance(p1
, p2
))
295 if (rv
= advance(p1
, p2
))
302 * try to match the next thing in the dfa
305 advance(char *lp
, char *ep
)
310 struct re_globals
*_re
= re_globals
;
334 if (cclass(ep
, *lp
++, 1)) {
341 if (cclass(ep
, *lp
++, 0)) {
348 braslist
[*ep
++] = lp
;
352 braelist
[*ep
++] = lp
;
356 if (braelist
[i
= *ep
++] == 0)
358 if (backref(i
, lp
)) {
359 lp
+= braelist
[i
] - braslist
[i
];
365 if (braelist
[i
= *ep
++] == 0)
368 ct
= braelist
[i
] - braslist
[i
];
369 while (backref(i
, lp
))
371 while (lp
>= curlp
) {
372 if (rv
= advance(lp
, ep
))
394 while (cclass(ep
, *lp
++, ep
[-1] == (CCL
|CSTAR
)))
402 if (rv
= advance(lp
, ep
))
404 } while (lp
> curlp
);
413 backref(int i
, char *lp
)
416 struct re_globals
*_re
= re_globals
;
419 while (*bp
++ == *lp
++)
420 if (bp
>= braelist
[i
])
426 cclass(char *set
, char c
, int af
)