8322 nl: misleading-indentation
[unleashed/tickless.git] / usr / src / lib / libbc / libc / gen / common / regex.c
blobe6bd73361aa9b2de573cb6f0e5c2b771c590ced2
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
20 * CDDL HEADER END
23 * Copyright 1987 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #pragma ident "%Z%%M% %I% %E% SMI"
30 * routines to do regular expression matching
32 * Entry points:
34 * re_comp(s)
35 * char *s;
36 * ... returns 0 if the string s was compiled successfully,
37 * a pointer to an error message otherwise.
38 * If passed 0 or a null string returns without changing
39 * the currently compiled re (see note 11 below).
41 * re_exec(s)
42 * char *s;
43 * ... returns 1 if the string s matches the last compiled regular
44 * expression,
45 * 0 if the string s failed to match the last compiled
46 * regular expression, and
47 * -1 if the compiled regular expression was invalid
48 * (indicating an internal error).
50 * The strings passed to both re_comp and re_exec may have trailing or
51 * embedded newline characters; they are terminated by nulls.
53 * The identity of the author of these routines is lost in antiquity;
54 * this is essentially the same as the re code in the original V6 ed.
56 * The regular expressions recognized are described below. This description
57 * is essentially the same as that for ed.
59 * A regular expression specifies a set of strings of characters.
60 * A member of this set of strings is said to be matched by
61 * the regular expression. In the following specification for
62 * regular expressions the word `character' means any character but NUL.
64 * 1. Any character except a special character matches itself.
65 * Special characters are the regular expression delimiter plus
66 * \ [ . and sometimes ^ * $.
67 * 2. A . matches any character.
68 * 3. A \ followed by any character except a digit or ( )
69 * matches that character.
70 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
71 * character in (or not in) s. In s, \ has no special meaning,
72 * and ] may only appear as the first letter. A substring
73 * a-b, with a and b in ascending ASCII order, stands for
74 * the inclusive range of ASCII characters.
75 * 5. A regular expression of form 1-4 followed by * matches a
76 * sequence of 0 or more matches of the regular expression.
77 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
78 * matches what x matches.
79 * 7. A \ followed by a digit n matches a copy of the string that the
80 * bracketed regular expression beginning with the nth \( matched.
81 * 8. A regular expression of form 1-8, x, followed by a regular
82 * expression of form 1-7, y matches a match for x followed by
83 * a match for y, with the x match being as long as possible
84 * while still permitting a y match.
85 * 9. A regular expression of form 1-8 preceded by ^ (or followed
86 * by $), is constrained to matches that begin at the left
87 * (or end at the right) end of a line.
88 * 10. A regular expression of form 1-9 picks out the longest among
89 * the leftmost matches in a line.
90 * 11. An empty regular expression stands for a copy of the last
91 * regular expression encountered.
95 * constants for re's
97 #define CBRA 1
98 #define CCHR 2
99 #define CDOT 4
100 #define CCL 6
101 #define NCCL 8
102 #define CDOL 10
103 #define CEOF 11
104 #define CKET 12
105 #define CBACK 18
107 #define CSTAR 01
109 #define ESIZE 512
110 #define NBRA 9
112 static struct re_globals {
113 char _expbuf[ESIZE];
114 char *_braslist[NBRA], *_braelist[NBRA];
115 char _circf;
116 } *re_globals;
117 #define expbuf (_re->_expbuf)
118 #define braslist (_re->_braslist)
119 #define braelist (_re->_braelist)
120 #define circf (_re->_circf)
122 static int advance(char *, char *);
123 static int backref(int, char *);
124 static int cclass(char *, char, int);
127 * compile the regular expression argument into a dfa
129 char *
130 re_comp(char *sp)
132 int c;
133 struct re_globals *_re = re_globals;
134 char *ep;
135 int cclcnt, numbra = 0;
136 char *lastep = 0;
137 char bracket[NBRA];
138 char *bracketp = &bracket[0];
139 char *retoolong = "Regular expression too long";
141 if (_re == 0) {
142 _re = (struct re_globals *)calloc(1, sizeof (*_re));
143 if (_re == 0)
144 return ("Out of memory");
145 re_globals = _re;
147 ep = expbuf;
149 #define comerr(msg) {expbuf[0] = 0; numbra = 0; return(msg); }
151 if (sp == 0 || *sp == '\0') {
152 if (*ep == 0)
153 return("No previous regular expression");
154 return (0);
156 if (*sp == '^') {
157 circf = 1;
158 sp++;
160 else
161 circf = 0;
162 for (;;) {
163 if (ep >= &expbuf[ESIZE])
164 comerr(retoolong);
165 if ((c = *sp++) == '\0') {
166 if (bracketp != bracket)
167 comerr("unmatched \\(");
168 *ep++ = CEOF;
169 *ep++ = 0;
170 return (0);
172 if (c != '*')
173 lastep = ep;
174 switch (c) {
176 case '.':
177 *ep++ = CDOT;
178 continue;
180 case '*':
181 if (lastep == 0 || *lastep == CBRA || *lastep == CKET)
182 goto defchar;
183 *lastep |= CSTAR;
184 continue;
186 case '$':
187 if (*sp != '\0')
188 goto defchar;
189 *ep++ = CDOL;
190 continue;
192 case '[':
193 *ep++ = CCL;
194 *ep++ = 0;
195 cclcnt = 1;
196 if ((c = *sp++) == '^') {
197 c = *sp++;
198 ep[-2] = NCCL;
200 do {
201 if (c == '\0')
202 comerr("missing ]");
203 if (c == '-' && ep [-1] != 0) {
204 if ((c = *sp++) == ']') {
205 *ep++ = '-';
206 cclcnt++;
207 break;
209 while (ep[-1] < c) {
210 *ep = ep[-1] + 1;
211 ep++;
212 cclcnt++;
213 if (ep >= &expbuf[ESIZE])
214 comerr(retoolong);
217 *ep++ = c;
218 cclcnt++;
219 if (ep >= &expbuf[ESIZE])
220 comerr(retoolong);
221 } while ((c = *sp++) != ']');
222 lastep[1] = cclcnt;
223 continue;
225 case '\\':
226 if ((c = *sp++) == '(') {
227 if (numbra >= NBRA)
228 comerr("too many \\(\\) pairs");
229 *bracketp++ = numbra;
230 *ep++ = CBRA;
231 *ep++ = numbra++;
232 continue;
234 if (c == ')') {
235 if (bracketp <= bracket)
236 comerr("unmatched \\)");
237 *ep++ = CKET;
238 *ep++ = *--bracketp;
239 continue;
241 if (c >= '1' && c < ('1' + NBRA)) {
242 *ep++ = CBACK;
243 *ep++ = c - '1';
244 continue;
246 *ep++ = CCHR;
247 *ep++ = c;
248 continue;
250 defchar:
251 default:
252 *ep++ = CCHR;
253 *ep++ = c;
259 * match the argument string against the compiled re
262 re_exec(char *p1)
264 struct re_globals *_re = re_globals;
265 char *p2;
266 int c;
267 int rv;
269 if (_re == 0)
270 return (0);
271 p2 = expbuf;
272 for (c = 0; c < NBRA; c++) {
273 braslist[c] = 0;
274 braelist[c] = 0;
276 if (circf)
277 return((advance(p1, p2)));
279 * fast check for first character
281 if (*p2 == CCHR) {
282 c = p2[1];
283 do {
284 if (*p1 != c)
285 continue;
286 if (rv = advance(p1, p2))
287 return(rv);
288 } while (*p1++);
289 return(0);
292 * regular algorithm
295 if (rv = advance(p1, p2))
296 return(rv);
297 while (*p1++);
298 return(0);
302 * try to match the next thing in the dfa
304 static int
305 advance(char *lp, char *ep)
307 char *curlp;
308 int ct, i;
309 int rv;
310 struct re_globals *_re = re_globals;
312 for (;;)
313 switch (*ep++) {
315 case CCHR:
316 if (*ep++ == *lp++)
317 continue;
318 return(0);
320 case CDOT:
321 if (*lp++)
322 continue;
323 return(0);
325 case CDOL:
326 if (*lp == '\0')
327 continue;
328 return(0);
330 case CEOF:
331 return(1);
333 case CCL:
334 if (cclass(ep, *lp++, 1)) {
335 ep += *ep;
336 continue;
338 return(0);
340 case NCCL:
341 if (cclass(ep, *lp++, 0)) {
342 ep += *ep;
343 continue;
345 return(0);
347 case CBRA:
348 braslist[*ep++] = lp;
349 continue;
351 case CKET:
352 braelist[*ep++] = lp;
353 continue;
355 case CBACK:
356 if (braelist[i = *ep++] == 0)
357 return(-1);
358 if (backref(i, lp)) {
359 lp += braelist[i] - braslist[i];
360 continue;
362 return(0);
364 case CBACK|CSTAR:
365 if (braelist[i = *ep++] == 0)
366 return(-1);
367 curlp = lp;
368 ct = braelist[i] - braslist[i];
369 while (backref(i, lp))
370 lp += ct;
371 while (lp >= curlp) {
372 if (rv = advance(lp, ep))
373 return(rv);
374 lp -= ct;
376 continue;
378 case CDOT|CSTAR:
379 curlp = lp;
380 while (*lp++)
382 goto star;
384 case CCHR|CSTAR:
385 curlp = lp;
386 while (*lp++ == *ep)
388 ep++;
389 goto star;
391 case CCL|CSTAR:
392 case NCCL|CSTAR:
393 curlp = lp;
394 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
396 ep += *ep;
397 goto star;
399 star:
400 do {
401 lp--;
402 if (rv = advance(lp, ep))
403 return(rv);
404 } while (lp > curlp);
405 return(0);
407 default:
408 return(-1);
412 static int
413 backref(int i, char *lp)
415 char *bp;
416 struct re_globals *_re = re_globals;
418 bp = braslist[i];
419 while (*bp++ == *lp++)
420 if (bp >= braelist[i])
421 return (1);
422 return (0);
425 static int
426 cclass(char *set, char c, int af)
428 int n;
430 if (c == 0)
431 return(0);
432 n = *set++;
433 while (--n)
434 if (*set++ == c)
435 return (af);
436 return (!af);