import less(1)
[unleashed/tickless.git] / usr / src / lib / libc / port / gen / regexpr.c
blobefa3c2e0f029c2e3da571e4879c6f9bdf148303a
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
30 #pragma ident "%Z%%M% %I% %E% SMI"
33 * routines to do regular expression matching
35 * Entry points:
37 * re_comp(s)
38 * char *s;
39 * ... returns 0 if the string s was compiled successfully,
40 * a pointer to an error message otherwise.
41 * If passed 0 or a null string returns without changing
42 * the currently compiled re (see note 11 below).
44 * re_exec(s)
45 * char *s;
46 * ... returns 1 if the string s matches the last compiled regular
47 * expression,
48 * 0 if the string s failed to match the last compiled
49 * regular expression, and
50 * -1 if the compiled regular expression was invalid
51 * (indicating an internal error).
53 * The strings passed to both re_comp and re_exec may have trailing or
54 * embedded newline characters; they are terminated by nulls.
56 * The identity of the author of these routines is lost in antiquity;
57 * this is essentially the same as the re code in the original V6 ed.
59 * The regular expressions recognized are described below. This description
60 * is essentially the same as that for ed.
62 * A regular expression specifies a set of strings of characters.
63 * A member of this set of strings is said to be matched by
64 * the regular expression. In the following specification for
65 * regular expressions the word `character' means any character but NUL.
67 * 1. Any character except a special character matches itself.
68 * Special characters are the regular expression delimiter plus
69 * \ [ . and sometimes ^ * $.
70 * 2. A . matches any character.
71 * 3. A \ followed by any character except a digit or ( )
72 * matches that character.
73 * 4. A nonempty string s bracketed [s] (or [^s]) matches any
74 * character in (or not in) s. In s, \ has no special meaning,
75 * and ] may only appear as the first letter. A substring
76 * a-b, with a and b in ascending ASCII order, stands for
77 * the inclusive range of ASCII characters.
78 * 5. A regular expression of form 1-4 followed by * matches a
79 * sequence of 0 or more matches of the regular expression.
80 * 6. A regular expression, x, of form 1-8, bracketed \(x\)
81 * matches what x matches.
82 * 7. A \ followed by a digit n matches a copy of the string that the
83 * bracketed regular expression beginning with the nth \( matched.
84 * 8. A regular expression of form 1-8, x, followed by a regular
85 * expression of form 1-7, y matches a match for x followed by
86 * a match for y, with the x match being as long as possible
87 * while still permitting a y match.
88 * 9. A regular expression of form 1-8 preceded by ^ (or followed
89 * by $), is constrained to matches that begin at the left
90 * (or end at the right) end of a line.
91 * 10. A regular expression of form 1-9 picks out the longest among
92 * the leftmost matches in a line.
93 * 11. An empty regular expression stands for a copy of the last
94 * regular expression encountered.
97 #include "lint.h"
99 #include <stdlib.h>
100 #include <re_comp.h>
101 #include <stddef.h>
102 #include <sys/types.h>
105 * constants for re's
107 #define CBRA 1
108 #define CCHR 2
109 #define CDOT 4
110 #define CCL 6
111 #define NCCL 8
112 #define CDOL 10
113 #define CEOF 11
114 #define CKET 12
115 #define CBACK 18
117 #define CSTAR 01
119 #define ESIZE 512
120 #define NBRA 9
122 static struct re_globals {
123 char _expbuf[ESIZE];
124 char *_braslist[NBRA], *_braelist[NBRA];
125 char _circf;
126 } *re_globals;
127 #define expbuf (_re->_expbuf)
128 #define braslist (_re->_braslist)
129 #define braelist (_re->_braelist)
130 #define circf (_re->_circf)
132 static int advance(const char *, char *);
133 static int backref(int, const char *);
134 static int cclass(char *, char, int);
137 * compile the regular expression argument into a dfa
139 char *
140 re_comp(const char *sp)
142 char c;
143 struct re_globals *_re = re_globals;
144 char *ep;
145 char cclcnt, numbra = 0;
146 char *lastep = NULL;
147 char bracket[NBRA];
148 char *bracketp = &bracket[0];
149 char *retoolong = "Regular expression too long";
151 if (_re == NULL) {
152 _re = (struct re_globals *)calloc(1, sizeof (*_re));
153 if (_re == NULL)
154 return ("Out of memory");
155 re_globals = _re;
157 ep = expbuf;
159 #define comerr(msg) {expbuf[0] = 0; return (msg); }
161 if (sp == NULL || *sp == '\0') {
162 if (*ep == 0)
163 return ("No previous regular expression");
164 return (NULL);
166 if (*sp == '^') {
167 circf = 1;
168 sp++;
170 else
171 circf = 0;
172 for (;;) {
173 if (ep >= &expbuf[ESIZE])
174 comerr(retoolong);
175 if ((c = *sp++) == '\0') {
176 if (bracketp != bracket)
177 comerr("unmatched \\(");
178 *ep++ = CEOF;
179 *ep++ = 0;
180 return (NULL);
182 if (c != '*')
183 lastep = ep;
184 switch (c) {
186 case '.':
187 *ep++ = CDOT;
188 continue;
190 case '*':
191 if (lastep == NULL || *lastep == CBRA ||
192 *lastep == CKET)
193 goto defchar;
194 *lastep |= CSTAR;
195 continue;
197 case '$':
198 if (*sp != '\0')
199 goto defchar;
200 *ep++ = CDOL;
201 continue;
203 case '[':
204 *ep++ = CCL;
205 *ep++ = 0;
206 cclcnt = 1;
207 if ((c = *sp++) == '^') {
208 c = *sp++;
209 ep[-2] = NCCL;
211 do {
212 if (c == '\0')
213 comerr("missing ]");
214 if (c == '-' && ep [-1] != 0) {
215 if ((c = *sp++) == ']') {
216 *ep++ = '-';
217 cclcnt++;
218 break;
220 while (ep[-1] < c) {
221 *ep = ep[-1] + 1;
222 ep++;
223 cclcnt++;
224 if (ep >= &expbuf[ESIZE])
225 comerr(retoolong);
228 *ep++ = c;
229 cclcnt++;
230 if (ep >= &expbuf[ESIZE])
231 comerr(retoolong);
232 } while ((c = *sp++) != ']');
233 lastep[1] = cclcnt;
234 continue;
236 case '\\':
237 if ((c = *sp++) == '(') {
238 if (numbra >= NBRA)
239 comerr("too many \\(\\) pairs");
240 *bracketp++ = numbra;
241 *ep++ = CBRA;
242 *ep++ = numbra++;
243 continue;
245 if (c == ')') {
246 if (bracketp <= bracket)
247 comerr("unmatched \\)");
248 *ep++ = CKET;
249 *ep++ = *--bracketp;
250 continue;
252 if (c >= '1' && c < ('1' + NBRA)) {
253 *ep++ = CBACK;
254 *ep++ = c - '1';
255 continue;
257 *ep++ = CCHR;
258 *ep++ = c;
259 continue;
261 defchar:
262 default:
263 *ep++ = CCHR;
264 *ep++ = c;
270 * match the argument string against the compiled re
273 re_exec(const char *p1)
275 struct re_globals *_re = re_globals;
276 char *p2;
277 int c;
278 int rv;
280 if (_re == NULL)
281 return (0);
282 p2 = expbuf;
283 for (c = 0; c < NBRA; c++) {
284 braslist[c] = 0;
285 braelist[c] = 0;
287 if (circf)
288 return ((advance(p1, p2)));
290 * fast check for first character
292 if (*p2 == CCHR) {
293 c = p2[1];
294 do {
295 if (*p1 != c)
296 continue;
297 if (rv = advance(p1, p2))
298 return (rv);
299 } while (*p1++);
300 return (0);
303 * regular algorithm
305 do {
306 if (rv = advance(p1, p2))
307 return (rv);
308 } while (*p1++);
309 return (0);
313 * try to match the next thing in the dfa
315 static int
316 advance(const char *lp, char *ep)
318 const char *curlp;
319 ptrdiff_t ct;
320 int i;
321 int rv;
322 struct re_globals *_re = re_globals;
324 for (;;)
325 switch (*ep++) {
327 case CCHR:
328 if (*ep++ == *lp++)
329 continue;
330 return (0);
332 case CDOT:
333 if (*lp++)
334 continue;
335 return (0);
337 case CDOL:
338 if (*lp == '\0')
339 continue;
340 return (0);
342 case CEOF:
343 return (1);
345 case CCL:
346 if (cclass(ep, *lp++, 1)) {
347 ep += *ep;
348 continue;
350 return (0);
352 case NCCL:
353 if (cclass(ep, *lp++, 0)) {
354 ep += *ep;
355 continue;
357 return (0);
359 case CBRA:
360 braslist[*ep++] = (char *)lp;
361 continue;
363 case CKET:
364 braelist[*ep++] = (char *)lp;
365 continue;
367 case CBACK:
368 if (braelist[i = *ep++] == NULL)
369 return (-1);
370 if (backref(i, lp)) {
371 lp += braelist[i] - braslist[i];
372 continue;
374 return (0);
376 case CBACK|CSTAR:
377 if (braelist[i = *ep++] == NULL)
378 return (-1);
379 curlp = lp;
380 ct = braelist[i] - braslist[i];
381 while (backref(i, lp))
382 lp += ct;
383 while (lp >= curlp) {
384 if (rv = advance(lp, ep))
385 return (rv);
386 lp -= ct;
388 continue;
390 case CDOT|CSTAR:
391 curlp = lp;
392 while (*lp++)
394 goto star;
396 case CCHR|CSTAR:
397 curlp = lp;
398 while (*lp++ == *ep)
400 ep++;
401 goto star;
403 case CCL|CSTAR:
404 case NCCL|CSTAR:
405 curlp = lp;
406 while (cclass(ep, *lp++, ep[-1] == (CCL|CSTAR)))
408 ep += *ep;
409 goto star;
411 star:
412 do {
413 lp--;
414 if (rv = advance(lp, ep))
415 return (rv);
416 } while (lp > curlp);
417 return (0);
419 default:
420 return (-1);
424 static int
425 backref(int i, const char *lp)
427 char *bp;
428 struct re_globals *_re = re_globals;
430 bp = braslist[i];
431 while (*bp++ == *lp++)
432 if (bp >= braelist[i])
433 return (1);
434 return (0);
437 static int
438 cclass(char *set, char c, int af)
440 int n;
442 if (c == 0)
443 return (0);
444 n = *set++;
445 while (--n)
446 if (*set++ == c)
447 return (af);
448 return (! af);