Remove building with NOCRYPTO option
[minix3.git] / external / bsd / nvi / dist / regex / regcomp.c
blobb4504b961090bacc09b21aabb5da4b50e6c55e41
1 /* $NetBSD: regcomp.c,v 1.5 2014/01/26 21:47:00 christos Exp $ */
2 /*-
3 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer of the University of Toronto.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
38 * @(#)regcomp.c 8.4 (Berkeley) 3/19/94
41 #include <sys/cdefs.h>
42 #if 0
43 #if defined(LIBC_SCCS) && !defined(lint)
44 static char sccsid[] = "@(#)regcomp.c 8.4 (Berkeley) 3/19/94";
45 #endif /* LIBC_SCCS and not lint */
46 #else
47 __RCSID("$NetBSD: regcomp.c,v 1.5 2014/01/26 21:47:00 christos Exp $");
48 #endif
50 #include <sys/types.h>
51 #include <stdio.h>
52 #include <string.h>
53 #include <ctype.h>
54 #include <limits.h>
55 #include <stdlib.h>
56 #include <regex.h>
58 #include "utils.h"
59 #include "regex2.h"
61 #include "cclass.h"
62 #include "cname.h"
65 * parse structure, passed up and down to avoid global variables and
66 * other clumsinesses
68 struct parse {
69 RCHAR_T *next; /* next character in RE */
70 RCHAR_T *end; /* end of string (-> NUL normally) */
71 int error; /* has an error been seen? */
72 sop *strip; /* malloced strip */
73 RCHAR_T *stripdata; /* malloced stripdata */
74 sopno ssize; /* malloced strip size (allocated) */
75 sopno slen; /* malloced strip length (used) */
76 int ncsalloc; /* number of csets allocated */
77 struct re_guts *g;
78 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
79 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
80 sopno pend[NPAREN]; /* -> ) ([0] unused) */
83 /* ========= begin header generated by ./mkh ========= */
84 #ifdef __cplusplus
85 extern "C" {
86 #endif
88 /* === regcomp.c === */
89 static void p_ere __P((struct parse *p, int stop, size_t reclimit));
90 static void p_ere_exp __P((struct parse *p, size_t reclimit));
91 static void p_str __P((struct parse *p));
92 static void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit));
93 static int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit));
94 static int p_count __P((struct parse *p));
95 static void p_bracket __P((struct parse *p));
96 static void p_b_term __P((struct parse *p, cset *cs));
97 static void p_b_cclass __P((struct parse *p, cset *cs));
98 static void p_b_eclass __P((struct parse *p, cset *cs));
99 static char p_b_symbol __P((struct parse *p));
100 static char p_b_coll_elem __P((struct parse *p, int endc));
101 static char othercase __P((int ch));
102 static void bothcases __P((struct parse *p, int ch));
103 static void ordinary __P((struct parse *p, int ch));
104 static void nonnewline __P((struct parse *p));
105 static void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit));
106 static int seterr __P((struct parse *p, int e));
107 static cset *allocset __P((struct parse *p));
108 static void freeset __P((struct parse *p, cset *cs));
109 static int freezeset __P((struct parse *p, cset *cs));
110 static int firstch __P((struct parse *p, cset *cs));
111 static int nch __P((struct parse *p, cset *cs));
112 static void mcadd __P((struct parse *p, cset *cs, const char *cp));
113 #ifdef notdef
114 static void mcsub __P((cset *cs, char *cp));
115 static int mcin __P((cset *cs, char *cp));
116 static char *mcfind __P((cset *cs, char *cp));
117 #endif
118 static void mcinvert __P((struct parse *p, cset *cs));
119 static void mccase __P((struct parse *p, cset *cs));
120 #ifdef notdef
121 static int isinsets __P((struct re_guts *g, int c));
122 static int samesets __P((struct re_guts *g, int c1, int c2));
123 #endif
124 static void categorize __P((struct parse *p, struct re_guts *g));
125 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
126 static void doemit __P((struct parse *p, sop op, size_t opnd));
127 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
128 static void dofwd __P((struct parse *p, sopno pos, sop value));
129 static int enlarge __P((struct parse *p, sopno size));
130 static void stripsnug __P((struct parse *p, struct re_guts *g));
131 static void findmust __P((struct parse *p, struct re_guts *g));
132 static sopno pluscount __P((struct parse *p, struct re_guts *g));
134 #ifdef __cplusplus
136 #endif
137 /* ========= end header generated by ./mkh ========= */
139 static RCHAR_T nuls[10]; /* place to point scanner in event of error */
142 * macros for use with parse structure
143 * BEWARE: these know that the parse structure is named `p' !!!
145 #define PEEK() (*p->next)
146 #define PEEK2() (*(p->next+1))
147 #define MORE() (p->next < p->end)
148 #define MORE2() (p->next+1 < p->end)
149 #define SEE(c) (MORE() && PEEK() == (c))
150 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
151 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
152 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
153 #define NEXT() (p->next++)
154 #define NEXT2() (p->next += 2)
155 #define NEXTn(n) (p->next += (n))
156 #define GETNEXT() (*p->next++)
157 #define SETERROR(e) seterr(p, (e))
158 #define REQUIRE(co, e) ((co) || SETERROR(e))
159 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
160 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
161 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
162 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
163 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
164 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
165 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
166 #define HERE() (p->slen)
167 #define THERE() (p->slen - 1)
168 #define THERETHERE() (p->slen - 2)
169 #define DROP(n) (p->slen -= (n))
171 #ifndef NDEBUG
172 static int never = 0; /* for use in asserts; shuts lint up */
173 #else
174 #define never 0 /* some <assert.h>s have bugs too */
175 #endif
177 #define MEMLIMIT 0x8000000
178 #define MEMSIZE(p) \
179 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \
180 (p)->ncsalloc * sizeof(cset) + \
181 (p)->ssize * sizeof(sop))
182 #define RECLIMIT 256
185 - regcomp - interface for parser and compilation
186 = extern int regcomp(regex_t *, const RCHAR_T *, int);
187 = #define REG_BASIC 0000
188 = #define REG_EXTENDED 0001
189 = #define REG_ICASE 0002
190 = #define REG_NOSUB 0004
191 = #define REG_NEWLINE 0010
192 = #define REG_NOSPEC 0020
193 = #define REG_PEND 0040
194 = #define REG_DUMP 0200
196 int /* 0 success, otherwise REG_something */
197 regcomp(regex_t *preg, const RCHAR_T *pattern, int cflags)
199 struct parse pa;
200 struct re_guts *g;
201 struct parse *p = &pa;
202 int i;
203 size_t len;
204 #ifdef REDEBUG
205 # define GOODFLAGS(f) (f)
206 #else
207 # define GOODFLAGS(f) ((f)&~REG_DUMP)
208 #endif
210 cflags = GOODFLAGS(cflags);
211 if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
212 return(REG_INVARG);
214 if (cflags&REG_PEND) {
215 if (preg->re_endp < pattern)
216 return(REG_INVARG);
217 len = preg->re_endp - pattern;
218 } else
219 len = STRLEN(pattern);
221 /* do the mallocs early so failure handling is easy */
222 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
223 (NC-1)*sizeof(cat_t));
224 if (g == NULL)
225 return(REG_ESPACE);
226 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
227 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
228 if (p->strip == NULL) {
229 free((char *)g);
230 return(REG_ESPACE);
232 p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T));
233 if (p->stripdata == NULL) {
234 free((char *)p->strip);
235 free((char *)g);
236 return(REG_ESPACE);
238 p->slen = 0;
240 /* set things up */
241 p->g = g;
242 p->next = (RCHAR_T *)__UNCONST(pattern); /* convenience; we do not modify it */
243 p->end = p->next + len;
244 p->error = 0;
245 p->ncsalloc = 0;
246 for (i = 0; i < NPAREN; i++) {
247 p->pbegin[i] = 0;
248 p->pend[i] = 0;
250 g->csetsize = NC;
251 g->sets = NULL;
252 g->setbits = NULL;
253 g->ncsets = 0;
254 g->cflags = cflags;
255 g->iflags = 0;
256 g->nbol = 0;
257 g->neol = 0;
258 g->must = NULL;
259 g->mlen = 0;
260 g->nsub = 0;
261 #if 0
262 g->ncategories = 1; /* category 0 is "everything else" */
263 g->categories = &g->catspace[-(CHAR_MIN)];
264 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
265 #endif
266 g->backrefs = 0;
268 /* do it */
269 EMIT(OEND, 0);
270 g->firststate = THERE();
271 if (cflags&REG_EXTENDED)
272 p_ere(p, OUT, 0);
273 else if (cflags&REG_NOSPEC)
274 p_str(p);
275 else
276 p_bre(p, OUT, OUT, 0);
277 EMIT(OEND, 0);
278 g->laststate = THERE();
280 /* tidy up loose ends and fill things in */
281 categorize(p, g);
282 stripsnug(p, g);
283 findmust(p, g);
284 g->nplus = pluscount(p, g);
285 g->magic = MAGIC2;
286 preg->re_nsub = g->nsub;
287 preg->re_g = g;
288 preg->re_magic = MAGIC1;
289 #ifndef REDEBUG
290 /* not debugging, so can't rely on the assert() in regexec() */
291 if (g->iflags&BAD)
292 SETERROR(REG_ASSERT);
293 #endif
295 /* win or lose, we're done */
296 if (p->error != 0) /* lose */
297 regfree(preg);
298 return(p->error);
302 - p_ere - ERE parser top level, concatenation and alternation
303 == static void p_ere(struct parse *p, int stop, size_t reclimit);
305 static void
306 p_ere(struct parse *p, int stop, size_t reclimit)
308 /* character this ERE should end at */
310 char c;
311 sopno prevback = 0;
312 sopno prevfwd = 0;
313 sopno conc;
314 int first = 1; /* is this the first alternative? */
316 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
317 p->error = REG_ESPACE;
318 return;
321 for (;;) {
322 /* do a bunch of concatenated expressions */
323 conc = HERE();
324 while (MORE() && (c = PEEK()) != '|' && c != stop)
325 p_ere_exp(p, reclimit);
326 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
328 if (!EAT('|'))
329 break; /* NOTE BREAK OUT */
331 if (first) {
332 INSERT(OCH_, conc); /* offset is wrong */
333 prevfwd = conc;
334 prevback = conc;
335 first = 0;
337 ASTERN(OOR1, prevback);
338 prevback = THERE();
339 AHEAD(prevfwd); /* fix previous offset */
340 prevfwd = HERE();
341 EMIT(OOR2, 0); /* offset is very wrong */
344 if (!first) { /* tail-end fixups */
345 AHEAD(prevfwd);
346 ASTERN(O_CH, prevback);
349 assert(!MORE() || SEE(stop));
353 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
354 == static void p_ere_exp(struct parse *p);
356 static void
357 p_ere_exp(struct parse *p, size_t reclimit)
359 char c;
360 sopno pos;
361 int count;
362 int count2;
363 sopno subno;
364 int wascaret = 0;
366 assert(MORE()); /* caller should have ensured this */
367 c = GETNEXT();
369 pos = HERE();
370 switch (c) {
371 case '(':
372 (void)REQUIRE(MORE(), REG_EPAREN);
373 p->g->nsub++;
374 subno = p->g->nsub;
375 if (subno < NPAREN)
376 p->pbegin[subno] = HERE();
377 EMIT(OLPAREN, subno);
378 if (!SEE(')'))
379 p_ere(p, ')', reclimit);
380 if (subno < NPAREN) {
381 p->pend[subno] = HERE();
382 assert(p->pend[subno] != 0);
384 EMIT(ORPAREN, subno);
385 (void)MUSTEAT(')', REG_EPAREN);
386 break;
387 #ifndef POSIX_MISTAKE
388 case ')': /* happens only if no current unmatched ( */
390 * You may ask, why the ifndef? Because I didn't notice
391 * this until slightly too late for 1003.2, and none of the
392 * other 1003.2 regular-expression reviewers noticed it at
393 * all. So an unmatched ) is legal POSIX, at least until
394 * we can get it fixed.
396 SETERROR(REG_EPAREN);
397 break;
398 #endif
399 case '^':
400 EMIT(OBOL, 0);
401 p->g->iflags |= USEBOL;
402 p->g->nbol++;
403 wascaret = 1;
404 break;
405 case '$':
406 EMIT(OEOL, 0);
407 p->g->iflags |= USEEOL;
408 p->g->neol++;
409 break;
410 case '|':
411 SETERROR(REG_EMPTY);
412 break;
413 case '*':
414 case '+':
415 case '?':
416 SETERROR(REG_BADRPT);
417 break;
418 case '.':
419 if (p->g->cflags&REG_NEWLINE)
420 nonnewline(p);
421 else
422 EMIT(OANY, 0);
423 break;
424 case '[':
425 p_bracket(p);
426 break;
427 case '\\':
428 (void)REQUIRE(MORE(), REG_EESCAPE);
429 c = GETNEXT();
430 ordinary(p, c);
431 break;
432 case '{': /* okay as ordinary except if digit follows */
433 (void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT);
434 /* FALLTHROUGH */
435 default:
436 ordinary(p, c);
437 break;
440 if (!MORE())
441 return;
442 c = PEEK();
443 /* we call { a repetition if followed by a digit */
444 if (!( c == '*' || c == '+' || c == '?' ||
445 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ))
446 return; /* no repetition, we're done */
447 NEXT();
449 (void)REQUIRE(!wascaret, REG_BADRPT);
450 switch (c) {
451 case '*': /* implemented as +? */
452 /* this case does not require the (y|) trick, noKLUDGE */
453 INSERT(OPLUS_, pos);
454 ASTERN(O_PLUS, pos);
455 INSERT(OQUEST_, pos);
456 ASTERN(O_QUEST, pos);
457 break;
458 case '+':
459 INSERT(OPLUS_, pos);
460 ASTERN(O_PLUS, pos);
461 break;
462 case '?':
463 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
464 INSERT(OCH_, pos); /* offset slightly wrong */
465 ASTERN(OOR1, pos); /* this one's right */
466 AHEAD(pos); /* fix the OCH_ */
467 EMIT(OOR2, 0); /* offset very wrong... */
468 AHEAD(THERE()); /* ...so fix it */
469 ASTERN(O_CH, THERETHERE());
470 break;
471 case '{':
472 count = p_count(p);
473 if (EAT(',')) {
474 if (ISDIGIT((UCHAR_T)PEEK())) {
475 count2 = p_count(p);
476 (void)REQUIRE(count <= count2, REG_BADBR);
477 } else /* single number with comma */
478 count2 = INFINITY;
479 } else /* just a single number */
480 count2 = count;
481 repeat(p, pos, count, count2, 0);
482 if (!EAT('}')) { /* error heuristics */
483 while (MORE() && PEEK() != '}')
484 NEXT();
485 (void)REQUIRE(MORE(), REG_EBRACE);
486 SETERROR(REG_BADBR);
488 break;
491 if (!MORE())
492 return;
493 c = PEEK();
494 if (!( c == '*' || c == '+' || c == '?' ||
495 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) )
496 return;
497 SETERROR(REG_BADRPT);
501 - p_str - string (no metacharacters) "parser"
502 == static void p_str(struct parse *p);
504 static void
505 p_str(struct parse *p)
507 (void)REQUIRE(MORE(), REG_EMPTY);
508 while (MORE())
509 ordinary(p, GETNEXT());
513 - p_bre - BRE parser top level, anchoring and concatenation
514 == static void p_bre(struct parse *p, int end1, \
515 == int end2, size_t reclimit);
516 * Giving end1 as OUT essentially eliminates the end1/end2 check.
518 * This implementation is a bit of a kludge, in that a trailing $ is first
519 * taken as an ordinary character and then revised to be an anchor. The
520 * only undesirable side effect is that '$' gets included as a character
521 * category in such cases. This is fairly harmless; not worth fixing.
522 * The amount of lookahead needed to avoid this kludge is excessive.
524 static void
525 p_bre(struct parse *p, int end1, int end2, size_t reclimit)
527 /* first terminating character */
528 /* second terminating character */
530 sopno start;
531 int first = 1; /* first subexpression? */
532 int wasdollar = 0;
534 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
535 p->error = REG_ESPACE;
536 return;
539 start = HERE();
541 if (EAT('^')) {
542 EMIT(OBOL, 0);
543 p->g->iflags |= USEBOL;
544 p->g->nbol++;
546 while (MORE() && !SEETWO(end1, end2)) {
547 wasdollar = p_simp_re(p, first, reclimit);
548 first = 0;
550 if (wasdollar) { /* oops, that was a trailing anchor */
551 DROP(1);
552 EMIT(OEOL, 0);
553 p->g->iflags |= USEEOL;
554 p->g->neol++;
557 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
561 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
562 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit);
564 static int /* was the simple RE an unbackslashed $? */
565 p_simp_re(struct parse *p, int starordinary, size_t reclimit)
567 /* is a leading * an ordinary character? */
569 int c;
570 int count;
571 int count2;
572 sopno pos;
573 int i;
574 sopno subno;
575 int backsl;
577 pos = HERE(); /* repetion op, if any, covers from here */
579 assert(MORE()); /* caller should have ensured this */
580 c = GETNEXT();
581 backsl = c == '\\';
582 if (backsl) {
583 (void)REQUIRE(MORE(), REG_EESCAPE);
584 c = (unsigned char)GETNEXT();
585 switch (c) {
586 case '{':
587 SETERROR(REG_BADRPT);
588 break;
589 case '(':
590 p->g->nsub++;
591 subno = p->g->nsub;
592 if (subno < NPAREN)
593 p->pbegin[subno] = HERE();
594 EMIT(OLPAREN, subno);
595 /* the MORE here is an error heuristic */
596 if (MORE() && !SEETWO('\\', ')'))
597 p_bre(p, '\\', ')', reclimit);
598 if (subno < NPAREN) {
599 p->pend[subno] = HERE();
600 assert(p->pend[subno] != 0);
602 EMIT(ORPAREN, subno);
603 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
604 break;
605 case ')': /* should not get here -- must be user */
606 case '}':
607 SETERROR(REG_EPAREN);
608 break;
609 case '1':
610 case '2':
611 case '3':
612 case '4':
613 case '5':
614 case '6':
615 case '7':
616 case '8':
617 case '9':
618 i = c - '0';
619 assert(i < NPAREN);
620 if (p->pend[i] != 0) {
621 assert(i <= p->g->nsub);
622 EMIT(OBACK_, i);
623 assert(p->pbegin[i] != 0);
624 assert(p->strip[p->pbegin[i]] == OLPAREN);
625 assert(p->strip[p->pend[i]] == ORPAREN);
626 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
627 EMIT(O_BACK, i);
628 } else
629 SETERROR(REG_ESUBREG);
630 p->g->backrefs = 1;
631 break;
632 default:
633 ordinary(p, c);
634 break;
636 } else {
637 switch (c) {
638 case '.':
639 if (p->g->cflags&REG_NEWLINE)
640 nonnewline(p);
641 else
642 EMIT(OANY, 0);
643 break;
644 case '[':
645 p_bracket(p);
646 break;
647 case '*':
648 (void)REQUIRE(starordinary, REG_BADRPT);
649 /* FALLTHROUGH */
650 default:
651 ordinary(p, c);
652 break;
656 if (EAT('*')) { /* implemented as +? */
657 /* this case does not require the (y|) trick, noKLUDGE */
658 INSERT(OPLUS_, pos);
659 ASTERN(O_PLUS, pos);
660 INSERT(OQUEST_, pos);
661 ASTERN(O_QUEST, pos);
662 } else if (EATTWO('\\', '{')) {
663 count = p_count(p);
664 if (EAT(',')) {
665 if (MORE() && ISDIGIT((UCHAR_T)PEEK())) {
666 count2 = p_count(p);
667 (void)REQUIRE(count <= count2, REG_BADBR);
668 } else /* single number with comma */
669 count2 = INFINITY;
670 } else /* just a single number */
671 count2 = count;
672 repeat(p, pos, count, count2, reclimit);
673 if (!EATTWO('\\', '}')) { /* error heuristics */
674 while (MORE() && !SEETWO('\\', '}'))
675 NEXT();
676 (void)REQUIRE(MORE(), REG_EBRACE);
677 SETERROR(REG_BADBR);
679 } else if (!backsl && c == (unsigned char)'$') /* $ (but not \$) ends it */
680 return(1);
682 return(0);
686 - p_count - parse a repetition count
687 == static int p_count(struct parse *p);
689 static int /* the value */
690 p_count(struct parse *p)
692 int count = 0;
693 int ndigits = 0;
695 while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) {
696 count = count*10 + (GETNEXT() - '0');
697 ndigits++;
700 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
701 return(count);
705 - p_bracket - parse a bracketed character list
706 == static void p_bracket(struct parse *p);
708 * Note a significant property of this code: if the allocset() did SETERROR,
709 * no set operations are done.
711 static void
712 p_bracket(struct parse *p)
714 cset *cs;
715 int invert = 0;
716 static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' };
717 static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' };
719 cs = allocset(p);
720 if (cs == NULL)
721 return;
723 /* Dept of Truly Sickening Special-Case Kludges */
724 if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) {
725 EMIT(OBOW, 0);
726 NEXTn(6);
727 return;
729 if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) {
730 EMIT(OEOW, 0);
731 NEXTn(6);
732 return;
735 if (EAT('^'))
736 invert++; /* make note to invert set at end */
737 if (EAT(']'))
738 CHadd(cs, ']');
739 else if (EAT('-'))
740 CHadd(cs, '-');
741 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
742 p_b_term(p, cs);
743 if (EAT('-'))
744 CHadd(cs, '-');
745 (void)MUSTEAT(']', REG_EBRACK);
747 if (p->error != 0) /* don't mess things up further */
748 return;
750 if (p->g->cflags&REG_ICASE) {
751 int i;
752 int ci;
754 for (i = p->g->csetsize - 1; i >= 0; i--)
755 if (CHIN(cs, i) && isalpha(i)) {
756 ci = othercase(i);
757 if (ci != i)
758 CHadd(cs, ci);
760 if (cs->multis != NULL)
761 mccase(p, cs);
763 if (invert) {
764 int i;
766 for (i = p->g->csetsize - 1; i >= 0; i--)
767 if (CHIN(cs, i))
768 CHsub(cs, i);
769 else
770 CHadd(cs, i);
771 if (p->g->cflags&REG_NEWLINE)
772 CHsub(cs, '\n');
773 if (cs->multis != NULL)
774 mcinvert(p, cs);
777 assert(cs->multis == NULL); /* xxx */
779 if (nch(p, cs) == 1) { /* optimize singleton sets */
780 ordinary(p, firstch(p, cs));
781 freeset(p, cs);
782 } else
783 EMIT(OANYOF, freezeset(p, cs));
787 - p_b_term - parse one term of a bracketed character list
788 == static void p_b_term(struct parse *p, cset *cs);
790 static void
791 p_b_term(struct parse *p, cset *cs)
793 char c;
794 char start, finish;
795 int i;
797 /* classify what we've got */
798 switch ((MORE()) ? PEEK() : '\0') {
799 case '[':
800 c = (MORE2()) ? PEEK2() : '\0';
801 break;
802 case '-':
803 SETERROR(REG_ERANGE);
804 return; /* NOTE RETURN */
805 break;
806 default:
807 c = '\0';
808 break;
811 switch (c) {
812 case ':': /* character class */
813 NEXT2();
814 (void)REQUIRE(MORE(), REG_EBRACK);
815 c = PEEK();
816 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
817 p_b_cclass(p, cs);
818 (void)REQUIRE(MORE(), REG_EBRACK);
819 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
820 break;
821 case '=': /* equivalence class */
822 NEXT2();
823 (void)REQUIRE(MORE(), REG_EBRACK);
824 c = PEEK();
825 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
826 p_b_eclass(p, cs);
827 (void)REQUIRE(MORE(), REG_EBRACK);
828 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
829 break;
830 default: /* symbol, ordinary character, or range */
831 /* xxx revision needed for multichar stuff */
832 start = p_b_symbol(p);
833 if (SEE('-') && MORE2() && PEEK2() != ']') {
834 /* range */
835 NEXT();
836 if (EAT('-'))
837 finish = '-';
838 else
839 finish = p_b_symbol(p);
840 } else
841 finish = start;
842 /* xxx what about signed chars here... */
843 (void)REQUIRE(start <= finish, REG_ERANGE);
844 for (i = start; i <= finish; i++)
845 CHadd(cs, i);
846 break;
851 - p_b_cclass - parse a character-class name and deal with it
852 == static void p_b_cclass(struct parse *p, cset *cs);
854 static void
855 p_b_cclass(struct parse *p, cset *cs)
857 RCHAR_T *sp = p->next;
858 struct cclass *cp;
859 size_t len;
860 const char *u;
861 char c;
863 while (MORE() && isalpha(PEEK()))
864 NEXT();
865 len = p->next - sp;
866 for (cp = cclasses; cp->name != NULL; cp++)
867 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len))
868 break;
869 if (cp->name == NULL) {
870 /* oops, didn't find it */
871 SETERROR(REG_ECTYPE);
872 return;
875 u = cp->chars;
876 while ((c = *u++) != '\0')
877 CHadd(cs, c);
878 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
879 MCadd(p, cs, u);
883 - p_b_eclass - parse an equivalence-class name and deal with it
884 == static void p_b_eclass(struct parse *p, cset *cs);
886 * This implementation is incomplete. xxx
888 static void
889 p_b_eclass(struct parse *p, cset *cs)
891 char c;
893 c = p_b_coll_elem(p, '=');
894 CHadd(cs, c);
898 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
899 == static char p_b_symbol(struct parse *p);
901 static char /* value of symbol */
902 p_b_symbol(struct parse *p)
904 char value;
906 (void)REQUIRE(MORE(), REG_EBRACK);
907 if (!EATTWO('[', '.'))
908 return(GETNEXT());
910 /* collating symbol */
911 value = p_b_coll_elem(p, '.');
912 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
913 return(value);
917 - p_b_coll_elem - parse a collating-element name and look it up
918 == static char p_b_coll_elem(struct parse *p, int endc);
920 static char /* value of collating element */
921 p_b_coll_elem(struct parse *p, int endc)
923 /* name ended by endc,']' */
925 RCHAR_T *sp = p->next;
926 struct cname *cp;
927 size_t len;
929 while (MORE() && !SEETWO(endc, ']'))
930 NEXT();
931 if (!MORE()) {
932 SETERROR(REG_EBRACK);
933 return(0);
935 len = p->next - sp;
936 for (cp = cnames; cp->name != NULL; cp++)
937 if (STRLEN(cp->name) == len && MEMCMP(cp->name, sp, len))
938 return(cp->code); /* known name */
939 if (len == 1)
940 return(*sp); /* single character */
941 SETERROR(REG_ECOLLATE); /* neither */
942 return(0);
946 - othercase - return the case counterpart of an alphabetic
947 == static char othercase(int ch);
949 static char /* if no counterpart, return ch */
950 othercase(int ch)
952 assert(isalpha(ch));
953 if (isupper(ch))
954 return(tolower(ch));
955 else if (islower(ch))
956 return(toupper(ch));
957 else /* peculiar, but could happen */
958 return(ch);
962 - bothcases - emit a dualcase version of a two-case character
963 == static void bothcases(struct parse *p, int ch);
965 * Boy, is this implementation ever a kludge...
967 static void
968 bothcases(struct parse *p, int ch)
970 RCHAR_T *oldnext = p->next;
971 RCHAR_T *oldend = p->end;
972 RCHAR_T bracket[3];
974 assert(othercase(ch) != ch); /* p_bracket() would recurse */
975 p->next = bracket;
976 p->end = bracket+2;
977 bracket[0] = ch;
978 bracket[1] = ']';
979 bracket[2] = '\0';
980 p_bracket(p);
981 assert(p->next == bracket+2);
982 p->next = oldnext;
983 p->end = oldend;
987 - ordinary - emit an ordinary character
988 == static void ordinary(struct parse *p, int ch);
990 static void
991 ordinary(struct parse *p, int ch)
994 cat_t *cap = p->g->categories;
997 if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
998 bothcases(p, ch);
999 else {
1000 EMIT(OCHAR, (UCHAR_T)ch);
1002 if (cap[ch] == 0)
1003 cap[ch] = p->g->ncategories++;
1009 - nonnewline - emit REG_NEWLINE version of OANY
1010 == static void nonnewline(struct parse *p);
1012 * Boy, is this implementation ever a kludge...
1014 static void
1015 nonnewline(struct parse *p)
1017 RCHAR_T *oldnext = p->next;
1018 RCHAR_T *oldend = p->end;
1019 RCHAR_T bracket[4];
1021 p->next = bracket;
1022 p->end = bracket+3;
1023 bracket[0] = '^';
1024 bracket[1] = '\n';
1025 bracket[2] = ']';
1026 bracket[3] = '\0';
1027 p_bracket(p);
1028 assert(p->next == bracket+3);
1029 p->next = oldnext;
1030 p->end = oldend;
1034 - repeat - generate code for a bounded repetition, recursively if needed
1035 == static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit);
1037 static void
1038 repeat(struct parse *p, sopno start, int from, int to, size_t reclimit)
1040 /* operand from here to end of strip */
1041 /* repeated from this number */
1042 /* to this number of times (maybe INFINITY) */
1044 sopno finish;
1045 # define N 2
1046 # define INF 3
1047 # define REP(f, t) ((f)*8 + (t))
1048 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1049 sopno copy;
1051 if (reclimit++ > RECLIMIT)
1052 p->error = REG_ESPACE;
1053 if (p->error)
1054 return;
1056 finish = HERE();
1058 assert(from <= to);
1060 switch (REP(MAP(from), MAP(to))) {
1061 case REP(0, 0): /* must be user doing this */
1062 DROP(finish-start); /* drop the operand */
1063 break;
1064 case REP(0, 1): /* as x{1,1}? */
1065 case REP(0, N): /* as x{1,n}? */
1066 case REP(0, INF): /* as x{1,}? */
1067 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1068 INSERT(OCH_, start); /* offset is wrong... */
1069 repeat(p, start+1, 1, to, reclimit);
1070 ASTERN(OOR1, start);
1071 AHEAD(start); /* ... fix it */
1072 EMIT(OOR2, 0);
1073 AHEAD(THERE());
1074 ASTERN(O_CH, THERETHERE());
1075 break;
1076 case REP(1, 1): /* trivial case */
1077 /* done */
1078 break;
1079 case REP(1, N): /* as x?x{1,n-1} */
1080 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1081 INSERT(OCH_, start);
1082 ASTERN(OOR1, start);
1083 AHEAD(start);
1084 EMIT(OOR2, 0); /* offset very wrong... */
1085 AHEAD(THERE()); /* ...so fix it */
1086 ASTERN(O_CH, THERETHERE());
1087 copy = dupl(p, start+1, finish+1);
1088 assert(copy == finish+4);
1089 repeat(p, copy, 1, to-1, reclimit);
1090 break;
1091 case REP(1, INF): /* as x+ */
1092 INSERT(OPLUS_, start);
1093 ASTERN(O_PLUS, start);
1094 break;
1095 case REP(N, N): /* as xx{m-1,n-1} */
1096 copy = dupl(p, start, finish);
1097 repeat(p, copy, from-1, to-1, reclimit);
1098 break;
1099 case REP(N, INF): /* as xx{n-1,INF} */
1100 copy = dupl(p, start, finish);
1101 repeat(p, copy, from-1, to, reclimit);
1102 break;
1103 default: /* "can't happen" */
1104 SETERROR(REG_ASSERT); /* just in case */
1105 break;
1110 - seterr - set an error condition
1111 == static int seterr(struct parse *p, int e);
1113 static int /* useless but makes type checking happy */
1114 seterr(struct parse *p, int e)
1116 if (p->error == 0) /* keep earliest error condition */
1117 p->error = e;
1118 p->next = nuls; /* try to bring things to a halt */
1119 p->end = nuls;
1120 return(0); /* make the return value well-defined */
1124 - allocset - allocate a set of characters for []
1125 == static cset *allocset(struct parse *p);
1127 static cset *
1128 allocset(struct parse *p)
1130 int no = p->g->ncsets++;
1131 size_t nc;
1132 size_t nbytes;
1133 cset *cs;
1134 size_t css = (size_t)p->g->csetsize;
1135 int i;
1137 if (no >= p->ncsalloc) { /* need another column of space */
1138 p->ncsalloc += CHAR_BIT;
1139 nc = p->ncsalloc;
1140 assert(nc % CHAR_BIT == 0);
1141 nbytes = nc / CHAR_BIT * css;
1142 if (MEMSIZE(p) > MEMLIMIT)
1143 goto oomem;
1144 if (p->g->sets == NULL)
1145 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1146 else
1147 p->g->sets = (cset *)realloc((char *)p->g->sets,
1148 nc * sizeof(cset));
1149 if (p->g->setbits == NULL)
1150 p->g->setbits = (uch *)malloc(nbytes);
1151 else {
1152 p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1153 nbytes);
1154 /* xxx this isn't right if setbits is now NULL */
1155 for (i = 0; i < no; i++)
1156 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1158 if (p->g->sets != NULL && p->g->setbits != NULL)
1159 (void) memset((char *)p->g->setbits + (nbytes - css),
1160 0, css);
1161 else {
1162 oomem:
1163 no = 0;
1164 SETERROR(REG_ESPACE);
1165 /* caller's responsibility not to do set ops */
1166 return NULL;
1170 cs = &p->g->sets[no];
1171 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1172 cs->mask = 1 << ((no) % CHAR_BIT);
1173 cs->hash = 0;
1174 cs->smultis = 0;
1175 cs->multis = NULL;
1177 return(cs);
1181 - freeset - free a now-unused set
1182 == static void freeset(struct parse *p, cset *cs);
1184 static void
1185 freeset(struct parse *p, cset *cs)
1187 size_t i;
1188 cset *top = &p->g->sets[p->g->ncsets];
1189 size_t css = (size_t)p->g->csetsize;
1191 for (i = 0; i < css; i++)
1192 CHsub(cs, i);
1193 if (cs == top-1) /* recover only the easy case */
1194 p->g->ncsets--;
1198 - freezeset - final processing on a set of characters
1199 == static int freezeset(struct parse *p, cset *cs);
1201 * The main task here is merging identical sets. This is usually a waste
1202 * of time (although the hash code minimizes the overhead), but can win
1203 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1204 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1205 * the same value!
1207 static int /* set number */
1208 freezeset(struct parse *p, cset *cs)
1210 uch h = cs->hash;
1211 size_t i;
1212 cset *top = &p->g->sets[p->g->ncsets];
1213 cset *cs2;
1214 size_t css = (size_t)p->g->csetsize;
1216 /* look for an earlier one which is the same */
1217 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1218 if (cs2->hash == h && cs2 != cs) {
1219 /* maybe */
1220 for (i = 0; i < css; i++)
1221 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1222 break; /* no */
1223 if (i == css)
1224 break; /* yes */
1227 if (cs2 < top) { /* found one */
1228 freeset(p, cs);
1229 cs = cs2;
1232 return((int)(cs - p->g->sets));
1236 - firstch - return first character in a set (which must have at least one)
1237 == static int firstch(struct parse *p, cset *cs);
1239 static int /* character; there is no "none" value */
1240 firstch(struct parse *p, cset *cs)
1242 size_t i;
1243 size_t css = (size_t)p->g->csetsize;
1245 for (i = 0; i < css; i++)
1246 if (CHIN(cs, i))
1247 return((char)i);
1248 assert(never);
1249 return(0); /* arbitrary */
1253 - nch - number of characters in a set
1254 == static int nch(struct parse *p, cset *cs);
1256 static int
1257 nch(struct parse *p, cset *cs)
1259 size_t i;
1260 size_t css = (size_t)p->g->csetsize;
1261 int n = 0;
1263 for (i = 0; i < css; i++)
1264 if (CHIN(cs, i))
1265 n++;
1266 return(n);
1270 - mcadd - add a collating element to a cset
1271 == static void mcadd(struct parse *p, cset *cs, \
1272 == char *cp);
1274 static void
1275 mcadd(struct parse *p, cset *cs, const char *cp)
1277 size_t oldend = cs->smultis;
1279 cs->smultis += strlen(cp) + 1;
1280 if (cs->multis == NULL)
1281 cs->multis = malloc(cs->smultis);
1282 else
1283 cs->multis = realloc(cs->multis, cs->smultis);
1284 if (cs->multis == NULL) {
1285 SETERROR(REG_ESPACE);
1286 return;
1289 (void) strcpy(cs->multis + oldend - 1, cp);
1290 cs->multis[cs->smultis - 1] = '\0';
1293 #ifdef notdef
1295 - mcsub - subtract a collating element from a cset
1296 == static void mcsub(cset *cs, char *cp);
1298 static void
1299 mcsub(cset *cs, char *cp)
1301 char *fp = mcfind(cs, cp);
1302 size_t len = strlen(fp);
1304 assert(fp != NULL);
1305 (void) memmove(fp, fp + len + 1,
1306 cs->smultis - (fp + len + 1 - cs->multis));
1307 cs->smultis -= len;
1309 if (cs->smultis == 0) {
1310 free(cs->multis);
1311 cs->multis = NULL;
1312 return;
1315 cs->multis = realloc(cs->multis, cs->smultis);
1316 assert(cs->multis != NULL);
1320 - mcin - is a collating element in a cset?
1321 == static int mcin(cset *cs, char *cp);
1323 static int
1324 mcin(cset *cs, char *cp)
1326 return(mcfind(cs, cp) != NULL);
1330 - mcfind - find a collating element in a cset
1331 == static char *mcfind(cset *cs, char *cp);
1333 static char *
1334 mcfind(cset *cs, char *cp)
1336 char *p;
1338 if (cs->multis == NULL)
1339 return(NULL);
1340 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1341 if (strcmp(cp, p) == 0)
1342 return(p);
1343 return(NULL);
1345 #endif
1348 - mcinvert - invert the list of collating elements in a cset
1349 == static void mcinvert(struct parse *p, cset *cs);
1351 * This would have to know the set of possibilities. Implementation
1352 * is deferred.
1354 static void
1355 mcinvert(struct parse *p, cset *cs)
1357 assert(cs->multis == NULL); /* xxx */
1361 - mccase - add case counterparts of the list of collating elements in a cset
1362 == static void mccase(struct parse *p, cset *cs);
1364 * This would have to know the set of possibilities. Implementation
1365 * is deferred.
1367 static void
1368 mccase(struct parse *p, cset *cs)
1370 assert(cs->multis == NULL); /* xxx */
1373 #ifdef notdef
1375 - isinsets - is this character in any sets?
1376 == static int isinsets(struct re_guts *g, int c);
1378 static int /* predicate */
1379 isinsets(struct re_guts *g, int c)
1381 uch *col;
1382 int i;
1383 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1384 unsigned uc = (unsigned char)c;
1386 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1387 if (col[uc] != 0)
1388 return(1);
1389 return(0);
1393 - samesets - are these two characters in exactly the same sets?
1394 == static int samesets(struct re_guts *g, int c1, int c2);
1396 static int /* predicate */
1397 samesets(struct re_guts *g, int c1, int c2)
1399 uch *col;
1400 int i;
1401 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1402 unsigned uc1 = (unsigned char)c1;
1403 unsigned uc2 = (unsigned char)c2;
1405 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1406 if (col[uc1] != col[uc2])
1407 return(0);
1408 return(1);
1410 #endif
1413 - categorize - sort out character categories
1414 == static void categorize(struct parse *p, struct re_guts *g);
1416 static void
1417 categorize(struct parse *p, struct re_guts *g)
1419 #ifdef notdef
1420 cat_t *cats = g->categories;
1421 int c;
1422 int c2;
1423 cat_t cat;
1425 /* avoid making error situations worse */
1426 if (p->error != 0)
1427 return;
1429 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1430 if (cats[c] == 0 && isinsets(g, c)) {
1431 cat = g->ncategories++;
1432 cats[c] = cat;
1433 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1434 if (cats[c2] == 0 && samesets(g, c, c2))
1435 cats[c2] = cat;
1437 #endif
1441 - dupl - emit a duplicate of a bunch of sops
1442 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1444 static sopno /* start of duplicate */
1445 dupl(struct parse *p, sopno start, sopno finish)
1447 /* from here */
1448 /* to this less one */
1450 sopno ret = HERE();
1451 sopno len = finish - start;
1453 assert(finish >= start);
1454 if (len == 0)
1455 return(ret);
1456 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1457 return ret;
1458 assert(p->ssize >= p->slen + len);
1459 (void) memcpy((char *)(p->strip + p->slen),
1460 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1461 (void) memcpy((char *)(p->stripdata + p->slen),
1462 (char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T));
1463 p->slen += len;
1464 return(ret);
1468 - doemit - emit a strip operator
1469 == static void doemit(struct parse *p, sop op, size_t opnd);
1471 * It might seem better to implement this as a macro with a function as
1472 * hard-case backup, but it's just too big and messy unless there are
1473 * some changes to the data structures. Maybe later.
1475 static void
1476 doemit(struct parse *p, sop op, size_t opnd)
1478 /* avoid making error situations worse */
1479 if (p->error != 0)
1480 return;
1482 /* deal with oversize operands ("can't happen", more or less) */
1483 assert(opnd < 1);
1485 /* deal with undersized strip */
1486 if (p->slen >= p->ssize)
1487 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1488 return;
1490 /* finally, it's all reduced to the easy case */
1491 p->strip[p->slen] = op;
1492 p->stripdata[p->slen] = opnd;
1493 p->slen++;
1497 - doinsert - insert a sop into the strip
1498 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1500 static void
1501 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1503 sopno sn;
1504 sop s;
1505 RCHAR_T d;
1506 int i;
1508 /* avoid making error situations worse */
1509 if (p->error != 0)
1510 return;
1512 sn = HERE();
1513 EMIT(op, opnd); /* do checks, ensure space */
1514 assert(HERE() == sn+1);
1515 s = p->strip[sn];
1516 d = p->stripdata[sn];
1518 /* adjust paren pointers */
1519 assert(pos > 0);
1520 for (i = 1; i < NPAREN; i++) {
1521 if (p->pbegin[i] >= pos) {
1522 p->pbegin[i]++;
1524 if (p->pend[i] >= pos) {
1525 p->pend[i]++;
1529 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1530 (HERE()-pos-1)*sizeof(sop));
1531 memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos],
1532 (HERE()-pos-1)*sizeof(RCHAR_T));
1533 p->strip[pos] = s;
1534 p->stripdata[pos] = d;
1538 - dofwd - complete a forward reference
1539 == static void dofwd(struct parse *p, sopno pos, sop value);
1541 static void
1542 dofwd(struct parse *p, sopno pos, sop value)
1544 /* avoid making error situations worse */
1545 if (p->error != 0)
1546 return;
1548 assert(value < 1);
1549 p->stripdata[pos] = value;
1553 - enlarge - enlarge the strip
1554 == static int enlarge(struct parse *p, sopno size);
1556 static int
1557 enlarge(struct parse *p, sopno size)
1559 sop *sp;
1560 RCHAR_T *dp;
1561 sopno osize;
1563 if (p->ssize >= size)
1564 return 1;
1566 osize = p->ssize;
1567 p->ssize = size;
1568 if (MEMSIZE(p) > MEMLIMIT)
1569 goto oomem;
1570 sp = realloc(p->strip, p->ssize * sizeof(sop));
1571 if (sp == NULL)
1572 goto oomem;
1573 p->strip = sp;
1574 dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T));
1575 if (dp == NULL) {
1576 oomem:
1577 p->ssize = osize;
1578 SETERROR(REG_ESPACE);
1579 return 0;
1581 p->stripdata = dp;
1582 return 1;
1586 - stripsnug - compact the strip
1587 == static void stripsnug(struct parse *p, struct re_guts *g);
1589 static void
1590 stripsnug(struct parse *p, struct re_guts *g)
1592 g->nstates = p->slen;
1593 g->strip = (sop *)realloc((char *)p->strip,
1594 p->slen * sizeof(sop));
1595 if (g->strip == NULL) {
1596 SETERROR(REG_ESPACE);
1597 g->strip = p->strip;
1599 g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata,
1600 p->slen * sizeof(RCHAR_T));
1601 if (g->stripdata == NULL) {
1602 SETERROR(REG_ESPACE);
1603 g->stripdata = p->stripdata;
1608 - findmust - fill in must and mlen with longest mandatory literal string
1609 == static void findmust(struct parse *p, struct re_guts *g);
1611 * This algorithm could do fancy things like analyzing the operands of |
1612 * for common subsequences. Someday. This code is simple and finds most
1613 * of the interesting cases.
1615 * Note that must and mlen got initialized during setup.
1617 static void
1618 findmust(struct parse *p, struct re_guts *g)
1620 sop *scans;
1621 RCHAR_T *scand;
1622 sop *starts = 0;
1623 RCHAR_T *startd = NULL;
1624 sop *newstarts = 0;
1625 RCHAR_T *newstartd = NULL;
1626 sopno newlen;
1627 sop s;
1628 RCHAR_T d;
1629 RCHAR_T *cp;
1630 sopno i;
1632 /* avoid making error situations worse */
1633 if (p->error != 0)
1634 return;
1636 /* find the longest OCHAR sequence in strip */
1637 newlen = 0;
1638 scans = g->strip + 1;
1639 scand = g->stripdata + 1;
1640 do {
1641 s = *scans++;
1642 d = *scand++;
1643 switch (s) {
1644 case OCHAR: /* sequence member */
1645 if (newlen == 0) { /* new sequence */
1646 newstarts = scans - 1;
1647 newstartd = scand - 1;
1649 newlen++;
1650 break;
1651 case OPLUS_: /* things that don't break one */
1652 case OLPAREN:
1653 case ORPAREN:
1654 break;
1655 case OQUEST_: /* things that must be skipped */
1656 case OCH_:
1657 scans--;
1658 scand--;
1659 do {
1660 scans += d;
1661 scand += d;
1662 s = *scans;
1663 d = *scand;
1664 /* assert() interferes w debug printouts */
1665 if (s != O_QUEST && s != O_CH && s != OOR2) {
1666 g->iflags |= BAD;
1667 return;
1669 } while (s != O_QUEST && s != O_CH);
1670 /* fallthrough */
1671 default: /* things that break a sequence */
1672 if (newlen > g->mlen) { /* ends one */
1673 starts = newstarts;
1674 startd = newstartd;
1675 g->mlen = newlen;
1677 newlen = 0;
1678 break;
1680 } while (s != OEND);
1682 if (g->mlen == 0) /* there isn't one */
1683 return;
1685 /* turn it into a character string */
1686 g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T));
1687 if (g->must == NULL) { /* argh; just forget it */
1688 g->mlen = 0;
1689 return;
1691 cp = g->must;
1692 scans = starts;
1693 scand = startd;
1694 for (i = g->mlen; i > 0; i--) {
1695 for (;;) {
1696 s = *scans++;
1697 d = *scand++;
1698 if (s == OCHAR)
1699 break;
1701 assert(cp < g->must + g->mlen);
1702 *cp++ = d;
1704 assert(cp == g->must + g->mlen);
1705 *cp++ = '\0'; /* just on general principles */
1709 - pluscount - count + nesting
1710 == static sopno pluscount(struct parse *p, struct re_guts *g);
1712 static sopno /* nesting depth */
1713 pluscount(struct parse *p, struct re_guts *g)
1715 sop *scan;
1716 sop s;
1717 sopno plusnest = 0;
1718 sopno maxnest = 0;
1720 if (p->error != 0)
1721 return(0); /* there may not be an OEND */
1723 scan = g->strip + 1;
1724 do {
1725 s = *scan++;
1726 switch (s) {
1727 case OPLUS_:
1728 plusnest++;
1729 break;
1730 case O_PLUS:
1731 if (plusnest > maxnest)
1732 maxnest = plusnest;
1733 plusnest--;
1734 break;
1736 } while (s != OEND);
1737 if (plusnest != 0)
1738 g->iflags |= BAD;
1739 return(maxnest);