dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / usr / src / lib / libast / common / regex / regcomp.c
blob2d8ac3a319bf65dff02f510cb5c25d0b71f4d7c5
1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
20 * *
21 ***********************************************************************/
22 #pragma prototyped
25 * posix regex compiler
28 #include "reglib.h"
30 #if _PACKAGE_ast
31 #include "lclib.h"
32 #endif
34 #define serialize re_serialize /* hp.ia64 <unistd.h>! */
36 #define C_ESC (-1)
37 #define C_MB (-2)
39 #if _AST_REGEX_DEBUG
41 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n))
42 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0)
43 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
45 static unsigned long debug;
46 static unsigned long debug_flag;
48 #else
50 #define DEBUG_INIT()
51 #define DEBUG_TEST(f,y,n) (n)
52 #define DEBUG_CODE(f,y,n) do {n} while(0)
54 #endif
56 #if _PACKAGE_ast
58 typedef struct Cchr_s
60 Dtlink_t lnk;
61 unsigned char nam[2];
62 Ckey_t key;
63 } Cchr_t;
65 #endif
67 #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0)
70 * determine whether greedy matching will work, i.e. produce
71 * the best match first. such expressions are "easy", and
72 * need no backtracking once a complete match is found.
73 * if an expression has backreferences or alts it's hard
74 * else if it has only one closure it's easy
75 * else if all closures are simple (i.e. one-character) it's easy
76 * else it's hard.
79 typedef struct Stats_s
81 unsigned long l; /* min length to left of x */
82 unsigned long k; /* min length to left of y */
83 unsigned long m; /* min length */
84 unsigned long n; /* max length */
85 unsigned short a; /* number of alternations */
86 unsigned short b; /* number of backrefs */
87 unsigned short c; /* number of closures */
88 unsigned short e; /* $ */
89 unsigned short i; /* number of negations */
90 unsigned short p; /* number of named subexpressions */
91 unsigned short s; /* number of simple closures */
92 unsigned short t; /* number of tries */
93 unsigned short u; /* number of unnamed subexpressions */
94 Rex_t* x; /* max length REX_STRING */
95 Rex_t* y; /* max length REX_TRIE */
96 } Stats_t;
98 typedef struct Token_s
100 unsigned long min;
101 unsigned long max;
102 short lex;
103 short len;
104 short esc;
105 short att;
106 short push;
107 } Token_t;
109 typedef struct Cenv_s
111 int delimiter; /* pattern delimiter */
112 int error; /* last error */
113 int explicit; /* explicit match on this char */
114 int mappeddot; /* inverse mapped '.' */
115 int mappednewline; /* inverse mapped '\n' */
116 int mappedslash; /* inverse mapped '/' */
117 regflags_t flags; /* flags arg to regcomp */
118 int type; /* BRE,ERE,ARE,SRE,KRE */
119 unsigned char* cursor; /* curent point in re */
120 unsigned char* pattern; /* the original pattern */
121 unsigned char* literal; /* literal restart pattern */
122 int parno; /* number of last open paren */
123 int parnest; /* paren nest count */
124 int posixkludge; /* to make * nonspecial */
125 int regexp; /* <regexp.h> compatibility */
126 Token_t token; /* token lookahead */
127 Stats_t stats; /* RE statistics */
128 int terminator; /* pattern terminator */
129 Rex_t* paren[2*(BACK_REF_MAX+2)];
130 /* paren[i]!=0 if \i defined */
131 regex_t* regex; /* user handle */
132 regdisc_t* disc; /* user discipline */
133 unsigned char* map; /* external to native ccode map */
134 unsigned char* MAP; /* fold and/or map */
135 } Cenv_t;
138 * allocate a new Rex_t node
141 static Rex_t*
142 node(Cenv_t* env, int type, int lo, int hi, size_t extra)
144 register Rex_t* e;
146 if (e = (Rex_t*)alloc(env->disc, 0, sizeof(Rex_t) + extra))
148 memset(e, 0, sizeof(Rex_t) + extra);
149 e->type = type;
150 e->marked = 0;
151 e->lo = lo;
152 e->hi = hi;
153 e->flags = env->flags;
154 e->map = (e->flags & REG_ICASE) ? env->MAP : env->map;
155 e->explicit = env->explicit;
156 if (extra)
157 e->re.data = (char*)e + sizeof(Rex_t);
159 return e;
163 * free a Trie_node_t node
166 static void
167 triedrop(regdisc_t* disc, Trie_node_t* e)
169 if (e)
171 triedrop(disc, e->son);
172 triedrop(disc, e->sib);
173 alloc(disc, e, 0);
178 * free a Rex_t node
181 void
182 drop(regdisc_t* disc, Rex_t* e)
184 int i;
185 Rex_t* x;
187 if (e && !(disc->re_flags & REG_NOFREE))
190 switch (e->type)
192 case REX_ALT:
193 case REX_CONJ:
194 drop(disc, e->re.group.expr.binary.left);
195 drop(disc, e->re.group.expr.binary.right);
196 break;
197 case REX_GROUP:
198 case REX_GROUP_AHEAD:
199 case REX_GROUP_AHEAD_NOT:
200 case REX_GROUP_BEHIND:
201 case REX_GROUP_BEHIND_NOT:
202 case REX_GROUP_CUT:
203 case REX_NEG:
204 case REX_REP:
205 drop(disc, e->re.group.expr.rex);
206 break;
207 case REX_TRIE:
208 for (i = 0; i <= UCHAR_MAX; i++)
209 triedrop(disc, e->re.trie.root[i]);
210 break;
212 x = e->next;
213 alloc(disc, e, 0);
214 } while (e = x);
218 * mark e and descendants minimal
221 static void
222 mark(register Rex_t* e, int set)
224 if (e && !e->marked)
227 e->marked = 1;
228 if (set)
229 e->flags |= REG_MINIMAL;
230 else
231 e->flags &= ~REG_MINIMAL;
232 switch (e->type)
234 case REX_ALT:
235 case REX_CONJ:
236 case REX_GROUP_COND:
237 if (e->re.group.expr.binary.left)
238 mark(e->re.group.expr.binary.left, set);
239 if (e->re.group.expr.binary.right)
240 mark(e->re.group.expr.binary.right, set);
241 break;
242 case REX_GROUP:
243 case REX_GROUP_AHEAD:
244 case REX_GROUP_AHEAD_NOT:
245 case REX_GROUP_BEHIND:
246 case REX_GROUP_BEHIND_NOT:
247 case REX_GROUP_CUT:
248 case REX_NEG:
249 case REX_REP:
250 case REX_TRIE:
251 mark(e->re.group.expr.rex, set);
252 break;
254 } while (e = e->next);
258 * assign subexpression numbers by a preorder tree walk
261 static int
262 serialize(Cenv_t* env, Rex_t* e, int n)
266 e->serial = n++;
267 switch (e->type)
269 case REX_ALT:
270 case REX_GROUP_COND:
271 if (e->re.group.expr.binary.left)
272 n = serialize(env, e->re.group.expr.binary.left, n);
273 e->re.group.expr.binary.serial = n++;
274 if (e->re.group.expr.binary.right)
275 n = serialize(env, e->re.group.expr.binary.right, n);
276 break;
277 case REX_CONJ:
278 n = serialize(env, e->re.group.expr.binary.left, n);
279 n = serialize(env, e->re.group.expr.binary.right, n);
280 break;
281 case REX_GROUP:
282 case REX_GROUP_AHEAD:
283 case REX_GROUP_AHEAD_NOT:
284 case REX_GROUP_BEHIND:
285 case REX_GROUP_BEHIND_NOT:
286 case REX_GROUP_CUT:
287 case REX_NEG:
288 case REX_REP:
289 n = serialize(env, e->re.group.expr.rex, n);
290 break;
292 } while (e = e->next);
293 return n;
297 * catenate e and f into a sequence, collapsing them if possible
300 static Rex_t*
301 cat(Cenv_t* env, Rex_t* e, Rex_t* f)
303 Rex_t* g;
305 if (!f)
307 drop(env->disc, e);
308 return 0;
310 if (e->type == REX_NULL)
312 drop(env->disc, e);
313 return f;
315 if (f->type == REX_NULL)
317 g = f->next;
318 f->next = 0;
319 drop(env->disc, f);
320 f = g;
322 else if (e->type == REX_DOT && f->type == REX_DOT)
324 unsigned int m = e->lo + f->lo;
325 unsigned int n = e->hi + f->hi;
327 if (m <= RE_DUP_MAX)
329 if (e->hi > RE_DUP_MAX || f->hi > RE_DUP_MAX)
331 n = RE_DUP_INF;
332 goto combine;
334 else if (n <= RE_DUP_MAX)
336 combine:
337 e->lo = m;
338 e->hi = n;
339 g = f->next;
340 f->next = 0;
341 drop(env->disc, f);
342 f = g;
346 e->next = f;
347 return e;
351 * collect re statistics
354 static int
355 stats(register Cenv_t* env, register Rex_t* e)
357 register unsigned long n;
358 register unsigned long m;
359 unsigned long cm;
360 unsigned long nm;
361 unsigned long cn;
362 unsigned long nn;
363 unsigned long l;
364 unsigned long k;
365 unsigned long t;
366 Rex_t* q;
367 Rex_t* x;
368 Rex_t* y;
369 unsigned char c;
370 unsigned char b;
374 switch (e->type)
376 case REX_ALT:
377 x = env->stats.x;
378 l = env->stats.l;
379 y = env->stats.y;
380 k = env->stats.k;
381 t = env->stats.t;
382 if (++env->stats.a <= 0)
383 return 1;
384 cm = env->stats.m;
385 env->stats.m = 0;
386 cn = env->stats.n;
387 env->stats.n = 0;
388 if (stats(env, e->re.group.expr.binary.left))
389 return 1;
390 m = env->stats.m;
391 env->stats.m = 0;
392 n = env->stats.n;
393 env->stats.n = 0;
394 if (e->re.group.expr.binary.right && stats(env, e->re.group.expr.binary.right))
395 return 1;
396 if (env->stats.m > m)
397 env->stats.m = m;
398 else
399 m = env->stats.m;
400 if ((env->stats.m += cm) < m)
401 return 1;
402 if (env->stats.n < n)
403 env->stats.n = n;
404 else
405 n = env->stats.n;
406 if ((env->stats.n += cn) < n)
407 return 1;
408 env->stats.x = x;
409 env->stats.l = l;
410 env->stats.y = y;
411 env->stats.k = k;
412 env->stats.t = t;
413 break;
414 case REX_BACK:
415 if (++env->stats.b <= 0)
416 return 1;
417 break;
418 case REX_CLASS:
419 case REX_COLL_CLASS:
420 case REX_DOT:
421 case REX_ONECHAR:
422 n = env->stats.m;
423 if ((env->stats.m += e->lo) < n)
424 return 1;
425 if (e->hi != RE_DUP_INF)
427 n = env->stats.n;
428 if ((env->stats.n += e->hi) < n)
429 return 1;
431 if (e->lo != e->hi)
433 if (++env->stats.c <= 0)
434 return 1;
435 if (++env->stats.s <= 0)
436 return 1;
438 break;
439 case REX_CONJ:
440 cm = env->stats.m;
441 env->stats.m = 0;
442 cn = env->stats.n;
443 env->stats.n = 0;
444 if (stats(env, e->re.group.expr.binary.left))
445 return 1;
446 nm = env->stats.m;
447 env->stats.m = 0;
448 nn = env->stats.n;
449 env->stats.n = 0;
450 if (stats(env, e->re.group.expr.binary.right))
451 return 1;
452 if (env->stats.m < nm)
453 env->stats.m = nm;
454 else
455 nm = env->stats.m;
456 if ((env->stats.m += cm) < nm)
457 return 1;
458 if (env->stats.n < nn)
459 env->stats.n = nn;
460 else
461 nn = env->stats.n;
462 if ((env->stats.n += cn) < nn)
463 return 1;
464 break;
465 case REX_END:
466 env->stats.e = 1;
467 break;
468 case REX_GROUP:
469 if (e->re.group.number && ++env->stats.p <= 0 || !e->re.group.number && ++env->stats.u <= 0)
470 return 1;
471 if (stats(env, e->re.group.expr.rex))
472 return 1;
473 break;
474 case REX_GROUP_AHEAD:
475 case REX_GROUP_AHEAD_NOT:
476 case REX_GROUP_BEHIND:
477 case REX_GROUP_BEHIND_NOT:
478 m = env->stats.m;
479 n = env->stats.n;
480 x = env->stats.x;
481 y = env->stats.y;
482 if (stats(env, e->re.group.expr.rex))
483 return 1;
484 env->stats.m = m;
485 env->stats.n = n;
486 env->stats.x = x;
487 env->stats.y = y;
488 switch (e->type)
490 case REX_GROUP_AHEAD:
491 case REX_GROUP_BEHIND:
492 if (++env->stats.u <= 0)
493 return 1;
494 break;
496 break;
497 case REX_GROUP_COND:
498 if (++env->stats.u <= 0)
499 return 1;
500 m = env->stats.m;
501 n = env->stats.n;
502 x = env->stats.x;
503 y = env->stats.y;
504 if (e->re.group.size > 0 && ++env->stats.b <= 0)
505 return 1;
506 if (e->re.group.expr.binary.left && stats(env, e->re.group.expr.binary.left))
507 return 1;
508 if (q = e->re.group.expr.binary.right)
510 if (q->re.group.expr.binary.left && stats(env, q->re.group.expr.binary.left))
511 return 1;
512 if (q->re.group.expr.binary.right && stats(env, q->re.group.expr.binary.right))
513 return 1;
515 env->stats.m = m;
516 env->stats.n = n;
517 env->stats.x = x;
518 env->stats.y = y;
519 break;
520 case REX_GROUP_CUT:
521 if (++env->stats.u <= 0)
522 return 1;
523 m = env->stats.m;
524 n = env->stats.n;
525 x = env->stats.x;
526 y = env->stats.y;
527 if (stats(env, e->re.group.expr.rex))
528 return 1;
529 env->stats.m = m;
530 env->stats.n = n;
531 env->stats.x = x;
532 env->stats.y = y;
533 break;
534 case REX_NEG:
535 env->stats.i++;
536 x = env->stats.x;
537 l = env->stats.l;
538 y = env->stats.y;
539 k = env->stats.k;
540 t = env->stats.t;
541 cm = env->stats.m;
542 env->stats.m = 0;
543 if (stats(env, e->re.group.expr.rex))
544 return 1;
545 env->stats.m = !env->stats.m;
546 if ((env->stats.m += cm) < cm)
547 return 1;
548 env->stats.x = x;
549 env->stats.l = l;
550 env->stats.y = y;
551 env->stats.k = k;
552 env->stats.t = t;
553 break;
554 case REX_REP:
555 x = env->stats.x;
556 l = env->stats.l;
557 y = env->stats.y;
558 k = env->stats.k;
559 t = env->stats.t;
560 if (++env->stats.c <= 0)
561 return 1;
562 b = env->stats.b;
563 c = env->stats.c;
564 cm = env->stats.m;
565 env->stats.m = 0;
566 if (stats(env, e->re.group.expr.rex))
567 return 1;
568 if (env->stats.m == 1 && b == env->stats.b && c == env->stats.c && ++env->stats.s <= 0)
569 return 1;
570 if (e->lo < 1)
572 env->stats.x = x;
573 env->stats.l = l;
574 env->stats.y = y;
575 env->stats.k = k;
576 env->stats.t = t;
577 env->stats.m = cm;
579 else
581 m = env->stats.m;
582 if ((env->stats.m *= e->lo) > 0 && env->stats.m < m)
583 return 1;
584 m = env->stats.m;
585 if ((env->stats.m += cm) < m)
586 return 1;
587 if (env->stats.x != x)
588 env->stats.l = cm;
589 if (env->stats.y != y)
590 env->stats.k = cm;
592 break;
593 case REX_STRING:
594 if (!e->map)
596 cm = env->stats.m;
597 if ((env->stats.m += e->re.string.size) < cm)
598 return 1;
599 cn = env->stats.n;
600 if ((env->stats.n += e->re.string.size) < cn)
601 return 1;
602 if (!env->stats.x || env->stats.x->re.string.size < e->re.string.size)
604 env->stats.x = e;
605 env->stats.l = cm;
608 break;
609 case REX_TRIE:
610 if (++env->stats.s <= 0)
611 return 1;
612 cm = env->stats.m;
613 if ((env->stats.m += e->re.trie.min) < cm)
614 return 1;
615 cn = env->stats.n;
616 if ((env->stats.n += e->re.trie.max) < cn)
617 return 1;
618 env->stats.t++;
619 if (!env->stats.y || env->stats.y->re.trie.min < e->re.trie.min)
621 env->stats.y = e;
622 env->stats.k = cm;
624 break;
626 } while (e = e->next);
627 return 0;
630 static int token(Cenv_t*);
632 static int
633 magic(register Cenv_t* env, register int c, int escaped)
635 register char* sp;
636 register int n;
637 int o = c;
638 int e = env->error;
639 int l = env->token.len;
640 short* mp;
641 char* ep;
643 if (mp = state.magic[c])
645 c = mp[env->type+escaped];
646 if (c >= T_META)
648 sp = (char*)env->cursor + env->token.len;
649 switch (c)
651 case T_LEFT:
652 n = 0;
653 ep = sp;
654 while (*sp >= '0' && *sp <= '9')
656 if (n > (INT_MAX / 10))
658 env->error = REG_BADBR;
659 goto bad;
661 n = n * 10 + *sp++ - '0';
663 if (sp == ep)
665 if (env->type < SRE || *sp != ',')
667 env->error = *sp ? REG_BADBR : REG_EBRACE;
668 goto bad;
671 else if (n > RE_DUP_MAX)
673 env->error = REG_BADBR;
674 goto bad;
676 env->token.min = n;
677 if (*sp == ',')
679 n = 0;
680 ep = ++sp;
681 while (*sp >= '0' && *sp <= '9')
683 if (n > (INT_MAX / 10))
685 env->error = REG_BADBR;
686 goto bad;
688 n = n * 10 + *sp++ - '0';
690 if (sp == ep)
691 n = RE_DUP_INF;
692 else if (n < env->token.min)
694 env->error = REG_BADBR;
695 goto bad;
698 env->token.max = n;
699 switch (*sp)
701 case 0:
702 env->error = REG_EBRACE;
703 goto bad;
704 case '\\':
705 if (!escaped)
707 env->error = REG_BADBR;
708 goto bad;
710 sp++;
711 break;
712 default:
713 if (escaped)
715 env->error = REG_BADBR;
716 goto bad;
718 break;
720 switch (*sp++)
722 case 0:
723 env->error = REG_EBRACE;
724 goto bad;
725 case '}':
726 break;
727 default:
728 env->error = REG_BADBR;
729 goto bad;
731 env->token.len = sp - (char*)env->cursor;
732 break;
733 case T_RIGHT:
734 env->error = REG_EBRACE;
735 goto bad;
736 case T_OPEN:
737 if (env->type < SRE && *sp == '?')
739 env->token.len++;
740 env->token.lex = 0;
741 goto group;
743 break;
744 case T_ESCAPE:
745 c = chresc(sp - 2, &ep);
746 if (ep < sp)
747 goto bad;
748 env->token.len += ep - sp;
749 if (c >= T_META)
751 env->token.lex = c;
752 c = C_ESC;
754 return c;
755 case T_BACK+0:
756 case T_BACK+1:
757 case T_BACK+2:
758 case T_BACK+3:
759 case T_BACK+4:
760 case T_BACK+5:
761 case T_BACK+6:
762 case T_BACK+7:
763 n = chresc(sp - 2, &ep);
764 if (ep > sp + 1)
766 env->token.len += ep - sp;
767 return n;
769 /*FALLTHROUGH*/
770 case T_BACK+8:
771 case T_BACK+9:
772 if (env->type == SRE || c == T_BACK && !(env->flags & REG_LENIENT))
774 env->error = REG_BADESC;
775 goto bad;
777 if ((env->flags & REG_MULTIREF) && isdigit(*sp))
779 c = (c - T_BACK) * 10 + (*sp - '0');
780 if (c > 0 && c <= env->parno && env->paren[c])
781 c += T_BACK;
782 else
783 c = chresc(sp - 2, &ep);
784 env->token.len++;
786 if (c == T_BACK)
787 c = 0;
788 break;
789 case T_BAD:
790 if (escaped == 1 && (env->flags & REG_LENIENT) && (c = mp[env->type+escaped+2]) >= T_META)
791 return c;
792 goto bad;
794 if (env->type >= SRE)
796 if (c == T_DOT)
797 c = '.';
798 else if (c < T_OPEN)
800 if (env->type == KRE && *(env->cursor + env->token.len) == '-' && *(env->cursor + env->token.len + 1) == '(')
802 env->token.len++;
803 env->token.att = 1;
805 if (env->type == KRE && *(env->cursor + env->token.len) == '(')
807 env->token.len++;
808 switch (c)
810 case T_AT:
811 break;
812 case T_PERCENT:
813 env->token.lex = c;
814 goto group;
815 case T_TILDE:
816 env->token.lex = 0;
817 goto group;
818 default:
819 env->token.lex = c;
820 break;
822 c = T_OPEN;
824 else if (c == T_STAR)
825 c = T_DOTSTAR;
826 else if (c == T_QUES)
827 c = T_DOT;
828 else
830 c = o;
831 env->token.len = l;
834 else if (c > T_BACK)
836 c = (c - T_BACK) * 2 - 1;
837 c = (c > env->parno || !env->paren[c]) ? o : T_BACK + c;
839 else if (env->type == KRE && !env->parnest && (env->flags & REG_SHELL_GROUP))
841 if (c == T_AND)
842 c = '&';
843 else if (c == T_BAR)
844 c = '|';
845 else if (c == T_OPEN)
846 c = '(';
851 else if (escaped == 2)
853 if (env->type >= SRE && !(env->flags & REG_SHELL_ESCAPED) || (env->flags & REG_ESCAPE) && (c == '[' || c == '-' || c == ']' || env->delimiter && c == env->delimiter))
854 /*ok*/;
855 else
857 env->error = REG_BADESC;
858 goto bad;
861 else if (escaped && !(env->flags & REG_LENIENT) && c != ']')
863 env->error = REG_BADESC;
864 goto bad;
866 return c;
867 group:
868 sp = (char*)env->cursor + env->token.len;
869 switch (*sp++)
871 case ')':
872 break;
873 case '#':
874 for (;;)
876 switch (*sp++)
878 case 0:
879 env->error = REG_EPAREN;
880 return T_BAD;
881 case ')':
882 break;
883 default:
884 continue;
886 break;
888 break;
889 default:
890 return T_GROUP;
892 env->cursor = (unsigned char*)sp;
893 return token(env);
894 bad:
895 if (escaped == 2)
896 env->error = e;
897 else if (env->flags & REG_LENIENT)
898 return o;
899 else if (escaped == 1 && !env->error)
901 if (mp || o == ']')
902 return o;
903 env->error = REG_BADESC;
905 return T_BAD;
908 static int
909 token(register Cenv_t* env)
911 int c;
912 int posixkludge;
914 if (env->token.push)
915 return env->token.lex;
916 env->token.att = env->token.esc = 0;
917 if ((env->token.len = MBSIZE(env->cursor)) > 1)
918 return env->token.lex = C_MB;
919 env->token.lex = 0;
920 for (;;)
922 c = *env->cursor;
923 if (c == 0 || c == env->delimiter || c == env->terminator)
924 return T_END;
925 if (!(env->flags & REG_COMMENT))
926 break;
927 if (c == '#')
931 c = *++env->cursor;
932 if (c == 0 || c == env->delimiter)
933 return T_END;
934 } while (c != '\n');
936 else if (!isspace(c))
937 break;
938 env->cursor++;
940 if (c == '\n' && (env->flags & REG_MULTIPLE) && !env->delimiter)
942 if (env->parnest)
944 env->error = REG_EPAREN;
945 return T_BAD;
947 env->parno = 0;
948 env->pattern = env->cursor + 1;
949 return T_BAR;
951 if (env->flags & REG_LITERAL)
952 return c;
953 if (posixkludge = env->posixkludge)
955 env->posixkludge = 0;
956 if (c == '*')
957 return c;
959 if (c == '\\')
961 if (env->flags & REG_SHELL_ESCAPED)
962 return c;
963 if (!(c = *(env->cursor + 1)) || c == env->terminator)
965 if (env->flags & REG_LENIENT)
967 if (c)
969 env->token.esc = env->token.len;
970 env->token.len += MBSIZE(env->cursor + 1);
971 return c;
973 return '\\';
975 env->error = REG_EESCAPE;
976 return T_BAD;
978 env->token.esc = env->token.len;
979 env->token.len += MBSIZE(env->cursor + 1);
980 if (env->delimiter && c == 'n')
981 return '\n';
982 else if (c == env->delimiter)
983 return magic(env, c, 0);
984 else if (c == '(' && env->type == BRE)
985 env->posixkludge = 1;
986 else if (c == ')' && env->type == BRE && env->parnest <= 0)
988 env->error = REG_EPAREN;
989 return T_BAD;
991 else if (isspace(c) && (env->flags & REG_COMMENT))
992 return c;
993 return magic(env, c, 1);
995 else if (c == '$')
997 if (env->type == BRE && (*(env->cursor + 1) == 0 || *(env->cursor + 1) == env->delimiter || *(env->cursor + 1) == env->terminator || *(env->cursor + 1) == '\\' && *(env->cursor + 2) == ')') || (env->flags & REG_MULTIPLE) && *(env->cursor + 1) == '\n')
998 return T_DOLL;
1000 else if (c == '^')
1002 if (env->type == BRE && (env->cursor == env->pattern || posixkludge == 1))
1004 env->posixkludge = 2;
1005 return T_CFLX;
1008 else if (c == ')')
1010 if (env->type != BRE && env->parnest <= 0)
1011 return c;
1013 else if (c == '/' && env->explicit == env->mappedslash)
1015 while (*(env->cursor + env->token.len) == c)
1016 env->token.len++;
1017 return T_SLASHPLUS;
1019 return magic(env, c, 0);
1022 static Celt_t*
1023 col(Celt_t* ce, int ic, unsigned char* bp, int bw, int bc, unsigned char* ep, int ew, int ec)
1025 register char* s;
1026 register unsigned char* k;
1027 register unsigned char* e;
1028 register int c;
1029 register int cc;
1030 int bt;
1031 int et;
1032 Ckey_t key;
1034 cc = 0;
1035 for (;;)
1037 k = key;
1038 if (bw == 1)
1040 c = bc;
1041 if (ic)
1043 if (isupper(c))
1045 c = tolower(c);
1046 cc = -1;
1048 else if (islower(c))
1050 c = toupper(c);
1051 cc = -1;
1054 *k++ = c;
1056 else if (bw < COLL_KEY_MAX)
1058 s = (char*)bp;
1059 if (ic)
1061 c = mbchar(s);
1062 if (iswupper(c))
1064 c = towlower(c);
1065 cc = 1;
1067 else if (iswlower(c))
1069 c = towupper(c);
1070 cc = 1;
1073 if (cc > 0)
1075 cc = -1;
1076 k += mbconv((char*)k, c);
1078 else
1079 for (e = k + bw; k < e; *k++ = *s++);
1081 *k = 0;
1082 mbxfrm(ce->beg, key, COLL_KEY_MAX);
1083 if (ep)
1085 k = key;
1086 c = mbchar(k);
1087 if (iswupper(c))
1088 bt = COLL_range_uc;
1089 else if (iswlower(c))
1090 bt = COLL_range_lc;
1091 else
1092 bt = COLL_range;
1093 k = key;
1094 if (ew == 1)
1096 c = ec;
1097 if (ic)
1099 if (isupper(c))
1101 c = tolower(c);
1102 cc = -1;
1104 else if (islower(c))
1106 c = toupper(c);
1107 cc = -1;
1110 *k++ = c;
1112 else if (ew < COLL_KEY_MAX)
1114 s = (char*)ep;
1115 if (ic)
1117 c = mbchar(s);
1118 if (iswupper(c))
1120 c = towlower(c);
1121 cc = 1;
1123 else if (iswlower(c))
1125 c = towupper(c);
1126 cc = 1;
1129 if (cc > 0)
1131 cc = -1;
1132 k += mbconv((char*)k, c);
1134 else
1135 for (e = k + ew; k < e; *k++ = *s++);
1137 *k = 0;
1138 mbxfrm(ce->end, key, COLL_KEY_MAX);
1139 k = key;
1140 c = mbchar(k);
1141 if (iswupper(c))
1142 et = COLL_range_uc;
1143 else if (iswlower(c))
1144 et = COLL_range_lc;
1145 else
1146 et = COLL_range;
1147 ce->typ = bt == et ? bt : COLL_range;
1149 else
1150 ce->typ = COLL_char;
1151 ce++;
1152 if (!ic || !cc)
1153 break;
1154 ic = 0;
1156 return ce;
1159 static Rex_t*
1160 bra(Cenv_t* env)
1162 Rex_t* e;
1163 int c;
1164 int i;
1165 int w;
1166 int neg;
1167 int last;
1168 int inrange;
1169 int complicated;
1170 int collate;
1171 int elements;
1172 unsigned char* first;
1173 unsigned char* start;
1174 unsigned char* begin;
1175 unsigned char* s;
1176 regclass_t f;
1177 unsigned char buf[4 * (COLL_KEY_MAX + 1)];
1178 #if _PACKAGE_ast
1179 int ic;
1180 char mbc[COLL_KEY_MAX + 1];
1181 #endif
1183 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1184 return 0;
1185 collate = complicated = elements = 0;
1186 if (*env->cursor == '^' || env->type >= SRE && *env->cursor == '!')
1188 env->cursor++;
1189 neg = 1;
1191 else
1192 neg = 0;
1193 first = env->cursor;
1194 start = first + MBSIZE(first);
1195 if (*env->cursor == 0 || *(env->cursor + 1) == 0 || *env->cursor == env->terminator || *(env->cursor + 1) == env->terminator || (env->flags & REG_ESCAPE) && (*env->cursor == env->delimiter || *env->cursor != '\\' && *(env->cursor + 1) == env->delimiter))
1196 goto error;
1197 begin = env->cursor + MBSIZE(env->cursor);
1200 * inrange: 0=no, 1=possibly, 2=definitely
1203 inrange = 0;
1204 for (;;)
1206 if (!(c = *env->cursor) || c == env->terminator || c == env->delimiter && (env->flags & REG_ESCAPE))
1207 goto error;
1208 env->cursor += (w = MBSIZE(env->cursor));
1209 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1211 if (*env->cursor)
1213 if (*env->cursor == 'n')
1215 env->cursor++;
1216 c = '\n';
1218 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1220 env->token.len = 1;
1221 w = magic(env, *env->cursor, 2);
1222 if (env->token.len > 1 || w != T_BAD)
1224 if (env->token.len == 1 && (f = classfun(w)))
1226 if (inrange > 1)
1228 if (env->type < SRE && !(env->flags & REG_LENIENT))
1229 goto erange;
1230 inrange = 0;
1232 env->cursor++;
1233 for (c = 0; c <= UCHAR_MAX; c++)
1234 if ((*f)(c))
1235 setadd(e->re.charclass, c);
1236 complicated++;
1237 elements++;
1238 continue;
1240 if (env->token.len > 1 || w >= 0 && w < T_META)
1242 c = w;
1243 if (c > UCHAR_MAX)
1245 if (env->type < SRE && !(env->flags & REG_LENIENT) && !mbwide())
1246 goto erange;
1247 c = UCHAR_MAX;
1249 env->cursor += env->token.len;
1255 else if (c == ']')
1257 if (env->cursor == begin)
1259 last = c;
1260 inrange = 1;
1261 continue;
1263 if (inrange != 0)
1265 setadd(e->re.charclass, last);
1266 elements++;
1267 if (inrange == 2)
1269 setadd(e->re.charclass, '-');
1270 elements++;
1273 break;
1275 else if (c == '-')
1277 if (!inrange && env->cursor != begin && *env->cursor != ']')
1279 if (env->type < SRE && !(env->flags & REG_LENIENT))
1280 goto erange;
1281 continue;
1283 else if (inrange == 1)
1285 inrange = 2;
1286 complicated++;
1287 continue;
1290 else if (c == '[')
1292 switch (*env->cursor)
1294 case 0:
1295 goto error;
1296 case ':':
1297 if (env->regexp)
1298 goto normal;
1299 if (inrange == 1)
1301 setadd(e->re.charclass, last);
1302 elements++;
1304 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1306 if (env->cursor == start && (c = *(env->cursor + 1)))
1308 s = start = env->cursor + 1;
1309 while (*++s && *s != ':');
1310 if (*s == ':' && *(s + 1) == ']' && *(s + 2) == ']')
1312 if ((i = (s - start)) == 1)
1314 switch (c)
1316 case '<':
1317 i = REX_WBEG;
1318 break;
1319 case '>':
1320 i = REX_WEND;
1321 break;
1322 default:
1323 i = 0;
1324 break;
1326 if (i)
1328 env->cursor = s + 3;
1329 drop(env->disc, e);
1330 return node(env, i, 0, 0, 0);
1335 env->error = REG_ECTYPE;
1336 goto error;
1338 for (c = 0; c <= UCHAR_MAX; c++)
1339 if ((*f)(c))
1340 setadd(e->re.charclass, c);
1341 inrange = 0;
1342 complicated++;
1343 elements++;
1344 continue;
1345 case '=':
1346 if (env->regexp)
1347 goto normal;
1348 if (inrange == 2)
1349 goto erange;
1350 if (inrange == 1)
1352 setadd(e->re.charclass, last);
1353 elements++;
1355 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0)
1356 goto ecollate;
1357 if (c > 1)
1358 collate++;
1359 else
1360 setadd(e->re.charclass, buf[0]);
1361 c = buf[0];
1362 inrange = 0;
1363 complicated++;
1364 elements++;
1365 continue;
1366 case '.':
1367 if (env->regexp)
1368 goto normal;
1369 if ((c = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)buf, sizeof(buf))) < 0)
1370 goto ecollate;
1371 if (c > 1)
1372 collate++;
1373 c = buf[0];
1374 complicated++;
1375 break;
1376 default:
1377 normal:
1378 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1379 goto error;
1380 break;
1383 else if (w > 1)
1384 complicated++;
1385 if (inrange == 2)
1387 if (last <= c)
1389 for (i = last; i <= c; i++)
1390 setadd(e->re.charclass, i);
1391 inrange = env->type >= SRE || (env->flags & REG_LENIENT);
1392 elements += 2;
1394 else if (env->type >= SRE)
1396 setadd(e->re.charclass, last);
1397 setadd(e->re.charclass, c);
1398 elements += 2;
1399 inrange = 1;
1401 else if (!(env->flags & REG_LENIENT))
1402 goto erange;
1403 else
1404 inrange = 0;
1406 else if (inrange == 1)
1408 setadd(e->re.charclass, last);
1409 elements++;
1411 else
1412 inrange = 1;
1413 last = c;
1415 #if _PACKAGE_ast
1416 if (complicated && mbcoll())
1418 Dt_t* dt;
1419 Cchr_t* cc;
1420 Cchr_t* tc;
1421 Cchr_t* xc;
1422 Celt_t* ce;
1423 Cchr_t key;
1424 int rw;
1425 int rc;
1426 int wc;
1427 unsigned char* rp;
1428 unsigned char* pp;
1429 char* wp;
1430 char cb[2][COLL_KEY_MAX+1];
1432 static Dtdisc_t disc;
1434 static const char primary[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
1436 if (!(dt = (Dt_t*)LCINFO(AST_LC_COLLATE)->data))
1438 disc.key = offsetof(Cchr_t, key);
1439 if ((cc = newof(0, Cchr_t, elementsof(primary), 0)) && (dt = dtopen(&disc, Dttree)))
1441 for (i = 0; i < elementsof(primary) - 1; i++, cc++)
1443 cc->nam[0] = primary[i];
1444 mbxfrm(cc->key, cc->nam, COLL_KEY_MAX);
1445 dtinsert(dt, cc);
1447 for (i = 0; i < elementsof(cc->key); i++)
1448 cc->key[i] = ~0;
1449 dtinsert(dt, cc);
1450 LCINFO(AST_LC_COLLATE)->data = (void*)dt;
1452 else
1454 free(cc);
1455 drop(env->disc, e);
1456 return 0;
1459 if (dt)
1461 drop(env->disc, e);
1462 if (ic = env->flags & REG_ICASE)
1463 elements *= 2;
1464 if (!(e = node(env, REX_COLL_CLASS, 1, 1, (elements + 2) * sizeof(Celt_t))))
1465 return 0;
1466 ce = (Celt_t*)e->re.data;
1467 e->re.collate.invert = neg;
1468 e->re.collate.elements = ce;
1469 env->cursor = first;
1470 inrange = 0;
1471 for (;;)
1473 if ((c = *env->cursor) == 0 || c == env->terminator || (env->flags & REG_ESCAPE) && c == env->delimiter)
1474 goto error;
1475 pp = env->cursor;
1476 env->cursor += (w = MBSIZE(env->cursor));
1477 if (c == '\\' && ((env->flags & REG_CLASS_ESCAPE) || env->type >= SRE && env->parnest || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE)))
1479 if (*env->cursor)
1481 if (*env->cursor == 'n')
1483 pp = env->cursor++;
1484 c = '\n';
1486 else if (env->type < SRE || !(env->flags & REG_SHELL_ESCAPED))
1488 env->token.len = 1;
1489 w = magic(env, *env->cursor, 2);
1490 if (env->token.len > 1 || w != T_BAD)
1492 if (env->token.len == 1 && (f = classfun(w)))
1494 if (inrange > 1)
1496 if (env->type < SRE && !(env->flags & REG_LENIENT))
1497 goto erange;
1498 inrange = 0;
1500 env->cursor++;
1501 ce->fun = f;
1502 ce->typ = COLL_call;
1503 ce++;
1504 continue;
1506 if (env->token.len > 1 || w >= 0 && w < T_META)
1508 c = w;
1509 w = mbconv(mbc, c);
1510 pp = (unsigned char*)mbc;
1511 env->cursor += env->token.len;
1517 else if (c == ']')
1519 if (env->cursor == begin)
1521 rp = pp;
1522 rw = w;
1523 inrange = 1;
1524 continue;
1526 if (inrange != 0)
1528 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1529 if (inrange == 2)
1530 ce = col(ce, ic, NiL, 1, '-', NiL, 0, 0);
1532 break;
1534 else if (c == '-')
1536 if (!inrange && env->cursor != begin && *env->cursor != ']')
1538 if (env->type < SRE && !(env->flags & REG_LENIENT))
1539 goto erange;
1540 continue;
1542 else if (inrange == 1)
1544 inrange = 2;
1545 continue;
1548 else if (c == '[')
1550 switch (*env->cursor)
1552 case 0:
1553 goto error;
1554 case ':':
1555 if (env->regexp)
1556 goto complicated_normal;
1557 if (inrange == 1)
1558 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1559 if (!(f = regclass((char*)env->cursor, (char**)&env->cursor)))
1561 if (env->cursor == start && (c = *(env->cursor + 1)) && *(env->cursor + 2) == ':' && *(env->cursor + 3) == ']' && *(env->cursor + 4) == ']')
1563 switch (c)
1565 case '<':
1566 i = REX_WBEG;
1567 break;
1568 case '>':
1569 i = REX_WEND;
1570 break;
1571 default:
1572 i = 0;
1573 break;
1575 if (i)
1577 env->cursor += 5;
1578 drop(env->disc, e);
1579 return node(env, i, 0, 0, 0);
1582 env->error = REG_ECTYPE;
1583 goto error;
1585 ce->fun = f;
1586 ce->typ = COLL_call;
1587 ce++;
1588 inrange = 0;
1589 continue;
1590 case '=':
1591 if (env->regexp)
1592 goto complicated_normal;
1593 if (inrange == 2)
1594 goto erange;
1595 if (inrange == 1)
1596 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1597 pp = (unsigned char*)cb[inrange];
1598 rp = env->cursor + 1;
1599 if ((rw = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0)
1600 goto ecollate;
1601 wp = (char*)pp;
1602 wc = mbchar(wp);
1603 c = 0;
1604 if (ic)
1606 if (iswupper(wc))
1608 wc = towlower(wc);
1609 rw = mbconv((char*)pp, wc);
1610 c = 'u';
1612 else if (iswlower(wc))
1613 c = 'l';
1615 for (;;)
1617 mbxfrm(key.key, (char*)pp, COLL_KEY_MAX);
1618 if (!(cc = (Cchr_t*)dtsearch(dt, &key)) && !(cc = (Cchr_t*)dtprev(dt, &key)))
1619 goto ecollate;
1620 xc = (tc = (Cchr_t*)dtprev(dt, cc)) && !strcasecmp((char*)tc->nam, (char*)cc->nam) ? tc : cc;
1621 if (c == 'l' || c == 'L' && !(c = 0))
1622 ce->typ = COLL_range_lc;
1623 else if (c == 'u' || c == 'U' && !(c = 0))
1624 ce->typ = COLL_range_uc;
1625 else
1626 ce->typ = COLL_range;
1627 strcpy((char*)ce->beg, (char*)xc->key);
1628 if (!(cc = (Cchr_t*)dtnext(dt, cc)))
1629 goto ecollate;
1630 if (!strcasecmp((char*)xc->nam, (char*)cc->nam) && (tc = (Cchr_t*)dtnext(dt, cc)))
1631 cc = tc;
1632 strcpy((char*)ce->end, (char*)cc->key);
1633 ce->max = -1;
1634 ce++;
1635 if (!c)
1636 break;
1637 if (c == 'u')
1639 wc = towlower(wc);
1640 c = 'L';
1642 else
1644 wc = towupper(wc);
1645 c = 'U';
1647 rw = mbconv((char*)pp, wc);
1649 inrange = 0;
1650 c = *pp;
1651 continue;
1652 case '.':
1653 if (env->regexp)
1654 goto complicated_normal;
1655 pp = (unsigned char*)cb[inrange];
1656 if ((w = regcollate((char*)env->cursor, (char**)&env->cursor, (char*)pp, COLL_KEY_MAX)) < 0)
1657 goto ecollate;
1658 c = buf[0];
1659 break;
1660 default:
1661 complicated_normal:
1662 if (*env->cursor == env->terminator || *env->cursor == env->delimiter && (env->flags & REG_ESCAPE))
1663 goto error;
1664 break;
1667 if (inrange == 2)
1669 ce = col(ce, ic, rp, rw, rc, pp, w, c);
1670 if (strcmp((char*)ce->beg, (char*)ce->end) > 0)
1672 if (env->type < SRE && !(env->flags & REG_LENIENT))
1673 goto erange;
1674 (ce-1)->typ = COLL_char;
1675 strcpy((char*)ce->beg, (char*)(ce-1)->end);
1676 ce->typ = COLL_char;
1677 ce++;
1679 inrange = env->type >= SRE || (env->flags & REG_LENIENT);
1681 else if (inrange == 1)
1682 ce = col(ce, ic, rp, rw, rc, NiL, 0, 0);
1683 else
1684 inrange = 1;
1685 rp = pp;
1686 rw = w;
1687 rc = c;
1689 ce->typ = COLL_end;
1690 return e;
1693 #endif
1694 if (collate)
1695 goto ecollate;
1696 if (env->flags & REG_ICASE)
1697 for (i = 0; i <= UCHAR_MAX; i++)
1698 if (settst(e->re.charclass, i))
1700 if (isupper(i))
1701 c = tolower(i);
1702 else if (islower(i))
1703 c = toupper(i);
1704 else
1705 continue;
1706 setadd(e->re.charclass, c);
1708 if (neg)
1710 for (i = 0; i < elementsof(e->re.charclass->bits); i++)
1711 e->re.charclass->bits[i] ^= ~0;
1712 if (env->explicit >= 0)
1713 setclr(e->re.charclass, env->explicit);
1715 return e;
1716 ecollate:
1717 env->error = REG_ECOLLATE;
1718 goto error;
1719 erange:
1720 env->error = REG_ERANGE;
1721 error:
1722 drop(env->disc, e);
1723 if (!env->error)
1724 env->error = REG_EBRACK;
1725 return 0;
1728 static Rex_t*
1729 ccl(Cenv_t* env, int type)
1731 int i;
1732 Rex_t* e;
1733 Celt_t* ce;
1734 regclass_t f;
1736 if (!(f = classfun(type)))
1738 env->error = REG_BADESC;
1739 return 0;
1741 if (!mbcoll())
1743 if (!(e = node(env, REX_CLASS, 1, 1, sizeof(Set_t))))
1744 return 0;
1745 for (i = 0; i <= UCHAR_MAX; i++)
1746 if ((*f)(i))
1747 setadd(e->re.charclass, i);
1748 if (env->explicit >= 0)
1749 setclr(e->re.charclass, env->explicit);
1751 else
1753 if (!(e = node(env, REX_COLL_CLASS, 1, 1, 2 * sizeof(Celt_t))))
1754 return 0;
1755 ce = (Celt_t*)e->re.data;
1756 e->re.collate.invert = 0;
1757 e->re.collate.elements = ce;
1758 ce->fun = f;
1759 ce->typ = COLL_call;
1760 ce++;
1761 ce->typ = COLL_end;
1763 return e;
1766 static Rex_t*
1767 rep(Cenv_t* env, Rex_t* e, int number, int last)
1769 Rex_t* f;
1770 unsigned long m = 0;
1771 unsigned long n = RE_DUP_INF;
1772 int minimal = -1;
1774 if (!e)
1775 return 0;
1776 switch (token(env))
1778 case T_BANG:
1779 eat(env);
1780 if (!(f = node(env, REX_NEG, m, n, 0)))
1782 drop(env->disc, e);
1783 return 0;
1785 f->re.group.expr.rex = e;
1786 return f;
1787 case T_QUES:
1788 eat(env);
1789 n = 1;
1790 break;
1791 case T_STAR:
1792 eat(env);
1793 break;
1794 case T_PLUS:
1795 eat(env);
1796 m = 1;
1797 break;
1798 case T_LEFT:
1799 eat(env);
1800 m = env->token.min;
1801 n = env->token.max;
1802 break;
1803 default:
1804 return e;
1806 if (env->token.att)
1807 minimal = 1;
1808 else if (env->type < SRE)
1809 switch (token(env))
1811 case T_QUES:
1812 eat(env);
1813 minimal = !(env->flags & REG_MINIMAL);
1814 break;
1815 case T_STAR: /*AST*/
1816 eat(env);
1817 minimal = !!(env->flags & REG_MINIMAL);
1818 break;
1820 switch (e->type)
1822 case REX_DOT:
1823 case REX_CLASS:
1824 case REX_COLL_CLASS:
1825 case REX_ONECHAR:
1826 e->lo = m;
1827 e->hi = n;
1828 if (minimal >= 0)
1829 mark(e, minimal);
1830 return e;
1831 #if HUH_2002_08_07
1832 case REX_BEG:
1833 #endif
1834 case REX_BEG_STR:
1835 case REX_END_STR:
1836 case REX_FIN_STR:
1837 case REX_WBEG:
1838 case REX_WEND:
1839 case REX_WORD:
1840 case REX_WORD_NOT:
1841 env->error = REG_BADRPT;
1842 drop(env->disc, e);
1843 return 0;
1845 if (m == 1 && n == 1)
1847 if (minimal >= 0)
1848 mark(e, minimal);
1849 return e;
1851 if (!(f = node(env, REX_REP, m, n, 0)))
1853 drop(env->disc, e);
1854 return 0;
1856 f->re.group.expr.rex = e;
1857 f->re.group.number = number;
1858 f->re.group.last = last;
1859 if (minimal >= 0)
1860 mark(f, minimal);
1861 if (m <= n && n)
1863 for (; e && e->type >= REX_GROUP && e->type <= REX_GROUP_CUT; e = e->re.group.expr.rex);
1864 if (e && e->type == REX_NEG)
1865 f->type = REX_GROUP;
1867 return f;
1870 static int
1871 isstring(Rex_t* e)
1873 switch (e->type)
1875 case REX_ONECHAR:
1876 return e->lo == 1 && e->hi == 1;
1877 case REX_STRING:
1878 return 1;
1880 return 0;
1883 static Trie_node_t*
1884 trienode(Cenv_t* env, int c)
1886 Trie_node_t* t;
1888 if (t = (Trie_node_t*)alloc(env->disc, 0, sizeof(Trie_node_t)))
1890 memset(t, 0, sizeof(Trie_node_t));
1891 t->c = c;
1893 return t;
1896 static int
1897 insert(Cenv_t* env, Rex_t* f, Rex_t* g)
1899 unsigned char* s;
1900 unsigned char* e;
1901 Trie_node_t* t;
1902 int len;
1903 unsigned char tmp[2];
1905 switch (f->type)
1907 case REX_ONECHAR:
1908 *(s = tmp) = f->re.onechar;
1909 e = s + 1;
1910 break;
1911 case REX_STRING:
1912 s = f->re.string.base;
1913 e = s + f->re.string.size;
1914 break;
1915 default:
1916 return 1;
1918 if (!(t = g->re.trie.root[*s]) && !(t = g->re.trie.root[*s] = trienode(env, *s)))
1919 return 1;
1920 for (len = 1;;)
1922 if (t->c == *s)
1924 if (++s >= e)
1925 break;
1926 if (!t->son && !(t->son = trienode(env, *s)))
1927 return 1;
1928 t = t->son;
1929 len++;
1931 else
1933 if (!t->sib && !(t->sib = trienode(env, *s)))
1934 return 1;
1935 t = t->sib;
1938 if (g->re.trie.min > len)
1939 g->re.trie.min = len;
1940 if (g->re.trie.max < len)
1941 g->re.trie.max = len;
1942 t->end = 1;
1943 return 0;
1947 * trie() tries to combine nontrivial e and f into a REX_TRIE
1948 * unless 0 is returned, e and f are deleted as far as possible
1949 * also called by regcomb
1952 static Rex_t*
1953 trie(Cenv_t* env, Rex_t* e, Rex_t* f)
1955 Rex_t* g;
1957 if (e->next || f->next || !isstring(e) || e->flags != f->flags)
1958 return 0;
1959 if (isstring(f))
1961 if (!(g = node(env, REX_TRIE, 0, 0, (UCHAR_MAX + 1) * sizeof(Trie_node_t*))))
1962 return 0;
1963 g->re.trie.min = INT_MAX;
1964 if (insert(env, f, g))
1965 goto nospace;
1966 drop(env->disc, f);
1968 else if (f->type != REX_TRIE)
1969 return 0;
1970 else
1971 g = f;
1972 if (insert(env, e, g))
1973 goto nospace;
1974 drop(env->disc, e);
1975 return g;
1976 nospace:
1977 if (g != f)
1978 drop(env->disc, g);
1979 return 0;
1982 static Rex_t* alt(Cenv_t*, int, int);
1984 static int
1985 chr(register Cenv_t* env, int* escaped)
1987 unsigned char* p;
1988 int c;
1990 *escaped = 0;
1991 if (!(c = *env->cursor))
1992 return -1;
1993 env->cursor++;
1994 if (c == '\\')
1996 if (env->flags & REG_SHELL_ESCAPED)
1997 return c;
1998 if (!(c = *(env->cursor + 1)) || c == env->terminator)
2000 if (env->flags & REG_LENIENT)
2001 return c ? c : '\\';
2002 env->error = REG_EESCAPE;
2003 return -1;
2005 p = env->cursor;
2006 c = chresc((char*)env->cursor - 1, (char**)&env->cursor);
2007 *escaped = env->cursor - p;
2009 return c;
2013 * open the perly gates
2016 static Rex_t*
2017 grp(Cenv_t* env, int parno)
2019 Rex_t* e;
2020 Rex_t* f;
2021 int c;
2022 int i;
2023 int n;
2024 int x;
2025 int esc;
2026 int typ;
2027 int beg;
2028 unsigned char* p;
2030 beg = env->pattern == env->cursor - env->token.len;
2031 if (!(c = env->token.lex) && (c = *env->cursor))
2032 env->cursor++;
2033 env->token.len = 0;
2034 env->parnest++;
2035 typ = -1;
2036 switch (c)
2038 case '-':
2039 case '+':
2040 case 'a':
2041 case 'g':
2042 case 'i':
2043 case 'l':
2044 case 'm':
2045 case 'p':
2046 case 'r':
2047 case 's':
2048 case 'x':
2049 case 'A':
2050 case 'B':
2051 case 'E':
2052 case 'F':
2053 case 'G':
2054 case 'I':
2055 case 'K':
2056 case 'L':
2057 case 'M': /* glob(3) */
2058 case 'N': /* glob(3) */
2059 case 'O': /* glob(3) */
2060 case 'P':
2061 case 'R': /* pcre */
2062 case 'S':
2063 case 'U': /* pcre */
2064 case 'X': /* pcre */
2065 x = REX_GROUP;
2066 i = 1;
2067 env->token.push = 1;
2068 for (;;)
2070 switch (c)
2072 case ')':
2073 if (!(env->flags & REG_LITERAL))
2075 env->error = REG_BADRPT;
2076 return 0;
2078 /*FALLTHROUGH*/
2079 case 0:
2080 case T_CLOSE:
2081 x = 0;
2082 goto done;
2083 case ':':
2084 eat(env);
2085 if (token(env) == T_CLOSE)
2086 x = 0;
2087 goto done;
2088 case '-':
2089 i = 0;
2090 break;
2091 case '+':
2092 i = 1;
2093 break;
2094 case 'a':
2095 if (i)
2096 env->flags |= (REG_LEFT|REG_RIGHT);
2097 else
2098 env->flags &= ~(REG_LEFT|REG_RIGHT);
2099 break;
2100 case 'g':
2101 if (i)
2102 env->flags &= ~REG_MINIMAL;
2103 else
2104 env->flags |= REG_MINIMAL;
2105 break;
2106 case 'i':
2107 if (i)
2108 env->flags |= REG_ICASE;
2109 else
2110 env->flags &= ~REG_ICASE;
2111 break;
2112 case 'l':
2113 if (i)
2114 env->flags |= REG_LEFT;
2115 else
2116 env->flags &= ~REG_LEFT;
2117 break;
2118 case 'm':
2119 if (i)
2120 env->flags |= REG_NEWLINE;
2121 else
2122 env->flags &= ~REG_NEWLINE;
2123 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2124 break;
2125 case 'p':
2126 if (i)
2127 env->flags &= ~REG_LENIENT;
2128 else
2129 env->flags |= REG_LENIENT;
2130 break;
2131 case 'r':
2132 if (i)
2133 env->flags |= REG_RIGHT;
2134 else
2135 env->flags &= ~REG_RIGHT;
2136 break;
2137 case 's':
2138 if (i)
2139 env->flags |= REG_SPAN;
2140 else
2141 env->flags &= ~REG_SPAN;
2142 env->explicit = (env->flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE ? env->mappednewline : -1;
2143 break;
2144 case 'x':
2145 if (i)
2146 env->flags |= REG_COMMENT;
2147 else
2148 env->flags &= ~REG_COMMENT;
2149 break;
2150 case 'A':
2151 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2152 env->flags |= REG_AUGMENTED|REG_EXTENDED;
2153 typ = ARE;
2154 break;
2155 case 'B':
2156 case 'G':
2157 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2158 typ = BRE;
2159 break;
2160 case 'E':
2161 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2162 env->flags |= REG_EXTENDED;
2163 typ = ERE;
2164 break;
2165 case 'F':
2166 case 'L':
2167 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2168 env->flags |= REG_LITERAL;
2169 typ = ERE;
2170 break;
2171 case 'K':
2172 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2173 env->flags |= REG_AUGMENTED|REG_SHELL|REG_LEFT|REG_RIGHT;
2174 typ = KRE;
2175 break;
2176 case 'M':
2177 /* used by caller to disable glob(3) GLOB_BRACE */
2178 break;
2179 case 'N':
2180 /* used by caller to disable glob(3) GLOB_NOCHECK */
2181 break;
2182 case 'O':
2183 /* used by caller to disable glob(3) GLOB_STARSTAR */
2184 break;
2185 case 'P':
2186 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2187 env->flags |= REG_EXTENDED|REG_CLASS_ESCAPE;
2188 typ = ERE;
2189 break;
2190 case 'S':
2191 env->flags &= ~(REG_AUGMENTED|REG_EXTENDED|REG_LITERAL|REG_SHELL|REG_LEFT|REG_RIGHT|REG_CLASS_ESCAPE);
2192 env->flags |= REG_SHELL|REG_LEFT|REG_RIGHT;
2193 typ = SRE;
2194 break;
2195 case 'U': /* PCRE_UNGREEDY */
2196 if (i)
2197 env->flags |= REG_MINIMAL;
2198 else
2199 env->flags &= ~REG_MINIMAL;
2200 break;
2201 case 'X': /* PCRE_EXTRA */
2202 break;
2203 default:
2204 env->error = REG_BADRPT;
2205 return 0;
2207 eat(env);
2208 c = token(env);
2210 done:
2211 break;
2212 case ':':
2213 x = REX_GROUP;
2214 break;
2215 case '=':
2216 x = REX_GROUP_AHEAD;
2217 break;
2218 case '!':
2219 x = REX_GROUP_AHEAD_NOT;
2220 break;
2221 case '<':
2222 switch (token(env))
2224 case '=':
2225 x = REX_GROUP_BEHIND;
2226 break;
2227 case '!':
2228 case T_BANG:
2229 x = REX_GROUP_BEHIND_NOT;
2230 break;
2231 default:
2232 env->error = REG_BADRPT;
2233 return 0;
2235 eat(env);
2236 break;
2237 case '>':
2238 x = REX_GROUP_CUT;
2239 break;
2240 case '%':
2241 case T_PERCENT:
2242 e = node(env, REX_NEST, 0, 0, (UCHAR_MAX + 1) * sizeof(unsigned short));
2243 e->re.nest.primary = isalnum(*env->cursor) ? -1 : *env->cursor;
2244 n = 1;
2245 for (;;)
2247 switch (i = chr(env, &esc))
2249 case -1:
2250 case 0:
2251 invalid:
2252 env->cursor -= esc + 1;
2253 env->error = REG_EPAREN;
2254 return 0;
2255 case 'D':
2256 x = REX_NEST_delimiter;
2257 /*FALLTHROUGH*/
2258 delimiter:
2259 if ((i = chr(env, &esc)) < 0)
2260 goto invalid;
2261 if (e->re.nest.type[i] & ~x)
2262 goto invalid;
2263 e->re.nest.type[i] = x;
2264 continue;
2265 case 'E':
2266 x = REX_NEST_escape;
2267 goto delimiter;
2268 case 'L':
2269 x = REX_NEST_literal;
2270 goto quote;
2271 case 'O':
2272 switch (i = chr(env, &esc))
2274 case 'T':
2275 e->re.nest.type[UCHAR_MAX+1] |= REX_NEST_terminator;
2276 break;
2277 default:
2278 goto invalid;
2280 continue;
2281 case 'Q':
2282 x = REX_NEST_quote;
2283 /*FALLTHROUGH*/
2284 quote:
2285 if ((i = chr(env, &esc)) < 0)
2286 goto invalid;
2287 if (e->re.nest.type[i] & ~x)
2288 goto invalid;
2289 e->re.nest.type[i] = x|REX_NEST_open|REX_NEST_close|(i<<REX_NEST_SHIFT);
2290 continue;
2291 case 'S':
2292 x = REX_NEST_separator;
2293 goto delimiter;
2294 case 'T':
2295 x = REX_NEST_terminator;
2296 goto delimiter;
2297 case '|':
2298 case '&':
2299 if (!esc)
2300 goto invalid;
2301 goto nesting;
2302 case '(':
2303 if (!esc)
2304 n++;
2305 goto nesting;
2306 case ')':
2307 if (!esc && !--n)
2308 break;
2309 goto nesting;
2310 default:
2311 nesting:
2312 if (isalnum(i) || (e->re.nest.type[i] & (REX_NEST_close|REX_NEST_escape|REX_NEST_literal|REX_NEST_quote|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2313 goto invalid;
2314 e->re.nest.type[i] = REX_NEST_open;
2315 if ((x = chr(env, &esc)) < 0 || (e->re.nest.type[x] & (REX_NEST_close|REX_NEST_escape|REX_NEST_delimiter|REX_NEST_separator|REX_NEST_terminator)))
2316 goto invalid;
2317 if (!esc)
2319 if (x == ')' && !--n)
2320 goto invalid;
2321 else if (x == '(')
2322 n++;
2324 e->re.nest.type[x] |= REX_NEST_close;
2325 e->re.nest.type[i] |= x << REX_NEST_SHIFT;
2326 continue;
2328 break;
2330 env->parnest--;
2331 if (c == T_PERCENT)
2332 for (n = 0; n < 2; n++)
2334 parno = ++env->parno;
2335 if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2337 drop(env->disc, e);
2338 return 0;
2340 if (parno < elementsof(env->paren))
2341 env->paren[parno] = f;
2342 f->re.group.back = 0;
2343 f->re.group.number = parno;
2344 f->re.group.expr.rex = e;
2345 e = f;
2347 return e;
2348 case '(':
2349 c = 0;
2350 if (isdigit(*env->cursor))
2352 f = 0;
2355 if (c > (INT_MAX / 10))
2357 env->error = REG_BADRPT;
2358 return 0;
2360 c = c * 10 + (*env->cursor++ - '0');
2361 } while (isdigit(*env->cursor));
2362 if (*env->cursor++ != ')')
2364 env->error = REG_BADRPT;
2365 return 0;
2367 if (c && env->type >= SRE)
2368 c = c * 2 - 1;
2369 if (!c || c > env->parno || !env->paren[c])
2371 if (!(env->flags & REG_LENIENT))
2373 env->error = REG_ESUBREG;
2374 return 0;
2376 if (c)
2377 c = -1;
2380 else
2382 if (env->type < SRE && *env->cursor++ != '?')
2384 env->error = REG_BADRPT;
2385 return 0;
2387 if (!(f = grp(env, parno + 1)) && env->error)
2388 return 0;
2390 if (!(e = node(env, REX_GROUP_COND, 0, 0, 0)))
2392 drop(env->disc, f);
2393 return 0;
2395 e->re.group.size = c;
2396 e->re.group.expr.binary.left = f;
2397 if (!(e->re.group.expr.binary.right = alt(env, parno, 1)))
2399 drop(env->disc, e);
2400 return 0;
2402 if (token(env) != T_CLOSE)
2404 env->error = REG_EPAREN;
2405 return 0;
2407 eat(env);
2408 env->parnest--;
2409 return rep(env, e, parno, parno);
2410 case '{':
2411 p = env->cursor;
2412 n = 1;
2413 while (c = *env->cursor)
2415 if (c == '\\' && *(env->cursor + 1))
2416 env->cursor++;
2417 else if (c == '{')
2418 n++;
2419 else if (c == '}' && !--n)
2420 break;
2421 else if (c == env->delimiter || c == env->terminator)
2422 break;
2423 env->cursor++;
2425 if (c != '}')
2427 env->error = REG_EBRACE;
2428 return 0;
2430 if (*++env->cursor != ')')
2432 env->error = REG_EPAREN;
2433 return 0;
2435 env->cursor++;
2436 env->parnest--;
2437 if (env->disc->re_version < REG_VERSION_EXEC)
2439 env->error = REG_BADRPT;
2440 return 0;
2442 if (!env->disc->re_execf)
2443 return 0;
2444 if (!(e = node(env, REX_EXEC, 0, 0, 0)))
2445 return 0;
2446 e->re.exec.text = (const char*)p;
2447 e->re.exec.size = env->cursor - p - 2;
2448 if (!env->disc->re_compf)
2449 e->re.exec.data = 0;
2450 else
2451 e->re.exec.data = (*env->disc->re_compf)(env->regex, e->re.exec.text, e->re.exec.size, env->disc);
2452 return e;
2453 case '0': case '1': case '2': case '3': case '4':
2454 case '5': case '6': case '7': case '8': case '9':
2455 c -= '0';
2456 while (isdigit(*env->cursor))
2458 if (c > (INT_MAX / 10))
2460 env->error = REG_ESUBREG;
2461 return 0;
2463 c = c * 10 + *env->cursor++ - '0';
2465 if (*env->cursor == ')')
2467 env->cursor++;
2468 env->parnest--;
2469 env->token.len = 1;
2470 if (c > env->parno || !env->paren[c])
2472 env->error = REG_ESUBREG;
2473 return 0;
2475 env->paren[c]->re.group.back = 1;
2476 return rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2478 /*FALLTHROUGH*/
2479 default:
2480 env->error = REG_BADRPT;
2481 return 0;
2483 if (x && !(e = alt(env, parno, 0)))
2484 return 0;
2485 c = token(env);
2486 env->parnest--;
2487 if (c != T_CLOSE && (!(env->flags & REG_LITERAL) || c != ')'))
2489 env->error = REG_EPAREN;
2490 return 0;
2492 eat(env);
2493 if (typ >= 0)
2495 if (beg)
2496 env->pattern = env->cursor;
2497 env->type = typ;
2499 if (!x)
2500 return 0;
2501 if (!(f = node(env, x, 0, 0, 0)))
2503 drop(env->disc, e);
2504 return 0;
2506 f->re.group.expr.rex = e;
2507 if (x == REX_GROUP_BEHIND || x == REX_GROUP_BEHIND_NOT)
2509 if (stats(env, e))
2511 drop(env->disc, f);
2512 if (!env->error)
2513 env->error = REG_ECOUNT;
2514 return 0;
2516 f->re.group.size = env->stats.m;
2517 memset(&env->stats, 0, sizeof(env->stats));
2519 switch (x)
2521 case REX_GROUP:
2522 case REX_GROUP_CUT:
2523 f = rep(env, f, parno, env->parno);
2524 break;
2526 return f;
2529 static Rex_t*
2530 seq(Cenv_t* env)
2532 Rex_t* e;
2533 Rex_t* f;
2534 Token_t tok;
2535 int c;
2536 int i;
2537 int n;
2538 int x;
2539 int parno;
2540 int type;
2541 regflags_t flags;
2542 unsigned char* s;
2543 unsigned char* p;
2544 unsigned char* t;
2545 unsigned char* u;
2546 unsigned char buf[256];
2548 for (;;)
2550 s = buf;
2551 while ((c = token(env)) < T_META && s < &buf[sizeof(buf) - env->token.len])
2553 x = c;
2554 p = env->cursor;
2555 if (c >= 0)
2557 n = 1;
2558 *s++ = (env->flags & REG_ICASE) ? toupper(c) : c;
2560 else if (c == C_ESC || (env->flags & REG_ICASE))
2562 c = (c == C_ESC) ? env->token.lex : mbchar(p);
2563 if (env->flags & REG_ICASE)
2564 c = towupper(c);
2565 if ((&buf[sizeof(buf)] - s) < MB_CUR_MAX)
2566 break;
2567 if ((n = mbconv((char*)s, c)) < 0)
2568 *s++ = c;
2569 else if (n)
2570 s += n;
2571 else
2573 n = 1;
2574 *s++ = 0;
2577 else
2579 n = env->token.len - env->token.esc;
2580 for (t = p, u = s + n; s < u; *s++ = *t++);
2582 eat(env);
2584 if (c == T_BAD)
2585 return 0;
2586 if (s > buf)
2587 switch (c)
2589 case T_STAR:
2590 case T_PLUS:
2591 case T_LEFT:
2592 case T_QUES:
2593 case T_BANG:
2594 if ((s -= n) == buf)
2595 e = 0;
2596 else
2598 i = s - buf;
2599 if (!(e = node(env, REX_STRING, 0, 0, i)))
2600 return 0;
2601 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, i);
2602 e->re.string.size = i;
2604 if (x >= 0)
2606 if (!(f = node(env, REX_ONECHAR, 1, 1, 0)))
2608 drop(env->disc, e);
2609 return 0;
2611 f->re.onechar = (env->flags & REG_ICASE) ? toupper(x) : x;
2613 else
2615 if (!(f = node(env, REX_STRING, 0, 0, n)))
2616 return 0;
2617 memcpy((char*)(f->re.string.base = (unsigned char*)f->re.data), (char*)p, n);
2618 f->re.string.size = n;
2620 if (!(f = rep(env, f, 0, 0)) || !(f = cat(env, f, seq(env))))
2622 drop(env->disc, e);
2623 return 0;
2625 if (e)
2626 f = cat(env, e, f);
2627 return f;
2628 default:
2629 c = s - buf;
2630 if (!(e = node(env, REX_STRING, 0, 0, c)))
2631 return 0;
2632 memcpy((char*)(e->re.string.base = (unsigned char*)e->re.data), (char*)buf, c);
2633 e->re.string.size = c;
2634 return cat(env, e, seq(env));
2636 else if (c > T_BACK)
2638 eat(env);
2639 c -= T_BACK;
2640 if (c > env->parno || !env->paren[c])
2642 env->error = REG_ESUBREG;
2643 return 0;
2645 env->paren[c]->re.group.back = 1;
2646 e = rep(env, node(env, REX_BACK, c, 0, 0), 0, 0);
2648 else
2649 switch (c)
2651 case T_AND:
2652 case T_CLOSE:
2653 case T_BAR:
2654 case T_END:
2655 return node(env, REX_NULL, 0, 0, 0);
2656 case T_DOLL:
2657 eat(env);
2658 e = rep(env, node(env, REX_END, 0, 0, 0), 0, 0);
2659 break;
2660 case T_CFLX:
2661 eat(env);
2662 if ((e = node(env, REX_BEG, 0, 0, 0)) && (env->flags & REG_EXTENDED))
2663 e = rep(env, e, 0, 0);
2664 break;
2665 case T_OPEN:
2666 tok = env->token;
2667 eat(env);
2668 flags = env->flags;
2669 type = env->type;
2670 if (env->token.att)
2671 env->flags |= REG_MINIMAL;
2672 env->parnest++;
2673 if (env->type == KRE)
2674 ++env->parno;
2675 parno = ++env->parno;
2676 if (!(e = alt(env, parno + 1, 0)))
2677 break;
2678 if (e->type == REX_NULL && env->type == ERE && !(env->flags & REG_NULL))
2680 drop(env->disc, e);
2681 env->error = (*env->cursor == 0 || *env->cursor == env->delimiter || *env->cursor == env->terminator) ? REG_EPAREN : REG_ENULL;
2682 return 0;
2684 if (token(env) != T_CLOSE)
2686 drop(env->disc, e);
2687 env->error = REG_EPAREN;
2688 return 0;
2690 env->parnest--;
2691 eat(env);
2692 if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2694 drop(env->disc, e);
2695 return 0;
2697 if (parno < elementsof(env->paren))
2698 env->paren[parno] = f;
2699 f->re.group.back = 0;
2700 f->re.group.number = parno;
2701 f->re.group.expr.rex = e;
2702 if (tok.lex)
2704 tok.push = 1;
2705 env->token = tok;
2707 if (!(e = rep(env, f, parno, env->parno)))
2708 return 0;
2709 if (env->type == KRE)
2711 if (!(f = node(env, REX_GROUP, 0, 0, 0)))
2713 drop(env->disc, e);
2714 return 0;
2716 if (--parno < elementsof(env->paren))
2717 env->paren[parno] = f;
2718 f->re.group.back = 0;
2719 f->re.group.number = parno;
2720 f->re.group.expr.rex = e;
2721 e = f;
2723 env->flags = flags;
2724 env->type = type;
2725 break;
2726 case T_GROUP:
2727 p = env->cursor;
2728 eat(env);
2729 flags = env->flags;
2730 type = env->type;
2731 if (!(e = grp(env, env->parno + 1)))
2733 if (env->error)
2734 return 0;
2735 if (env->literal == env->pattern && env->literal == p)
2736 env->literal = env->cursor;
2737 continue;
2739 env->flags = flags;
2740 env->type = type;
2741 break;
2742 case T_BRA:
2743 eat(env);
2744 if (e = bra(env))
2745 e = rep(env, e, 0, 0);
2746 break;
2747 case T_ALNUM:
2748 case T_ALNUM_NOT:
2749 case T_DIGIT:
2750 case T_DIGIT_NOT:
2751 case T_SPACE:
2752 case T_SPACE_NOT:
2753 eat(env);
2754 if (e = ccl(env, c))
2755 e = rep(env, e, 0, 0);
2756 break;
2757 case T_LT:
2758 eat(env);
2759 e = rep(env, node(env, REX_WBEG, 0, 0, 0), 0, 0);
2760 break;
2761 case T_GT:
2762 eat(env);
2763 e = rep(env, node(env, REX_WEND, 0, 0, 0), 0, 0);
2764 break;
2765 case T_DOT:
2766 eat(env);
2767 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2768 break;
2769 case T_DOTSTAR:
2770 eat(env);
2771 env->token.lex = T_STAR;
2772 env->token.push = 1;
2773 e = rep(env, node(env, REX_DOT, 1, 1, 0), 0, 0);
2774 break;
2775 case T_SLASHPLUS:
2776 eat(env);
2777 env->token.lex = T_PLUS;
2778 env->token.push = 1;
2779 if (e = node(env, REX_ONECHAR, 1, 1, 0))
2781 e->re.onechar = '/';
2782 e = rep(env, e, 0, 0);
2784 break;
2785 case T_WORD:
2786 eat(env);
2787 e = rep(env, node(env, REX_WORD, 0, 0, 0), 0, 0);
2788 break;
2789 case T_WORD_NOT:
2790 eat(env);
2791 e = rep(env, node(env, REX_WORD_NOT, 0, 0, 0), 0, 0);
2792 break;
2793 case T_BEG_STR:
2794 eat(env);
2795 e = rep(env, node(env, REX_BEG_STR, 0, 0, 0), 0, 0);
2796 break;
2797 case T_END_STR:
2798 eat(env);
2799 e = rep(env, node(env, REX_END_STR, 0, 0, 0), 0, 0);
2800 break;
2801 case T_FIN_STR:
2802 eat(env);
2803 e = rep(env, node(env, REX_FIN_STR, 0, 0, 0), 0, 0);
2804 break;
2805 default:
2806 env->error = REG_BADRPT;
2807 return 0;
2809 if (e && *env->cursor != 0 && *env->cursor != env->delimiter && *env->cursor != env->terminator)
2810 e = cat(env, e, seq(env));
2811 return e;
2815 static Rex_t*
2816 con(Cenv_t* env)
2818 Rex_t* e;
2819 Rex_t* f;
2820 Rex_t* g;
2822 if (!(e = seq(env)) || !(env->flags & REG_AUGMENTED) || token(env) != T_AND)
2823 return e;
2824 eat(env);
2825 if (!(f = con(env)))
2827 drop(env->disc, e);
2828 return 0;
2830 if (!(g = node(env, REX_CONJ, 0, 0, 0)))
2832 drop(env->disc, e);
2833 drop(env->disc, f);
2834 return 0;
2836 g->re.group.expr.binary.left = e;
2837 g->re.group.expr.binary.right = f;
2838 return g;
2841 static Rex_t*
2842 alt(Cenv_t* env, int number, int cond)
2844 Rex_t* e;
2845 Rex_t* f;
2846 Rex_t* g;
2848 if (!(e = con(env)))
2849 return 0;
2850 else if (token(env) != T_BAR)
2852 if (!cond)
2853 return e;
2854 f = 0;
2855 if (e->type == REX_NULL)
2856 goto bad;
2858 else
2860 eat(env);
2861 if (!(f = alt(env, number, 0)))
2863 drop(env->disc, e);
2864 return 0;
2866 if ((e->type == REX_NULL || f->type == REX_NULL) && !(env->flags & REG_NULL))
2867 goto bad;
2868 if (!cond && (g = trie(env, e, f)))
2869 return g;
2871 if (!(g = node(env, REX_ALT, 0, 0, 0)))
2873 env->error = REG_ESPACE;
2874 goto bad;
2876 g->re.group.number = number;
2877 g->re.group.last = env->parno;
2878 g->re.group.expr.binary.left = e;
2879 g->re.group.expr.binary.right = f;
2880 return g;
2881 bad:
2882 drop(env->disc, e);
2883 drop(env->disc, f);
2884 if (!env->error)
2885 env->error = REG_ENULL;
2886 return 0;
2890 * add v to REX_BM tables
2893 static void
2894 bmstr(Cenv_t* env, register Rex_t* a, unsigned char* v, int n, Bm_mask_t b)
2896 int c;
2897 int m;
2898 size_t z;
2900 for (m = 0; m < n; m++)
2902 if (!(z = n - m - 1))
2903 z = HIT;
2904 c = v[m];
2905 a->re.bm.mask[m][c] |= b;
2906 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2907 a->re.bm.skip[c] = z;
2908 if (a->flags & REG_ICASE)
2910 if (isupper(c))
2911 c = tolower(c);
2912 else if (islower(c))
2913 c = toupper(c);
2914 else
2915 continue;
2916 a->re.bm.mask[m][c] |= b;
2917 if (z == HIT || !a->re.bm.skip[c] || a->re.bm.skip[c] > z && a->re.bm.skip[c] < HIT)
2918 a->re.bm.skip[c] = z;
2924 * set up BM table from trie
2927 static int
2928 bmtrie(Cenv_t* env, Rex_t* a, unsigned char* v, Trie_node_t* x, int n, int m, Bm_mask_t b)
2932 v[m] = x->c;
2933 if (m >= (n - 1))
2935 bmstr(env, a, v, n, b);
2936 if (!(b <<= 1))
2938 b = 1;
2939 a->re.bm.complete = 0;
2941 else if (x->son)
2942 a->re.bm.complete = 0;
2944 else if (x->son)
2945 b = bmtrie(env, a, v, x->son, n, m + 1, b);
2946 } while (x = x->sib);
2947 return b;
2951 * rewrite the expression tree for some special cases
2952 * 1. it is a null expression - illegal
2953 * 2. max length fixed string found -- use BM algorithm
2954 * 3. it begins with an unanchored string - use KMP algorithm
2955 * 0 returned on success
2958 static int
2959 special(Cenv_t* env, regex_t* p)
2961 Rex_t* a;
2962 Rex_t* e;
2963 Rex_t* t;
2964 Rex_t* x;
2965 Rex_t* y;
2966 unsigned char* s;
2967 int* f;
2968 int n;
2969 int m;
2970 int k;
2972 DEBUG_INIT();
2973 if (e = p->env->rex)
2975 if ((x = env->stats.x) && x->re.string.size < 3)
2976 x = 0;
2977 if ((t = env->stats.y) && t->re.trie.min < 3)
2978 t = 0;
2979 if (x && t)
2981 if (x->re.string.size >= t->re.trie.min)
2982 t = 0;
2983 else
2984 x = 0;
2986 if (x || t)
2988 Bm_mask_t** mask;
2989 Bm_mask_t* h;
2990 unsigned char* v;
2991 size_t* q;
2992 unsigned long l;
2993 int i;
2994 int j;
2996 if (x)
2998 y = x;
2999 n = m = x->re.string.size;
3000 l = env->stats.l;
3002 else
3004 y = t;
3005 n = t->re.trie.min;
3006 m = t->re.trie.max;
3007 l = env->stats.k;
3009 if (!(q = (size_t*)alloc(env->disc, 0, (n + 1) * sizeof(size_t))))
3010 return 1;
3011 if (!(a = node(env, REX_BM, 0, 0, n * (sizeof(Bm_mask_t*) + (UCHAR_MAX + 1) * sizeof(Bm_mask_t)) + (UCHAR_MAX + n + 2) * sizeof(size_t))))
3013 alloc(env->disc, q, 0);
3014 return 1;
3016 a->flags = y->flags;
3017 a->map = y->map;
3018 a->re.bm.size = n;
3019 a->re.bm.back = (y == e || y == e->re.group.expr.rex) ? (m - n) : -1;
3020 a->re.bm.left = l - 1;
3021 a->re.bm.right = env->stats.m - l - n;
3022 a->re.bm.complete = (env->stats.e || y != e && (e->type != REX_GROUP || y != e->re.group.expr.rex) || e->next || ((a->re.bm.left + a->re.bm.right) >= 0)) ? 0 : n;
3023 h = (Bm_mask_t*)&a->re.bm.mask[n];
3024 a->re.bm.skip = (size_t*)(h + n * (UCHAR_MAX + 1));
3025 a->re.bm.fail = &a->re.bm.skip[UCHAR_MAX + 1];
3026 for (m = 0; m <= UCHAR_MAX; m++)
3027 a->re.bm.skip[m] = n;
3028 a->re.bm.skip[0] = a->re.bm.skip[env->mappednewline] = (y->next && y->next->type == REX_END) ? HIT : (n + a->re.bm.left);
3029 for (i = 1; i <= n; i++)
3030 a->re.bm.fail[i] = 2 * n - i;
3031 mask = a->re.bm.mask;
3032 for (m = 0; m < n; m++)
3034 mask[m] = h;
3035 h += UCHAR_MAX + 1;
3037 if (x)
3038 bmstr(env, a, x->re.string.base, n, 1);
3039 else
3041 v = (unsigned char*)q;
3042 memset(v, 0, n);
3043 m = 1;
3044 for (i = 0; i <= UCHAR_MAX; i++)
3045 if (t->re.trie.root[i])
3046 m = bmtrie(env, a, v, t->re.trie.root[i], n, 0, m);
3048 mask--;
3049 memset(q, 0, n * sizeof(*q));
3050 for (k = (j = n) + 1; j > 0; j--, k--)
3052 DEBUG_TEST(0x0010,(sfprintf(sfstderr, "BM#0: k=%d j=%d\n", k, j)),(0));
3053 for (q[j] = k; k <= n; k = q[k])
3055 for (m = 0; m <= UCHAR_MAX; m++)
3056 if (mask[k][m] == mask[j][m])
3058 DEBUG_TEST(0x0010,sfprintf(sfstderr, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k, m, j, m), (0));
3059 goto cut;
3061 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#2: fail[%d]=%d => %d\n", k, a->re.bm.fail[k], (a->re.bm.fail[k] > n - j) ? (n - j) : a->re.bm.fail[k]), (0));
3062 if (a->re.bm.fail[k] > n - j)
3063 a->re.bm.fail[k] = n - j;
3065 cut: ;
3067 for (i = 1; i <= n; i++)
3068 if (a->re.bm.fail[i] > n + k - i)
3070 DEBUG_TEST(0x0010,sfprintf(sfstderr, "BM#4: fail[%d]=%d => %d\n", i, a->re.bm.fail[i], n + k - i), (0));
3071 a->re.bm.fail[i] = n + k - i;
3073 #if _AST_REGEX_DEBUG
3074 if (DEBUG_TEST(0x0020,(1),(0)))
3076 sfprintf(sfstderr, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a->re.bm.complete, n, k, a->re.bm.left, a->re.bm.right, y->type, y->next ? y->next->type : 0, e->type, e->next ? e->next->type : 0);
3077 for (m = 0; m < n; m++)
3078 for (i = 1; i <= UCHAR_MAX; i++)
3079 if (a->re.bm.mask[m][i])
3080 sfprintf(sfstderr, "MASK: [%d]['%c'] = %032..2u\n", m, i, a->re.bm.mask[m][i]);
3081 for (i = ' '; i <= UCHAR_MAX; i++)
3082 if (a->re.bm.skip[i] >= HIT)
3083 sfprintf(sfstderr, "SKIP: ['%c'] = *\n", i);
3084 else if (a->re.bm.skip[i] > 0 && a->re.bm.skip[i] < n)
3085 sfprintf(sfstderr, "SKIP: ['%c'] = %3d\n", i, a->re.bm.skip[i]);
3086 for (j = 31; j >= 0; j--)
3088 sfprintf(sfstderr, " ");
3089 next:
3090 for (m = 0; m < n; m++)
3092 for (i = 0040; i < 0177; i++)
3093 if (a->re.bm.mask[m][i] & (1 << j))
3095 sfprintf(sfstderr, " %c", i);
3096 break;
3098 if (i >= 0177)
3100 if (j > 0)
3102 j--;
3103 goto next;
3105 sfprintf(sfstderr, " ?");
3108 sfprintf(sfstderr, "\n");
3110 sfprintf(sfstderr, "FAIL: ");
3111 for (m = 1; m <= n; m++)
3112 sfprintf(sfstderr, "%3d", a->re.bm.fail[m]);
3113 sfprintf(sfstderr, "\n");
3115 #endif
3116 alloc(env->disc, q, 0);
3117 a->next = e;
3118 p->env->rex = a;
3119 return 0;
3121 switch (e->type)
3123 case REX_BEG:
3124 if (env->flags & REG_NEWLINE)
3125 return 0;
3126 break;
3127 case REX_GROUP:
3128 if (env->stats.b)
3129 return 0;
3130 e = e->re.group.expr.rex;
3131 if (e->type != REX_DOT)
3132 return 0;
3133 /*FALLTHROUGH*/
3134 case REX_DOT:
3135 if (e->lo == 0 && e->hi == RE_DUP_INF)
3136 break;
3137 return 0;
3138 case REX_NULL:
3139 if (env->flags & REG_NULL)
3140 break;
3141 env->error = REG_ENULL;
3142 return 1;
3143 case REX_STRING:
3144 if ((env->flags & (REG_LEFT|REG_LITERAL|REG_RIGHT)) || e->map)
3145 return 0;
3146 s = e->re.string.base;
3147 n = e->re.string.size;
3148 if (!(a = node(env, REX_KMP, 0, 0, n * (sizeof(int*) + 1))))
3149 return 1;
3150 a->flags = e->flags;
3151 a->map = e->map;
3152 f = a->re.string.fail;
3153 memcpy((char*)(a->re.string.base = (unsigned char*)&f[n]), (char*)s, n);
3154 s = a->re.string.base;
3155 a->re.string.size = n;
3156 f[0] = m = -1;
3157 for (k = 1; k < n; k++)
3159 while (m >= 0 && s[m+1] != s[k])
3160 m = f[m];
3161 if (s[m+1] == s[k])
3162 m++;
3163 f[k] = m;
3165 a->next = e->next;
3166 p->env->rex = a;
3167 e->next = 0;
3168 drop(env->disc, e);
3169 break;
3170 default:
3171 return 0;
3174 p->env->once = 1;
3175 return 0;
3179 regcomp(regex_t* p, const char* pattern, regflags_t flags)
3181 Rex_t* e;
3182 Rex_t* f;
3183 regdisc_t* disc;
3184 unsigned char* fold;
3185 int i;
3186 Cenv_t env;
3188 if (!p)
3189 return REG_BADPAT;
3190 if (flags & REG_DISCIPLINE)
3192 flags &= ~REG_DISCIPLINE;
3193 disc = p->re_disc;
3195 else
3196 disc = &state.disc;
3197 if (!disc->re_errorlevel)
3198 disc->re_errorlevel = 2;
3199 p->env = 0;
3200 if (!pattern)
3201 return fatal(disc, REG_BADPAT, pattern);
3202 if (!state.initialized)
3204 state.initialized = 1;
3205 for (i = 0; i < elementsof(state.escape); i++)
3206 state.magic[state.escape[i].key] = state.escape[i].val;
3208 if (!(fold = (unsigned char*)LCINFO(AST_LC_CTYPE)->data))
3210 if (!(fold = newof(0, unsigned char, UCHAR_MAX, 1)))
3211 return fatal(disc, REG_ESPACE, pattern);
3212 for (i = 0; i <= UCHAR_MAX; i++)
3213 fold[i] = toupper(i);
3214 LCINFO(AST_LC_CTYPE)->data = (void*)fold;
3216 again:
3217 if (!(p->env = (Env_t*)alloc(disc, 0, sizeof(Env_t))))
3218 return fatal(disc, REG_ESPACE, pattern);
3219 memset(p->env, 0, sizeof(*p->env));
3220 memset(&env, 0, sizeof(env));
3221 env.regex = p;
3222 env.flags = flags;
3223 env.disc = p->env->disc = disc;
3224 if (env.flags & REG_AUGMENTED)
3225 env.flags |= REG_EXTENDED;
3226 env.mappeddot = '.';
3227 env.mappednewline = '\n';
3228 env.mappedslash = '/';
3229 if (disc->re_version >= REG_VERSION_MAP && disc->re_map)
3231 env.map = disc->re_map;
3232 env.MAP = p->env->fold;
3233 for (i = 0; i <= UCHAR_MAX; i++)
3235 env.MAP[i] = fold[env.map[i]];
3236 if (env.map[i] == '.')
3237 env.mappeddot = i;
3238 if (env.map[i] == '\n')
3239 env.mappednewline = i;
3240 if (env.map[i] == '/')
3241 env.mappedslash = i;
3244 else
3245 env.MAP = fold;
3246 env.type = (env.flags & REG_AUGMENTED) ? ARE : (env.flags & REG_EXTENDED) ? ERE : BRE;
3247 env.explicit = -1;
3248 if (env.flags & REG_SHELL)
3250 if (env.flags & REG_SHELL_PATH)
3251 env.explicit = env.mappedslash;
3252 env.flags |= REG_LENIENT|REG_NULL;
3253 env.type = env.type == BRE ? SRE : KRE;
3255 if ((env.flags & (REG_NEWLINE|REG_SPAN)) == REG_NEWLINE)
3256 env.explicit = env.mappednewline;
3257 p->env->leading = (env.flags & REG_SHELL_DOT) ? env.mappeddot : -1;
3258 env.posixkludge = !(env.flags & (REG_EXTENDED|REG_SHELL));
3259 env.regexp = !!(env.flags & REG_REGEXP);
3260 env.token.lex = 0;
3261 env.token.push = 0;
3262 if (env.flags & REG_DELIMITED)
3264 switch (env.delimiter = *pattern++)
3266 case 0:
3267 case '\\':
3268 case '\n':
3269 case '\r':
3270 env.error = REG_EDELIM;
3271 goto bad;
3273 env.terminator = '\n';
3275 env.literal = env.pattern = env.cursor = (unsigned char*)pattern;
3276 if (!(p->env->rex = alt(&env, 1, 0)))
3277 goto bad;
3278 if (env.parnest)
3280 env.error = REG_EPAREN;
3281 goto bad;
3283 p->env->stats.re_flags = env.flags & (REG_EXTENDED|REG_AUGMENTED|REG_SHELL);
3284 if (env.flags & REG_LEFT)
3286 if (p->env->rex->type != REX_BEG)
3288 if (p->env->rex->type == REX_ALT)
3289 env.flags &= ~REG_FIRST;
3290 if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3292 regfree(p);
3293 return fatal(disc, REG_ESPACE, pattern);
3295 e->next = p->env->rex;
3296 p->env->rex = e;
3297 p->env->once = 1;
3299 p->env->stats.re_flags |= REG_LEFT;
3301 for (e = p->env->rex; e->next; e = e->next);
3302 p->env->done.type = REX_DONE;
3303 p->env->done.flags = e->flags;
3304 if (env.flags & REG_RIGHT)
3306 if (e->type != REX_END)
3308 if (p->env->rex->type == REX_ALT)
3309 env.flags &= ~REG_FIRST;
3310 if (!(f = node(&env, REX_END, 0, 0, 0)))
3312 regfree(p);
3313 return fatal(disc, REG_ESPACE, pattern);
3315 f->flags = e->flags;
3316 f->map = e->map;
3317 e->next = f;
3319 p->env->stats.re_flags |= REG_RIGHT;
3321 if (stats(&env, p->env->rex))
3323 if (!env.error)
3324 env.error = REG_ECOUNT;
3325 goto bad;
3327 if (env.stats.b)
3328 p->env->hard = p->env->separate = 1;
3329 else if (!(env.flags & REG_FIRST) && (env.stats.a || env.stats.c > 1 && env.stats.c != env.stats.s || env.stats.t && (env.stats.t > 1 || env.stats.a || env.stats.c)))
3330 p->env->hard = 1;
3331 if (p->env->hard || env.stats.c || env.stats.i)
3332 p->env->stats.re_min = p->env->stats.re_max = -1;
3333 else
3335 if (!(p->env->stats.re_min = env.stats.m))
3336 p->env->stats.re_min = -1;
3337 if (!(p->env->stats.re_max = env.stats.n))
3338 p->env->stats.re_max = -1;
3340 if (special(&env, p))
3341 goto bad;
3342 serialize(&env, p->env->rex, 1);
3343 p->re_nsub = env.stats.p;
3344 if (env.type == KRE)
3345 p->re_nsub /= 2;
3346 if (env.flags & REG_DELIMITED)
3348 p->re_npat = env.cursor - env.pattern + 1;
3349 if (*env.cursor == env.delimiter)
3350 p->re_npat++;
3351 else if (env.flags & REG_MUSTDELIM)
3353 env.error = REG_EDELIM;
3354 goto bad;
3356 else
3357 env.flags &= ~REG_DELIMITED;
3359 p->env->explicit = env.explicit;
3360 p->env->flags = env.flags & REG_COMP;
3361 p->env->min = env.stats.m;
3362 p->env->nsub = env.stats.p + env.stats.u;
3363 p->env->refs = 1;
3364 return 0;
3365 bad:
3366 regfree(p);
3367 if (!env.error)
3368 env.error = REG_ESPACE;
3369 if (env.type >= SRE && env.error != REG_ESPACE && !(flags & REG_LITERAL))
3371 flags |= REG_LITERAL;
3372 pattern = (const char*)env.literal;
3373 goto again;
3375 return fatal(disc, env.error, pattern);
3379 * regcomp() on sized pattern
3380 * the lazy way around adding and checking env.end
3384 regncomp(regex_t* p, const char* pattern, size_t size, regflags_t flags)
3386 char* s;
3387 int r;
3389 if (!(s = malloc(size + 1)))
3390 return fatal((flags & REG_DISCIPLINE) ? p->re_disc : &state.disc, REG_ESPACE, pattern);
3391 memcpy(s, pattern, size);
3392 s[size] = 0;
3393 r = regcomp(p, s, flags);
3394 free(s);
3395 return r;
3399 * combine two compiled regular expressions if possible,
3400 * replacing first with the combination and freeing second.
3401 * return 0 on success.
3402 * the only combinations handled are building a Trie
3403 * from String|Kmp|Trie and String|Kmp
3407 regcomb(regex_t* p, regex_t* q)
3409 Rex_t* e = p->env->rex;
3410 Rex_t* f = q->env->rex;
3411 Rex_t* g;
3412 Rex_t* h;
3413 Cenv_t env;
3415 if (!e || !f)
3416 return fatal(p->env->disc, REG_BADPAT, NiL);
3417 if (p->env->separate || q->env->separate)
3418 return REG_ESUBREG;
3419 memset(&env, 0, sizeof(env));
3420 env.disc = p->env->disc;
3421 if (e->type == REX_BM)
3423 p->env->rex = e->next;
3424 e->next = 0;
3425 drop(env.disc, e);
3426 e = p->env->rex;
3428 if (f->type == REX_BM)
3430 q->env->rex = f->next;
3431 f->next = 0;
3432 drop(env.disc, f);
3433 f = q->env->rex;
3435 if (e->type == REX_BEG && f->type == REX_BEG)
3437 p->env->flags |= REG_LEFT;
3438 p->env->rex = e->next;
3439 e->next = 0;
3440 drop(env.disc, e);
3441 e = p->env->rex;
3442 q->env->rex = f->next;
3443 f->next = 0;
3444 drop(env.disc, f);
3445 f = q->env->rex;
3447 for (g = e; g->next; g = g->next);
3448 for (h = f; h->next; h = h->next);
3449 if (g->next && g->next->type == REX_END && h->next && h->next->type == REX_END)
3451 p->env->flags |= REG_RIGHT;
3452 drop(env.disc, g->next);
3453 g->next = 0;
3454 drop(env.disc, h->next);
3455 h->next = 0;
3457 if (!(g = trie(&env, f, e)))
3458 return fatal(p->env->disc, REG_BADPAT, NiL);
3459 p->env->rex = g;
3460 if (!q->env->once)
3461 p->env->once = 0;
3462 q->env->rex = 0;
3463 if (p->env->flags & REG_LEFT)
3465 if (!(e = node(&env, REX_BEG, 0, 0, 0)))
3467 regfree(p);
3468 return fatal(p->env->disc, REG_ESPACE, NiL);
3470 e->next = p->env->rex;
3471 p->env->rex = e;
3472 p->env->once = 1;
3474 if (p->env->flags & REG_RIGHT)
3476 for (f = p->env->rex; f->next; f = f->next);
3477 if (f->type != REX_END)
3479 if (!(e = node(&env, REX_END, 0, 0, 0)))
3481 regfree(p);
3482 return fatal(p->env->disc, REG_ESPACE, NiL);
3484 f->next = e;
3487 env.explicit = p->env->explicit;
3488 env.flags = p->env->flags;
3489 env.disc = p->env->disc;
3490 if (stats(&env, p->env->rex))
3492 regfree(p);
3493 return fatal(p->env->disc, env.error ? env.error : REG_ECOUNT, NiL);
3495 if (special(&env, p))
3497 regfree(p);
3498 return fatal(p->env->disc, env.error ? env.error : REG_ESPACE, NiL);
3500 p->env->min = g->re.trie.min;
3501 return 0;
3505 * copy a reference of p into q
3506 * p and q may then have separate regsubcomp() instantiations
3510 regdup(regex_t* p, regex_t* q)
3512 if (!p || !q)
3513 return REG_BADPAT;
3514 *q = *p;
3515 p->env->refs++;
3516 q->re_sub = 0;
3517 return 0;