1 /***********************************************************************
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
13 * Information and Software Systems Research *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
21 ***********************************************************************/
25 * posix regex compiler
34 #define serialize re_serialize /* hp.ia64 <unistd.h>! */
41 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n))
42 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0)
43 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_comp_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
45 static unsigned long debug
;
46 static unsigned long debug_flag
;
51 #define DEBUG_TEST(f,y,n) (n)
52 #define DEBUG_CODE(f,y,n) do {n} while(0)
67 #define eat(p) do{if ((p)->token.push)(p)->token.push=0;else (p)->cursor+=(p)->token.len;}while (0)
70 * determine whether greedy matching will work, i.e. produce
71 * the best match first. such expressions are "easy", and
72 * need no backtracking once a complete match is found.
73 * if an expression has backreferences or alts it's hard
74 * else if it has only one closure it's easy
75 * else if all closures are simple (i.e. one-character) it's easy
79 typedef struct Stats_s
81 unsigned long l
; /* min length to left of x */
82 unsigned long k
; /* min length to left of y */
83 unsigned long m
; /* min length */
84 unsigned long n
; /* max length */
85 unsigned short a
; /* number of alternations */
86 unsigned short b
; /* number of backrefs */
87 unsigned short c
; /* number of closures */
88 unsigned short e
; /* $ */
89 unsigned short i
; /* number of negations */
90 unsigned short p
; /* number of named subexpressions */
91 unsigned short s
; /* number of simple closures */
92 unsigned short t
; /* number of tries */
93 unsigned short u
; /* number of unnamed subexpressions */
94 Rex_t
* x
; /* max length REX_STRING */
95 Rex_t
* y
; /* max length REX_TRIE */
98 typedef struct Token_s
109 typedef struct Cenv_s
111 int delimiter
; /* pattern delimiter */
112 int error
; /* last error */
113 int explicit; /* explicit match on this char */
114 int mappeddot
; /* inverse mapped '.' */
115 int mappednewline
; /* inverse mapped '\n' */
116 int mappedslash
; /* inverse mapped '/' */
117 regflags_t flags
; /* flags arg to regcomp */
118 int type
; /* BRE,ERE,ARE,SRE,KRE */
119 unsigned char* cursor
; /* curent point in re */
120 unsigned char* pattern
; /* the original pattern */
121 unsigned char* literal
; /* literal restart pattern */
122 int parno
; /* number of last open paren */
123 int parnest
; /* paren nest count */
124 int posixkludge
; /* to make * nonspecial */
125 int regexp
; /* <regexp.h> compatibility */
126 Token_t token
; /* token lookahead */
127 Stats_t stats
; /* RE statistics */
128 int terminator
; /* pattern terminator */
129 Rex_t
* paren
[2*(BACK_REF_MAX
+2)];
130 /* paren[i]!=0 if \i defined */
131 regex_t
* regex
; /* user handle */
132 regdisc_t
* disc
; /* user discipline */
133 unsigned char* map
; /* external to native ccode map */
134 unsigned char* MAP
; /* fold and/or map */
138 * allocate a new Rex_t node
142 node(Cenv_t
* env
, int type
, int lo
, int hi
, size_t extra
)
146 if (e
= (Rex_t
*)alloc(env
->disc
, 0, sizeof(Rex_t
) + extra
))
148 memset(e
, 0, sizeof(Rex_t
) + extra
);
153 e
->flags
= env
->flags
;
154 e
->map
= (e
->flags
& REG_ICASE
) ? env
->MAP
: env
->map
;
155 e
->explicit = env
->explicit;
157 e
->re
.data
= (char*)e
+ sizeof(Rex_t
);
163 * free a Trie_node_t node
167 triedrop(regdisc_t
* disc
, Trie_node_t
* e
)
171 triedrop(disc
, e
->son
);
172 triedrop(disc
, e
->sib
);
182 drop(regdisc_t
* disc
, Rex_t
* e
)
187 if (e
&& !(disc
->re_flags
& REG_NOFREE
))
194 drop(disc
, e
->re
.group
.expr
.binary
.left
);
195 drop(disc
, e
->re
.group
.expr
.binary
.right
);
198 case REX_GROUP_AHEAD
:
199 case REX_GROUP_AHEAD_NOT
:
200 case REX_GROUP_BEHIND
:
201 case REX_GROUP_BEHIND_NOT
:
205 drop(disc
, e
->re
.group
.expr
.rex
);
208 for (i
= 0; i
<= UCHAR_MAX
; i
++)
209 triedrop(disc
, e
->re
.trie
.root
[i
]);
218 * mark e and descendants minimal
222 mark(register Rex_t
* e
, int set
)
229 e
->flags
|= REG_MINIMAL
;
231 e
->flags
&= ~REG_MINIMAL
;
237 if (e
->re
.group
.expr
.binary
.left
)
238 mark(e
->re
.group
.expr
.binary
.left
, set
);
239 if (e
->re
.group
.expr
.binary
.right
)
240 mark(e
->re
.group
.expr
.binary
.right
, set
);
243 case REX_GROUP_AHEAD
:
244 case REX_GROUP_AHEAD_NOT
:
245 case REX_GROUP_BEHIND
:
246 case REX_GROUP_BEHIND_NOT
:
251 mark(e
->re
.group
.expr
.rex
, set
);
254 } while (e
= e
->next
);
258 * assign subexpression numbers by a preorder tree walk
262 serialize(Cenv_t
* env
, Rex_t
* e
, int n
)
271 if (e
->re
.group
.expr
.binary
.left
)
272 n
= serialize(env
, e
->re
.group
.expr
.binary
.left
, n
);
273 e
->re
.group
.expr
.binary
.serial
= n
++;
274 if (e
->re
.group
.expr
.binary
.right
)
275 n
= serialize(env
, e
->re
.group
.expr
.binary
.right
, n
);
278 n
= serialize(env
, e
->re
.group
.expr
.binary
.left
, n
);
279 n
= serialize(env
, e
->re
.group
.expr
.binary
.right
, n
);
282 case REX_GROUP_AHEAD
:
283 case REX_GROUP_AHEAD_NOT
:
284 case REX_GROUP_BEHIND
:
285 case REX_GROUP_BEHIND_NOT
:
289 n
= serialize(env
, e
->re
.group
.expr
.rex
, n
);
292 } while (e
= e
->next
);
297 * catenate e and f into a sequence, collapsing them if possible
301 cat(Cenv_t
* env
, Rex_t
* e
, Rex_t
* f
)
310 if (e
->type
== REX_NULL
)
315 if (f
->type
== REX_NULL
)
322 else if (e
->type
== REX_DOT
&& f
->type
== REX_DOT
)
324 unsigned int m
= e
->lo
+ f
->lo
;
325 unsigned int n
= e
->hi
+ f
->hi
;
329 if (e
->hi
> RE_DUP_MAX
|| f
->hi
> RE_DUP_MAX
)
334 else if (n
<= RE_DUP_MAX
)
351 * collect re statistics
355 stats(register Cenv_t
* env
, register Rex_t
* e
)
357 register unsigned long n
;
358 register unsigned long m
;
382 if (++env
->stats
.a
<= 0)
388 if (stats(env
, e
->re
.group
.expr
.binary
.left
))
394 if (e
->re
.group
.expr
.binary
.right
&& stats(env
, e
->re
.group
.expr
.binary
.right
))
396 if (env
->stats
.m
> m
)
400 if ((env
->stats
.m
+= cm
) < m
)
402 if (env
->stats
.n
< n
)
406 if ((env
->stats
.n
+= cn
) < n
)
415 if (++env
->stats
.b
<= 0)
423 if ((env
->stats
.m
+= e
->lo
) < n
)
425 if (e
->hi
!= RE_DUP_INF
)
428 if ((env
->stats
.n
+= e
->hi
) < n
)
433 if (++env
->stats
.c
<= 0)
435 if (++env
->stats
.s
<= 0)
444 if (stats(env
, e
->re
.group
.expr
.binary
.left
))
450 if (stats(env
, e
->re
.group
.expr
.binary
.right
))
452 if (env
->stats
.m
< nm
)
456 if ((env
->stats
.m
+= cm
) < nm
)
458 if (env
->stats
.n
< nn
)
462 if ((env
->stats
.n
+= cn
) < nn
)
469 if (e
->re
.group
.number
&& ++env
->stats
.p
<= 0 || !e
->re
.group
.number
&& ++env
->stats
.u
<= 0)
471 if (stats(env
, e
->re
.group
.expr
.rex
))
474 case REX_GROUP_AHEAD
:
475 case REX_GROUP_AHEAD_NOT
:
476 case REX_GROUP_BEHIND
:
477 case REX_GROUP_BEHIND_NOT
:
482 if (stats(env
, e
->re
.group
.expr
.rex
))
490 case REX_GROUP_AHEAD
:
491 case REX_GROUP_BEHIND
:
492 if (++env
->stats
.u
<= 0)
498 if (++env
->stats
.u
<= 0)
504 if (e
->re
.group
.size
> 0 && ++env
->stats
.b
<= 0)
506 if (e
->re
.group
.expr
.binary
.left
&& stats(env
, e
->re
.group
.expr
.binary
.left
))
508 if (q
= e
->re
.group
.expr
.binary
.right
)
510 if (q
->re
.group
.expr
.binary
.left
&& stats(env
, q
->re
.group
.expr
.binary
.left
))
512 if (q
->re
.group
.expr
.binary
.right
&& stats(env
, q
->re
.group
.expr
.binary
.right
))
521 if (++env
->stats
.u
<= 0)
527 if (stats(env
, e
->re
.group
.expr
.rex
))
543 if (stats(env
, e
->re
.group
.expr
.rex
))
545 env
->stats
.m
= !env
->stats
.m
;
546 if ((env
->stats
.m
+= cm
) < cm
)
560 if (++env
->stats
.c
<= 0)
566 if (stats(env
, e
->re
.group
.expr
.rex
))
568 if (env
->stats
.m
== 1 && b
== env
->stats
.b
&& c
== env
->stats
.c
&& ++env
->stats
.s
<= 0)
582 if ((env
->stats
.m
*= e
->lo
) > 0 && env
->stats
.m
< m
)
585 if ((env
->stats
.m
+= cm
) < m
)
587 if (env
->stats
.x
!= x
)
589 if (env
->stats
.y
!= y
)
597 if ((env
->stats
.m
+= e
->re
.string
.size
) < cm
)
600 if ((env
->stats
.n
+= e
->re
.string
.size
) < cn
)
602 if (!env
->stats
.x
|| env
->stats
.x
->re
.string
.size
< e
->re
.string
.size
)
610 if (++env
->stats
.s
<= 0)
613 if ((env
->stats
.m
+= e
->re
.trie
.min
) < cm
)
616 if ((env
->stats
.n
+= e
->re
.trie
.max
) < cn
)
619 if (!env
->stats
.y
|| env
->stats
.y
->re
.trie
.min
< e
->re
.trie
.min
)
626 } while (e
= e
->next
);
630 static int token(Cenv_t
*);
633 magic(register Cenv_t
* env
, register int c
, int escaped
)
639 int l
= env
->token
.len
;
643 if (mp
= state
.magic
[c
])
645 c
= mp
[env
->type
+escaped
];
648 sp
= (char*)env
->cursor
+ env
->token
.len
;
654 while (*sp
>= '0' && *sp
<= '9')
656 if (n
> (INT_MAX
/ 10))
658 env
->error
= REG_BADBR
;
661 n
= n
* 10 + *sp
++ - '0';
665 if (env
->type
< SRE
|| *sp
!= ',')
667 env
->error
= *sp
? REG_BADBR
: REG_EBRACE
;
671 else if (n
> RE_DUP_MAX
)
673 env
->error
= REG_BADBR
;
681 while (*sp
>= '0' && *sp
<= '9')
683 if (n
> (INT_MAX
/ 10))
685 env
->error
= REG_BADBR
;
688 n
= n
* 10 + *sp
++ - '0';
692 else if (n
< env
->token
.min
)
694 env
->error
= REG_BADBR
;
702 env
->error
= REG_EBRACE
;
707 env
->error
= REG_BADBR
;
715 env
->error
= REG_BADBR
;
723 env
->error
= REG_EBRACE
;
728 env
->error
= REG_BADBR
;
731 env
->token
.len
= sp
- (char*)env
->cursor
;
734 env
->error
= REG_EBRACE
;
737 if (env
->type
< SRE
&& *sp
== '?')
745 c
= chresc(sp
- 2, &ep
);
748 env
->token
.len
+= ep
- sp
;
763 n
= chresc(sp
- 2, &ep
);
766 env
->token
.len
+= ep
- sp
;
772 if (env
->type
== SRE
|| c
== T_BACK
&& !(env
->flags
& REG_LENIENT
))
774 env
->error
= REG_BADESC
;
777 if ((env
->flags
& REG_MULTIREF
) && isdigit(*sp
))
779 c
= (c
- T_BACK
) * 10 + (*sp
- '0');
780 if (c
> 0 && c
<= env
->parno
&& env
->paren
[c
])
783 c
= chresc(sp
- 2, &ep
);
790 if (escaped
== 1 && (env
->flags
& REG_LENIENT
) && (c
= mp
[env
->type
+escaped
+2]) >= T_META
)
794 if (env
->type
>= SRE
)
800 if (env
->type
== KRE
&& *(env
->cursor
+ env
->token
.len
) == '-' && *(env
->cursor
+ env
->token
.len
+ 1) == '(')
805 if (env
->type
== KRE
&& *(env
->cursor
+ env
->token
.len
) == '(')
824 else if (c
== T_STAR
)
826 else if (c
== T_QUES
)
836 c
= (c
- T_BACK
) * 2 - 1;
837 c
= (c
> env
->parno
|| !env
->paren
[c
]) ? o
: T_BACK
+ c
;
839 else if (env
->type
== KRE
&& !env
->parnest
&& (env
->flags
& REG_SHELL_GROUP
))
845 else if (c
== T_OPEN
)
851 else if (escaped
== 2)
853 if (env
->type
>= SRE
&& !(env
->flags
& REG_SHELL_ESCAPED
) || (env
->flags
& REG_ESCAPE
) && (c
== '[' || c
== '-' || c
== ']' || env
->delimiter
&& c
== env
->delimiter
))
857 env
->error
= REG_BADESC
;
861 else if (escaped
&& !(env
->flags
& REG_LENIENT
) && c
!= ']')
863 env
->error
= REG_BADESC
;
868 sp
= (char*)env
->cursor
+ env
->token
.len
;
879 env
->error
= REG_EPAREN
;
892 env
->cursor
= (unsigned char*)sp
;
897 else if (env
->flags
& REG_LENIENT
)
899 else if (escaped
== 1 && !env
->error
)
903 env
->error
= REG_BADESC
;
909 token(register Cenv_t
* env
)
915 return env
->token
.lex
;
916 env
->token
.att
= env
->token
.esc
= 0;
917 if ((env
->token
.len
= MBSIZE(env
->cursor
)) > 1)
918 return env
->token
.lex
= C_MB
;
923 if (c
== 0 || c
== env
->delimiter
|| c
== env
->terminator
)
925 if (!(env
->flags
& REG_COMMENT
))
932 if (c
== 0 || c
== env
->delimiter
)
936 else if (!isspace(c
))
940 if (c
== '\n' && (env
->flags
& REG_MULTIPLE
) && !env
->delimiter
)
944 env
->error
= REG_EPAREN
;
948 env
->pattern
= env
->cursor
+ 1;
951 if (env
->flags
& REG_LITERAL
)
953 if (posixkludge
= env
->posixkludge
)
955 env
->posixkludge
= 0;
961 if (env
->flags
& REG_SHELL_ESCAPED
)
963 if (!(c
= *(env
->cursor
+ 1)) || c
== env
->terminator
)
965 if (env
->flags
& REG_LENIENT
)
969 env
->token
.esc
= env
->token
.len
;
970 env
->token
.len
+= MBSIZE(env
->cursor
+ 1);
975 env
->error
= REG_EESCAPE
;
978 env
->token
.esc
= env
->token
.len
;
979 env
->token
.len
+= MBSIZE(env
->cursor
+ 1);
980 if (env
->delimiter
&& c
== 'n')
982 else if (c
== env
->delimiter
)
983 return magic(env
, c
, 0);
984 else if (c
== '(' && env
->type
== BRE
)
985 env
->posixkludge
= 1;
986 else if (c
== ')' && env
->type
== BRE
&& env
->parnest
<= 0)
988 env
->error
= REG_EPAREN
;
991 else if (isspace(c
) && (env
->flags
& REG_COMMENT
))
993 return magic(env
, c
, 1);
997 if (env
->type
== BRE
&& (*(env
->cursor
+ 1) == 0 || *(env
->cursor
+ 1) == env
->delimiter
|| *(env
->cursor
+ 1) == env
->terminator
|| *(env
->cursor
+ 1) == '\\' && *(env
->cursor
+ 2) == ')') || (env
->flags
& REG_MULTIPLE
) && *(env
->cursor
+ 1) == '\n')
1002 if (env
->type
== BRE
&& (env
->cursor
== env
->pattern
|| posixkludge
== 1))
1004 env
->posixkludge
= 2;
1010 if (env
->type
!= BRE
&& env
->parnest
<= 0)
1013 else if (c
== '/' && env
->explicit == env
->mappedslash
)
1015 while (*(env
->cursor
+ env
->token
.len
) == c
)
1019 return magic(env
, c
, 0);
1023 col(Celt_t
* ce
, int ic
, unsigned char* bp
, int bw
, int bc
, unsigned char* ep
, int ew
, int ec
)
1026 register unsigned char* k
;
1027 register unsigned char* e
;
1048 else if (islower(c
))
1056 else if (bw
< COLL_KEY_MAX
)
1067 else if (iswlower(c
))
1076 k
+= mbconv((char*)k
, c
);
1079 for (e
= k
+ bw
; k
< e
; *k
++ = *s
++);
1082 mbxfrm(ce
->beg
, key
, COLL_KEY_MAX
);
1089 else if (iswlower(c
))
1104 else if (islower(c
))
1112 else if (ew
< COLL_KEY_MAX
)
1123 else if (iswlower(c
))
1132 k
+= mbconv((char*)k
, c
);
1135 for (e
= k
+ ew
; k
< e
; *k
++ = *s
++);
1138 mbxfrm(ce
->end
, key
, COLL_KEY_MAX
);
1143 else if (iswlower(c
))
1147 ce
->typ
= bt
== et
? bt
: COLL_range
;
1150 ce
->typ
= COLL_char
;
1172 unsigned char* first
;
1173 unsigned char* start
;
1174 unsigned char* begin
;
1177 unsigned char buf
[4 * (COLL_KEY_MAX
+ 1)];
1180 char mbc
[COLL_KEY_MAX
+ 1];
1183 if (!(e
= node(env
, REX_CLASS
, 1, 1, sizeof(Set_t
))))
1185 collate
= complicated
= elements
= 0;
1186 if (*env
->cursor
== '^' || env
->type
>= SRE
&& *env
->cursor
== '!')
1193 first
= env
->cursor
;
1194 start
= first
+ MBSIZE(first
);
1195 if (*env
->cursor
== 0 || *(env
->cursor
+ 1) == 0 || *env
->cursor
== env
->terminator
|| *(env
->cursor
+ 1) == env
->terminator
|| (env
->flags
& REG_ESCAPE
) && (*env
->cursor
== env
->delimiter
|| *env
->cursor
!= '\\' && *(env
->cursor
+ 1) == env
->delimiter
))
1197 begin
= env
->cursor
+ MBSIZE(env
->cursor
);
1200 * inrange: 0=no, 1=possibly, 2=definitely
1206 if (!(c
= *env
->cursor
) || c
== env
->terminator
|| c
== env
->delimiter
&& (env
->flags
& REG_ESCAPE
))
1208 env
->cursor
+= (w
= MBSIZE(env
->cursor
));
1209 if (c
== '\\' && ((env
->flags
& REG_CLASS_ESCAPE
) || env
->type
>= SRE
&& env
->parnest
|| *env
->cursor
== env
->delimiter
&& (env
->flags
& REG_ESCAPE
)))
1213 if (*env
->cursor
== 'n')
1218 else if (env
->type
< SRE
|| !(env
->flags
& REG_SHELL_ESCAPED
))
1221 w
= magic(env
, *env
->cursor
, 2);
1222 if (env
->token
.len
> 1 || w
!= T_BAD
)
1224 if (env
->token
.len
== 1 && (f
= classfun(w
)))
1228 if (env
->type
< SRE
&& !(env
->flags
& REG_LENIENT
))
1233 for (c
= 0; c
<= UCHAR_MAX
; c
++)
1235 setadd(e
->re
.charclass
, c
);
1240 if (env
->token
.len
> 1 || w
>= 0 && w
< T_META
)
1245 if (env
->type
< SRE
&& !(env
->flags
& REG_LENIENT
) && !mbwide())
1249 env
->cursor
+= env
->token
.len
;
1257 if (env
->cursor
== begin
)
1265 setadd(e
->re
.charclass
, last
);
1269 setadd(e
->re
.charclass
, '-');
1277 if (!inrange
&& env
->cursor
!= begin
&& *env
->cursor
!= ']')
1279 if (env
->type
< SRE
&& !(env
->flags
& REG_LENIENT
))
1283 else if (inrange
== 1)
1292 switch (*env
->cursor
)
1301 setadd(e
->re
.charclass
, last
);
1304 if (!(f
= regclass((char*)env
->cursor
, (char**)&env
->cursor
)))
1306 if (env
->cursor
== start
&& (c
= *(env
->cursor
+ 1)))
1308 s
= start
= env
->cursor
+ 1;
1309 while (*++s
&& *s
!= ':');
1310 if (*s
== ':' && *(s
+ 1) == ']' && *(s
+ 2) == ']')
1312 if ((i
= (s
- start
)) == 1)
1328 env
->cursor
= s
+ 3;
1330 return node(env
, i
, 0, 0, 0);
1335 env
->error
= REG_ECTYPE
;
1338 for (c
= 0; c
<= UCHAR_MAX
; c
++)
1340 setadd(e
->re
.charclass
, c
);
1352 setadd(e
->re
.charclass
, last
);
1355 if ((c
= regcollate((char*)env
->cursor
, (char**)&env
->cursor
, (char*)buf
, sizeof(buf
))) < 0)
1360 setadd(e
->re
.charclass
, buf
[0]);
1369 if ((c
= regcollate((char*)env
->cursor
, (char**)&env
->cursor
, (char*)buf
, sizeof(buf
))) < 0)
1378 if (*env
->cursor
== env
->terminator
|| *env
->cursor
== env
->delimiter
&& (env
->flags
& REG_ESCAPE
))
1389 for (i
= last
; i
<= c
; i
++)
1390 setadd(e
->re
.charclass
, i
);
1391 inrange
= env
->type
>= SRE
|| (env
->flags
& REG_LENIENT
);
1394 else if (env
->type
>= SRE
)
1396 setadd(e
->re
.charclass
, last
);
1397 setadd(e
->re
.charclass
, c
);
1401 else if (!(env
->flags
& REG_LENIENT
))
1406 else if (inrange
== 1)
1408 setadd(e
->re
.charclass
, last
);
1416 if (complicated
&& mbcoll())
1430 char cb
[2][COLL_KEY_MAX
+1];
1432 static Dtdisc_t disc
;
1434 static const char primary
[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
1436 if (!(dt
= (Dt_t
*)LCINFO(AST_LC_COLLATE
)->data
))
1438 disc
.key
= offsetof(Cchr_t
, key
);
1439 if ((cc
= newof(0, Cchr_t
, elementsof(primary
), 0)) && (dt
= dtopen(&disc
, Dttree
)))
1441 for (i
= 0; i
< elementsof(primary
) - 1; i
++, cc
++)
1443 cc
->nam
[0] = primary
[i
];
1444 mbxfrm(cc
->key
, cc
->nam
, COLL_KEY_MAX
);
1447 for (i
= 0; i
< elementsof(cc
->key
); i
++)
1450 LCINFO(AST_LC_COLLATE
)->data
= (void*)dt
;
1462 if (ic
= env
->flags
& REG_ICASE
)
1464 if (!(e
= node(env
, REX_COLL_CLASS
, 1, 1, (elements
+ 2) * sizeof(Celt_t
))))
1466 ce
= (Celt_t
*)e
->re
.data
;
1467 e
->re
.collate
.invert
= neg
;
1468 e
->re
.collate
.elements
= ce
;
1469 env
->cursor
= first
;
1473 if ((c
= *env
->cursor
) == 0 || c
== env
->terminator
|| (env
->flags
& REG_ESCAPE
) && c
== env
->delimiter
)
1476 env
->cursor
+= (w
= MBSIZE(env
->cursor
));
1477 if (c
== '\\' && ((env
->flags
& REG_CLASS_ESCAPE
) || env
->type
>= SRE
&& env
->parnest
|| *env
->cursor
== env
->delimiter
&& (env
->flags
& REG_ESCAPE
)))
1481 if (*env
->cursor
== 'n')
1486 else if (env
->type
< SRE
|| !(env
->flags
& REG_SHELL_ESCAPED
))
1489 w
= magic(env
, *env
->cursor
, 2);
1490 if (env
->token
.len
> 1 || w
!= T_BAD
)
1492 if (env
->token
.len
== 1 && (f
= classfun(w
)))
1496 if (env
->type
< SRE
&& !(env
->flags
& REG_LENIENT
))
1502 ce
->typ
= COLL_call
;
1506 if (env
->token
.len
> 1 || w
>= 0 && w
< T_META
)
1510 pp
= (unsigned char*)mbc
;
1511 env
->cursor
+= env
->token
.len
;
1519 if (env
->cursor
== begin
)
1528 ce
= col(ce
, ic
, rp
, rw
, rc
, NiL
, 0, 0);
1530 ce
= col(ce
, ic
, NiL
, 1, '-', NiL
, 0, 0);
1536 if (!inrange
&& env
->cursor
!= begin
&& *env
->cursor
!= ']')
1538 if (env
->type
< SRE
&& !(env
->flags
& REG_LENIENT
))
1542 else if (inrange
== 1)
1550 switch (*env
->cursor
)
1556 goto complicated_normal
;
1558 ce
= col(ce
, ic
, rp
, rw
, rc
, NiL
, 0, 0);
1559 if (!(f
= regclass((char*)env
->cursor
, (char**)&env
->cursor
)))
1561 if (env
->cursor
== start
&& (c
= *(env
->cursor
+ 1)) && *(env
->cursor
+ 2) == ':' && *(env
->cursor
+ 3) == ']' && *(env
->cursor
+ 4) == ']')
1579 return node(env
, i
, 0, 0, 0);
1582 env
->error
= REG_ECTYPE
;
1586 ce
->typ
= COLL_call
;
1592 goto complicated_normal
;
1596 ce
= col(ce
, ic
, rp
, rw
, rc
, NiL
, 0, 0);
1597 pp
= (unsigned char*)cb
[inrange
];
1598 rp
= env
->cursor
+ 1;
1599 if ((rw
= regcollate((char*)env
->cursor
, (char**)&env
->cursor
, (char*)pp
, COLL_KEY_MAX
)) < 0)
1609 rw
= mbconv((char*)pp
, wc
);
1612 else if (iswlower(wc
))
1617 mbxfrm(key
.key
, (char*)pp
, COLL_KEY_MAX
);
1618 if (!(cc
= (Cchr_t
*)dtsearch(dt
, &key
)) && !(cc
= (Cchr_t
*)dtprev(dt
, &key
)))
1620 xc
= (tc
= (Cchr_t
*)dtprev(dt
, cc
)) && !strcasecmp((char*)tc
->nam
, (char*)cc
->nam
) ? tc
: cc
;
1621 if (c
== 'l' || c
== 'L' && !(c
= 0))
1622 ce
->typ
= COLL_range_lc
;
1623 else if (c
== 'u' || c
== 'U' && !(c
= 0))
1624 ce
->typ
= COLL_range_uc
;
1626 ce
->typ
= COLL_range
;
1627 strcpy((char*)ce
->beg
, (char*)xc
->key
);
1628 if (!(cc
= (Cchr_t
*)dtnext(dt
, cc
)))
1630 if (!strcasecmp((char*)xc
->nam
, (char*)cc
->nam
) && (tc
= (Cchr_t
*)dtnext(dt
, cc
)))
1632 strcpy((char*)ce
->end
, (char*)cc
->key
);
1647 rw
= mbconv((char*)pp
, wc
);
1654 goto complicated_normal
;
1655 pp
= (unsigned char*)cb
[inrange
];
1656 if ((w
= regcollate((char*)env
->cursor
, (char**)&env
->cursor
, (char*)pp
, COLL_KEY_MAX
)) < 0)
1662 if (*env
->cursor
== env
->terminator
|| *env
->cursor
== env
->delimiter
&& (env
->flags
& REG_ESCAPE
))
1669 ce
= col(ce
, ic
, rp
, rw
, rc
, pp
, w
, c
);
1670 if (strcmp((char*)ce
->beg
, (char*)ce
->end
) > 0)
1672 if (env
->type
< SRE
&& !(env
->flags
& REG_LENIENT
))
1674 (ce
-1)->typ
= COLL_char
;
1675 strcpy((char*)ce
->beg
, (char*)(ce
-1)->end
);
1676 ce
->typ
= COLL_char
;
1679 inrange
= env
->type
>= SRE
|| (env
->flags
& REG_LENIENT
);
1681 else if (inrange
== 1)
1682 ce
= col(ce
, ic
, rp
, rw
, rc
, NiL
, 0, 0);
1696 if (env
->flags
& REG_ICASE
)
1697 for (i
= 0; i
<= UCHAR_MAX
; i
++)
1698 if (settst(e
->re
.charclass
, i
))
1702 else if (islower(i
))
1706 setadd(e
->re
.charclass
, c
);
1710 for (i
= 0; i
< elementsof(e
->re
.charclass
->bits
); i
++)
1711 e
->re
.charclass
->bits
[i
] ^= ~0;
1712 if (env
->explicit >= 0)
1713 setclr(e
->re
.charclass
, env
->explicit);
1717 env
->error
= REG_ECOLLATE
;
1720 env
->error
= REG_ERANGE
;
1724 env
->error
= REG_EBRACK
;
1729 ccl(Cenv_t
* env
, int type
)
1736 if (!(f
= classfun(type
)))
1738 env
->error
= REG_BADESC
;
1743 if (!(e
= node(env
, REX_CLASS
, 1, 1, sizeof(Set_t
))))
1745 for (i
= 0; i
<= UCHAR_MAX
; i
++)
1747 setadd(e
->re
.charclass
, i
);
1748 if (env
->explicit >= 0)
1749 setclr(e
->re
.charclass
, env
->explicit);
1753 if (!(e
= node(env
, REX_COLL_CLASS
, 1, 1, 2 * sizeof(Celt_t
))))
1755 ce
= (Celt_t
*)e
->re
.data
;
1756 e
->re
.collate
.invert
= 0;
1757 e
->re
.collate
.elements
= ce
;
1759 ce
->typ
= COLL_call
;
1767 rep(Cenv_t
* env
, Rex_t
* e
, int number
, int last
)
1770 unsigned long m
= 0;
1771 unsigned long n
= RE_DUP_INF
;
1780 if (!(f
= node(env
, REX_NEG
, m
, n
, 0)))
1785 f
->re
.group
.expr
.rex
= e
;
1808 else if (env
->type
< SRE
)
1813 minimal
= !(env
->flags
& REG_MINIMAL
);
1815 case T_STAR
: /*AST*/
1817 minimal
= !!(env
->flags
& REG_MINIMAL
);
1824 case REX_COLL_CLASS
:
1841 env
->error
= REG_BADRPT
;
1845 if (m
== 1 && n
== 1)
1851 if (!(f
= node(env
, REX_REP
, m
, n
, 0)))
1856 f
->re
.group
.expr
.rex
= e
;
1857 f
->re
.group
.number
= number
;
1858 f
->re
.group
.last
= last
;
1863 for (; e
&& e
->type
>= REX_GROUP
&& e
->type
<= REX_GROUP_CUT
; e
= e
->re
.group
.expr
.rex
);
1864 if (e
&& e
->type
== REX_NEG
)
1865 f
->type
= REX_GROUP
;
1876 return e
->lo
== 1 && e
->hi
== 1;
1884 trienode(Cenv_t
* env
, int c
)
1888 if (t
= (Trie_node_t
*)alloc(env
->disc
, 0, sizeof(Trie_node_t
)))
1890 memset(t
, 0, sizeof(Trie_node_t
));
1897 insert(Cenv_t
* env
, Rex_t
* f
, Rex_t
* g
)
1903 unsigned char tmp
[2];
1908 *(s
= tmp
) = f
->re
.onechar
;
1912 s
= f
->re
.string
.base
;
1913 e
= s
+ f
->re
.string
.size
;
1918 if (!(t
= g
->re
.trie
.root
[*s
]) && !(t
= g
->re
.trie
.root
[*s
] = trienode(env
, *s
)))
1926 if (!t
->son
&& !(t
->son
= trienode(env
, *s
)))
1933 if (!t
->sib
&& !(t
->sib
= trienode(env
, *s
)))
1938 if (g
->re
.trie
.min
> len
)
1939 g
->re
.trie
.min
= len
;
1940 if (g
->re
.trie
.max
< len
)
1941 g
->re
.trie
.max
= len
;
1947 * trie() tries to combine nontrivial e and f into a REX_TRIE
1948 * unless 0 is returned, e and f are deleted as far as possible
1949 * also called by regcomb
1953 trie(Cenv_t
* env
, Rex_t
* e
, Rex_t
* f
)
1957 if (e
->next
|| f
->next
|| !isstring(e
) || e
->flags
!= f
->flags
)
1961 if (!(g
= node(env
, REX_TRIE
, 0, 0, (UCHAR_MAX
+ 1) * sizeof(Trie_node_t
*))))
1963 g
->re
.trie
.min
= INT_MAX
;
1964 if (insert(env
, f
, g
))
1968 else if (f
->type
!= REX_TRIE
)
1972 if (insert(env
, e
, g
))
1982 static Rex_t
* alt(Cenv_t
*, int, int);
1985 chr(register Cenv_t
* env
, int* escaped
)
1991 if (!(c
= *env
->cursor
))
1996 if (env
->flags
& REG_SHELL_ESCAPED
)
1998 if (!(c
= *(env
->cursor
+ 1)) || c
== env
->terminator
)
2000 if (env
->flags
& REG_LENIENT
)
2001 return c
? c
: '\\';
2002 env
->error
= REG_EESCAPE
;
2006 c
= chresc((char*)env
->cursor
- 1, (char**)&env
->cursor
);
2007 *escaped
= env
->cursor
- p
;
2013 * open the perly gates
2017 grp(Cenv_t
* env
, int parno
)
2030 beg
= env
->pattern
== env
->cursor
- env
->token
.len
;
2031 if (!(c
= env
->token
.lex
) && (c
= *env
->cursor
))
2057 case 'M': /* glob(3) */
2058 case 'N': /* glob(3) */
2059 case 'O': /* glob(3) */
2061 case 'R': /* pcre */
2063 case 'U': /* pcre */
2064 case 'X': /* pcre */
2067 env
->token
.push
= 1;
2073 if (!(env
->flags
& REG_LITERAL
))
2075 env
->error
= REG_BADRPT
;
2085 if (token(env
) == T_CLOSE
)
2096 env
->flags
|= (REG_LEFT
|REG_RIGHT
);
2098 env
->flags
&= ~(REG_LEFT
|REG_RIGHT
);
2102 env
->flags
&= ~REG_MINIMAL
;
2104 env
->flags
|= REG_MINIMAL
;
2108 env
->flags
|= REG_ICASE
;
2110 env
->flags
&= ~REG_ICASE
;
2114 env
->flags
|= REG_LEFT
;
2116 env
->flags
&= ~REG_LEFT
;
2120 env
->flags
|= REG_NEWLINE
;
2122 env
->flags
&= ~REG_NEWLINE
;
2123 env
->explicit = (env
->flags
& (REG_NEWLINE
|REG_SPAN
)) == REG_NEWLINE
? env
->mappednewline
: -1;
2127 env
->flags
&= ~REG_LENIENT
;
2129 env
->flags
|= REG_LENIENT
;
2133 env
->flags
|= REG_RIGHT
;
2135 env
->flags
&= ~REG_RIGHT
;
2139 env
->flags
|= REG_SPAN
;
2141 env
->flags
&= ~REG_SPAN
;
2142 env
->explicit = (env
->flags
& (REG_NEWLINE
|REG_SPAN
)) == REG_NEWLINE
? env
->mappednewline
: -1;
2146 env
->flags
|= REG_COMMENT
;
2148 env
->flags
&= ~REG_COMMENT
;
2151 env
->flags
&= ~(REG_AUGMENTED
|REG_EXTENDED
|REG_LITERAL
|REG_SHELL
|REG_LEFT
|REG_RIGHT
|REG_CLASS_ESCAPE
);
2152 env
->flags
|= REG_AUGMENTED
|REG_EXTENDED
;
2157 env
->flags
&= ~(REG_AUGMENTED
|REG_EXTENDED
|REG_LITERAL
|REG_SHELL
|REG_LEFT
|REG_RIGHT
|REG_CLASS_ESCAPE
);
2161 env
->flags
&= ~(REG_AUGMENTED
|REG_EXTENDED
|REG_LITERAL
|REG_SHELL
|REG_LEFT
|REG_RIGHT
|REG_CLASS_ESCAPE
);
2162 env
->flags
|= REG_EXTENDED
;
2167 env
->flags
&= ~(REG_AUGMENTED
|REG_EXTENDED
|REG_LITERAL
|REG_SHELL
|REG_LEFT
|REG_RIGHT
|REG_CLASS_ESCAPE
);
2168 env
->flags
|= REG_LITERAL
;
2172 env
->flags
&= ~(REG_AUGMENTED
|REG_EXTENDED
|REG_LITERAL
|REG_SHELL
|REG_LEFT
|REG_RIGHT
|REG_CLASS_ESCAPE
);
2173 env
->flags
|= REG_AUGMENTED
|REG_SHELL
|REG_LEFT
|REG_RIGHT
;
2177 /* used by caller to disable glob(3) GLOB_BRACE */
2180 /* used by caller to disable glob(3) GLOB_NOCHECK */
2183 /* used by caller to disable glob(3) GLOB_STARSTAR */
2186 env
->flags
&= ~(REG_AUGMENTED
|REG_EXTENDED
|REG_LITERAL
|REG_SHELL
|REG_LEFT
|REG_RIGHT
|REG_CLASS_ESCAPE
);
2187 env
->flags
|= REG_EXTENDED
|REG_CLASS_ESCAPE
;
2191 env
->flags
&= ~(REG_AUGMENTED
|REG_EXTENDED
|REG_LITERAL
|REG_SHELL
|REG_LEFT
|REG_RIGHT
|REG_CLASS_ESCAPE
);
2192 env
->flags
|= REG_SHELL
|REG_LEFT
|REG_RIGHT
;
2195 case 'U': /* PCRE_UNGREEDY */
2197 env
->flags
|= REG_MINIMAL
;
2199 env
->flags
&= ~REG_MINIMAL
;
2201 case 'X': /* PCRE_EXTRA */
2204 env
->error
= REG_BADRPT
;
2216 x
= REX_GROUP_AHEAD
;
2219 x
= REX_GROUP_AHEAD_NOT
;
2225 x
= REX_GROUP_BEHIND
;
2229 x
= REX_GROUP_BEHIND_NOT
;
2232 env
->error
= REG_BADRPT
;
2242 e
= node(env
, REX_NEST
, 0, 0, (UCHAR_MAX
+ 1) * sizeof(unsigned short));
2243 e
->re
.nest
.primary
= isalnum(*env
->cursor
) ? -1 : *env
->cursor
;
2247 switch (i
= chr(env
, &esc
))
2252 env
->cursor
-= esc
+ 1;
2253 env
->error
= REG_EPAREN
;
2256 x
= REX_NEST_delimiter
;
2259 if ((i
= chr(env
, &esc
)) < 0)
2261 if (e
->re
.nest
.type
[i
] & ~x
)
2263 e
->re
.nest
.type
[i
] = x
;
2266 x
= REX_NEST_escape
;
2269 x
= REX_NEST_literal
;
2272 switch (i
= chr(env
, &esc
))
2275 e
->re
.nest
.type
[UCHAR_MAX
+1] |= REX_NEST_terminator
;
2285 if ((i
= chr(env
, &esc
)) < 0)
2287 if (e
->re
.nest
.type
[i
] & ~x
)
2289 e
->re
.nest
.type
[i
] = x
|REX_NEST_open
|REX_NEST_close
|(i
<<REX_NEST_SHIFT
);
2292 x
= REX_NEST_separator
;
2295 x
= REX_NEST_terminator
;
2312 if (isalnum(i
) || (e
->re
.nest
.type
[i
] & (REX_NEST_close
|REX_NEST_escape
|REX_NEST_literal
|REX_NEST_quote
|REX_NEST_delimiter
|REX_NEST_separator
|REX_NEST_terminator
)))
2314 e
->re
.nest
.type
[i
] = REX_NEST_open
;
2315 if ((x
= chr(env
, &esc
)) < 0 || (e
->re
.nest
.type
[x
] & (REX_NEST_close
|REX_NEST_escape
|REX_NEST_delimiter
|REX_NEST_separator
|REX_NEST_terminator
)))
2319 if (x
== ')' && !--n
)
2324 e
->re
.nest
.type
[x
] |= REX_NEST_close
;
2325 e
->re
.nest
.type
[i
] |= x
<< REX_NEST_SHIFT
;
2332 for (n
= 0; n
< 2; n
++)
2334 parno
= ++env
->parno
;
2335 if (!(f
= node(env
, REX_GROUP
, 0, 0, 0)))
2340 if (parno
< elementsof(env
->paren
))
2341 env
->paren
[parno
] = f
;
2342 f
->re
.group
.back
= 0;
2343 f
->re
.group
.number
= parno
;
2344 f
->re
.group
.expr
.rex
= e
;
2350 if (isdigit(*env
->cursor
))
2355 if (c
> (INT_MAX
/ 10))
2357 env
->error
= REG_BADRPT
;
2360 c
= c
* 10 + (*env
->cursor
++ - '0');
2361 } while (isdigit(*env
->cursor
));
2362 if (*env
->cursor
++ != ')')
2364 env
->error
= REG_BADRPT
;
2367 if (c
&& env
->type
>= SRE
)
2369 if (!c
|| c
> env
->parno
|| !env
->paren
[c
])
2371 if (!(env
->flags
& REG_LENIENT
))
2373 env
->error
= REG_ESUBREG
;
2382 if (env
->type
< SRE
&& *env
->cursor
++ != '?')
2384 env
->error
= REG_BADRPT
;
2387 if (!(f
= grp(env
, parno
+ 1)) && env
->error
)
2390 if (!(e
= node(env
, REX_GROUP_COND
, 0, 0, 0)))
2395 e
->re
.group
.size
= c
;
2396 e
->re
.group
.expr
.binary
.left
= f
;
2397 if (!(e
->re
.group
.expr
.binary
.right
= alt(env
, parno
, 1)))
2402 if (token(env
) != T_CLOSE
)
2404 env
->error
= REG_EPAREN
;
2409 return rep(env
, e
, parno
, parno
);
2413 while (c
= *env
->cursor
)
2415 if (c
== '\\' && *(env
->cursor
+ 1))
2419 else if (c
== '}' && !--n
)
2421 else if (c
== env
->delimiter
|| c
== env
->terminator
)
2427 env
->error
= REG_EBRACE
;
2430 if (*++env
->cursor
!= ')')
2432 env
->error
= REG_EPAREN
;
2437 if (env
->disc
->re_version
< REG_VERSION_EXEC
)
2439 env
->error
= REG_BADRPT
;
2442 if (!env
->disc
->re_execf
)
2444 if (!(e
= node(env
, REX_EXEC
, 0, 0, 0)))
2446 e
->re
.exec
.text
= (const char*)p
;
2447 e
->re
.exec
.size
= env
->cursor
- p
- 2;
2448 if (!env
->disc
->re_compf
)
2449 e
->re
.exec
.data
= 0;
2451 e
->re
.exec
.data
= (*env
->disc
->re_compf
)(env
->regex
, e
->re
.exec
.text
, e
->re
.exec
.size
, env
->disc
);
2453 case '0': case '1': case '2': case '3': case '4':
2454 case '5': case '6': case '7': case '8': case '9':
2456 while (isdigit(*env
->cursor
))
2458 if (c
> (INT_MAX
/ 10))
2460 env
->error
= REG_ESUBREG
;
2463 c
= c
* 10 + *env
->cursor
++ - '0';
2465 if (*env
->cursor
== ')')
2470 if (c
> env
->parno
|| !env
->paren
[c
])
2472 env
->error
= REG_ESUBREG
;
2475 env
->paren
[c
]->re
.group
.back
= 1;
2476 return rep(env
, node(env
, REX_BACK
, c
, 0, 0), 0, 0);
2480 env
->error
= REG_BADRPT
;
2483 if (x
&& !(e
= alt(env
, parno
, 0)))
2487 if (c
!= T_CLOSE
&& (!(env
->flags
& REG_LITERAL
) || c
!= ')'))
2489 env
->error
= REG_EPAREN
;
2496 env
->pattern
= env
->cursor
;
2501 if (!(f
= node(env
, x
, 0, 0, 0)))
2506 f
->re
.group
.expr
.rex
= e
;
2507 if (x
== REX_GROUP_BEHIND
|| x
== REX_GROUP_BEHIND_NOT
)
2513 env
->error
= REG_ECOUNT
;
2516 f
->re
.group
.size
= env
->stats
.m
;
2517 memset(&env
->stats
, 0, sizeof(env
->stats
));
2523 f
= rep(env
, f
, parno
, env
->parno
);
2546 unsigned char buf
[256];
2551 while ((c
= token(env
)) < T_META
&& s
< &buf
[sizeof(buf
) - env
->token
.len
])
2558 *s
++ = (env
->flags
& REG_ICASE
) ? toupper(c
) : c
;
2560 else if (c
== C_ESC
|| (env
->flags
& REG_ICASE
))
2562 c
= (c
== C_ESC
) ? env
->token
.lex
: mbchar(p
);
2563 if (env
->flags
& REG_ICASE
)
2565 if ((&buf
[sizeof(buf
)] - s
) < MB_CUR_MAX
)
2567 if ((n
= mbconv((char*)s
, c
)) < 0)
2579 n
= env
->token
.len
- env
->token
.esc
;
2580 for (t
= p
, u
= s
+ n
; s
< u
; *s
++ = *t
++);
2594 if ((s
-= n
) == buf
)
2599 if (!(e
= node(env
, REX_STRING
, 0, 0, i
)))
2601 memcpy((char*)(e
->re
.string
.base
= (unsigned char*)e
->re
.data
), (char*)buf
, i
);
2602 e
->re
.string
.size
= i
;
2606 if (!(f
= node(env
, REX_ONECHAR
, 1, 1, 0)))
2611 f
->re
.onechar
= (env
->flags
& REG_ICASE
) ? toupper(x
) : x
;
2615 if (!(f
= node(env
, REX_STRING
, 0, 0, n
)))
2617 memcpy((char*)(f
->re
.string
.base
= (unsigned char*)f
->re
.data
), (char*)p
, n
);
2618 f
->re
.string
.size
= n
;
2620 if (!(f
= rep(env
, f
, 0, 0)) || !(f
= cat(env
, f
, seq(env
))))
2630 if (!(e
= node(env
, REX_STRING
, 0, 0, c
)))
2632 memcpy((char*)(e
->re
.string
.base
= (unsigned char*)e
->re
.data
), (char*)buf
, c
);
2633 e
->re
.string
.size
= c
;
2634 return cat(env
, e
, seq(env
));
2636 else if (c
> T_BACK
)
2640 if (c
> env
->parno
|| !env
->paren
[c
])
2642 env
->error
= REG_ESUBREG
;
2645 env
->paren
[c
]->re
.group
.back
= 1;
2646 e
= rep(env
, node(env
, REX_BACK
, c
, 0, 0), 0, 0);
2655 return node(env
, REX_NULL
, 0, 0, 0);
2658 e
= rep(env
, node(env
, REX_END
, 0, 0, 0), 0, 0);
2662 if ((e
= node(env
, REX_BEG
, 0, 0, 0)) && (env
->flags
& REG_EXTENDED
))
2663 e
= rep(env
, e
, 0, 0);
2671 env
->flags
|= REG_MINIMAL
;
2673 if (env
->type
== KRE
)
2675 parno
= ++env
->parno
;
2676 if (!(e
= alt(env
, parno
+ 1, 0)))
2678 if (e
->type
== REX_NULL
&& env
->type
== ERE
&& !(env
->flags
& REG_NULL
))
2681 env
->error
= (*env
->cursor
== 0 || *env
->cursor
== env
->delimiter
|| *env
->cursor
== env
->terminator
) ? REG_EPAREN
: REG_ENULL
;
2684 if (token(env
) != T_CLOSE
)
2687 env
->error
= REG_EPAREN
;
2692 if (!(f
= node(env
, REX_GROUP
, 0, 0, 0)))
2697 if (parno
< elementsof(env
->paren
))
2698 env
->paren
[parno
] = f
;
2699 f
->re
.group
.back
= 0;
2700 f
->re
.group
.number
= parno
;
2701 f
->re
.group
.expr
.rex
= e
;
2707 if (!(e
= rep(env
, f
, parno
, env
->parno
)))
2709 if (env
->type
== KRE
)
2711 if (!(f
= node(env
, REX_GROUP
, 0, 0, 0)))
2716 if (--parno
< elementsof(env
->paren
))
2717 env
->paren
[parno
] = f
;
2718 f
->re
.group
.back
= 0;
2719 f
->re
.group
.number
= parno
;
2720 f
->re
.group
.expr
.rex
= e
;
2731 if (!(e
= grp(env
, env
->parno
+ 1)))
2735 if (env
->literal
== env
->pattern
&& env
->literal
== p
)
2736 env
->literal
= env
->cursor
;
2745 e
= rep(env
, e
, 0, 0);
2754 if (e
= ccl(env
, c
))
2755 e
= rep(env
, e
, 0, 0);
2759 e
= rep(env
, node(env
, REX_WBEG
, 0, 0, 0), 0, 0);
2763 e
= rep(env
, node(env
, REX_WEND
, 0, 0, 0), 0, 0);
2767 e
= rep(env
, node(env
, REX_DOT
, 1, 1, 0), 0, 0);
2771 env
->token
.lex
= T_STAR
;
2772 env
->token
.push
= 1;
2773 e
= rep(env
, node(env
, REX_DOT
, 1, 1, 0), 0, 0);
2777 env
->token
.lex
= T_PLUS
;
2778 env
->token
.push
= 1;
2779 if (e
= node(env
, REX_ONECHAR
, 1, 1, 0))
2781 e
->re
.onechar
= '/';
2782 e
= rep(env
, e
, 0, 0);
2787 e
= rep(env
, node(env
, REX_WORD
, 0, 0, 0), 0, 0);
2791 e
= rep(env
, node(env
, REX_WORD_NOT
, 0, 0, 0), 0, 0);
2795 e
= rep(env
, node(env
, REX_BEG_STR
, 0, 0, 0), 0, 0);
2799 e
= rep(env
, node(env
, REX_END_STR
, 0, 0, 0), 0, 0);
2803 e
= rep(env
, node(env
, REX_FIN_STR
, 0, 0, 0), 0, 0);
2806 env
->error
= REG_BADRPT
;
2809 if (e
&& *env
->cursor
!= 0 && *env
->cursor
!= env
->delimiter
&& *env
->cursor
!= env
->terminator
)
2810 e
= cat(env
, e
, seq(env
));
2822 if (!(e
= seq(env
)) || !(env
->flags
& REG_AUGMENTED
) || token(env
) != T_AND
)
2825 if (!(f
= con(env
)))
2830 if (!(g
= node(env
, REX_CONJ
, 0, 0, 0)))
2836 g
->re
.group
.expr
.binary
.left
= e
;
2837 g
->re
.group
.expr
.binary
.right
= f
;
2842 alt(Cenv_t
* env
, int number
, int cond
)
2848 if (!(e
= con(env
)))
2850 else if (token(env
) != T_BAR
)
2855 if (e
->type
== REX_NULL
)
2861 if (!(f
= alt(env
, number
, 0)))
2866 if ((e
->type
== REX_NULL
|| f
->type
== REX_NULL
) && !(env
->flags
& REG_NULL
))
2868 if (!cond
&& (g
= trie(env
, e
, f
)))
2871 if (!(g
= node(env
, REX_ALT
, 0, 0, 0)))
2873 env
->error
= REG_ESPACE
;
2876 g
->re
.group
.number
= number
;
2877 g
->re
.group
.last
= env
->parno
;
2878 g
->re
.group
.expr
.binary
.left
= e
;
2879 g
->re
.group
.expr
.binary
.right
= f
;
2885 env
->error
= REG_ENULL
;
2890 * add v to REX_BM tables
2894 bmstr(Cenv_t
* env
, register Rex_t
* a
, unsigned char* v
, int n
, Bm_mask_t b
)
2900 for (m
= 0; m
< n
; m
++)
2902 if (!(z
= n
- m
- 1))
2905 a
->re
.bm
.mask
[m
][c
] |= b
;
2906 if (z
== HIT
|| !a
->re
.bm
.skip
[c
] || a
->re
.bm
.skip
[c
] > z
&& a
->re
.bm
.skip
[c
] < HIT
)
2907 a
->re
.bm
.skip
[c
] = z
;
2908 if (a
->flags
& REG_ICASE
)
2912 else if (islower(c
))
2916 a
->re
.bm
.mask
[m
][c
] |= b
;
2917 if (z
== HIT
|| !a
->re
.bm
.skip
[c
] || a
->re
.bm
.skip
[c
] > z
&& a
->re
.bm
.skip
[c
] < HIT
)
2918 a
->re
.bm
.skip
[c
] = z
;
2924 * set up BM table from trie
2928 bmtrie(Cenv_t
* env
, Rex_t
* a
, unsigned char* v
, Trie_node_t
* x
, int n
, int m
, Bm_mask_t b
)
2935 bmstr(env
, a
, v
, n
, b
);
2939 a
->re
.bm
.complete
= 0;
2942 a
->re
.bm
.complete
= 0;
2945 b
= bmtrie(env
, a
, v
, x
->son
, n
, m
+ 1, b
);
2946 } while (x
= x
->sib
);
2951 * rewrite the expression tree for some special cases
2952 * 1. it is a null expression - illegal
2953 * 2. max length fixed string found -- use BM algorithm
2954 * 3. it begins with an unanchored string - use KMP algorithm
2955 * 0 returned on success
2959 special(Cenv_t
* env
, regex_t
* p
)
2973 if (e
= p
->env
->rex
)
2975 if ((x
= env
->stats
.x
) && x
->re
.string
.size
< 3)
2977 if ((t
= env
->stats
.y
) && t
->re
.trie
.min
< 3)
2981 if (x
->re
.string
.size
>= t
->re
.trie
.min
)
2999 n
= m
= x
->re
.string
.size
;
3009 if (!(q
= (size_t*)alloc(env
->disc
, 0, (n
+ 1) * sizeof(size_t))))
3011 if (!(a
= node(env
, REX_BM
, 0, 0, n
* (sizeof(Bm_mask_t
*) + (UCHAR_MAX
+ 1) * sizeof(Bm_mask_t
)) + (UCHAR_MAX
+ n
+ 2) * sizeof(size_t))))
3013 alloc(env
->disc
, q
, 0);
3016 a
->flags
= y
->flags
;
3019 a
->re
.bm
.back
= (y
== e
|| y
== e
->re
.group
.expr
.rex
) ? (m
- n
) : -1;
3020 a
->re
.bm
.left
= l
- 1;
3021 a
->re
.bm
.right
= env
->stats
.m
- l
- n
;
3022 a
->re
.bm
.complete
= (env
->stats
.e
|| y
!= e
&& (e
->type
!= REX_GROUP
|| y
!= e
->re
.group
.expr
.rex
) || e
->next
|| ((a
->re
.bm
.left
+ a
->re
.bm
.right
) >= 0)) ? 0 : n
;
3023 h
= (Bm_mask_t
*)&a
->re
.bm
.mask
[n
];
3024 a
->re
.bm
.skip
= (size_t*)(h
+ n
* (UCHAR_MAX
+ 1));
3025 a
->re
.bm
.fail
= &a
->re
.bm
.skip
[UCHAR_MAX
+ 1];
3026 for (m
= 0; m
<= UCHAR_MAX
; m
++)
3027 a
->re
.bm
.skip
[m
] = n
;
3028 a
->re
.bm
.skip
[0] = a
->re
.bm
.skip
[env
->mappednewline
] = (y
->next
&& y
->next
->type
== REX_END
) ? HIT
: (n
+ a
->re
.bm
.left
);
3029 for (i
= 1; i
<= n
; i
++)
3030 a
->re
.bm
.fail
[i
] = 2 * n
- i
;
3031 mask
= a
->re
.bm
.mask
;
3032 for (m
= 0; m
< n
; m
++)
3038 bmstr(env
, a
, x
->re
.string
.base
, n
, 1);
3041 v
= (unsigned char*)q
;
3044 for (i
= 0; i
<= UCHAR_MAX
; i
++)
3045 if (t
->re
.trie
.root
[i
])
3046 m
= bmtrie(env
, a
, v
, t
->re
.trie
.root
[i
], n
, 0, m
);
3049 memset(q
, 0, n
* sizeof(*q
));
3050 for (k
= (j
= n
) + 1; j
> 0; j
--, k
--)
3052 DEBUG_TEST(0x0010,(sfprintf(sfstderr
, "BM#0: k=%d j=%d\n", k
, j
)),(0));
3053 for (q
[j
] = k
; k
<= n
; k
= q
[k
])
3055 for (m
= 0; m
<= UCHAR_MAX
; m
++)
3056 if (mask
[k
][m
] == mask
[j
][m
])
3058 DEBUG_TEST(0x0010,sfprintf(sfstderr
, "CUT1: mask[%d][%c]=mask[%d][%c]\n", k
, m
, j
, m
), (0));
3061 DEBUG_TEST(0x0010,sfprintf(sfstderr
, "BM#2: fail[%d]=%d => %d\n", k
, a
->re
.bm
.fail
[k
], (a
->re
.bm
.fail
[k
] > n
- j
) ? (n
- j
) : a
->re
.bm
.fail
[k
]), (0));
3062 if (a
->re
.bm
.fail
[k
] > n
- j
)
3063 a
->re
.bm
.fail
[k
] = n
- j
;
3067 for (i
= 1; i
<= n
; i
++)
3068 if (a
->re
.bm
.fail
[i
] > n
+ k
- i
)
3070 DEBUG_TEST(0x0010,sfprintf(sfstderr
, "BM#4: fail[%d]=%d => %d\n", i
, a
->re
.bm
.fail
[i
], n
+ k
- i
), (0));
3071 a
->re
.bm
.fail
[i
] = n
+ k
- i
;
3073 #if _AST_REGEX_DEBUG
3074 if (DEBUG_TEST(0x0020,(1),(0)))
3076 sfprintf(sfstderr
, "STAT: complete=%d n=%d k=%d l=%d r=%d y=%d:%d e=%d:%d\n", a
->re
.bm
.complete
, n
, k
, a
->re
.bm
.left
, a
->re
.bm
.right
, y
->type
, y
->next
? y
->next
->type
: 0, e
->type
, e
->next
? e
->next
->type
: 0);
3077 for (m
= 0; m
< n
; m
++)
3078 for (i
= 1; i
<= UCHAR_MAX
; i
++)
3079 if (a
->re
.bm
.mask
[m
][i
])
3080 sfprintf(sfstderr
, "MASK: [%d]['%c'] = %032..2u\n", m
, i
, a
->re
.bm
.mask
[m
][i
]);
3081 for (i
= ' '; i
<= UCHAR_MAX
; i
++)
3082 if (a
->re
.bm
.skip
[i
] >= HIT
)
3083 sfprintf(sfstderr
, "SKIP: ['%c'] = *\n", i
);
3084 else if (a
->re
.bm
.skip
[i
] > 0 && a
->re
.bm
.skip
[i
] < n
)
3085 sfprintf(sfstderr
, "SKIP: ['%c'] = %3d\n", i
, a
->re
.bm
.skip
[i
]);
3086 for (j
= 31; j
>= 0; j
--)
3088 sfprintf(sfstderr
, " ");
3090 for (m
= 0; m
< n
; m
++)
3092 for (i
= 0040; i
< 0177; i
++)
3093 if (a
->re
.bm
.mask
[m
][i
] & (1 << j
))
3095 sfprintf(sfstderr
, " %c", i
);
3105 sfprintf(sfstderr
, " ?");
3108 sfprintf(sfstderr
, "\n");
3110 sfprintf(sfstderr
, "FAIL: ");
3111 for (m
= 1; m
<= n
; m
++)
3112 sfprintf(sfstderr
, "%3d", a
->re
.bm
.fail
[m
]);
3113 sfprintf(sfstderr
, "\n");
3116 alloc(env
->disc
, q
, 0);
3124 if (env
->flags
& REG_NEWLINE
)
3130 e
= e
->re
.group
.expr
.rex
;
3131 if (e
->type
!= REX_DOT
)
3135 if (e
->lo
== 0 && e
->hi
== RE_DUP_INF
)
3139 if (env
->flags
& REG_NULL
)
3141 env
->error
= REG_ENULL
;
3144 if ((env
->flags
& (REG_LEFT
|REG_LITERAL
|REG_RIGHT
)) || e
->map
)
3146 s
= e
->re
.string
.base
;
3147 n
= e
->re
.string
.size
;
3148 if (!(a
= node(env
, REX_KMP
, 0, 0, n
* (sizeof(int*) + 1))))
3150 a
->flags
= e
->flags
;
3152 f
= a
->re
.string
.fail
;
3153 memcpy((char*)(a
->re
.string
.base
= (unsigned char*)&f
[n
]), (char*)s
, n
);
3154 s
= a
->re
.string
.base
;
3155 a
->re
.string
.size
= n
;
3157 for (k
= 1; k
< n
; k
++)
3159 while (m
>= 0 && s
[m
+1] != s
[k
])
3179 regcomp(regex_t
* p
, const char* pattern
, regflags_t flags
)
3184 unsigned char* fold
;
3190 if (flags
& REG_DISCIPLINE
)
3192 flags
&= ~REG_DISCIPLINE
;
3197 if (!disc
->re_errorlevel
)
3198 disc
->re_errorlevel
= 2;
3201 return fatal(disc
, REG_BADPAT
, pattern
);
3202 if (!state
.initialized
)
3204 state
.initialized
= 1;
3205 for (i
= 0; i
< elementsof(state
.escape
); i
++)
3206 state
.magic
[state
.escape
[i
].key
] = state
.escape
[i
].val
;
3208 if (!(fold
= (unsigned char*)LCINFO(AST_LC_CTYPE
)->data
))
3210 if (!(fold
= newof(0, unsigned char, UCHAR_MAX
, 1)))
3211 return fatal(disc
, REG_ESPACE
, pattern
);
3212 for (i
= 0; i
<= UCHAR_MAX
; i
++)
3213 fold
[i
] = toupper(i
);
3214 LCINFO(AST_LC_CTYPE
)->data
= (void*)fold
;
3217 if (!(p
->env
= (Env_t
*)alloc(disc
, 0, sizeof(Env_t
))))
3218 return fatal(disc
, REG_ESPACE
, pattern
);
3219 memset(p
->env
, 0, sizeof(*p
->env
));
3220 memset(&env
, 0, sizeof(env
));
3223 env
.disc
= p
->env
->disc
= disc
;
3224 if (env
.flags
& REG_AUGMENTED
)
3225 env
.flags
|= REG_EXTENDED
;
3226 env
.mappeddot
= '.';
3227 env
.mappednewline
= '\n';
3228 env
.mappedslash
= '/';
3229 if (disc
->re_version
>= REG_VERSION_MAP
&& disc
->re_map
)
3231 env
.map
= disc
->re_map
;
3232 env
.MAP
= p
->env
->fold
;
3233 for (i
= 0; i
<= UCHAR_MAX
; i
++)
3235 env
.MAP
[i
] = fold
[env
.map
[i
]];
3236 if (env
.map
[i
] == '.')
3238 if (env
.map
[i
] == '\n')
3239 env
.mappednewline
= i
;
3240 if (env
.map
[i
] == '/')
3241 env
.mappedslash
= i
;
3246 env
.type
= (env
.flags
& REG_AUGMENTED
) ? ARE
: (env
.flags
& REG_EXTENDED
) ? ERE
: BRE
;
3248 if (env
.flags
& REG_SHELL
)
3250 if (env
.flags
& REG_SHELL_PATH
)
3251 env
.explicit = env
.mappedslash
;
3252 env
.flags
|= REG_LENIENT
|REG_NULL
;
3253 env
.type
= env
.type
== BRE
? SRE
: KRE
;
3255 if ((env
.flags
& (REG_NEWLINE
|REG_SPAN
)) == REG_NEWLINE
)
3256 env
.explicit = env
.mappednewline
;
3257 p
->env
->leading
= (env
.flags
& REG_SHELL_DOT
) ? env
.mappeddot
: -1;
3258 env
.posixkludge
= !(env
.flags
& (REG_EXTENDED
|REG_SHELL
));
3259 env
.regexp
= !!(env
.flags
& REG_REGEXP
);
3262 if (env
.flags
& REG_DELIMITED
)
3264 switch (env
.delimiter
= *pattern
++)
3270 env
.error
= REG_EDELIM
;
3273 env
.terminator
= '\n';
3275 env
.literal
= env
.pattern
= env
.cursor
= (unsigned char*)pattern
;
3276 if (!(p
->env
->rex
= alt(&env
, 1, 0)))
3280 env
.error
= REG_EPAREN
;
3283 p
->env
->stats
.re_flags
= env
.flags
& (REG_EXTENDED
|REG_AUGMENTED
|REG_SHELL
);
3284 if (env
.flags
& REG_LEFT
)
3286 if (p
->env
->rex
->type
!= REX_BEG
)
3288 if (p
->env
->rex
->type
== REX_ALT
)
3289 env
.flags
&= ~REG_FIRST
;
3290 if (!(e
= node(&env
, REX_BEG
, 0, 0, 0)))
3293 return fatal(disc
, REG_ESPACE
, pattern
);
3295 e
->next
= p
->env
->rex
;
3299 p
->env
->stats
.re_flags
|= REG_LEFT
;
3301 for (e
= p
->env
->rex
; e
->next
; e
= e
->next
);
3302 p
->env
->done
.type
= REX_DONE
;
3303 p
->env
->done
.flags
= e
->flags
;
3304 if (env
.flags
& REG_RIGHT
)
3306 if (e
->type
!= REX_END
)
3308 if (p
->env
->rex
->type
== REX_ALT
)
3309 env
.flags
&= ~REG_FIRST
;
3310 if (!(f
= node(&env
, REX_END
, 0, 0, 0)))
3313 return fatal(disc
, REG_ESPACE
, pattern
);
3315 f
->flags
= e
->flags
;
3319 p
->env
->stats
.re_flags
|= REG_RIGHT
;
3321 if (stats(&env
, p
->env
->rex
))
3324 env
.error
= REG_ECOUNT
;
3328 p
->env
->hard
= p
->env
->separate
= 1;
3329 else if (!(env
.flags
& REG_FIRST
) && (env
.stats
.a
|| env
.stats
.c
> 1 && env
.stats
.c
!= env
.stats
.s
|| env
.stats
.t
&& (env
.stats
.t
> 1 || env
.stats
.a
|| env
.stats
.c
)))
3331 if (p
->env
->hard
|| env
.stats
.c
|| env
.stats
.i
)
3332 p
->env
->stats
.re_min
= p
->env
->stats
.re_max
= -1;
3335 if (!(p
->env
->stats
.re_min
= env
.stats
.m
))
3336 p
->env
->stats
.re_min
= -1;
3337 if (!(p
->env
->stats
.re_max
= env
.stats
.n
))
3338 p
->env
->stats
.re_max
= -1;
3340 if (special(&env
, p
))
3342 serialize(&env
, p
->env
->rex
, 1);
3343 p
->re_nsub
= env
.stats
.p
;
3344 if (env
.type
== KRE
)
3346 if (env
.flags
& REG_DELIMITED
)
3348 p
->re_npat
= env
.cursor
- env
.pattern
+ 1;
3349 if (*env
.cursor
== env
.delimiter
)
3351 else if (env
.flags
& REG_MUSTDELIM
)
3353 env
.error
= REG_EDELIM
;
3357 env
.flags
&= ~REG_DELIMITED
;
3359 p
->env
->explicit = env
.explicit;
3360 p
->env
->flags
= env
.flags
& REG_COMP
;
3361 p
->env
->min
= env
.stats
.m
;
3362 p
->env
->nsub
= env
.stats
.p
+ env
.stats
.u
;
3368 env
.error
= REG_ESPACE
;
3369 if (env
.type
>= SRE
&& env
.error
!= REG_ESPACE
&& !(flags
& REG_LITERAL
))
3371 flags
|= REG_LITERAL
;
3372 pattern
= (const char*)env
.literal
;
3375 return fatal(disc
, env
.error
, pattern
);
3379 * regcomp() on sized pattern
3380 * the lazy way around adding and checking env.end
3384 regncomp(regex_t
* p
, const char* pattern
, size_t size
, regflags_t flags
)
3389 if (!(s
= malloc(size
+ 1)))
3390 return fatal((flags
& REG_DISCIPLINE
) ? p
->re_disc
: &state
.disc
, REG_ESPACE
, pattern
);
3391 memcpy(s
, pattern
, size
);
3393 r
= regcomp(p
, s
, flags
);
3399 * combine two compiled regular expressions if possible,
3400 * replacing first with the combination and freeing second.
3401 * return 0 on success.
3402 * the only combinations handled are building a Trie
3403 * from String|Kmp|Trie and String|Kmp
3407 regcomb(regex_t
* p
, regex_t
* q
)
3409 Rex_t
* e
= p
->env
->rex
;
3410 Rex_t
* f
= q
->env
->rex
;
3416 return fatal(p
->env
->disc
, REG_BADPAT
, NiL
);
3417 if (p
->env
->separate
|| q
->env
->separate
)
3419 memset(&env
, 0, sizeof(env
));
3420 env
.disc
= p
->env
->disc
;
3421 if (e
->type
== REX_BM
)
3423 p
->env
->rex
= e
->next
;
3428 if (f
->type
== REX_BM
)
3430 q
->env
->rex
= f
->next
;
3435 if (e
->type
== REX_BEG
&& f
->type
== REX_BEG
)
3437 p
->env
->flags
|= REG_LEFT
;
3438 p
->env
->rex
= e
->next
;
3442 q
->env
->rex
= f
->next
;
3447 for (g
= e
; g
->next
; g
= g
->next
);
3448 for (h
= f
; h
->next
; h
= h
->next
);
3449 if (g
->next
&& g
->next
->type
== REX_END
&& h
->next
&& h
->next
->type
== REX_END
)
3451 p
->env
->flags
|= REG_RIGHT
;
3452 drop(env
.disc
, g
->next
);
3454 drop(env
.disc
, h
->next
);
3457 if (!(g
= trie(&env
, f
, e
)))
3458 return fatal(p
->env
->disc
, REG_BADPAT
, NiL
);
3463 if (p
->env
->flags
& REG_LEFT
)
3465 if (!(e
= node(&env
, REX_BEG
, 0, 0, 0)))
3468 return fatal(p
->env
->disc
, REG_ESPACE
, NiL
);
3470 e
->next
= p
->env
->rex
;
3474 if (p
->env
->flags
& REG_RIGHT
)
3476 for (f
= p
->env
->rex
; f
->next
; f
= f
->next
);
3477 if (f
->type
!= REX_END
)
3479 if (!(e
= node(&env
, REX_END
, 0, 0, 0)))
3482 return fatal(p
->env
->disc
, REG_ESPACE
, NiL
);
3487 env
.explicit = p
->env
->explicit;
3488 env
.flags
= p
->env
->flags
;
3489 env
.disc
= p
->env
->disc
;
3490 if (stats(&env
, p
->env
->rex
))
3493 return fatal(p
->env
->disc
, env
.error
? env
.error
: REG_ECOUNT
, NiL
);
3495 if (special(&env
, p
))
3498 return fatal(p
->env
->disc
, env
.error
? env
.error
: REG_ESPACE
, NiL
);
3500 p
->env
->min
= g
->re
.trie
.min
;
3505 * copy a reference of p into q
3506 * p and q may then have separate regsubcomp() instantiations
3510 regdup(regex_t
* p
, regex_t
* q
)