1 /***********************************************************************
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
13 * Information and Software Systems Research *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
21 ***********************************************************************/
25 * posix regex executor
26 * single sized-record interface
33 #define DEBUG_TEST(f,y,n) ((debug&(debug_flag=f))?(y):(n))
34 #define DEBUG_CODE(f,y,n) do if(debug&(f)){y}else{n} while(0)
35 #define DEBUG_INIT() do { char* t; if (!debug) { debug = 0x80000000; if (t = getenv("_AST_regex_exec_debug")) debug |= strtoul(t, NiL, 0); } } while (0)
37 static unsigned long debug
;
38 static unsigned long debug_flag
;
40 static const char* rexnames
[] =
64 "REX_GROUP_AHEAD_CATCH",
65 "REX_GROUP_AHEAD_NOT",
67 "REX_GROUP_BEHIND_CATCH",
68 "REX_GROUP_BEHIND_NOT",
69 "REX_GROUP_BEHIND_NOT_CATCH",
71 "REX_GROUP_COND_CATCH",
73 "REX_GROUP_CUT_CATCH",
89 static const char* rexname(Rex_t
* rex
)
93 if (rex
->type
>= elementsof(rexnames
))
95 return rexnames
[rex
->type
];
101 #define DEBUG_TEST(f,y,n) (n)
102 #define DEBUG_CODE(f,y,n) do {n} while(0)
106 #define BEG_ALT 1 /* beginning of an alt */
107 #define BEG_ONE 2 /* beginning of one iteration of a rep */
108 #define BEG_REP 3 /* beginning of a repetition */
109 #define BEG_SUB 4 /* beginning of a subexpression */
110 #define END_ANY 5 /* end of any of above */
113 * returns from parse()
116 #define NONE 0 /* no parse found */
117 #define GOOD 1 /* some parse was found */
118 #define CUT 2 /* no match and no backtrack */
119 #define BEST 3 /* an unbeatable parse was found */
120 #define BAD 4 /* error ocurred */
126 #define LEADING(e,r,s) (*(s)==(e)->leading&&((s)==(e)->beg||*((s)-1)==(r)->explicit))
129 * Pos_t is for comparing parses. An entry is made in the
130 * array at the beginning and at the end of each Group_t,
131 * each iteration in a Group_t, and each Binary_t.
136 unsigned char* p
; /* where in string */
137 size_t length
; /* length in string */
138 short serial
; /* preorder subpattern number */
139 short be
; /* which end of pair */
142 /* ===== begin library support ===== */
144 #define vector(t,v,i) (((i)<(v)->max)?(t*)((v)->vec+(i)*(v)->siz):(t*)vecseek(&(v),i))
147 vecopen(int inc
, int siz
)
154 if (!(sp
= stkopen(STK_SMALL
|STK_NULL
)))
156 if (!(v
= (Vector_t
*)stkseek(sp
, sizeof(Vector_t
) + inc
* siz
)))
162 v
->vec
= (char*)v
+ sizeof(Vector_t
);
163 v
->max
= v
->inc
= inc
;
170 vecseek(Vector_t
** p
, int index
)
176 while ((v
->max
+= v
->inc
) <= index
);
177 if (!(v
= (Vector_t
*)stkseek(v
->stk
, sizeof(Vector_t
) + v
->max
* v
->siz
)))
180 v
->vec
= (char*)v
+ sizeof(Vector_t
);
182 return v
->vec
+ index
* v
->siz
;
186 vecclose(Vector_t
* v
)
198 #define stknew(s,p) ((p)->offset=stktell(s),(p)->base=stkfreeze(s,0))
199 #define stkold(s,p) stkset(s,(p)->base,(p)->offset)
201 #define stkframe(s) (*((Stk_frame_t**)stktop(s)-1))
202 #define stkdata(s,t) ((t*)stkframe(s)->data)
203 #define stkpop(s) stkold(s,&(stkframe(s)->pos))
206 stkpush(Stk_t
* sp
, size_t size
)
212 size
= sizeof(Stk_frame_t
) + sizeof(size_t) + size
- 1;
213 if (!(f
= (Stk_frame_t
*)stkalloc(sp
, sizeof(Stk_frame_t
) + sizeof(Stk_frame_t
*) + size
- 1)))
220 /* ===== end library support ===== */
223 * Match_frame_t is for saving and restoring match records
224 * around alternate attempts, so that fossils will not be
225 * left in the match array. These are the only entries in
226 * the match array that are not otherwise guaranteed to
227 * have current data in them when they get used.
237 #define matchframe stkdata(stkstd,Match_frame_t)
238 #define matchpush(e,x) ((x)->re.group.number?_matchpush(e,x):0)
239 #define matchcopy(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size):(void*)0)
240 #define matchpop(e,x) ((x)->re.group.number?memcpy(matchframe->match,matchframe->save,matchframe->size),stkpop(stkstd):(void*)0)
242 #define pospop(e) (--(e)->pos->cur)
245 * allocate a frame and push a match onto the stack
249 _matchpush(Env_t
* env
, Rex_t
* rex
)
257 if (rex
->re
.group
.number
<= 0 || (num
= rex
->re
.group
.last
- rex
->re
.group
.number
+ 1) <= 0)
259 if (!(f
= (Match_frame_t
*)stkpush(stkstd
, sizeof(Match_frame_t
) + (num
- 1) * sizeof(regmatch_t
))))
261 env
->error
= REG_ESPACE
;
264 f
->size
= num
* sizeof(regmatch_t
);
265 f
->match
= m
= env
->match
+ rex
->re
.group
.number
;
271 *m
++ = state
.nomatch
;
277 * allocate a frame and push a pos onto the stack
281 pospush(Env_t
* env
, Rex_t
* rex
, unsigned char* p
, int be
)
285 if (!(pos
= vector(Pos_t
, env
->pos
, env
->pos
->cur
)))
287 env
->error
= REG_ESPACE
;
290 pos
->serial
= rex
->serial
;
298 * two matches are known to have the same length
299 * os is start of old pos array, ns is start of new,
300 * oend and nend are end+1 pointers to ends of arrays.
301 * oe and ne are ends (not end+1) of subarrays.
302 * returns 1 if new is better, -1 if old, else 0.
306 better(Env_t
* env
, Pos_t
* os
, Pos_t
* ns
, Pos_t
* oend
, Pos_t
* nend
, int level
)
317 DEBUG_CODE(0x0080,{sfprintf(sfstdout
, " %-*.*sold ", (level
+ 3) * 4, (level
+ 3) * 4, "");for (oe
= os
; oe
< oend
; oe
++)sfprintf(sfstdout
, "<%d,%d,%d>", oe
->p
- env
->beg
, oe
->serial
, oe
->be
);sfprintf(sfstdout
, "\n %-*.*snew ", (level
+ 3) * 4, (level
+ 3) * 4, "");for (oe
= ns
; oe
< nend
; oe
++)sfprintf(sfstdout
, "<%d,%d,%d>", oe
->p
- env
->beg
, oe
->serial
, oe
->be
);sfprintf(sfstdout
, "\n");},{0;});
319 return DEBUG_TEST(0x8000,(os
< oend
),(0));
321 return DEBUG_TEST(0x8000,(-1),(1));
327 env
->error
= REG_PANIC
;
337 if ((++oe
)->serial
== n
)
339 if (oe
->be
!= END_ANY
)
347 if ((++ne
)->serial
== n
)
349 if (ne
->be
!= END_ANY
)
358 if (k
= better(env
, os
+ 1, ns
+ 1, oe
, ne
, level
+ 1))
368 showmatch(regmatch_t
* p
)
370 sfputc(sfstdout
, '(');
372 sfputc(sfstdout
, '?');
374 sfprintf(sfstdout
, "%d", p
->rm_so
);
375 sfputc(sfstdout
, ',');
377 sfputc(sfstdout
, '?');
379 sfprintf(sfstdout
, "%d", p
->rm_eo
);
380 sfputc(sfstdout
, ')');
384 _better(Env_t
* env
, Pos_t
* os
, Pos_t
* ns
, Pos_t
* oend
, Pos_t
* nend
, int level
)
388 DEBUG_CODE(0x0040,{sfprintf(sfstdout
, "AHA better old ");for (i
= 0; i
<= env
->nsub
; i
++)showmatch(&env
->best
[i
]);sfprintf(sfstdout
, "\n new ");for (i
= 0; i
<= env
->nsub
; i
++)showmatch(&env
->match
[i
]);sfprintf(sfstdout
, "\n");},{0;});
389 i
= better(env
, os
, ns
, oend
, nend
, 0);
390 DEBUG_TEST(0x0040,(sfprintf(sfstdout
, " %s\n", i
<= 0 ? "OLD" : "NEW")),(0));
394 #define better _better
398 #define follow(e,r,c,s) ((r)->next?parse(e,(r)->next,c,s):(c)?parse(e,c,0,s):BEST)
400 static int parse(Env_t
*, Rex_t
*, Rex_t
*, unsigned char*);
403 parserep(Env_t
* env
, Rex_t
* rex
, Rex_t
* cont
, unsigned char* s
, int n
)
409 DEBUG_TEST(0x0010,(sfprintf(sfstdout
, "AHA#%04d 0x%04x parserep %s %d %d %d %d `%-.*s'\n", __LINE__
, debug_flag
, rexname(rex
->re
.group
.expr
.rex
), rex
->re
.group
.number
, rex
->lo
, n
, rex
->hi
, env
->end
- s
, s
)),(0));
410 if ((rex
->flags
& REG_MINIMAL
) && n
>= rex
->lo
&& n
< rex
->hi
)
412 if (env
->stack
&& pospush(env
, rex
, s
, END_ANY
))
414 i
= follow(env
, rex
, cont
, s
);
430 catcher
.type
= REX_REP_CATCH
;
431 catcher
.serial
= rex
->serial
;
432 catcher
.re
.rep_catch
.ref
= rex
;
433 catcher
.re
.rep_catch
.cont
= cont
;
434 catcher
.re
.rep_catch
.beg
= s
;
435 catcher
.re
.rep_catch
.n
= n
+ 1;
436 catcher
.next
= rex
->next
;
438 rex
->re
.rep_catch
.beg
= s
;
441 if (matchpush(env
, rex
))
443 if (pospush(env
, rex
, s
, BEG_ONE
))
445 DEBUG_TEST(0x0004,(sfprintf(sfstdout
,"AHA#%04d 0x%04x PUSH %d (%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)(%d,%d)\n", __LINE__
, debug_flag
, rex
->re
.group
.number
, env
->best
[0].rm_so
, env
->best
[0].rm_eo
, env
->best
[1].rm_so
, env
->best
[1].rm_eo
, env
->best
[2].rm_so
, env
->best
[2].rm_eo
, env
->match
[0].rm_so
, env
->match
[0].rm_eo
, env
->match
[1].rm_so
, env
->match
[1].rm_eo
, env
->match
[2].rm_so
, env
->match
[2].rm_eo
)),(0));
447 r
= parse(env
, rex
->re
.group
.expr
.rex
, &catcher
, s
);
448 DEBUG_TEST(0x0010,(sfprintf(sfstdout
, "AHA#%04d 0x%04x parserep parse %d %d `%-.*s'\n", __LINE__
, debug_flag
, rex
->re
.group
.number
, r
, env
->end
- s
, s
)),(0));
453 DEBUG_TEST(0x0004,(sfprintf(sfstdout
,"AHA#%04d 0x%04x POP %d %d (%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)(%d,%d)\n", __LINE__
, debug_flag
, rex
->re
.group
.number
, r
, env
->best
[0].rm_so
, env
->best
[0].rm_eo
, env
->best
[1].rm_so
, env
->best
[1].rm_eo
, env
->best
[2].rm_so
, env
->best
[2].rm_eo
, env
->match
[0].rm_so
, env
->match
[0].rm_eo
, env
->match
[1].rm_so
, env
->match
[1].rm_eo
, env
->match
[2].rm_so
, env
->match
[2].rm_eo
)),(0));
465 if (rex
->flags
& REG_MINIMAL
)
473 if (!(rex
->flags
& REG_MINIMAL
) || n
>= rex
->hi
)
475 if (env
->stack
&& pospush(env
, rex
, s
, END_ANY
))
477 i
= follow(env
, rex
, cont
, s
);
492 r
= (rex
->flags
& REG_MINIMAL
) ? BEST
: GOOD
;
500 parsetrie(Env_t
* env
, Trie_node_t
* x
, Rex_t
* rex
, Rex_t
* cont
, unsigned char* s
)
511 while (x
->c
!= p
[*s
])
536 if (rex
->flags
& REG_MINIMAL
)
537 switch (follow(env
, rex
, cont
, s
))
548 switch (parsetrie(env
, x
->son
, rex
, cont
, s
))
557 if (rex
->flags
& REG_MINIMAL
)
567 if (!(rex
->flags
& REG_MINIMAL
))
568 switch (follow(env
, rex
, cont
, s
))
583 collelt(register Celt_t
* ce
, char* key
, int c
, int x
)
587 mbxfrm(elt
, key
, COLL_KEY_MAX
);
593 if (!x
&& (*ce
->fun
)(c
))
597 if (!strcmp((char*)ce
->beg
, (char*)elt
))
601 if (strcmp((char*)ce
->beg
, (char*)elt
) <= ce
->min
&& strcmp((char*)elt
, (char*)ce
->end
) <= ce
->max
)
605 if (strcmp((char*)ce
->beg
, (char*)elt
) <= ce
->min
&& strcmp((char*)elt
, (char*)ce
->end
) <= ce
->max
&& (iswlower(c
) || !iswupper(c
)))
609 if (strcmp((char*)ce
->beg
, (char*)elt
) <= ce
->min
&& strcmp((char*)elt
, (char*)ce
->end
) <= ce
->max
&& (iswupper(c
) || !iswlower(c
)))
619 collic(register Celt_t
* ce
, char* key
, register char* nxt
, int c
, int x
)
623 if (collelt(ce
, key
, c
, x
))
627 else if (iswupper(c
))
633 return collelt(ce
, key
, c
, 0);
637 if (collic(ce
, key
, nxt
+ 1, c
, x
))
640 *nxt
= toupper(*nxt
);
641 else if (isupper(*nxt
))
642 *nxt
= tolower(*nxt
);
647 return collelt(ce
, key
, c
, x
);
651 collmatch(Rex_t
* rex
, unsigned char* s
, unsigned char* e
, unsigned char** p
)
662 ic
= (rex
->flags
& REG_ICASE
);
663 if ((w
= MBSIZE(s
)) > 1)
665 memcpy((char*)key
, (char*)s
, w
);
677 if (ic
&& isupper(c
))
684 if (x
> COLL_KEY_MAX
)
691 r
= mbxfrm(elt
, key
, COLL_KEY_MAX
);
692 if (ic
&& isupper(c
))
696 if (mbxfrm(elt
, key
, COLL_KEY_MAX
) != r
)
708 if (ic
? collic(rex
->re
.collate
.elements
, (char*)key
, (char*)key
, c
, x
) : collelt(rex
->re
.collate
.elements
, (char*)key
, c
, x
))
719 return rex
->re
.collate
.invert
? !r
: r
;
722 static unsigned char*
723 nestmatch(register unsigned char* s
, register unsigned char* e
, const unsigned short* type
, register int co
)
730 if (type
[co
] & (REX_NEST_literal
|REX_NEST_quote
))
732 n
= (type
[co
] & REX_NEST_literal
) ? REX_NEST_terminator
: (REX_NEST_escape
|REX_NEST_terminator
);
738 else if (type
[c
] & n
)
740 if (s
>= e
|| (type
[c
] & REX_NEST_terminator
))
748 cc
= type
[co
] >> REX_NEST_SHIFT
;
749 oc
= type
[co
] & (REX_NEST_open
|REX_NEST_close
);
754 switch (type
[c
] & (REX_NEST_escape
|REX_NEST_open
|REX_NEST_close
|REX_NEST_delimiter
|REX_NEST_separator
|REX_NEST_terminator
))
756 case REX_NEST_delimiter
:
757 case REX_NEST_terminator
:
759 case REX_NEST_separator
:
763 case REX_NEST_escape
:
768 case REX_NEST_open
|REX_NEST_close
:
781 else if (!(s
= nestmatch(s
, e
, type
, c
)))
792 return (oc
|| !(type
[UCHAR_MAX
+1] & REX_NEST_terminator
)) ? 0 : s
;
798 parse(Env_t
* env
, Rex_t
* rex
, Rex_t
* cont
, unsigned char* s
)
820 DEBUG_TEST(0x0008,(sfprintf(sfstdout
, "AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__
, debug_flag
, rexname(rex
), env
->end
- s
, s
)),(0));
826 if (matchpush(env
, rex
))
828 if (pospush(env
, rex
, s
, BEG_ALT
))
830 catcher
.type
= REX_ALT_CATCH
;
831 catcher
.serial
= rex
->serial
;
832 catcher
.re
.alt_catch
.cont
= cont
;
833 catcher
.next
= rex
->next
;
834 r
= parse(env
, rex
->re
.group
.expr
.binary
.left
, &catcher
, s
);
835 if (r
< BEST
|| (rex
->flags
& REG_MINIMAL
))
838 ((Pos_t
*)env
->pos
->vec
+ env
->pos
->cur
- 1)->serial
= catcher
.serial
= rex
->re
.group
.expr
.binary
.serial
;
839 n
= parse(env
, rex
->re
.group
.expr
.binary
.right
, &catcher
, s
);
848 if ((r
= parse(env
, rex
->re
.group
.expr
.binary
.left
, cont
, s
)) == NONE
)
849 r
= parse(env
, rex
->re
.group
.expr
.binary
.right
, cont
, s
);
855 if (pospush(env
, rex
, s
, END_ANY
))
857 r
= follow(env
, rex
, rex
->re
.alt_catch
.cont
, s
);
861 o
= &env
->match
[rex
->lo
];
864 i
= o
->rm_eo
- o
->rm_so
;
868 t
= env
->beg
+ o
->rm_so
;
878 if (p
[*s
++] != p
[*t
++])
887 if (towupper(c
) != towupper(d
))
893 if ((!(rex
->flags
& REG_NEWLINE
) || s
<= env
->beg
|| *(s
- 1) != '\n') && ((env
->flags
& REG_NOTBOL
) || s
!= env
->beg
))
897 if (LEADING(env
, rex
, s
))
900 if (n
> env
->end
- s
)
906 if (!(rex
->flags
& REG_MINIMAL
))
908 for (i
= 0; i
< n
; i
++)
909 if (!settst(rex
->re
.charclass
, s
[i
]))
914 for (s
+= n
; n
-- >= m
; s
--)
915 switch (follow(env
, rex
, cont
, s
))
930 for (e
= s
+ m
; s
< e
; s
++)
931 if (!settst(rex
->re
.charclass
, *s
))
936 switch (follow(env
, rex
, cont
, s
))
946 if (s
>= e
|| !settst(rex
->re
.charclass
, *s
))
953 if (LEADING(env
, rex
, s
))
956 if (n
> env
->end
- s
)
963 if (!(rex
->flags
& REG_MINIMAL
))
965 if (!(b
= (unsigned char*)stkpush(stkstd
, n
)))
967 env
->error
= REG_ESPACE
;
970 for (i
= 0; s
< e
&& i
< n
&& collmatch(rex
, s
, e
, &t
); i
++)
975 for (; i
-- >= rex
->lo
; s
-= b
[i
])
976 switch (follow(env
, rex
, cont
, s
))
995 for (i
= 0; i
< m
&& s
< e
; i
++, s
= t
)
996 if (!collmatch(rex
, s
, e
, &t
))
1000 switch (follow(env
, rex
, cont
, s
))
1010 if (s
>= e
|| !collmatch(rex
, s
, e
, &s
))
1016 next
.type
= REX_CONJ_RIGHT
;
1017 next
.re
.conj_right
.cont
= cont
;
1018 next
.next
= rex
->next
;
1019 catcher
.type
= REX_CONJ_LEFT
;
1020 catcher
.re
.conj_left
.right
= rex
->re
.group
.expr
.binary
.right
;
1021 catcher
.re
.conj_left
.cont
= &next
;
1022 catcher
.re
.conj_left
.beg
= s
;
1024 return parse(env
, rex
->re
.group
.expr
.binary
.left
, &catcher
, s
);
1026 rex
->re
.conj_left
.cont
->re
.conj_right
.end
= s
;
1027 cont
= rex
->re
.conj_left
.cont
;
1028 s
= rex
->re
.conj_left
.beg
;
1029 rex
= rex
->re
.conj_left
.right
;
1031 case REX_CONJ_RIGHT
:
1032 if (rex
->re
.conj_right
.end
!= s
)
1034 cont
= rex
->re
.conj_right
.cont
;
1041 DEBUG_TEST(0x0100,(sfprintf(sfstdout
,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__
, debug_flag
, rexname(rex
), env
->best
[0].rm_so
, env
->best
[0].rm_eo
, env
->best
[1].rm_so
, env
->best
[1].rm_eo
, env
->best
[2].rm_so
, env
->best
[2].rm_eo
, env
->best
[3].rm_so
, env
->best
[3].rm_eo
, env
->match
[0].rm_so
, env
->match
[0].rm_eo
, env
->match
[1].rm_so
, env
->match
[1].rm_eo
)),(0));
1042 if ((i
= env
->best
[0].rm_eo
) >= 0)
1044 if (rex
->flags
& REG_MINIMAL
)
1054 if (n
== i
&& better(env
,
1055 (Pos_t
*)env
->bestpos
->vec
,
1056 (Pos_t
*)env
->pos
->vec
,
1057 (Pos_t
*)env
->bestpos
->vec
+env
->bestpos
->cur
,
1058 (Pos_t
*)env
->pos
->vec
+env
->pos
->cur
,
1062 env
->best
[0].rm_eo
= n
;
1063 memcpy(&env
->best
[1], &env
->match
[1], r
* sizeof(regmatch_t
));
1065 if (!vector(Pos_t
, env
->bestpos
, n
))
1067 env
->error
= REG_ESPACE
;
1070 env
->bestpos
->cur
= n
;
1071 memcpy(env
->bestpos
->vec
, env
->pos
->vec
, n
* sizeof(Pos_t
));
1072 DEBUG_TEST(0x0100,(sfprintf(sfstdout
,"AHA#%04d 0x%04x %s (%d,%d)(%d,%d)(%d,%d)(%d,%d) (%d,%d)(%d,%d)\n", __LINE__
, debug_flag
, rexname(rex
), env
->best
[0].rm_so
, env
->best
[0].rm_eo
, env
->best
[1].rm_so
, env
->best
[1].rm_eo
, env
->best
[2].rm_so
, env
->best
[2].rm_eo
, env
->best
[3].rm_so
, env
->best
[3].rm_eo
, env
->match
[0].rm_so
, env
->match
[0].rm_eo
, env
->match
[1].rm_so
, env
->match
[1].rm_eo
)),(0));
1075 if (LEADING(env
, rex
, s
))
1078 if (n
> env
->end
- s
)
1083 if ((c
= rex
->explicit) >= 0 && !mbwide())
1084 for (i
= 0; i
< n
; i
++)
1091 if (!(rex
->flags
& REG_MINIMAL
))
1095 for (s
+= n
; n
-- >= m
; s
--)
1096 switch (follow(env
, rex
, cont
, s
))
1111 if (!(b
= (unsigned char*)stkpush(stkstd
, n
)))
1113 env
->error
= REG_ESPACE
;
1117 for (i
= 0; s
< e
&& i
< n
&& *s
!= c
; i
++)
1118 s
+= b
[i
] = MBSIZE(s
);
1119 for (; i
-- >= m
; s
-= b
[i
])
1120 switch (follow(env
, rex
, cont
, s
))
1143 for (s
+= m
; s
<= e
; s
++)
1144 switch (follow(env
, rex
, cont
, s
))
1158 for (i
= 0; s
< e
&& i
< m
&& *s
!= c
; i
++)
1161 for (; s
<= e
&& i
<= n
; s
+= MBSIZE(s
), i
++)
1162 switch (follow(env
, rex
, cont
, s
))
1176 if ((!(rex
->flags
& REG_NEWLINE
) || *s
!= '\n') && ((env
->flags
& REG_NOTEOL
) || s
< env
->end
))
1180 DEBUG_TEST(0x0200,(sfprintf(sfstdout
,"AHA#%04d 0x%04x parse %s `%-.*s'\n", __LINE__
, debug_flag
, rexname(rex
), env
->end
- s
, s
)),(0));
1183 if (rex
->re
.group
.number
)
1184 env
->match
[rex
->re
.group
.number
].rm_so
= s
- env
->beg
;
1185 if (pospush(env
, rex
, s
, BEG_SUB
))
1187 catcher
.re
.group_catch
.eo
= rex
->re
.group
.number
? &env
->match
[rex
->re
.group
.number
].rm_eo
: (regoff_t
*)0;
1189 catcher
.type
= REX_GROUP_CATCH
;
1190 catcher
.serial
= rex
->serial
;
1191 catcher
.re
.group_catch
.cont
= cont
;
1192 catcher
.next
= rex
->next
;
1193 r
= parse(env
, rex
->re
.group
.expr
.rex
, &catcher
, s
);
1197 if (rex
->re
.group
.number
)
1198 env
->match
[rex
->re
.group
.number
].rm_so
= -1;
1201 case REX_GROUP_CATCH
:
1202 DEBUG_TEST(0x0200,(sfprintf(sfstdout
,"AHA#%04d 0x%04x parse %s=>%s `%-.*s'\n", __LINE__
, debug_flag
, rexname(rex
), rexname(rex
->re
.group_catch
.cont
), env
->end
- s
, s
)),(0));
1205 if (rex
->re
.group_catch
.eo
)
1206 *rex
->re
.group_catch
.eo
= s
- env
->beg
;
1207 if (pospush(env
, rex
, s
, END_ANY
))
1210 r
= follow(env
, rex
, rex
->re
.group_catch
.cont
, s
);
1214 if (rex
->re
.group_catch
.eo
)
1215 *rex
->re
.group_catch
.eo
= -1;
1218 case REX_GROUP_AHEAD
:
1219 catcher
.type
= REX_GROUP_AHEAD_CATCH
;
1220 catcher
.flags
= rex
->flags
;
1221 catcher
.serial
= rex
->serial
;
1222 catcher
.re
.rep_catch
.beg
= s
;
1223 catcher
.re
.rep_catch
.cont
= cont
;
1224 catcher
.next
= rex
->next
;
1225 return parse(env
, rex
->re
.group
.expr
.rex
, &catcher
, s
);
1226 case REX_GROUP_AHEAD_CATCH
:
1227 return follow(env
, rex
, rex
->re
.rep_catch
.cont
, rex
->re
.rep_catch
.beg
);
1228 case REX_GROUP_AHEAD_NOT
:
1229 r
= parse(env
, rex
->re
.group
.expr
.rex
, NiL
, s
);
1231 r
= follow(env
, rex
, cont
, s
);
1235 case REX_GROUP_BEHIND
:
1236 if ((s
- env
->beg
) < rex
->re
.group
.size
)
1238 catcher
.type
= REX_GROUP_BEHIND_CATCH
;
1239 catcher
.flags
= rex
->flags
;
1240 catcher
.serial
= rex
->serial
;
1241 catcher
.re
.behind_catch
.beg
= s
;
1242 catcher
.re
.behind_catch
.end
= e
= env
->end
;
1243 catcher
.re
.behind_catch
.cont
= cont
;
1244 catcher
.next
= rex
->next
;
1245 for (t
= s
- rex
->re
.group
.size
; t
>= env
->beg
; t
--)
1248 r
= parse(env
, rex
->re
.group
.expr
.rex
, &catcher
, t
);
1254 case REX_GROUP_BEHIND_CATCH
:
1255 if (s
!= rex
->re
.behind_catch
.beg
)
1257 env
->end
= rex
->re
.behind_catch
.end
;
1258 return follow(env
, rex
, rex
->re
.behind_catch
.cont
, rex
->re
.behind_catch
.beg
);
1259 case REX_GROUP_BEHIND_NOT
:
1260 if ((s
- env
->beg
) < rex
->re
.group
.size
)
1264 catcher
.type
= REX_GROUP_BEHIND_NOT_CATCH
;
1265 catcher
.re
.neg_catch
.beg
= s
;
1269 for (t
= s
- rex
->re
.group
.size
; t
>= env
->beg
; t
--)
1271 r
= parse(env
, rex
->re
.group
.expr
.rex
, &catcher
, t
);
1278 r
= follow(env
, rex
, cont
, s
);
1282 case REX_GROUP_BEHIND_NOT_CATCH
:
1283 return s
== rex
->re
.neg_catch
.beg
? GOOD
: NONE
;
1284 case REX_GROUP_COND
:
1285 if (q
= rex
->re
.group
.expr
.binary
.right
)
1287 catcher
.re
.cond_catch
.next
[0] = q
->re
.group
.expr
.binary
.right
;
1288 catcher
.re
.cond_catch
.next
[1] = q
->re
.group
.expr
.binary
.left
;
1291 catcher
.re
.cond_catch
.next
[0] = catcher
.re
.cond_catch
.next
[1] = 0;
1292 if (q
= rex
->re
.group
.expr
.binary
.left
)
1294 catcher
.type
= REX_GROUP_COND_CATCH
;
1295 catcher
.flags
= rex
->flags
;
1296 catcher
.serial
= rex
->serial
;
1297 catcher
.re
.cond_catch
.yes
= 0;
1298 catcher
.re
.cond_catch
.beg
= s
;
1299 catcher
.re
.cond_catch
.cont
= cont
;
1300 catcher
.next
= rex
->next
;
1301 r
= parse(env
, q
, &catcher
, s
);
1302 if (r
== BAD
|| catcher
.re
.cond_catch
.yes
)
1305 else if (!rex
->re
.group
.size
|| rex
->re
.group
.size
> 0 && env
->match
[rex
->re
.group
.size
].rm_so
>= 0)
1309 if (q
= catcher
.re
.cond_catch
.next
[r
!= NONE
])
1311 catcher
.type
= REX_CAT
;
1312 catcher
.flags
= q
->flags
;
1313 catcher
.serial
= q
->serial
;
1314 catcher
.re
.group_catch
.cont
= cont
;
1315 catcher
.next
= rex
->next
;
1316 return parse(env
, q
, &catcher
, s
);
1318 return follow(env
, rex
, cont
, s
);
1319 case REX_GROUP_COND_CATCH
:
1320 rex
->re
.cond_catch
.yes
= 1;
1321 catcher
.type
= REX_CAT
;
1322 catcher
.flags
= rex
->flags
;
1323 catcher
.serial
= rex
->serial
;
1324 catcher
.re
.group_catch
.cont
= rex
->re
.cond_catch
.cont
;
1325 catcher
.next
= rex
->next
;
1326 return parse(env
, rex
->re
.cond_catch
.next
[1], &catcher
, rex
->re
.cond_catch
.beg
);
1328 return follow(env
, rex
, rex
->re
.group_catch
.cont
, s
);
1330 catcher
.type
= REX_GROUP_CUT_CATCH
;
1331 catcher
.flags
= rex
->flags
;
1332 catcher
.serial
= rex
->serial
;
1333 catcher
.re
.group_catch
.cont
= cont
;
1334 catcher
.next
= rex
->next
;
1335 return parse(env
, rex
->re
.group
.expr
.rex
, &catcher
, s
);
1336 case REX_GROUP_CUT_CATCH
:
1337 switch (r
= follow(env
, rex
, rex
->re
.group_catch
.cont
, s
))
1348 f
= rex
->re
.string
.fail
;
1349 b
= rex
->re
.string
.base
;
1350 n
= rex
->re
.string
.size
;
1357 for (i
= -1; t
< e
; t
++)
1359 while (i
>= 0 && b
[i
+1] != p
[*t
])
1361 if (b
[i
+1] == p
[*t
])
1367 env
->best
[0].rm_so
= t
- s
- n
;
1368 switch (follow(env
, rex
, cont
, t
))
1388 for (i
= -1; t
< e
; t
++)
1390 while (i
>= 0 && b
[i
+1] != *t
)
1398 env
->best
[0].rm_so
= t
- s
- n
;
1399 switch (follow(env
, rex
, cont
, t
))
1417 if (LEADING(env
, rex
, s
))
1420 n
= ((i
+ 7) >> 3) + 1;
1421 catcher
.type
= REX_NEG_CATCH
;
1422 catcher
.re
.neg_catch
.beg
= s
;
1423 if (!(p
= (unsigned char*)stkpush(stkstd
, n
)))
1425 memset(catcher
.re
.neg_catch
.index
= p
, 0, n
);
1426 catcher
.next
= rex
->next
;
1427 if (parse(env
, rex
->re
.group
.expr
.rex
, &catcher
, s
) == BAD
)
1435 switch (follow(env
, rex
, cont
, s
+ i
))
1458 bitset(rex
->re
.neg_catch
.index
, s
- rex
->re
.neg_catch
.beg
);
1465 if ((c
= *s
++) == rex
->re
.nest
.primary
)
1467 if (s
>= env
->end
|| !(s
= nestmatch(s
, env
->end
, rex
->re
.nest
.type
, c
)))
1471 if (rex
->re
.nest
.primary
>= 0)
1473 if (rex
->re
.nest
.type
[c
] & (REX_NEST_delimiter
|REX_NEST_separator
|REX_NEST_terminator
))
1475 if (!(s
= nestmatch(s
, env
->end
, rex
->re
.nest
.type
, c
)))
1477 } while (s
< env
->end
&& !(rex
->re
.nest
.type
[*(s
-1)] & (REX_NEST_delimiter
|REX_NEST_separator
|REX_NEST_terminator
)));
1483 if (n
> env
->end
- s
)
1489 c
= rex
->re
.onechar
;
1490 if (!(rex
->flags
& REG_MINIMAL
))
1496 for (i
= 0; i
< n
; i
++, s
++)
1502 for (i
= 0; i
< n
; i
++, s
++)
1506 for (; i
-- >= m
; s
--)
1507 switch (follow(env
, rex
, cont
, s
))
1522 if (!(b
= (unsigned char*)stkpush(stkstd
, n
)))
1524 env
->error
= REG_ESPACE
;
1528 if (!(rex
->flags
& REG_ICASE
))
1530 for (i
= 0; s
< e
&& i
< n
; i
++, s
= t
)
1540 for (i
= 0; s
< e
&& i
< n
; i
++, s
= t
)
1543 if (towupper(mbchar(t
)) != c
)
1548 for (; i
-- >= m
; s
-= b
[i
])
1549 switch (follow(env
, rex
, cont
, s
))
1580 switch (follow(env
, rex
, cont
, s
))
1590 if (s
>= e
|| p
[*s
++] != c
)
1602 switch (follow(env
, rex
, cont
, s
))
1612 if (s
>= e
|| *s
++ != c
)
1619 if (!(rex
->flags
& REG_ICASE
))
1621 for (i
= 0; i
< m
&& s
< e
; i
++, s
= t
)
1629 switch (follow(env
, rex
, cont
, s
))
1639 if (s
>= e
|| mbchar(s
) != c
)
1645 for (i
= 0; i
< m
&& s
< e
; i
++, s
= t
)
1648 if (towupper(mbchar(t
)) != c
)
1653 switch (follow(env
, rex
, cont
, s
))
1663 if (s
>= e
|| towupper(mbchar(s
)) != c
)
1671 if (env
->stack
&& pospush(env
, rex
, s
, BEG_REP
))
1673 r
= parserep(env
, rex
, cont
, s
, 0);
1678 DEBUG_TEST(0x0020,(sfprintf(sfstdout
, "AHA#%04d 0x%04x %s n %d len %d s `%-.*s'\n", __LINE__
, debug_flag
, rexname(rex
), rex
->re
.rep_catch
.n
, s
- rex
->re
.rep_catch
.beg
, env
->end
- s
, s
)),(0));
1679 if (env
->stack
&& pospush(env
, rex
, s
, END_ANY
))
1681 if (s
== rex
->re
.rep_catch
.beg
&& rex
->re
.rep_catch
.n
> rex
->re
.rep_catch
.ref
->lo
)
1684 * optional empty iteration
1687 DEBUG_TEST(0x0002,(sfprintf(sfstdout
, "AHA#%04d %p re.group.back=%d re.group.expr.rex=%s\n", __LINE__
, rex
->re
.rep_catch
.ref
->re
.group
.expr
.rex
, rex
->re
.rep_catch
.ref
->re
.group
.expr
.rex
->re
.group
.back
, rexname(rex
->re
.rep_catch
.ref
->re
.group
.expr
.rex
))),(0));
1688 if (!env
->stack
|| s
!= rex
->re
.rep_catch
.ref
->re
.rep_catch
.beg
&& !rex
->re
.rep_catch
.ref
->re
.group
.expr
.rex
->re
.group
.back
)
1690 else if (pospush(env
, rex
, s
, END_ANY
))
1694 r
= follow(env
, rex
, rex
->re
.rep_catch
.cont
, s
);
1699 r
= parserep(env
, rex
->re
.rep_catch
.ref
, rex
->re
.rep_catch
.cont
, s
, rex
->re
.rep_catch
.n
);
1704 DEBUG_TEST(0x0200,(sfprintf(sfstdout
,"AHA#%04d 0x%04x parse %s \"%-.*s\" `%-.*s'\n", __LINE__
, debug_flag
, rexname(rex
), rex
->re
.string
.size
, rex
->re
.string
.base
, env
->end
- s
, s
)),(0));
1705 if (rex
->re
.string
.size
> (env
->end
- s
))
1707 t
= rex
->re
.string
.base
;
1708 e
= t
+ rex
->re
.string
.size
;
1709 if (!(p
= rex
->map
))
1718 if (p
[*s
++] != *t
++)
1727 if (towupper(c
) != d
)
1733 if (((s
+ rex
->re
.trie
.min
) > env
->end
) || !(x
= rex
->re
.trie
.root
[rex
->map
? rex
->map
[*s
] : *s
]))
1735 return parsetrie(env
, x
, rex
, cont
, s
);
1738 r
= (*env
->disc
->re_execf
)(env
->regex
, rex
->re
.exec
.data
, rex
->re
.exec
.text
, rex
->re
.exec
.size
, (const char*)s
, env
->end
- s
, &u
, env
->disc
);
1739 e
= (unsigned char*)u
;
1740 if (e
>= s
&& e
<= env
->end
)
1754 if (!isword(*s
) || s
> env
->beg
&& isword(*(s
- 1)))
1758 if (isword(*s
) || s
> env
->beg
&& !isword(*(s
- 1)))
1762 if (s
> env
->beg
&& isword(*(s
- 1)) == isword(*s
))
1766 if (s
== env
->beg
|| isword(*(s
- 1)) != isword(*s
))
1774 for (t
= s
; t
< env
->end
&& *t
== '\n'; t
++);
1783 if (!(rex
= rex
->next
))
1793 #if _AST_REGEX_DEBUG
1796 listnode(Rex_t
* e
, int level
)
1804 for (i
= 0; i
< level
; i
++)
1805 sfprintf(sfstderr
, " ");
1806 sfprintf(sfstderr
, "%s\n", rexname(e
));
1811 listnode(e
->re
.group
.expr
.binary
.left
, level
+ 1);
1812 listnode(e
->re
.group
.expr
.binary
.right
, level
+ 1);
1815 case REX_GROUP_AHEAD
:
1816 case REX_GROUP_AHEAD_NOT
:
1817 case REX_GROUP_BEHIND
:
1818 case REX_GROUP_BEHIND_NOT
:
1822 listnode(e
->re
.group
.expr
.rex
, level
+ 1);
1825 } while (e
= e
->next
);
1830 list(Env_t
* env
, Rex_t
* rex
)
1832 sfprintf(sfstderr
, "AHA regex hard=%d stack=%p\n", env
->hard
, env
->stack
);
1841 * returning REG_BADPAT or REG_ESPACE is not explicitly
1842 * countenanced by the standard
1846 regnexec(const regex_t
* p
, const char* s
, size_t len
, size_t nmatch
, regmatch_t
* match
, regflags_t flags
)
1858 DEBUG_TEST(0x0001,(sfprintf(sfstdout
, "AHA#%04d 0x%04x regnexec %d 0x%08x `%-.*s'\n", __LINE__
, debug_flag
, nmatch
, flags
, len
, s
)),(0));
1859 if (!p
|| !(env
= p
->env
))
1862 return fatal(env
->disc
, REG_BADPAT
, NiL
);
1865 DEBUG_TEST(0x0080,(sfprintf(sfstdout
, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__
, len
, env
->min
)),(0));
1869 env
->beg
= (unsigned char*)s
;
1870 env
->end
= env
->beg
+ len
;
1871 stknew(stkstd
, &env
->stk
);
1872 env
->flags
&= ~REG_EXEC
;
1873 env
->flags
|= (flags
& REG_EXEC
);
1875 if (env
->stack
= env
->hard
|| !(env
->flags
& REG_NOSUB
) && nmatch
)
1878 if (!(env
->match
= (regmatch_t
*)stkpush(stkstd
, 2 * (n
+ 1) * sizeof(regmatch_t
))) ||
1879 !env
->pos
&& !(env
->pos
= vecopen(16, sizeof(Pos_t
))) ||
1880 !env
->bestpos
&& !(env
->bestpos
= vecopen(16, sizeof(Pos_t
))))
1885 env
->pos
->cur
= env
->bestpos
->cur
= 0;
1886 env
->best
= &env
->match
[n
+ 1];
1887 env
->best
[0].rm_so
= 0;
1888 env
->best
[0].rm_eo
= -1;
1889 for (i
= 0; i
<= n
; i
++)
1890 env
->match
[i
] = state
.nomatch
;
1891 if (flags
& REG_ADVANCE
)
1894 DEBUG_TEST(0x1000,(list(env
,env
->rex
)),(0));
1896 if ((e
= env
->rex
)->type
== REX_BM
)
1898 DEBUG_TEST(0x0080,(sfprintf(sfstdout
, "AHA#%04d REX_BM\n", __LINE__
)),(0));
1899 if (len
< e
->re
.bm
.right
)
1901 DEBUG_TEST(0x0080,(sfprintf(sfstdout
, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__
, len
, e
->re
.bm
.right
)),(0));
1904 else if (!(flags
& REG_LEFT
))
1906 register unsigned char* buf
= (unsigned char*)s
;
1907 register size_t index
= e
->re
.bm
.left
+ e
->re
.bm
.size
;
1908 register size_t mid
= len
- e
->re
.bm
.right
;
1909 register size_t* skip
= e
->re
.bm
.skip
;
1910 register size_t* fail
= e
->re
.bm
.fail
;
1911 register Bm_mask_t
** mask
= e
->re
.bm
.mask
;
1915 DEBUG_TEST(0x0080,(sfprintf(sfstdout
, "AHA#%04d REX_BM len=%d right=%d left=%d size=%d %d %d\n", __LINE__
, len
, e
->re
.bm
.right
, e
->re
.bm
.left
, e
->re
.bm
.size
, index
, mid
)),(0));
1918 while ((index
+= skip
[buf
[index
]]) < mid
);
1921 DEBUG_TEST(0x0080,(sfprintf(sfstdout
, "AHA#%04d REG_NOMATCH %d %d\n", __LINE__
, index
, HIT
)),(0));
1925 m
= mask
[n
= e
->re
.bm
.size
- 1][buf
[index
]];
1930 if (e
->re
.bm
.back
< 0)
1934 i
= index
- e
->re
.bm
.back
;
1937 env
->best
[0].rm_so
+= i
;
1941 if (index
< e
->re
.bm
.back
)
1944 index
-= e
->re
.bm
.back
;
1947 if ((i
= parse(env
, e
->next
, &env
->done
, buf
+ index
)) != NONE
)
1950 env
->best
[0].rm_so
= index
;
1956 index
+= e
->re
.bm
.size
;
1959 } while (m
&= mask
[n
][buf
[--index
]]);
1960 if ((index
+= fail
[n
+ 1]) >= len
)
1968 j
= env
->once
|| (flags
& REG_LEFT
);
1969 DEBUG_TEST(0x0080,(sfprintf(sfstdout
, "AHA#%04d parse once=%d\n", __LINE__
, j
)),(0));
1970 while ((i
= parse(env
, e
, &env
->done
, (unsigned char*)s
)) == NONE
|| advance
&& !env
->best
[0].rm_eo
&& !(advance
= 0))
1976 if ((unsigned char*)s
> env
->end
- env
->min
)
1979 env
->best
[0].rm_so
+= i
;
1981 if ((flags
& REG_LEFT
) && env
->stack
&& env
->best
[0].rm_so
)
1988 k
= env
->error
= REG_NOMATCH
;
1991 if (!(env
->flags
& REG_NOSUB
))
1993 k
= (env
->flags
& (REG_SHELL
|REG_AUGMENTED
)) == (REG_SHELL
|REG_AUGMENTED
);
1994 for (i
= j
= m
= 0; j
< nmatch
; i
++)
1995 if (!i
|| !k
|| (i
& 1))
1998 match
[j
] = state
.nomatch
;
2000 match
[m
= j
] = env
->best
[i
];
2005 while (m
> 0 && match
[m
].rm_so
== -1 && match
[m
].rm_eo
== -1)
2007 ((regex_t
*)p
)->re_nsub
= m
;
2012 stkold(stkstd
, &env
->stk
);
2014 if (k
> REG_NOMATCH
)
2015 fatal(p
->env
->disc
, k
, NiL
);
2024 if (p
&& (env
= p
->env
))
2034 if (--env
->refs
<= 0 && !(env
->disc
->re_flags
& REG_NOFREE
))
2036 drop(env
->disc
, env
->rex
);
2040 vecclose(env
->bestpos
);
2042 stkold(stkstd
, &env
->stk
);
2043 alloc(env
->disc
, env
, 0);