1 /***********************************************************************
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
13 * Information and Software Systems Research *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
21 ***********************************************************************/
25 * posix regex implementation
27 * based on Doug McIlroy's C++ implementation
28 * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest
29 * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume
35 #define REG_VERSION_EXEC 20020509L
36 #define REG_VERSION_MAP 20030916L /* regdisc_t.re_map */
40 #define alloc _reg_alloc
41 #define classfun _reg_classfun
42 #define drop _reg_drop
43 #define fatal _reg_fatal
44 #define state _reg_state
46 typedef struct regsubop_s
48 int op
; /* REG_SUB_LOWER,REG_SUB_UPPER */
49 int off
; /* re_rhs or match[] offset */
50 int len
; /* re_rhs len or len==0 match[] */
53 #define _REG_SUB_PRIVATE_ \
54 char* re_cur; /* re_buf cursor */ \
55 char* re_end; /* re_buf end */ \
56 regsubop_t* re_ops; /* rhs ops */ \
57 char re_rhs[1]; /* substitution rhs */
68 #if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG)
69 #define _AST_REGEX_DEBUG 1
72 #define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1)
74 #undef RE_DUP_MAX /* posix puts this in limits.h! */
75 #define RE_DUP_MAX (INT_MAX/2-1) /* 2*RE_DUP_MAX won't overflow */
76 #define RE_DUP_INF (RE_DUP_MAX+1) /* infinity, for * */
77 #define BACK_REF_MAX 9
79 #define REG_COMP (REG_DELIMITED|REG_ESCAPE|REG_EXTENDED|REG_FIRST|REG_ICASE|REG_NOSUB|REG_NEWLINE|REG_SHELL|REG_AUGMENTED|REG_LEFT|REG_LITERAL|REG_MINIMAL|REG_MULTIREF|REG_NULL|REG_RIGHT|REG_LENIENT|REG_MUSTDELIM)
80 #define REG_EXEC (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND)
82 #define REX_NULL 0 /* null string (internal) */
83 #define REX_ALT 1 /* a|b */
84 #define REX_ALT_CATCH 2 /* REX_ALT catcher */
85 #define REX_BACK 3 /* \1, \2, etc */
86 #define REX_BEG 4 /* initial ^ */
87 #define REX_BEG_STR 5 /* initial ^ w/ no newline */
88 #define REX_BM 6 /* Boyer-Moore */
89 #define REX_CAT 7 /* catenation catcher */
90 #define REX_CLASS 8 /* [...] */
91 #define REX_COLL_CLASS 9 /* collation order [...] */
92 #define REX_CONJ 10 /* a&b */
93 #define REX_CONJ_LEFT 11 /* REX_CONJ left catcher */
94 #define REX_CONJ_RIGHT 12 /* REX_CONJ right catcher */
95 #define REX_DONE 13 /* completed match (internal) */
96 #define REX_DOT 14 /* . */
97 #define REX_END 15 /* final $ */
98 #define REX_END_STR 16 /* final $ before tail newline */
99 #define REX_EXEC 17 /* call re.re_exec() */
100 #define REX_FIN_STR 18 /* final $ w/ no newline */
101 #define REX_GROUP 19 /* \(...\) */
102 #define REX_GROUP_CATCH 20 /* REX_GROUP catcher */
103 #define REX_GROUP_AHEAD 21 /* 0-width lookahead */
104 #define REX_GROUP_AHEAD_CATCH 22 /* REX_GROUP_AHEAD catcher */
105 #define REX_GROUP_AHEAD_NOT 23 /* inverted 0-width lookahead */
106 #define REX_GROUP_BEHIND 24 /* 0-width lookbehind */
107 #define REX_GROUP_BEHIND_CATCH 25 /* REX_GROUP_BEHIND catcher */
108 #define REX_GROUP_BEHIND_NOT 26 /* inverted 0-width lookbehind */
109 #define REX_GROUP_BEHIND_NOT_CATCH 27 /* REX_GROUP_BEHIND_NOT catcher */
110 #define REX_GROUP_COND 28 /* conditional group */
111 #define REX_GROUP_COND_CATCH 29 /* conditional group catcher */
112 #define REX_GROUP_CUT 30 /* don't backtrack over this */
113 #define REX_GROUP_CUT_CATCH 31 /* REX_GROUP_CUT catcher */
114 #define REX_KMP 32 /* Knuth-Morris-Pratt */
115 #define REX_NEG 33 /* negation */
116 #define REX_NEG_CATCH 34 /* REX_NEG catcher */
117 #define REX_NEST 35 /* nested match */
118 #define REX_ONECHAR 36 /* a single-character literal */
119 #define REX_REP 37 /* Kleene closure */
120 #define REX_REP_CATCH 38 /* REX_REP catcher */
121 #define REX_STRING 39 /* some chars */
122 #define REX_TRIE 40 /* alternation of strings */
123 #define REX_WBEG 41 /* \< */
124 #define REX_WEND 42 /* \> */
125 #define REX_WORD 43 /* word boundary */
126 #define REX_WORD_NOT 44 /* not word boundary */
128 #define T_META ((int)UCHAR_MAX+1)
129 #define T_STAR (T_META+0)
130 #define T_PLUS (T_META+1)
131 #define T_QUES (T_META+2)
132 #define T_BANG (T_META+3)
133 #define T_AT (T_META+4)
134 #define T_TILDE (T_META+5)
135 #define T_PERCENT (T_META+6)
136 #define T_LEFT (T_META+7)
137 #define T_OPEN (T_META+8)
138 #define T_CLOSE (T_OPEN+1)
139 #define T_RIGHT (T_OPEN+2)
140 #define T_CFLX (T_OPEN+3)
141 #define T_DOT (T_OPEN+4)
142 #define T_DOTSTAR (T_OPEN+5)
143 #define T_END (T_OPEN+6)
144 #define T_BAD (T_OPEN+7)
145 #define T_DOLL (T_OPEN+8)
146 #define T_BRA (T_OPEN+9)
147 #define T_BAR (T_OPEN+10)
148 #define T_AND (T_OPEN+11)
149 #define T_LT (T_OPEN+12)
150 #define T_GT (T_OPEN+13)
151 #define T_SLASHPLUS (T_OPEN+14)
152 #define T_GROUP (T_OPEN+15)
153 #define T_WORD (T_OPEN+16)
154 #define T_WORD_NOT (T_WORD+1)
155 #define T_BEG_STR (T_WORD+2)
156 #define T_END_STR (T_WORD+3)
157 #define T_FIN_STR (T_WORD+4)
158 #define T_ESCAPE (T_WORD+5)
159 #define T_ALNUM (T_WORD+6)
160 #define T_ALNUM_NOT (T_ALNUM+1)
161 #define T_DIGIT (T_ALNUM+2)
162 #define T_DIGIT_NOT (T_ALNUM+3)
163 #define T_SPACE (T_ALNUM+4)
164 #define T_SPACE_NOT (T_ALNUM+5)
165 #define T_BACK (T_ALNUM+6)
173 #define HIT SSIZE_MAX
175 #define bitclr(p,c) ((p)[((c)>>3)&037]&=(~(1<<((c)&07))))
176 #define bitset(p,c) ((p)[((c)>>3)&037]|=(1<<((c)&07)))
177 #define bittst(p,c) ((p)[((c)>>3)&037]&(1<<((c)&07)))
179 #define setadd(p,c) bitset((p)->bits,c)
180 #define setclr(p,c) bitclr((p)->bits,c)
181 #define settst(p,c) bittst((p)->bits,c)
183 #if _hdr_wchar && _lib_wctype && _lib_iswctype
185 #include <stdio.h> /* because <wchar.h> includes it and we generate it */
191 #if !defined(iswblank) && !_lib_iswblank
192 #define _need_iswblank 1
193 #define iswblank(x) _reg_iswblank(x)
194 extern int _reg_iswblank(wint_t);
197 #if !defined(towupper) && !_lib_towupper
198 #define towupper(x) toupper(x)
201 #if !defined(towlower) && !_lib_towlower
202 #define towlower(x) tolower(x)
210 #define iswalnum(x) isalnum(x)
213 #define iswalpha(x) isalpha(x)
216 #define iswcntrl(x) iscntrl(x)
219 #define iswdigit(x) isdigit(x)
222 #define iswgraph(x) isgraph(x)
225 #define iswlower(x) islower(x)
228 #define iswprint(x) isprint(x)
231 #define iswpunct(x) ispunct(x)
234 #define iswspace(x) isspace(x)
237 #define iswupper(x) isupper(x)
240 #define iswxdigit(x) isxdigit(x)
244 #define towlower(x) tolower(x)
247 #define towupper(x) toupper(x)
253 #define iswblank(x) ((x)==' '||(x)=='\t')
257 #define iswgraph(x) (iswprint(x)&&!iswblank(x))
260 #define isword(x) (isalnum(x)||(x)=='_')
263 * collation element support
266 #define COLL_KEY_MAX 32
268 #if COLL_KEY_MAX < MB_LEN_MAX
270 #define COLL_KEY_MAX MB_LEN_MAX
273 typedef unsigned char Ckey_t
[COLL_KEY_MAX
+1];
279 #define COLL_range_lc 4
280 #define COLL_range_uc 5
282 typedef struct Celt_s
293 * private stuff hanging off regex_t
296 typedef struct Stk_pos_s
302 typedef struct Vector_s
304 Stk_t
* stk
; /* stack pointer */
305 char* vec
; /* the data */
306 int inc
; /* growth increment */
307 int siz
; /* element size */
308 int max
; /* max index */
309 int cur
; /* current index -- user domain */
316 typedef struct Cond_s
318 unsigned char* beg
; /* beginning of next match */
319 struct Rex_s
* next
[2]; /* 0:no 1:yes next pattern */
320 struct Rex_s
* cont
; /* right catcher */
321 int yes
; /* yes condition hit */
324 typedef struct Conj_left_s
326 unsigned char* beg
; /* beginning of left match */
327 struct Rex_s
* right
; /* right pattern */
328 struct Rex_s
* cont
; /* right catcher */
331 typedef struct Conj_right_s
333 unsigned char* end
; /* end of left match */
334 struct Rex_s
* cont
; /* ambient continuation */
337 typedef unsigned int Bm_mask_t
;
351 typedef struct String_s
360 unsigned char bits
[(UCHAR_MAX
+1)/CHAR_BIT
];
363 typedef struct Collate_s
369 typedef struct Binary_s
376 typedef struct Group_s
378 int number
; /* group number */
379 int last
; /* last contained group number */
380 int size
; /* lookbehind size */
381 int back
; /* backreferenced */
382 regflags_t flags
; /* group flags */
390 typedef struct Exec_s
397 #define REX_NEST_open 0x01
398 #define REX_NEST_close 0x02
399 #define REX_NEST_escape 0x04
400 #define REX_NEST_quote 0x08
401 #define REX_NEST_literal 0x10
402 #define REX_NEST_delimiter 0x20
403 #define REX_NEST_terminator 0x40
404 #define REX_NEST_separator 0x80
406 #define REX_NEST_SHIFT 8
408 typedef struct Nest_s
411 unsigned short none
; /* for Nest_t.type[-1] */
412 unsigned short type
[1];
416 * REX_ALT catcher, solely to get control at the end of an
417 * alternative to keep records for comparing matches.
420 typedef struct Alt_catch_s
425 typedef struct Group_catch_s
431 typedef struct Behind_catch_s
439 * REX_NEG catcher determines what string lengths can be matched,
440 * then Neg investigates continuations of other lengths.
441 * This is inefficient. For !POSITIONS expressions, we can do better:
442 * since matches to rex will be enumerated in decreasing order,
443 * we can investigate continuations whenever a length is skipped.
446 typedef struct Neg_catch_s
449 unsigned char* index
;
453 * REX_REP catcher. One is created on the stack for
454 * each iteration of a complex repetition.
457 typedef struct Rep_catch_s
466 * data structure for an alternation of pure strings
467 * son points to a subtree of all strings with a common
468 * prefix ending in character c. sib links alternate
469 * letters in the same position of a word. end=1 if
470 * some word ends with c. the order of strings is
471 * irrelevant, except long words must be investigated
475 typedef struct Trie_node_s
479 struct Trie_node_s
* son
;
480 struct Trie_node_s
* sib
;
483 typedef struct Trie_s
491 * Rex_t is a node in a regular expression
496 unsigned char type
; /* node type */
497 unsigned char marked
; /* already marked */
498 short serial
; /* subpattern number */
499 regflags_t flags
; /* scoped flags */
500 int explicit; /* scoped explicit match*/
501 struct Rex_s
* next
; /* remaining parts */
502 int lo
; /* lo dup count */
503 int hi
; /* hi dup count */
504 unsigned char* map
; /* fold and/or ccode map*/
507 Alt_catch_t alt_catch
; /* alt catcher */
509 Behind_catch_t behind_catch
; /* behind catcher */
510 Set_t
* charclass
; /* char class */
511 Collate_t collate
; /* collation class */
512 Cond_t cond_catch
; /* cond catcher */
513 Conj_left_t conj_left
; /* conj left catcher */
514 Conj_right_t conj_right
; /* conj right catcher */
515 void* data
; /* data after Rex_t */
516 Exec_t exec
; /* re.re_exec() args */
517 Group_t group
; /* a|b or rep */
518 Group_catch_t group_catch
; /* group catcher */
519 Neg_catch_t neg_catch
; /* neg catcher */
520 Nest_t nest
; /* nested match */
521 unsigned char onechar
; /* single char */
522 Rep_catch_t rep_catch
; /* rep catcher */
523 String_t string
; /* string/kmp */
524 Trie_t trie
; /* trie */
528 typedef struct reglib_s
/* library private regex_t info */
530 struct Rex_s
* rex
; /* compiled expression */
531 regdisc_t
* disc
; /* REG_DISCIPLINE discipline */
532 const regex_t
* regex
; /* from regexec */
533 unsigned char* beg
; /* beginning of string */
534 unsigned char* end
; /* end of string */
535 Vector_t
* pos
; /* posns of certain subpatterns */
536 Vector_t
* bestpos
; /* ditto for best match */
537 regmatch_t
* match
; /* subexrs in current match */
538 regmatch_t
* best
; /* ditto in best match yet */
539 Stk_pos_t stk
; /* exec stack pos */
540 size_t min
; /* minimum match length */
541 size_t nsub
; /* internal re_nsub */
542 regflags_t flags
; /* flags from regcomp() */
543 int error
; /* last error */
544 int explicit; /* explicit match on this char */
545 int leading
; /* leading match on this char */
546 int refs
; /* regcomp()+regdup() references*/
547 Rex_t done
; /* the last continuation */
548 regstat_t stats
; /* for regstat() */
549 unsigned char fold
[UCHAR_MAX
+1]; /* REG_ICASE map */
550 unsigned char hard
; /* hard comp */
551 unsigned char once
; /* if 1st parse fails, quit */
552 unsigned char separate
; /* cannot combine */
553 unsigned char stack
; /* hard comp or exec */
554 unsigned char sub
; /* re_sub is valid */
555 unsigned char test
; /* debug/test bitmask */
558 typedef struct State_s
/* shared state */
566 short* magic
[UCHAR_MAX
+1];
575 extern State_t state
;
577 extern void* alloc(regdisc_t
*, void*, size_t);
578 extern regclass_t
classfun(int);
579 extern void drop(regdisc_t
*, Rex_t
*);
580 extern int fatal(regdisc_t
*, int, const char*);