dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / usr / src / lib / libast / common / regex / reglib.h
blob8d2d3ea707fd5c9a840a352ef0dec9cc3bd6c48b
1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
20 * *
21 ***********************************************************************/
22 #pragma prototyped
25 * posix regex implementation
27 * based on Doug McIlroy's C++ implementation
28 * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest
29 * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume
32 #ifndef _REGLIB_H
33 #define _REGLIB_H
35 #define REG_VERSION_EXEC 20020509L
36 #define REG_VERSION_MAP 20030916L /* regdisc_t.re_map */
38 #define re_info env
40 #define alloc _reg_alloc
41 #define classfun _reg_classfun
42 #define drop _reg_drop
43 #define fatal _reg_fatal
44 #define state _reg_state
46 typedef struct regsubop_s
48 int op; /* REG_SUB_LOWER,REG_SUB_UPPER */
49 int off; /* re_rhs or match[] offset */
50 int len; /* re_rhs len or len==0 match[] */
51 } regsubop_t;
53 #define _REG_SUB_PRIVATE_ \
54 char* re_cur; /* re_buf cursor */ \
55 char* re_end; /* re_buf end */ \
56 regsubop_t* re_ops; /* rhs ops */ \
57 char re_rhs[1]; /* substitution rhs */
59 #include <ast.h>
60 #include <cdt.h>
61 #include <stk.h>
63 #include "regex.h"
65 #include <ctype.h>
66 #include <errno.h>
68 #if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG)
69 #define _AST_REGEX_DEBUG 1
70 #endif
72 #define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1)
74 #undef RE_DUP_MAX /* posix puts this in limits.h! */
75 #define RE_DUP_MAX (INT_MAX/2-1) /* 2*RE_DUP_MAX won't overflow */
76 #define RE_DUP_INF (RE_DUP_MAX+1) /* infinity, for * */
77 #define BACK_REF_MAX 9
79 #define REG_COMP (REG_DELIMITED|REG_ESCAPE|REG_EXTENDED|REG_FIRST|REG_ICASE|REG_NOSUB|REG_NEWLINE|REG_SHELL|REG_AUGMENTED|REG_LEFT|REG_LITERAL|REG_MINIMAL|REG_MULTIREF|REG_NULL|REG_RIGHT|REG_LENIENT|REG_MUSTDELIM)
80 #define REG_EXEC (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND)
82 #define REX_NULL 0 /* null string (internal) */
83 #define REX_ALT 1 /* a|b */
84 #define REX_ALT_CATCH 2 /* REX_ALT catcher */
85 #define REX_BACK 3 /* \1, \2, etc */
86 #define REX_BEG 4 /* initial ^ */
87 #define REX_BEG_STR 5 /* initial ^ w/ no newline */
88 #define REX_BM 6 /* Boyer-Moore */
89 #define REX_CAT 7 /* catenation catcher */
90 #define REX_CLASS 8 /* [...] */
91 #define REX_COLL_CLASS 9 /* collation order [...] */
92 #define REX_CONJ 10 /* a&b */
93 #define REX_CONJ_LEFT 11 /* REX_CONJ left catcher */
94 #define REX_CONJ_RIGHT 12 /* REX_CONJ right catcher */
95 #define REX_DONE 13 /* completed match (internal) */
96 #define REX_DOT 14 /* . */
97 #define REX_END 15 /* final $ */
98 #define REX_END_STR 16 /* final $ before tail newline */
99 #define REX_EXEC 17 /* call re.re_exec() */
100 #define REX_FIN_STR 18 /* final $ w/ no newline */
101 #define REX_GROUP 19 /* \(...\) */
102 #define REX_GROUP_CATCH 20 /* REX_GROUP catcher */
103 #define REX_GROUP_AHEAD 21 /* 0-width lookahead */
104 #define REX_GROUP_AHEAD_CATCH 22 /* REX_GROUP_AHEAD catcher */
105 #define REX_GROUP_AHEAD_NOT 23 /* inverted 0-width lookahead */
106 #define REX_GROUP_BEHIND 24 /* 0-width lookbehind */
107 #define REX_GROUP_BEHIND_CATCH 25 /* REX_GROUP_BEHIND catcher */
108 #define REX_GROUP_BEHIND_NOT 26 /* inverted 0-width lookbehind */
109 #define REX_GROUP_BEHIND_NOT_CATCH 27 /* REX_GROUP_BEHIND_NOT catcher */
110 #define REX_GROUP_COND 28 /* conditional group */
111 #define REX_GROUP_COND_CATCH 29 /* conditional group catcher */
112 #define REX_GROUP_CUT 30 /* don't backtrack over this */
113 #define REX_GROUP_CUT_CATCH 31 /* REX_GROUP_CUT catcher */
114 #define REX_KMP 32 /* Knuth-Morris-Pratt */
115 #define REX_NEG 33 /* negation */
116 #define REX_NEG_CATCH 34 /* REX_NEG catcher */
117 #define REX_NEST 35 /* nested match */
118 #define REX_ONECHAR 36 /* a single-character literal */
119 #define REX_REP 37 /* Kleene closure */
120 #define REX_REP_CATCH 38 /* REX_REP catcher */
121 #define REX_STRING 39 /* some chars */
122 #define REX_TRIE 40 /* alternation of strings */
123 #define REX_WBEG 41 /* \< */
124 #define REX_WEND 42 /* \> */
125 #define REX_WORD 43 /* word boundary */
126 #define REX_WORD_NOT 44 /* not word boundary */
128 #define T_META ((int)UCHAR_MAX+1)
129 #define T_STAR (T_META+0)
130 #define T_PLUS (T_META+1)
131 #define T_QUES (T_META+2)
132 #define T_BANG (T_META+3)
133 #define T_AT (T_META+4)
134 #define T_TILDE (T_META+5)
135 #define T_PERCENT (T_META+6)
136 #define T_LEFT (T_META+7)
137 #define T_OPEN (T_META+8)
138 #define T_CLOSE (T_OPEN+1)
139 #define T_RIGHT (T_OPEN+2)
140 #define T_CFLX (T_OPEN+3)
141 #define T_DOT (T_OPEN+4)
142 #define T_DOTSTAR (T_OPEN+5)
143 #define T_END (T_OPEN+6)
144 #define T_BAD (T_OPEN+7)
145 #define T_DOLL (T_OPEN+8)
146 #define T_BRA (T_OPEN+9)
147 #define T_BAR (T_OPEN+10)
148 #define T_AND (T_OPEN+11)
149 #define T_LT (T_OPEN+12)
150 #define T_GT (T_OPEN+13)
151 #define T_SLASHPLUS (T_OPEN+14)
152 #define T_GROUP (T_OPEN+15)
153 #define T_WORD (T_OPEN+16)
154 #define T_WORD_NOT (T_WORD+1)
155 #define T_BEG_STR (T_WORD+2)
156 #define T_END_STR (T_WORD+3)
157 #define T_FIN_STR (T_WORD+4)
158 #define T_ESCAPE (T_WORD+5)
159 #define T_ALNUM (T_WORD+6)
160 #define T_ALNUM_NOT (T_ALNUM+1)
161 #define T_DIGIT (T_ALNUM+2)
162 #define T_DIGIT_NOT (T_ALNUM+3)
163 #define T_SPACE (T_ALNUM+4)
164 #define T_SPACE_NOT (T_ALNUM+5)
165 #define T_BACK (T_ALNUM+6)
167 #define BRE 0
168 #define ERE 3
169 #define ARE 6
170 #define SRE 9
171 #define KRE 12
173 #define HIT SSIZE_MAX
175 #define bitclr(p,c) ((p)[((c)>>3)&037]&=(~(1<<((c)&07))))
176 #define bitset(p,c) ((p)[((c)>>3)&037]|=(1<<((c)&07)))
177 #define bittst(p,c) ((p)[((c)>>3)&037]&(1<<((c)&07)))
179 #define setadd(p,c) bitset((p)->bits,c)
180 #define setclr(p,c) bitclr((p)->bits,c)
181 #define settst(p,c) bittst((p)->bits,c)
183 #if _hdr_wchar && _lib_wctype && _lib_iswctype
185 #include <stdio.h> /* because <wchar.h> includes it and we generate it */
186 #include <wchar.h>
187 #if _hdr_wctype
188 #include <wctype.h>
189 #endif
191 #if !defined(iswblank) && !_lib_iswblank
192 #define _need_iswblank 1
193 #define iswblank(x) _reg_iswblank(x)
194 extern int _reg_iswblank(wint_t);
195 #endif
197 #if !defined(towupper) && !_lib_towupper
198 #define towupper(x) toupper(x)
199 #endif
201 #if !defined(towlower) && !_lib_towlower
202 #define towlower(x) tolower(x)
203 #endif
205 #else
207 #undef _lib_wctype
209 #ifndef iswalnum
210 #define iswalnum(x) isalnum(x)
211 #endif
212 #ifndef iswalpha
213 #define iswalpha(x) isalpha(x)
214 #endif
215 #ifndef iswcntrl
216 #define iswcntrl(x) iscntrl(x)
217 #endif
218 #ifndef iswdigit
219 #define iswdigit(x) isdigit(x)
220 #endif
221 #ifndef iswgraph
222 #define iswgraph(x) isgraph(x)
223 #endif
224 #ifndef iswlower
225 #define iswlower(x) islower(x)
226 #endif
227 #ifndef iswprint
228 #define iswprint(x) isprint(x)
229 #endif
230 #ifndef iswpunct
231 #define iswpunct(x) ispunct(x)
232 #endif
233 #ifndef iswspace
234 #define iswspace(x) isspace(x)
235 #endif
236 #ifndef iswupper
237 #define iswupper(x) isupper(x)
238 #endif
239 #ifndef iswxdigit
240 #define iswxdigit(x) isxdigit(x)
241 #endif
243 #ifndef towlower
244 #define towlower(x) tolower(x)
245 #endif
246 #ifndef towupper
247 #define towupper(x) toupper(x)
248 #endif
250 #endif
252 #ifndef iswblank
253 #define iswblank(x) ((x)==' '||(x)=='\t')
254 #endif
256 #ifndef iswgraph
257 #define iswgraph(x) (iswprint(x)&&!iswblank(x))
258 #endif
260 #define isword(x) (isalnum(x)||(x)=='_')
263 * collation element support
266 #define COLL_KEY_MAX 32
268 #if COLL_KEY_MAX < MB_LEN_MAX
269 #undef COLL_KEY_MAX
270 #define COLL_KEY_MAX MB_LEN_MAX
271 #endif
273 typedef unsigned char Ckey_t[COLL_KEY_MAX+1];
275 #define COLL_end 0
276 #define COLL_call 1
277 #define COLL_char 2
278 #define COLL_range 3
279 #define COLL_range_lc 4
280 #define COLL_range_uc 5
282 typedef struct Celt_s
284 short typ;
285 short min;
286 short max;
287 regclass_t fun;
288 Ckey_t beg;
289 Ckey_t end;
290 } Celt_t;
293 * private stuff hanging off regex_t
296 typedef struct Stk_pos_s
298 off_t offset;
299 char* base;
300 } Stk_pos_t;
302 typedef struct Vector_s
304 Stk_t* stk; /* stack pointer */
305 char* vec; /* the data */
306 int inc; /* growth increment */
307 int siz; /* element size */
308 int max; /* max index */
309 int cur; /* current index -- user domain */
310 } Vector_t;
313 * Rex_t subtypes
316 typedef struct Cond_s
318 unsigned char* beg; /* beginning of next match */
319 struct Rex_s* next[2]; /* 0:no 1:yes next pattern */
320 struct Rex_s* cont; /* right catcher */
321 int yes; /* yes condition hit */
322 } Cond_t;
324 typedef struct Conj_left_s
326 unsigned char* beg; /* beginning of left match */
327 struct Rex_s* right; /* right pattern */
328 struct Rex_s* cont; /* right catcher */
329 } Conj_left_t;
331 typedef struct Conj_right_s
333 unsigned char* end; /* end of left match */
334 struct Rex_s* cont; /* ambient continuation */
335 } Conj_right_t;
337 typedef unsigned int Bm_mask_t;
339 typedef struct Bm_s
341 Bm_mask_t** mask;
342 size_t* skip;
343 size_t* fail;
344 size_t size;
345 ssize_t back;
346 ssize_t left;
347 ssize_t right;
348 size_t complete;
349 } Bm_t;
351 typedef struct String_s
353 int* fail;
354 unsigned char* base;
355 size_t size;
356 } String_t;
358 typedef struct Set_s
360 unsigned char bits[(UCHAR_MAX+1)/CHAR_BIT];
361 } Set_t;
363 typedef struct Collate_s
365 int invert;
366 Celt_t* elements;
367 } Collate_t;
369 typedef struct Binary_s
371 struct Rex_s* left;
372 struct Rex_s* right;
373 int serial;
374 } Binary_t;
376 typedef struct Group_s
378 int number; /* group number */
379 int last; /* last contained group number */
380 int size; /* lookbehind size */
381 int back; /* backreferenced */
382 regflags_t flags; /* group flags */
383 union
385 Binary_t binary;
386 struct Rex_s* rex;
387 } expr;
388 } Group_t;
390 typedef struct Exec_s
392 void* data;
393 const char* text;
394 size_t size;
395 } Exec_t;
397 #define REX_NEST_open 0x01
398 #define REX_NEST_close 0x02
399 #define REX_NEST_escape 0x04
400 #define REX_NEST_quote 0x08
401 #define REX_NEST_literal 0x10
402 #define REX_NEST_delimiter 0x20
403 #define REX_NEST_terminator 0x40
404 #define REX_NEST_separator 0x80
406 #define REX_NEST_SHIFT 8
408 typedef struct Nest_s
410 int primary;
411 unsigned short none; /* for Nest_t.type[-1] */
412 unsigned short type[1];
413 } Nest_t;
416 * REX_ALT catcher, solely to get control at the end of an
417 * alternative to keep records for comparing matches.
420 typedef struct Alt_catch_s
422 struct Rex_s* cont;
423 } Alt_catch_t;
425 typedef struct Group_catch_s
427 struct Rex_s* cont;
428 regoff_t* eo;
429 } Group_catch_t;
431 typedef struct Behind_catch_s
433 struct Rex_s* cont;
434 unsigned char* beg;
435 unsigned char* end;
436 } Behind_catch_t;
439 * REX_NEG catcher determines what string lengths can be matched,
440 * then Neg investigates continuations of other lengths.
441 * This is inefficient. For !POSITIONS expressions, we can do better:
442 * since matches to rex will be enumerated in decreasing order,
443 * we can investigate continuations whenever a length is skipped.
446 typedef struct Neg_catch_s
448 unsigned char* beg;
449 unsigned char* index;
450 } Neg_catch_t;
453 * REX_REP catcher. One is created on the stack for
454 * each iteration of a complex repetition.
457 typedef struct Rep_catch_s
459 struct Rex_s* cont;
460 struct Rex_s* ref;
461 unsigned char* beg;
462 int n;
463 } Rep_catch_t;
466 * data structure for an alternation of pure strings
467 * son points to a subtree of all strings with a common
468 * prefix ending in character c. sib links alternate
469 * letters in the same position of a word. end=1 if
470 * some word ends with c. the order of strings is
471 * irrelevant, except long words must be investigated
472 * before short ones.
475 typedef struct Trie_node_s
477 unsigned char c;
478 unsigned char end;
479 struct Trie_node_s* son;
480 struct Trie_node_s* sib;
481 } Trie_node_t;
483 typedef struct Trie_s
485 Trie_node_t** root;
486 int min;
487 int max;
488 } Trie_t;
491 * Rex_t is a node in a regular expression
494 typedef struct Rex_s
496 unsigned char type; /* node type */
497 unsigned char marked; /* already marked */
498 short serial; /* subpattern number */
499 regflags_t flags; /* scoped flags */
500 int explicit; /* scoped explicit match*/
501 struct Rex_s* next; /* remaining parts */
502 int lo; /* lo dup count */
503 int hi; /* hi dup count */
504 unsigned char* map; /* fold and/or ccode map*/
505 union
507 Alt_catch_t alt_catch; /* alt catcher */
508 Bm_t bm; /* bm */
509 Behind_catch_t behind_catch; /* behind catcher */
510 Set_t* charclass; /* char class */
511 Collate_t collate; /* collation class */
512 Cond_t cond_catch; /* cond catcher */
513 Conj_left_t conj_left; /* conj left catcher */
514 Conj_right_t conj_right; /* conj right catcher */
515 void* data; /* data after Rex_t */
516 Exec_t exec; /* re.re_exec() args */
517 Group_t group; /* a|b or rep */
518 Group_catch_t group_catch; /* group catcher */
519 Neg_catch_t neg_catch; /* neg catcher */
520 Nest_t nest; /* nested match */
521 unsigned char onechar; /* single char */
522 Rep_catch_t rep_catch; /* rep catcher */
523 String_t string; /* string/kmp */
524 Trie_t trie; /* trie */
525 } re;
526 } Rex_t;
528 typedef struct reglib_s /* library private regex_t info */
530 struct Rex_s* rex; /* compiled expression */
531 regdisc_t* disc; /* REG_DISCIPLINE discipline */
532 const regex_t* regex; /* from regexec */
533 unsigned char* beg; /* beginning of string */
534 unsigned char* end; /* end of string */
535 Vector_t* pos; /* posns of certain subpatterns */
536 Vector_t* bestpos; /* ditto for best match */
537 regmatch_t* match; /* subexrs in current match */
538 regmatch_t* best; /* ditto in best match yet */
539 Stk_pos_t stk; /* exec stack pos */
540 size_t min; /* minimum match length */
541 size_t nsub; /* internal re_nsub */
542 regflags_t flags; /* flags from regcomp() */
543 int error; /* last error */
544 int explicit; /* explicit match on this char */
545 int leading; /* leading match on this char */
546 int refs; /* regcomp()+regdup() references*/
547 Rex_t done; /* the last continuation */
548 regstat_t stats; /* for regstat() */
549 unsigned char fold[UCHAR_MAX+1]; /* REG_ICASE map */
550 unsigned char hard; /* hard comp */
551 unsigned char once; /* if 1st parse fails, quit */
552 unsigned char separate; /* cannot combine */
553 unsigned char stack; /* hard comp or exec */
554 unsigned char sub; /* re_sub is valid */
555 unsigned char test; /* debug/test bitmask */
556 } Env_t;
558 typedef struct State_s /* shared state */
560 regmatch_t nomatch;
561 struct
563 unsigned char key;
564 short val[15];
565 } escape[52];
566 short* magic[UCHAR_MAX+1];
567 regdisc_t disc;
568 int fatal;
569 int initialized;
570 Dt_t* attrs;
571 Dt_t* names;
572 Dtdisc_t dtdisc;
573 } State_t;
575 extern State_t state;
577 extern void* alloc(regdisc_t*, void*, size_t);
578 extern regclass_t classfun(int);
579 extern void drop(regdisc_t*, Rex_t*);
580 extern int fatal(regdisc_t*, int, const char*);
582 #endif