1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 ****************************************************************/
25 /* lasciate ogne speranza, voi ch'intrate. */
27 #if HAVE_NBTOOL_CONFIG_H
28 #include "nbtool_config.h"
41 #define HAT (NCHARS+2) /* matches ^ in regular expr */
45 #define type(v) (v)->nobj /* badly overloaded here */
46 #define info(v) (v)->ntype /* badly overloaded here */
47 #define left(v) (v)->narg[0]
48 #define right(v) (v)->narg[1]
49 #define parent(v) (v)->nnext
51 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
52 #define ELEAF case EMPTYRE: /* empty string in regexp */
53 #define UNARY case STAR: case PLUS: case QUEST:
55 /* encoding in tree Nodes:
56 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
57 left is index, right contains value or pointer to value
58 unary (STAR, PLUS, QUEST): left is child, right is null
59 binary (CAT, OR): left and right are children
60 parent contains pointer to parent
68 int rtok
; /* next token in current re */
70 static uschar
*rlxstr
;
71 static uschar
*prestr
; /* current position in current re */
72 static uschar
*lastre
; /* origin of last re */
80 #define NFA 128 /* cache this many dynamic fa's */
82 int nfatab
= 0; /* entries in fatab */
85 resize_state(fa
*fa
, int state
)
90 if (++state
< fa
->state_count
)
93 new_count
= state
+ 10; /* needs to be tuned */
95 p
= realloc(fa
->gototab
, new_count
* sizeof(fa
->gototab
[0]));
100 p
= realloc(fa
->out
, new_count
* sizeof(fa
->out
[0]));
105 p
= realloc(fa
->posns
, new_count
* sizeof(fa
->posns
[0]));
110 for (i
= fa
->state_count
; i
< new_count
; ++i
) {
111 fa
->gototab
[i
] = calloc(1, NCHARS
* sizeof (**fa
->gototab
));
112 if (fa
->gototab
[i
] == NULL
)
117 fa
->state_count
= new_count
;
120 overflo("out of memory in resize_state");
123 fa
*makedfa(const char *s
, int anchor
) /* returns dfa for reg expr s */
129 if (setvec
== 0) { /* first time through any RE */
131 setvec
= (int *) malloc(maxsetvec
* sizeof(int));
132 tmpset
= (int *) malloc(maxsetvec
* sizeof(int));
133 if (setvec
== 0 || tmpset
== 0)
134 overflo("out of space initializing makedfa");
137 if (compile_time
) /* a constant for sure */
138 return mkdfa(s
, anchor
);
139 for (i
= 0; i
< nfatab
; i
++) /* is it there already? */
140 if (fatab
[i
]->anchor
== anchor
141 && strcmp((const char *) fatab
[i
]->restr
, s
) == 0) {
142 fatab
[i
]->use
= now
++;
145 pfa
= mkdfa(s
, anchor
);
146 if (nfatab
< NFA
) { /* room for another */
148 fatab
[nfatab
]->use
= now
++;
152 use
= fatab
[0]->use
; /* replace least-recently used */
154 for (i
= 1; i
< nfatab
; i
++)
155 if (fatab
[i
]->use
< use
) {
165 fa
*mkdfa(const char *s
, int anchor
) /* does the real work of making a dfa */
166 /* anchor = 1 for anchored matches, else 0 */
172 p1
= op2(CAT
, op2(STAR
, op2(ALL
, NIL
, NIL
), NIL
), p
);
173 /* put ALL STAR in front of reg. exp. */
174 p1
= op2(CAT
, p1
, op2(FINAL
, NIL
, NIL
));
175 /* put FINAL after reg. exp. */
178 penter(p1
); /* enter parent pointers and leaf indices */
179 if ((f
= (fa
*) calloc(1, sizeof(fa
) + poscnt
*sizeof(rrow
))) == NULL
)
180 overflo("out of space for fa");
181 f
->accept
= poscnt
-1; /* penter has computed number of positions in re */
182 cfoll(f
, p1
); /* set up follow sets */
185 if ((f
->posns
[0] = (int *) calloc(1, *(f
->re
[0].lfollow
)*sizeof(int))) == NULL
)
186 overflo("out of space in makedfa");
187 if ((f
->posns
[1] = (int *) calloc(1, sizeof(int))) == NULL
)
188 overflo("out of space in makedfa");
190 f
->initstat
= makeinit(f
, anchor
);
192 f
->restr
= (uschar
*) tostring(s
);
196 int makeinit(fa
*f
, int anchor
)
203 k
= *(f
->re
[0].lfollow
);
205 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
206 overflo("out of space in makeinit");
207 for (i
=0; i
<= k
; i
++) {
208 (f
->posns
[2])[i
] = (f
->re
[0].lfollow
)[i
];
210 if ((f
->posns
[2])[1] == f
->accept
)
212 for (i
=0; i
< NCHARS
; i
++)
213 f
->gototab
[2][i
] = 0;
214 f
->curstat
= cgoto(f
, 2, HAT
);
216 *f
->posns
[2] = k
-1; /* leave out position 0 */
217 for (i
=0; i
< k
; i
++) {
218 (f
->posns
[0])[i
] = (f
->posns
[2])[i
];
221 f
->out
[0] = f
->out
[2];
222 if (f
->curstat
!= 2) {
223 resize_state(f
, f
->curstat
);
224 --(*f
->posns
[f
->curstat
]);
230 void penter(Node
*p
) /* set up parent pointers and leaf indices */
247 parent(right(p
)) = p
;
249 default: /* can't happen */
250 FATAL("can't happen: unknown type %d in penter", type(p
));
255 void freetr(Node
*p
) /* free parse tree */
272 default: /* can't happen */
273 FATAL("can't happen: unknown type %d in freetr", type(p
));
278 /* in the parsing of regular expressions, metacharacters like . have */
279 /* to be seen literally; \056 is not a metacharacter. */
281 int hexstr(uschar
**pp
) /* find and eval hex string at pp, return new p */
282 { /* only pick up one 8-bit byte (2 chars) */
287 for (i
= 0, p
= (uschar
*) *pp
; i
< 2 && isxdigit(*p
); i
++, p
++) {
289 n
= 16 * n
+ *p
- '0';
290 else if (*p
>= 'a' && *p
<= 'f')
291 n
= 16 * n
+ *p
- 'a' + 10;
292 else if (*p
>= 'A' && *p
<= 'F')
293 n
= 16 * n
+ *p
- 'A' + 10;
299 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
301 int quoted(uschar
**pp
) /* pick up next thing after a \\ */
302 /* and increment *pp */
307 if ((c
= *p
++) == 't')
319 else if (c
== 'x') { /* hexadecimal goo follows */
320 c
= hexstr(&p
); /* this adds a null if number is invalid */
321 } else if (isoctdigit(c
)) { /* \d \dd \ddd */
323 if (isoctdigit(*p
)) {
324 n
= 8 * n
+ *p
++ - '0';
326 n
= 8 * n
+ *p
++ - '0';
335 char *cclenter(const char *argp
) /* add a character class */
338 uschar
*p
= (uschar
*) argp
;
340 static uschar
*buf
= 0;
341 static int bufsz
= 100;
344 if (buf
== 0 && (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
345 FATAL("out of space for character class [%.10s...] 1", p
);
347 for (i
= 0; (c
= *p
++) != 0; ) {
350 } else if (c
== '-' && i
> 0 && bp
[-1] != 0) {
356 if (c
> c2
) { /* empty; ignore */
362 if (!adjbuf(&buf
, &bufsz
, bp
-buf
+2, 100, &bp
, "cclenter1"))
363 FATAL("out of space for character class [%.10s...] 2", p
);
370 if (!adjbuf(&buf
, &bufsz
, bp
-buf
+2, 100, &bp
, "cclenter2"))
371 FATAL("out of space for character class [%.10s...] 3", p
);
376 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op
, buf
) );
378 return (char *) tostring((char *) buf
);
381 void overflo(const char *s
)
383 FATAL("regular expression too big: %.30s...", s
);
386 void cfoll(fa
*f
, Node
*v
) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
394 f
->re
[info(v
)].ltype
= type(v
);
395 f
->re
[info(v
)].lval
.np
= right(v
);
396 while (f
->accept
>= maxsetvec
) { /* guessing here! */
398 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
399 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
400 if (setvec
== 0 || tmpset
== 0)
401 overflo("out of space in cfoll()");
403 for (i
= 0; i
<= f
->accept
; i
++)
406 follow(v
); /* computes setvec and setcnt */
407 if ((p
= (int *) calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
408 overflo("out of space building follow set");
409 f
->re
[info(v
)].lfollow
= p
;
411 for (i
= f
->accept
; i
>= 0; i
--)
423 default: /* can't happen */
424 FATAL("can't happen: unknown type %d in cfoll", type(v
));
428 int first(Node
*p
) /* collects initially active leaves of p into setvec */
429 /* returns 0 if p matches empty string */
436 lp
= info(p
); /* look for high-water mark of subscripts */
437 while (setcnt
>= maxsetvec
|| lp
>= maxsetvec
) { /* guessing here! */
439 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
440 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
441 if (setvec
== 0 || tmpset
== 0)
442 overflo("out of space in first()");
444 if (type(p
) == EMPTYRE
) {
448 if (setvec
[lp
] != 1) {
452 if (type(p
) == CCL
&& (*(char *) right(p
)) == '\0')
453 return(0); /* empty CCL */
456 if (first(left(p
)) == 0) return(0);
463 if (first(left(p
)) == 0 && first(right(p
)) == 0) return(0);
467 if (first(left(p
)) == 0 || b
== 0) return(0);
470 FATAL("can't happen: unknown type %d in first", type(p
)); /* can't happen */
474 void follow(Node
*v
) /* collects leaves that can follow v into setvec */
478 if (type(v
) == FINAL
)
494 if (v
== left(p
)) { /* v is left child of p */
495 if (first(right(p
)) == 0) {
499 } else /* v is right child */
505 int member(int c
, const char *sarg
) /* is c in s? */
507 uschar
*s
= (uschar
*) sarg
;
515 int match(fa
*f
, const char *p0
) /* shortest match ? */
518 uschar
*p
= (uschar
*) p0
;
521 assert (s
< f
->state_count
);
526 /* assert(*p < NCHARS); */
527 if ((ns
= f
->gototab
[s
][*p
]) != 0)
532 assert (s
< f
->state_count
);
540 int pmatch(fa
*f
, const char *p0
) /* longest match, for sub */
543 uschar
*p
= (uschar
*) p0
;
547 assert(s
< f
->state_count
);
553 if (f
->out
[s
]) /* final state */
555 /* assert(*q < NCHARS); */
556 if ((ns
= f
->gototab
[s
][*q
]) != 0)
561 assert(s
< f
->state_count
);
563 if (s
== 1) { /* no transition */
569 goto nextin
; /* no match */
573 patlen
= q
-p
-1; /* don't count $ */
584 int nematch(fa
*f
, const char *p0
) /* non-empty match, for sub */
587 uschar
*p
= (uschar
*) p0
;
591 assert(s
< f
->state_count
);
597 if (f
->out
[s
]) /* final state */
599 /* assert(*q < NCHARS); */
600 if ((ns
= f
->gototab
[s
][*q
]) != 0)
605 assert(s
< f
->state_count
);
607 if (s
== 1) { /* no transition */
612 goto nnextin
; /* no nonempty match */
616 patlen
= q
-p
-1; /* don't count $ */
628 Node
*reparse(const char *p
) /* parses regular expression pointed to by p */
629 { /* uses relex() to scan regular expression */
632 dprintf( ("reparse <%s>\n", p
) );
633 lastre
= prestr
= (uschar
*) p
; /* prestr points to string to be parsed */
635 /* GNU compatibility: an empty regexp matches anything */
637 /* FATAL("empty regular expression"); previous */
638 return(op2(EMPTYRE
, NIL
, NIL
));
642 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
646 Node
*regexp(void) /* top-level parse of reg expr */
648 return (alt(concat(primary())));
657 np
= op2(CHAR
, NIL
, itonp(rlxval
));
662 return (unary(op2(ALL
, NIL
, NIL
)));
665 return (unary(op2(ALL
, NIL
, NIL
)));
668 return (unary(op2(DOT
, NIL
, NIL
)));
670 np
= op2(CCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
674 np
= op2(NCCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
679 return (unary(op2(CHAR
, NIL
, itonp(HAT
))));
682 return (unary(op2(CHAR
, NIL
, NIL
)));
685 if (rtok
== ')') { /* special pleading for () */
687 return unary(op2(CCL
, NIL
, (Node
*) tostring("")));
695 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
697 FATAL("illegal primary in regular expression %s at %s", lastre
, prestr
);
699 return 0; /*NOTREACHED*/
702 Node
*concat(Node
*np
)
705 case CHAR
: case DOT
: case ALL
: case EMPTYRE
: case CCL
: case NCCL
: case '$': case '(':
706 return (concat(op2(CAT
, np
, primary())));
715 return (alt(op2(OR
, np
, concat(primary()))));
720 Node
*unary(Node
*np
)
725 return (unary(op2(STAR
, np
, NIL
)));
728 return (unary(op2(PLUS
, np
, NIL
)));
731 return (unary(op2(QUEST
, np
, NIL
)));
738 * Character class definitions conformant to the POSIX locale as
739 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
740 * and operating character sets are both ASCII (ISO646) or supersets
743 * Note that to avoid overflowing the temporary buffer used in
744 * relex(), the expanded character class (prior to range expansion)
745 * must be less than twice the size of their full name.
748 /* Because isblank doesn't show up in any of the header files on any
749 * system i use, it's defined here. if some other locale has a richer
750 * definition of "blank", define HAS_ISBLANK and provide your own
752 * the parentheses here are an attempt to find a path through the maze
753 * of macro definition and/or function and/or version provided. thanks
754 * to nelson beebe for the suggestion; let's see if it works everywhere.
761 return c
==' ' || c
=='\t';
766 static const struct charclass
{
771 { "alnum", 5, isalnum
},
772 { "alpha", 5, isalpha
},
773 { "blank", 5, isblank
},
774 { "cntrl", 5, iscntrl
},
775 { "digit", 5, isdigit
},
776 { "graph", 5, isgraph
},
777 { "lower", 5, islower
},
778 { "print", 5, isprint
},
779 { "punct", 5, ispunct
},
780 { "space", 5, isspace
},
781 { "upper", 5, isupper
},
782 { "xdigit", 6, isxdigit
},
787 int relex(void) /* lexical analyzer for reparse */
791 static uschar
*buf
= 0;
792 static int bufsz
= 100;
794 const struct charclass
*cc
;
797 switch (c
= *prestr
++) {
799 case '*': return STAR
;
800 case '+': return PLUS
;
801 case '?': return QUEST
;
802 case '.': return DOT
;
803 case '\0': prestr
--; return '\0';
810 rlxval
= quoted(&prestr
);
816 if (buf
== 0 && (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
817 FATAL("out of space in reg expr %.10s..", lastre
);
819 if (*prestr
== '^') {
825 n
= 2 * strlen((const char *) prestr
)+1;
826 if (!adjbuf(&buf
, &bufsz
, n
, n
, &bp
, "relex1"))
827 FATAL("out of space for reg expr %.10s...", lastre
);
829 if ((c
= *prestr
++) == '\\') {
831 if ((c
= *prestr
++) == '\0')
832 FATAL("nonterminated character class %.20s...", lastre
);
834 /* } else if (c == '\n') { */
835 /* FATAL("newline in character class %.20s...", lastre); */
836 } else if (c
== '[' && *prestr
== ':') {
837 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
838 for (cc
= charclasses
; cc
->cc_name
; cc
++)
839 if (strncmp((const char *) prestr
+ 1, (const char *) cc
->cc_name
, cc
->cc_namelen
) == 0)
841 if (cc
->cc_name
!= NULL
&& prestr
[1 + cc
->cc_namelen
] == ':' &&
842 prestr
[2 + cc
->cc_namelen
] == ']') {
843 prestr
+= cc
->cc_namelen
+ 3;
844 for (i
= 1; i
< NCHARS
; i
++) {
845 if (!adjbuf(&buf
, &bufsz
, bp
-buf
+1, 100, &bp
, "relex2"))
846 FATAL("out of space for reg expr %.10s...", lastre
);
847 if (cc
->cc_func(i
)) {
854 } else if (c
== '\0') {
855 FATAL("nonterminated character class %.20s", lastre
);
856 } else if (bp
== buf
) { /* 1st char is special */
858 } else if (c
== ']') {
860 rlxstr
= (uschar
*) tostring((char *) buf
);
871 int cgoto(fa
*f
, int s
, int c
)
876 assert(c
== HAT
|| c
< NCHARS
);
877 while (f
->accept
>= maxsetvec
) { /* guessing here! */
879 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
880 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
881 if (setvec
== 0 || tmpset
== 0)
882 overflo("out of space in cgoto()");
884 for (i
= 0; i
<= f
->accept
; i
++)
888 /* compute positions of gototab[s,c] into setvec */
890 for (i
= 1; i
<= *p
; i
++) {
891 if ((k
= f
->re
[p
[i
]].ltype
) != FINAL
) {
892 if ((k
== CHAR
&& c
== ptoi(f
->re
[p
[i
]].lval
.np
))
893 || (k
== DOT
&& c
!= 0 && c
!= HAT
)
894 || (k
== ALL
&& c
!= 0)
895 || (k
== EMPTYRE
&& c
!= 0)
896 || (k
== CCL
&& member(c
, (char *) f
->re
[p
[i
]].lval
.up
))
897 || (k
== NCCL
&& !member(c
, (char *) f
->re
[p
[i
]].lval
.up
) && c
!= 0 && c
!= HAT
)) {
898 q
= f
->re
[p
[i
]].lfollow
;
899 for (j
= 1; j
<= *q
; j
++) {
900 if (q
[j
] >= maxsetvec
) {
902 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
903 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
904 if (setvec
== 0 || tmpset
== 0)
905 overflo("cgoto overflow");
907 if (setvec
[q
[j
]] == 0) {
915 /* determine if setvec is a previous state */
918 for (i
= f
->accept
; i
>= 0; i
--)
923 resize_state(f
, f
->curstat
> s
? f
->curstat
: s
);
924 /* tmpset == previous state? */
925 for (i
= 1; i
<= f
->curstat
; i
++) {
927 if ((k
= tmpset
[0]) != p
[0])
929 for (j
= 1; j
<= k
; j
++)
930 if (tmpset
[j
] != p
[j
])
932 /* setvec is state i */
934 f
->gototab
[s
][c
] = i
;
939 /* add tmpset to current set of states */
941 resize_state(f
, f
->curstat
);
942 for (i
= 0; i
< NCHARS
; i
++)
943 f
->gototab
[f
->curstat
][i
] = 0;
944 xfree(f
->posns
[f
->curstat
]);
945 if ((p
= (int *) calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
946 overflo("out of space in cgoto");
948 f
->posns
[f
->curstat
] = p
;
950 f
->gototab
[s
][c
] = f
->curstat
;
951 for (i
= 0; i
<= setcnt
; i
++)
953 if (setvec
[f
->accept
])
954 f
->out
[f
->curstat
] = 1;
956 f
->out
[f
->curstat
] = 0;
961 void freefa(fa
*f
) /* free a finite automaton */
967 for (i
= 0; i
< f
->state_count
; i
++) {
971 for (i
= 0; i
<= f
->accept
; i
++) {
972 xfree(f
->re
[i
].lfollow
);
973 if (f
->re
[i
].ltype
== CCL
|| f
->re
[i
].ltype
== NCCL
)
974 xfree((f
->re
[i
].lval
.np
));