3 /****************************************************************
4 Copyright (C) Lucent Technologies 1997
7 Permission to use, copy, modify, and distribute this software and
8 its documentation for any purpose and without fee is hereby
9 granted, provided that the above copyright notice appear in all
10 copies and that both that the copyright notice and this
11 permission notice and warranty disclaimer appear in supporting
12 documentation, and that the name Lucent Technologies or any of
13 its entities not be used in advertising or publicity pertaining
14 to distribution of the software without specific, written prior
17 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
18 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
19 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
20 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
21 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
22 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
23 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
25 ****************************************************************/
27 /* lasciate ogne speranza, voi ch'entrate. */
38 #define HAT (NCHARS+2) /* matches ^ in regular expr */
42 #define type(v) (v)->nobj /* badly overloaded here */
43 #define info(v) (v)->ntype /* badly overloaded here */
44 #define left(v) (v)->narg[0]
45 #define right(v) (v)->narg[1]
46 #define parent(v) (v)->nnext
48 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
49 #define UNARY case STAR: case PLUS: case QUEST:
51 /* encoding in tree Nodes:
52 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
53 left is index, right contains value or pointer to value
54 unary (STAR, PLUS, QUEST): left is child, right is null
55 binary (CAT, OR): left and right are children
56 parent contains pointer to parent
64 int rtok
; /* next token in current re */
66 static uschar
*rlxstr
;
67 static uschar
*prestr
; /* current position in current re */
68 static uschar
*lastre
; /* origin of last re */
76 #define NFA 20 /* cache this many dynamic fa's */
78 int nfatab
= 0; /* entries in fatab */
80 fa
*makedfa(const char *s
, int anchor
) /* returns dfa for reg expr s */
86 if (setvec
== 0) { /* first time through any RE */
88 setvec
= (int *) malloc(maxsetvec
* sizeof(int));
89 tmpset
= (int *) malloc(maxsetvec
* sizeof(int));
90 if (setvec
== 0 || tmpset
== 0)
91 overflo("out of space initializing makedfa");
94 if (compile_time
) /* a constant for sure */
95 return mkdfa(s
, anchor
);
96 for (i
= 0; i
< nfatab
; i
++) /* is it there already? */
97 if (fatab
[i
]->anchor
== anchor
98 && strcmp((const char *) fatab
[i
]->restr
, s
) == 0) {
99 fatab
[i
]->use
= now
++;
102 pfa
= mkdfa(s
, anchor
);
103 if (nfatab
< NFA
) { /* room for another */
105 fatab
[nfatab
]->use
= now
++;
109 use
= fatab
[0]->use
; /* replace least-recently used */
111 for (i
= 1; i
< nfatab
; i
++)
112 if (fatab
[i
]->use
< use
) {
122 fa
*mkdfa(const char *s
, int anchor
) /* does the real work of making a dfa */
123 /* anchor = 1 for anchored matches, else 0 */
129 p1
= op2(CAT
, op2(STAR
, op2(ALL
, NIL
, NIL
), NIL
), p
);
130 /* put ALL STAR in front of reg. exp. */
131 p1
= op2(CAT
, p1
, op2(FINAL
, NIL
, NIL
));
132 /* put FINAL after reg. exp. */
135 penter(p1
); /* enter parent pointers and leaf indices */
136 if ((f
= (fa
*) calloc(1, sizeof(fa
) + poscnt
*sizeof(rrow
))) == NULL
)
137 overflo("out of space for fa");
138 f
->accept
= poscnt
-1; /* penter has computed number of positions in re */
139 cfoll(f
, p1
); /* set up follow sets */
141 if ((f
->posns
[0] = (int *) calloc(1, *(f
->re
[0].lfollow
)*sizeof(int))) == NULL
)
142 overflo("out of space in makedfa");
143 if ((f
->posns
[1] = (int *) calloc(1, sizeof(int))) == NULL
)
144 overflo("out of space in makedfa");
146 f
->initstat
= makeinit(f
, anchor
);
148 f
->restr
= (uschar
*) tostring(s
);
152 int makeinit(fa
*f
, int anchor
)
159 k
= *(f
->re
[0].lfollow
);
161 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
162 overflo("out of space in makeinit");
163 for (i
=0; i
<= k
; i
++) {
164 (f
->posns
[2])[i
] = (f
->re
[0].lfollow
)[i
];
166 if ((f
->posns
[2])[1] == f
->accept
)
168 for (i
=0; i
< NCHARS
; i
++)
169 f
->gototab
[2][i
] = 0;
170 f
->curstat
= cgoto(f
, 2, HAT
);
172 *f
->posns
[2] = k
-1; /* leave out position 0 */
173 for (i
=0; i
< k
; i
++) {
174 (f
->posns
[0])[i
] = (f
->posns
[2])[i
];
177 f
->out
[0] = f
->out
[2];
179 --(*f
->posns
[f
->curstat
]);
184 void penter(Node
*p
) /* set up parent pointers and leaf indices */
200 parent(right(p
)) = p
;
202 default: /* can't happen */
203 FATAL("can't happen: unknown type %d in penter", type(p
));
208 void freetr(Node
*p
) /* free parse tree */
224 default: /* can't happen */
225 FATAL("can't happen: unknown type %d in freetr", type(p
));
230 /* in the parsing of regular expressions, metacharacters like . have */
231 /* to be seen literally; \056 is not a metacharacter. */
233 int hexstr(char **pp
) /* find and eval hex string at pp, return new p */
234 { /* only pick up one 8-bit byte (2 chars) */
239 for (i
= 0, p
= (uschar
*) *pp
; i
< 2 && isxdigit(*p
); i
++, p
++) {
241 n
= 16 * n
+ *p
- '0';
242 else if (*p
>= 'a' && *p
<= 'f')
243 n
= 16 * n
+ *p
- 'a' + 10;
244 else if (*p
>= 'A' && *p
<= 'F')
245 n
= 16 * n
+ *p
- 'A' + 10;
251 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
253 int quoted(char **pp
) /* pick up next thing after a \\ */
254 /* and increment *pp */
259 if ((c
= *p
++) == 't')
271 else if (c
== 'x') { /* hexadecimal goo follows */
272 c
= hexstr(&p
); /* this adds a null if number is invalid */
273 } else if (isoctdigit(c
)) { /* \d \dd \ddd */
275 if (isoctdigit(*p
)) {
276 n
= 8 * n
+ *p
++ - '0';
278 n
= 8 * n
+ *p
++ - '0';
287 char *cclenter(const char *argp
) /* add a character class */
290 uschar
*p
= (uschar
*) argp
;
292 static uschar
*buf
= 0;
293 static int bufsz
= 100;
296 if (buf
== 0 && (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
297 FATAL("out of space for character class [%.10s...] 1", p
);
299 for (i
= 0; (c
= *p
++) != 0; ) {
301 c
= quoted((char **) &p
);
302 } else if (c
== '-' && i
> 0 && bp
[-1] != 0) {
307 c2
= quoted((char **) &p
);
308 if (c
> c2
) { /* empty; ignore */
314 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+2, 100, (char **) &bp
, 0))
315 FATAL("out of space for character class [%.10s...] 2", p
);
322 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+2, 100, (char **) &bp
, 0))
323 FATAL("out of space for character class [%.10s...] 3", p
);
328 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op
, buf
) );
330 return (char *) tostring((char *) buf
);
333 void overflo(const char *s
)
335 FATAL("regular expression too big: %.30s...", s
);
338 void cfoll(fa
*f
, Node
*v
) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
345 f
->re
[info(v
)].ltype
= type(v
);
346 f
->re
[info(v
)].lval
.np
= right(v
);
347 while (f
->accept
>= maxsetvec
) { /* guessing here! */
349 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
350 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
351 if (setvec
== 0 || tmpset
== 0)
352 overflo("out of space in cfoll()");
354 for (i
= 0; i
<= f
->accept
; i
++)
357 follow(v
); /* computes setvec and setcnt */
358 if ((p
= (int *) calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
359 overflo("out of space building follow set");
360 f
->re
[info(v
)].lfollow
= p
;
362 for (i
= f
->accept
; i
>= 0; i
--)
374 default: /* can't happen */
375 FATAL("can't happen: unknown type %d in cfoll", type(v
));
379 int first(Node
*p
) /* collects initially active leaves of p into setvec */
380 /* returns 1 if p matches empty string */
386 lp
= info(p
); /* look for high-water mark of subscripts */
387 while (setcnt
>= maxsetvec
|| lp
>= maxsetvec
) { /* guessing here! */
389 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
390 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
391 if (setvec
== 0 || tmpset
== 0)
392 overflo("out of space in first()");
394 if (setvec
[lp
] != 1) {
398 if (type(p
) == CCL
&& (*(char *) right(p
)) == '\0')
399 return(0); /* empty CCL */
402 if (first(left(p
)) == 0) return(0);
409 if (first(left(p
)) == 0 && first(right(p
)) == 0) return(0);
413 if (first(left(p
)) == 0 || b
== 0) return(0);
416 FATAL("can't happen: unknown type %d in first", type(p
)); /* can't happen */
420 void follow(Node
*v
) /* collects leaves that can follow v into setvec */
424 if (type(v
) == FINAL
)
440 if (v
== left(p
)) { /* v is left child of p */
441 if (first(right(p
)) == 0) {
445 } else /* v is right child */
451 int member(int c
, const char *sarg
) /* is c in s? */
453 uschar
*s
= (uschar
*) sarg
;
461 int match(fa
*f
, const char *p0
) /* shortest match ? */
464 uschar
*p
= (uschar
*) p0
;
466 s
= f
->reset
? makeinit(f
,0) : f
->initstat
;
471 if ((ns
= f
->gototab
[s
][*p
]) != 0)
481 int pmatch(fa
*f
, const char *p0
) /* longest match, for sub */
484 uschar
*p
= (uschar
*) p0
;
488 /* s = f->reset ? makeinit(f,1) : f->initstat; */
490 f
->initstat
= s
= makeinit(f
,1);
499 if (f
->out
[s
]) /* final state */
502 if ((ns
= f
->gototab
[s
][*q
]) != 0)
506 if (s
== 1) { /* no transition */
512 goto nextin
; /* no match */
516 patlen
= q
-p
-1; /* don't count $ */
524 for (i
= 2; i
<= f
->curstat
; i
++)
527 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
528 overflo("out of space in pmatch");
529 for (i
= 0; i
<= k
; i
++)
530 (f
->posns
[2])[i
] = (f
->posns
[0])[i
];
531 f
->initstat
= f
->curstat
= 2;
532 f
->out
[2] = f
->out
[0];
533 for (i
= 0; i
< NCHARS
; i
++)
534 f
->gototab
[2][i
] = 0;
540 int nematch(fa
*f
, const char *p0
) /* non-empty match, for sub */
543 uschar
*p
= (uschar
*) p0
;
547 /* s = f->reset ? makeinit(f,1) : f->initstat; */
549 f
->initstat
= s
= makeinit(f
,1);
557 if (f
->out
[s
]) /* final state */
560 if ((ns
= f
->gototab
[s
][*q
]) != 0)
564 if (s
== 1) { /* no transition */
569 goto nnextin
; /* no nonempty match */
573 patlen
= q
-p
-1; /* don't count $ */
581 for (i
= 2; i
<= f
->curstat
; i
++)
584 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
585 overflo("out of state space");
586 for (i
= 0; i
<= k
; i
++)
587 (f
->posns
[2])[i
] = (f
->posns
[0])[i
];
588 f
->initstat
= f
->curstat
= 2;
589 f
->out
[2] = f
->out
[0];
590 for (i
= 0; i
< NCHARS
; i
++)
591 f
->gototab
[2][i
] = 0;
598 Node
*reparse(const char *p
) /* parses regular expression pointed to by p */
599 { /* uses relex() to scan regular expression */
602 dprintf( ("reparse <%s>\n", p
) );
603 lastre
= prestr
= (uschar
*) p
; /* prestr points to string to be parsed */
605 /* GNU compatibility: an empty regexp matches anything */
607 /* FATAL("empty regular expression"); previous */
608 return(op2(ALL
, NIL
, NIL
));
611 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
615 Node
*regexp(void) /* top-level parse of reg expr */
617 return (alt(concat(primary())));
626 np
= op2(CHAR
, NIL
, itonp(rlxval
));
631 return (unary(op2(ALL
, NIL
, NIL
)));
634 return (unary(op2(DOT
, NIL
, NIL
)));
636 np
= op2(CCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
640 np
= op2(NCCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
645 return (unary(op2(CHAR
, NIL
, itonp(HAT
))));
648 return (unary(op2(CHAR
, NIL
, NIL
)));
651 if (rtok
== ')') { /* special pleading for () */
653 return unary(op2(CCL
, NIL
, (Node
*) tostring("")));
661 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
663 FATAL("illegal primary in regular expression %s at %s", lastre
, prestr
);
665 return 0; /*NOTREACHED*/
668 Node
*concat(Node
*np
)
671 case CHAR
: case DOT
: case ALL
: case CCL
: case NCCL
: case '$': case '(':
672 return (concat(op2(CAT
, np
, primary())));
681 return (alt(op2(OR
, np
, concat(primary()))));
686 Node
*unary(Node
*np
)
691 return (unary(op2(STAR
, np
, NIL
)));
694 return (unary(op2(PLUS
, np
, NIL
)));
697 return (unary(op2(QUEST
, np
, NIL
)));
704 * Character class definitions conformant to the POSIX locale as
705 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
706 * and operating character sets are both ASCII (ISO646) or supersets
709 * Note that to avoid overflowing the temporary buffer used in
710 * relex(), the expanded character class (prior to range expansion)
711 * must be less than twice the size of their full name.
714 /* Because isblank doesn't show up in any of the header files on any
715 * system i use, it's defined here. if some other locale has a richer
716 * definition of "blank", define HAS_ISBLANK and provide your own
718 * the parentheses here are an attempt to find a path through the maze
719 * of macro definition and/or function and/or version provided. thanks
720 * to nelson beebe for the suggestion; let's see if it works everywhere.
727 return c
==' ' || c
=='\t';
737 { "alnum", 5, isalnum
},
738 { "alpha", 5, isalpha
},
739 { "blank", 5, isblank
},
740 { "cntrl", 5, iscntrl
},
741 { "digit", 5, isdigit
},
742 { "graph", 5, isgraph
},
743 { "lower", 5, islower
},
744 { "print", 5, isprint
},
745 { "punct", 5, ispunct
},
746 { "space", 5, isspace
},
747 { "upper", 5, isupper
},
748 { "xdigit", 6, isxdigit
},
753 int relex(void) /* lexical analyzer for reparse */
757 static uschar
*buf
= 0;
758 static int bufsz
= 100;
760 struct charclass
*cc
;
763 switch (c
= *prestr
++) {
765 case '*': return STAR
;
766 case '+': return PLUS
;
767 case '?': return QUEST
;
768 case '.': return DOT
;
769 case '\0': prestr
--; return '\0';
776 rlxval
= quoted((char **) &prestr
);
782 if (buf
== 0 && (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
783 FATAL("out of space in reg expr %.10s..", lastre
);
785 if (*prestr
== '^') {
791 n
= 2 * strlen((const char *) prestr
)+1;
792 if (!adjbuf((char **) &buf
, &bufsz
, n
, n
, (char **) &bp
, 0))
793 FATAL("out of space for reg expr %.10s...", lastre
);
795 if ((c
= *prestr
++) == '\\') {
797 if ((c
= *prestr
++) == '\0')
798 FATAL("nonterminated character class %.20s...", lastre
);
800 /* } else if (c == '\n') { */
801 /* FATAL("newline in character class %.20s...", lastre); */
802 } else if (c
== '[' && *prestr
== ':') {
803 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
804 for (cc
= charclasses
; cc
->cc_name
; cc
++)
805 if (strncmp((const char *) prestr
+ 1, (const char *) cc
->cc_name
, cc
->cc_namelen
) == 0)
807 if (cc
->cc_name
!= NULL
&& prestr
[1 + cc
->cc_namelen
] == ':' &&
808 prestr
[2 + cc
->cc_namelen
] == ']') {
809 prestr
+= cc
->cc_namelen
+ 3;
810 for (i
= 0; i
< NCHARS
; i
++) {
811 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+1, 100, (char **) &bp
, 0))
812 FATAL("out of space for reg expr %.10s...", lastre
);
813 if (cc
->cc_func(i
)) {
820 } else if (c
== '\0') {
821 FATAL("nonterminated character class %.20s", lastre
);
822 } else if (bp
== buf
) { /* 1st char is special */
824 } else if (c
== ']') {
826 rlxstr
= (uschar
*) tostring((char *) buf
);
837 int cgoto(fa
*f
, int s
, int c
)
842 assert(c
== HAT
|| c
< NCHARS
);
843 while (f
->accept
>= maxsetvec
) { /* guessing here! */
845 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
846 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
847 if (setvec
== 0 || tmpset
== 0)
848 overflo("out of space in cgoto()");
850 for (i
= 0; i
<= f
->accept
; i
++)
853 /* compute positions of gototab[s,c] into setvec */
855 for (i
= 1; i
<= *p
; i
++) {
856 if ((k
= f
->re
[p
[i
]].ltype
) != FINAL
) {
857 if ((k
== CHAR
&& c
== ptoi(f
->re
[p
[i
]].lval
.np
))
858 || (k
== DOT
&& c
!= 0 && c
!= HAT
)
859 || (k
== ALL
&& c
!= 0)
860 || (k
== CCL
&& member(c
, (char *) f
->re
[p
[i
]].lval
.up
))
861 || (k
== NCCL
&& !member(c
, (char *) f
->re
[p
[i
]].lval
.up
) && c
!= 0 && c
!= HAT
)) {
862 q
= f
->re
[p
[i
]].lfollow
;
863 for (j
= 1; j
<= *q
; j
++) {
864 if (q
[j
] >= maxsetvec
) {
866 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
867 tmpset
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
868 if (setvec
== 0 || tmpset
== 0)
869 overflo("cgoto overflow");
871 if (setvec
[q
[j
]] == 0) {
879 /* determine if setvec is a previous state */
882 for (i
= f
->accept
; i
>= 0; i
--)
886 /* tmpset == previous state? */
887 for (i
= 1; i
<= f
->curstat
; i
++) {
889 if ((k
= tmpset
[0]) != p
[0])
891 for (j
= 1; j
<= k
; j
++)
892 if (tmpset
[j
] != p
[j
])
894 /* setvec is state i */
895 f
->gototab
[s
][c
] = i
;
900 /* add tmpset to current set of states */
901 if (f
->curstat
>= NSTATES
-1) {
904 for (i
= 2; i
< NSTATES
; i
++)
908 for (i
= 0; i
< NCHARS
; i
++)
909 f
->gototab
[f
->curstat
][i
] = 0;
910 xfree(f
->posns
[f
->curstat
]);
911 if ((p
= (int *) calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
912 overflo("out of space in cgoto");
914 f
->posns
[f
->curstat
] = p
;
915 f
->gototab
[s
][c
] = f
->curstat
;
916 for (i
= 0; i
<= setcnt
; i
++)
918 if (setvec
[f
->accept
])
919 f
->out
[f
->curstat
] = 1;
921 f
->out
[f
->curstat
] = 0;
926 void freefa(fa
*f
) /* free a finite automaton */
932 for (i
= 0; i
<= f
->curstat
; i
++)
934 for (i
= 0; i
<= f
->accept
; i
++) {
935 xfree(f
->re
[i
].lfollow
);
936 if (f
->re
[i
].ltype
== CCL
|| f
->re
[i
].ltype
== NCCL
)
937 xfree((f
->re
[i
].lval
.np
));