1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 ****************************************************************/
25 /* lasciate ogne speranza, voi ch'intrate. */
27 #if HAVE_NBTOOL_CONFIG_H
28 #include "nbtool_config.h"
41 #define HAT (NCHARS+2) /* matches ^ in regular expr */
45 #define type(v) (v)->nobj /* badly overloaded here */
46 #define info(v) (v)->ntype /* badly overloaded here */
47 #define left(v) (v)->narg[0]
48 #define right(v) (v)->narg[1]
49 #define parent(v) (v)->nnext
51 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
52 #define ELEAF case EMPTYRE: /* empty string in regexp */
53 #define UNARY case STAR: case PLUS: case QUEST:
55 /* encoding in tree Nodes:
56 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
57 left is index, right contains value or pointer to value
58 unary (STAR, PLUS, QUEST): left is child, right is null
59 binary (CAT, OR): left and right are children
60 parent contains pointer to parent
68 int rtok
; /* next token in current re */
70 static const uschar
*rlxstr
;
71 static const uschar
*prestr
; /* current position in current re */
72 static const uschar
*lastre
; /* origin of last re */
80 #define NFA 128 /* cache this many dynamic fa's */
82 int nfatab
= 0; /* entries in fatab */
85 resizesetvec(const char *msg
)
91 setvec
= realloc(setvec
, maxsetvec
* sizeof(*setvec
));
92 tmpset
= realloc(tmpset
, maxsetvec
* sizeof(*tmpset
));
93 if (setvec
== 0 || tmpset
== 0)
98 resize_state(fa
*f
, int state
)
103 if (++state
< f
->state_count
)
106 new_count
= state
+ 10; /* needs to be tuned */
108 p
= realloc(f
->gototab
, new_count
* sizeof(f
->gototab
[0]));
113 p
= realloc(f
->out
, new_count
* sizeof(f
->out
[0]));
118 p
= realloc(f
->posns
, new_count
* sizeof(f
->posns
[0]));
123 for (i
= f
->state_count
; i
< new_count
; ++i
) {
124 f
->gototab
[i
] = calloc(1, NCHARS
* sizeof (**f
->gototab
));
125 if (f
->gototab
[i
] == NULL
)
130 f
->state_count
= new_count
;
133 overflo("out of memory in resize_state");
136 fa
*makedfa(const char *s
, int anchor
) /* returns dfa for reg expr s */
142 if (setvec
== 0) /* first time through any RE */
143 resizesetvec("out of space initializing makedfa");
145 if (compile_time
) /* a constant for sure */
146 return mkdfa(s
, anchor
);
147 for (i
= 0; i
< nfatab
; i
++) /* is it there already? */
148 if (fatab
[i
]->anchor
== anchor
149 && strcmp((const char *) fatab
[i
]->restr
, s
) == 0) {
150 fatab
[i
]->use
= now
++;
153 pfa
= mkdfa(s
, anchor
);
154 if (nfatab
< NFA
) { /* room for another */
156 fatab
[nfatab
]->use
= now
++;
160 use
= fatab
[0]->use
; /* replace least-recently used */
162 for (i
= 1; i
< nfatab
; i
++)
163 if (fatab
[i
]->use
< use
) {
173 fa
*mkdfa(const char *s
, int anchor
) /* does the real work of making a dfa */
174 /* anchor = 1 for anchored matches, else 0 */
180 p1
= op2(CAT
, op2(STAR
, op2(ALL
, NIL
, NIL
), NIL
), p
);
181 /* put ALL STAR in front of reg. exp. */
182 p1
= op2(CAT
, p1
, op2(FINAL
, NIL
, NIL
));
183 /* put FINAL after reg. exp. */
186 penter(p1
); /* enter parent pointers and leaf indices */
187 if ((f
= calloc(1, sizeof(*f
) + poscnt
*sizeof(rrow
))) == NULL
)
188 overflo("out of space for fa");
189 f
->accept
= poscnt
-1; /* penter has computed number of positions in re */
190 cfoll(f
, p1
); /* set up follow sets */
193 if ((f
->posns
[0] = calloc(1, *(f
->re
[0].lfollow
)*sizeof(int))) == NULL
)
194 overflo("out of space in makedfa");
195 if ((f
->posns
[1] = calloc(1, sizeof(int))) == NULL
)
196 overflo("out of space in makedfa");
198 f
->initstat
= makeinit(f
, anchor
);
200 f
->restr
= (uschar
*) tostring(s
);
204 int makeinit(fa
*f
, int anchor
)
211 k
= *(f
->re
[0].lfollow
);
213 if ((f
->posns
[2] = calloc(1, (k
+1)*sizeof(int))) == NULL
)
214 overflo("out of space in makeinit");
215 for (i
=0; i
<= k
; i
++) {
216 (f
->posns
[2])[i
] = (f
->re
[0].lfollow
)[i
];
218 if ((f
->posns
[2])[1] == f
->accept
)
220 for (i
=0; i
< NCHARS
; i
++)
221 f
->gototab
[2][i
] = 0;
222 f
->curstat
= cgoto(f
, 2, HAT
);
224 *f
->posns
[2] = k
-1; /* leave out position 0 */
225 for (i
=0; i
< k
; i
++) {
226 (f
->posns
[0])[i
] = (f
->posns
[2])[i
];
229 f
->out
[0] = f
->out
[2];
230 if (f
->curstat
!= 2) {
231 resize_state(f
, f
->curstat
);
232 --(*f
->posns
[f
->curstat
]);
238 void penter(Node
*p
) /* set up parent pointers and leaf indices */
255 parent(right(p
)) = p
;
257 default: /* can't happen */
258 FATAL("can't happen: unknown type %d in penter", type(p
));
263 void freetr(Node
*p
) /* free parse tree */
280 default: /* can't happen */
281 FATAL("can't happen: unknown type %d in freetr", type(p
));
286 /* in the parsing of regular expressions, metacharacters like . have */
287 /* to be seen literally; \056 is not a metacharacter. */
289 int hexstr(const uschar
**pp
) /* find and eval hex string at pp, return new p */
290 { /* only pick up one 8-bit byte (2 chars) */
295 for (i
= 0, p
= *pp
; i
< 2 && isxdigit(*p
); i
++, p
++) {
297 n
= 16 * n
+ *p
- '0';
298 else if (*p
>= 'a' && *p
<= 'f')
299 n
= 16 * n
+ *p
- 'a' + 10;
300 else if (*p
>= 'A' && *p
<= 'F')
301 n
= 16 * n
+ *p
- 'A' + 10;
307 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
309 int quoted(const uschar
**pp
) /* pick up next thing after a \\ */
310 /* and increment *pp */
312 const uschar
*p
= *pp
;
315 if ((c
= *p
++) == 't')
327 else if (c
== 'x') { /* hexadecimal goo follows */
328 c
= hexstr(&p
); /* this adds a null if number is invalid */
329 } else if (isoctdigit(c
)) { /* \d \dd \ddd */
331 if (isoctdigit(*p
)) {
332 n
= 8 * n
+ *p
++ - '0';
334 n
= 8 * n
+ *p
++ - '0';
343 char *cclenter(const char *argp
) /* add a character class */
346 const uschar
*p
= (const uschar
*) argp
;
349 static uschar
*buf
= 0;
350 static int bufsz
= 100;
353 if (buf
== 0 && (buf
= malloc(bufsz
)) == NULL
)
354 FATAL("out of space for character class [%.10s...] 1", p
);
356 for (i
= 0; (c
= *p
++) != 0; ) {
359 } else if (c
== '-' && i
> 0 && bp
[-1] != 0) {
365 if (c
> c2
) { /* empty; ignore */
371 if (!adjbuf(&buf
, &bufsz
, bp
-buf
+2, 100, &bp
, "cclenter1"))
372 FATAL("out of space for character class [%.10s...] 2", p
);
379 if (!adjbuf(&buf
, &bufsz
, bp
-buf
+2, 100, &bp
, "cclenter2"))
380 FATAL("out of space for character class [%.10s...] 3", p
);
385 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op
, buf
) );
387 return (char *) tostring((char *) buf
);
390 void overflo(const char *s
)
392 FATAL("regular expression too big: %.30s...", s
);
395 void cfoll(fa
*f
, Node
*v
) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
403 f
->re
[info(v
)].ltype
= type(v
);
404 f
->re
[info(v
)].lval
.np
= right(v
);
405 while (f
->accept
>= maxsetvec
) /* guessing here! */
406 resizesetvec("out of space in cfoll()");
407 for (i
= 0; i
<= f
->accept
; i
++)
410 follow(v
); /* computes setvec and setcnt */
411 if ((p
= calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
412 overflo("out of space building follow set");
413 f
->re
[info(v
)].lfollow
= p
;
415 for (i
= f
->accept
; i
>= 0; i
--)
427 default: /* can't happen */
428 FATAL("can't happen: unknown type %d in cfoll", type(v
));
432 int first(Node
*p
) /* collects initially active leaves of p into setvec */
433 /* returns 0 if p matches empty string */
440 lp
= info(p
); /* look for high-water mark of subscripts */
441 while (setcnt
>= maxsetvec
|| lp
>= maxsetvec
) /* guessing here! */
442 resizesetvec("out of space in first()");
443 if (type(p
) == EMPTYRE
) {
447 if (setvec
[lp
] != 1) {
451 if (type(p
) == CCL
&& (*(char *) right(p
)) == '\0')
452 return(0); /* empty CCL */
455 if (first(left(p
)) == 0) return(0);
462 if (first(left(p
)) == 0 && first(right(p
)) == 0) return(0);
466 if (first(left(p
)) == 0 || b
== 0) return(0);
469 FATAL("can't happen: unknown type %d in first", type(p
)); /* can't happen */
473 void follow(Node
*v
) /* collects leaves that can follow v into setvec */
477 if (type(v
) == FINAL
)
493 if (v
== left(p
)) { /* v is left child of p */
494 if (first(right(p
)) == 0) {
498 } else /* v is right child */
504 int member(int c
, const char *sarg
) /* is c in s? */
506 const uschar
*s
= (const uschar
*) sarg
;
514 int match(fa
*f
, const char *p0
) /* shortest match ? */
517 const uschar
*p
= (const uschar
*) p0
;
520 assert (s
< f
->state_count
);
525 /* assert(*p < NCHARS); */
526 if ((ns
= f
->gototab
[s
][*p
]) != 0)
531 assert (s
< f
->state_count
);
539 int pmatch(fa
*f
, const char *p0
) /* longest match, for sub */
542 uschar
*p
= __UNCONST(p0
);
546 assert(s
< f
->state_count
);
552 if (f
->out
[s
]) /* final state */
554 /* assert(*q < NCHARS); */
555 if ((ns
= f
->gototab
[s
][*q
]) != 0)
560 assert(s
< f
->state_count
);
562 if (s
== 1) { /* no transition */
568 goto nextin
; /* no match */
572 patlen
= q
-p
-1; /* don't count $ */
583 int nematch(fa
*f
, const char *p0
) /* non-empty match, for sub */
586 uschar
*p
= __UNCONST(p0
);
590 assert(s
< f
->state_count
);
596 if (f
->out
[s
]) /* final state */
598 /* assert(*q < NCHARS); */
599 if ((ns
= f
->gototab
[s
][*q
]) != 0)
604 assert(s
< f
->state_count
);
606 if (s
== 1) { /* no transition */
611 goto nnextin
; /* no nonempty match */
615 patlen
= q
-p
-1; /* don't count $ */
633 * A stream-fed version of nematch which transfers characters to a
634 * null-terminated buffer. All characters up to and including the last
635 * character of the matching text or EOF are placed in the buffer. If
636 * a match is found, patbeg and patlen are set appropriately.
643 int fnematch(fa
*pfa
, FILE *f
, uschar
**pbuf
, int *pbufsize
, int quantum
)
646 int bufsize
= *pbufsize
;
647 int c
, i
, j
, k
, ns
, s
;
650 assert(s
< pfa
->state_count
);
654 * All indices relative to buf.
655 * i <= j <= k <= bufsize
657 * i: origin of active substring
658 * j: current character
659 * k: destination of next getc()
667 if (!adjbuf(&buf
, &bufsize
, bufsize
+1, quantum
, 0, "fnematch"))
668 FATAL("stream '%.30s...' too long", buf
);
669 buf
[k
++] = (c
= getc(f
)) != EOF
? c
: 0;
672 /* assert(c < NCHARS); */
674 if ((ns
= pfa
->gototab
[s
][c
]) != 0)
677 s
= cgoto(pfa
, s
, c
);
678 assert(s
< pfa
->state_count
);
680 if (pfa
->out
[s
]) { /* final state */
682 if (c
== 0) /* don't count $ */
685 } while (buf
[j
] && s
!= 1);
687 } while (buf
[i
] && !patlen
);
689 /* adjbuf() may have relocated a resized buffer. Inform the world. */
696 * Under no circumstances is the last character fed to
697 * the automaton part of the match. It is EOF's nullbyte,
698 * or it sent the automaton into a state with no further
699 * transitions available (s==1), or both. Room for a
700 * terminating nullbyte is guaranteed.
702 * ungetc any chars after the end of matching text
703 * (except for EOF's nullbyte, if present) and null
704 * terminate the buffer.
707 if (buf
[--k
] && ungetc(buf
[k
], f
) == EOF
)
708 FATAL("unable to ungetc '%c'", buf
[k
]);
709 while (k
> i
+ patlen
);
717 Node
*reparse(const char *p
) /* parses regular expression pointed to by p */
718 { /* uses relex() to scan regular expression */
721 dprintf( ("reparse <%s>\n", p
) );
722 lastre
= prestr
= (const uschar
*) p
; /* prestr points to string to be parsed */
724 /* GNU compatibility: an empty regexp matches anything */
726 /* FATAL("empty regular expression"); previous */
727 return(op2(EMPTYRE
, NIL
, NIL
));
731 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
735 Node
*regexp(void) /* top-level parse of reg expr */
737 return (alt(concat(primary())));
746 np
= op2(CHAR
, NIL
, itonp(rlxval
));
751 return (unary(op2(ALL
, NIL
, NIL
)));
754 return (unary(op2(ALL
, NIL
, NIL
)));
757 return (unary(op2(DOT
, NIL
, NIL
)));
759 np
= op2(CCL
, NIL
, (Node
*) cclenter((const char *) rlxstr
));
763 np
= op2(NCCL
, NIL
, (Node
*) cclenter((const char *) rlxstr
));
768 return (unary(op2(CHAR
, NIL
, itonp(HAT
))));
771 return (unary(op2(CHAR
, NIL
, NIL
)));
774 if (rtok
== ')') { /* special pleading for () */
776 return unary(op2(CCL
, NIL
, (Node
*) tostring("")));
784 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
786 FATAL("illegal primary in regular expression %s at %s", lastre
, prestr
);
788 return 0; /*NOTREACHED*/
791 Node
*concat(Node
*np
)
794 case CHAR
: case DOT
: case ALL
: case EMPTYRE
: case CCL
: case NCCL
: case '$': case '(':
795 return (concat(op2(CAT
, np
, primary())));
804 return (alt(op2(OR
, np
, concat(primary()))));
809 Node
*unary(Node
*np
)
814 return (unary(op2(STAR
, np
, NIL
)));
817 return (unary(op2(PLUS
, np
, NIL
)));
820 return (unary(op2(QUEST
, np
, NIL
)));
827 * Character class definitions conformant to the POSIX locale as
828 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
829 * and operating character sets are both ASCII (ISO646) or supersets
832 * Note that to avoid overflowing the temporary buffer used in
833 * relex(), the expanded character class (prior to range expansion)
834 * must be less than twice the size of their full name.
837 /* Because isblank doesn't show up in any of the header files on any
838 * system i use, it's defined here. if some other locale has a richer
839 * definition of "blank", define HAS_ISBLANK and provide your own
841 * the parentheses here are an attempt to find a path through the maze
842 * of macro definition and/or function and/or version provided. thanks
843 * to nelson beebe for the suggestion; let's see if it works everywhere.
846 /* #define HAS_ISBLANK */
848 static const struct charclass
{
853 { "alnum", 5, isalnum
},
854 { "alpha", 5, isalpha
},
856 { "blank", 5, isspace
}, /* was isblank */
858 { "blank", 5, isblank
},
860 { "cntrl", 5, iscntrl
},
861 { "digit", 5, isdigit
},
862 { "graph", 5, isgraph
},
863 { "lower", 5, islower
},
864 { "print", 5, isprint
},
865 { "punct", 5, ispunct
},
866 { "space", 5, isspace
},
867 { "upper", 5, isupper
},
868 { "xdigit", 6, isxdigit
},
873 int relex(void) /* lexical analyzer for reparse */
877 static uschar
*buf
= 0;
878 static int bufsz
= 100;
880 const struct charclass
*cc
;
883 switch (c
= *prestr
++) {
885 case '*': return STAR
;
886 case '+': return PLUS
;
887 case '?': return QUEST
;
888 case '.': return DOT
;
889 case '\0': prestr
--; return '\0';
896 rlxval
= quoted(&prestr
);
902 if (buf
== 0 && (buf
= malloc(bufsz
)) == NULL
)
903 FATAL("out of space in reg expr %.10s..", lastre
);
905 if (*prestr
== '^') {
911 n
= 2 * strlen((const char *) prestr
)+1;
912 if (!adjbuf(&buf
, &bufsz
, n
, n
, &bp
, "relex1"))
913 FATAL("out of space for reg expr %.10s...", lastre
);
915 if ((c
= *prestr
++) == '\\') {
917 if ((c
= *prestr
++) == '\0')
918 FATAL("nonterminated character class %.20s...", lastre
);
920 /* } else if (c == '\n') { */
921 /* FATAL("newline in character class %.20s...", lastre); */
922 } else if (c
== '[' && *prestr
== ':') {
923 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
924 for (cc
= charclasses
; cc
->cc_name
; cc
++)
925 if (strncmp((const char *) prestr
+ 1, (const char *) cc
->cc_name
, cc
->cc_namelen
) == 0)
927 if (cc
->cc_name
!= NULL
&& prestr
[1 + cc
->cc_namelen
] == ':' &&
928 prestr
[2 + cc
->cc_namelen
] == ']') {
929 prestr
+= cc
->cc_namelen
+ 3;
930 for (i
= 1; i
< NCHARS
; i
++) {
931 if (!adjbuf(&buf
, &bufsz
, bp
-buf
+1, 100, &bp
, "relex2"))
932 FATAL("out of space for reg expr %.10s...", lastre
);
933 if (cc
->cc_func(i
)) {
940 } else if (c
== '\0') {
941 FATAL("nonterminated character class %.20s", lastre
);
942 } else if (bp
== buf
) { /* 1st char is special */
944 } else if (c
== ']') {
946 rlxstr
= (uschar
*) tostring((char *) buf
);
957 int cgoto(fa
*f
, int s
, int c
)
962 assert(c
== HAT
|| c
< NCHARS
);
963 while (f
->accept
>= maxsetvec
) /* guessing here! */
964 resizesetvec("out of space in cgoto()");
965 for (i
= 0; i
<= f
->accept
; i
++)
969 /* compute positions of gototab[s,c] into setvec */
971 for (i
= 1; i
<= *p
; i
++) {
972 if ((k
= f
->re
[p
[i
]].ltype
) != FINAL
) {
973 if ((k
== CHAR
&& c
== ptoi(f
->re
[p
[i
]].lval
.np
))
974 || (k
== DOT
&& c
!= 0 && c
!= HAT
)
975 || (k
== ALL
&& c
!= 0)
976 || (k
== EMPTYRE
&& c
!= 0)
977 || (k
== CCL
&& member(c
, (char *) f
->re
[p
[i
]].lval
.up
))
978 || (k
== NCCL
&& !member(c
, (char *) f
->re
[p
[i
]].lval
.up
) && c
!= 0 && c
!= HAT
)) {
979 q
= f
->re
[p
[i
]].lfollow
;
980 for (j
= 1; j
<= *q
; j
++) {
981 if (q
[j
] >= maxsetvec
)
982 resizesetvec("cgoto overflow");
983 if (setvec
[q
[j
]] == 0) {
991 /* determine if setvec is a previous state */
994 for (i
= f
->accept
; i
>= 0; i
--)
999 resize_state(f
, f
->curstat
> s
? f
->curstat
: s
);
1000 /* tmpset == previous state? */
1001 for (i
= 1; i
<= f
->curstat
; i
++) {
1003 if ((k
= tmpset
[0]) != p
[0])
1005 for (j
= 1; j
<= k
; j
++)
1006 if (tmpset
[j
] != p
[j
])
1008 /* setvec is state i */
1010 f
->gototab
[s
][c
] = i
;
1015 /* add tmpset to current set of states */
1017 resize_state(f
, f
->curstat
);
1018 for (i
= 0; i
< NCHARS
; i
++)
1019 f
->gototab
[f
->curstat
][i
] = 0;
1020 xfree(f
->posns
[f
->curstat
]);
1021 if ((p
= calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
1022 overflo("out of space in cgoto");
1024 f
->posns
[f
->curstat
] = p
;
1026 f
->gototab
[s
][c
] = f
->curstat
;
1027 for (i
= 0; i
<= setcnt
; i
++)
1029 if (setvec
[f
->accept
])
1030 f
->out
[f
->curstat
] = 1;
1032 f
->out
[f
->curstat
] = 0;
1037 void freefa(fa
*f
) /* free a finite automaton */
1043 for (i
= 0; i
< f
->state_count
; i
++) {
1044 xfree(f
->gototab
[i
])
1047 for (i
= 0; i
<= f
->accept
; i
++) {
1048 xfree(f
->re
[i
].lfollow
);
1049 if (f
->re
[i
].ltype
== CCL
|| f
->re
[i
].ltype
== NCCL
)
1050 xfree((f
->re
[i
].lval
.np
));