7 /* dictionary manager interface to REGEXP regular expression library
9 /* #include <dict_regexp.h>
11 /* DICT *dict_regexp_open(name, dummy, dict_flags)
16 /* dict_regexp_open() opens the named file and compiles the contained
17 /* regular expressions. The result object can be used to match strings
20 /* dict(3) generic dictionary manager
21 /* regexp_table(5) format of Postfix regular expression tables
26 /* Based on PCRE dictionary contributed by Andrew McNamara
27 /* andrewm@connect.com.au
28 /* connect.com.au Pty. Ltd.
29 /* Level 3, 213 Miller St
30 /* North Sydney, NSW, Australia
32 /* Heavily rewritten by Wietse Venema
33 /* IBM T.J. Watson Research
35 /* Yorktown Heights, NY 10598, USA
42 #ifdef HAS_POSIX_REGEXP
49 #ifdef STRCASECMP_IN_STRINGS_H
53 /* Utility library. */
60 #include "stringops.h"
61 #include "readlline.h"
63 #include "dict_regexp.h"
64 #include "mac_parse.h"
67 * Support for IF/ENDIF based on an idea by Bert Driehuis.
/*
 * Operation codes kept in the op member of a rule. The lookup and close
 * routines in this file select their per-rule action on these values.
 */
69 #define DICT_REGEXP_OP_MATCH 1 /* Match this regexp */
70 #define DICT_REGEXP_OP_IF 2 /* Increase if/endif nesting on match */
71 #define DICT_REGEXP_OP_ENDIF 3 /* Decrease if/endif nesting on match */
/*
 * DICT_REGEXP_PATTERN - one pattern as parsed from a table line, before
 * it is compiled. dict_regexp_get_pat() sets the match polarity and the
 * option bits; dict_regexp_compile_pat() passes regexp and options to
 * regcomp().
 * NOTE(review): the embedded numbering jumps from 74 to 77, so the line
 * that opens this typedef is not visible in this listing.
 */
74 * Regular expression before compiling.
77 char *regexp
; /* regular expression */
78 int options
; /* regcomp() options */
79 int match
; /* positive or negative match */
80 } DICT_REGEXP_PATTERN
;
/*
 * DICT_REGEXP_RULE - members shared by every compiled rule. The MATCH and
 * IF rule structures below start with a member of this type, and the code
 * casts between a DICT_REGEXP_RULE pointer and the specific rule pointer
 * after inspecting op. Rules are chained through next and visited in list
 * order.
 * NOTE(review): the line that closes this typedef is not visible in this
 * listing.
 */
83 * Compiled generic rule, and subclasses that derive from it.
85 typedef struct DICT_REGEXP_RULE
{
86 int op
; /* DICT_REGEXP_OP_MATCH/IF/ENDIF */
87 int nesting
; /* Level of search nesting */
88 int lineno
; /* source file line number */
89 struct DICT_REGEXP_RULE
*next
; /* next rule in dict */
/*
 * DICT_REGEXP_MATCH_RULE - a pattern/replacement rule. first_exp is always
 * tried; second_exp is optional and is only tried when it is non-null.
 * max_sub is the highest $number found in the replacement text by the
 * compile-time pre-scan; when it is 0 the lookup code returns replacement
 * without any substitution pass.
 * NOTE(review): the line that opens this typedef is not visible in this
 * listing.
 */
93 DICT_REGEXP_RULE rule
; /* generic part */
94 regex_t
*first_exp
; /* compiled primary pattern */
95 int first_match
; /* positive or negative match */
96 regex_t
*second_exp
; /* compiled secondary pattern */
97 int second_match
; /* positive or negative match */
98 char *replacement
; /* replacement text */
99 size_t max_sub
; /* largest $number in replacement */
100 } DICT_REGEXP_MATCH_RULE
;
/*
 * DICT_REGEXP_IF_RULE - the condition of an IF statement: one compiled
 * pattern plus its match polarity. There is no replacement text.
 * NOTE(review): the line that opens this typedef is not visible in this
 * listing.
 */
103 DICT_REGEXP_RULE rule
; /* generic members */
104 regex_t
*expr
; /* the condition */
105 int match
; /* positive or negative match */
106 } DICT_REGEXP_IF_RULE
;
/*
 * DICT_REGEXP - the regexp dictionary handle. It starts with the generic
 * DICT member, and the lookup and close routines cast their DICT pointer
 * argument to this type. pmatch is shared by all rules and is sized by
 * dict_regexp_open() for the largest max_sub in the table plus one.
 * expansion_buf is created on first use by the lookup routine and holds
 * the result of a $number substitution.
 * NOTE(review): the lines that open and close this typedef are not visible
 * in this listing.
 */
112 DICT dict
; /* generic members */
113 regmatch_t
*pmatch
; /* matched substring info */
114 DICT_REGEXP_RULE
*head
; /* first rule */
115 VSTRING
*expansion_buf
; /* lookup result */
/*
 * Placeholder arguments for DICT_REGEXP_REGEXEC when no substring
 * information is requested; the lookup routine uses them when it
 * evaluates an IF condition.
 */
119 * Macros to make dense code more readable.
121 #define NULL_SUBSTITUTIONS (0)
122 #define NULL_MATCH_RESULT ((regmatch_t *) 0)
/*
 * DICT_REGEXP_EXPAND_CONTEXT - state that the lookup routine fills in and
 * passes, cast to char pointer, through mac_parse() to
 * dict_regexp_expand().
 * NOTE(review): the line that opens this typedef is not visible in this
 * listing.
 */
125 * Context for $number expansion callback.
128 DICT_REGEXP
*dict_regexp
; /* the dictionary handle */
129 DICT_REGEXP_MATCH_RULE
*match_rule
; /* the rule we matched */
130 const char *lookup_string
; /* matched text */
131 } DICT_REGEXP_EXPAND_CONTEXT
;
/*
 * DICT_REGEXP_PRESCAN_CONTEXT - state that the rule parser initializes
 * (max_sub 0, literal null) and passes, cast to char pointer, through
 * mac_parse() to dict_regexp_prescan(). mapname and lineno are used only
 * in diagnostics. After the scan the parser reads back max_sub and, when
 * it is set, takes over the literal string as the rule replacement.
 * NOTE(review): the line that opens this typedef is not visible in this
 * listing.
 */
134 * Context for $number pre-scan callback.
137 const char *mapname
; /* name of regexp map */
138 int lineno
; /* where in file */
139 size_t max_sub
; /* largest $number seen */
140 char *literal
; /* constant result, $$ -> $ */
141 } DICT_REGEXP_PRESCAN_CONTEXT
;
/*
 * Success result returned by the two mac_parse() callbacks in this file.
 * NOTE(review): any conditional guard around this definition is not
 * visible in this listing.
 */
147 #define MAC_PARSE_OK 0
/*
 * mac_parse() callback used by the lookup routine. type says what kind of
 * fragment buf holds; ptr is the DICT_REGEXP_EXPAND_CONTEXT supplied by the
 * caller. For a MAC_PARSE_VARNAME fragment, buf is read as a decimal index
 * n and the text of matched substring n is appended to the dictionary
 * expansion buffer. Any other fragment is appended unchanged.
 * Returns MAC_PARSE_OK, or MAC_PARSE_UNDEF when substring n is empty or
 * did not take part in the match. Panics when n is outside 1..max_sub.
 * NOTE(review): n and pmatch are used below, but their declarations, the
 * last msg_panic argument and the braces are not visible in this listing
 * (embedded numbers 153, 157-158, 170 and 178-187 are partly absent).
 */
150 /* dict_regexp_expand - replace $number with substring from matched text */
152 static int dict_regexp_expand(int type
, VSTRING
*buf
, char *ptr
)
/* Recover the typed context from the opaque callback argument. */
154 DICT_REGEXP_EXPAND_CONTEXT
*ctxt
= (DICT_REGEXP_EXPAND_CONTEXT
*) ptr
;
155 DICT_REGEXP_MATCH_RULE
*match_rule
= ctxt
->match_rule
;
156 DICT_REGEXP
*dict_regexp
= ctxt
->dict_regexp
;
161 * Replace $number by the corresponding substring from the matched text.
162 * We pre-scanned the replacement text at compile time, so any out of
163 * range $number means that something impossible has happened.
165 if (type
== MAC_PARSE_VARNAME
) {
166 n
= atoi(vstring_str(buf
));
167 if (n
< 1 || n
> match_rule
->max_sub
)
168 msg_panic("regexp map %s, line %d: out of range replacement index \"%s\"",
169 dict_regexp
->dict
.name
, match_rule
->rule
.lineno
,
/* Entry n of the shared pmatch array describes substring number n. */
171 pmatch
= dict_regexp
->pmatch
+ n
;
172 if (pmatch
->rm_so
< 0 || pmatch
->rm_so
== pmatch
->rm_eo
)
173 return (MAC_PARSE_UNDEF
); /* empty or not matched */
/* Copy rm_eo - rm_so bytes starting at offset rm_so of the lookup key. */
174 vstring_strncat(dict_regexp
->expansion_buf
,
175 ctxt
->lookup_string
+ pmatch
->rm_so
,
176 pmatch
->rm_eo
- pmatch
->rm_so
);
177 return (MAC_PARSE_OK
);
181 * Straight text - duplicate with no substitution.
184 vstring_strcat(dict_regexp
->expansion_buf
, vstring_str(buf
));
185 return (MAC_PARSE_OK
);
/*
 * Turn a regcomp()/regexec() error code into text with regerror() and log
 * it as a warning that names the map and the table line. Returns nothing.
 * NOTE(review): the remaining parameter (used below as expr) and the
 * declaration of errbuf are not visible in this listing (embedded numbers
 * 192-195 are absent).
 */
189 /* dict_regexp_regerror - report regexp compile/execute error */
191 static void dict_regexp_regerror(const char *mapname
, int lineno
, int error
,
/* The regerror() result is deliberately discarded; only errbuf is used. */
196 (void) regerror(error
, expr
, errbuf
, sizeof(errbuf
));
197 msg_warn("regexp map %s, line %d: %s", mapname
, lineno
, errbuf
);
/*
 * DICT_REGEXP_REGEXEC - run regexec() on str and fold the match polarity
 * into the result. The regexec() status is stored in err. The macro value
 * is !(match) on REG_NOMATCH and (match) on status 0. On any other status
 * the error is reported through dict_regexp_regerror() and the value is 0,
 * so a failing regexec() never counts as a hit.
 * Arguments are evaluated more than once: err up to three times, and each
 * of match and expr up to twice.
 */
201 * Inlined to reduce function call overhead in the time-critical loop.
203 #define DICT_REGEXP_REGEXEC(err, map, line, expr, match, str, nsub, pmatch) \
204 ((err) = regexec((expr), (str), (nsub), (pmatch), 0), \
205 ((err) == REG_NOMATCH ? !(match) : \
206 (err) == 0 ? (match) : \
207 (dict_regexp_regerror((map), (line), (err), (expr)), 0)))
/*
 * Lookup method installed by dict_regexp_open(). Walks the rule list in
 * order and, for the first MATCH rule whose pattern test succeeds, returns
 * either the stored replacement text (when the rule has no $number) or the
 * contents of the dictionary expansion buffer after $number substitution.
 * The returned pointer refers to storage owned by the dictionary.
 * NOTE(review): many lines of this function are not visible in this
 * listing, among them the declarations of error and nesting, the switch
 * header, the continue/break statements, the bodies of the IF and ENDIF
 * cases and the return value used when no rule matches. The comments below
 * describe only the visible statements.
 */
209 /* dict_regexp_lookup - match string and perform optional substitution */
211 static const char *dict_regexp_lookup(DICT
*dict
, const char *lookup_string
)
213 DICT_REGEXP
*dict_regexp
= (DICT_REGEXP
*) dict
;
214 DICT_REGEXP_RULE
*rule
;
215 DICT_REGEXP_IF_RULE
*if_rule
;
216 DICT_REGEXP_MATCH_RULE
*match_rule
;
217 DICT_REGEXP_EXPAND_CONTEXT expand_context
;
/* NOTE(review): any guard around this logging call is not visible here. */
224 msg_info("dict_regexp_lookup: %s: %s", dict
->name
, lookup_string
);
227 * Optionally fold the key.
/*
 * With DICT_FLAG_FOLD_MUL set, the key is copied into fold_buf (created
 * here if needed) and lowercased there; lookup_string then points at
 * that copy.
 */
229 if (dict
->flags
& DICT_FLAG_FOLD_MUL
) {
230 if (dict
->fold_buf
== 0)
231 dict
->fold_buf
= vstring_alloc(10);
232 vstring_strcpy(dict
->fold_buf
, lookup_string
);
233 lookup_string
= lowercase(vstring_str(dict
->fold_buf
));
235 for (rule
= dict_regexp
->head
; rule
; rule
= rule
->next
) {
238 * Skip rules inside failed IF/ENDIF.
/* NOTE(review): the statement controlled by this test is not visible. */
240 if (nesting
< rule
->nesting
)
246 * Search for the first matching primary expression. Limit the
247 * overhead for substring substitution to the bare minimum.
249 case DICT_REGEXP_OP_MATCH
:
250 match_rule
= (DICT_REGEXP_MATCH_RULE
*) rule
;
/*
 * Substring info is requested only when the replacement uses
 * $number: max_sub + 1 entries, otherwise none.
 * NOTE(review): the string argument (embedded number 254) and the
 * statement controlled by this test are not visible.
 */
251 if (!DICT_REGEXP_REGEXEC(error
, dict
->name
, rule
->lineno
,
252 match_rule
->first_exp
,
253 match_rule
->first_match
,
255 match_rule
->max_sub
> 0 ?
256 match_rule
->max_sub
+ 1 : 0,
257 dict_regexp
->pmatch
))
/*
 * The optional secondary pattern is tested only when one is present.
 * NOTE(review): the rest of this call and the controlled statement
 * are not visible.
 */
259 if (match_rule
->second_exp
260 && !DICT_REGEXP_REGEXEC(error
, dict
->name
, rule
->lineno
,
261 match_rule
->second_exp
,
262 match_rule
->second_match
,
269 * Skip $number substitutions when the replacement text contains
270 * no $number strings, as learned during the compile time
271 * pre-scan. The pre-scan already replaced $$ by $.
273 if (match_rule
->max_sub
== 0)
274 return (match_rule
->replacement
);
277 * Perform $number substitutions on the replacement text. We
278 * pre-scanned the replacement text at compile time. Any macro
279 * expansion errors at this point mean something impossible has
/* The expansion buffer is created on first use and reset per lookup. */
282 if (!dict_regexp
->expansion_buf
)
283 dict_regexp
->expansion_buf
= vstring_alloc(10);
284 VSTRING_RESET(dict_regexp
->expansion_buf
);
285 expand_context
.lookup_string
= lookup_string
;
286 expand_context
.match_rule
= match_rule
;
287 expand_context
.dict_regexp
= dict_regexp
;
/* dict_regexp_expand() appends each fragment to expansion_buf. */
289 if (mac_parse(match_rule
->replacement
, dict_regexp_expand
,
290 (char *) &expand_context
) & MAC_PARSE_ERROR
)
291 msg_panic("regexp map %s, line %d: bad replacement syntax",
292 dict
->name
, rule
->lineno
);
293 VSTRING_TERMINATE(dict_regexp
->expansion_buf
);
294 return (vstring_str(dict_regexp
->expansion_buf
));
/*
 * IF condition: tested without requesting substring info.
 * NOTE(review): the statement executed when the condition holds is not
 * visible in this listing.
 */
299 case DICT_REGEXP_OP_IF
:
300 if_rule
= (DICT_REGEXP_IF_RULE
*) rule
;
301 if (DICT_REGEXP_REGEXEC(error
, dict
->name
, rule
->lineno
,
302 if_rule
->expr
, if_rule
->match
, lookup_string
,
303 NULL_SUBSTITUTIONS
, NULL_MATCH_RESULT
))
308 * ENDIF after successful IF.
/* NOTE(review): the body of the ENDIF case is not visible. */
310 case DICT_REGEXP_OP_ENDIF
:
/* Any other op value is treated as an internal error. */
315 msg_panic("dict_regexp_lookup: impossible operation %d", rule
->op
);
/*
 * Close method installed by dict_regexp_open(). Releases every rule in
 * the list together with the compiled patterns and replacement text it
 * owns, then the shared pmatch array, the expansion buffer and the fold
 * buffer. Panics on a rule with an unknown op value.
 * NOTE(review): the switch header, the break statements, the assignment
 * that sets next before a rule is freed, any guard in front of the
 * fold_buf release and the final release of the dictionary itself are not
 * visible in this listing.
 */
321 /* dict_regexp_close - close regexp dictionary */
323 static void dict_regexp_close(DICT
*dict
)
325 DICT_REGEXP
*dict_regexp
= (DICT_REGEXP
*) dict
;
326 DICT_REGEXP_RULE
*rule
;
327 DICT_REGEXP_RULE
*next
;
328 DICT_REGEXP_MATCH_RULE
*match_rule
;
329 DICT_REGEXP_IF_RULE
*if_rule
;
/* The loop advances through next because rule itself is freed below. */
331 for (rule
= dict_regexp
->head
; rule
; rule
= next
) {
/*
 * MATCH rule: each compiled pattern is released with regfree() and
 * its regex_t storage with myfree(); every member is checked for
 * null first.
 */
334 case DICT_REGEXP_OP_MATCH
:
335 match_rule
= (DICT_REGEXP_MATCH_RULE
*) rule
;
336 if (match_rule
->first_exp
) {
337 regfree(match_rule
->first_exp
);
338 myfree((char *) match_rule
->first_exp
);
340 if (match_rule
->second_exp
) {
341 regfree(match_rule
->second_exp
);
342 myfree((char *) match_rule
->second_exp
);
344 if (match_rule
->replacement
)
345 myfree((char *) match_rule
->replacement
);
/*
 * IF rule: release the compiled condition.
 * NOTE(review): any null check in front of these two calls (embedded
 * number 349) is not visible.
 */
347 case DICT_REGEXP_OP_IF
:
348 if_rule
= (DICT_REGEXP_IF_RULE
*) rule
;
350 regfree(if_rule
->expr
);
351 myfree((char *) if_rule
->expr
);
/* No rule-specific members are released for ENDIF in the visible lines. */
354 case DICT_REGEXP_OP_ENDIF
:
357 msg_panic("dict_regexp_close: unknown operation %d", rule
->op
);
359 myfree((char *) rule
);
/* Dictionary-wide buffers. */
361 if (dict_regexp
->pmatch
)
362 myfree((char *) dict_regexp
->pmatch
);
363 if (dict_regexp
->expansion_buf
)
364 vstring_free(dict_regexp
->expansion_buf
);
366 vstring_free(dict
->fold_buf
);
/*
 * Parse one delimited pattern, with optional leading negation and
 * trailing option letters, from the text at *bufp into pat. Problems are
 * reported with msg_warn() and the rule is skipped. The callers in this
 * file treat a zero result as failure.
 * NOTE(review): most statements of this function are not visible in this
 * listing, including the declarations of p and re_delim, the loop bodies,
 * the option letters, the stores to pat->regexp and *bufp, and the return
 * statements. The comments below cover the visible fragments only.
 */
370 /* dict_regexp_get_pat - extract one pattern with options from rule */
372 static int dict_regexp_get_pat(const char *mapname
, int lineno
, char **bufp
,
373 DICT_REGEXP_PATTERN
*pat
)
379 * Process negation operators.
/* Each negation operator flips the match polarity. */
383 pat
->match
= !pat
->match
;
388 * Grr...aceful handling of whitespace after '!'.
390 while (*p
&& ISSPACE(*p
))
393 msg_warn("regexp map %s, line %d: no regexp: skipping this rule",
399 * Search for the closing delimiter, handling backslash escape.
409 } else if (*p
== re_delim
) {
415 msg_warn("regexp map %s, line %d: no closing regexp delimiter \"%c\": "
416 "skipping this rule", mapname
, lineno
, re_delim
);
/* Overwrite the closing delimiter in place and step past it. */
419 *p
++ = 0; /* null terminate */
422 * Search for options.
/*
 * Options start as extended syntax plus case-insensitive matching.
 * Each option character toggles one bit with exclusive-or. Scanning
 * stops at end of text, at whitespace or at a negation operator.
 */
424 pat
->options
= REG_EXTENDED
| REG_ICASE
;
425 while (*p
&& !ISSPACE(*p
) && *p
!= '!') {
428 pat
->options
^= REG_ICASE
;
431 pat
->options
^= REG_NEWLINE
;
434 pat
->options
^= REG_EXTENDED
;
437 msg_warn("regexp map %s, line %d: unknown regexp option \"%c\": "
438 "skipping this rule", mapname
, lineno
, *p
);
/*
 * Parse the primary pattern and, when present, the secondary pattern of
 * an ordinary rule through dict_regexp_get_pat(). When there is no
 * secondary pattern, second_pat->regexp is set to 0, which the rule
 * parser uses to decide whether a second pattern must be compiled. The
 * caller in this file treats a zero result as failure.
 * NOTE(review): the test that detects a secondary pattern, the remaining
 * msg_warn arguments, the update of bitrot_warned and all return
 * statements are not visible in this listing.
 */
447 /* dict_regexp_get_pats - get the primary and second patterns and flags */
449 static int dict_regexp_get_pats(const char *mapname
, int lineno
, char **p
,
450 DICT_REGEXP_PATTERN
*first_pat
,
451 DICT_REGEXP_PATTERN
*second_pat
)
455 * Get the primary and optional secondary patterns and their flags.
457 if (dict_regexp_get_pat(mapname
, lineno
, p
, first_pat
) == 0)
/*
 * Function-level static flag that gates the deprecation warning for
 * the two-pattern form.
 */
461 static int bitrot_warned
= 0;
463 if (bitrot_warned
== 0) {
464 msg_warn("regexp file %s, line %d: /pattern1/!/pattern2/ goes away,"
465 " use \"if !/pattern2/ ... /pattern1/ ... endif\" instead",
470 if (dict_regexp_get_pat(mapname
, lineno
, p
, second_pat
) == 0)
/* No secondary pattern. */
473 second_pat
->regexp
= 0;
/*
 * mac_parse() callback used by the rule parser at table load time. type
 * says what kind of fragment buf holds; context is the
 * DICT_REGEXP_PRESCAN_CONTEXT supplied by the caller. A MAC_PARSE_VARNAME
 * fragment must consist of digits only; otherwise a warning is logged and
 * MAC_PARSE_ERROR is returned. A MAC_PARSE_LITERAL fragment seen while
 * max_sub is still 0 is copied into ctxt->literal.
 * Returns MAC_PARSE_OK unless one of the error paths is taken.
 * NOTE(review): the declaration of n, the condition around the myfree()
 * call, the range test in front of the out-of-range warning, the statement
 * that raises ctxt->max_sub and the condition in front of the msg_panic()
 * call are not visible in this listing.
 */
478 /* dict_regexp_prescan - find largest $number in replacement text */
480 static int dict_regexp_prescan(int type
, VSTRING
*buf
, char *context
)
482 DICT_REGEXP_PRESCAN_CONTEXT
*ctxt
= (DICT_REGEXP_PRESCAN_CONTEXT
*) context
;
486 * Keep a copy of literal text (with $$ already replaced by $) if and
487 * only if the replacement text contains no $number expression. This way
488 * we can avoid having to scan the replacement text at lookup time.
490 if (type
== MAC_PARSE_VARNAME
) {
/* A $number was found, so a saved literal copy is no longer wanted. */
492 myfree(ctxt
->literal
);
495 if (!alldig(vstring_str(buf
))) {
496 msg_warn("regexp map %s, line %d: non-numeric replacement index \"%s\"",
497 ctxt
->mapname
, ctxt
->lineno
, vstring_str(buf
));
498 return (MAC_PARSE_ERROR
);
500 n
= atoi(vstring_str(buf
));
502 msg_warn("regexp map %s, line %d: out-of-range replacement index \"%s\"",
503 ctxt
->mapname
, ctxt
->lineno
, vstring_str(buf
));
504 return (MAC_PARSE_ERROR
);
/* Track the largest index seen so far. */
506 if (n
> ctxt
->max_sub
)
508 } else if (type
== MAC_PARSE_LITERAL
&& ctxt
->max_sub
== 0) {
510 msg_panic("regexp map %s, line %d: multiple literals but no $number",
511 ctxt
->mapname
, ctxt
->lineno
);
/* The copy is owned by the context until the caller takes or frees it. */
512 ctxt
->literal
= mystrdup(vstring_str(buf
));
514 return (MAC_PARSE_OK
);
/*
 * Allocate a regex_t with mymalloc() and compile pat->regexp into it with
 * the flags in pat->options. On the error path the problem is reported
 * through dict_regexp_regerror() and the regex_t storage is released. The
 * callers in this file treat a zero result as failure and otherwise own
 * the returned regex_t.
 * NOTE(review): the declarations of error and expr, the test of the
 * regcomp() result and both return statements are not visible in this
 * listing.
 */
517 /* dict_regexp_compile_pat - compile one pattern */
519 static regex_t
*dict_regexp_compile_pat(const char *mapname
, int lineno
,
520 DICT_REGEXP_PATTERN
*pat
)
525 expr
= (regex_t
*) mymalloc(sizeof(*expr
));
526 error
= regcomp(expr
, pat
->regexp
, pat
->options
);
/* Error path: report, then release the unused regex_t storage. */
528 dict_regexp_regerror(mapname
, lineno
, error
, expr
);
529 myfree((char *) expr
);
/*
 * Allocate size bytes for a rule and fill in the generic nesting and
 * lineno members. Callers pass the size of the specific rule structure
 * (MATCH rule, IF rule or the generic rule for ENDIF) and cast the result.
 * NOTE(review): the remaining parameters (used below as lineno and size),
 * the stores to the op and next members and the return statement are not
 * visible in this listing.
 */
535 /* dict_regexp_rule_alloc - fill in a generic rule structure */
537 static DICT_REGEXP_RULE
*dict_regexp_rule_alloc(int op
, int nesting
,
541 DICT_REGEXP_RULE
*rule
;
543 rule
= (DICT_REGEXP_RULE
*) mymalloc(size
);
545 rule
->nesting
= nesting
;
546 rule
->lineno
= lineno
;
/*
 * Parse one logical table line into a newly allocated rule. Three forms
 * are visible: an ordinary rule (one or two patterns plus replacement
 * text), an IF statement (one pattern) and an ENDIF statement. The
 * keywords are matched without regard to case. Anything else is reported
 * as an unrecognized request. Problems are reported with msg_warn().
 * Returns the rule cast to the generic rule type. The caller in this file
 * links the result into the rule list.
 * NOTE(review): many lines are not visible in this listing, among them
 * the last parameter (used below as dict_flags), the declarations of p,
 * second_exp and expr, the test that selects the ordinary-rule branch,
 * several conditions and return statements, and parts of the
 * CREATE_MATCHOP_ERROR_RETURN macro. The comments below cover the visible
 * statements only.
 */
552 /* dict_regexp_parseline - parse one rule */
554 static DICT_REGEXP_RULE
*dict_regexp_parseline(const char *mapname
, int lineno
,
555 char *line
, int nesting
,
563 * An ordinary rule takes one or two patterns and replacement text.
566 DICT_REGEXP_PATTERN first_pat
;
567 DICT_REGEXP_PATTERN second_pat
;
568 DICT_REGEXP_PRESCAN_CONTEXT prescan_context
;
569 regex_t
*first_exp
= 0;
571 DICT_REGEXP_MATCH_RULE
*match_rule
;
574 * Get the primary and the optional secondary patterns.
576 if (!dict_regexp_get_pats(mapname
, lineno
, &p
, &first_pat
, &second_pat
))
580 * Get the replacement text.
582 while (*p
&& ISSPACE(*p
))
585 msg_warn("regexp map %s, line %d: using empty replacement string",
590 * Find the highest-numbered $number in the replacement text. We can
591 * speed up pattern matching 1) by passing hints to the regexp
592 * compiler, setting the REG_NOSUB flag when the replacement text
593 * contains no $number string; 2) by passing hints to the regexp
594 * execution code, limiting the amount of text that is made available
/* Start state for dict_regexp_prescan(): nothing seen yet. */
597 prescan_context
.mapname
= mapname
;
598 prescan_context
.lineno
= lineno
;
599 prescan_context
.max_sub
= 0;
600 prescan_context
.literal
= 0;
/*
 * Common cleanup for every error exit of the ordinary-rule branch. The
 * visible part releases the compiled primary pattern and the literal copy
 * made by the pre-scan.
 * NOTE(review): several lines of this macro are not visible.
 */
603 * The optimizer will eliminate code duplication and/or dead code.
605 #define CREATE_MATCHOP_ERROR_RETURN(rval) do { \
607 regfree(first_exp); \
608 myfree((char *) first_exp); \
610 if (prescan_context.literal) \
611 myfree(prescan_context.literal); \
615 if (mac_parse(p
, dict_regexp_prescan
, (char *) &prescan_context
)
617 msg_warn("regexp map %s, line %d: bad replacement syntax: "
618 "skipping this rule", mapname
, lineno
);
619 CREATE_MATCHOP_ERROR_RETURN(0);
623 * Compile the primary and the optional secondary pattern. Speed up
624 * execution when no matched text needs to be substituted into the
625 * result string, or when the highest numbered substring is less than
626 * the total number of () subpatterns.
628 if (prescan_context
.max_sub
== 0)
629 first_pat
.options
|= REG_NOSUB
;
/* A negated primary pattern cannot supply text for $number. */
630 if (prescan_context
.max_sub
> 0 && first_pat
.match
== 0) {
631 msg_warn("regexp map %s, line %d: $number found in negative match "
632 "replacement text: skipping this rule", mapname
, lineno
);
633 CREATE_MATCHOP_ERROR_RETURN(0);
/* The DICT_FLAG_NO_REGSUB flag forbids $number in replacements. */
635 if (prescan_context
.max_sub
> 0 && (dict_flags
& DICT_FLAG_NO_REGSUB
)) {
636 msg_warn("regexp map %s, line %d: "
637 "regular expression substitution is not allowed: "
638 "skipping this rule", mapname
, lineno
);
639 CREATE_MATCHOP_ERROR_RETURN(0);
641 if ((first_exp
= dict_regexp_compile_pat(mapname
, lineno
,
643 CREATE_MATCHOP_ERROR_RETURN(0);
/* The largest $number must not exceed the subexpression count. */
644 if (prescan_context
.max_sub
> first_exp
->re_nsub
) {
645 msg_warn("regexp map %s, line %d: out of range replacement index \"%d\": "
646 "skipping this rule", mapname
, lineno
,
647 (int) prescan_context
.max_sub
);
648 CREATE_MATCHOP_ERROR_RETURN(0);
/* The secondary pattern never supplies substrings. */
650 if (second_pat
.regexp
!= 0) {
651 second_pat
.options
|= REG_NOSUB
;
652 if ((second_exp
= dict_regexp_compile_pat(mapname
, lineno
,
654 CREATE_MATCHOP_ERROR_RETURN(0);
/* Success: build the MATCH rule; it takes over the compiled patterns. */
658 match_rule
= (DICT_REGEXP_MATCH_RULE
*)
659 dict_regexp_rule_alloc(DICT_REGEXP_OP_MATCH
, nesting
, lineno
,
660 sizeof(DICT_REGEXP_MATCH_RULE
));
661 match_rule
->first_exp
= first_exp
;
662 match_rule
->first_match
= first_pat
.match
;
663 match_rule
->max_sub
= prescan_context
.max_sub
;
664 match_rule
->second_exp
= second_exp
;
665 match_rule
->second_match
= second_pat
.match
;
/*
 * Use the literal copy from the pre-scan when there is one; the other
 * visible assignment stores a private copy of the replacement text.
 */
666 if (prescan_context
.literal
)
667 match_rule
->replacement
= prescan_context
.literal
;
669 match_rule
->replacement
= mystrdup(p
);
670 return ((DICT_REGEXP_RULE
*) match_rule
);
674 * The IF operator takes one pattern but no replacement text.
/* The keyword must not be followed by a letter or digit. */
676 else if (strncasecmp(p
, "IF", 2) == 0 && !ISALNUM(p
[2])) {
677 DICT_REGEXP_PATTERN pattern
;
679 DICT_REGEXP_IF_RULE
*if_rule
;
682 while (*p
&& ISSPACE(*p
))
684 if (!dict_regexp_get_pat(mapname
, lineno
, &p
, &pattern
))
686 while (*p
&& ISSPACE(*p
))
689 msg_warn("regexp map %s, line %d: ignoring extra text after"
690 " IF statement: \"%s\"", mapname
, lineno
, p
);
691 msg_warn("regexp map %s, line %d: do not prepend whitespace"
692 " to statements between IF and ENDIF", mapname
, lineno
);
694 if ((expr
= dict_regexp_compile_pat(mapname
, lineno
, &pattern
)) == 0)
696 if_rule
= (DICT_REGEXP_IF_RULE
*)
697 dict_regexp_rule_alloc(DICT_REGEXP_OP_IF
, nesting
, lineno
,
698 sizeof(DICT_REGEXP_IF_RULE
));
699 if_rule
->expr
= expr
;
700 if_rule
->match
= pattern
.match
;
701 return ((DICT_REGEXP_RULE
*) if_rule
);
705 * The ENDIF operator takes no patterns and no replacement text.
707 else if (strncasecmp(p
, "ENDIF", 5) == 0 && !ISALNUM(p
[5])) {
708 DICT_REGEXP_RULE
*rule
;
712 msg_warn("regexp map %s, line %d: ignoring ENDIF without matching IF",
716 while (*p
&& ISSPACE(*p
))
719 msg_warn("regexp map %s, line %d: ignoring extra text after ENDIF",
/* ENDIF needs only the generic rule members. */
721 rule
= dict_regexp_rule_alloc(DICT_REGEXP_OP_ENDIF
, nesting
, lineno
,
722 sizeof(DICT_REGEXP_RULE
));
727 * Unrecognized input.
730 msg_warn("regexp map %s, line %d: ignoring unrecognized request",
/*
 * Public entry point. Creates a regexp dictionary for the file mapname,
 * installs the lookup and close methods, reads the file one logical line
 * at a time and turns each line into a rule that is appended to the rule
 * list. Failure to open the file is fatal. After parsing, one regmatch_t
 * array is allocated for the whole table. Returns the generic DICT part
 * of the new dictionary, passed through DICT_DEBUG.
 * The second parameter is not referenced in the visible lines.
 * NOTE(review): the declarations of map_fp, lineno, nesting, max_sub and
 * p, the handling of empty lines and of a failed parse, the bodies of the
 * IF and ENDIF branches, the update of last_rule and several conditions
 * are not visible in this listing.
 */
736 /* dict_regexp_open - load and compile a file containing regular expressions */
738 DICT
*dict_regexp_open(const char *mapname
, int unused_flags
, int dict_flags
)
740 DICT_REGEXP
*dict_regexp
;
742 VSTRING
*line_buffer
;
743 DICT_REGEXP_RULE
*rule
;
744 DICT_REGEXP_RULE
*last_rule
= 0;
750 line_buffer
= vstring_alloc(100);
/* Create the handle and fill in the generic dictionary members. */
752 dict_regexp
= (DICT_REGEXP
*) dict_alloc(DICT_TYPE_REGEXP
, mapname
,
753 sizeof(*dict_regexp
));
754 dict_regexp
->dict
.lookup
= dict_regexp_lookup
;
755 dict_regexp
->dict
.close
= dict_regexp_close
;
756 dict_regexp
->dict
.flags
= dict_flags
| DICT_FLAG_PATTERN
;
/* The fold buffer is created up front when key folding is requested. */
757 if (dict_flags
& DICT_FLAG_FOLD_MUL
)
758 dict_regexp
->dict
.fold_buf
= vstring_alloc(10);
759 dict_regexp
->head
= 0;
760 dict_regexp
->pmatch
= 0;
761 dict_regexp
->expansion_buf
= 0;
764 * Parse the regexp table.
766 if ((map_fp
= vstream_fopen(mapname
, O_RDONLY
, 0)) == 0)
767 msg_fatal("open %s: %m", mapname
);
769 while (readlline(line_buffer
, map_fp
, &lineno
)) {
770 p
= vstring_str(line_buffer
);
/* Cut the line at the position returned by trimblanks(). */
771 trimblanks(p
, 0)[0] = 0;
774 rule
= dict_regexp_parseline(mapname
, lineno
, p
, nesting
, dict_flags
);
/* Remember the largest $number used by any MATCH rule. */
777 if (rule
->op
== DICT_REGEXP_OP_MATCH
) {
778 if (((DICT_REGEXP_MATCH_RULE
*) rule
)->max_sub
> max_sub
)
779 max_sub
= ((DICT_REGEXP_MATCH_RULE
*) rule
)->max_sub
;
780 } else if (rule
->op
== DICT_REGEXP_OP_IF
) {
782 } else if (rule
->op
== DICT_REGEXP_OP_ENDIF
) {
/* Append: the visible stores set the list head or the previous link. */
786 dict_regexp
->head
= rule
;
788 last_rule
->next
= rule
;
793 msg_warn("regexp map %s, line %d: more IFs than ENDIFs",
797 * Allocate space for only as many matched substrings as used in the
/* One extra slot, because lookups request max_sub + 1 entries. */
801 dict_regexp
->pmatch
=
802 (regmatch_t
*) mymalloc(sizeof(regmatch_t
) * (max_sub
+ 1));
/* Parsing is done; the line buffer and the stream are released. */
807 vstring_free(line_buffer
);
808 vstream_fclose(map_fp
);
810 return (DICT_DEBUG (&dict_regexp
->dict
));