7 /* RFC 822 address parser
11 /* TOK822 *tok822_scan_limit(str, tailp, limit)
16 /* TOK822 *tok822_scan(str, tailp)
20 /* TOK822 *tok822_parse_limit(str, limit)
24 /* TOK822 *tok822_parse(str)
27 /* TOK822 *tok822_scan_addr(str)
30 /* VSTRING *tok822_externalize(buffer, tree, flags)
35 /* VSTRING *tok822_internalize(buffer, tree, flags)
40 /* This module converts address lists between string form and parse
41 /* tree formats. The string form can appear in two different ways:
42 /* external (or quoted) form, as used in message headers, and internal
43 /* (unquoted) form, as used internally by the mail software.
44 /* Although RFC 822 expects 7-bit data, these routines pay no
45 /* special attention to 8-bit characters.
47 /* tok822_scan() converts the external-form string in \fIstr\fR
48 /* to a linear token list. The \fItailp\fR argument is a null pointer
49 /* or receives the pointer value of the last result list element.
51 /* tok822_scan_limit() implements tok822_scan(), which is a macro.
52 /* The \fIlimit\fR argument is either zero or an upper bound on the
53 /* number of tokens produced.
55 /* tok822_parse() converts the external-form address list in
56 /* \fIstr\fR to the corresponding token tree. The parser is permissive
57 /* and will not throw away information that it does not understand.
58 /* The parser adds missing commas between addresses.
60 /* tok822_parse_limit() implements tok822_parse(), which is a macro.
61 /* The \fIlimit\fR argument is either zero or an upper bound on the
62 /* number of tokens produced.
64 /* tok822_scan_addr() converts the external-form string in
65 /* \fIstr\fR to an address token tree. This is just string to
66 /* token list conversion; no parsing is done. This routine is
67 /* suitable for data that should contain just one address and no other information.
70 /* tok822_externalize() converts a token list to external form.
71 /* Where appropriate, characters and strings are quoted and white
72 /* space is inserted. The \fIflags\fR argument is the binary OR of
73 /* zero or more of the following:
74 /* .IP TOK822_STR_WIPE
75 /* Initially, truncate the result to zero length.
76 /* .IP TOK822_STR_TERM
77 /* Append a null terminator to the result when done.
78 /* .IP TOK822_STR_LINE
79 /* Append a line break after each comma token, instead of appending
80 /* whitespace. It is up to the caller to concatenate short lines to
81 /* produce longer ones.
82 /* .IP TOK822_STR_TRNC
83 /* Truncate non-address information to 250 characters per address, to
84 /* protect Sendmail systems that are vulnerable to the problem in CERT
85 /* advisory CA-2003-07.
86 /* This flag has effect with tok822_externalize() only.
88 /* The macro TOK822_STR_NONE expresses that none of the above features
89 /* should be activated.
91 /* The macro TOK822_STR_DEFL combines the TOK822_STR_WIPE and
92 /* TOK822_STR_TERM flags. This is useful for most token to string conversions.
95 /* The macro TOK822_STR_HEAD combines the TOK822_STR_TERM,
96 /* TOK822_STR_LINE and TOK822_STR_TRNC flags. This is useful for
97 /* the special case of token to mail header conversion.
99 /* tok822_internalize() converts a token list to string form,
100 /* without quoting. White space is inserted where appropriate.
101 /* The \fIflags\fR argument is as with tok822_externalize().
105 /* RFC 822 (ARPA Internet Text Messages). In addition to this standard
106 /* this module implements additional operators such as % and !. These
107 /* are needed because the real world is not all RFC 822. Also, the ':'
108 /* operator is allowed to appear inside addresses, to accommodate DECnet.
109 /* In addition, 8-bit data is not given special treatment.
113 /* The Secure Mailer license must be distributed with this software.
116 /* IBM T.J. Watson Research
118 /* Yorktown Heights, NY 10598, USA
121 /* System library. */
123 #include <sys_defs.h>
127 /* Utility library. */
131 #include <stringops.h>
133 /* Global library. */
136 #include "quote_822_local.h"
140 * I suppose this is my favorite macro. Used heavily for tokenizing.
142 #define COLLECT(t,s,c,cond) { \
143 while ((c = *(unsigned char *) s) != 0) { \
145 if ((c = *(unsigned char *)++s) == 0) \
147 } else if (!(cond)) { \
150 VSTRING_ADDCH(t->vstr, IS_SPACE_TAB_CR_LF(c) ? ' ' : c); \
153 VSTRING_TERMINATE(t->vstr); \
156 #define COLLECT_SKIP_LAST(t,s,c,cond) { COLLECT(t,s,c,cond); if (*s) s++; }
159 * Not quite as complex. The parser depends heavily on it.
161 #define SKIP(tp, cond) { \
162 while (tp->type && (cond)) \
166 #define MOVE_COMMENT_AND_CONTINUE(tp, right) { \
167 TOK822 *prev = tok822_unlink(tp); \
168 right = tok822_prepend(right, tp); \
173 #define SKIP_MOVE_COMMENT(tp, cond, right) { \
174 while (tp->type && (cond)) { \
175 if (tp->type == TOK822_COMMENT) \
176 MOVE_COMMENT_AND_CONTINUE(tp, right); \
182 * Single-character operators. We include the % and ! operators because not
183 * all the world is RFC822. XXX Make this operator list configurable when we
184 * have a real rewriting language. Include | for aliases file parsing.
186 static char tok822_opchar
[] = "|%!" LEX_822_SPECIALS
;
187 static void tok822_quote_atom(TOK822
*);
188 static const char *tok822_comment(TOK822
*, const char *);
189 static TOK822
*tok822_group(int, TOK822
*, TOK822
*, int);
190 static void tok822_copy_quoted(VSTRING
*, char *, char *);
191 static int tok822_append_space(TOK822
*);
193 #define DO_WORD (1<<0) /* finding a word is ok here */
194 #define DO_GROUP (1<<1) /* doing an address group */
196 #define ADD_COMMA ',' /* resynchronize */
197 #define NO_MISSING_COMMA 0
199 /* tok822_internalize - token tree to string, internal form */
201 VSTRING
*tok822_internalize(VSTRING
*vp
, TOK822
*tree
, int flags
)
205 if (flags
& TOK822_STR_WIPE
)
208 for (tp
= tree
; tp
; tp
= tp
->next
) {
211 VSTRING_ADDCH(vp
, tp
->type
);
212 if (flags
& TOK822_STR_LINE
) {
213 VSTRING_ADDCH(vp
, '\n');
218 tok822_internalize(vp
, tp
->head
, TOK822_STR_NONE
);
223 vstring_strcat(vp
, vstring_str(tp
->vstr
));
226 VSTRING_ADDCH(vp
, '[');
227 vstring_strcat(vp
, vstring_str(tp
->vstr
));
228 VSTRING_ADDCH(vp
, ']');
230 case TOK822_STARTGRP
:
231 VSTRING_ADDCH(vp
, ':');
234 if (tp
->type
>= TOK822_MINTOK
)
235 msg_panic("tok822_internalize: unknown operator %d", tp
->type
);
236 VSTRING_ADDCH(vp
, tp
->type
);
238 if (tok822_append_space(tp
))
239 VSTRING_ADDCH(vp
, ' ');
241 if (flags
& TOK822_STR_TERM
)
242 VSTRING_TERMINATE(vp
);
246 /* strip_address - strip non-address text from address expression */
248 static void strip_address(VSTRING
*vp
, ssize_t start
, TOK822
*addr
)
253 * Emit plain <address>. Discard any comments or phrases.
255 VSTRING_TERMINATE(vp
);
256 msg_warn("stripping too many comments from address: %.100s...",
257 printable(vstring_str(vp
) + start
, '?'));
258 vstring_truncate(vp
, start
);
259 VSTRING_ADDCH(vp
, '<');
261 tmp
= vstring_alloc(100);
262 tok822_internalize(tmp
, addr
, TOK822_STR_TERM
);
263 quote_822_local_flags(vp
, vstring_str(tmp
),
264 QUOTE_FLAG_8BITCLEAN
| QUOTE_FLAG_APPEND
);
267 VSTRING_ADDCH(vp
, '>');
270 /* tok822_externalize - token tree to string, external form */
272 VSTRING
*tok822_externalize(VSTRING
*vp
, TOK822
*tree
, int flags
)
281 * Guard against a Sendmail buffer overflow (CERT advisory CA-2003-07).
282 * The problem was that Sendmail could store too much non-address text
283 * (comments, phrases, etc.) into a static 256-byte buffer.
285 * When the buffer fills up, fixed Sendmail versions remove comments etc.
286 * and reduce the information to just <$g>, which expands to <address>.
287 * No change is made when an address expression (text separated by
288 * commas) contains no address. This fix reportedly also protects
289 * Sendmail systems that are still vulnerable to this problem.
291 * Postfix takes the same approach, grudgingly. To avoid unnecessary damage,
292 * Postfix removes comments etc. only when the amount of non-address text
293 * in an address expression (text separated by commas) exceeds 250 bytes.
295 * With Sendmail, the address part of an address expression is the
296 * right-most <> instance in that expression. If an address expression
297 * contains no <>, then Postfix guarantees that it contains at most one
298 * non-comment string; that string is the address part of the address
299 * expression, so there is no ambiguity.
301 * Finally, we note that stress testing shows that other code in Sendmail
302 * 8.12.8 bluntly truncates ``text <address>'' to 256 bytes even when
303 * this means chopping the <address> somewhere in the middle. This is a
304 * loss of control that we're not entirely comfortable with. However,
305 * unbalanced quotes and dangling backslash do not seem to influence the
306 * way that Sendmail parses headers, so this is not an urgent problem.
308 #define MAX_NONADDR_LENGTH 250
310 #define RESET_NONADDR_LENGTH { \
311 start = VSTRING_LEN(vp); \
316 #define ENFORCE_NONADDR_LENGTH do { \
317 if (addr && VSTRING_LEN(vp) - addr_len > start + MAX_NONADDR_LENGTH) \
318 strip_address(vp, start, addr->head); \
321 if (flags
& TOK822_STR_WIPE
)
324 if (flags
& TOK822_STR_TRNC
)
325 RESET_NONADDR_LENGTH
;
327 for (tp
= tree
; tp
; tp
= tp
->next
) {
330 if (flags
& TOK822_STR_TRNC
)
331 ENFORCE_NONADDR_LENGTH
;
332 VSTRING_ADDCH(vp
, tp
->type
);
333 VSTRING_ADDCH(vp
, (flags
& TOK822_STR_LINE
) ? '\n' : ' ');
334 if (flags
& TOK822_STR_TRNC
)
335 RESET_NONADDR_LENGTH
;
339 * XXX In order to correctly externalize an address, it is not
340 * sufficient to quote individual atoms. There are higher-level
341 * rules that say when an address localpart needs to be quoted.
342 * We wing it with the quote_822_local() routine, which ignores
343 * the issue of atoms in the domain part that would need quoting.
347 tmp
= vstring_alloc(100);
348 tok822_internalize(tmp
, tp
->head
, TOK822_STR_TERM
);
349 addr_len
= VSTRING_LEN(vp
);
350 quote_822_local_flags(vp
, vstring_str(tmp
),
351 QUOTE_FLAG_8BITCLEAN
| QUOTE_FLAG_APPEND
);
352 addr_len
= VSTRING_LEN(vp
) - addr_len
;
357 vstring_strcat(vp
, vstring_str(tp
->vstr
));
360 VSTRING_ADDCH(vp
, '"');
361 tok822_copy_quoted(vp
, vstring_str(tp
->vstr
), "\"\\\r\n");
362 VSTRING_ADDCH(vp
, '"');
365 VSTRING_ADDCH(vp
, '[');
366 tok822_copy_quoted(vp
, vstring_str(tp
->vstr
), "\\\r\n");
367 VSTRING_ADDCH(vp
, ']');
369 case TOK822_STARTGRP
:
370 VSTRING_ADDCH(vp
, ':');
373 if (tp
->next
&& tp
->next
->type
== '>') {
377 VSTRING_ADDCH(vp
, '<');
380 if (tp
->type
>= TOK822_MINTOK
)
381 msg_panic("tok822_externalize: unknown operator %d", tp
->type
);
382 VSTRING_ADDCH(vp
, tp
->type
);
384 if (tok822_append_space(tp
))
385 VSTRING_ADDCH(vp
, ' ');
387 if (flags
& TOK822_STR_TRNC
)
388 ENFORCE_NONADDR_LENGTH
;
390 if (flags
& TOK822_STR_TERM
)
391 VSTRING_TERMINATE(vp
);
395 /* tok822_copy_quoted - copy a string while quoting */
397 static void tok822_copy_quoted(VSTRING
*vp
, char *str
, char *quote_set
)
401 while ((ch
= *(unsigned char *) str
++) != 0) {
402 if (strchr(quote_set
, ch
))
403 VSTRING_ADDCH(vp
, '\\');
404 VSTRING_ADDCH(vp
, ch
);
408 /* tok822_append_space - see if space is needed after this token */
410 static int tok822_append_space(TOK822
*tp
)
414 if (tp
== 0 || (next
= tp
->next
) == 0 || tp
->owner
!= 0)
416 if (tp
->type
== ',' || tp
->type
== TOK822_STARTGRP
|| next
->type
== '<')
419 #define NON_OPERATOR(x) \
420 (x->type == TOK822_ATOM || x->type == TOK822_QSTRING \
421 || x->type == TOK822_COMMENT || x->type == TOK822_DOMLIT \
422 || x->type == TOK822_ADDR)
424 return (NON_OPERATOR(tp
) && NON_OPERATOR(next
));
427 /* tok822_scan_limit - tokenize string */
429 TOK822
*tok822_scan_limit(const char *str
, TOK822
**tailp
, int tok_count_limit
)
438 * XXX 2822 new feature: Section 4.1 allows "." to appear in a phrase (to
439 * allow for forms such as: Johnny B. Goode <johnny@domain.org>). I cannot
440 * handle that at the tokenizer level - it is not context sensitive. And
441 * to fix this at the parser level requires radical changes to preserve
442 * white space as part of the token stream. Thanks a lot, people.
444 while ((ch
= *(unsigned char *) str
++) != 0) {
445 if (IS_SPACE_TAB_CR_LF(ch
))
448 tp
= tok822_alloc(TOK822_COMMENT
, (char *) 0);
449 str
= tok822_comment(tp
, str
);
450 } else if (ch
== '[') {
451 tp
= tok822_alloc(TOK822_DOMLIT
, (char *) 0);
452 COLLECT_SKIP_LAST(tp
, str
, ch
, ch
!= ']');
453 } else if (ch
== '"') {
454 tp
= tok822_alloc(TOK822_QSTRING
, (char *) 0);
455 COLLECT_SKIP_LAST(tp
, str
, ch
, ch
!= '"');
456 } else if (ch
!= '\\' && strchr(tok822_opchar
, ch
)) {
457 tp
= tok822_alloc(ch
, (char *) 0);
459 tp
= tok822_alloc(TOK822_ATOM
, (char *) 0);
460 str
-= 1; /* \ may be first */
461 COLLECT(tp
, str
, ch
, !IS_SPACE_TAB_CR_LF(ch
) && !strchr(tok822_opchar
, ch
));
462 tok822_quote_atom(tp
);
469 tail
= tok822_append(tail
, tp
);
471 if (tok_count_limit
> 0 && ++tok_count
>= tok_count_limit
)
479 /* tok822_parse_limit - translate external string to token tree */
481 TOK822
*tok822_parse_limit(const char *str
, int tok_count_limit
)
492 * First, tokenize the string, from left to right. We are not allowed to
493 * throw away any information that we do not understand. With a flat
494 * token list that contains all tokens, we can always convert back to string form.
497 if ((first_token
= tok822_scan_limit(str
, &last_token
, tok_count_limit
)) == 0)
501 * For convenience, sandwich the token list between two sentinel tokens.
503 #define GLUE(left,rite) { left->next = rite; rite->prev = left; }
505 head
= tok822_alloc(0, (char *) 0);
506 GLUE(head
, first_token
);
507 tail
= tok822_alloc(0, (char *) 0);
508 GLUE(last_token
, tail
);
511 * Next step is to transform the token list into a parse tree. This is
512 * done most conveniently from right to left. If there is something that
513 * we do not understand, just leave it alone, don't throw it away. The
514 * address information that we're looking for sits in-between the current
515 * node (tp) and the one called right. Add missing commas on the fly.
521 if (tp
->type
== TOK822_COMMENT
) { /* move comment to the side */
522 MOVE_COMMENT_AND_CONTINUE(tp
, right
);
523 } else if (tp
->type
== ';') { /* rh side of named group */
524 right
= tok822_group(TOK822_ADDR
, tp
, right
, ADD_COMMA
);
525 state
= DO_GROUP
| DO_WORD
;
526 } else if (tp
->type
== ':' && (state
& DO_GROUP
) != 0) {
527 tp
->type
= TOK822_STARTGRP
;
528 (void) tok822_group(TOK822_ADDR
, tp
, right
, NO_MISSING_COMMA
);
529 SKIP(tp
, tp
->type
!= ',');
532 } else if (tp
->type
== '>') { /* rh side of <route> */
533 right
= tok822_group(TOK822_ADDR
, tp
, right
, ADD_COMMA
);
534 SKIP_MOVE_COMMENT(tp
, tp
->type
!= '<', right
);
535 (void) tok822_group(TOK822_ADDR
, tp
, right
, NO_MISSING_COMMA
);
536 SKIP(tp
, tp
->type
> 0xff || strchr(">;,:", tp
->type
) == 0);
540 } else if (tp
->type
== TOK822_ATOM
|| tp
->type
== TOK822_QSTRING
541 || tp
->type
== TOK822_DOMLIT
) {
542 if ((state
& DO_WORD
) == 0)
543 right
= tok822_group(TOK822_ADDR
, tp
, right
, ADD_COMMA
)->next
;
545 } else if (tp
->type
== ',') {
546 right
= tok822_group(TOK822_ADDR
, tp
, right
, NO_MISSING_COMMA
);
553 (void) tok822_group(TOK822_ADDR
, tp
, right
, NO_MISSING_COMMA
);
556 * Discard the sentinel tokens on the left and right extremes. Properly
557 * terminate the resulting list.
559 tp
= (head
->next
!= tail
? head
->next
: 0);
560 tok822_cut_before(head
->next
);
562 tok822_cut_before(tail
);
567 /* tok822_quote_atom - see if an atom needs quoting when externalized */
569 static void tok822_quote_atom(TOK822
*tp
)
575 * RFC 822 expects 7-bit data. Rather than quoting every 8-bit character
576 * (and still passing it on as 8-bit data) we leave 8-bit data alone.
578 for (cp
= vstring_str(tp
->vstr
); (ch
= *(unsigned char *) cp
) != 0; cp
++) {
579 if ( /* !ISASCII(ch) || */ ch
== ' '
580 || ISCNTRL(ch
) || strchr(tok822_opchar
, ch
)) {
581 tp
->type
= TOK822_QSTRING
;
587 /* tok822_comment - tokenize comment */
589 static const char *tok822_comment(TOK822
*tp
, const char *str
)
595 * XXX We cheat by storing comments in their external form. Otherwise it
596 * would be a royal pain to preserve \ before (. That would require a
597 * recursive parser; the easy to implement stack-based recursion would be
600 VSTRING_ADDCH(tp
->vstr
, '(');
602 while ((ch
= *(unsigned char *) str
) != 0) {
603 VSTRING_ADDCH(tp
->vstr
, ch
);
605 if (ch
== '(') { /* comments can nest! */
607 } else if (ch
== ')') {
610 } else if (ch
== '\\') {
611 if ((ch
= *(unsigned char *) str
) == 0)
613 VSTRING_ADDCH(tp
->vstr
, ch
);
617 VSTRING_TERMINATE(tp
->vstr
);
621 /* tok822_group - cluster a group of tokens */
623 static TOK822
*tok822_group(int group_type
, TOK822
*left
, TOK822
*right
, int sync_type
)
630 * Cluster the tokens between left and right under their own parse tree
631 * node. Optionally insert a resync token.
633 if (left
!= right
&& (first
= left
->next
) != right
) {
634 tok822_cut_before(right
);
635 tok822_cut_before(first
);
636 group
= tok822_alloc(group_type
, (char *) 0);
637 tok822_sub_append(group
, first
);
638 tok822_append(left
, group
);
639 tok822_append(group
, right
);
641 sync
= tok822_alloc(sync_type
, (char *) 0);
642 tok822_append(left
, sync
);
648 /* tok822_scan_addr - convert external address string to address token */
650 TOK822
*tok822_scan_addr(const char *addr
)
652 TOK822
*tree
= tok822_alloc(TOK822_ADDR
, (char *) 0);
654 tree
->head
= tok822_scan(addr
, &tree
->tail
);
662 #include <readlline.h>
664 /* tok822_print - display token */
666 static void tok822_print(TOK822
*list
, int indent
)
670 for (tp
= list
; tp
; tp
= tp
->next
) {
671 if (tp
->type
< TOK822_MINTOK
) {
672 vstream_printf("%*s %s \"%c\"\n", indent
, "", "OP", tp
->type
);
673 } else if (tp
->type
== TOK822_ADDR
) {
674 vstream_printf("%*s %s\n", indent
, "", "address");
675 tok822_print(tp
->head
, indent
+ 2);
676 } else if (tp
->type
== TOK822_STARTGRP
) {
677 vstream_printf("%*s %s\n", indent
, "", "group \":\"");
679 vstream_printf("%*s %s \"%s\"\n", indent
, "",
680 tp
->type
== TOK822_COMMENT
? "comment" :
681 tp
->type
== TOK822_ATOM
? "atom" :
682 tp
->type
== TOK822_QSTRING
? "quoted string" :
683 tp
->type
== TOK822_DOMLIT
? "domain literal" :
684 tp
->type
== TOK822_ADDR
? "address" :
685 "unknown\n", vstring_str(tp
->vstr
));
690 int main(int unused_argc
, char **unused_argv
)
692 VSTRING
*vp
= vstring_alloc(100);
694 VSTRING
*buf
= vstring_alloc(100);
696 #define TEST_TOKEN_LIMIT 20
698 while (readlline(buf
, VSTREAM_IN
, (int *) 0)) {
699 while (VSTRING_LEN(buf
) > 0 && vstring_end(buf
)[-1] == '\n') {
700 vstring_end(buf
)[-1] = 0;
701 vstring_truncate(buf
, VSTRING_LEN(buf
) - 1);
703 if (!isatty(vstream_fileno(VSTREAM_IN
)))
704 vstream_printf(">>>%s<<<\n\n", vstring_str(buf
));
705 list
= tok822_parse_limit(vstring_str(buf
), TEST_TOKEN_LIMIT
);
706 vstream_printf("Parse tree:\n");
707 tok822_print(list
, 0);
708 vstream_printf("\n");
710 vstream_printf("Internalized:\n%s\n\n",
711 vstring_str(tok822_internalize(vp
, list
, TOK822_STR_DEFL
)));
712 vstream_fflush(VSTREAM_OUT
);
713 vstream_printf("Externalized, no newlines inserted:\n%s\n\n",
714 vstring_str(tok822_externalize(vp
, list
,
715 TOK822_STR_DEFL
| TOK822_STR_TRNC
)));
716 vstream_fflush(VSTREAM_OUT
);
717 vstream_printf("Externalized, newlines inserted:\n%s\n\n",
718 vstring_str(tok822_externalize(vp
, list
,
719 TOK822_STR_DEFL
| TOK822_STR_LINE
| TOK822_STR_TRNC
)));
720 vstream_fflush(VSTREAM_OUT
);
721 tok822_free_tree(list
);