1 /* $NetBSD: mime_header.c,v 1.7 2009/01/18 01:29:57 lukem Exp $ */
4 * Copyright (c) 2006 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
34 * This module contains the core MIME header decoding routines.
35 * Please refer to RFC 2047 and RFC 2822.
40 #include <sys/cdefs.h>
42 __RCSID("$NetBSD: mime_header.c,v 1.7 2009/01/18 01:29:57 lukem Exp $");
43 #endif /* not __lint__ */
53 #include "mime_header.h"
54 #include "mime_codecs.h"
57 * Our interface to mime_b64tobin()
59 * XXX - This should move to mime_codecs.c.
62 mime_B64_decode(char *outbuf
, size_t outlen
, const char *inbuf
, size_t inlen
)
64 if (outlen
< 3 * roundup(inlen
, 4) / 4)
67 return mime_b64tobin(outbuf
, inbuf
, inlen
);
72 * Header specific "quoted-printable" decode!
73 * Differences with body QP decoding (see rfc 2047, sec 4.2):
74 * 1) '=' occurs _only_ when followed by two hex digits (FWS is not allowed).
75 * 2) Spaces can be encoded as '_' in headers for readability.
77 * XXX - This should move to mime_codecs.c.
80 mime_QPh_decode(char *outbuf
, size_t outlen
, const char *inbuf
, size_t inlen
)
82 const char *p
, *inend
;
86 outend
= outbuf
+ outlen
;
87 inend
= inbuf
+ inlen
;
89 for (p
= inbuf
; p
< inend
; p
++) {
102 c
= strtol(buf
, &bufend
, 16);
103 if (bufend
!= &buf
[2])
110 else if (*p
== '_') /* headers may encode ' ' as '_' */
119 grab_charset(char *from_cs
, size_t from_cs_len
, const char *p
)
123 for (/*EMPTY*/; *p
!= '?'; p
++) {
124 if (*p
== '\0' || q
>= from_cs
+ from_cs_len
- 1)
129 return ++p
; /* if here, then we got the '?' */
133 * An encoded word is a string of at most 75 non-white space
134 * characters of the following form:
136 * =?charset?X?encoding?=
139 * 'charset' is the original character set of the unencoded string.
141 * 'X' is the encoding type 'B' or 'Q' for "base64" or
142 * "quoted-printable", respectively,
143 * 'encoding' is the encoded string.
145 * Both 'charset' and 'X' are case independent and 'encoding' cannot
146 * contain any whitespace or '?' characters. The 'encoding' must also
147 * be fully contained within the encoded words, i.e., it cannot be
148 * split between encoded words.
150 * Note: the 'Q' encoding is a slightly modified "quoted-printable"
151 * encoding. In particular, spaces (' ') may be encoded as '_' to
152 * improve undecoded readability.
155 decode_word(const char **ibuf
, char **obuf
, char *oend
, const char *to_cs
)
158 size_t enclen
, dstlen
;
159 char decword
[LINESIZE
];
160 char from_cs
[LINESIZE
];
161 const char *encword
, *iend
, *p
;
166 if (p
[0] != '=' && p
[1] != '?')
168 if (strlen(p
) < 2 + 1 + 3 + 1 + 2)
170 p
= grab_charset(from_cs
, sizeof(from_cs
), p
+ 2);
178 if (p
== NULL
|| p
[1] != '=')
180 enclen
= p
- encword
; /* length of encoded substring */
182 /* encoded words are at most 75 characters (RFC 2047, sec 2) */
183 if (iend
> *ibuf
+ 75)
186 if (oend
< *obuf
+ 1) {
187 assert(/*CONSTCOND*/ 0); /* We have a coding error! */
190 dstend
= to_cs
? decword
: *obuf
;
191 dstlen
= (to_cs
? sizeof(decword
) : (size_t)(oend
- *obuf
)) - 1;
193 if (enctype
== 'B' || enctype
== 'b')
194 declen
= mime_B64_decode(dstend
, dstlen
, encword
, enclen
);
195 else if (enctype
== 'Q' || enctype
== 'q')
196 declen
= mime_QPh_decode(dstend
, dstlen
, encword
, enclen
);
204 #ifdef CHARSET_SUPPORT
211 cd
= iconv_open(to_cs
, from_cs
);
212 if (cd
== (iconv_t
)-1)
218 dstlen
= oend
- *obuf
- 1;
219 cnt
= mime_iconv(cd
, &src
, &srclen
, &dstend
, &dstlen
);
221 (void)iconv_close(cd
);
222 if (cnt
== (size_t)-1)
225 #endif /* CHARSET_SUPPORT */
234 * Folding White Space. See RFC 2822.
236 * Note: RFC 2822 specifies that '\n' and '\r' only occur as CRLF
237 * pairs (i.e., "\r\n") and never separately. However, by the time
238 * mail(1) sees the messages, all CRLF pairs have been converted to '\n'.
241 * XXX - pull is_FWS() and skip_FWS() up to def.h?
246 return c
== ' ' || c
== '\t' || c
== '\n';
249 static inline const char *
250 skip_FWS(const char *p
)
258 copy_skipped_FWS(char **dst
, char *dstend
, const char **src
, const char *srcend
)
260 const char *p
, *pend
;
268 if (p
) { /* copy any skipped linear-white-space */
269 while (p
< pend
&& q
< qend
)
277 * Decode an unstructured field.
279 * See RFC 2822 Sec 2.2.1 and 3.6.5.
280 * Encoded words may occur anywhere in unstructured fields provided
281 * they are separated from any other text or encoded words by at least
282 * one linear-white-space character. (See RFC 2047 sec 5.1.) If two
283 * encoded words occur sequentially (separated by only FWS) then the
284 * separating FWS is removed.
286 * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
287 * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
288 * (or any non-whitespace character) immediately before an
289 * encoded-word will prevent it from being decoded.
291 * hstring should be a NUL-terminated string.
292 * outbuf should be sufficiently large to hold the result.
295 mime_decode_usfield(char *outbuf
, size_t outsize
, const char *hstring
)
302 charset
= value(ENAME_MIME_CHARSET
);
303 qend
= outbuf
+ outsize
- 1; /* Make sure there is room for the trailing NULL! */
307 lastc
= (unsigned char)' ';
308 while (*p
&& q
< qend
) {
311 if (is_FWS(lastc
) && p
[0] == '=' && p
[1] == '?' &&
312 decode_word((p1
= p
, &p1
), (q1
= q
, &q1
), qend
, charset
) == 0 &&
313 (*p1
== '\0' || is_FWS(*p1
))) {
314 p0
= p1
; /* pointer to first character after encoded word */
317 lastc
= (unsigned char)*p0
;
320 copy_skipped_FWS(&q
, qend
, &p0
, p
);
321 lastc
= (unsigned char)*p
;
326 copy_skipped_FWS(&q
, qend
, &p0
, p
);
331 * Decode a field comment.
333 * Comments only occur in structured fields, can be nested (rfc 2822,
334 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
335 * Otherwise, they can be regarded as unstructured fields that are
336 * bounded by '(' and ')' characters.
339 decode_comment(char **obuf
, char *oend
, const char **ibuf
, const char *iend
, const char *charset
)
341 const char *p
, *pend
, *p0
;
351 while (p
< pend
&& q
< qend
) {
355 if (is_FWS(lastc
) && p
[0] == '=' && p
[1] == '?' &&
356 decode_word((p1
= p
, &p1
), (q1
= q
, &q1
), qend
, charset
) == 0 &&
357 (*p1
== ')' || is_FWS(*p1
))) {
358 lastc
= (unsigned char)*p1
;
363 * XXX - this check should be unnecessary as *pend should
364 * be '\0' which will stop skip_FWS()
370 copy_skipped_FWS(&q
, qend
, &p0
, p
);
371 if (q
>= qend
) /* XXX - q > qend cannot happen */
375 *q
++ = *p
++; /* copy the closing ')' */
376 break; /* and get out of here! */
380 *q
++ = *p
++; /* copy the opening '(' */
381 if (decode_comment(&q
, qend
, &p
, pend
, charset
) == -1)
382 return -1; /* is this right or should we update? */
385 else if (*p
== '\\' && p
+ 1 < pend
) { /* quoted-pair */
386 if (p
[1] == '(' || p
[1] == ')' || p
[1] == '\\') /* need quoted-pair*/
389 lastc
= (unsigned char)*p
;
394 lastc
= (unsigned char)*p
;
405 * Decode a quoted-string or no-fold-quote.
407 * These cannot contain encoded words. They can contain quoted-pairs,
408 * making '\\' special. They have no other structure. See RFC 2822
409 * sec 3.2.5 and 3.6.4.
412 decode_quoted_string(char **obuf
, char *oend
, const char **ibuf
, const char *iend
)
414 const char *p
, *pend
;
421 while (p
< pend
&& q
< qend
) {
423 *q
++ = *p
++; /* copy the closing '"' */
426 if (*p
== '\\' && p
+ 1 < pend
) { /* quoted-pair */
427 if (p
[1] == '"' || p
[1] == '\\') {
441 * Decode a domain-literal or no-fold-literal.
443 * These cannot contain encoded words. They can have quoted pairs and
444 * are delimited by '[' and ']' making '\\', '[', and ']' special.
445 * They have no other structure. See RFC 2822 sec 3.4.1 and 3.6.4.
448 decode_domain_literal(char **obuf
, char *oend
, const char **ibuf
, const char *iend
)
450 const char *p
, *pend
;
457 while (p
< pend
&& q
< qend
) {
459 *q
++ = *p
++; /* copy the closing ']' */
462 if (*p
== '\\' && p
+ 1 < pend
) { /* quoted-pair */
463 if (p
[1] == '[' || p
[1] == ']' || p
[1] == '\\') {
477 * Specials: see RFC 2822 sec 3.2.1.
482 static const char specialtab
[] = {
483 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
486 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
488 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493 return !(c
& ~0x7f) ? specialtab
[c
] : 0;
497 * Decode a structured field.
499 * At the top level, structured fields can only contain encoded-words
500 * via 'phrases' and 'comments'. See RFC 2047 sec 5.
503 mime_decode_sfield(char *linebuf
, size_t bufsize
, const char *hstring
)
505 const char *p
, *pend
, *p0
;
510 charset
= value(ENAME_MIME_CHARSET
);
514 pend
= hstring
+ strlen(hstring
);
515 qend
= linebuf
+ bufsize
- 1; /* save room for the NULL terminator */
516 lastc
= (unsigned char)' ';
518 while (p
< pend
&& q
< qend
) {
523 copy_skipped_FWS(&q
, qend
, &p0
, p
);
529 case '(': /* start of comment */
530 *q
++ = *p
++; /* copy the opening '(' */
531 (void)decode_comment(&q
, qend
, &p
, pend
, charset
);
532 lastc
= (unsigned char)p
[-1];
535 case '"': /* start of quoted-string or no-fold-quote */
536 *q
++ = *p
++; /* copy the opening '"' */
537 decode_quoted_string(&q
, qend
, &p
, pend
);
538 lastc
= (unsigned char)p
[-1];
541 case '[': /* start of domain-literal or no-fold-literal */
542 *q
++ = *p
++; /* copy the opening '[' */
543 decode_domain_literal(&q
, qend
, &p
, pend
);
544 lastc
= (unsigned char)p
[-1];
547 case '\\': /* start of quoted-pair */
548 if (p
+ 1 < pend
) { /* quoted pair */
549 if (is_specials(p
[1])) {
554 p
++; /* skip the '\\' */
560 * At this level encoded words can appear via
561 * 'phrases' (possibly delimited by ',' as in
562 * 'keywords'). Thus we handle them as such.
563 * Hopefully this is sufficient.
565 if ((lastc
== ',' || is_FWS(lastc
)) && p
[1] == '?' &&
566 decode_word((p1
= p
, &p1
), (q1
= q
, &q1
), qend
, charset
) == 0 &&
567 (*p1
== '\0' || *p1
== ',' || is_FWS(*p1
))) {
568 lastc
= (unsigned char)*p1
;
573 * XXX - this check should be
574 * unnecessary as *pend should be '\0'
575 * which will stop skip_FWS()
582 copy_skipped_FWS(&q
, qend
, &p0
, p
);
588 case '<': /* start of angle-addr, msg-id, or path. */
590 * A msg-id cannot contain quoted-pairs or
591 * encoded-words, but angle-addr and path can.
592 * Distinguishing between them seems to be
593 * unnecessary, so let's be loose and just
594 * decode them as if they were all the same.
598 lastc
= (unsigned char)*p
;
603 copy_skipped_FWS(&q
, qend
, &p0
, p
);
604 *q
= '\0'; /* null terminate the result! */
608 * Returns the correct hfield decoder, or NULL if none.
609 * Info extracted from RFC 2822.
611 * name - pointer to field name of header line (with colon).
613 PUBLIC hfield_decoder_t
614 mime_hfield_decoder(const char *name
)
616 static const struct field_decoder_tbl_s
{
617 const char *field_name
;
619 hfield_decoder_t decoder
;
620 } field_decoder_tbl
[] = {
621 #define X(s) s, sizeof(s) - 1
622 { X("Received:"), NULL
},
624 { X("Content-Type:"), NULL
},
625 { X("Content-Disposition:"), NULL
},
626 { X("Content-Transfer-Encoding:"), NULL
},
627 { X("Content-Description:"), mime_decode_sfield
},
628 { X("Content-ID:"), mime_decode_sfield
},
629 { X("MIME-Version:"), mime_decode_sfield
},
631 { X("Bcc:"), mime_decode_sfield
},
632 { X("Cc:"), mime_decode_sfield
},
633 { X("Date:"), mime_decode_sfield
},
634 { X("From:"), mime_decode_sfield
},
635 { X("In-Reply-To:"), mime_decode_sfield
},
636 { X("Keywords:"), mime_decode_sfield
},
637 { X("Message-ID:"), mime_decode_sfield
},
638 { X("References:"), mime_decode_sfield
},
639 { X("Reply-To:"), mime_decode_sfield
},
640 { X("Return-Path:"), mime_decode_sfield
},
641 { X("Sender:"), mime_decode_sfield
},
642 { X("To:"), mime_decode_sfield
},
643 { X("Subject:"), mime_decode_usfield
},
644 { X("Comments:"), mime_decode_usfield
},
645 { X("X-"), mime_decode_usfield
},
646 { NULL
, 0, mime_decode_usfield
}, /* optional-fields */
649 const struct field_decoder_tbl_s
*fp
;
651 /* XXX - this begs for a hash table! */
652 for (fp
= field_decoder_tbl
; fp
->field_name
; fp
++)
653 if (strncasecmp(name
, fp
->field_name
, fp
->field_len
) == 0)
658 #endif /* MIME_SUPPORT */