1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim:expandtab:shiftwidth=2:tabstop=4:
4 /* ***** BEGIN LICENSE BLOCK *****
5 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
7 * The contents of this file are subject to the Mozilla Public License Version
8 * 1.1 (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 * http://www.mozilla.org/MPL/
12 * Software distributed under the License is distributed on an "AS IS" basis,
13 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
14 * for the specific language governing rights and limitations under the
17 * The Original Code is mozilla.org code.
19 * The Initial Developer of the Original Code is
20 * Netscape Communications Corporation.
21 * Portions created by the Initial Developer are Copyright (C) 1998
22 * the Initial Developer. All Rights Reserved.
26 * Jungshik Shin <jshin@mailaps.org>
27 * John G Myers <jgmyers@netscape.com>
28 * Takayuki Tei <taka@netscape.com>
30 * Alternatively, the contents of this file may be used under the terms of
31 * either the GNU General Public License Version 2 or later (the "GPL"), or
32 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
33 * in which case the provisions of the GPL or the LGPL are applicable instead
34 * of those above. If you wish to allow use of your version of this file only
35 * under the terms of either the GPL or the LGPL, and not to allow others to
36 * use your version of this file under the terms of the MPL, indicate your
37 * decision by deleting the provisions above and replace them with the notice
38 * and other provisions required by the GPL or the LGPL. If you do not delete
39 * the provisions above, a recipient may use your version of this file under
40 * the terms of any one of the MPL, the GPL or the LGPL.
42 * ***** END LICENSE BLOCK ***** */
54 #include "nsIUTF8ConverterService.h"
55 #include "nsUConvCID.h"
56 #include "nsIServiceManager.h"
57 #include "nsMIMEHeaderParamImpl.h"
58 #include "nsReadableUtils.h"
59 #include "nsNativeCharsetUtils.h"
61 // static functions declared below are moved from mailnews/mime/src/comi18n.cpp
63 static char *DecodeQ(const char *, PRUint32
);
64 static PRBool
Is7bitNonAsciiString(const char *, PRUint32
);
65 static void CopyRawHeader(const char *, PRUint32
, const char *, nsACString
&);
66 static nsresult
DecodeRFC2047Str(const char *, const char *, PRBool
, nsACString
&);
68 // XXX The chance of UTF-7 being used in the message header is really
69 // low, but in theory it's possible.
// Non-zero when the charset name starts (case-insensitively) with one of the
// 7-bit, non-ASCII-compatible families tested below: "ISO-2022", "HZ-GB" or
// "UTF-7". Only the prefix of the given length is compared.
#define IS_7BIT_NON_ASCII_CHARSET(charsetName)                  \
  (nsCRT::strncasecmp((charsetName), "ISO-2022", 8) == 0 ||     \
   nsCRT::strncasecmp((charsetName), "HZ-GB", 5) == 0 ||        \
   nsCRT::strncasecmp((charsetName), "UTF-7", 5) == 0)
75 NS_IMPL_ISUPPORTS1(nsMIMEHeaderParamImpl
, nsIMIMEHeaderParam
)
77 // XXX : aTryLocaleCharset is not yet effective.
// Looks up aParamName in the header value aHeaderVal and hands the decoded
// value back in aResult as UTF-16.
//
// Visible steps:
//   1. GetParameterInternal() extracts the raw value into |med| and reports
//      any RFC 2231 charset (|charset|) and language (aLang).
//   2. DecodeParameter() converts |med| into |str1|, using |charset| and no
//      default charset (nsnull, PR_FALSE).
//   3. The result is widened into aResult with CopyUTF8toUTF16(),
//      NS_CopyNativeToUnicode() or CopyASCIItoUTF16(); when aFallbackCharset
//      is non-empty a conversion of |str1| from that charset is attempted
//      through nsIUTF8ConverterService first.
//
// NOTE(review): several lines of this function (the return type, local
// declarations, and the conditions that choose between the conversions in
// step 3) are not visible in this listing -- confirm the exact branching
// against the complete source.
79 nsMIMEHeaderParamImpl::GetParameter(const nsACString
& aHeaderVal
,
80 const char *aParamName
,
81 const nsACString
& aFallbackCharset
,
82 PRBool aTryLocaleCharset
,
83 char **aLang
, nsAString
& aResult
)
88 // get parameter (decode RFC 2231 if it's RFC 2231-encoded and
91 nsXPIDLCString charset
;
92 rv
= GetParameterInternal(PromiseFlatCString(aHeaderVal
).get(), aParamName
,
93 getter_Copies(charset
), aLang
, getter_Copies(med
));
97 // convert to UTF-8 after charset conversion and RFC 2047 decoding
101 rv
= DecodeParameter(med
, charset
.get(), nsnull
, PR_FALSE
, str1
);
102 NS_ENSURE_SUCCESS(rv
, rv
);
104 if (!aFallbackCharset
.IsEmpty())
107 nsCOMPtr
<nsIUTF8ConverterService
>
108 cvtUTF8(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID
));
// Re-interpret str1 as aFallbackCharset; on success the converted text
// (str2) becomes the result.
110 NS_SUCCEEDED(cvtUTF8
->ConvertStringToUTF8(str1
,
111 PromiseFlatCString(aFallbackCharset
).get(), PR_FALSE
, str2
))) {
112 CopyUTF8toUTF16(str2
, aResult
);
118 CopyUTF8toUTF16(str1
, aResult
);
// Native (locale) charset path: only taken when the caller asked for it and
// the native charset is not already UTF-8.
122 if (aTryLocaleCharset
&& !NS_IsNativeUTF8())
123 return NS_CopyNativeToUnicode(str1
, aResult
);
125 CopyASCIItoUTF16(str1
, aResult
);
129 // moved almost verbatim from mimehdrs.cpp
131 // MimeHeaders_get_parameter (const char *header_value, const char *parm_name,
132 // char **charset, char **language)
134 // The format of these header lines is
135 // <token> [ ';' <token> '=' <token-or-quoted-string> ]*
// Scans aHeaderValue for the parameter named aParamName and stores a newly
// allocated, NUL-terminated copy of its value in *aResult.
//
// - An empty or null aParamName selects the first (unnamed) token of the
//   header, e.g. 'inline' in "inline; filename=...".
// - Plain parameters (case A below) are copied with CR/LF stripped.
// - RFC 2231 parameters (cases B-D below) are unescaped and, for multi-line
//   values, appended segment by segment; a charset and language found in the
//   first segment are returned through *aCharset and *aLang when those
//   pointers are non-null.
//
// Return values visible here: NS_ERROR_INVALID_ARG for a null/empty header
// value, a null aResult, or a parameter that is not found;
// NS_ERROR_OUT_OF_MEMORY when an allocation fails; NS_ERROR_UNEXPECTED in the
// unnamed-parameter path.
//
// NOTE(review): the remaining parameter declarations, the enclosing loop and
// several conditions/returns are not visible in this listing; the comments
// below describe only what the visible statements do.
137 nsMIMEHeaderParamImpl::GetParameterInternal(const char *aHeaderValue
,
138 const char *aParamName
,
143 if (!aHeaderValue
|| !*aHeaderValue
|| !aResult
)
144 return NS_ERROR_INVALID_ARG
;
// Optional out-parameters start out null so callers can tell "not present".
148 if (aCharset
) *aCharset
= nsnull
;
149 if (aLang
) *aLang
= nsnull
;
151 const char *str
= aHeaderValue
;
153 // skip leading white space.
154 for (; *str
&& nsCRT::IsAsciiSpace(*str
); ++str
)
156 const char *start
= str
;
158 // aParamName is empty. return the first (possibly) _unnamed_ 'parameter'
159 // For instance, return 'inline' in the following case:
160 // Content-Disposition: inline; filename=.....
161 if (!aParamName
|| !*aParamName
)
163 for (; *str
&& *str
!= ';' && !nsCRT::IsAsciiSpace(*str
); ++str
)
166 return NS_ERROR_UNEXPECTED
;
// Clone one byte more than the token so there is room for the terminator.
167 *aResult
= (char *) nsMemory::Clone(start
, (str
- start
) + 1);
// NOTE(review): *aResult is written on the next statement before the
// NS_ENSURE_TRUE null check that follows it; if nsMemory::Clone returns null
// this dereferences a null pointer. The check should come first.
168 (*aResult
)[str
- start
] = '\0'; // null-terminate
169 NS_ENSURE_TRUE(*aResult
, NS_ERROR_OUT_OF_MEMORY
);
173 /* Skip forward to first ';' */
174 for (; *str
&& *str
!= ';' && *str
!= ','; ++str
)
178 /* Skip over following whitespace */
179 for (; *str
&& nsCRT::IsAsciiSpace(*str
); ++str
)
182 // Some broken http servers just specify parameters
183 // like 'filename' without specifying disposition
184 // method. Rewind to the first non-white-space
190 // RFC2231 - The legitimate parm format can be:
191 // A. title=ThisIsTitle
192 // B. title*=us-ascii'en-us'This%20is%20wierd.
193 // C. title*0*=us-ascii'en'This%20is%20wierd.%20We
194 // title*1*=have%20to%20support%20this.
196 // D. title*0="Hey, what you think you are doing?"
197 // title*1="There is no charset and lang info."
199 PRInt32 paramLen
= strlen(aParamName
);
// tokenStart/tokenEnd delimit the parameter name, valueStart/valueEnd its
// value, for the parameter currently being examined.
202 const char *tokenStart
= str
;
203 const char *tokenEnd
= 0;
204 const char *valueStart
= str
;
205 const char *valueEnd
= 0;
207 NS_ASSERTION(!nsCRT::IsAsciiSpace(*str
), "should be after whitespace.");
209 // Skip forward to the end of this token.
210 for (; *str
&& !nsCRT::IsAsciiSpace(*str
) && *str
!= '=' && *str
!= ';'; str
++)
214 // Skip over whitespace, '=', and whitespace
215 while (nsCRT::IsAsciiSpace(*str
)) ++str
;
216 if (*str
== '=') ++str
;
217 while (nsCRT::IsAsciiSpace(*str
)) ++str
;
221 // The value is a token, not a quoted string.
224 *valueEnd
&& !nsCRT::IsAsciiSpace (*valueEnd
) && *valueEnd
!= ';';
231 // The value is a quoted string.
234 for (valueEnd
= str
; *valueEnd
; ++valueEnd
)
236 if (*valueEnd
== '\\')
238 else if (*valueEnd
== '"')
244 // See if this is the simplest case (case A above),
245 // a 'single' line value with no charset and lang.
246 // If so, copy it and return.
247 if (tokenEnd
- tokenStart
== paramLen
&&
248 !nsCRT::strncasecmp(tokenStart
, aParamName
, paramLen
))
250 // if the parameter spans across multiple lines we have to strip out the
251 // line continuation -- jht 4/29/98
252 nsCAutoString
tempStr(valueStart
, valueEnd
- valueStart
);
253 tempStr
.StripChars("\r\n");
254 *aResult
= ToNewCString(tempStr
);
255 NS_ENSURE_TRUE(*aResult
, NS_ERROR_OUT_OF_MEMORY
);
// RFC 2231 forms: the token is longer than the name, starts with the name
// (case-insensitively) and continues with '*'.
259 else if (tokenEnd
- tokenStart
> paramLen
&&
260 !nsCRT::strncasecmp(tokenStart
, aParamName
, paramLen
) &&
261 *(tokenStart
+ paramLen
) == '*')
263 const char *cp
= tokenStart
+ paramLen
+ 1; // 1st char past '*'
// A trailing '*' on the token marks a %-escaped value.
264 PRBool needUnescape
= *(tokenEnd
- 1) == '*';
265 // the 1st line of a multi-line parameter or a single line that needs
266 // unescaping. ( title*0*= or title*= )
267 if ((*cp
== '0' && needUnescape
) || (tokenEnd
- tokenStart
== paramLen
+ 1))
269 // look for single quotation mark(')
270 const char *sQuote1
= PL_strchr(valueStart
, 0x27);
271 const char *sQuote2
= (char *) (sQuote1
? PL_strchr(sQuote1
+ 1, 0x27) : nsnull
);
273 // Two single quotation marks must be present even in
274 // absence of charset and lang.
275 if (!sQuote1
|| !sQuote2
)
276 NS_WARNING("Mandatory two single quotes are missing in header parameter\n");
// Charset is the text between valueStart and the first single quote.
277 if (aCharset
&& sQuote1
> valueStart
&& sQuote1
< valueEnd
)
279 *aCharset
= (char *) nsMemory::Clone(valueStart
, sQuote1
- valueStart
+ 1);
281 *(*aCharset
+ (sQuote1
- valueStart
)) = 0;
// Language is the text between the two single quotes.
283 if (aLang
&& sQuote1
&& sQuote2
&& sQuote2
> sQuote1
+ 1 &&
286 *aLang
= (char *) nsMemory::Clone(sQuote1
+ 1, sQuote2
- (sQuote1
+ 1) + 1);
288 *(*aLang
+ (sQuote2
- (sQuote1
+ 1))) = 0;
291 // Be generous and handle gracefully when required
292 // single quotes are absent.
// Pointing sQuote2 just before valueStart makes "sQuote2 + 1" below refer to
// the start of the value, so the whole value is treated as the payload.
299 sQuote2
= valueStart
- 1;
301 if (sQuote2
&& sQuote2
+ 1 < valueEnd
)
303 NS_ASSERTION(!*aResult
, "This is the 1st line. result buffer should be null.");
304 *aResult
= (char *) nsMemory::Alloc(valueEnd
- (sQuote2
+ 1) + 1);
307 memcpy(*aResult
, sQuote2
+ 1, valueEnd
- (sQuote2
+ 1));
308 *(*aResult
+ (valueEnd
- (sQuote2
+ 1))) = 0;
311 nsUnescape(*aResult
);
312 if (tokenEnd
- tokenStart
== paramLen
+ 1)
313 // we're done; this is case B
318 } // end of if-block : title*0*= or title*=
319 // a line of multiline param with no need for unescaping : title*[0-9]=
320 // or 2nd or later lines of a multiline param : title*[1-9]*=
321 else if (nsCRT::IsAsciiDigit(PRUnichar(*cp
)))
324 if (*aResult
) // 2nd or later lines of multiline parameter
326 len
= strlen(*aResult
);
// Grow the accumulated value to make room for this segment plus the NUL.
327 char *ns
= (char *) nsMemory::Realloc(*aResult
, len
+ (valueEnd
- valueStart
) + 1);
330 nsMemory::Free(*aResult
);
334 else if (*cp
== '0') // must be; 1st line : title*0=
336 *aResult
= (char *) nsMemory::Alloc(valueEnd
- valueStart
+ 1);
338 // else {} something is really wrong; out of memory
341 // append a partial value
342 memcpy(*aResult
+ len
, valueStart
, valueEnd
- valueStart
);
343 *(*aResult
+ len
+ (valueEnd
- valueStart
)) = 0;
// Only the freshly appended segment is unescaped; earlier segments were
// handled when they were appended.
345 nsUnescape(*aResult
+ len
);
348 return NS_ERROR_OUT_OF_MEMORY
;
349 } // end of if-block : title*[0-9]= or title*[1-9]*=
352 // str now points after the end of the value.
353 // skip over whitespace, ';', whitespace.
355 while (nsCRT::IsAsciiSpace(*str
)) ++str
;
356 if (*str
== ';') ++str
;
357 while (nsCRT::IsAsciiSpace(*str
)) ++str
;
363 return NS_ERROR_INVALID_ARG
; // aParameter not found !!
// Decodes the header value aHeaderVal into aResult.
//
// When the value contains an RFC 2047 encoded-word marker ("=?"), or when a
// default charset is supplied and the value is either not UTF-8 or looks like
// a 7-bit non-ASCII encoding, DecodeRFC2047Str() does the work. Otherwise the
// value is copied as is. If aEatContinuations is still set afterwards, CR and
// LF characters are stripped from the result.
//
// NOTE(review): the remaining parameter(s), the argument check guarding the
// NS_ERROR_INVALID_ARG return and the final statements are not visible in
// this listing.
368 nsMIMEHeaderParamImpl::DecodeRFC2047Header(const char* aHeaderVal
,
369 const char* aDefaultCharset
,
370 PRBool aOverrideCharset
,
371 PRBool aEatContinuations
,
376 return NS_ERROR_INVALID_ARG
;
381 // If aHeaderVal is RFC 2047 encoded or is not a UTF-8 string but
382 // aDefaultCharset is specified, decodes RFC 2047 encoding and converts
383 // to UTF-8. Otherwise, just strips away CRLF.
// Precedence note: && binds tighter than ||, so this reads as
//   has "=?"  ||  (aDefaultCharset && (!IsUTF8(...) || Is7bitNonAsciiString(...)))
384 if (PL_strstr(aHeaderVal
, "=?") ||
385 aDefaultCharset
&& (!IsUTF8(nsDependentCString(aHeaderVal
)) ||
386 Is7bitNonAsciiString(aHeaderVal
, PL_strlen(aHeaderVal
)))) {
// NOTE(review): the nsresult returned by DecodeRFC2047Str is not checked here.
387 DecodeRFC2047Str(aHeaderVal
, aDefaultCharset
, aOverrideCharset
, aResult
);
388 } else if (aEatContinuations
&&
389 (PL_strchr(aHeaderVal
, '\n') || PL_strchr(aHeaderVal
, '\r'))) {
390 aResult
= aHeaderVal
;
// Nothing to strip in this branch, so the CR/LF pass below is switched off.
392 aEatContinuations
= PR_FALSE
;
393 aResult
= aHeaderVal
;
396 if (aEatContinuations
) {
397 nsCAutoString
temp(aResult
);
398 temp
.StripChars("\r\n");
// Converts a raw parameter value (aParamValue) to UTF-8.
//
// With a non-empty aCharset the value is converted from that charset through
// nsIUTF8ConverterService. Without one, backslash quoting of CR, LF, '"' and
// '\' is removed and the unquoted text is then run through
// DecodeRFC2047Header() with aDefaultCharset / aOverrideCharset and
// continuation stripping enabled (PR_TRUE).
//
// NOTE(review): the output parameter declaration, the loop body that builds
// |unQuoted| and the final assignments/returns are not visible in this
// listing.
406 nsMIMEHeaderParamImpl::DecodeParameter(const nsACString
& aParamValue
,
407 const char* aCharset
,
408 const char* aDefaultCharset
,
409 PRBool aOverrideCharset
,
413 // If aCharset is given, aParamValue was obtained from RFC2231
414 // encoding and we're pretty sure that it's in aCharset.
415 if (aCharset
&& *aCharset
)
417 nsCOMPtr
<nsIUTF8ConverterService
> cvtUTF8(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID
));
419 // skip ASCIIness/UTF8ness test if aCharset is 7bit non-ascii charset.
420 return cvtUTF8
->ConvertStringToUTF8(aParamValue
, aCharset
,
421 IS_7BIT_NON_ASCII_CHARSET(aCharset
), aResult
);
424 const nsAFlatCString
& param
= PromiseFlatCString(aParamValue
);
425 nsCAutoString unQuoted
;
426 nsACString::const_iterator s
, e
;
427 param
.BeginReading(s
);
430 // strip '\' when used to quote CR, LF, '"' and '\'
431 for ( ; s
!= e
; ++s
) {
434 --s
; // '\' is at the end. move back and append '\'.
436 else if (*s
!= nsCRT::CR
&& *s
!= nsCRT::LF
&& *s
!= '"' && *s
!= '\\') {
437 --s
; // '\' is not foll. by CR,LF,'"','\'. move back and append '\'
439 // else : skip '\' and append the quoted character.
446 nsCAutoString decoded
;
448 // Try RFC 2047 encoding, instead.
449 nsresult rv
= DecodeRFC2047Header(unQuoted
.get(), aDefaultCharset
,
450 aOverrideCharset
, PR_TRUE
, decoded
);
// The decoded text is only considered when decoding succeeded and produced
// something.
452 if (NS_SUCCEEDED(rv
) && !decoded
.IsEmpty())
// True when |c| is an ASCII hexadecimal digit: '0'-'9' (0x30-0x39),
// 'A'-'F' (0x41-0x46) or 'a'-'f' (0x61-0x66). The argument is converted to
// PRUint8 first so bytes with the high bit set never compare as negative.
//
// Each range test is parenthesized explicitly so the grouping of && and ||
// does not rely on operator precedence (and does not trip -Wparentheses).
// |c| is evaluated more than once: do not pass an expression with side
// effects.
#define ISHEXCHAR(c) \
  ((0x30 <= PRUint8(c) && PRUint8(c) <= 0x39) || \
   (0x41 <= PRUint8(c) && PRUint8(c) <= 0x46) || \
   (0x61 <= PRUint8(c) && PRUint8(c) <= 0x66))
463 // Decode Q encoding (RFC 2047).
// Decodes |length| bytes of Q-encoded text starting at |in|.
//
// The output buffer comes from PR_Calloc(length + 1, 1), i.e. it is
// zero-filled and one byte longer than the input. "=hh" escapes are
// validated with ISHEXCHAR and parsed with PR_sscanf("%2X"); an input byte
// with the high bit set jumps to the |badsyntax| label.
//
// NOTE(review): the main decoding loop header, the allocation check, the
// return statements and the |badsyntax| handler are not visible in this
// listing. DecodeRFC2047Str releases the returned buffer with PR_Free.
465 char *DecodeQ(const char *in
, PRUint32 length
)
467 char *out
, *dest
= 0;
// |dest| remembers the start of the buffer while |out| advances as the write
// cursor.
469 out
= dest
= (char *)PR_Calloc(length
+ 1, sizeof(char));
476 // check if |in| in the form of '=hh' where h is [0-9a-fA-F].
477 if (length
< 3 || !ISHEXCHAR(in
[1]) || !ISHEXCHAR(in
[2]))
479 PR_sscanf(in
+ 1, "%2X", &c
);
// Raw 8-bit octets are not accepted inside Q-encoded text.
492 if (*in
& 0x80) goto badsyntax
;
// Post-processing pass over the decoded bytes, from the start of the buffer
// up to the first NUL (loop body not visible in this listing).
499 for (out
= dest
; *out
; ++out
) {
511 // check if input is HZ (a 7bit encoding for simplified Chinese : RFC 1842)
512 // or has ESC which may be an indication that it's in one of many ISO
513 // 2022 7bit encodings (e.g. ISO-2022-JP(-2)/CN : see RFC 1468, 1922, 1554).
// Classifies a byte string as "7-bit but not plain ASCII".
//
// Returns PR_FALSE as soon as a byte with the high bit set is seen (the
// input is not 7-bit), PR_TRUE as soon as an ESC (0x1B) is seen, and
// otherwise reports whether at least one complete HZ sequence was found
// (hz_state == hz_seen) by the small state machine below.
//
// NOTE(review): the loop header over |len| bytes, the declarations of |c|
// and |hz_state| and parts of the HZ state transitions are not visible in
// this listing.
515 PRBool
Is7bitNonAsciiString(const char *input
, PRUint32 len
)
519 enum { hz_initial
, // No HZ seen yet
520 hz_escaped
, // Inside an HZ ~{ escape sequence
521 hz_seen
, // Have seen at least one complete HZ sequence
522 hz_notpresent
// Have seen something that is not legal HZ
525 hz_state
= hz_initial
;
// Read the next byte as unsigned and advance the input cursor.
527 c
= PRUint8(*input
++);
// An 8-bit octet rules out every 7-bit encoding.
529 if (c
& 0x80) return PR_FALSE
;
// ESC is taken as a sign of an ISO 2022 escape sequence.
530 if (c
== 0x1B) return PR_TRUE
;
536 hz_state
= hz_escaped
;
537 } else if (*input
== '~') {
538 // ~~ is the HZ encoding of ~. Skip over second ~ as well
543 hz_state
= hz_notpresent
;
// '}' closes the escape sequence that was opened earlier.
548 if (*input
== '}') hz_state
= hz_seen
;
555 return hz_state
== hz_seen
;
// UTF-8 encoding of U+FFFD, the Unicode replacement character
// (octets EF BF BD, written here as octal escapes).
#define REPLACEMENT_CHAR "\357\277\275"
560 // copy 'raw' sequences of octets in aInput to aOutput.
561 // If aDefaultCharset is specified, the input is assumed to be in the
562 // charset and converted to UTF-8. Otherwise, a blind copy is made.
563 // If aDefaultCharset is specified, but the conversion to UTF-8
564 // is not successful, each octet is replaced by Unicode replacement
565 // chars. The resulting octets are appended to aOutput.
// Appends |aLen| raw octets starting at |aInput| to |aOutput|.
//
// Without a default charset the bytes are appended unchanged. With one, the
// leading run of plain US-ASCII is appended byte by byte, stopping at ESC
// (0x1B), '~' or any 8-bit octet; the rest is converted from
// |aDefaultCharset| to UTF-8 through nsIUTF8ConverterService. If that
// conversion fails, one REPLACEMENT_CHAR is appended per remaining input
// octet.
//
// NOTE(review): the declaration of |c|, the early returns and the
// bookkeeping that adjusts aInput/aLen between the steps are not visible in
// this listing.
567 void CopyRawHeader(const char *aInput
, PRUint32 aLen
,
568 const char *aDefaultCharset
, nsACString
&aOutput
)
572 // If aDefaultCharset is not specified, make a blind copy.
573 if (!aDefaultCharset
|| !*aDefaultCharset
) {
574 aOutput
.Append(aInput
, aLen
);
578 // Copy as long as it's US-ASCII. An ESC may indicate ISO 2022
579 // A ~ may indicate it is HZ
580 while (aLen
&& (c
= PRUint8(*aInput
++)) != 0x1B && c
!= '~' && !(c
& 0x80)) {
581 aOutput
.Append(char(c
));
589 // skip ASCIIness/UTF8ness test if aInput is suspected to be a 7bit non-ascii
590 // string and aDefaultCharset is a 7bit non-ascii charset.
591 PRBool skipCheck
= (c
== 0x1B || c
== '~') &&
592 IS_7BIT_NON_ASCII_CHARSET(aDefaultCharset
);
594 // If not UTF-8, treat as default charset
595 nsCOMPtr
<nsIUTF8ConverterService
>
596 cvtUTF8(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID
));
597 nsCAutoString utf8Text
;
600 cvtUTF8
->ConvertStringToUTF8(Substring(aInput
, aInput
+ aLen
),
601 aDefaultCharset
, skipCheck
, utf8Text
))) {
602 aOutput
.Append(utf8Text
);
603 } else { // replace each octet with Unicode replacement char in UTF-8.
604 for (PRUint32 i
= 0; i
< aLen
; i
++) {
605 aOutput
.Append(REPLACEMENT_CHAR
);
// Characters that DecodeRFC2047Str rejects inside the charset portion of an
// encoded-word (it also rejects anything <= ' ' there).
static const char especials[] = "()<>@,;:\\\"/[]?.=";
612 // |decode_mime_part2_str| taken from comi18n.c
613 // Decode RFC2047-encoded words in the input and convert the result to UTF-8.
614 // If aOverrideCharset is true, charset in RFC2047-encoded words is
615 // ignored and aDefaultCharset is assumed, instead. aDefaultCharset
616 // is also used to convert raw octets (without RFC 2047 encoding) to UTF-8.
618 nsresult
DecodeRFC2047Str(const char *aHeader
, const char *aDefaultCharset
,
619 PRBool aOverrideCharset
, nsACString
&aResult
)
621 const char *p
, *q
, *r
;
623 const char *begin
; // tracking pointer for where we are in the input buffer
624 PRInt32 isLastEncodedWord
= 0;
625 const char *charsetStart
, *charsetEnd
;
628 // initialize charset name to an empty string
633 // To avoid buffer realloc, if possible, set capacity in advance. No
634 // matter what, more than 3x expansion can never happen for all charsets
635 // supported by Mozilla. SCSU/BCSU with the sliding window set to a
636 // non-BMP block may be exceptions, but Mozilla does not support them.
637 // Nor does any known mail/news program use them. Even if one did, we're
638 // safe because we don't use a raw char * buffer any more.
639 aResult
.SetCapacity(3 * strlen(aHeader
));
641 while ((p
= PL_strstr(begin
, "=?")) != 0) {
642 if (isLastEncodedWord
) {
643 // See if it's all whitespace.
644 for (q
= begin
; q
< p
; ++q
) {
645 if (!PL_strchr(" \t\r\n", *q
)) break;
649 if (!isLastEncodedWord
|| q
< p
) {
650 // copy the part before the encoded-word
651 CopyRawHeader(begin
, p
- begin
, aDefaultCharset
, aResult
);
660 for (q
= p
; *q
!= '?'; q
++) {
661 if (*q
<= ' ' || PL_strchr(especials
, *q
)) {
665 // RFC 2231 section 5
666 if (!charsetEnd
&& *q
== '*') {
674 // Check for too-long charset name
675 if (PRUint32(charsetEnd
- charsetStart
) >= sizeof(charset
))
678 memcpy(charset
, charsetStart
, charsetEnd
- charsetStart
);
679 charset
[charsetEnd
- charsetStart
] = 0;
682 if (*q
!= 'Q' && *q
!= 'q' && *q
!= 'B' && *q
!= 'b')
689 for (r
= q
+ 2; *r
!= '?'; r
++) {
690 if (*r
< ' ') goto badsyntax
;
694 else if (r
== q
+ 2) {
697 isLastEncodedWord
= 1;
701 if(*q
== 'Q' || *q
== 'q')
702 decodedText
= DecodeQ(q
+ 2, r
- (q
+ 2));
704 // bug 227290. ignore an extraneous '=' at the end.
705 // (# of characters in B-encoded part has to be a multiple of 4)
706 PRInt32 n
= r
- (q
+ 2);
707 n
-= (n
% 4 == 1 && !PL_strncmp(r
- 3, "===", 3)) ? 1 : 0;
708 decodedText
= PL_Base64Decode(q
+ 2, n
, nsnull
);
711 if (decodedText
== nsnull
)
714 // Override charset if requested. Never override labeled UTF-8.
715 // Use default charset instead of UNKNOWN-8BIT
716 if ((aOverrideCharset
&& 0 != nsCRT::strcasecmp(charset
, "UTF-8")) ||
717 (aDefaultCharset
&& 0 == nsCRT::strcasecmp(charset
, "UNKNOWN-8BIT"))) {
718 PL_strncpy(charset
, aDefaultCharset
, sizeof(charset
) - 1);
719 charset
[sizeof(charset
) - 1] = '\0';
723 nsCOMPtr
<nsIUTF8ConverterService
>
724 cvtUTF8(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID
));
725 nsCAutoString utf8Text
;
726 // skip ASCIIness/UTF8ness test if aCharset is 7bit non-ascii charset.
729 cvtUTF8
->ConvertStringToUTF8(nsDependentCString(decodedText
),
730 charset
, IS_7BIT_NON_ASCII_CHARSET(charset
), utf8Text
))) {
731 aResult
.Append(utf8Text
);
733 aResult
.Append(REPLACEMENT_CHAR
);
736 PR_Free(decodedText
);
738 isLastEncodedWord
= 1;
742 // copy the part before the encoded-word
743 aResult
.Append(begin
, p
- begin
);
745 isLastEncodedWord
= 0;
749 CopyRawHeader(begin
, strlen(begin
), aDefaultCharset
, aResult
);
751 nsCAutoString
tempStr(aResult
);
752 tempStr
.ReplaceChar('\t', ' ');