1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "net/http/http_content_disposition.h"
7 #include "base/base64.h"
8 #include "base/logging.h"
9 #include "base/strings/string_tokenizer.h"
10 #include "base/strings/string_util.h"
11 #include "base/strings/sys_string_conversions.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "net/base/net_string_util.h"
14 #include "net/base/net_util.h"
15 #include "net/http/http_util.h"
// Selects which RFC 2047 encoded-word encoding applies. Q_ENCODING and
// B_ENCODING, which are assigned to variables of this type further down, are
// its enumerators.
// NOTE(review): the enumerator list itself is not visible here - confirm.
21 enum RFC2047EncodingType
{
26 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
27 // decoding a quoted-printable string. Returns true if the input was valid.
// |input| is the encoded text that is scanned character by character.
// NOTE(review): the statement that stores the result into |output| is not
// visible here - confirm when it is written (presumably only on success).
28 bool DecodeQEncoding(const std::string
& input
, std::string
* output
) {
// Size the scratch buffer up front from the input length.
30 temp
.reserve(input
.size());
31 for (std::string::const_iterator it
= input
.begin(); it
!= input
.end();
// An equals sign starts a three-character escape (equals sign plus two hex
// digits). The length test comes first so that the reads of it + 1 and
// it + 2 in the same condition are only evaluated when they are in range.
35 } else if (*it
== '=') {
36 if ((input
.end() - it
< 3) ||
37 !IsHexDigit(static_cast<unsigned char>(*(it
+ 1))) ||
38 !IsHexDigit(static_cast<unsigned char>(*(it
+ 2))))
// Assemble one byte from the two hex digits, high nibble first.
40 unsigned char ch
= HexDigitToInt(*(it
+ 1)) * 16 +
41 HexDigitToInt(*(it
+ 2));
42 temp
.push_back(static_cast<char>(ch
));
45 } else if (0x20 < *it
&& *it
< 0x7F && *it
!= '?') {
46 // In a Q-encoded word, only printable ASCII characters
47 // represent themselves. Besides, space, '=', '_' and '?' are
48 // not allowed, but they're already filtered out.
61 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
62 // type is specified in |enc_type|.
// |part| is the encoded payload. |charset| is passed, together with the
// decoded bytes and |output|, to ConvertToUtf8(), whose return value is the
// result of this function on that path.
63 bool DecodeBQEncoding(const std::string
& part
,
64 RFC2047EncodingType enc_type
,
65 const std::string
& charset
,
66 std::string
* output
) {
// Pick the decoder from |enc_type|: Base64 for B_ENCODING, the Q decoder
// otherwise. The branch below is entered when the chosen decoder fails.
68 if (!((enc_type
== B_ENCODING
) ?
69 base::Base64Decode(part
, &decoded
) : DecodeQEncoding(part
, &decoded
))) {
// NOTE(review): an empty decoded payload is special-cased; the handling
// itself is not visible here - confirm.
73 if (decoded
.empty()) {
// Hand the decoded bytes and |charset| to the UTF-8 conversion helper.
78 return ConvertToUtf8(decoded
, charset
.c_str(), output
);
// Decodes a single word |encoded_word| of a parameter value into |*output|.
// The visible code handles three input shapes:
//   - non-ASCII input: taken as UTF-8 if valid, else converted using
//     |referrer_charset|, else using the native OS multibyte charset;
//   - an RFC 2047 encoded-word: split on the question mark and decoded with
//     DecodeBQEncoding(), with the outcome tracked through |*is_rfc2047|;
//   - %-escaped text: unescaped and accepted only if the result is UTF-8.
// Flags describing what was encountered are OR-ed into |*parse_result_flags|.
// NOTE(review): several parameter declarations and the return statements are
// not visible here - confirm the exact contract of the return value.
81 bool DecodeWord(const std::string
& encoded_word
,
82 const std::string
& referrer_charset
,
85 int* parse_result_flags
) {
88 if (encoded_word
.empty())
91 if (!base::IsStringASCII(encoded_word
)) {
92 // Try UTF-8, referrer_charset and the native OS default charset in turn.
93 if (base::IsStringUTF8(encoded_word
)) {
94 *output
= encoded_word
;
96 base::string16 utf16_output
;
// The referrer charset is only attempted when one was supplied.
97 if (!referrer_charset
.empty() &&
98 ConvertToUTF16(encoded_word
, referrer_charset
.c_str(),
100 *output
= base::UTF16ToUTF8(utf16_output
);
// Fallback: interpret the bytes in the native OS multibyte encoding.
102 *output
= base::WideToUTF8(base::SysNativeMBToWide(encoded_word
));
106 *parse_result_flags
|= HttpContentDisposition::HAS_NON_ASCII_STRINGS
;
110 // RFC 2047 : one of encoding methods supported by Firefox and relatively
111 // widely used by web servers.
112 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
113 // We don't care about the length restriction (72 bytes) because
114 // many web servers generate encoded words longer than the limit.
115 std::string decoded_word
;
// Split on the question mark; each piece is then handled according to its
// position (part_index) within the encoded-word layout shown above.
119 base::StringTokenizer
t(encoded_word
, "?");
// Q is the default encoding; it is switched to B below when the encoding
// field says so.
120 RFC2047EncodingType enc_type
= Q_ENCODING
;
// The loop stops as soon as *is_rfc2047 becomes false.
121 while (*is_rfc2047
&& t
.GetNext()) {
122 std::string part
= t
.token();
123 switch (part_index
) {
132 // Do we need charset validity check here?
// The encoding field fails this check when it is longer than one character
// or contains none of b, B, q, Q.
137 if (part
.size() > 1 ||
138 part
.find_first_of("bBqQ") == std::string::npos
) {
142 if (part
[0] == 'b' || part
[0] == 'B') {
143 enc_type
= B_ENCODING
;
148 *is_rfc2047
= DecodeBQEncoding(part
, enc_type
, charset
, &decoded_word
);
150 // Last minute failure. Invalid B/Q encoding. Rather than
151 // passing it through, return now.
158 // Another last minute failure !
159 // Likely to be a case of two encoded-words in a row or
160 // an encoded word followed by a non-encoded word. We can be
161 // generous, but it does not help much in terms of compatibility,
162 // I believe. Return immediately.
// The decoded text is only published when the word ends with an equals
// sign, matching the trailing ?= of the encoded-word layout.
175 if (*(encoded_word
.end() - 1) == '=') {
176 output
->swap(decoded_word
);
177 *parse_result_flags
|=
178 HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS
;
181 // encoded_word ending prematurely with '?' or extra '?'
186 // We're not handling 'especial' characters quoted with '\', but
187 // it should be Ok because we're not an email client but a
190 // What IE6/7 does: %-escaped UTF-8.
191 decoded_word
= UnescapeURLComponent(encoded_word
, UnescapeRule::SPACES
);
// Any difference between input and unescaped text is recorded as
// percent-encoding having been present.
192 if (decoded_word
!= encoded_word
)
193 *parse_result_flags
|= HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS
;
// The unescaped form is only adopted when it is valid UTF-8.
194 if (base::IsStringUTF8(decoded_word
)) {
195 output
->swap(decoded_word
);
197 // We can try either the OS default charset or 'origin charset' here,
198 // As far as I can tell, IE does not support it. However, I've seen
199 // web servers emit %-escaped string in a legacy encoding (usually
201 // TODO(jungshik) : Test IE further and consider adding a fallback here.
206 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
207 // value is supposed to be of the form:
209 // value = token | quoted-string
211 // However we currently also allow RFC 2047 encoding and non-ASCII
212 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
// The value is split on whitespace, each word is passed to DecodeWord(), and
// the pieces are concatenated. The result is swapped into |*output|. Flags
// gathered while decoding are merged into |*parse_result_flags| only when
// that pointer is non-null and the decoded output is non-empty.
213 bool DecodeFilenameValue(const std::string
& input
,
214 const std::string
& referrer_charset
,
216 int* parse_result_flags
) {
// Flags are accumulated locally and only published at the end.
217 int current_parse_result_flags
= 0;
218 std::string decoded_value
;
// Initialized to true, so whitespace seen before any word has been decoded
// does not add a space.
219 bool is_previous_token_rfc2047
= true;
221 // Tokenize with whitespace characters.
222 base::StringTokenizer
t(input
, " \t\n\r");
// Delimiters are returned as tokens too, so that token_is_delim() below can
// tell whitespace apart from words.
223 t
.set_options(base::StringTokenizer::RETURN_DELIMS
);
224 while (t
.GetNext()) {
225 if (t
.token_is_delim()) {
226 // If the previous non-delimiter token is not RFC2047-encoded,
227 // put in a space in its place. Otherwise, skip over it.
228 if (!is_previous_token_rfc2047
)
229 decoded_value
.push_back(' ');
232 // We don't support a single multibyte character split into
233 // adjacent encoded words. Some broken mail clients emit headers
234 // with that problem, but most web servers usually encode a filename
235 // in a single encoded-word. Firefox/Thunderbird do not support
// NOTE(review): the last argument of the call below reads as a mis-encoded
// form of &current_parse_result_flags (an HTML entity collapsed into a
// currency sign) - confirm against the upstream file and repair.
238 if (!DecodeWord(t
.token(), referrer_charset
, &is_previous_token_rfc2047
,
239 &decoded
, ¤t_parse_result_flags
))
241 decoded_value
.append(decoded
);
243 output
->swap(decoded_value
);
// Flags are reported only when the caller supplied a destination for them
// and a non-empty value was produced.
244 if (parse_result_flags
&& !output
->empty())
245 *parse_result_flags
|= current_parse_result_flags
;
249 // Parses the charset and value-chars out of an ext-value string.
251 // ext-value = charset "'" [ language ] "'" value-chars
// Components are collected into temporaries and swapped into |*charset| and
// |*value_chars| at the end, after the delimiter-count and emptiness checks.
// NOTE(review): the return statements are not visible here - presumably the
// two checks near the end return false; confirm.
252 bool ParseExtValueComponents(const std::string
& input
,
253 std::string
* charset
,
254 std::string
* value_chars
) {
// Split on the single quote, and have the tokenizer return the delimiters as
// tokens too so that token_is_delim() can recognize them.
255 base::StringTokenizer
t(input
, "'");
256 t
.set_options(base::StringTokenizer::RETURN_DELIMS
);
257 std::string temp_charset
;
258 std::string temp_value
;
259 int numDelimsSeen
= 0;
260 while (t
.GetNext()) {
261 if (t
.token_is_delim()) {
// For a non-delimiter token, the number of delimiters seen so far selects
// which component of the grammar above the token belongs to.
265 switch (numDelimsSeen
) {
267 temp_charset
= t
.token();
270 // Language is ignored.
273 temp_value
= t
.token();
// The grammar above contains exactly two single-quote delimiters.
280 if (numDelimsSeen
!= 2)
// Charset and value-chars are mandatory; only the language part is optional
// in the grammar above.
282 if (temp_charset
.empty() || temp_value
.empty())
284 charset
->swap(temp_charset
);
285 value_chars
->swap(temp_value
);
// Decodes an RFC 5987 ext-value given as |param_value|. The charset and
// value-chars are split out with ParseExtValueComponents(), the value-chars
// are percent-unescaped, and the result is passed with the charset to
// ConvertToUtf8AndNormalize(), which writes into |decoded| and whose return
// value is the result of this function on that path.
289 // http://tools.ietf.org/html/rfc5987#section-3.2
291 // ext-value = charset "'" [ language ] "'" value-chars
293 // charset = "UTF-8" / "ISO-8859-1" / mime-charset
295 // mime-charset = 1*mime-charsetc
296 // mime-charsetc = ALPHA / DIGIT
297 // / "!" / "#" / "$" / "%" / "&"
298 // / "+" / "-" / "^" / "_" / "`"
301 // language = <Language-Tag, defined in [RFC5646], Section 2.1>
303 // value-chars = *( pct-encoded / attr-char )
305 // pct-encoded = "%" HEXDIG HEXDIG
307 // attr-char = ALPHA / DIGIT
308 // / "!" / "#" / "$" / "&" / "+" / "-" / "."
309 // / "^" / "_" / "`" / "|" / "~"
310 bool DecodeExtValue(const std::string
& param_value
, std::string
* decoded
) {
// NOTE(review): a double quote anywhere in |param_value| triggers this
// branch; the action taken is not visible here (presumably the value is
// rejected, since the grammar above has no quoted form) - confirm.
311 if (param_value
.find('"') != std::string::npos
)
316 if (!ParseExtValueComponents(param_value
, &charset
, &value
))
319 // RFC 5987 value should be ASCII-only.
320 if (!base::IsStringASCII(value
)) {
// Undo the percent-encoding, including escaped spaces and URL-special
// characters, before converting from the declared charset.
325 std::string unescaped
= UnescapeURLComponent(
326 value
, UnescapeRule::SPACES
| UnescapeRule::URL_SPECIAL_CHARS
);
328 return ConvertToUtf8AndNormalize(unescaped
, charset
.c_str(), decoded
);
// Builds the object from a raw Content-Disposition |header| value.
// parse_result_flags_ starts as INVALID and the header is parsed immediately,
// with |referrer_charset| forwarded to Parse().
333 HttpContentDisposition::HttpContentDisposition(
334 const std::string
& header
, const std::string
& referrer_charset
)
336 parse_result_flags_(INVALID
) {
337 Parse(header
, referrer_charset
);
// Destructor.
// NOTE(review): no cleanup statements are visible here - confirm it is empty.
340 HttpContentDisposition::~HttpContentDisposition() {
// Examines the leading part of the header in [begin, end), up to the first
// semicolon, as a candidate disposition-type and records what was found in
// parse_result_flags_.
// NOTE(review): the return statements and the assignments to type_ are not
// visible here - presumably the returned iterator is where parameter parsing
// should resume; confirm.
343 std::string::const_iterator
HttpContentDisposition::ConsumeDispositionType(
344 std::string::const_iterator begin
, std::string::const_iterator end
) {
// The object is expected to still hold its default type on entry.
345 DCHECK(type_
== INLINE
);
// The candidate type is everything before the first semicolon (or the whole
// range when there is none, since std::find then returns |end|).
346 std::string::const_iterator delimiter
= std::find(begin
, end
, ';');
348 std::string::const_iterator type_begin
= begin
;
349 std::string::const_iterator type_end
= delimiter
;
// Narrow the candidate range by trimming linear whitespace at both ends.
350 HttpUtil::TrimLWS(&type_begin
, &type_end
);
352 // If the disposition-type isn't a valid token then the
353 // Content-Disposition header is malformed, and we treat the first bytes as
354 // a parameter rather than a disposition-type.
355 if (!HttpUtil::IsToken(type_begin
, type_end
))
358 parse_result_flags_
|= HAS_DISPOSITION_TYPE
;
// Sanity check: a range that passed IsToken() is expected to contain no
// equals sign, so it cannot be a name=value pair.
360 DCHECK(std::find(type_begin
, type_end
, '=') == type_end
);
// The known types are matched through LowerCaseEqualsASCII(); a type that is
// neither of them is flagged as unknown.
362 if (base::LowerCaseEqualsASCII(type_begin
, type_end
, "inline")) {
364 } else if (base::LowerCaseEqualsASCII(type_begin
, type_end
, "attachment")) {
367 parse_result_flags_
|= HAS_UNKNOWN_DISPOSITION_TYPE
;
// Parses |header| according to the grammar below: first the disposition-type
// via ConsumeDispositionType(), then the semicolon-separated parameters.
// Results are stored in filename_ and parse_result_flags_. |referrer_charset|
// is forwarded to DecodeFilenameValue().
373 // http://tools.ietf.org/html/rfc6266
375 // content-disposition = "Content-Disposition" ":"
376 // disposition-type *( ";" disposition-parm )
378 // disposition-type = "inline" | "attachment" | disp-ext-type
379 // ; case-insensitive
380 // disp-ext-type = token
382 // disposition-parm = filename-parm | disp-ext-parm
384 // filename-parm = "filename" "=" value
385 // | "filename*" "=" ext-value
387 // disp-ext-parm = token "=" value
388 // | ext-token "=" ext-value
389 // ext-token = <the characters in token, followed by "*">
391 void HttpContentDisposition::Parse(const std::string
& header
,
392 const std::string
& referrer_charset
) {
// Parsing is expected to start from the default state.
393 DCHECK(type_
== INLINE
);
394 DCHECK(filename_
.empty());
396 std::string::const_iterator pos
= header
.begin();
397 std::string::const_iterator end
= header
.end();
// Parameter iteration starts from the position this call returns.
398 pos
= ConsumeDispositionType(pos
, end
);
// The plain and the extended filename are decoded into separate strings so
// that a preference can be applied after the loop.
400 std::string filename
;
401 std::string ext_filename
;
403 HttpUtil::NameValuePairsIterator
iter(pos
, end
, ';');
404 while (iter
.GetNext()) {
// Each branch is guarded by its target string still being empty, so once a
// parameter has produced a non-empty value later ones of that kind are not
// decoded.
// NOTE(review): the parameter-name literals compared against are not visible
// here - presumably the two filename-parm names from the grammar; confirm.
405 if (filename
.empty() &&
406 base::LowerCaseEqualsASCII(iter
.name_begin(), iter
.name_end(),
408 DecodeFilenameValue(iter
.value(), referrer_charset
, &filename
,
409 &parse_result_flags_
);
410 if (!filename
.empty())
411 parse_result_flags_
|= HAS_FILENAME
;
412 } else if (ext_filename
.empty() &&
413 base::LowerCaseEqualsASCII(iter
.name_begin(), iter
.name_end(),
// The extended form is decoded from raw_value() rather than value().
415 DecodeExtValue(iter
.raw_value(), &ext_filename
);
416 if (!ext_filename
.empty())
417 parse_result_flags_
|= HAS_EXT_FILENAME
;
// A non-empty extended filename is stored in filename_ here.
// NOTE(review): the else line is not visible - presumably the plain filename
// below is only the fallback; confirm.
421 if (!ext_filename
.empty())
422 filename_
= ext_filename
;
424 filename_
= filename
;