Roll src/third_party/WebKit d9c6159:8139f33 (svn 201974:201975)
[chromium-blink-merge.git] / net / http / http_content_disposition.cc
blob28ea8bd5547da89077b0ce3d10e4396ac1a9baa1
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "net/http/http_content_disposition.h"
7 #include "base/base64.h"
8 #include "base/logging.h"
9 #include "base/strings/string_tokenizer.h"
10 #include "base/strings/string_util.h"
11 #include "base/strings/sys_string_conversions.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "net/base/net_string_util.h"
14 #include "net/base/net_util.h"
15 #include "net/http/http_util.h"
17 namespace net {
19 namespace {
// The two transfer encodings an RFC 2047 encoded-word may declare in its
// "encoding" field (=?charset?<encoding>?text?=).
enum RFC2047EncodingType {
  Q_ENCODING,  // "Q" / "q": decoded by DecodeQEncoding().
  B_ENCODING   // "B" / "b": decoded with base::Base64Decode().
};
26 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
27 // decoding a quoted-printable string. Returns true if the input was valid.
28 bool DecodeQEncoding(const std::string& input, std::string* output) {
29 std::string temp;
30 temp.reserve(input.size());
31 for (std::string::const_iterator it = input.begin(); it != input.end();
32 ++it) {
33 if (*it == '_') {
34 temp.push_back(' ');
35 } else if (*it == '=') {
36 if ((input.end() - it < 3) ||
37 !base::IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
38 !base::IsHexDigit(static_cast<unsigned char>(*(it + 2))))
39 return false;
40 unsigned char ch =
41 base::HexDigitToInt(*(it + 1)) * 16 + base::HexDigitToInt(*(it + 2));
42 temp.push_back(static_cast<char>(ch));
43 ++it;
44 ++it;
45 } else if (0x20 < *it && *it < 0x7F && *it != '?') {
46 // In a Q-encoded word, only printable ASCII characters
47 // represent themselves. Besides, space, '=', '_' and '?' are
48 // not allowed, but they're already filtered out.
49 DCHECK_NE('=', *it);
50 DCHECK_NE('?', *it);
51 DCHECK_NE('_', *it);
52 temp.push_back(*it);
53 } else {
54 return false;
57 output->swap(temp);
58 return true;
61 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
62 // type is specified in |enc_type|.
63 bool DecodeBQEncoding(const std::string& part,
64 RFC2047EncodingType enc_type,
65 const std::string& charset,
66 std::string* output) {
67 std::string decoded;
68 if (!((enc_type == B_ENCODING) ?
69 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
70 return false;
73 if (decoded.empty()) {
74 output->clear();
75 return true;
78 return ConvertToUtf8(decoded, charset.c_str(), output);
81 bool DecodeWord(const std::string& encoded_word,
82 const std::string& referrer_charset,
83 bool* is_rfc2047,
84 std::string* output,
85 int* parse_result_flags) {
86 *is_rfc2047 = false;
87 output->clear();
88 if (encoded_word.empty())
89 return true;
91 if (!base::IsStringASCII(encoded_word)) {
92 // Try UTF-8, referrer_charset and the native OS default charset in turn.
93 if (base::IsStringUTF8(encoded_word)) {
94 *output = encoded_word;
95 } else {
96 base::string16 utf16_output;
97 if (!referrer_charset.empty() &&
98 ConvertToUTF16(encoded_word, referrer_charset.c_str(),
99 &utf16_output)) {
100 *output = base::UTF16ToUTF8(utf16_output);
101 } else {
102 *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
106 *parse_result_flags |= HttpContentDisposition::HAS_NON_ASCII_STRINGS;
107 return true;
110 // RFC 2047 : one of encoding methods supported by Firefox and relatively
111 // widely used by web servers.
112 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
113 // We don't care about the length restriction (72 bytes) because
114 // many web servers generate encoded words longer than the limit.
115 std::string decoded_word;
116 *is_rfc2047 = true;
117 int part_index = 0;
118 std::string charset;
119 base::StringTokenizer t(encoded_word, "?");
120 RFC2047EncodingType enc_type = Q_ENCODING;
121 while (*is_rfc2047 && t.GetNext()) {
122 std::string part = t.token();
123 switch (part_index) {
124 case 0:
125 if (part != "=") {
126 *is_rfc2047 = false;
127 break;
129 ++part_index;
130 break;
131 case 1:
132 // Do we need charset validity check here?
133 charset = part;
134 ++part_index;
135 break;
136 case 2:
137 if (part.size() > 1 ||
138 part.find_first_of("bBqQ") == std::string::npos) {
139 *is_rfc2047 = false;
140 break;
142 if (part[0] == 'b' || part[0] == 'B') {
143 enc_type = B_ENCODING;
145 ++part_index;
146 break;
147 case 3:
148 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
149 if (!*is_rfc2047) {
150 // Last minute failure. Invalid B/Q encoding. Rather than
151 // passing it through, return now.
152 return false;
154 ++part_index;
155 break;
156 case 4:
157 if (part != "=") {
158 // Another last minute failure !
159 // Likely to be a case of two encoded-words in a row or
160 // an encoded word followed by a non-encoded word. We can be
161 // generous, but it does not help much in terms of compatibility,
162 // I believe. Return immediately.
163 *is_rfc2047 = false;
164 return false;
166 ++part_index;
167 break;
168 default:
169 *is_rfc2047 = false;
170 return false;
174 if (*is_rfc2047) {
175 if (*(encoded_word.end() - 1) == '=') {
176 output->swap(decoded_word);
177 *parse_result_flags |=
178 HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
179 return true;
181 // encoded_word ending prematurelly with '?' or extra '?'
182 *is_rfc2047 = false;
183 return false;
186 // We're not handling 'especial' characters quoted with '\', but
187 // it should be Ok because we're not an email client but a
188 // web browser.
190 // What IE6/7 does: %-escaped UTF-8.
191 decoded_word = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
192 if (decoded_word != encoded_word)
193 *parse_result_flags |= HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
194 if (base::IsStringUTF8(decoded_word)) {
195 output->swap(decoded_word);
196 return true;
197 // We can try either the OS default charset or 'origin charset' here,
198 // As far as I can tell, IE does not support it. However, I've seen
199 // web servers emit %-escaped string in a legacy encoding (usually
200 // origin charset).
201 // TODO(jungshik) : Test IE further and consider adding a fallback here.
203 return false;
206 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
207 // value is supposed to be of the form:
209 // value = token | quoted-string
211 // However we currently also allow RFC 2047 encoding and non-ASCII
212 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
213 bool DecodeFilenameValue(const std::string& input,
214 const std::string& referrer_charset,
215 std::string* output,
216 int* parse_result_flags) {
217 int current_parse_result_flags = 0;
218 std::string decoded_value;
219 bool is_previous_token_rfc2047 = true;
221 // Tokenize with whitespace characters.
222 base::StringTokenizer t(input, " \t\n\r");
223 t.set_options(base::StringTokenizer::RETURN_DELIMS);
224 while (t.GetNext()) {
225 if (t.token_is_delim()) {
226 // If the previous non-delimeter token is not RFC2047-encoded,
227 // put in a space in its place. Otheriwse, skip over it.
228 if (!is_previous_token_rfc2047)
229 decoded_value.push_back(' ');
230 continue;
232 // We don't support a single multibyte character split into
233 // adjacent encoded words. Some broken mail clients emit headers
234 // with that problem, but most web servers usually encode a filename
235 // in a single encoded-word. Firefox/Thunderbird do not support
236 // it, either.
237 std::string decoded;
238 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
239 &decoded, &current_parse_result_flags))
240 return false;
241 decoded_value.append(decoded);
243 output->swap(decoded_value);
244 if (parse_result_flags && !output->empty())
245 *parse_result_flags |= current_parse_result_flags;
246 return true;
249 // Parses the charset and value-chars out of an ext-value string.
251 // ext-value = charset "'" [ language ] "'" value-chars
252 bool ParseExtValueComponents(const std::string& input,
253 std::string* charset,
254 std::string* value_chars) {
255 base::StringTokenizer t(input, "'");
256 t.set_options(base::StringTokenizer::RETURN_DELIMS);
257 std::string temp_charset;
258 std::string temp_value;
259 int numDelimsSeen = 0;
260 while (t.GetNext()) {
261 if (t.token_is_delim()) {
262 ++numDelimsSeen;
263 continue;
264 } else {
265 switch (numDelimsSeen) {
266 case 0:
267 temp_charset = t.token();
268 break;
269 case 1:
270 // Language is ignored.
271 break;
272 case 2:
273 temp_value = t.token();
274 break;
275 default:
276 return false;
280 if (numDelimsSeen != 2)
281 return false;
282 if (temp_charset.empty() || temp_value.empty())
283 return false;
284 charset->swap(temp_charset);
285 value_chars->swap(temp_value);
286 return true;
289 // http://tools.ietf.org/html/rfc5987#section-3.2
291 // ext-value = charset "'" [ language ] "'" value-chars
293 // charset = "UTF-8" / "ISO-8859-1" / mime-charset
295 // mime-charset = 1*mime-charsetc
296 // mime-charsetc = ALPHA / DIGIT
297 // / "!" / "#" / "$" / "%" / "&"
298 // / "+" / "-" / "^" / "_" / "`"
299 // / "{" / "}" / "~"
301 // language = <Language-Tag, defined in [RFC5646], Section 2.1>
303 // value-chars = *( pct-encoded / attr-char )
305 // pct-encoded = "%" HEXDIG HEXDIG
307 // attr-char = ALPHA / DIGIT
308 // / "!" / "#" / "$" / "&" / "+" / "-" / "."
309 // / "^" / "_" / "`" / "|" / "~"
310 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
311 if (param_value.find('"') != std::string::npos)
312 return false;
314 std::string charset;
315 std::string value;
316 if (!ParseExtValueComponents(param_value, &charset, &value))
317 return false;
319 // RFC 5987 value should be ASCII-only.
320 if (!base::IsStringASCII(value)) {
321 decoded->clear();
322 return true;
325 std::string unescaped = UnescapeURLComponent(
326 value, UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
328 return ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
331 } // namespace
333 HttpContentDisposition::HttpContentDisposition(
334 const std::string& header, const std::string& referrer_charset)
335 : type_(INLINE),
336 parse_result_flags_(INVALID) {
337 Parse(header, referrer_charset);
340 HttpContentDisposition::~HttpContentDisposition() {
343 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
344 std::string::const_iterator begin, std::string::const_iterator end) {
345 DCHECK(type_ == INLINE);
346 std::string::const_iterator delimiter = std::find(begin, end, ';');
348 std::string::const_iterator type_begin = begin;
349 std::string::const_iterator type_end = delimiter;
350 HttpUtil::TrimLWS(&type_begin, &type_end);
352 // If the disposition-type isn't a valid token the then the
353 // Content-Disposition header is malformed, and we treat the first bytes as
354 // a parameter rather than a disposition-type.
355 if (!HttpUtil::IsToken(type_begin, type_end))
356 return begin;
358 parse_result_flags_ |= HAS_DISPOSITION_TYPE;
360 DCHECK(std::find(type_begin, type_end, '=') == type_end);
362 if (base::LowerCaseEqualsASCII(base::StringPiece(type_begin, type_end),
363 "inline")) {
364 type_ = INLINE;
365 } else if (base::LowerCaseEqualsASCII(base::StringPiece(type_begin, type_end),
366 "attachment")) {
367 type_ = ATTACHMENT;
368 } else {
369 parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
370 type_ = ATTACHMENT;
372 return delimiter;
375 // http://tools.ietf.org/html/rfc6266
377 // content-disposition = "Content-Disposition" ":"
378 // disposition-type *( ";" disposition-parm )
380 // disposition-type = "inline" | "attachment" | disp-ext-type
381 // ; case-insensitive
382 // disp-ext-type = token
384 // disposition-parm = filename-parm | disp-ext-parm
386 // filename-parm = "filename" "=" value
387 // | "filename*" "=" ext-value
389 // disp-ext-parm = token "=" value
390 // | ext-token "=" ext-value
391 // ext-token = <the characters in token, followed by "*">
393 void HttpContentDisposition::Parse(const std::string& header,
394 const std::string& referrer_charset) {
395 DCHECK(type_ == INLINE);
396 DCHECK(filename_.empty());
398 std::string::const_iterator pos = header.begin();
399 std::string::const_iterator end = header.end();
400 pos = ConsumeDispositionType(pos, end);
402 std::string filename;
403 std::string ext_filename;
405 HttpUtil::NameValuePairsIterator iter(pos, end, ';');
406 while (iter.GetNext()) {
407 if (filename.empty() &&
408 base::LowerCaseEqualsASCII(
409 base::StringPiece(iter.name_begin(), iter.name_end()),
410 "filename")) {
411 DecodeFilenameValue(iter.value(), referrer_charset, &filename,
412 &parse_result_flags_);
413 if (!filename.empty())
414 parse_result_flags_ |= HAS_FILENAME;
415 } else if (ext_filename.empty() &&
416 base::LowerCaseEqualsASCII(
417 base::StringPiece(iter.name_begin(), iter.name_end()),
418 "filename*")) {
419 DecodeExtValue(iter.raw_value(), &ext_filename);
420 if (!ext_filename.empty())
421 parse_result_flags_ |= HAS_EXT_FILENAME;
425 if (!ext_filename.empty())
426 filename_ = ext_filename;
427 else
428 filename_ = filename;
431 } // namespace net