Update include paths in miscellaneous content/ directories for base/process changes.
[chromium-blink-merge.git] / net / http / http_content_disposition.cc
blob3dbf234b9435a53dca5158151d944cb298737953
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "net/http/http_content_disposition.h"
7 #include "base/base64.h"
8 #include "base/i18n/icu_string_conversions.h"
9 #include "base/logging.h"
10 #include "base/strings/string_tokenizer.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/sys_string_conversions.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "net/base/net_util.h"
15 #include "net/http/http_util.h"
16 #include "third_party/icu/source/common/unicode/ucnv.h"
18 namespace {
// Selects which RFC 2047 transfer encoding an encoded-word payload uses.
enum RFC2047EncodingType {
  Q_ENCODING = 0,  // "Q" encoding; handled by DecodeQEncoding().
  B_ENCODING = 1   // "B" encoding; handled by base::Base64Decode().
};
25 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
26 // decoding a quoted-printable string. Returns true if the input was valid.
27 bool DecodeQEncoding(const std::string& input, std::string* output) {
28 std::string temp;
29 temp.reserve(input.size());
30 for (std::string::const_iterator it = input.begin(); it != input.end();
31 ++it) {
32 if (*it == '_') {
33 temp.push_back(' ');
34 } else if (*it == '=') {
35 if ((input.end() - it < 3) ||
36 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
37 !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
38 return false;
39 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
40 HexDigitToInt(*(it + 2));
41 temp.push_back(static_cast<char>(ch));
42 ++it;
43 ++it;
44 } else if (0x20 < *it && *it < 0x7F && *it != '?') {
45 // In a Q-encoded word, only printable ASCII characters
46 // represent themselves. Besides, space, '=', '_' and '?' are
47 // not allowed, but they're already filtered out.
48 DCHECK_NE('=', *it);
49 DCHECK_NE('?', *it);
50 DCHECK_NE('_', *it);
51 temp.push_back(*it);
52 } else {
53 return false;
56 output->swap(temp);
57 return true;
60 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
61 // type is specified in |enc_type|.
62 bool DecodeBQEncoding(const std::string& part,
63 RFC2047EncodingType enc_type,
64 const std::string& charset,
65 std::string* output) {
66 std::string decoded;
67 if (!((enc_type == B_ENCODING) ?
68 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded)))
69 return false;
71 if (decoded.empty()) {
72 output->clear();
73 return true;
76 UErrorCode err = U_ZERO_ERROR;
77 UConverter* converter(ucnv_open(charset.c_str(), &err));
78 if (U_FAILURE(err))
79 return false;
81 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
82 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
83 // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a
84 // trailing '\0'.
85 size_t output_length = decoded.length() * 3 + 1;
86 char* buf = WriteInto(output, output_length);
87 output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,
88 decoded.data(), decoded.length(), &err);
89 ucnv_close(converter);
90 if (U_FAILURE(err))
91 return false;
92 output->resize(output_length);
93 return true;
96 bool DecodeWord(const std::string& encoded_word,
97 const std::string& referrer_charset,
98 bool* is_rfc2047,
99 std::string* output,
100 int* parse_result_flags) {
101 *is_rfc2047 = false;
102 output->clear();
103 if (encoded_word.empty())
104 return true;
106 if (!IsStringASCII(encoded_word)) {
107 // Try UTF-8, referrer_charset and the native OS default charset in turn.
108 if (IsStringUTF8(encoded_word)) {
109 *output = encoded_word;
110 } else {
111 base::string16 utf16_output;
112 if (!referrer_charset.empty() &&
113 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
114 base::OnStringConversionError::FAIL,
115 &utf16_output)) {
116 *output = UTF16ToUTF8(utf16_output);
117 } else {
118 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
122 *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS;
123 return true;
126 // RFC 2047 : one of encoding methods supported by Firefox and relatively
127 // widely used by web servers.
128 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
129 // We don't care about the length restriction (72 bytes) because
130 // many web servers generate encoded words longer than the limit.
131 std::string decoded_word;
132 *is_rfc2047 = true;
133 int part_index = 0;
134 std::string charset;
135 base::StringTokenizer t(encoded_word, "?");
136 RFC2047EncodingType enc_type = Q_ENCODING;
137 while (*is_rfc2047 && t.GetNext()) {
138 std::string part = t.token();
139 switch (part_index) {
140 case 0:
141 if (part != "=") {
142 *is_rfc2047 = false;
143 break;
145 ++part_index;
146 break;
147 case 1:
148 // Do we need charset validity check here?
149 charset = part;
150 ++part_index;
151 break;
152 case 2:
153 if (part.size() > 1 ||
154 part.find_first_of("bBqQ") == std::string::npos) {
155 *is_rfc2047 = false;
156 break;
158 if (part[0] == 'b' || part[0] == 'B') {
159 enc_type = B_ENCODING;
161 ++part_index;
162 break;
163 case 3:
164 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
165 if (!*is_rfc2047) {
166 // Last minute failure. Invalid B/Q encoding. Rather than
167 // passing it through, return now.
168 return false;
170 ++part_index;
171 break;
172 case 4:
173 if (part != "=") {
174 // Another last minute failure !
175 // Likely to be a case of two encoded-words in a row or
176 // an encoded word followed by a non-encoded word. We can be
177 // generous, but it does not help much in terms of compatibility,
178 // I believe. Return immediately.
179 *is_rfc2047 = false;
180 return false;
182 ++part_index;
183 break;
184 default:
185 *is_rfc2047 = false;
186 return false;
190 if (*is_rfc2047) {
191 if (*(encoded_word.end() - 1) == '=') {
192 output->swap(decoded_word);
193 *parse_result_flags |=
194 net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
195 return true;
197 // encoded_word ending prematurelly with '?' or extra '?'
198 *is_rfc2047 = false;
199 return false;
202 // We're not handling 'especial' characters quoted with '\', but
203 // it should be Ok because we're not an email client but a
204 // web browser.
206 // What IE6/7 does: %-escaped UTF-8.
207 decoded_word = net::UnescapeURLComponent(encoded_word,
208 net::UnescapeRule::SPACES);
209 if (decoded_word != encoded_word)
210 *parse_result_flags |=
211 net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
212 if (IsStringUTF8(decoded_word)) {
213 output->swap(decoded_word);
214 return true;
215 // We can try either the OS default charset or 'origin charset' here,
216 // As far as I can tell, IE does not support it. However, I've seen
217 // web servers emit %-escaped string in a legacy encoding (usually
218 // origin charset).
219 // TODO(jungshik) : Test IE further and consider adding a fallback here.
221 return false;
224 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
225 // value is supposed to be of the form:
227 // value = token | quoted-string
229 // However we currently also allow RFC 2047 encoding and non-ASCII
230 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
231 bool DecodeFilenameValue(const std::string& input,
232 const std::string& referrer_charset,
233 std::string* output,
234 int* parse_result_flags) {
235 int current_parse_result_flags = 0;
236 std::string decoded_value;
237 bool is_previous_token_rfc2047 = true;
239 // Tokenize with whitespace characters.
240 base::StringTokenizer t(input, " \t\n\r");
241 t.set_options(base::StringTokenizer::RETURN_DELIMS);
242 while (t.GetNext()) {
243 if (t.token_is_delim()) {
244 // If the previous non-delimeter token is not RFC2047-encoded,
245 // put in a space in its place. Otheriwse, skip over it.
246 if (!is_previous_token_rfc2047)
247 decoded_value.push_back(' ');
248 continue;
250 // We don't support a single multibyte character split into
251 // adjacent encoded words. Some broken mail clients emit headers
252 // with that problem, but most web servers usually encode a filename
253 // in a single encoded-word. Firefox/Thunderbird do not support
254 // it, either.
255 std::string decoded;
256 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
257 &decoded, &current_parse_result_flags))
258 return false;
259 decoded_value.append(decoded);
261 output->swap(decoded_value);
262 if (parse_result_flags && !output->empty())
263 *parse_result_flags |= current_parse_result_flags;
264 return true;
267 // Parses the charset and value-chars out of an ext-value string.
269 // ext-value = charset "'" [ language ] "'" value-chars
270 bool ParseExtValueComponents(const std::string& input,
271 std::string* charset,
272 std::string* value_chars) {
273 base::StringTokenizer t(input, "'");
274 t.set_options(base::StringTokenizer::RETURN_DELIMS);
275 std::string temp_charset;
276 std::string temp_value;
277 int numDelimsSeen = 0;
278 while (t.GetNext()) {
279 if (t.token_is_delim()) {
280 ++numDelimsSeen;
281 continue;
282 } else {
283 switch (numDelimsSeen) {
284 case 0:
285 temp_charset = t.token();
286 break;
287 case 1:
288 // Language is ignored.
289 break;
290 case 2:
291 temp_value = t.token();
292 break;
293 default:
294 return false;
298 if (numDelimsSeen != 2)
299 return false;
300 if (temp_charset.empty() || temp_value.empty())
301 return false;
302 charset->swap(temp_charset);
303 value_chars->swap(temp_value);
304 return true;
307 // http://tools.ietf.org/html/rfc5987#section-3.2
309 // ext-value = charset "'" [ language ] "'" value-chars
311 // charset = "UTF-8" / "ISO-8859-1" / mime-charset
313 // mime-charset = 1*mime-charsetc
314 // mime-charsetc = ALPHA / DIGIT
315 // / "!" / "#" / "$" / "%" / "&"
316 // / "+" / "-" / "^" / "_" / "`"
317 // / "{" / "}" / "~"
319 // language = <Language-Tag, defined in [RFC5646], Section 2.1>
321 // value-chars = *( pct-encoded / attr-char )
323 // pct-encoded = "%" HEXDIG HEXDIG
325 // attr-char = ALPHA / DIGIT
326 // / "!" / "#" / "$" / "&" / "+" / "-" / "."
327 // / "^" / "_" / "`" / "|" / "~"
328 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
329 if (param_value.find('"') != std::string::npos)
330 return false;
332 std::string charset;
333 std::string value;
334 if (!ParseExtValueComponents(param_value, &charset, &value))
335 return false;
337 // RFC 5987 value should be ASCII-only.
338 if (!IsStringASCII(value)) {
339 decoded->clear();
340 return true;
343 std::string unescaped = net::UnescapeURLComponent(
344 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
346 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
349 } // namespace
351 namespace net {
353 HttpContentDisposition::HttpContentDisposition(
354 const std::string& header, const std::string& referrer_charset)
355 : type_(INLINE),
356 parse_result_flags_(INVALID) {
357 Parse(header, referrer_charset);
360 HttpContentDisposition::~HttpContentDisposition() {
363 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
364 std::string::const_iterator begin, std::string::const_iterator end) {
365 DCHECK(type_ == INLINE);
366 std::string::const_iterator delimiter = std::find(begin, end, ';');
368 std::string::const_iterator type_begin = begin;
369 std::string::const_iterator type_end = delimiter;
370 HttpUtil::TrimLWS(&type_begin, &type_end);
372 // If the disposition-type isn't a valid token the then the
373 // Content-Disposition header is malformed, and we treat the first bytes as
374 // a parameter rather than a disposition-type.
375 if (!HttpUtil::IsToken(type_begin, type_end))
376 return begin;
378 parse_result_flags_ |= HAS_DISPOSITION_TYPE;
380 DCHECK(std::find(type_begin, type_end, '=') == type_end);
382 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
383 type_ = INLINE;
384 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
385 type_ = ATTACHMENT;
386 } else {
387 parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
388 type_ = ATTACHMENT;
390 return delimiter;
393 // http://tools.ietf.org/html/rfc6266
395 // content-disposition = "Content-Disposition" ":"
396 // disposition-type *( ";" disposition-parm )
398 // disposition-type = "inline" | "attachment" | disp-ext-type
399 // ; case-insensitive
400 // disp-ext-type = token
402 // disposition-parm = filename-parm | disp-ext-parm
404 // filename-parm = "filename" "=" value
405 // | "filename*" "=" ext-value
407 // disp-ext-parm = token "=" value
408 // | ext-token "=" ext-value
409 // ext-token = <the characters in token, followed by "*">
411 void HttpContentDisposition::Parse(const std::string& header,
412 const std::string& referrer_charset) {
413 DCHECK(type_ == INLINE);
414 DCHECK(filename_.empty());
416 std::string::const_iterator pos = header.begin();
417 std::string::const_iterator end = header.end();
418 pos = ConsumeDispositionType(pos, end);
420 std::string name;
421 std::string filename;
422 std::string ext_filename;
424 HttpUtil::NameValuePairsIterator iter(pos, end, ';');
425 while (iter.GetNext()) {
426 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
427 iter.name_end(),
428 "filename")) {
429 DecodeFilenameValue(iter.value(), referrer_charset, &filename,
430 &parse_result_flags_);
431 if (!filename.empty())
432 parse_result_flags_ |= HAS_FILENAME;
433 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),
434 iter.name_end(),
435 "name")) {
436 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);
437 if (!name.empty())
438 parse_result_flags_ |= HAS_NAME;
439 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
440 iter.name_end(),
441 "filename*")) {
442 DecodeExtValue(iter.raw_value(), &ext_filename);
443 if (!ext_filename.empty())
444 parse_result_flags_ |= HAS_EXT_FILENAME;
448 if (!ext_filename.empty())
449 filename_ = ext_filename;
450 else if (!filename.empty())
451 filename_ = filename;
452 else
453 filename_ = name;
456 } // namespace net