net/http/http_content_disposition.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "net/http/http_content_disposition.h"
   6
   7 #include "base/base64.h"
   8 #include "base/i18n/icu_string_conversions.h"
   9 #include "base/logging.h"
  10 #include "base/strings/string_tokenizer.h"
  11 #include "base/strings/string_util.h"
  12 #include "base/strings/sys_string_conversions.h"
  13 #include "base/strings/utf_string_conversions.h"
  14 #include "net/base/net_util.h"
  15 #include "net/http/http_util.h"
  16 #include "third_party/icu/source/common/unicode/ucnv.h"
  17
  18 namespace {
  19
  20 enum RFC2047EncodingType {
  21   Q_ENCODING,
  22   B_ENCODING
  23 };
  24
  25 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
  26 // decoding a quoted-printable string.  Returns true if the input was valid.
  27 bool DecodeQEncoding(const std::string& input, std::string* output) {
  28   std::string temp;
  29   temp.reserve(input.size());
  30   for (std::string::const_iterator it = input.begin(); it != input.end();
  31        ++it) {
  32     if (*it == '_') {
  33       temp.push_back(' ');
  34     } else if (*it == '=') {
  35       if ((input.end() - it < 3) ||
  36           !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
  37           !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
  38         return false;
  39       unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
  40                          HexDigitToInt(*(it + 2));
  41       temp.push_back(static_cast<char>(ch));
  42       ++it;
  43       ++it;
  44     } else if (0x20 < *it && *it < 0x7F && *it != '?') {
  45       // In a Q-encoded word, only printable ASCII characters
  46       // represent themselves. Besides, space, '=', '_' and '?' are
  47       // not allowed, but they're already filtered out.
  48       DCHECK_NE('=', *it);
  49       DCHECK_NE('?', *it);
  50       DCHECK_NE('_', *it);
  51       temp.push_back(*it);
  52     } else {
  53       return false;
  54     }
  55   }
  56   output->swap(temp);
  57   return true;
  58 }
  59
  60 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
  61 // type is specified in |enc_type|.
  62 bool DecodeBQEncoding(const std::string& part,
  63                       RFC2047EncodingType enc_type,
  64                       const std::string& charset,
  65                       std::string* output) {
  66   std::string decoded;
  67   if (!((enc_type == B_ENCODING) ?
  68         base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded)))
  69     return false;
  70
  71   if (decoded.empty()) {
  72     output->clear();
  73     return true;
  74   }
  75
  76   UErrorCode err = U_ZERO_ERROR;
  77   UConverter* converter(ucnv_open(charset.c_str(), &err));
  78   if (U_FAILURE(err))
  79     return false;
  80
  81   // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
  82   // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
  83   // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a
  84   // trailing '\0'.
  85   size_t output_length = decoded.length() * 3 + 1;
  86   char* buf = WriteInto(output, output_length);
  87   output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,
  88                                      decoded.data(), decoded.length(), &err);
  89   ucnv_close(converter);
  90   if (U_FAILURE(err))
  91     return false;
  92   output->resize(output_length);
  93   return true;
  94 }
  95
  96 bool DecodeWord(const std::string& encoded_word,
  97                 const std::string& referrer_charset,
  98                 bool* is_rfc2047,
  99                 std::string* output,
 100                 int* parse_result_flags) {
 101   *is_rfc2047 = false;
 102   output->clear();
 103   if (encoded_word.empty())
 104     return true;
 105
 106   if (!IsStringASCII(encoded_word)) {
 107     // Try UTF-8, referrer_charset and the native OS default charset in turn.
 108     if (IsStringUTF8(encoded_word)) {
 109       *output = encoded_word;
 110     } else {
 111       base::string16 utf16_output;
 112       if (!referrer_charset.empty() &&
 113           base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
 114                                 base::OnStringConversionError::FAIL,
 115                                 &utf16_output)) {
 116         *output = UTF16ToUTF8(utf16_output);
 117       } else {
 118         *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
 119       }
 120     }
 121
 122     *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS;
 123     return true;
 124   }
 125
 126   // RFC 2047 : one of encoding methods supported by Firefox and relatively
 127   // widely used by web servers.
 128   // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
 129   // We don't care about the length restriction (72 bytes) because
 130   // many web servers generate encoded words longer than the limit.
 131   std::string decoded_word;
 132   *is_rfc2047 = true;
 133   int part_index = 0;
 134   std::string charset;
 135   base::StringTokenizer t(encoded_word, "?");
 136   RFC2047EncodingType enc_type = Q_ENCODING;
 137   while (*is_rfc2047 && t.GetNext()) {
 138     std::string part = t.token();
 139     switch (part_index) {
 140       case 0:
 141         if (part != "=") {
 142           *is_rfc2047 = false;
 143           break;
 144         }
 145         ++part_index;
 146         break;
 147       case 1:
 148         // Do we need charset validity check here?
 149         charset = part;
 150         ++part_index;
 151         break;
 152       case 2:
 153         if (part.size() > 1 ||
 154             part.find_first_of("bBqQ") == std::string::npos) {
 155           *is_rfc2047 = false;
 156           break;
 157         }
 158         if (part[0] == 'b' || part[0] == 'B') {
 159           enc_type = B_ENCODING;
 160         }
 161         ++part_index;
 162         break;
 163       case 3:
 164         *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
 165         if (!*is_rfc2047) {
 166           // Last minute failure. Invalid B/Q encoding. Rather than
 167           // passing it through, return now.
 168           return false;
 169         }
 170         ++part_index;
 171         break;
 172       case 4:
 173         if (part != "=") {
 174           // Another last minute failure !
 175           // Likely to be a case of two encoded-words in a row or
 176           // an encoded word followed by a non-encoded word. We can be
 177           // generous, but it does not help much in terms of compatibility,
 178           // I believe. Return immediately.
 179           *is_rfc2047 = false;
 180           return false;
 181         }
 182         ++part_index;
 183         break;
 184       default:
 185         *is_rfc2047 = false;
 186         return false;
 187     }
 188   }
 189
 190   if (*is_rfc2047) {
 191     if (*(encoded_word.end() - 1) == '=') {
 192       output->swap(decoded_word);
 193       *parse_result_flags |=
 194           net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
 195       return true;
 196     }
 197     // encoded_word ending prematurelly with '?' or extra '?'
 198     *is_rfc2047 = false;
 199     return false;
 200   }
 201
 202   // We're not handling 'especial' characters quoted with '\', but
 203   // it should be Ok because we're not an email client but a
 204   // web browser.
 205
 206   // What IE6/7 does: %-escaped UTF-8.
 207   decoded_word = net::UnescapeURLComponent(encoded_word,
 208                                            net::UnescapeRule::SPACES);
 209   if (decoded_word != encoded_word)
 210     *parse_result_flags |=
 211         net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
 212   if (IsStringUTF8(decoded_word)) {
 213     output->swap(decoded_word);
 214     return true;
 215     // We can try either the OS default charset or 'origin charset' here,
 216     // As far as I can tell, IE does not support it. However, I've seen
 217     // web servers emit %-escaped string in a legacy encoding (usually
 218     // origin charset).
 219     // TODO(jungshik) : Test IE further and consider adding a fallback here.
 220   }
 221   return false;
 222 }
 223
 224 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
 225 // value is supposed to be of the form:
 226 //
 227 //   value                   = token | quoted-string
 228 //
 229 // However we currently also allow RFC 2047 encoding and non-ASCII
 230 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
 231 bool DecodeFilenameValue(const std::string& input,
 232                          const std::string& referrer_charset,
 233                          std::string* output,
 234                          int* parse_result_flags) {
 235   int current_parse_result_flags = 0;
 236   std::string decoded_value;
 237   bool is_previous_token_rfc2047 = true;
 238
 239   // Tokenize with whitespace characters.
 240   base::StringTokenizer t(input, " \t\n\r");
 241   t.set_options(base::StringTokenizer::RETURN_DELIMS);
 242   while (t.GetNext()) {
 243     if (t.token_is_delim()) {
 244       // If the previous non-delimeter token is not RFC2047-encoded,
 245       // put in a space in its place. Otheriwse, skip over it.
 246       if (!is_previous_token_rfc2047)
 247         decoded_value.push_back(' ');
 248       continue;
 249     }
 250     // We don't support a single multibyte character split into
 251     // adjacent encoded words. Some broken mail clients emit headers
 252     // with that problem, but most web servers usually encode a filename
 253     // in a single encoded-word. Firefox/Thunderbird do not support
 254     // it, either.
 255     std::string decoded;
 256     if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
 257                     &decoded, &current_parse_result_flags))
 258       return false;
 259     decoded_value.append(decoded);
 260   }
 261   output->swap(decoded_value);
 262   if (parse_result_flags && !output->empty())
 263     *parse_result_flags |= current_parse_result_flags;
 264   return true;
 265 }
 266
 267 // Parses the charset and value-chars out of an ext-value string.
 268 //
 269 //  ext-value     = charset  "'" [ language ] "'" value-chars
 270 bool ParseExtValueComponents(const std::string& input,
 271                              std::string* charset,
 272                              std::string* value_chars) {
 273   base::StringTokenizer t(input, "'");
 274   t.set_options(base::StringTokenizer::RETURN_DELIMS);
 275   std::string temp_charset;
 276   std::string temp_value;
 277   int numDelimsSeen = 0;
 278   while (t.GetNext()) {
 279     if (t.token_is_delim()) {
 280       ++numDelimsSeen;
 281       continue;
 282     } else {
 283       switch (numDelimsSeen) {
 284         case 0:
 285           temp_charset = t.token();
 286           break;
 287         case 1:
 288           // Language is ignored.
 289           break;
 290         case 2:
 291           temp_value = t.token();
 292           break;
 293         default:
 294           return false;
 295       }
 296     }
 297   }
 298   if (numDelimsSeen != 2)
 299     return false;
 300   if (temp_charset.empty() || temp_value.empty())
 301     return false;
 302   charset->swap(temp_charset);
 303   value_chars->swap(temp_value);
 304   return true;
 305 }
 306
 307 // http://tools.ietf.org/html/rfc5987#section-3.2
 308 //
 309 //  ext-value     = charset  "'" [ language ] "'" value-chars
 310 //
 311 //  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
 312 //
 313 //  mime-charset  = 1*mime-charsetc
 314 //  mime-charsetc = ALPHA / DIGIT
 315 //                 / "!" / "#" / "$" / "%" / "&"
 316 //                 / "+" / "-" / "^" / "_" / "`"
 317 //                 / "{" / "}" / "~"
 318 //
 319 //  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
 320 //
 321 //  value-chars   = *( pct-encoded / attr-char )
 322 //
 323 //  pct-encoded   = "%" HEXDIG HEXDIG
 324 //
 325 //  attr-char     = ALPHA / DIGIT
 326 //                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
 327 //                 / "^" / "_" / "`" / "|" / "~"
 328 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
 329   if (param_value.find('"') != std::string::npos)
 330     return false;
 331
 332   std::string charset;
 333   std::string value;
 334   if (!ParseExtValueComponents(param_value, &charset, &value))
 335     return false;
 336
 337   // RFC 5987 value should be ASCII-only.
 338   if (!IsStringASCII(value)) {
 339     decoded->clear();
 340     return true;
 341   }
 342
 343   std::string unescaped = net::UnescapeURLComponent(
 344       value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
 345
 346   return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
 347 }
 348
 349 } // namespace
 350
 351 namespace net {
 352
 353 HttpContentDisposition::HttpContentDisposition(
 354     const std::string& header, const std::string& referrer_charset)
 355   : type_(INLINE),
 356     parse_result_flags_(INVALID) {
 357   Parse(header, referrer_charset);
 358 }
 359
 360 HttpContentDisposition::~HttpContentDisposition() {
 361 }
 362
 363 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
 364     std::string::const_iterator begin, std::string::const_iterator end) {
 365   DCHECK(type_ == INLINE);
 366   std::string::const_iterator delimiter = std::find(begin, end, ';');
 367
 368   std::string::const_iterator type_begin = begin;
 369   std::string::const_iterator type_end = delimiter;
 370   HttpUtil::TrimLWS(&type_begin, &type_end);
 371
 372   // If the disposition-type isn't a valid token the then the
 373   // Content-Disposition header is malformed, and we treat the first bytes as
 374   // a parameter rather than a disposition-type.
 375   if (!HttpUtil::IsToken(type_begin, type_end))
 376     return begin;
 377
 378   parse_result_flags_ |= HAS_DISPOSITION_TYPE;
 379
 380   DCHECK(std::find(type_begin, type_end, '=') == type_end);
 381
 382   if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
 383     type_ = INLINE;
 384   } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
 385     type_ = ATTACHMENT;
 386   } else {
 387     parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
 388     type_ = ATTACHMENT;
 389   }
 390   return delimiter;
 391 }
 392
 393 // http://tools.ietf.org/html/rfc6266
 394 //
 395 //  content-disposition = "Content-Disposition" ":"
 396 //                         disposition-type *( ";" disposition-parm )
 397 //
 398 //  disposition-type    = "inline" | "attachment" | disp-ext-type
 399 //                      ; case-insensitive
 400 //  disp-ext-type       = token
 401 //
 402 //  disposition-parm    = filename-parm | disp-ext-parm
 403 //
 404 //  filename-parm       = "filename" "=" value
 405 //                      | "filename*" "=" ext-value
 406 //
 407 //  disp-ext-parm       = token "=" value
 408 //                      | ext-token "=" ext-value
 409 //  ext-token           = <the characters in token, followed by "*">
 410 //
 411 void HttpContentDisposition::Parse(const std::string& header,
 412                                    const std::string& referrer_charset) {
 413   DCHECK(type_ == INLINE);
 414   DCHECK(filename_.empty());
 415
 416   std::string::const_iterator pos = header.begin();
 417   std::string::const_iterator end = header.end();
 418   pos = ConsumeDispositionType(pos, end);
 419
 420   std::string name;
 421   std::string filename;
 422   std::string ext_filename;
 423
 424   HttpUtil::NameValuePairsIterator iter(pos, end, ';');
 425   while (iter.GetNext()) {
 426     if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
 427                                                  iter.name_end(),
 428                                                  "filename")) {
 429       DecodeFilenameValue(iter.value(), referrer_charset, &filename,
 430                           &parse_result_flags_);
 431       if (!filename.empty())
 432         parse_result_flags_ |= HAS_FILENAME;
 433     } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),
 434                                                     iter.name_end(),
 435                                                     "name")) {
 436       DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);
 437       if (!name.empty())
 438         parse_result_flags_ |= HAS_NAME;
 439     } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
 440                                                             iter.name_end(),
 441                                                             "filename*")) {
 442       DecodeExtValue(iter.raw_value(), &ext_filename);
 443       if (!ext_filename.empty())
 444         parse_result_flags_ |= HAS_EXT_FILENAME;
 445     }
 446   }
 447
 448   if (!ext_filename.empty())
 449     filename_ = ext_filename;
 450   else if (!filename.empty())
 451     filename_ = filename;
 452   else
 453     filename_ = name;
 454 }
 455
 456 }  // namespace net