net/base/mime_sniffer.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // Detecting mime types is a tricky business because we need to balance
   6 // compatibility concerns with security issues.  Here is a survey of how other
   7 // browsers behave and then a description of how we intend to behave.
   8 //
   9 // HTML payload, no Content-Type header:
  10 // * IE 7: Render as HTML
  11 // * Firefox 2: Render as HTML
  12 // * Safari 3: Render as HTML
  13 // * Opera 9: Render as HTML
  14 //
  15 // Here the choice seems clear:
  16 // => Chrome: Render as HTML
  17 //
  18 // HTML payload, Content-Type: "text/plain":
  19 // * IE 7: Render as HTML
  20 // * Firefox 2: Render as text
  21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
  22 //                                   has an HTML extension)
  23 // * Opera 9: Render as text
  24 //
  25 // Here we choose to follow the majority (and break some compatibility with IE).
  26 // Many folks dislike IE's behavior here.
  27 // => Chrome: Render as text
  28 // We generalize this as follows.  If the Content-Type header is text/plain
  29 // we won't detect dangerous mime types (those that can execute script).
  30 //
  31 // HTML payload, Content-Type: "application/octet-stream":
  32 // * IE 7: Render as HTML
  33 // * Firefox 2: Download as application/octet-stream
  34 // * Safari 3: Render as HTML
  35 // * Opera 9: Render as HTML
  36 //
  37 // We follow Firefox.
  38 // => Chrome: Download as application/octet-stream
  39 // One factor in this decision is that IIS 4 and 5 will send
  40 // application/octet-stream for .xhtml files (because they don't recognize
  41 // the extension).  We did some experiments and it looks like this doesn't occur
  42 // very often on the web.  We choose the more secure option.
  43 //
  44 // GIF payload, no Content-Type header:
  45 // * IE 7: Render as GIF
  46 // * Firefox 2: Render as GIF
  47 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
  48 //                                        URL has a GIF extension)
  49 // * Opera 9: Render as GIF
  50 //
  51 // The choice is clear.
  52 // => Chrome: Render as GIF
  53 // Once we decide to render HTML without a Content-Type header, there isn't much
  54 // reason not to render GIFs.
  55 //
  56 // GIF payload, Content-Type: "text/plain":
  57 // * IE 7: Render as GIF
  58 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
  59 //                              Download as GIF if the URL has a GIF extension)
  60 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
  61 //                                        URL has a GIF extension)
  62 // * Opera 9: Render as GIF
  63 //
  64 // Displaying as text/plain makes little sense as the content will look like
  65 // gibberish.  Here, we could change our minds and download.
  66 // => Chrome: Render as GIF
  67 //
  68 // GIF payload, Content-Type: "application/octet-stream":
  69 // * IE 7: Render as GIF
  70 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
  71 //                              Download as GIF if the URL has a GIF extension)
  72 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
  73 //                                        URL has a GIF extension)
  74 // * Opera 9: Render as GIF
  75 //
  76 // We used to render as GIF here, but the problem is that some sites want to
  77 // trigger downloads by sending application/octet-stream (even though they
  78 // should be sending Content-Disposition: attachment).  Although it is safe
  79 // to render as GIF from a security perspective, we actually get better
  80 // compatibility if we don't sniff from application/octet-stream at all.
  81 // => Chrome: Download as application/octet-stream
  82 //
  83 // XHTML payload, Content-Type: "text/xml":
  84 // * IE 7: Render as XML
  85 // * Firefox 2: Render as HTML
  86 // * Safari 3: Render as HTML
  87 // * Opera 9: Render as HTML
  88 // The layout tests rely on us rendering this as HTML.
  89 // But we're conservative in XHTML detection, as this runs afoul of the
  90 // "don't detect dangerous mime types" rule.
  91 //
  92 // Note that our definition of HTML payload is much stricter than IE's
  93 // definition and roughly the same as Firefox's definition.
  94
  95 #include <stdint.h>
  96 #include <string>
  97
  98 #include "net/base/mime_sniffer.h"
  99
 100 #include "base/logging.h"
 101 #include "base/metrics/histogram.h"
 102 #include "base/strings/string_util.h"
 103 #include "url/gurl.h"
 104
 105 namespace net {
 106
// The number of content bytes we need to use all our magic numbers.  Feel free
// to increase this number if you add a longer magic number.
// MatchMagicNumber() DCHECKs that every entry's magic_len is no larger than
// this value, so a longer pattern added without raising it trips that check
// the first time the new entry is evaluated.
static const size_t kBytesRequiredForMagic = 42;
 110
// One row of a sniffing table: a byte pattern plus the string reported when
// content begins with that pattern.  Rows are built with the MAGIC_NUMBER,
// MAGIC_MASK and MAGIC_STRING macros, which rely on this exact field order.
struct MagicNumber {
  // Value copied into the caller's result string when the entry matches.
  const char* const mime_type;
  // Pattern bytes.  For non-string entries a '.' byte matches any content byte.
  const char* const magic;
  // Number of pattern bytes, not counting the literal's terminating '\0'.
  size_t magic_len;
  // True if |magic| is compared as a case-insensitive string prefix.
  bool is_string;
  const char* const mask;  // if set, must have same length as |magic|
};
 118
// Builds a MagicNumber row for a raw byte pattern with no mask.  The length is
// taken from sizeof(), so |magic| must be a string literal (embedded '\0'
// bytes are counted; the terminating one is not).
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, false, NULL },

// Compile-time guard used by MAGIC_MASK: instantiating it with two different
// sizes fails the static_assert.  SIZES is the common size.
template <int MagicSize, int MaskSize>
class VerifySizes {
  static_assert(MagicSize == MaskSize, "sizes must be equal");

 public:
  enum { SIZES = MagicSize };
};

// Evaluates to sizeof(magic), and fails to compile if sizeof(mask) differs.
#define verified_sizeof(magic, mask) \
VerifySizes<sizeof(magic), sizeof(mask)>::SIZES

// Builds a MagicNumber row whose content bytes are ANDed with |mask| before
// being compared against |magic| (see MagicMaskCmp()).
#define MAGIC_MASK(mime_type, magic, mask) \
  { (mime_type), (magic), verified_sizeof(magic, mask)-1, false, (mask) },

// Magic strings are case insensitive and must not include '\0' characters
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, true, NULL },
 139
// Main magic-number table, consulted by SniffForMagicNumbers().
// Order matters: CheckForMagicNumbers() returns the first matching row and
// records that row's index in a histogram, so inserting or reordering rows
// changes both which type wins on overlapping prefixes and the meaning of
// the recorded indices.
static const MagicNumber kMagicNumbers[] = {
  // Source: HTML 5 specification
  MAGIC_NUMBER("application/pdf", "%PDF-")
  MAGIC_NUMBER("application/postscript", "%!PS-Adobe-")
  MAGIC_NUMBER("image/gif", "GIF87a")
  MAGIC_NUMBER("image/gif", "GIF89a")
  MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A")
  MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF")
  MAGIC_NUMBER("image/bmp", "BM")
  // Source: Mozilla
  MAGIC_NUMBER("text/plain", "#!")  // Script
  MAGIC_NUMBER("text/plain", "%!")  // Script, similar to PS
  MAGIC_NUMBER("text/plain", "From")
  MAGIC_NUMBER("text/plain", ">From")
  // Chrome specific
  MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08")
  MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46")
  MAGIC_NUMBER("video/x-ms-asf",
      "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C")
  MAGIC_NUMBER("image/tiff", "I I")
  MAGIC_NUMBER("image/tiff", "II*")
  MAGIC_NUMBER("image/tiff", "MM\x00*")
  MAGIC_NUMBER("audio/mpeg", "ID3")
  // The '.' bytes are wildcards covering the RIFF chunk-size field.
  MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ")
  MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3")
  MAGIC_NUMBER("application/zip", "PK\x03\x04")
  MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00")
  MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A")
  MAGIC_NUMBER("application/octet-stream", "MZ")  // EXE
  // Sniffing for Flash:
  //
  //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
  //
  // Including these magic numbers for Flash is a trade off.
  //
  // Pros:
  //   * Flash is an important and popular file format
  //
  // Cons:
  //   * These patterns are fairly weak
  //   * If we mistakenly decide something is Flash, we will execute it
  //     in the origin of an unsuspecting site.  This could be a security
  //     vulnerability if the site allows users to upload content.
  //
  // On balance, we do not include these patterns.
};
 188
// The number of content bytes we need to use all our Microsoft Office magic
// numbers.
static const size_t kBytesRequiredForOfficeMagic = 8;

// Container signatures for Office documents.  The first field here is not a
// real mime type: it is a container-format label ("CFB" or "OOXML") that
// SniffForOfficeDocs() combines with the URL's file extension to pick the
// final mime type.
static const MagicNumber kOfficeMagicNumbers[] = {
  MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1")
  MAGIC_NUMBER("OOXML", "PK\x03\x04")
};
 197
// Office application family inferred from a URL's file extension.
// DOC_TYPE_NONE means no known Office extension was found.
enum OfficeDocType {
  DOC_TYPE_WORD,
  DOC_TYPE_EXCEL,
  DOC_TYPE_POWERPOINT,
  DOC_TYPE_NONE
};
 204
// Associates a file extension (including the leading dot) with the Office
// document family it denotes.
struct OfficeExtensionType {
  OfficeDocType doc_type;
  const char* const extension;
  // Length of |extension|, not counting the terminating '\0'.
  size_t extension_len;
};

// Builds an OfficeExtensionType row; |extension| must be a string literal
// because its length is taken with sizeof().
#define OFFICE_EXTENSION(type, extension) \
  { (type), (extension), sizeof(extension) - 1 },
 213
// Extensions that SniffForOfficeDocs() accepts, compared case-insensitively
// against the end of the URL path.  The first matching row wins.
static const OfficeExtensionType kOfficeExtensionTypes[] = {
  OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc")
  OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls")
  OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt")
  OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx")
  OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx")
  OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx")
};
 222
// Additional magic numbers that are not part of kMagicNumbers.  No user of
// this table appears in the portion of the file reviewed here; presumably it
// is consulted by a separate sniffing entry point -- confirm against callers.
// As in the other tables, '.' bytes in MAGIC_NUMBER patterns are wildcards.
static const MagicNumber kExtraMagicNumbers[] = {
  MAGIC_NUMBER("image/x-xbitmap", "#define")
  MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00")
  // NOTE(review): '_' is a literal byte here, not a wildcard (only '.' is), so
  // this pattern cannot match a declaration written as "<?xml version=".
  // Confirm whether a wildcard was intended before changing it.
  MAGIC_NUMBER("image/svg+xml", "<?xml_version=")
  MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ")
  MAGIC_NUMBER("video/avi", "RIFF....AVI LIST")
  MAGIC_NUMBER("audio/ogg", "OggS")
  MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0")
  MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0")
  MAGIC_NUMBER("video/3gpp", "....ftyp3g")
  MAGIC_NUMBER("video/3gpp", "....ftypavcl")
  MAGIC_NUMBER("video/mp4", "....ftyp")
  MAGIC_NUMBER("video/quicktime", "....moov")
  MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
  MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
  MAGIC_NUMBER("video/x-flv", "FLV")
  MAGIC_NUMBER("audio/x-flac", "fLaC")

  // RAW image types.
  MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR")
  MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR")
  MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM")
  MAGIC_NUMBER("image/x-olympus-orf", "MMOR")  // big-endian
  MAGIC_NUMBER("image/x-olympus-orf", "IIRO")  // little-endian
  MAGIC_NUMBER("image/x-olympus-orf", "IIRS")  // little-endian
  MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ")
  MAGIC_NUMBER("image/x-panasonic-raw",
               "IIU\x00\x08\x00\x00\x00")  // Panasonic .raw
  MAGIC_NUMBER("image/x-panasonic-raw",
               "IIU\x00\x18\x00\x00\x00")  // Panasonic .rw2
  MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw")
  MAGIC_NUMBER("image/x-x3f", "FOVb")
};
 256
// Our HTML sniffer differs slightly from Mozilla.  For example, Mozilla will
// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
// HTML, but we will not.

// Builds a case-insensitive "text/html" row that matches content beginning
// with '<' followed by |tag|.  Nothing is required after |tag|, so the row
// matches any longer tag name sharing that prefix as well.
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)
 263
// Tag prefixes checked by SniffForHTML() against the first non-whitespace
// bytes of the content.  The first matching row wins and its index is
// recorded in a histogram.
// NOTE(review): rows are prefix matches, so "<b" (listed before "<body" and
// "<br") already matches content starting with either of those; the sniffed
// type is the same, but the histogram attributes such hits to the "<b" row.
static const MagicNumber kSniffableTags[] = {
  // XML processing directive.  Although this is not an HTML mime type, we sniff
  // for this in the HTML phase because text/xml is just as powerful as HTML and
  // we want to leverage our white space skipping technology.
  MAGIC_NUMBER("text/xml", "<?xml")  // Mozilla
  // DOCTYPEs
  MAGIC_HTML_TAG("!DOCTYPE html")  // HTML5 spec
  // Sniffable tags, ordered by how often they occur in sniffable documents.
  MAGIC_HTML_TAG("script")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("html")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("!--")
  MAGIC_HTML_TAG("head")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("iframe")  // Mozilla
  MAGIC_HTML_TAG("h1")  // Mozilla
  MAGIC_HTML_TAG("div")  // Mozilla
  MAGIC_HTML_TAG("font")  // Mozilla
  MAGIC_HTML_TAG("table")  // Mozilla
  MAGIC_HTML_TAG("a")  // Mozilla
  MAGIC_HTML_TAG("style")  // Mozilla
  MAGIC_HTML_TAG("title")  // Mozilla
  MAGIC_HTML_TAG("b")  // Mozilla
  MAGIC_HTML_TAG("body")  // Mozilla
  MAGIC_HTML_TAG("br")
  MAGIC_HTML_TAG("p")  // Mozilla
};
 289
 290 static base::HistogramBase* UMASnifferHistogramGet(const char* name,
 291                                                    int array_size) {
 292   base::HistogramBase* counter =
 293       base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size,
 294           base::HistogramBase::kUmaTargetedHistogramFlag);
 295   return counter;
 296 }
 297
 298 // Compare content header to a magic number where magic_entry can contain '.'
 299 // for single character of anything, allowing some bytes to be skipped.
 300 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
 301   while (len) {
 302     if ((*magic_entry != '.') && (*magic_entry != *content))
 303       return false;
 304     ++magic_entry;
 305     ++content;
 306     --len;
 307   }
 308   return true;
 309 }
 310
 311 // Like MagicCmp() except that it ANDs each byte with a mask before
 312 // the comparison, because there are some bits we don't care about.
 313 static bool MagicMaskCmp(const char* magic_entry,
 314                          const char* content,
 315                          size_t len,
 316                          const char* mask) {
 317   while (len) {
 318     if ((*magic_entry != '.') && (*magic_entry != (*mask & *content)))
 319       return false;
 320     ++magic_entry;
 321     ++content;
 322     ++mask;
 323     --len;
 324   }
 325   return true;
 326 }
 327
 328 static bool MatchMagicNumber(const char* content,
 329                              size_t size,
 330                              const MagicNumber& magic_entry,
 331                              std::string* result) {
 332   const size_t len = magic_entry.magic_len;
 333
 334   // Keep kBytesRequiredForMagic honest.
 335   DCHECK_LE(len, kBytesRequiredForMagic);
 336
 337   // To compare with magic strings, we need to compute strlen(content), but
 338   // content might not actually have a null terminator.  In that case, we
 339   // pretend the length is content_size.
 340   const char* end = static_cast<const char*>(memchr(content, '\0', size));
 341   const size_t content_strlen =
 342       (end != NULL) ? static_cast<size_t>(end - content) : size;
 343
 344   bool match = false;
 345   if (magic_entry.is_string) {
 346     if (content_strlen >= len) {
 347       // Do a case-insensitive prefix comparison.
 348       DCHECK_EQ(strlen(magic_entry.magic), len);
 349       match = base::EqualsCaseInsensitiveASCII(magic_entry.magic,
 350                                                base::StringPiece(content, len));
 351     }
 352   } else {
 353     if (size >= len) {
 354       if (!magic_entry.mask) {
 355         match = MagicCmp(magic_entry.magic, content, len);
 356       } else {
 357         match = MagicMaskCmp(magic_entry.magic, content, len, magic_entry.mask);
 358       }
 359     }
 360   }
 361
 362   if (match) {
 363     result->assign(magic_entry.mime_type);
 364     return true;
 365   }
 366   return false;
 367 }
 368
 369 static bool CheckForMagicNumbers(const char* content, size_t size,
 370                                  const MagicNumber* magic, size_t magic_len,
 371                                  base::HistogramBase* counter,
 372                                  std::string* result) {
 373   for (size_t i = 0; i < magic_len; ++i) {
 374     if (MatchMagicNumber(content, size, magic[i], result)) {
 375       if (counter) counter->Add(static_cast<int>(i));
 376       return true;
 377     }
 378   }
 379   return false;
 380 }
 381
 382 // Truncates |size| to |max_size| and returns true if |size| is at least
 383 // |max_size|.
 384 static bool TruncateSize(const size_t max_size, size_t* size) {
 385   // Keep kMaxBytesToSniff honest.
 386   DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
 387
 388   if (*size >= max_size) {
 389     *size = max_size;
 390     return true;
 391   }
 392   return false;
 393 }
 394
 395 // Returns true and sets result if the content appears to be HTML.
 396 // Clears have_enough_content if more data could possibly change the result.
 397 static bool SniffForHTML(const char* content,
 398                          size_t size,
 399                          bool* have_enough_content,
 400                          std::string* result) {
 401   // For HTML, we are willing to consider up to 512 bytes. This may be overly
 402   // conservative as IE only considers 256.
 403   *have_enough_content &= TruncateSize(512, &size);
 404
 405   // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
 406   // but with some modifications to better match the HTML5 spec.
 407   const char* const end = content + size;
 408   const char* pos;
 409   for (pos = content; pos < end; ++pos) {
 410     if (!base::IsAsciiWhitespace(*pos))
 411       break;
 412   }
 413   static base::HistogramBase* counter(NULL);
 414   if (!counter) {
 415     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2",
 416                                      arraysize(kSniffableTags));
 417   }
 418   // |pos| now points to first non-whitespace character (or at end).
 419   return CheckForMagicNumbers(pos, end - pos,
 420                               kSniffableTags, arraysize(kSniffableTags),
 421                               counter, result);
 422 }
 423
 424 // Returns true and sets result if the content matches any of kMagicNumbers.
 425 // Clears have_enough_content if more data could possibly change the result.
 426 static bool SniffForMagicNumbers(const char* content,
 427                                  size_t size,
 428                                  bool* have_enough_content,
 429                                  std::string* result) {
 430   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
 431
 432   // Check our big table of Magic Numbers
 433   static base::HistogramBase* counter(NULL);
 434   if (!counter) {
 435     counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
 436                                      arraysize(kMagicNumbers));
 437   }
 438   return CheckForMagicNumbers(content, size,
 439                               kMagicNumbers, arraysize(kMagicNumbers),
 440                               counter, result);
 441 }
 442
 443 // Returns true and sets result if the content matches any of
 444 // kOfficeMagicNumbers, and the URL has the proper extension.
 445 // Clears |have_enough_content| if more data could possibly change the result.
 446 static bool SniffForOfficeDocs(const char* content,
 447                                size_t size,
 448                                const GURL& url,
 449                                bool* have_enough_content,
 450                                std::string* result) {
 451   *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size);
 452
 453   // Check our table of magic numbers for Office file types.
 454   std::string office_version;
 455   if (!CheckForMagicNumbers(content, size,
 456                             kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers),
 457                             NULL, &office_version))
 458     return false;
 459
 460   OfficeDocType type = DOC_TYPE_NONE;
 461   for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) {
 462     std::string url_path = url.path();
 463
 464     if (url_path.length() < kOfficeExtensionTypes[i].extension_len)
 465       continue;
 466
 467     base::StringPiece extension = base::StringPiece(url_path).substr(
 468         url_path.length() - kOfficeExtensionTypes[i].extension_len);
 469     if (base::EqualsCaseInsensitiveASCII(
 470             extension,
 471             base::StringPiece(kOfficeExtensionTypes[i].extension,
 472                               kOfficeExtensionTypes[i].extension_len))) {
 473       type = kOfficeExtensionTypes[i].doc_type;
 474       break;
 475     }
 476   }
 477
 478   if (type == DOC_TYPE_NONE)
 479     return false;
 480
 481   if (office_version == "CFB") {
 482     switch (type) {
 483       case DOC_TYPE_WORD:
 484         *result = "application/msword";
 485         return true;
 486       case DOC_TYPE_EXCEL:
 487         *result = "application/vnd.ms-excel";
 488         return true;
 489       case DOC_TYPE_POWERPOINT:
 490         *result = "application/vnd.ms-powerpoint";
 491         return true;
 492       case DOC_TYPE_NONE:
 493         NOTREACHED();
 494         return false;
 495     }
 496   } else if (office_version == "OOXML") {
 497     switch (type) {
 498       case DOC_TYPE_WORD:
 499         *result = "application/vnd.openxmlformats-officedocument."
 500                   "wordprocessingml.document";
 501         return true;
 502       case DOC_TYPE_EXCEL:
 503         *result = "application/vnd.openxmlformats-officedocument."
 504                   "spreadsheetml.sheet";
 505         return true;
 506       case DOC_TYPE_POWERPOINT:
 507         *result = "application/vnd.openxmlformats-officedocument."
 508                   "presentationml.presentation";
 509         return true;
 510       case DOC_TYPE_NONE:
 511         NOTREACHED();
 512         return false;
 513     }
 514   }
 515
 516   NOTREACHED();
 517   return false;
 518 }
 519
 520 static bool IsOfficeType(const std::string& type_hint) {
 521   return (type_hint == "application/msword" ||
 522           type_hint == "application/vnd.ms-excel" ||
 523           type_hint == "application/vnd.ms-powerpoint" ||
 524           type_hint == "application/vnd.openxmlformats-officedocument."
 525                        "wordprocessingml.document" ||
 526           type_hint == "application/vnd.openxmlformats-officedocument."
 527                        "spreadsheetml.sheet" ||
 528           type_hint == "application/vnd.openxmlformats-officedocument."
 529                        "presentationml.presentation" ||
 530           type_hint == "application/vnd.ms-excel.sheet.macroenabled.12" ||
 531           type_hint == "application/vnd.ms-word.document.macroenabled.12" ||
 532           type_hint == "application/vnd.ms-powerpoint.presentation."
 533                        "macroenabled.12" ||
 534           type_hint == "application/mspowerpoint" ||
 535           type_hint == "application/msexcel" ||
 536           type_hint == "application/vnd.ms-word" ||
 537           type_hint == "application/vnd.ms-word.document.12" ||
 538           type_hint == "application/vnd.msword");
 539 }
 540
 541 // This function checks for files that have a Microsoft Office MIME type
 542 // set, but are not actually Office files.
 543 //
 544 // If this is not actually an Office file, |*result| is set to
 545 // "application/octet-stream", otherwise it is not modified.
 546 //
 547 // Returns false if additional data is required to determine the file type, or
 548 // true if there is enough data to make a decision.
 549 static bool SniffForInvalidOfficeDocs(const char* content,
 550                                       size_t size,
 551                                       const GURL& url,
 552                                       std::string* result) {
 553   if (!TruncateSize(kBytesRequiredForOfficeMagic, &size))
 554     return false;
 555
 556   // Check our table of magic numbers for Office file types.  If it does not
 557   // match one, the MIME type was invalid.  Set it instead to a safe value.
 558   std::string office_version;
 559   if (!CheckForMagicNumbers(content, size,
 560                             kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers),
 561                             NULL, &office_version)) {
 562     *result = "application/octet-stream";
 563   }
 564
 565   // We have enough information to determine if this was a Microsoft Office
 566   // document or not, so sniffing is completed.
 567   return true;
 568 }
 569
// Opening-tag prefixes that SniffXML() recognizes at the first tag that is
// neither an XML declaration nor a DOCTYPE.  Matched case-insensitively.
static const MagicNumber kMagicXML[] = {
  // We want to be very conservative in interpreting text/xml content as
  // XHTML -- we just want to sniff enough to make unit tests pass.
  // So we match explicitly on this, and don't match other ways of writing
  // it in semantically-equivalent ways.
  MAGIC_STRING("application/xhtml+xml",
               "<html xmlns=\"http://www.w3.org/1999/xhtml\"")
  MAGIC_STRING("application/atom+xml", "<feed")
  MAGIC_STRING("application/rss+xml", "<rss")
};
 581
 582 // Returns true and sets result if the content appears to contain XHTML or a
 583 // feed.
 584 // Clears have_enough_content if more data could possibly change the result.
 585 //
 586 // TODO(evanm): this is similar but more conservative than what Safari does,
 587 // while HTML5 has a different recommendation -- what should we do?
 588 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
 589 // of ASCII -- do we care?
 590 static bool SniffXML(const char* content,
 591                      size_t size,
 592                      bool* have_enough_content,
 593                      std::string* result) {
 594   // We allow at most 300 bytes of content before we expect the opening tag.
 595   *have_enough_content &= TruncateSize(300, &size);
 596   const char* pos = content;
 597   const char* const end = content + size;
 598
 599   // This loop iterates through tag-looking offsets in the file.
 600   // We want to skip XML processing instructions (of the form "<?xml ...")
 601   // and stop at the first "plain" tag, then make a decision on the mime-type
 602   // based on the name (or possibly attributes) of that tag.
 603   static base::HistogramBase* counter(NULL);
 604   if (!counter) {
 605     counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2",
 606                                      arraysize(kMagicXML));
 607   }
 608   const int kMaxTagIterations = 5;
 609   for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
 610     pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos));
 611     if (!pos)
 612       return false;
 613
 614     static const char kXmlPrefix[] = "<?xml";
 615     static const size_t kXmlPrefixLength = arraysize(kXmlPrefix) - 1;
 616     static const char kDocTypePrefix[] = "<!DOCTYPE";
 617     static const size_t kDocTypePrefixLength = arraysize(kDocTypePrefix) - 1;
 618
 619     if ((pos + kXmlPrefixLength <= end) &&
 620         base::EqualsCaseInsensitiveASCII(
 621             base::StringPiece(pos, kXmlPrefixLength),
 622             base::StringPiece(kXmlPrefix, kXmlPrefixLength))) {
 623       // Skip XML declarations.
 624       ++pos;
 625       continue;
 626     } else if ((pos + kDocTypePrefixLength <= end) &&
 627                base::EqualsCaseInsensitiveASCII(
 628                    base::StringPiece(pos, kDocTypePrefixLength),
 629                    base::StringPiece(kDocTypePrefix, kDocTypePrefixLength))) {
 630       // Skip DOCTYPE declarations.
 631       ++pos;
 632       continue;
 633     }
 634
 635     if (CheckForMagicNumbers(pos, end - pos,
 636                              kMagicXML, arraysize(kMagicXML),
 637                              counter, result))
 638       return true;
 639
 640     // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
 641     // to identify.
 642
 643     // If we get here, we've hit an initial tag that hasn't matched one of the
 644     // above tests.  Abort.
 645     return true;
 646   }
 647
 648   // We iterated too far without finding a start tag.
 649   // If we have more content to look at, we aren't going to change our mind by
 650   // seeing more bytes from the network.
 651   return pos < end;
 652 }
 653
// Byte order marks.  SniffBinary() treats content that starts with any of
// these as text rather than binary.
static const MagicNumber kByteOrderMark[] = {
  MAGIC_NUMBER("text/plain", "\xFE\xFF")  // UTF-16BE
  MAGIC_NUMBER("text/plain", "\xFF\xFE")  // UTF-16LE
  MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF")  // UTF-8
};
 660
 661 // Returns true and sets result to "application/octet-stream" if the content
 662 // appears to be binary data. Otherwise, returns false and sets "text/plain".
 663 // Clears have_enough_content if more data could possibly change the result.
 664 static bool SniffBinary(const char* content,
 665                         size_t size,
 666                         bool* have_enough_content,
 667                         std::string* result) {
 668   // There is no concensus about exactly how to sniff for binary content.
 669   // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
 670   // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
 671   // Here, we side with FF, but with a smaller buffer. This size was chosen
 672   // because it is small enough to comfortably fit into a single packet (after
 673   // allowing for headers) and yet large enough to account for binary formats
 674   // that have a significant amount of ASCII at the beginning (crbug.com/15314).
 675   const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
 676
 677   // First, we look for a BOM.
 678   static base::HistogramBase* counter(NULL);
 679   if (!counter) {
 680     counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
 681                                      arraysize(kByteOrderMark));
 682   }
 683   std::string unused;
 684   if (CheckForMagicNumbers(content, size,
 685                            kByteOrderMark, arraysize(kByteOrderMark),
 686                            counter, &unused)) {
 687     // If there is BOM, we think the buffer is not binary.
 688     result->assign("text/plain");
 689     return false;
 690   }
 691
 692   // Next we look to see if any of the bytes "look binary."
 693   if (LooksLikeBinary(content, size)) {
 694     result->assign("application/octet-stream");
 695     return true;
 696   }
 697
 698   // No evidence either way. Default to non-binary and, if truncated, clear
 699   // have_enough_content because there could be a binary looking byte in the
 700   // truncated data.
 701   *have_enough_content &= is_truncated;
 702   result->assign("text/plain");
 703   return false;
 704 }
 705
 706 static bool IsUnknownMimeType(const std::string& mime_type) {
 707   // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
 708   // If we do, please be careful not to alter the semantics at all.
 709   static const char* const kUnknownMimeTypes[] = {
 710     // Empty mime types are as unknown as they get.
 711     "",
 712     // The unknown/unknown type is popular and uninformative
 713     "unknown/unknown",
 714     // The second most popular unknown mime type is application/unknown
 715     "application/unknown",
 716     // Firefox rejects a mime type if it is exactly */*
 717     "*/*",
 718   };
 719   static base::HistogramBase* counter(NULL);
 720   if (!counter) {
 721     counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2",
 722                                      arraysize(kUnknownMimeTypes) + 1);
 723   }
 724   for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) {
 725     if (mime_type == kUnknownMimeTypes[i]) {
 726       counter->Add(i);
 727       return true;
 728     }
 729   }
 730   if (mime_type.find('/') == std::string::npos) {
 731     // Firefox rejects a mime type if it does not contain a slash
 732     counter->Add(arraysize(kUnknownMimeTypes));
 733     return true;
 734   }
 735   return false;
 736 }
 737
 738 // Returns true and sets result if the content appears to be a crx (Chrome
 739 // extension) file.
 740 // Clears have_enough_content if more data could possibly change the result.
 741 static bool SniffCRX(const char* content,
 742                      size_t size,
 743                      const GURL& url,
 744                      const std::string& type_hint,
 745                      bool* have_enough_content,
 746                      std::string* result) {
 747   static base::HistogramBase* counter(NULL);
 748   if (!counter)
 749     counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
 750
 751   // Technically, the crx magic number is just Cr24, but the bytes after that
 752   // are a version number which changes infrequently. Including it in the
 753   // sniffing gives us less room for error. If the version number ever changes,
 754   // we can just add an entry to this list.
 755   //
 756   // TODO(aa): If we ever have another magic number, we'll want to pass a
 757   // histogram into CheckForMagicNumbers(), below, to see which one matched.
 758   static const struct MagicNumber kCRXMagicNumbers[] = {
 759     MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")
 760   };
 761
 762   // Only consider files that have the extension ".crx".
 763   static const char kCRXExtension[] = ".crx";
 764   // Ignore null by subtracting 1.
 765   static const int kExtensionLength = arraysize(kCRXExtension) - 1;
 766   if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
 767       url.path().size() - kExtensionLength) {
 768     counter->Add(1);
 769   } else {
 770     return false;
 771   }
 772
 773   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
 774   if (CheckForMagicNumbers(content, size,
 775                            kCRXMagicNumbers, arraysize(kCRXMagicNumbers),
 776                            NULL, result)) {
 777     counter->Add(2);
 778   } else {
 779     return false;
 780   }
 781
 782   return true;
 783 }
 784
 785 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
 786   static base::HistogramBase* should_sniff_counter(NULL);
 787   if (!should_sniff_counter) {
 788     should_sniff_counter =
 789         UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3);
 790   }
 791   bool sniffable_scheme = url.is_empty() ||
 792                           url.SchemeIsHTTPOrHTTPS() ||
 793                           url.SchemeIs("ftp") ||
 794 #if defined(OS_ANDROID)
 795                           url.SchemeIs("content") ||
 796 #endif
 797                           url.SchemeIsFile() ||
 798                           url.SchemeIsFileSystem();
 799   if (!sniffable_scheme) {
 800     should_sniff_counter->Add(1);
 801     return false;
 802   }
 803
 804   static const char* const kSniffableTypes[] = {
 805     // Many web servers are misconfigured to send text/plain for many
 806     // different types of content.
 807     "text/plain",
 808     // We want to sniff application/octet-stream for
 809     // application/x-chrome-extension, but nothing else.
 810     "application/octet-stream",
 811     // XHTML and Atom/RSS feeds are often served as plain xml instead of
 812     // their more specific mime types.
 813     "text/xml",
 814     "application/xml",
 815     // Check for false Microsoft Office MIME types.
 816     "application/msword",
 817     "application/vnd.ms-excel",
 818     "application/vnd.ms-powerpoint",
 819     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 820     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 821     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
 822     "application/vnd.ms-excel.sheet.macroenabled.12",
 823     "application/vnd.ms-word.document.macroenabled.12",
 824     "application/vnd.ms-powerpoint.presentation.macroenabled.12",
 825     "application/mspowerpoint",
 826     "application/msexcel",
 827     "application/vnd.ms-word",
 828     "application/vnd.ms-word.document.12",
 829     "application/vnd.msword",
 830   };
 831   static base::HistogramBase* counter(NULL);
 832   if (!counter) {
 833     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2",
 834                                      arraysize(kSniffableTypes) + 1);
 835   }
 836   for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) {
 837     if (mime_type == kSniffableTypes[i]) {
 838       counter->Add(i);
 839       should_sniff_counter->Add(2);
 840       return true;
 841     }
 842   }
 843   if (IsUnknownMimeType(mime_type)) {
 844     // The web server didn't specify a content type or specified a mime
 845     // type that we ignore.
 846     counter->Add(arraysize(kSniffableTypes));
 847     should_sniff_counter->Add(2);
 848     return true;
 849   }
 850   should_sniff_counter->Add(1);
 851   return false;
 852 }
 853
 854 bool SniffMimeType(const char* content,
 855                    size_t content_size,
 856                    const GURL& url,
 857                    const std::string& type_hint,
 858                    std::string* result) {
 859   DCHECK_LT(content_size, 1000000U);  // sanity check
 860   DCHECK(content);
 861   DCHECK(result);
 862
 863   // By default, we assume we have enough content.
 864   // Each sniff routine may unset this if it wasn't provided enough content.
 865   bool have_enough_content = true;
 866
 867   // By default, we'll return the type hint.
 868   // Each sniff routine may modify this if it has a better guess..
 869   result->assign(type_hint);
 870
 871   // If the file has a Microsoft Office MIME type, we should only check that it
 872   // is a valid Office file.  Because this is the only reason we sniff files
 873   // with a Microsoft Office MIME type, we can return early.
 874   if (IsOfficeType(type_hint))
 875     return SniffForInvalidOfficeDocs(content, content_size, url, result);
 876
 877   // Cache information about the type_hint
 878   const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
 879
 880   // First check for HTML
 881   if (hint_is_unknown_mime_type) {
 882     // We're only willing to sniff HTML if the server has not supplied a mime
 883     // type, or if the type it did supply indicates that it doesn't know what
 884     // the type should be.
 885     if (SniffForHTML(content, content_size, &have_enough_content, result))
 886       return true;  // We succeeded in sniffing HTML.  No more content needed.
 887   }
 888
 889   // We're only willing to sniff for binary in 3 cases:
 890   // 1. The server has not supplied a mime type.
 891   // 2. The type it did supply indicates that it doesn't know what the type
 892   //    should be.
 893   // 3. The type is "text/plain" which is the default on some web servers and
 894   //    could be indicative of a mis-configuration that we shield the user from.
 895   const bool hint_is_text_plain = (type_hint == "text/plain");
 896   if (hint_is_unknown_mime_type || hint_is_text_plain) {
 897     if (!SniffBinary(content, content_size, &have_enough_content, result)) {
 898       // If the server said the content was text/plain and it doesn't appear
 899       // to be binary, then we trust it.
 900       if (hint_is_text_plain) {
 901         return have_enough_content;
 902       }
 903     }
 904   }
 905
 906   // If we have plain XML, sniff XML subtypes.
 907   if (type_hint == "text/xml" || type_hint == "application/xml") {
 908     // We're not interested in sniffing these types for images and the like.
 909     // Instead, we're looking explicitly for a feed.  If we don't find one
 910     // we're done and return early.
 911     if (SniffXML(content, content_size, &have_enough_content, result))
 912       return true;
 913     return have_enough_content;
 914   }
 915
 916   // CRX files (Chrome extensions) have a special sniffing algorithm. It is
 917   // tighter than the others because we don't have to match legacy behavior.
 918   if (SniffCRX(content, content_size, url, type_hint,
 919                &have_enough_content, result))
 920     return true;
 921
 922   // Check the file extension and magic numbers to see if this is an Office
 923   // document.  This needs to be checked before the general magic numbers
 924   // because zip files and Office documents (OOXML) have the same magic number.
 925   if (SniffForOfficeDocs(content, content_size, url,
 926                          &have_enough_content, result))
 927     return true;  // We've matched a magic number.  No more content needed.
 928
 929   // We're not interested in sniffing for magic numbers when the type_hint
 930   // is application/octet-stream.  Time to bail out.
 931   if (type_hint == "application/octet-stream")
 932     return have_enough_content;
 933
 934   // Now we look in our large table of magic numbers to see if we can find
 935   // anything that matches the content.
 936   if (SniffForMagicNumbers(content, content_size,
 937                            &have_enough_content, result))
 938     return true;  // We've matched a magic number.  No more content needed.
 939
 940   return have_enough_content;
 941 }
 942
 943 bool SniffMimeTypeFromLocalData(const char* content,
 944                                 size_t size,
 945                                 std::string* result) {
 946   // First check the extra table.
 947   if (CheckForMagicNumbers(content, size, kExtraMagicNumbers,
 948                            arraysize(kExtraMagicNumbers), NULL, result))
 949     return true;
 950   // Finally check the original table.
 951   return CheckForMagicNumbers(content, size, kMagicNumbers,
 952                               arraysize(kMagicNumbers), NULL, result);
 953 }
 954
 955 bool LooksLikeBinary(const char* content, size_t size) {
 956   // The definition of "binary bytes" is from the spec at
 957   // https://mimesniff.spec.whatwg.org/#binary-data-byte
 958   //
 959   // The bytes which are considered to be "binary" are all < 0x20. Encode them
 960   // one bit per byte, with 1 for a "binary" bit, and 0 for a "text" bit. The
 961   // least-significant bit represents byte 0x00, the most-significant bit
 962   // represents byte 0x1F.
 963   const uint32_t kBinaryBits =
 964       ~(1u << '\t' | 1u << '\n' | 1u << '\r' | 1u << '\f' | 1u << '\x1b');
 965   for (size_t i = 0; i < size; ++i) {
 966     uint8_t byte = static_cast<uint8_t>(content[i]);
 967     if (byte < 0x20 && (kBinaryBits & (1u << byte)))
 968       return true;
 969   }
 970   return false;
 971 }
 972
 973 }  // namespace net