net/base/mime_sniffer.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 // Detecting mime types is a tricky business because we need to balance
   6 // compatibility concerns with security issues.  Here is a survey of how other
   7 // browsers behave and then a description of how we intend to behave.
   8 //
   9 // HTML payload, no Content-Type header:
  10 // * IE 7: Render as HTML
  11 // * Firefox 2: Render as HTML
  12 // * Safari 3: Render as HTML
  13 // * Opera 9: Render as HTML
  14 //
  15 // Here the choice seems clear:
  16 // => Chrome: Render as HTML
  17 //
  18 // HTML payload, Content-Type: "text/plain":
  19 // * IE 7: Render as HTML
  20 // * Firefox 2: Render as text
  21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
  22 //                                   has an HTML extension)
  23 // * Opera 9: Render as text
  24 //
  25 // Here we choose to follow the majority (and break some compatibility with IE).
  26 // Many folks dislike IE's behavior here.
  27 // => Chrome: Render as text
  28 // We generalize this as follows.  If the Content-Type header is text/plain
  29 // we won't detect dangerous mime types (those that can execute script).
  30 //
  31 // HTML payload, Content-Type: "application/octet-stream":
  32 // * IE 7: Render as HTML
  33 // * Firefox 2: Download as application/octet-stream
  34 // * Safari 3: Render as HTML
  35 // * Opera 9: Render as HTML
  36 //
  37 // We follow Firefox.
  38 // => Chrome: Download as application/octet-stream
  39 // One factor in this decision is that IIS 4 and 5 will send
  40 // application/octet-stream for .xhtml files (because they don't recognize
  41 // the extension).  We did some experiments and it looks like this doesn't occur
  42 // very often on the web.  We choose the more secure option.
  43 //
  44 // GIF payload, no Content-Type header:
  45 // * IE 7: Render as GIF
  46 // * Firefox 2: Render as GIF
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
  49 // * Opera 9: Render as GIF
  50 //
  51 // The choice is clear.
  52 // => Chrome: Render as GIF
  53 // Once we decide to render HTML without a Content-Type header, there isn't much
  54 // reason not to render GIFs.
  55 //
  56 // GIF payload, Content-Type: "text/plain":
  57 // * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
  62 // * Opera 9: Render as GIF
  63 //
  64 // Displaying as text/plain makes little sense as the content will look like
  65 // gibberish.  Here, we could change our minds and download.
  66 // => Chrome: Render as GIF
  67 //
  68 // GIF payload, Content-Type: "application/octet-stream":
  69 // * IE 7: Render as GIF
// * Firefox 2: Download as application/octet-stream (Note: Firefox will
//                              Download as GIF if the URL has a GIF extension)
// * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//                                        URL has a GIF extension)
  74 // * Opera 9: Render as GIF
  75 //
  76 // We used to render as GIF here, but the problem is that some sites want to
  77 // trigger downloads by sending application/octet-stream (even though they
  78 // should be sending Content-Disposition: attachment).  Although it is safe
  79 // to render as GIF from a security perspective, we actually get better
// compatibility if we don't sniff from application/octet-stream at all.
  81 // => Chrome: Download as application/octet-stream
  82 //
  83 // XHTML payload, Content-Type: "text/xml":
  84 // * IE 7: Render as XML
  85 // * Firefox 2: Render as HTML
  86 // * Safari 3: Render as HTML
  87 // * Opera 9: Render as HTML
  88 // The layout tests rely on us rendering this as HTML.
  89 // But we're conservative in XHTML detection, as this runs afoul of the
  90 // "don't detect dangerous mime types" rule.
  91 //
  92 // Note that our definition of HTML payload is much stricter than IE's
  93 // definition and roughly the same as Firefox's definition.
  94
  95 #include <stdint.h>
  96 #include <string>
  97
  98 #include "net/base/mime_sniffer.h"
  99
 100 #include "base/logging.h"
 101 #include "base/metrics/histogram.h"
 102 #include "base/strings/string_util.h"
 103 #include "url/gurl.h"
 104
 105 namespace net {
 106
// The number of content bytes we need to use all our magic numbers.  Feel free
// to increase this number if you add a longer magic number.
//
// MatchMagicNumber() DCHECKs every table entry's length against this value,
// and SniffForMagicNumbers() / SniffCRX() truncate the content to this many
// bytes before matching.
static const size_t kBytesRequiredForMagic = 42;
 110
// One entry of a sniffing table: a byte pattern together with the value
// reported when content starts with that pattern.  Entries are normally built
// with the MAGIC_NUMBER / MAGIC_MASK / MAGIC_STRING macros.
struct MagicNumber {
  // Value copied into the result string when the pattern matches.
  const char* const mime_type;
  // The pattern.  For non-string entries a '.' byte matches any content byte.
  const char* const magic;
  // Number of pattern bytes to compare (the macros exclude the literal's
  // terminating NUL).
  size_t magic_len;
  // True if the pattern is text compared case-insensitively; false if it is
  // compared byte by byte.
  bool is_string;
  // Optional mask ANDed with each content byte before the comparison, or NULL.
  const char* const mask;  // if set, must have same length as |magic|
};
 118
// Expands to a MagicNumber initializer for a byte pattern given as a string
// literal.  The length is sizeof(magic)-1, i.e. the literal without its
// terminating NUL, so patterns may contain embedded '\0' bytes.  The expansion
// ends with a comma so table entries can be listed without separators.
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, false, NULL },
 121
 122 template <int MagicSize, int MaskSize>
 123 class VerifySizes {
 124   static_assert(MagicSize == MaskSize, "sizes must be equal");
 125
 126  public:
 127   enum { SIZES = MagicSize };
 128 };
 129
// Evaluates to sizeof(magic), but only compiles if sizeof(mask) is the same
// (enforced by the static_assert in VerifySizes).
#define verified_sizeof(magic, mask) \
VerifySizes<sizeof(magic), sizeof(mask)>::SIZES
 132
// Expands to a MagicNumber initializer for a masked byte pattern: each content
// byte is ANDed with the corresponding |mask| byte before it is compared with
// |magic|.  Both literals must have the same size; a mismatch is a compile
// error via verified_sizeof().
#define MAGIC_MASK(mime_type, magic, mask) \
  { (mime_type), (magic), verified_sizeof(magic, mask)-1, false, (mask) },
 135
// Magic strings are case insensitive and must not include '\0' characters
// (MatchMagicNumber() measures the content up to its first NUL before
// comparing against a string entry).
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic)-1, true, NULL },
 139
// Main table of magic numbers, consulted by SniffForMagicNumbers().  Entries
// are tried in order and the first match wins, so the order is significant.
// A '.' byte in a pattern matches any content byte (see MagicCmp()); this is
// how the four-byte size field in the RIFF header is skipped.
static const MagicNumber kMagicNumbers[] = {
  // Source: HTML 5 specification
  MAGIC_NUMBER("application/pdf", "%PDF-")
  MAGIC_NUMBER("application/postscript", "%!PS-Adobe-")
  MAGIC_NUMBER("image/gif", "GIF87a")
  MAGIC_NUMBER("image/gif", "GIF89a")
  MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A")
  MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF")
  MAGIC_NUMBER("image/bmp", "BM")
  // Source: Mozilla
  MAGIC_NUMBER("text/plain", "#!")  // Script
  MAGIC_NUMBER("text/plain", "%!")  // Script, similar to PS
  MAGIC_NUMBER("text/plain", "From")
  MAGIC_NUMBER("text/plain", ">From")
  // Chrome specific
  MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08")
  // NOTE(review): the first byte, 0x2E, is '.', which MagicCmp() treats as a
  // wildcard, so any first byte followed by "RMF" matches this entry.
  MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46")
  MAGIC_NUMBER("video/x-ms-asf",
      "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C")
  MAGIC_NUMBER("image/tiff", "I I")
  MAGIC_NUMBER("image/tiff", "II*")
  MAGIC_NUMBER("image/tiff", "MM\x00*")
  MAGIC_NUMBER("audio/mpeg", "ID3")
  MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ")
  MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3")
  MAGIC_NUMBER("application/zip", "PK\x03\x04")
  MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00")
  MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A")
  MAGIC_NUMBER("application/octet-stream", "MZ")  // EXE
  // Sniffing for Flash:
  //
  //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV")
  //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
  //
  // Including these magic number for Flash is a trade off.
  //
  // Pros:
  //   * Flash is an important and popular file format
  //
  // Cons:
  //   * These patterns are fairly weak
  //   * If we mistakenly decide something is Flash, we will execute it
  //     in the origin of an unsuspecting site.  This could be a security
  //     vulnerability if the site allows users to upload content.
  //
  // On balance, we do not include these patterns.
};
 188
// The number of content bytes we need to use all our Microsoft Office magic
// numbers.  SniffForOfficeDocs() and SniffForInvalidOfficeDocs() truncate the
// content to this many bytes before matching kOfficeMagicNumbers.
static const size_t kBytesRequiredForOfficeMagic = 8;
 192
// Container signatures for Microsoft Office documents.  The "mime type" slot
// does not hold a real mime type here: it carries the tag ("CFB" or "OOXML")
// that SniffForOfficeDocs() combines with the URL's extension to choose the
// final mime type.
static const MagicNumber kOfficeMagicNumbers[] = {
  MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1")
  MAGIC_NUMBER("OOXML", "PK\x03\x04")
};
 197
// Office document family, as implied by a URL's file extension (see
// kOfficeExtensionTypes).
enum OfficeDocType {
  DOC_TYPE_WORD,
  DOC_TYPE_EXCEL,
  DOC_TYPE_POWERPOINT,
  DOC_TYPE_NONE  // No recognized Office extension.
};
 204
// Associates a file extension with the Office document family it denotes.
struct OfficeExtensionType {
  OfficeDocType doc_type;       // Family selected by |extension|.
  const char* const extension;  // Extension text, including the leading '.'.
  size_t extension_len;         // Length of |extension| without the NUL.
};
 210
// Expands to an OfficeExtensionType initializer; the length is computed from
// the string literal (without its terminating NUL).  The expansion ends with
// a comma so table entries can be listed without separators.
#define OFFICE_EXTENSION(type, extension) \
  { (type), (extension), sizeof(extension) - 1 },
 213
// URL path suffixes recognized as Office documents.  SniffForOfficeDocs()
// compares each entry case-insensitively against the end of the URL path and
// stops at the first one that matches.
static const OfficeExtensionType kOfficeExtensionTypes[] = {
  OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc")
  OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls")
  OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt")
  OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx")
  OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx")
  OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx")
};
 222
// Additional magic numbers that are kept out of kMagicNumbers.
// NOTE(review): nothing in the visible part of this file references this
// table; its consumer is presumably defined further down -- confirm there
// which sniffing path uses it.
// As in kMagicNumbers, a '.' byte matches any content byte (see MagicCmp()).
static const MagicNumber kExtraMagicNumbers[] = {
  MAGIC_NUMBER("image/x-xbitmap", "#define")
  MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00")
  // NOTE(review): the '_' below is compared literally (only '.' is a
  // wildcard), so "<?xml version=" does not match -- confirm this is intended.
  MAGIC_NUMBER("image/svg+xml", "<?xml_version=")
  MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ")
  MAGIC_NUMBER("video/avi", "RIFF....AVI LIST")
  MAGIC_NUMBER("audio/ogg", "OggS")
  MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0")
  MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0")
  MAGIC_NUMBER("video/3gpp", "....ftyp3g")
  MAGIC_NUMBER("video/3gpp", "....ftypavcl")
  MAGIC_NUMBER("video/mp4", "....ftyp")
  MAGIC_NUMBER("video/quicktime", "....moov")
  MAGIC_NUMBER("application/x-shockwave-flash", "CWS")
  MAGIC_NUMBER("application/x-shockwave-flash", "FWS")
  MAGIC_NUMBER("video/x-flv", "FLV")
  MAGIC_NUMBER("audio/x-flac", "fLaC")

  // RAW image types.
  MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR")
  MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR")
  MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM")
  MAGIC_NUMBER("image/x-olympus-orf", "MMOR")  // big-endian
  MAGIC_NUMBER("image/x-olympus-orf", "IIRO")  // little-endian
  MAGIC_NUMBER("image/x-olympus-orf", "IIRS")  // little-endian
  MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ")
  MAGIC_NUMBER("image/x-panasonic-raw",
               "IIU\x00\x08\x00\x00\x00")  // Panasonic .raw
  MAGIC_NUMBER("image/x-panasonic-raw",
               "IIU\x00\x18\x00\x00\x00")  // Panasonic .rw2
  MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw")
  MAGIC_NUMBER("image/x-x3f", "FOVb")
};
 256
// Our HTML sniffer differs slightly from Mozilla.  For example, Mozilla will
// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
// HTML, but we will not.

// Expands to a case-insensitive "text/html" entry whose pattern is "<"
// followed by |tag|.  |tag| must be a string literal because it is joined to
// "<" by literal concatenation.
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)
 263
// Patterns checked by SniffForHTML() against the content after its leading
// ASCII whitespace has been skipped.  Entries are tried in order and the first
// match wins.  The HTML tag entries are case-insensitive prefix matches, so a
// short entry such as "<b" also matches any longer tag that starts with it.
static const MagicNumber kSniffableTags[] = {
  // XML processing directive.  Although this is not an HTML mime type, we sniff
  // for this in the HTML phase because text/xml is just as powerful as HTML and
  // we want to leverage our white space skipping technology.
  MAGIC_NUMBER("text/xml", "<?xml")  // Mozilla
  // DOCTYPEs
  MAGIC_HTML_TAG("!DOCTYPE html")  // HTML5 spec
  // Sniffable tags, ordered by how often they occur in sniffable documents.
  MAGIC_HTML_TAG("script")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("html")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("!--")
  MAGIC_HTML_TAG("head")  // HTML5 spec, Mozilla
  MAGIC_HTML_TAG("iframe")  // Mozilla
  MAGIC_HTML_TAG("h1")  // Mozilla
  MAGIC_HTML_TAG("div")  // Mozilla
  MAGIC_HTML_TAG("font")  // Mozilla
  MAGIC_HTML_TAG("table")  // Mozilla
  MAGIC_HTML_TAG("a")  // Mozilla
  MAGIC_HTML_TAG("style")  // Mozilla
  MAGIC_HTML_TAG("title")  // Mozilla
  MAGIC_HTML_TAG("b")  // Mozilla
  MAGIC_HTML_TAG("body")  // Mozilla
  MAGIC_HTML_TAG("br")
  MAGIC_HTML_TAG("p")  // Mozilla
};
 289
 290 static base::HistogramBase* UMASnifferHistogramGet(const char* name,
 291                                                    int array_size) {
 292   base::HistogramBase* counter =
 293       base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size,
 294           base::HistogramBase::kUmaTargetedHistogramFlag);
 295   return counter;
 296 }
 297
 298 // Compare content header to a magic number where magic_entry can contain '.'
 299 // for single character of anything, allowing some bytes to be skipped.
 300 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
 301   while (len) {
 302     if ((*magic_entry != '.') && (*magic_entry != *content))
 303       return false;
 304     ++magic_entry;
 305     ++content;
 306     --len;
 307   }
 308   return true;
 309 }
 310
 311 // Like MagicCmp() except that it ANDs each byte with a mask before
 312 // the comparison, because there are some bits we don't care about.
 313 static bool MagicMaskCmp(const char* magic_entry,
 314                          const char* content,
 315                          size_t len,
 316                          const char* mask) {
 317   while (len) {
 318     if ((*magic_entry != '.') && (*magic_entry != (*mask & *content)))
 319       return false;
 320     ++magic_entry;
 321     ++content;
 322     ++mask;
 323     --len;
 324   }
 325   return true;
 326 }
 327
 328 static bool MatchMagicNumber(const char* content,
 329                              size_t size,
 330                              const MagicNumber& magic_entry,
 331                              std::string* result) {
 332   const size_t len = magic_entry.magic_len;
 333
 334   // Keep kBytesRequiredForMagic honest.
 335   DCHECK_LE(len, kBytesRequiredForMagic);
 336
 337   // To compare with magic strings, we need to compute strlen(content), but
 338   // content might not actually have a null terminator.  In that case, we
 339   // pretend the length is content_size.
 340   const char* end = static_cast<const char*>(memchr(content, '\0', size));
 341   const size_t content_strlen =
 342       (end != NULL) ? static_cast<size_t>(end - content) : size;
 343
 344   bool match = false;
 345   if (magic_entry.is_string) {
 346     if (content_strlen >= len) {
 347       // String comparisons are case-insensitive
 348       match = (base::strncasecmp(magic_entry.magic, content, len) == 0);
 349     }
 350   } else {
 351     if (size >= len) {
 352       if (!magic_entry.mask) {
 353         match = MagicCmp(magic_entry.magic, content, len);
 354       } else {
 355         match = MagicMaskCmp(magic_entry.magic, content, len, magic_entry.mask);
 356       }
 357     }
 358   }
 359
 360   if (match) {
 361     result->assign(magic_entry.mime_type);
 362     return true;
 363   }
 364   return false;
 365 }
 366
 367 static bool CheckForMagicNumbers(const char* content, size_t size,
 368                                  const MagicNumber* magic, size_t magic_len,
 369                                  base::HistogramBase* counter,
 370                                  std::string* result) {
 371   for (size_t i = 0; i < magic_len; ++i) {
 372     if (MatchMagicNumber(content, size, magic[i], result)) {
 373       if (counter) counter->Add(static_cast<int>(i));
 374       return true;
 375     }
 376   }
 377   return false;
 378 }
 379
 380 // Truncates |size| to |max_size| and returns true if |size| is at least
 381 // |max_size|.
 382 static bool TruncateSize(const size_t max_size, size_t* size) {
 383   // Keep kMaxBytesToSniff honest.
 384   DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
 385
 386   if (*size >= max_size) {
 387     *size = max_size;
 388     return true;
 389   }
 390   return false;
 391 }
 392
 393 // Returns true and sets result if the content appears to be HTML.
 394 // Clears have_enough_content if more data could possibly change the result.
 395 static bool SniffForHTML(const char* content,
 396                          size_t size,
 397                          bool* have_enough_content,
 398                          std::string* result) {
 399   // For HTML, we are willing to consider up to 512 bytes. This may be overly
 400   // conservative as IE only considers 256.
 401   *have_enough_content &= TruncateSize(512, &size);
 402
 403   // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
 404   // but with some modifications to better match the HTML5 spec.
 405   const char* const end = content + size;
 406   const char* pos;
 407   for (pos = content; pos < end; ++pos) {
 408     if (!base::IsAsciiWhitespace(*pos))
 409       break;
 410   }
 411   static base::HistogramBase* counter(NULL);
 412   if (!counter) {
 413     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2",
 414                                      arraysize(kSniffableTags));
 415   }
 416   // |pos| now points to first non-whitespace character (or at end).
 417   return CheckForMagicNumbers(pos, end - pos,
 418                               kSniffableTags, arraysize(kSniffableTags),
 419                               counter, result);
 420 }
 421
 422 // Returns true and sets result if the content matches any of kMagicNumbers.
 423 // Clears have_enough_content if more data could possibly change the result.
 424 static bool SniffForMagicNumbers(const char* content,
 425                                  size_t size,
 426                                  bool* have_enough_content,
 427                                  std::string* result) {
 428   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
 429
 430   // Check our big table of Magic Numbers
 431   static base::HistogramBase* counter(NULL);
 432   if (!counter) {
 433     counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2",
 434                                      arraysize(kMagicNumbers));
 435   }
 436   return CheckForMagicNumbers(content, size,
 437                               kMagicNumbers, arraysize(kMagicNumbers),
 438                               counter, result);
 439 }
 440
 441 // Returns true and sets result if the content matches any of
 442 // kOfficeMagicNumbers, and the URL has the proper extension.
 443 // Clears |have_enough_content| if more data could possibly change the result.
 444 static bool SniffForOfficeDocs(const char* content,
 445                                size_t size,
 446                                const GURL& url,
 447                                bool* have_enough_content,
 448                                std::string* result) {
 449   *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size);
 450
 451   // Check our table of magic numbers for Office file types.
 452   std::string office_version;
 453   if (!CheckForMagicNumbers(content, size,
 454                             kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers),
 455                             NULL, &office_version))
 456     return false;
 457
 458   OfficeDocType type = DOC_TYPE_NONE;
 459   for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) {
 460     std::string url_path = url.path();
 461
 462     if (url_path.length() < kOfficeExtensionTypes[i].extension_len)
 463       continue;
 464
 465     const char* extension =
 466         &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len];
 467
 468     if (0 == base::strncasecmp(extension, kOfficeExtensionTypes[i].extension,
 469                                kOfficeExtensionTypes[i].extension_len)) {
 470       type = kOfficeExtensionTypes[i].doc_type;
 471       break;
 472     }
 473   }
 474
 475   if (type == DOC_TYPE_NONE)
 476     return false;
 477
 478   if (office_version == "CFB") {
 479     switch (type) {
 480       case DOC_TYPE_WORD:
 481         *result = "application/msword";
 482         return true;
 483       case DOC_TYPE_EXCEL:
 484         *result = "application/vnd.ms-excel";
 485         return true;
 486       case DOC_TYPE_POWERPOINT:
 487         *result = "application/vnd.ms-powerpoint";
 488         return true;
 489       case DOC_TYPE_NONE:
 490         NOTREACHED();
 491         return false;
 492     }
 493   } else if (office_version == "OOXML") {
 494     switch (type) {
 495       case DOC_TYPE_WORD:
 496         *result = "application/vnd.openxmlformats-officedocument."
 497                   "wordprocessingml.document";
 498         return true;
 499       case DOC_TYPE_EXCEL:
 500         *result = "application/vnd.openxmlformats-officedocument."
 501                   "spreadsheetml.sheet";
 502         return true;
 503       case DOC_TYPE_POWERPOINT:
 504         *result = "application/vnd.openxmlformats-officedocument."
 505                   "presentationml.presentation";
 506         return true;
 507       case DOC_TYPE_NONE:
 508         NOTREACHED();
 509         return false;
 510     }
 511   }
 512
 513   NOTREACHED();
 514   return false;
 515 }
 516
 517 static bool IsOfficeType(const std::string& type_hint) {
 518   return (type_hint == "application/msword" ||
 519           type_hint == "application/vnd.ms-excel" ||
 520           type_hint == "application/vnd.ms-powerpoint" ||
 521           type_hint == "application/vnd.openxmlformats-officedocument."
 522                        "wordprocessingml.document" ||
 523           type_hint == "application/vnd.openxmlformats-officedocument."
 524                        "spreadsheetml.sheet" ||
 525           type_hint == "application/vnd.openxmlformats-officedocument."
 526                        "presentationml.presentation" ||
 527           type_hint == "application/vnd.ms-excel.sheet.macroenabled.12" ||
 528           type_hint == "application/vnd.ms-word.document.macroenabled.12" ||
 529           type_hint == "application/vnd.ms-powerpoint.presentation."
 530                        "macroenabled.12" ||
 531           type_hint == "application/mspowerpoint" ||
 532           type_hint == "application/msexcel" ||
 533           type_hint == "application/vnd.ms-word" ||
 534           type_hint == "application/vnd.ms-word.document.12" ||
 535           type_hint == "application/vnd.msword");
 536 }
 537
 538 // This function checks for files that have a Microsoft Office MIME type
 539 // set, but are not actually Office files.
 540 //
 541 // If this is not actually an Office file, |*result| is set to
 542 // "application/octet-stream", otherwise it is not modified.
 543 //
 544 // Returns false if additional data is required to determine the file type, or
 545 // true if there is enough data to make a decision.
 546 static bool SniffForInvalidOfficeDocs(const char* content,
 547                                       size_t size,
 548                                       const GURL& url,
 549                                       std::string* result) {
 550   if (!TruncateSize(kBytesRequiredForOfficeMagic, &size))
 551     return false;
 552
 553   // Check our table of magic numbers for Office file types.  If it does not
 554   // match one, the MIME type was invalid.  Set it instead to a safe value.
 555   std::string office_version;
 556   if (!CheckForMagicNumbers(content, size,
 557                             kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers),
 558                             NULL, &office_version)) {
 559     *result = "application/octet-stream";
 560   }
 561
 562   // We have enough information to determine if this was a Microsoft Office
 563   // document or not, so sniffing is completed.
 564   return true;
 565 }
 566
// Root-element patterns that SniffXML() uses to refine generic XML into
// XHTML or a feed type.  All entries are case-insensitive string matches.
static const MagicNumber kMagicXML[] = {
  // We want to be very conservative in interpreting text/xml content as
  // XHTML -- we just want to sniff enough to make unit tests pass.
  // So we match explicitly on this, and don't match other ways of writing
  // it in semantically-equivalent ways.
  MAGIC_STRING("application/xhtml+xml",
               "<html xmlns=\"http://www.w3.org/1999/xhtml\"")
  MAGIC_STRING("application/atom+xml", "<feed")
  MAGIC_STRING("application/rss+xml", "<rss")
};
 578
 579 // Returns true and sets result if the content appears to contain XHTML or a
 580 // feed.
 581 // Clears have_enough_content if more data could possibly change the result.
 582 //
 583 // TODO(evanm): this is similar but more conservative than what Safari does,
 584 // while HTML5 has a different recommendation -- what should we do?
 585 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
 586 // of ASCII -- do we care?
 587 static bool SniffXML(const char* content,
 588                      size_t size,
 589                      bool* have_enough_content,
 590                      std::string* result) {
 591   // We allow at most 300 bytes of content before we expect the opening tag.
 592   *have_enough_content &= TruncateSize(300, &size);
 593   const char* pos = content;
 594   const char* const end = content + size;
 595
 596   // This loop iterates through tag-looking offsets in the file.
 597   // We want to skip XML processing instructions (of the form "<?xml ...")
 598   // and stop at the first "plain" tag, then make a decision on the mime-type
 599   // based on the name (or possibly attributes) of that tag.
 600   static base::HistogramBase* counter(NULL);
 601   if (!counter) {
 602     counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2",
 603                                      arraysize(kMagicXML));
 604   }
 605   const int kMaxTagIterations = 5;
 606   for (int i = 0; i < kMaxTagIterations && pos < end; ++i) {
 607     pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos));
 608     if (!pos)
 609       return false;
 610
 611     if ((pos + sizeof("<?xml") - 1 <= end) &&
 612         (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0)) {
 613       // Skip XML declarations.
 614       ++pos;
 615       continue;
 616     } else if ((pos + sizeof("<!DOCTYPE") - 1 <= end) &&
 617                (base::strncasecmp(pos, "<!DOCTYPE", sizeof("<!DOCTYPE") - 1) ==
 618                 0)) {
 619       // Skip DOCTYPE declarations.
 620       ++pos;
 621       continue;
 622     }
 623
 624     if (CheckForMagicNumbers(pos, end - pos,
 625                              kMagicXML, arraysize(kMagicXML),
 626                              counter, result))
 627       return true;
 628
 629     // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
 630     // to identify.
 631
 632     // If we get here, we've hit an initial tag that hasn't matched one of the
 633     // above tests.  Abort.
 634     return true;
 635   }
 636
 637   // We iterated too far without finding a start tag.
 638   // If we have more content to look at, we aren't going to change our mind by
 639   // seeing more bytes from the network.
 640   return pos < end;
 641 }
 642
// Byte order marks.  SniffBinary() treats content that starts with one of
// these as text without inspecting the remaining bytes; the mime type stored
// in each entry is discarded by that caller.
static const MagicNumber kByteOrderMark[] = {
  MAGIC_NUMBER("text/plain", "\xFE\xFF")  // UTF-16BE
  MAGIC_NUMBER("text/plain", "\xFF\xFE")  // UTF-16LE
  MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF")  // UTF-8
};
 649
 650 // Returns true and sets result to "application/octet-stream" if the content
 651 // appears to be binary data. Otherwise, returns false and sets "text/plain".
 652 // Clears have_enough_content if more data could possibly change the result.
 653 static bool SniffBinary(const char* content,
 654                         size_t size,
 655                         bool* have_enough_content,
 656                         std::string* result) {
 657   // There is no concensus about exactly how to sniff for binary content.
 658   // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
 659   // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
 660   // Here, we side with FF, but with a smaller buffer. This size was chosen
 661   // because it is small enough to comfortably fit into a single packet (after
 662   // allowing for headers) and yet large enough to account for binary formats
 663   // that have a significant amount of ASCII at the beginning (crbug.com/15314).
 664   const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size);
 665
 666   // First, we look for a BOM.
 667   static base::HistogramBase* counter(NULL);
 668   if (!counter) {
 669     counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2",
 670                                      arraysize(kByteOrderMark));
 671   }
 672   std::string unused;
 673   if (CheckForMagicNumbers(content, size,
 674                            kByteOrderMark, arraysize(kByteOrderMark),
 675                            counter, &unused)) {
 676     // If there is BOM, we think the buffer is not binary.
 677     result->assign("text/plain");
 678     return false;
 679   }
 680
 681   // Next we look to see if any of the bytes "look binary."
 682   if (LooksLikeBinary(content, size)) {
 683     result->assign("application/octet-stream");
 684     return true;
 685   }
 686
 687   // No evidence either way. Default to non-binary and, if truncated, clear
 688   // have_enough_content because there could be a binary looking byte in the
 689   // truncated data.
 690   *have_enough_content &= is_truncated;
 691   result->assign("text/plain");
 692   return false;
 693 }
 694
 695 static bool IsUnknownMimeType(const std::string& mime_type) {
 696   // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
 697   // If we do, please be careful not to alter the semantics at all.
 698   static const char* const kUnknownMimeTypes[] = {
 699     // Empty mime types are as unknown as they get.
 700     "",
 701     // The unknown/unknown type is popular and uninformative
 702     "unknown/unknown",
 703     // The second most popular unknown mime type is application/unknown
 704     "application/unknown",
 705     // Firefox rejects a mime type if it is exactly */*
 706     "*/*",
 707   };
 708   static base::HistogramBase* counter(NULL);
 709   if (!counter) {
 710     counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2",
 711                                      arraysize(kUnknownMimeTypes) + 1);
 712   }
 713   for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) {
 714     if (mime_type == kUnknownMimeTypes[i]) {
 715       counter->Add(i);
 716       return true;
 717     }
 718   }
 719   if (mime_type.find('/') == std::string::npos) {
 720     // Firefox rejects a mime type if it does not contain a slash
 721     counter->Add(arraysize(kUnknownMimeTypes));
 722     return true;
 723   }
 724   return false;
 725 }
 726
 727 // Returns true and sets result if the content appears to be a crx (Chrome
 728 // extension) file.
 729 // Clears have_enough_content if more data could possibly change the result.
 730 static bool SniffCRX(const char* content,
 731                      size_t size,
 732                      const GURL& url,
 733                      const std::string& type_hint,
 734                      bool* have_enough_content,
 735                      std::string* result) {
 736   static base::HistogramBase* counter(NULL);
 737   if (!counter)
 738     counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3);
 739
 740   // Technically, the crx magic number is just Cr24, but the bytes after that
 741   // are a version number which changes infrequently. Including it in the
 742   // sniffing gives us less room for error. If the version number ever changes,
 743   // we can just add an entry to this list.
 744   //
 745   // TODO(aa): If we ever have another magic number, we'll want to pass a
 746   // histogram into CheckForMagicNumbers(), below, to see which one matched.
 747   static const struct MagicNumber kCRXMagicNumbers[] = {
 748     MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")
 749   };
 750
 751   // Only consider files that have the extension ".crx".
 752   static const char kCRXExtension[] = ".crx";
 753   // Ignore null by subtracting 1.
 754   static const int kExtensionLength = arraysize(kCRXExtension) - 1;
 755   if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) ==
 756       url.path().size() - kExtensionLength) {
 757     counter->Add(1);
 758   } else {
 759     return false;
 760   }
 761
 762   *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size);
 763   if (CheckForMagicNumbers(content, size,
 764                            kCRXMagicNumbers, arraysize(kCRXMagicNumbers),
 765                            NULL, result)) {
 766     counter->Add(2);
 767   } else {
 768     return false;
 769   }
 770
 771   return true;
 772 }
 773
 774 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) {
 775   static base::HistogramBase* should_sniff_counter(NULL);
 776   if (!should_sniff_counter) {
 777     should_sniff_counter =
 778         UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3);
 779   }
 780   bool sniffable_scheme = url.is_empty() ||
 781                           url.SchemeIsHTTPOrHTTPS() ||
 782                           url.SchemeIs("ftp") ||
 783 #if defined(OS_ANDROID)
 784                           url.SchemeIs("content") ||
 785 #endif
 786                           url.SchemeIsFile() ||
 787                           url.SchemeIsFileSystem();
 788   if (!sniffable_scheme) {
 789     should_sniff_counter->Add(1);
 790     return false;
 791   }
 792
 793   static const char* const kSniffableTypes[] = {
 794     // Many web servers are misconfigured to send text/plain for many
 795     // different types of content.
 796     "text/plain",
 797     // We want to sniff application/octet-stream for
 798     // application/x-chrome-extension, but nothing else.
 799     "application/octet-stream",
 800     // XHTML and Atom/RSS feeds are often served as plain xml instead of
 801     // their more specific mime types.
 802     "text/xml",
 803     "application/xml",
 804     // Check for false Microsoft Office MIME types.
 805     "application/msword",
 806     "application/vnd.ms-excel",
 807     "application/vnd.ms-powerpoint",
 808     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 809     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 810     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
 811     "application/vnd.ms-excel.sheet.macroenabled.12",
 812     "application/vnd.ms-word.document.macroenabled.12",
 813     "application/vnd.ms-powerpoint.presentation.macroenabled.12",
 814     "application/mspowerpoint",
 815     "application/msexcel",
 816     "application/vnd.ms-word",
 817     "application/vnd.ms-word.document.12",
 818     "application/vnd.msword",
 819   };
 820   static base::HistogramBase* counter(NULL);
 821   if (!counter) {
 822     counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2",
 823                                      arraysize(kSniffableTypes) + 1);
 824   }
 825   for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) {
 826     if (mime_type == kSniffableTypes[i]) {
 827       counter->Add(i);
 828       should_sniff_counter->Add(2);
 829       return true;
 830     }
 831   }
 832   if (IsUnknownMimeType(mime_type)) {
 833     // The web server didn't specify a content type or specified a mime
 834     // type that we ignore.
 835     counter->Add(arraysize(kSniffableTypes));
 836     should_sniff_counter->Add(2);
 837     return true;
 838   }
 839   should_sniff_counter->Add(1);
 840   return false;
 841 }
 842
 843 bool SniffMimeType(const char* content,
 844                    size_t content_size,
 845                    const GURL& url,
 846                    const std::string& type_hint,
 847                    std::string* result) {
 848   DCHECK_LT(content_size, 1000000U);  // sanity check
 849   DCHECK(content);
 850   DCHECK(result);
 851
 852   // By default, we assume we have enough content.
 853   // Each sniff routine may unset this if it wasn't provided enough content.
 854   bool have_enough_content = true;
 855
 856   // By default, we'll return the type hint.
 857   // Each sniff routine may modify this if it has a better guess..
 858   result->assign(type_hint);
 859
 860   // If the file has a Microsoft Office MIME type, we should only check that it
 861   // is a valid Office file.  Because this is the only reason we sniff files
 862   // with a Microsoft Office MIME type, we can return early.
 863   if (IsOfficeType(type_hint))
 864     return SniffForInvalidOfficeDocs(content, content_size, url, result);
 865
 866   // Cache information about the type_hint
 867   const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
 868
 869   // First check for HTML
 870   if (hint_is_unknown_mime_type) {
 871     // We're only willing to sniff HTML if the server has not supplied a mime
 872     // type, or if the type it did supply indicates that it doesn't know what
 873     // the type should be.
 874     if (SniffForHTML(content, content_size, &have_enough_content, result))
 875       return true;  // We succeeded in sniffing HTML.  No more content needed.
 876   }
 877
 878   // We're only willing to sniff for binary in 3 cases:
 879   // 1. The server has not supplied a mime type.
 880   // 2. The type it did supply indicates that it doesn't know what the type
 881   //    should be.
 882   // 3. The type is "text/plain" which is the default on some web servers and
 883   //    could be indicative of a mis-configuration that we shield the user from.
 884   const bool hint_is_text_plain = (type_hint == "text/plain");
 885   if (hint_is_unknown_mime_type || hint_is_text_plain) {
 886     if (!SniffBinary(content, content_size, &have_enough_content, result)) {
 887       // If the server said the content was text/plain and it doesn't appear
 888       // to be binary, then we trust it.
 889       if (hint_is_text_plain) {
 890         return have_enough_content;
 891       }
 892     }
 893   }
 894
 895   // If we have plain XML, sniff XML subtypes.
 896   if (type_hint == "text/xml" || type_hint == "application/xml") {
 897     // We're not interested in sniffing these types for images and the like.
 898     // Instead, we're looking explicitly for a feed.  If we don't find one
 899     // we're done and return early.
 900     if (SniffXML(content, content_size, &have_enough_content, result))
 901       return true;
 902     return have_enough_content;
 903   }
 904
 905   // CRX files (Chrome extensions) have a special sniffing algorithm. It is
 906   // tighter than the others because we don't have to match legacy behavior.
 907   if (SniffCRX(content, content_size, url, type_hint,
 908                &have_enough_content, result))
 909     return true;
 910
 911   // Check the file extension and magic numbers to see if this is an Office
 912   // document.  This needs to be checked before the general magic numbers
 913   // because zip files and Office documents (OOXML) have the same magic number.
 914   if (SniffForOfficeDocs(content, content_size, url,
 915                          &have_enough_content, result))
 916     return true;  // We've matched a magic number.  No more content needed.
 917
 918   // We're not interested in sniffing for magic numbers when the type_hint
 919   // is application/octet-stream.  Time to bail out.
 920   if (type_hint == "application/octet-stream")
 921     return have_enough_content;
 922
 923   // Now we look in our large table of magic numbers to see if we can find
 924   // anything that matches the content.
 925   if (SniffForMagicNumbers(content, content_size,
 926                            &have_enough_content, result))
 927     return true;  // We've matched a magic number.  No more content needed.
 928
 929   return have_enough_content;
 930 }
 931
 932 bool SniffMimeTypeFromLocalData(const char* content,
 933                                 size_t size,
 934                                 std::string* result) {
 935   // First check the extra table.
 936   if (CheckForMagicNumbers(content, size, kExtraMagicNumbers,
 937                            arraysize(kExtraMagicNumbers), NULL, result))
 938     return true;
 939   // Finally check the original table.
 940   return CheckForMagicNumbers(content, size, kMagicNumbers,
 941                               arraysize(kMagicNumbers), NULL, result);
 942 }
 943
 944 bool LooksLikeBinary(const char* content, size_t size) {
 945   // The definition of "binary bytes" is from the spec at
 946   // https://mimesniff.spec.whatwg.org/#binary-data-byte
 947   //
 948   // The bytes which are considered to be "binary" are all < 0x20. Encode them
 949   // one bit per byte, with 1 for a "binary" bit, and 0 for a "text" bit. The
 950   // least-significant bit represents byte 0x00, the most-significant bit
 951   // represents byte 0x1F.
 952   const uint32_t kBinaryBits =
 953       ~(1u << '\t' | 1u << '\n' | 1u << '\r' | 1u << '\f' | 1u << '\x1b');
 954   for (size_t i = 0; i < size; ++i) {
 955     uint8_t byte = static_cast<uint8_t>(content[i]);
 956     if (byte < 0x20 && (kBinaryBits & (1u << byte)))
 957       return true;
 958   }
 959   return false;
 960 }
 961
 962 }  // namespace net