content/child/site_isolation_policy.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "content/child/site_isolation_policy.h"
   6
   7 #include "base/basictypes.h"
   8 #include "base/command_line.h"
   9 #include "base/lazy_instance.h"
  10 #include "base/logging.h"
  11 #include "base/metrics/histogram.h"
  12 #include "base/strings/string_util.h"
  13 #include "content/public/common/content_switches.h"
  14 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
  15 #include "net/http/http_response_headers.h"
  16 #include "webkit/common/resource_response_info.h"
  17
  18 using base::StringPiece;
  19
  20 namespace content {
  21
  22 namespace {
  23
  24 // The cross-site document blocking/UMA data collection is deactivated by
  25 // default, and only activated in renderer processes.
  26 static bool g_policy_enabled = false;
  27
  28 // MIME types
  29 const char kTextHtml[] = "text/html";
  30 const char kTextXml[] = "text/xml";
  31 const char xAppRssXml[] = "application/rss+xml";
  32 const char kAppXml[] = "application/xml";
  33 const char kAppJson[] = "application/json";
  34 const char kTextJson[] = "text/json";
  35 const char kTextXjson[] = "text/x-json";
  36 const char kTextPlain[] = "text/plain";
  37
  38 // TODO(dsjang): this is only needed for collecting UMA stat. Will be deleted
  39 // when this class is used for actual blocking.
  40 bool IsRenderableStatusCode(int status_code) {
  41   // Chrome only uses the content of a response with one of these status codes
  42   // for CSS/JavaScript. For images, Chrome just ignores status code.
  43   const int renderable_status_code[] = {200, 201, 202, 203, 206, 300,
  44                                         301, 302, 303, 305, 306, 307};
  45   for (size_t i = 0; i < arraysize(renderable_status_code); ++i) {
  46     if (renderable_status_code[i] == status_code)
  47       return true;
  48   }
  49   return false;
  50 }
  51
  52 bool MatchesSignature(StringPiece data,
  53                       const StringPiece signatures[],
  54                       size_t arr_size) {
  55
  56   size_t offset = data.find_first_not_of(" \t\r\n");
  57   // There is no not-whitespace character in this document.
  58   if (offset == base::StringPiece::npos)
  59     return false;
  60
  61   data.remove_prefix(offset);
  62   size_t length = data.length();
  63
  64   for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
  65     const StringPiece& signature = signatures[sig_index];
  66     size_t signature_length = signature.length();
  67     if (length < signature_length)
  68       continue;
  69
  70     if (LowerCaseEqualsASCII(
  71             data.begin(), data.begin() + signature_length, signature.data()))
  72       return true;
  73   }
  74   return false;
  75 }
  76
  77 void IncrementHistogramCount(const std::string& name) {
  78   // The default value of min, max, bucket_count are copied from histogram.h.
  79   base::HistogramBase* histogram_pointer = base::Histogram::FactoryGet(
  80       name, 1, 100000, 50, base::HistogramBase::kUmaTargetedHistogramFlag);
  81   histogram_pointer->Add(1);
  82 }
  83
  84 void IncrementHistogramEnum(const std::string& name,
  85                           uint32 sample,
  86                           uint32 boundary_value) {
  87   // The default value of min, max, bucket_count are copied from histogram.h.
  88   base::HistogramBase* histogram_pointer = base::LinearHistogram::FactoryGet(
  89       name,
  90       1,
  91       boundary_value,
  92       boundary_value + 1,
  93       base::HistogramBase::kUmaTargetedHistogramFlag);
  94   histogram_pointer->Add(sample);
  95 }
  96
  97 void HistogramCountBlockedResponse(
  98     const std::string& bucket_prefix,
  99     linked_ptr<SiteIsolationResponseMetaData>& resp_data,
 100     bool nosniff_block) {
 101   std::string block_label(nosniff_block ? ".NoSniffBlocked" : ".Blocked");
 102   IncrementHistogramCount(bucket_prefix + block_label);
 103
 104   // The content is blocked if it is sniffed as HTML/JSON/XML. When
 105   // the blocked response is with an error status code, it is not
 106   // disruptive for the following reasons : 1) the blocked content is
 107   // not a binary object (such as an image) since it is sniffed as
 108   // text; 2) then, this blocking only breaks the renderer behavior
 109   // only if it is either JavaScript or CSS. However, the renderer
 110   // doesn't use the contents of JS/CSS with unaffected status code
 111   // (e.g, 404). 3) the renderer is expected not to use the cross-site
 112   // document content for purposes other than JS/CSS (e.g, XHR).
 113   bool renderable_status_code =
 114       IsRenderableStatusCode(resp_data->http_status_code);
 115
 116   if (renderable_status_code) {
 117     IncrementHistogramEnum(
 118         bucket_prefix + block_label + ".RenderableStatusCode",
 119         resp_data->resource_type,
 120         ResourceType::LAST_TYPE);
 121   } else {
 122     IncrementHistogramCount(bucket_prefix + block_label +
 123                             ".NonRenderableStatusCode");
 124   }
 125 }
 126
 127 void HistogramCountNotBlockedResponse(const std::string& bucket_prefix,
 128                                       bool sniffed_as_js) {
 129   IncrementHistogramCount(bucket_prefix + ".NotBlocked");
 130   if (sniffed_as_js)
 131     IncrementHistogramCount(bucket_prefix + ".NotBlocked.MaybeJS");
 132 }
 133
 134 }  // namespace
 135
 136 SiteIsolationResponseMetaData::SiteIsolationResponseMetaData() {}
 137
 138 void SiteIsolationPolicy::SetPolicyEnabled(bool enabled) {
 139   g_policy_enabled = enabled;
 140 }
 141
 142 linked_ptr<SiteIsolationResponseMetaData>
 143 SiteIsolationPolicy::OnReceivedResponse(
 144     const GURL& frame_origin,
 145     const GURL& response_url,
 146     ResourceType::Type resource_type,
 147     int origin_pid,
 148     const webkit_glue::ResourceResponseInfo& info) {
 149   if (!g_policy_enabled)
 150     return linked_ptr<SiteIsolationResponseMetaData>();
 151
 152   // if |origin_pid| is non-zero, it means that this response is for a plugin
 153   // spawned from this renderer process. We exclude responses for plugins for
 154   // now, but eventually, we're going to make plugin processes directly talk to
 155   // the browser process so that we don't apply cross-site document blocking to
 156   // them.
 157   if (origin_pid)
 158     return linked_ptr<SiteIsolationResponseMetaData>();
 159
 160   UMA_HISTOGRAM_COUNTS("SiteIsolation.AllResponses", 1);
 161
 162   // See if this is for navigation. If it is, don't block it, under the
 163   // assumption that we will put it in an appropriate process.
 164   if (ResourceType::IsFrame(resource_type))
 165     return linked_ptr<SiteIsolationResponseMetaData>();
 166
 167   if (!IsBlockableScheme(response_url))
 168     return linked_ptr<SiteIsolationResponseMetaData>();
 169
 170   if (IsSameSite(frame_origin, response_url))
 171     return linked_ptr<SiteIsolationResponseMetaData>();
 172
 173   SiteIsolationResponseMetaData::CanonicalMimeType canonical_mime_type =
 174       GetCanonicalMimeType(info.mime_type);
 175
 176   if (canonical_mime_type == SiteIsolationResponseMetaData::Others)
 177     return linked_ptr<SiteIsolationResponseMetaData>();
 178
 179   // Every CORS request should have the Access-Control-Allow-Origin header even
 180   // if it is preceded by a pre-flight request. Therefore, if this is a CORS
 181   // request, it has this header.  response.httpHeaderField() internally uses
 182   // case-insensitive matching for the header name.
 183   std::string access_control_origin;
 184
 185   // We can use a case-insensitive header name for EnumerateHeader().
 186   info.headers->EnumerateHeader(
 187       NULL, "access-control-allow-origin", &access_control_origin);
 188   if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin))
 189     return linked_ptr<SiteIsolationResponseMetaData>();
 190
 191   // Real XSD data collection starts from here.
 192   std::string no_sniff;
 193   info.headers->EnumerateHeader(NULL, "x-content-type-options", &no_sniff);
 194
 195   linked_ptr<SiteIsolationResponseMetaData> resp_data(
 196       new SiteIsolationResponseMetaData);
 197   resp_data->frame_origin = frame_origin.spec();
 198   resp_data->response_url = response_url;
 199   resp_data->resource_type = resource_type;
 200   resp_data->canonical_mime_type = canonical_mime_type;
 201   resp_data->http_status_code = info.headers->response_code();
 202   resp_data->no_sniff = LowerCaseEqualsASCII(no_sniff, "nosniff");
 203
 204   return resp_data;
 205 }
 206
 207 bool SiteIsolationPolicy::ShouldBlockResponse(
 208     linked_ptr<SiteIsolationResponseMetaData>& resp_data,
 209     const char* raw_data,
 210     int raw_length,
 211     std::string* alternative_data) {
 212   if (!g_policy_enabled)
 213     return false;
 214
 215   DCHECK(resp_data.get());
 216
 217   StringPiece data(raw_data, raw_length);
 218
 219   // Record the length of the first received network packet to see if it's
 220   // enough for sniffing.
 221   UMA_HISTOGRAM_COUNTS("SiteIsolation.XSD.DataLength", raw_length);
 222
 223   // Record the number of cross-site document responses with a specific mime
 224   // type (text/html, text/xml, etc).
 225   UMA_HISTOGRAM_ENUMERATION(
 226       "SiteIsolation.XSD.MimeType",
 227       resp_data->canonical_mime_type,
 228       SiteIsolationResponseMetaData::MaxCanonicalMimeType);
 229
 230   // Store the result of cross-site document blocking analysis.
 231   bool is_blocked = false;
 232   bool sniffed_as_js = SniffForJS(data);
 233
 234   // Record the number of responses whose content is sniffed for what its mime
 235   // type claims it to be. For example, we apply a HTML sniffer for a document
 236   // tagged with text/html here. Whenever this check becomes true, we'll block
 237   // the response.
 238   if (resp_data->canonical_mime_type !=
 239           SiteIsolationResponseMetaData::Plain) {
 240     std::string bucket_prefix;
 241     bool sniffed_as_target_document = false;
 242     if (resp_data->canonical_mime_type ==
 243             SiteIsolationResponseMetaData::HTML) {
 244       bucket_prefix = "SiteIsolation.XSD.HTML";
 245       sniffed_as_target_document = SniffForHTML(data);
 246     } else if (resp_data->canonical_mime_type ==
 247                    SiteIsolationResponseMetaData::XML) {
 248       bucket_prefix = "SiteIsolation.XSD.XML";
 249       sniffed_as_target_document = SniffForXML(data);
 250     } else if (resp_data->canonical_mime_type ==
 251                    SiteIsolationResponseMetaData::JSON) {
 252       bucket_prefix = "SiteIsolation.XSD.JSON";
 253       sniffed_as_target_document = SniffForJSON(data);
 254     } else {
 255       NOTREACHED() << "Not a blockable mime type: "
 256                    << resp_data->canonical_mime_type;
 257     }
 258
 259     if (sniffed_as_target_document) {
 260       is_blocked = true;
 261       HistogramCountBlockedResponse(bucket_prefix, resp_data, false);
 262     } else {
 263       if (resp_data->no_sniff) {
 264         is_blocked = true;
 265         HistogramCountBlockedResponse(bucket_prefix, resp_data, true);
 266       } else {
 267         HistogramCountNotBlockedResponse(bucket_prefix, sniffed_as_js);
 268       }
 269     }
 270   } else {
 271     // This block is for plain text documents. We apply our HTML, XML,
 272     // and JSON sniffer to a text document in the order, and block it
 273     // if any of them succeeds in sniffing.
 274     std::string bucket_prefix;
 275     if (SniffForHTML(data))
 276       bucket_prefix = "SiteIsolation.XSD.Plain.HTML";
 277     else if (SniffForXML(data))
 278       bucket_prefix = "SiteIsolation.XSD.Plain.XML";
 279     else if (SniffForJSON(data))
 280       bucket_prefix = "SiteIsolation.XSD.Plain.JSON";
 281
 282     if (bucket_prefix.size() > 0) {
 283       is_blocked = true;
 284       HistogramCountBlockedResponse(bucket_prefix, resp_data, false);
 285     } else if (resp_data->no_sniff) {
 286       is_blocked = true;
 287       HistogramCountBlockedResponse("SiteIsolation.XSD.Plain", resp_data, true);
 288     } else {
 289       HistogramCountNotBlockedResponse("SiteIsolation.XSD.Plain",
 290                                        sniffed_as_js);
 291     }
 292   }
 293
 294   if (!CommandLine::ForCurrentProcess()->HasSwitch(
 295            switches::kBlockCrossSiteDocuments))
 296     is_blocked = false;
 297
 298   if (is_blocked) {
 299     alternative_data->erase();
 300     alternative_data->insert(0, " ");
 301     LOG(ERROR) << resp_data->response_url
 302                << " is blocked as an illegal cross-site document from "
 303                << resp_data->frame_origin;
 304   }
 305   return is_blocked;
 306 }
 307
 308 SiteIsolationResponseMetaData::CanonicalMimeType
 309 SiteIsolationPolicy::GetCanonicalMimeType(const std::string& mime_type) {
 310   if (LowerCaseEqualsASCII(mime_type, kTextHtml)) {
 311     return SiteIsolationResponseMetaData::HTML;
 312   }
 313
 314   if (LowerCaseEqualsASCII(mime_type, kTextPlain)) {
 315     return SiteIsolationResponseMetaData::Plain;
 316   }
 317
 318   if (LowerCaseEqualsASCII(mime_type, kAppJson) ||
 319       LowerCaseEqualsASCII(mime_type, kTextJson) ||
 320       LowerCaseEqualsASCII(mime_type, kTextXjson)) {
 321     return SiteIsolationResponseMetaData::JSON;
 322   }
 323
 324   if (LowerCaseEqualsASCII(mime_type, kTextXml) ||
 325       LowerCaseEqualsASCII(mime_type, xAppRssXml) ||
 326       LowerCaseEqualsASCII(mime_type, kAppXml)) {
 327     return SiteIsolationResponseMetaData::XML;
 328   }
 329
 330  return SiteIsolationResponseMetaData::Others;
 331 }
 332
 333 bool SiteIsolationPolicy::IsBlockableScheme(const GURL& url) {
 334   // We exclude ftp:// from here. FTP doesn't provide a Content-Type
 335   // header which our policy depends on, so we cannot protect any
 336   // document from FTP servers.
 337   return url.SchemeIs("http") || url.SchemeIs("https");
 338 }
 339
 340 bool SiteIsolationPolicy::IsSameSite(const GURL& frame_origin,
 341                                      const GURL& response_url) {
 342
 343   if (!frame_origin.is_valid() || !response_url.is_valid())
 344     return false;
 345
 346   if (frame_origin.scheme() != response_url.scheme())
 347     return false;
 348
 349   // SameDomainOrHost() extracts the effective domains (public suffix plus one)
 350   // from the two URLs and compare them.
 351   return net::registry_controlled_domains::SameDomainOrHost(
 352       frame_origin,
 353       response_url,
 354       net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
 355 }
 356
 357 // We don't use Webkit's existing CORS policy implementation since
 358 // their policy works in terms of origins, not sites. For example,
 359 // when frame is sub.a.com and it is not allowed to access a document
 360 // with sub1.a.com. But under Site Isolation, it's allowed.
 361 bool SiteIsolationPolicy::IsValidCorsHeaderSet(
 362     const GURL& frame_origin,
 363     const GURL& website_origin,
 364     const std::string& access_control_origin) {
 365   // Many websites are sending back "\"*\"" instead of "*". This is
 366   // non-standard practice, and not supported by Chrome. Refer to
 367   // CrossOriginAccessControl::passesAccessControlCheck().
 368
 369   // TODO(dsjang): * is not allowed for the response from a request
 370   // with cookies. This allows for more than what the renderer will
 371   // eventually be able to receive, so we won't see illegal cross-site
 372   // documents allowed by this. We have to find a way to see if this
 373   // response is from a cookie-tagged request or not in the future.
 374   if (access_control_origin == "*")
 375     return true;
 376
 377   // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
 378   // "*", but many websites are using just a domain for access_control_origin,
 379   // and this is blocked by Webkit's CORS logic here :
 380   // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
 381   // is_valid() to false when it is created from a URL containing * in the
 382   // domain part.
 383
 384   GURL cors_origin(access_control_origin);
 385   return IsSameSite(frame_origin, cors_origin);
 386 }
 387
 388 // This function is a slight modification of |net::SniffForHTML|.
 389 bool SiteIsolationPolicy::SniffForHTML(StringPiece data) {
 390   // The content sniffer used by Chrome and Firefox are using "<!--"
 391   // as one of the HTML signatures, but it also appears in valid
 392   // JavaScript, considered as well-formed JS by the browser.  Since
 393   // we do not want to block any JS, we exclude it from our HTML
 394   // signatures. This can weaken our document block policy, but we can
 395   // break less websites.
 396   // TODO(dsjang): parameterize |net::SniffForHTML| with an option
 397   // that decides whether to include <!-- or not, so that we can
 398   // remove this function.
 399   // TODO(dsjang): Once SiteIsolationPolicy is moved into the browser
 400   // process, we should do single-thread checking here for the static
 401   // initializer.
 402   static const StringPiece kHtmlSignatures[] = {
 403     StringPiece("<!DOCTYPE html"),  // HTML5 spec
 404     StringPiece("<script"),  // HTML5 spec, Mozilla
 405     StringPiece("<html"),    // HTML5 spec, Mozilla
 406     StringPiece("<head"),    // HTML5 spec, Mozilla
 407     StringPiece("<iframe"),  // Mozilla
 408     StringPiece("<h1"),      // Mozilla
 409     StringPiece("<div"),     // Mozilla
 410     StringPiece("<font"),    // Mozilla
 411     StringPiece("<table"),   // Mozilla
 412     StringPiece("<a"),       // Mozilla
 413     StringPiece("<style"),   // Mozilla
 414     StringPiece("<title"),   // Mozilla
 415     StringPiece("<b"),       // Mozilla
 416     StringPiece("<body"),    // Mozilla
 417     StringPiece("<br"),      // Mozilla
 418     StringPiece("<p"),       // Mozilla
 419     StringPiece("<?xml")     // Mozilla
 420   };
 421
 422   while (data.length() > 0) {
 423     if (MatchesSignature(
 424           data, kHtmlSignatures, arraysize(kHtmlSignatures)))
 425       return true;
 426
 427     // If we cannot find "<!--", we fail sniffing this as HTML.
 428     static const StringPiece kCommentBegins[] = { StringPiece("<!--") };
 429     if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
 430       break;
 431
 432     // Search for --> and do SniffForHTML after that. If we can find the
 433     // comment's end, we start HTML sniffing from there again.
 434     static const char kEndComment[] = "-->";
 435     size_t offset = data.find(kEndComment);
 436     if (offset == base::StringPiece::npos)
 437       break;
 438
 439     // Proceed to the index next to the ending comment (-->).
 440     data.remove_prefix(offset + strlen(kEndComment));
 441   }
 442
 443   return false;
 444 }
 445
 446 bool SiteIsolationPolicy::SniffForXML(base::StringPiece data) {
 447   // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
 448   // this signature. However, XML is case-sensitive. Don't we have to
 449   // be more lenient only to block documents starting with the exact
 450   // string <?xml rather than <?XML ?
 451   // TODO(dsjang): Once SiteIsolationPolicy is moved into the browser
 452   // process, we should do single-thread checking here for the static
 453   // initializer.
 454   static const StringPiece kXmlSignatures[] = { StringPiece("<?xml") };
 455   return MatchesSignature(data, kXmlSignatures, arraysize(kXmlSignatures));
 456 }
 457
 458 bool SiteIsolationPolicy::SniffForJSON(base::StringPiece data) {
 459   // TODO(dsjang): We have to come up with a better way to sniff
 460   // JSON. However, even RE cannot help us that much due to the fact
 461   // that we don't do full parsing.  This DFA starts with state 0, and
 462   // finds {, "/' and : in that order. We're avoiding adding a
 463   // dependency on a regular expression library.
 464   enum {
 465     kStartState,
 466     kLeftBraceState,
 467     kLeftQuoteState,
 468     kColonState,
 469     kTerminalState,
 470   } state = kStartState;
 471
 472   size_t length = data.length();
 473   for (size_t i = 0; i < length && state < kColonState; ++i) {
 474     const char c = data[i];
 475     if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
 476       continue;
 477
 478     switch (state) {
 479       case kStartState:
 480         if (c == '{')
 481           state = kLeftBraceState;
 482         else
 483           state = kTerminalState;
 484         break;
 485       case kLeftBraceState:
 486         if (c == '\"' || c == '\'')
 487           state = kLeftQuoteState;
 488         else
 489           state = kTerminalState;
 490         break;
 491       case kLeftQuoteState:
 492         if (c == ':')
 493           state = kColonState;
 494         break;
 495       case kColonState:
 496       case kTerminalState:
 497         NOTREACHED();
 498         break;
 499     }
 500   }
 501   return state == kColonState;
 502 }
 503
 504 bool SiteIsolationPolicy::SniffForJS(StringPiece data) {
 505   // TODO(dsjang): This is a real hack. The only purpose of this function is to
 506   // try to see if there's any possibility that this data can be JavaScript
 507   // (superset of JS). This function will be removed once UMA stats are
 508   // gathered.
 509
 510   // Search for "var " for JS detection.
 511   return data.find("var ") != base::StringPiece::npos;
 512 }
 513
 514 }  // namespace content