1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/common/cross_site_document_classifier.h"
7 #include "base/basictypes.h"
8 #include "base/command_line.h"
9 #include "base/lazy_instance.h"
10 #include "base/logging.h"
11 #include "base/metrics/histogram.h"
12 #include "base/strings/string_util.h"
13 #include "content/public/common/content_switches.h"
14 #include "content/public/common/resource_response_info.h"
15 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
16 #include "net/http/http_response_headers.h"
18 using base::StringPiece
;
// MIME type spellings recognised by the classifier, grouped by the document
// category each one maps to in GetCanonicalMimeType().

// HTML.
const char kTextHtml[] = "text/html";

// Plain text.
const char kTextPlain[] = "text/plain";

// JSON and its non-standard aliases.
const char kAppJson[] = "application/json";
const char kTextJson[] = "text/json";
const char kTextXjson[] = "text/x-json";

// XML and XML-based feeds.
const char kTextXml[] = "text/xml";
const char kAppRssXml[] = "application/rss+xml";
const char kAppXml[] = "application/xml";
// Compares the start of |data|, ignoring leading whitespace, against a list of
// signatures.
//
// Visible steps: locate the first character of |data| that is not a space,
// tab, CR or LF; strip everything in front of it; then walk |signatures| from
// index 0 up to (but not including) |arr_size|, testing each entry against the
// start of |data| with an ASCII case-insensitive base::StartsWith().
//
// NOTE(review): the declaration of |arr_size| and the statements guarded by
// the two `if`s below (presumably the return values) are not visible in this
// view of the file -- confirm against the full source.
34 bool MatchesSignature(StringPiece data
,
35 const StringPiece signatures
[],
37 size_t offset
= data
.find_first_not_of(" \t\r\n");
38 // There is no non-whitespace character in this document.
39 if (offset
== base::StringPiece::npos
)
42 data
.remove_prefix(offset
);
43 for (size_t sig_index
= 0; sig_index
< arr_size
; ++sig_index
) {
44 if (base::StartsWith(data
, signatures
[sig_index
],
45 base::CompareCase::INSENSITIVE_ASCII
))
53 CrossSiteDocumentMimeType
CrossSiteDocumentClassifier::GetCanonicalMimeType(
54 const std::string
& mime_type
) {
55 if (base::LowerCaseEqualsASCII(mime_type
, kTextHtml
)) {
56 return CROSS_SITE_DOCUMENT_MIME_TYPE_HTML
;
59 if (base::LowerCaseEqualsASCII(mime_type
, kTextPlain
)) {
60 return CROSS_SITE_DOCUMENT_MIME_TYPE_PLAIN
;
63 if (base::LowerCaseEqualsASCII(mime_type
, kAppJson
) ||
64 base::LowerCaseEqualsASCII(mime_type
, kTextJson
) ||
65 base::LowerCaseEqualsASCII(mime_type
, kTextXjson
)) {
66 return CROSS_SITE_DOCUMENT_MIME_TYPE_JSON
;
69 if (base::LowerCaseEqualsASCII(mime_type
, kTextXml
) ||
70 base::LowerCaseEqualsASCII(mime_type
, kAppRssXml
) ||
71 base::LowerCaseEqualsASCII(mime_type
, kAppXml
)) {
72 return CROSS_SITE_DOCUMENT_MIME_TYPE_XML
;
75 return CROSS_SITE_DOCUMENT_MIME_TYPE_OTHERS
;
78 bool CrossSiteDocumentClassifier::IsBlockableScheme(const GURL
& url
) {
79 // We exclude ftp:// from here. FTP doesn't provide a Content-Type
80 // header which our policy depends on, so we cannot protect any
81 // document from FTP servers.
82 return url
.SchemeIs(url::kHttpScheme
) || url
.SchemeIs(url::kHttpsScheme
);
// Compares |frame_origin| and |response_url| at site granularity.
//
// Visible checks, in order: both URLs must be valid, both must have the same
// scheme, and finally the result of
// net::registry_controlled_domains::SameDomainOrHost() (private registries
// included) is returned.
//
// NOTE(review): the statements guarded by the first two `if`s are not visible
// in this view of the file (presumably early returns for the failure cases)
// -- confirm against the full source.
85 bool CrossSiteDocumentClassifier::IsSameSite(const GURL
& frame_origin
,
86 const GURL
& response_url
) {
87 if (!frame_origin
.is_valid() || !response_url
.is_valid())
90 if (frame_origin
.scheme() != response_url
.scheme())
93 // SameDomainOrHost() extracts the effective domains (public suffix plus one)
94 // from the two URLs and compares them.
95 return net::registry_controlled_domains::SameDomainOrHost(
96 frame_origin
, response_url
,
97 net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES
);
100 // We don't use Webkit's existing CORS policy implementation since
101 // their policy works in terms of origins, not sites. For example,
102 // a frame at sub.a.com is not allowed to access a document from
103 // sub1.a.com under the origin-based policy, but under Site Isolation it is.
//
// Visible flow: |access_control_origin| is first compared against the
// wildcard "*"; otherwise it is parsed into a GURL and the result of
// IsSameSite(frame_origin, cors_origin) is returned. |website_origin| is not
// referenced by any statement visible here.
//
// NOTE(review): the statement guarded by the "*" check is not visible in this
// view of the file (presumably an early accept) -- confirm against the full
// source.
104 bool CrossSiteDocumentClassifier::IsValidCorsHeaderSet(
105 const GURL
& frame_origin
,
106 const GURL
& website_origin
,
107 const std::string
& access_control_origin
) {
108 // Many websites are sending back "\"*\"" instead of "*". This is
109 // non-standard practice, and not supported by Chrome. Refer to
110 // CrossOriginAccessControl::passesAccessControlCheck().
112 // TODO(dsjang): * is not allowed for the response from a request
113 // with cookies. This allows for more than what the renderer will
114 // eventually be able to receive, so we won't see illegal cross-site
115 // documents allowed by this. We have to find a way to see if this
116 // response is from a cookie-tagged request or not in the future.
117 if (access_control_origin
== "*")
120 // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
121 // "*", but many websites are using just a domain for access_control_origin,
122 // and this is blocked by Webkit's CORS logic here :
123 // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
124 // is_valid() to false when it is created from a URL containing * in the
127 GURL
cors_origin(access_control_origin
);
128 return IsSameSite(frame_origin
, cors_origin
);
131 // This function is a slight modification of |net::SniffForHTML|.
//
// Visible flow: while |data| is non-empty, it is tested against
// kHtmlSignatures via MatchesSignature(). It is then tested against the
// comment opener "<!--"; the closing "-->" is searched for, and |data| is
// advanced to just past it so that sniffing resumes after the comment.
//
// NOTE(review): the statements guarded by the three `if`s inside the loop, the
// terminator of the kHtmlSignatures initializer and the function's final
// return are not visible in this view of the file -- confirm against the full
// source.
132 bool CrossSiteDocumentClassifier::SniffForHTML(StringPiece data
) {
133 // The content sniffers used by Chrome and Firefox use "<!--"
134 // as one of the HTML signatures, but it also appears in valid
135 // JavaScript, considered as well-formed JS by the browser. Since
136 // we do not want to block any JS, we exclude it from our HTML
137 // signatures. This can weaken our document block policy, but we can
138 // break fewer websites.
139 // TODO(dsjang): parameterize |net::SniffForHTML| with an option
140 // that decides whether to include <!-- or not, so that we can
141 // remove this function.
142 // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
143 // process, we should do single-thread checking here for the static
145 static const StringPiece kHtmlSignatures
[] = {
146 StringPiece("<!doctype html"), // HTML5 spec
147 StringPiece("<script"), // HTML5 spec, Mozilla
148 StringPiece("<html"), // HTML5 spec, Mozilla
149 StringPiece("<head"), // HTML5 spec, Mozilla
150 StringPiece("<iframe"), // Mozilla
151 StringPiece("<h1"), // Mozilla
152 StringPiece("<div"), // Mozilla
153 StringPiece("<font"), // Mozilla
154 StringPiece("<table"), // Mozilla
155 StringPiece("<a"), // Mozilla
156 StringPiece("<style"), // Mozilla
157 StringPiece("<title"), // Mozilla
158 StringPiece("<b"), // Mozilla
159 StringPiece("<body"), // Mozilla
160 StringPiece("<br"), // Mozilla
161 StringPiece("<p") // Mozilla
164 while (data
.length() > 0) {
165 if (MatchesSignature(data
, kHtmlSignatures
, arraysize(kHtmlSignatures
)))
168 // If we cannot find "<!--", we fail sniffing this as HTML.
169 static const StringPiece kCommentBegins
[] = {StringPiece("<!--")};
170 if (!MatchesSignature(data
, kCommentBegins
, arraysize(kCommentBegins
)))
173 // Search for --> and do SniffForHTML after that. If we can find the
174 // comment's end, we start HTML sniffing from there again.
175 static const char kEndComment
[] = "-->";
176 size_t offset
= data
.find(kEndComment
);
177 if (offset
== base::StringPiece::npos
)
180 // Proceed to the index next to the ending comment (-->).
181 data
.remove_prefix(offset
+ strlen(kEndComment
));
187 bool CrossSiteDocumentClassifier::SniffForXML(base::StringPiece data
) {
188 // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
189 // this signature. However, XML is case-sensitive. Don't we have to
190 // be more lenient only to block documents starting with the exact
191 // string <?xml rather than <?XML ?
192 // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
193 // process, we should do single-thread checking here for the static
195 static const StringPiece kXmlSignatures
[] = {StringPiece("<?xml")};
196 return MatchesSignature(data
, kXmlSignatures
, arraysize(kXmlSignatures
));
// Heuristically sniffs |data| for JSON with a small state machine.
//
// Visible flow: |state| starts at kStartState and the loop visits the
// characters of |data| one by one for as long as |state| is below kColonState.
// Each character is first tested against space, tab, CR and LF. The visible
// transitions move kStartState to kLeftBraceState or kTerminalState, and
// kLeftBraceState to kLeftQuoteState (on a double or single quote) or
// kTerminalState. The function returns whether |state| ended at kColonState.
//
// NOTE(review): the enum declaration, the statement guarded by the whitespace
// test (presumably a skip), the switch head, the conditions for leaving
// kStartState and the body of the kLeftQuoteState case are not visible in this
// view of the file -- confirm against the full source.
199 bool CrossSiteDocumentClassifier::SniffForJSON(base::StringPiece data
) {
200 // TODO(dsjang): We have to come up with a better way to sniff
201 // JSON. However, even RE cannot help us that much due to the fact
202 // that we don't do full parsing. This DFA starts with state 0, and
203 // finds {, "/' and : in that order. We're avoiding adding a
204 // dependency on a regular expression library.
211 } state
= kStartState
;
213 size_t length
= data
.length();
214 for (size_t i
= 0; i
< length
&& state
< kColonState
; ++i
) {
215 const char c
= data
[i
];
216 if (c
== ' ' || c
== '\t' || c
== '\r' || c
== '\n')
222 state
= kLeftBraceState
;
224 state
= kTerminalState
;
226 case kLeftBraceState
:
227 if (c
== '\"' || c
== '\'')
228 state
= kLeftQuoteState
;
230 state
= kTerminalState
;
232 case kLeftQuoteState
:
242 return state
== kColonState
;
245 } // namespace content