Roll src/third_party/WebKit b3f094a:f697bbd (svn 194310:194313)
[chromium-blink-merge.git] / extensions / common / url_pattern.cc
blob40142cf79cbbc2e573bb6d8a5b5a4e2b5cd860c3
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "extensions/common/url_pattern.h"
7 #include <ostream>
9 #include "base/strings/string_number_conversions.h"
10 #include "base/strings/string_piece.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/stringprintf.h"
14 #include "content/public/common/url_constants.h"
15 #include "extensions/common/constants.h"
16 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
17 #include "url/gurl.h"
18 #include "url/url_util.h"
20 const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
22 namespace {
24 // TODO(aa): What about more obscure schemes like data: and javascript: ?
25 // Note: keep this array in sync with kValidSchemeMasks.
26 const char* kValidSchemes[] = {
27 url::kHttpScheme,
28 url::kHttpsScheme,
29 url::kFileScheme,
30 url::kFtpScheme,
31 content::kChromeUIScheme,
32 extensions::kExtensionScheme,
33 url::kFileSystemScheme,
36 const int kValidSchemeMasks[] = {
37 URLPattern::SCHEME_HTTP,
38 URLPattern::SCHEME_HTTPS,
39 URLPattern::SCHEME_FILE,
40 URLPattern::SCHEME_FTP,
41 URLPattern::SCHEME_CHROMEUI,
42 URLPattern::SCHEME_EXTENSION,
43 URLPattern::SCHEME_FILESYSTEM,
46 static_assert(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
47 "must keep these arrays in sync");
49 const char kParseSuccess[] = "Success.";
50 const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
51 const char kParseErrorInvalidScheme[] = "Invalid scheme.";
52 const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
53 const char kParseErrorEmptyHost[] = "Host can not be empty.";
54 const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
55 const char kParseErrorEmptyPath[] = "Empty path.";
56 const char kParseErrorInvalidPort[] = "Invalid port.";
57 const char kParseErrorInvalidHost[] = "Invalid host.";
59 // Message explaining each URLPattern::ParseResult.
60 const char* const kParseResultMessages[] = {
61 kParseSuccess,
62 kParseErrorMissingSchemeSeparator,
63 kParseErrorInvalidScheme,
64 kParseErrorWrongSchemeType,
65 kParseErrorEmptyHost,
66 kParseErrorInvalidHostWildcard,
67 kParseErrorEmptyPath,
68 kParseErrorInvalidPort,
69 kParseErrorInvalidHost,
72 static_assert(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
73 "must add message for each parse result");
75 const char kPathSeparator[] = "/";
77 bool IsStandardScheme(const std::string& scheme) {
78 // "*" gets the same treatment as a standard scheme.
79 if (scheme == "*")
80 return true;
82 return url::IsStandard(scheme.c_str(),
83 url::Component(0, static_cast<int>(scheme.length())));
86 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
87 if (port == "*")
88 return true;
90 // Only accept non-wildcard ports if the scheme uses ports.
91 if (url::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
92 url::PORT_UNSPECIFIED) {
93 return false;
96 int parsed_port = url::PORT_UNSPECIFIED;
97 if (!base::StringToInt(port, &parsed_port))
98 return false;
99 return (parsed_port >= 0) && (parsed_port < 65536);
102 // Returns |path| with the trailing wildcard stripped if one existed.
104 // The functions that rely on this (OverlapsWith and Contains) are only
105 // called for the patterns inside URLPatternSet. In those cases, we know that
106 // the path will have only a single wildcard at the end. This makes figuring
107 // out overlap much easier. It seems like there is probably a computer-sciency
108 // way to solve the general case, but we don't need that yet.
109 std::string StripTrailingWildcard(const std::string& path) {
110 size_t wildcard_index = path.find('*');
111 size_t path_last = path.size() - 1;
112 DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
113 return wildcard_index == path_last ? path.substr(0, path_last) : path;
116 } // namespace
118 // static
119 bool URLPattern::IsValidSchemeForExtensions(const std::string& scheme) {
120 for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
121 if (scheme == kValidSchemes[i])
122 return true;
124 return false;
127 URLPattern::URLPattern()
128 : valid_schemes_(SCHEME_NONE),
129 match_all_urls_(false),
130 match_subdomains_(false),
131 port_("*") {}
133 URLPattern::URLPattern(int valid_schemes)
134 : valid_schemes_(valid_schemes),
135 match_all_urls_(false),
136 match_subdomains_(false),
137 port_("*") {}
139 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
140 // Strict error checking is used, because this constructor is only
141 // appropriate when we know |pattern| is valid.
142 : valid_schemes_(valid_schemes),
143 match_all_urls_(false),
144 match_subdomains_(false),
145 port_("*") {
146 ParseResult result = Parse(pattern);
147 if (PARSE_SUCCESS != result)
148 NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result;
151 URLPattern::~URLPattern() {
154 bool URLPattern::operator<(const URLPattern& other) const {
155 return GetAsString() < other.GetAsString();
158 bool URLPattern::operator>(const URLPattern& other) const {
159 return GetAsString() > other.GetAsString();
162 bool URLPattern::operator==(const URLPattern& other) const {
163 return GetAsString() == other.GetAsString();
166 std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
167 return out << '"' << url_pattern.GetAsString() << '"';
170 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
171 spec_.clear();
172 SetMatchAllURLs(false);
173 SetMatchSubdomains(false);
174 SetPort("*");
176 // Special case pattern to match every valid URL.
177 if (pattern == kAllUrlsPattern) {
178 SetMatchAllURLs(true);
179 return PARSE_SUCCESS;
182 // Parse out the scheme.
183 size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
184 bool has_standard_scheme_separator = true;
186 // Some urls also use ':' alone as the scheme separator.
187 if (scheme_end_pos == std::string::npos) {
188 scheme_end_pos = pattern.find(':');
189 has_standard_scheme_separator = false;
192 if (scheme_end_pos == std::string::npos)
193 return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
195 if (!SetScheme(pattern.substr(0, scheme_end_pos)))
196 return PARSE_ERROR_INVALID_SCHEME;
198 bool standard_scheme = IsStandardScheme(scheme_);
199 if (standard_scheme != has_standard_scheme_separator)
200 return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
202 // Advance past the scheme separator.
203 scheme_end_pos +=
204 (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
205 if (scheme_end_pos >= pattern.size())
206 return PARSE_ERROR_EMPTY_HOST;
208 // Parse out the host and path.
209 size_t host_start_pos = scheme_end_pos;
210 size_t path_start_pos = 0;
212 if (!standard_scheme) {
213 path_start_pos = host_start_pos;
214 } else if (scheme_ == url::kFileScheme) {
215 size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
216 if (host_end_pos == std::string::npos) {
217 // Allow hostname omission.
218 // e.g. file://* is interpreted as file:///*,
219 // file://foo* is interpreted as file:///foo*.
220 path_start_pos = host_start_pos - 1;
221 } else {
222 // Ignore hostname if scheme is file://.
223 // e.g. file://localhost/foo is equal to file:///foo.
224 path_start_pos = host_end_pos;
226 } else {
227 size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
229 // Host is required.
230 if (host_start_pos == host_end_pos)
231 return PARSE_ERROR_EMPTY_HOST;
233 if (host_end_pos == std::string::npos)
234 return PARSE_ERROR_EMPTY_PATH;
236 host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
238 // The first component can optionally be '*' to match all subdomains.
239 std::vector<std::string> host_components;
240 base::SplitString(host_, '.', &host_components);
242 // Could be empty if the host only consists of whitespace characters.
243 if (host_components.empty())
244 return PARSE_ERROR_EMPTY_HOST;
246 if (host_components[0] == "*") {
247 match_subdomains_ = true;
248 host_components.erase(host_components.begin(),
249 host_components.begin() + 1);
251 host_ = JoinString(host_components, '.');
253 path_start_pos = host_end_pos;
256 SetPath(pattern.substr(path_start_pos));
258 size_t port_pos = host_.find(':');
259 if (port_pos != std::string::npos) {
260 if (!SetPort(host_.substr(port_pos + 1)))
261 return PARSE_ERROR_INVALID_PORT;
262 host_ = host_.substr(0, port_pos);
265 // No other '*' can occur in the host, though. This isn't necessary, but is
266 // done as a convenience to developers who might otherwise be confused and
267 // think '*' works as a glob in the host.
268 if (host_.find('*') != std::string::npos)
269 return PARSE_ERROR_INVALID_HOST_WILDCARD;
271 // Null characters are not allowed in hosts.
272 if (host_.find('\0') != std::string::npos)
273 return PARSE_ERROR_INVALID_HOST;
275 return PARSE_SUCCESS;
278 void URLPattern::SetValidSchemes(int valid_schemes) {
279 spec_.clear();
280 valid_schemes_ = valid_schemes;
283 void URLPattern::SetHost(const std::string& host) {
284 spec_.clear();
285 host_ = host;
288 void URLPattern::SetMatchAllURLs(bool val) {
289 spec_.clear();
290 match_all_urls_ = val;
292 if (val) {
293 match_subdomains_ = true;
294 scheme_ = "*";
295 host_.clear();
296 SetPath("/*");
300 void URLPattern::SetMatchSubdomains(bool val) {
301 spec_.clear();
302 match_subdomains_ = val;
305 bool URLPattern::SetScheme(const std::string& scheme) {
306 spec_.clear();
307 scheme_ = scheme;
308 if (scheme_ == "*") {
309 valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
310 } else if (!IsValidScheme(scheme_)) {
311 return false;
313 return true;
316 bool URLPattern::IsValidScheme(const std::string& scheme) const {
317 if (valid_schemes_ == SCHEME_ALL)
318 return true;
320 for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
321 if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
322 return true;
325 return false;
328 void URLPattern::SetPath(const std::string& path) {
329 spec_.clear();
330 path_ = path;
331 path_escaped_ = path_;
332 ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
333 ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
336 bool URLPattern::SetPort(const std::string& port) {
337 spec_.clear();
338 if (IsValidPortForScheme(scheme_, port)) {
339 port_ = port;
340 return true;
342 return false;
345 bool URLPattern::MatchesURL(const GURL& test) const {
346 const GURL* test_url = &test;
347 bool has_inner_url = test.inner_url() != NULL;
349 if (has_inner_url) {
350 if (!test.SchemeIsFileSystem())
351 return false; // The only nested URLs we handle are filesystem URLs.
352 test_url = test.inner_url();
355 if (!MatchesScheme(test_url->scheme()))
356 return false;
358 if (match_all_urls_)
359 return true;
361 std::string path_for_request = test.PathForRequest();
362 if (has_inner_url)
363 path_for_request = test_url->path() + path_for_request;
365 return MatchesSecurityOriginHelper(*test_url) &&
366 MatchesPath(path_for_request);
369 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
370 const GURL* test_url = &test;
371 bool has_inner_url = test.inner_url() != NULL;
373 if (has_inner_url) {
374 if (!test.SchemeIsFileSystem())
375 return false; // The only nested URLs we handle are filesystem URLs.
376 test_url = test.inner_url();
379 if (!MatchesScheme(test_url->scheme()))
380 return false;
382 if (match_all_urls_)
383 return true;
385 return MatchesSecurityOriginHelper(*test_url);
388 bool URLPattern::MatchesScheme(const std::string& test) const {
389 if (!IsValidScheme(test))
390 return false;
392 return scheme_ == "*" || test == scheme_;
395 bool URLPattern::MatchesHost(const std::string& host) const {
396 std::string test(url::kHttpScheme);
397 test += url::kStandardSchemeSeparator;
398 test += host;
399 test += "/";
400 return MatchesHost(GURL(test));
403 bool URLPattern::MatchesHost(const GURL& test) const {
404 // If the hosts are exactly equal, we have a match.
405 if (test.host() == host_)
406 return true;
408 // If we're matching subdomains, and we have no host in the match pattern,
409 // that means that we're matching all hosts, which means we have a match no
410 // matter what the test host is.
411 if (match_subdomains_ && host_.empty())
412 return true;
414 // Otherwise, we can only match if our match pattern matches subdomains.
415 if (!match_subdomains_)
416 return false;
418 // We don't do subdomain matching against IP addresses, so we can give up now
419 // if the test host is an IP address.
420 if (test.HostIsIPAddress())
421 return false;
423 // Check if the test host is a subdomain of our host.
424 if (test.host().length() <= (host_.length() + 1))
425 return false;
427 if (test.host().compare(test.host().length() - host_.length(),
428 host_.length(), host_) != 0)
429 return false;
431 return test.host()[test.host().length() - host_.length() - 1] == '.';
434 bool URLPattern::ImpliesAllHosts() const {
435 // Check if it matches all urls or is a pattern like http://*/*.
436 if (match_all_urls_ ||
437 (match_subdomains_ && host_.empty() && port_ == "*" && path_ == "/*")) {
438 return true;
441 // If this doesn't even match subdomains, it can't possibly imply all hosts.
442 if (!match_subdomains_)
443 return false;
445 // If |host_| is a recognized TLD, this will be 0. We don't include private
446 // TLDs, so that, e.g., *.appspot.com does not imply all hosts.
447 size_t registry_length = net::registry_controlled_domains::GetRegistryLength(
448 host_,
449 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
450 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
451 // If there was more than just a TLD in the host (e.g., *.foobar.com), it
452 // doesn't imply all hosts.
453 if (registry_length > 0)
454 return false;
456 // At this point the host could either be just a TLD ("com") or some unknown
457 // TLD-like string ("notatld"). To disambiguate between them construct a
458 // fake URL, and check the registry. This returns 0 if the TLD is
459 // unrecognized, or the length of the recognized TLD.
460 registry_length = net::registry_controlled_domains::GetRegistryLength(
461 base::StringPrintf("foo.%s", host_.c_str()),
462 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
463 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
464 // If we recognized this TLD, then this is a pattern like *.com, and it
465 // should imply all hosts. Otherwise, this doesn't imply all hosts.
466 return registry_length > 0;
469 bool URLPattern::MatchesSingleOrigin() const {
470 // Strictly speaking, the port is part of the origin, but in URLPattern it
471 // defaults to *. It's not very interesting anyway, so leave it out.
472 return !ImpliesAllHosts() && scheme_ != "*" && !match_subdomains_;
475 bool URLPattern::MatchesPath(const std::string& test) const {
476 // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
477 // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
478 if (test + "/*" == path_escaped_)
479 return true;
481 return MatchPattern(test, path_escaped_);
484 const std::string& URLPattern::GetAsString() const {
485 if (!spec_.empty())
486 return spec_;
488 if (match_all_urls_) {
489 spec_ = kAllUrlsPattern;
490 return spec_;
493 bool standard_scheme = IsStandardScheme(scheme_);
495 std::string spec = scheme_ +
496 (standard_scheme ? url::kStandardSchemeSeparator : ":");
498 if (scheme_ != url::kFileScheme && standard_scheme) {
499 if (match_subdomains_) {
500 spec += "*";
501 if (!host_.empty())
502 spec += ".";
505 if (!host_.empty())
506 spec += host_;
508 if (port_ != "*") {
509 spec += ":";
510 spec += port_;
514 if (!path_.empty())
515 spec += path_;
517 spec_ = spec;
518 return spec_;
521 bool URLPattern::OverlapsWith(const URLPattern& other) const {
522 if (match_all_urls() || other.match_all_urls())
523 return true;
524 return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
525 other.MatchesAnyScheme(GetExplicitSchemes()))
526 && (MatchesHost(other.host()) || other.MatchesHost(host()))
527 && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
528 && (MatchesPath(StripTrailingWildcard(other.path())) ||
529 other.MatchesPath(StripTrailingWildcard(path())));
532 bool URLPattern::Contains(const URLPattern& other) const {
533 if (match_all_urls())
534 return true;
535 return MatchesAllSchemes(other.GetExplicitSchemes())
536 && MatchesHost(other.host())
537 && MatchesPortPattern(other.port())
538 && MatchesPath(StripTrailingWildcard(other.path()));
541 bool URLPattern::MatchesAnyScheme(
542 const std::vector<std::string>& schemes) const {
543 for (std::vector<std::string>::const_iterator i = schemes.begin();
544 i != schemes.end(); ++i) {
545 if (MatchesScheme(*i))
546 return true;
549 return false;
552 bool URLPattern::MatchesAllSchemes(
553 const std::vector<std::string>& schemes) const {
554 for (std::vector<std::string>::const_iterator i = schemes.begin();
555 i != schemes.end(); ++i) {
556 if (!MatchesScheme(*i))
557 return false;
560 return true;
563 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
564 // Ignore hostname if scheme is file://.
565 if (scheme_ != url::kFileScheme && !MatchesHost(test))
566 return false;
568 if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
569 return false;
571 return true;
574 bool URLPattern::MatchesPortPattern(const std::string& port) const {
575 return port_ == "*" || port_ == port;
578 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
579 std::vector<std::string> result;
581 if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
582 result.push_back(scheme_);
583 return result;
586 for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
587 if (MatchesScheme(kValidSchemes[i])) {
588 result.push_back(kValidSchemes[i]);
592 return result;
595 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
596 std::vector<std::string> explicit_schemes = GetExplicitSchemes();
597 std::vector<URLPattern> result;
599 for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
600 i != explicit_schemes.end(); ++i) {
601 URLPattern temp = *this;
602 temp.SetScheme(*i);
603 temp.SetMatchAllURLs(false);
604 result.push_back(temp);
607 return result;
610 // static
611 const char* URLPattern::GetParseResultString(
612 URLPattern::ParseResult parse_result) {
613 return kParseResultMessages[parse_result];