Migrate to github hosting of CLD2 project.
[chromium-blink-merge.git] / extensions / common / url_pattern.cc
blob b91d4383704388b8f1affad2073ddd856de01eb9
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "extensions/common/url_pattern.h"
7 #include <ostream>
9 #include "base/strings/pattern.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_piece.h"
12 #include "base/strings/string_split.h"
13 #include "base/strings/string_util.h"
14 #include "base/strings/stringprintf.h"
15 #include "content/public/common/url_constants.h"
16 #include "extensions/common/constants.h"
17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
18 #include "url/gurl.h"
19 #include "url/url_util.h"
21 const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
23 namespace {
25 // TODO(aa): What about more obscure schemes like data: and javascript: ?
26 // Note: keep this array in sync with kValidSchemeMasks.
27 const char* kValidSchemes[] = {
28 url::kHttpScheme,
29 url::kHttpsScheme,
30 url::kFileScheme,
31 url::kFtpScheme,
32 content::kChromeUIScheme,
33 extensions::kExtensionScheme,
34 url::kFileSystemScheme,
37 const int kValidSchemeMasks[] = {
38 URLPattern::SCHEME_HTTP,
39 URLPattern::SCHEME_HTTPS,
40 URLPattern::SCHEME_FILE,
41 URLPattern::SCHEME_FTP,
42 URLPattern::SCHEME_CHROMEUI,
43 URLPattern::SCHEME_EXTENSION,
44 URLPattern::SCHEME_FILESYSTEM,
47 static_assert(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
48 "must keep these arrays in sync");
50 const char kParseSuccess[] = "Success.";
51 const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
52 const char kParseErrorInvalidScheme[] = "Invalid scheme.";
53 const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
54 const char kParseErrorEmptyHost[] = "Host can not be empty.";
55 const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
56 const char kParseErrorEmptyPath[] = "Empty path.";
57 const char kParseErrorInvalidPort[] = "Invalid port.";
58 const char kParseErrorInvalidHost[] = "Invalid host.";
60 // Message explaining each URLPattern::ParseResult.
61 const char* const kParseResultMessages[] = {
62 kParseSuccess,
63 kParseErrorMissingSchemeSeparator,
64 kParseErrorInvalidScheme,
65 kParseErrorWrongSchemeType,
66 kParseErrorEmptyHost,
67 kParseErrorInvalidHostWildcard,
68 kParseErrorEmptyPath,
69 kParseErrorInvalidPort,
70 kParseErrorInvalidHost,
73 static_assert(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
74 "must add message for each parse result");
76 const char kPathSeparator[] = "/";
78 bool IsStandardScheme(const std::string& scheme) {
79 // "*" gets the same treatment as a standard scheme.
80 if (scheme == "*")
81 return true;
83 return url::IsStandard(scheme.c_str(),
84 url::Component(0, static_cast<int>(scheme.length())));
87 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
88 if (port == "*")
89 return true;
91 // Only accept non-wildcard ports if the scheme uses ports.
92 if (url::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
93 url::PORT_UNSPECIFIED) {
94 return false;
97 int parsed_port = url::PORT_UNSPECIFIED;
98 if (!base::StringToInt(port, &parsed_port))
99 return false;
100 return (parsed_port >= 0) && (parsed_port < 65536);
103 // Returns |path| with the trailing wildcard stripped if one existed.
105 // The functions that rely on this (OverlapsWith and Contains) are only
106 // called for the patterns inside URLPatternSet. In those cases, we know that
107 // the path will have only a single wildcard at the end. This makes figuring
108 // out overlap much easier. It seems like there is probably a computer-sciency
109 // way to solve the general case, but we don't need that yet.
110 std::string StripTrailingWildcard(const std::string& path) {
111 size_t wildcard_index = path.find('*');
112 size_t path_last = path.size() - 1;
113 DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
114 return wildcard_index == path_last ? path.substr(0, path_last) : path;
117 } // namespace
119 // static
120 bool URLPattern::IsValidSchemeForExtensions(const std::string& scheme) {
121 for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
122 if (scheme == kValidSchemes[i])
123 return true;
125 return false;
128 URLPattern::URLPattern()
129 : valid_schemes_(SCHEME_NONE),
130 match_all_urls_(false),
131 match_subdomains_(false),
132 port_("*") {}
134 URLPattern::URLPattern(int valid_schemes)
135 : valid_schemes_(valid_schemes),
136 match_all_urls_(false),
137 match_subdomains_(false),
138 port_("*") {}
140 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
141 // Strict error checking is used, because this constructor is only
142 // appropriate when we know |pattern| is valid.
143 : valid_schemes_(valid_schemes),
144 match_all_urls_(false),
145 match_subdomains_(false),
146 port_("*") {
147 ParseResult result = Parse(pattern);
148 if (PARSE_SUCCESS != result)
149 NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result;
152 URLPattern::~URLPattern() {
155 bool URLPattern::operator<(const URLPattern& other) const {
156 return GetAsString() < other.GetAsString();
159 bool URLPattern::operator>(const URLPattern& other) const {
160 return GetAsString() > other.GetAsString();
163 bool URLPattern::operator==(const URLPattern& other) const {
164 return GetAsString() == other.GetAsString();
167 std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
168 return out << '"' << url_pattern.GetAsString() << '"';
171 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
172 spec_.clear();
173 SetMatchAllURLs(false);
174 SetMatchSubdomains(false);
175 SetPort("*");
177 // Special case pattern to match every valid URL.
178 if (pattern == kAllUrlsPattern) {
179 SetMatchAllURLs(true);
180 return PARSE_SUCCESS;
183 // Parse out the scheme.
184 size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
185 bool has_standard_scheme_separator = true;
187 // Some urls also use ':' alone as the scheme separator.
188 if (scheme_end_pos == std::string::npos) {
189 scheme_end_pos = pattern.find(':');
190 has_standard_scheme_separator = false;
193 if (scheme_end_pos == std::string::npos)
194 return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
196 if (!SetScheme(pattern.substr(0, scheme_end_pos)))
197 return PARSE_ERROR_INVALID_SCHEME;
199 bool standard_scheme = IsStandardScheme(scheme_);
200 if (standard_scheme != has_standard_scheme_separator)
201 return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
203 // Advance past the scheme separator.
204 scheme_end_pos +=
205 (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
206 if (scheme_end_pos >= pattern.size())
207 return PARSE_ERROR_EMPTY_HOST;
209 // Parse out the host and path.
210 size_t host_start_pos = scheme_end_pos;
211 size_t path_start_pos = 0;
213 if (!standard_scheme) {
214 path_start_pos = host_start_pos;
215 } else if (scheme_ == url::kFileScheme) {
216 size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
217 if (host_end_pos == std::string::npos) {
218 // Allow hostname omission.
219 // e.g. file://* is interpreted as file:///*,
220 // file://foo* is interpreted as file:///foo*.
221 path_start_pos = host_start_pos - 1;
222 } else {
223 // Ignore hostname if scheme is file://.
224 // e.g. file://localhost/foo is equal to file:///foo.
225 path_start_pos = host_end_pos;
227 } else {
228 size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
230 // Host is required.
231 if (host_start_pos == host_end_pos)
232 return PARSE_ERROR_EMPTY_HOST;
234 if (host_end_pos == std::string::npos)
235 return PARSE_ERROR_EMPTY_PATH;
237 host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
239 // The first component can optionally be '*' to match all subdomains.
240 std::vector<std::string> host_components = base::SplitString(
241 host_, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
243 // Could be empty if the host only consists of whitespace characters.
244 if (host_components.empty() ||
245 (host_components.size() == 1 && host_components[0].empty()))
246 return PARSE_ERROR_EMPTY_HOST;
248 if (host_components[0] == "*") {
249 match_subdomains_ = true;
250 host_components.erase(host_components.begin(),
251 host_components.begin() + 1);
253 host_ = base::JoinString(host_components, ".");
255 path_start_pos = host_end_pos;
258 SetPath(pattern.substr(path_start_pos));
260 size_t port_pos = host_.find(':');
261 if (port_pos != std::string::npos) {
262 if (!SetPort(host_.substr(port_pos + 1)))
263 return PARSE_ERROR_INVALID_PORT;
264 host_ = host_.substr(0, port_pos);
267 // No other '*' can occur in the host, though. This isn't necessary, but is
268 // done as a convenience to developers who might otherwise be confused and
269 // think '*' works as a glob in the host.
270 if (host_.find('*') != std::string::npos)
271 return PARSE_ERROR_INVALID_HOST_WILDCARD;
273 // Null characters are not allowed in hosts.
274 if (host_.find('\0') != std::string::npos)
275 return PARSE_ERROR_INVALID_HOST;
277 return PARSE_SUCCESS;
280 void URLPattern::SetValidSchemes(int valid_schemes) {
281 spec_.clear();
282 valid_schemes_ = valid_schemes;
285 void URLPattern::SetHost(const std::string& host) {
286 spec_.clear();
287 host_ = host;
290 void URLPattern::SetMatchAllURLs(bool val) {
291 spec_.clear();
292 match_all_urls_ = val;
294 if (val) {
295 match_subdomains_ = true;
296 scheme_ = "*";
297 host_.clear();
298 SetPath("/*");
302 void URLPattern::SetMatchSubdomains(bool val) {
303 spec_.clear();
304 match_subdomains_ = val;
307 bool URLPattern::SetScheme(const std::string& scheme) {
308 spec_.clear();
309 scheme_ = scheme;
310 if (scheme_ == "*") {
311 valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
312 } else if (!IsValidScheme(scheme_)) {
313 return false;
315 return true;
318 bool URLPattern::IsValidScheme(const std::string& scheme) const {
319 if (valid_schemes_ == SCHEME_ALL)
320 return true;
322 for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
323 if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
324 return true;
327 return false;
330 void URLPattern::SetPath(const std::string& path) {
331 spec_.clear();
332 path_ = path;
333 path_escaped_ = path_;
334 base::ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
335 base::ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
338 bool URLPattern::SetPort(const std::string& port) {
339 spec_.clear();
340 if (IsValidPortForScheme(scheme_, port)) {
341 port_ = port;
342 return true;
344 return false;
347 bool URLPattern::MatchesURL(const GURL& test) const {
348 const GURL* test_url = &test;
349 bool has_inner_url = test.inner_url() != NULL;
351 if (has_inner_url) {
352 if (!test.SchemeIsFileSystem())
353 return false; // The only nested URLs we handle are filesystem URLs.
354 test_url = test.inner_url();
357 if (!MatchesScheme(test_url->scheme()))
358 return false;
360 if (match_all_urls_)
361 return true;
363 std::string path_for_request = test.PathForRequest();
364 if (has_inner_url)
365 path_for_request = test_url->path() + path_for_request;
367 return MatchesSecurityOriginHelper(*test_url) &&
368 MatchesPath(path_for_request);
371 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
372 const GURL* test_url = &test;
373 bool has_inner_url = test.inner_url() != NULL;
375 if (has_inner_url) {
376 if (!test.SchemeIsFileSystem())
377 return false; // The only nested URLs we handle are filesystem URLs.
378 test_url = test.inner_url();
381 if (!MatchesScheme(test_url->scheme()))
382 return false;
384 if (match_all_urls_)
385 return true;
387 return MatchesSecurityOriginHelper(*test_url);
390 bool URLPattern::MatchesScheme(const std::string& test) const {
391 if (!IsValidScheme(test))
392 return false;
394 return scheme_ == "*" || test == scheme_;
397 bool URLPattern::MatchesHost(const std::string& host) const {
398 std::string test(url::kHttpScheme);
399 test += url::kStandardSchemeSeparator;
400 test += host;
401 test += "/";
402 return MatchesHost(GURL(test));
405 bool URLPattern::MatchesHost(const GURL& test) const {
406 // If the hosts are exactly equal, we have a match.
407 if (test.host() == host_)
408 return true;
410 // If we're matching subdomains, and we have no host in the match pattern,
411 // that means that we're matching all hosts, which means we have a match no
412 // matter what the test host is.
413 if (match_subdomains_ && host_.empty())
414 return true;
416 // Otherwise, we can only match if our match pattern matches subdomains.
417 if (!match_subdomains_)
418 return false;
420 // We don't do subdomain matching against IP addresses, so we can give up now
421 // if the test host is an IP address.
422 if (test.HostIsIPAddress())
423 return false;
425 // Check if the test host is a subdomain of our host.
426 if (test.host().length() <= (host_.length() + 1))
427 return false;
429 if (test.host().compare(test.host().length() - host_.length(),
430 host_.length(), host_) != 0)
431 return false;
433 return test.host()[test.host().length() - host_.length() - 1] == '.';
436 bool URLPattern::ImpliesAllHosts() const {
437 // Check if it matches all urls or is a pattern like http://*/*.
438 if (match_all_urls_ ||
439 (match_subdomains_ && host_.empty() && port_ == "*" && path_ == "/*")) {
440 return true;
443 // If this doesn't even match subdomains, it can't possibly imply all hosts.
444 if (!match_subdomains_)
445 return false;
447 // If |host_| is a recognized TLD, this will be 0. We don't include private
448 // TLDs, so that, e.g., *.appspot.com does not imply all hosts.
449 size_t registry_length = net::registry_controlled_domains::GetRegistryLength(
450 host_,
451 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
452 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
453 // If there was more than just a TLD in the host (e.g., *.foobar.com), it
454 // doesn't imply all hosts.
455 if (registry_length > 0)
456 return false;
458 // At this point the host could either be just a TLD ("com") or some unknown
459 // TLD-like string ("notatld"). To disambiguate between them construct a
460 // fake URL, and check the registry. This returns 0 if the TLD is
461 // unrecognized, or the length of the recognized TLD.
462 registry_length = net::registry_controlled_domains::GetRegistryLength(
463 base::StringPrintf("foo.%s", host_.c_str()),
464 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
465 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
466 // If we recognized this TLD, then this is a pattern like *.com, and it
467 // should imply all hosts. Otherwise, this doesn't imply all hosts.
468 return registry_length > 0;
471 bool URLPattern::MatchesSingleOrigin() const {
472 // Strictly speaking, the port is part of the origin, but in URLPattern it
473 // defaults to *. It's not very interesting anyway, so leave it out.
474 return !ImpliesAllHosts() && scheme_ != "*" && !match_subdomains_;
477 bool URLPattern::MatchesPath(const std::string& test) const {
478 // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
479 // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
480 if (test + "/*" == path_escaped_)
481 return true;
483 return base::MatchPattern(test, path_escaped_);
486 const std::string& URLPattern::GetAsString() const {
487 if (!spec_.empty())
488 return spec_;
490 if (match_all_urls_) {
491 spec_ = kAllUrlsPattern;
492 return spec_;
495 bool standard_scheme = IsStandardScheme(scheme_);
497 std::string spec = scheme_ +
498 (standard_scheme ? url::kStandardSchemeSeparator : ":");
500 if (scheme_ != url::kFileScheme && standard_scheme) {
501 if (match_subdomains_) {
502 spec += "*";
503 if (!host_.empty())
504 spec += ".";
507 if (!host_.empty())
508 spec += host_;
510 if (port_ != "*") {
511 spec += ":";
512 spec += port_;
516 if (!path_.empty())
517 spec += path_;
519 spec_ = spec;
520 return spec_;
523 bool URLPattern::OverlapsWith(const URLPattern& other) const {
524 if (match_all_urls() || other.match_all_urls())
525 return true;
526 return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
527 other.MatchesAnyScheme(GetExplicitSchemes()))
528 && (MatchesHost(other.host()) || other.MatchesHost(host()))
529 && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
530 && (MatchesPath(StripTrailingWildcard(other.path())) ||
531 other.MatchesPath(StripTrailingWildcard(path())));
534 bool URLPattern::Contains(const URLPattern& other) const {
535 if (match_all_urls())
536 return true;
537 return MatchesAllSchemes(other.GetExplicitSchemes())
538 && MatchesHost(other.host())
539 && MatchesPortPattern(other.port())
540 && MatchesPath(StripTrailingWildcard(other.path()));
543 bool URLPattern::MatchesAnyScheme(
544 const std::vector<std::string>& schemes) const {
545 for (std::vector<std::string>::const_iterator i = schemes.begin();
546 i != schemes.end(); ++i) {
547 if (MatchesScheme(*i))
548 return true;
551 return false;
554 bool URLPattern::MatchesAllSchemes(
555 const std::vector<std::string>& schemes) const {
556 for (std::vector<std::string>::const_iterator i = schemes.begin();
557 i != schemes.end(); ++i) {
558 if (!MatchesScheme(*i))
559 return false;
562 return true;
565 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
566 // Ignore hostname if scheme is file://.
567 if (scheme_ != url::kFileScheme && !MatchesHost(test))
568 return false;
570 if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
571 return false;
573 return true;
576 bool URLPattern::MatchesPortPattern(const std::string& port) const {
577 return port_ == "*" || port_ == port;
580 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
581 std::vector<std::string> result;
583 if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
584 result.push_back(scheme_);
585 return result;
588 for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
589 if (MatchesScheme(kValidSchemes[i])) {
590 result.push_back(kValidSchemes[i]);
594 return result;
597 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
598 std::vector<std::string> explicit_schemes = GetExplicitSchemes();
599 std::vector<URLPattern> result;
601 for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
602 i != explicit_schemes.end(); ++i) {
603 URLPattern temp = *this;
604 temp.SetScheme(*i);
605 temp.SetMatchAllURLs(false);
606 result.push_back(temp);
609 return result;
612 // static
613 const char* URLPattern::GetParseResultString(
614 URLPattern::ParseResult parse_result) {
615 return kParseResultMessages[parse_result];