extensions/common/url_pattern.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "extensions/common/url_pattern.h"
   6
   7 #include <ostream>
   8
   9 #include "base/strings/string_number_conversions.h"
  10 #include "base/strings/string_piece.h"
  11 #include "base/strings/string_split.h"
  12 #include "base/strings/string_util.h"
  13 #include "base/strings/stringprintf.h"
  14 #include "content/public/common/url_constants.h"
  15 #include "extensions/common/constants.h"
  16 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
  17 #include "url/gurl.h"
  18 #include "url/url_util.h"
  19
// Special pattern string: Parse() recognizes it and switches the pattern into
// "match all URLs" mode instead of parsing scheme/host/path components.
const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
  21
  22 namespace {
  23
  24 // TODO(aa): What about more obscure schemes like data: and javascript: ?
  25 // Note: keep this array in sync with kValidSchemeMasks.
  26 const char* kValidSchemes[] = {
  27     url::kHttpScheme,
  28     url::kHttpsScheme,
  29     url::kFileScheme,
  30     url::kFtpScheme,
  31     content::kChromeUIScheme,
  32     extensions::kExtensionScheme,
  33     url::kFileSystemScheme,
  34 };
  35
  36 const int kValidSchemeMasks[] = {
  37   URLPattern::SCHEME_HTTP,
  38   URLPattern::SCHEME_HTTPS,
  39   URLPattern::SCHEME_FILE,
  40   URLPattern::SCHEME_FTP,
  41   URLPattern::SCHEME_CHROMEUI,
  42   URLPattern::SCHEME_EXTENSION,
  43   URLPattern::SCHEME_FILESYSTEM,
  44 };
  45
  46 static_assert(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
  47               "must keep these arrays in sync");
  48
// Human-readable descriptions of the outcomes that parsing can report. They
// are only referenced through kParseResultMessages below.
const char kParseSuccess[] = "Success.";
const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
const char kParseErrorInvalidScheme[] = "Invalid scheme.";
const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
const char kParseErrorEmptyHost[] = "Host can not be empty.";
const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
const char kParseErrorEmptyPath[] = "Empty path.";
const char kParseErrorInvalidPort[] = "Invalid port.";
const char kParseErrorInvalidHost[] = "Invalid host.";

// Message explaining each URLPattern::ParseResult. GetParseResultString()
// indexes this table directly with the enum value, so the entry order is
// presumably expected to mirror the enumerator order -- confirm against the
// header when adding a result.
const char* const kParseResultMessages[] = {
  kParseSuccess,
  kParseErrorMissingSchemeSeparator,
  kParseErrorInvalidScheme,
  kParseErrorWrongSchemeType,
  kParseErrorEmptyHost,
  kParseErrorInvalidHostWildcard,
  kParseErrorEmptyPath,
  kParseErrorInvalidPort,
  kParseErrorInvalidHost,
};

static_assert(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
              "must add message for each parse result");

// Separator that Parse() searches for to find where the host ends and the
// path begins.
const char kPathSeparator[] = "/";
  76
  77 bool IsStandardScheme(const std::string& scheme) {
  78   // "*" gets the same treatment as a standard scheme.
  79   if (scheme == "*")
  80     return true;
  81
  82   return url::IsStandard(scheme.c_str(),
  83                          url::Component(0, static_cast<int>(scheme.length())));
  84 }
  85
  86 bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
  87   if (port == "*")
  88     return true;
  89
  90   // Only accept non-wildcard ports if the scheme uses ports.
  91   if (url::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
  92       url::PORT_UNSPECIFIED) {
  93     return false;
  94   }
  95
  96   int parsed_port = url::PORT_UNSPECIFIED;
  97   if (!base::StringToInt(port, &parsed_port))
  98     return false;
  99   return (parsed_port >= 0) && (parsed_port < 65536);
 100 }
 101
 102 // Returns |path| with the trailing wildcard stripped if one existed.
 103 //
 104 // The functions that rely on this (OverlapsWith and Contains) are only
 105 // called for the patterns inside URLPatternSet. In those cases, we know that
 106 // the path will have only a single wildcard at the end. This makes figuring
 107 // out overlap much easier. It seems like there is probably a computer-sciency
 108 // way to solve the general case, but we don't need that yet.
 109 std::string StripTrailingWildcard(const std::string& path) {
 110   size_t wildcard_index = path.find('*');
 111   size_t path_last = path.size() - 1;
 112   DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
 113   return wildcard_index == path_last ? path.substr(0, path_last) : path;
 114 }
 115
 116 }  // namespace
 117
 118 // static
 119 bool URLPattern::IsValidSchemeForExtensions(const std::string& scheme) {
 120   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
 121     if (scheme == kValidSchemes[i])
 122       return true;
 123   }
 124   return false;
 125 }
 126
 127 URLPattern::URLPattern()
 128     : valid_schemes_(SCHEME_NONE),
 129       match_all_urls_(false),
 130       match_subdomains_(false),
 131       port_("*") {}
 132
// Creates an unparsed pattern restricted to the schemes in the
// |valid_schemes| bitmask. It starts out not matching all URLs, not matching
// subdomains, and with the wildcard port "*".
URLPattern::URLPattern(int valid_schemes)
    : valid_schemes_(valid_schemes),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {}
 138
 139 URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
 140     // Strict error checking is used, because this constructor is only
 141     // appropriate when we know |pattern| is valid.
 142     : valid_schemes_(valid_schemes),
 143       match_all_urls_(false),
 144       match_subdomains_(false),
 145       port_("*") {
 146   ParseResult result = Parse(pattern);
 147   if (PARSE_SUCCESS != result)
 148     NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result;
 149 }
 150
 151 URLPattern::~URLPattern() {
 152 }
 153
 154 bool URLPattern::operator<(const URLPattern& other) const {
 155   return GetAsString() < other.GetAsString();
 156 }
 157
 158 bool URLPattern::operator>(const URLPattern& other) const {
 159   return GetAsString() > other.GetAsString();
 160 }
 161
 162 bool URLPattern::operator==(const URLPattern& other) const {
 163   return GetAsString() == other.GetAsString();
 164 }
 165
 166 std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
 167   return out << '"' << url_pattern.GetAsString() << '"';
 168 }
 169
 170 URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
 171   spec_.clear();
 172   SetMatchAllURLs(false);
 173   SetMatchSubdomains(false);
 174   SetPort("*");
 175
 176   // Special case pattern to match every valid URL.
 177   if (pattern == kAllUrlsPattern) {
 178     SetMatchAllURLs(true);
 179     return PARSE_SUCCESS;
 180   }
 181
 182   // Parse out the scheme.
 183   size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
 184   bool has_standard_scheme_separator = true;
 185
 186   // Some urls also use ':' alone as the scheme separator.
 187   if (scheme_end_pos == std::string::npos) {
 188     scheme_end_pos = pattern.find(':');
 189     has_standard_scheme_separator = false;
 190   }
 191
 192   if (scheme_end_pos == std::string::npos)
 193     return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
 194
 195   if (!SetScheme(pattern.substr(0, scheme_end_pos)))
 196     return PARSE_ERROR_INVALID_SCHEME;
 197
 198   bool standard_scheme = IsStandardScheme(scheme_);
 199   if (standard_scheme != has_standard_scheme_separator)
 200     return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
 201
 202   // Advance past the scheme separator.
 203   scheme_end_pos +=
 204       (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
 205   if (scheme_end_pos >= pattern.size())
 206     return PARSE_ERROR_EMPTY_HOST;
 207
 208   // Parse out the host and path.
 209   size_t host_start_pos = scheme_end_pos;
 210   size_t path_start_pos = 0;
 211
 212   if (!standard_scheme) {
 213     path_start_pos = host_start_pos;
 214   } else if (scheme_ == url::kFileScheme) {
 215     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
 216     if (host_end_pos == std::string::npos) {
 217       // Allow hostname omission.
 218       // e.g. file://* is interpreted as file:///*,
 219       // file://foo* is interpreted as file:///foo*.
 220       path_start_pos = host_start_pos - 1;
 221     } else {
 222       // Ignore hostname if scheme is file://.
 223       // e.g. file://localhost/foo is equal to file:///foo.
 224       path_start_pos = host_end_pos;
 225     }
 226   } else {
 227     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
 228
 229     // Host is required.
 230     if (host_start_pos == host_end_pos)
 231       return PARSE_ERROR_EMPTY_HOST;
 232
 233     if (host_end_pos == std::string::npos)
 234       return PARSE_ERROR_EMPTY_PATH;
 235
 236     host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
 237
 238     // The first component can optionally be '*' to match all subdomains.
 239     std::vector<std::string> host_components;
 240     base::SplitString(host_, '.', &host_components);
 241
 242     // Could be empty if the host only consists of whitespace characters.
 243     if (host_components.empty())
 244       return PARSE_ERROR_EMPTY_HOST;
 245
 246     if (host_components[0] == "*") {
 247       match_subdomains_ = true;
 248       host_components.erase(host_components.begin(),
 249                             host_components.begin() + 1);
 250     }
 251     host_ = JoinString(host_components, '.');
 252
 253     path_start_pos = host_end_pos;
 254   }
 255
 256   SetPath(pattern.substr(path_start_pos));
 257
 258   size_t port_pos = host_.find(':');
 259   if (port_pos != std::string::npos) {
 260     if (!SetPort(host_.substr(port_pos + 1)))
 261       return PARSE_ERROR_INVALID_PORT;
 262     host_ = host_.substr(0, port_pos);
 263   }
 264
 265   // No other '*' can occur in the host, though. This isn't necessary, but is
 266   // done as a convenience to developers who might otherwise be confused and
 267   // think '*' works as a glob in the host.
 268   if (host_.find('*') != std::string::npos)
 269     return PARSE_ERROR_INVALID_HOST_WILDCARD;
 270
 271   // Null characters are not allowed in hosts.
 272   if (host_.find('\0') != std::string::npos)
 273     return PARSE_ERROR_INVALID_HOST;
 274
 275   return PARSE_SUCCESS;
 276 }
 277
 278 void URLPattern::SetValidSchemes(int valid_schemes) {
 279   spec_.clear();
 280   valid_schemes_ = valid_schemes;
 281 }
 282
 283 void URLPattern::SetHost(const std::string& host) {
 284   spec_.clear();
 285   host_ = host;
 286 }
 287
 288 void URLPattern::SetMatchAllURLs(bool val) {
 289   spec_.clear();
 290   match_all_urls_ = val;
 291
 292   if (val) {
 293     match_subdomains_ = true;
 294     scheme_ = "*";
 295     host_.clear();
 296     SetPath("/*");
 297   }
 298 }
 299
 300 void URLPattern::SetMatchSubdomains(bool val) {
 301   spec_.clear();
 302   match_subdomains_ = val;
 303 }
 304
 305 bool URLPattern::SetScheme(const std::string& scheme) {
 306   spec_.clear();
 307   scheme_ = scheme;
 308   if (scheme_ == "*") {
 309     valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
 310   } else if (!IsValidScheme(scheme_)) {
 311     return false;
 312   }
 313   return true;
 314 }
 315
 316 bool URLPattern::IsValidScheme(const std::string& scheme) const {
 317   if (valid_schemes_ == SCHEME_ALL)
 318     return true;
 319
 320   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
 321     if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
 322       return true;
 323   }
 324
 325   return false;
 326 }
 327
 328 void URLPattern::SetPath(const std::string& path) {
 329   spec_.clear();
 330   path_ = path;
 331   path_escaped_ = path_;
 332   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
 333   ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
 334 }
 335
 336 bool URLPattern::SetPort(const std::string& port) {
 337   spec_.clear();
 338   if (IsValidPortForScheme(scheme_, port)) {
 339     port_ = port;
 340     return true;
 341   }
 342   return false;
 343 }
 344
 345 bool URLPattern::MatchesURL(const GURL& test) const {
 346   const GURL* test_url = &test;
 347   bool has_inner_url = test.inner_url() != NULL;
 348
 349   if (has_inner_url) {
 350     if (!test.SchemeIsFileSystem())
 351       return false;  // The only nested URLs we handle are filesystem URLs.
 352     test_url = test.inner_url();
 353   }
 354
 355   if (!MatchesScheme(test_url->scheme()))
 356     return false;
 357
 358   if (match_all_urls_)
 359     return true;
 360
 361   std::string path_for_request = test.PathForRequest();
 362   if (has_inner_url)
 363     path_for_request = test_url->path() + path_for_request;
 364
 365   return MatchesSecurityOriginHelper(*test_url) &&
 366          MatchesPath(path_for_request);
 367 }
 368
 369 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
 370   const GURL* test_url = &test;
 371   bool has_inner_url = test.inner_url() != NULL;
 372
 373   if (has_inner_url) {
 374     if (!test.SchemeIsFileSystem())
 375       return false;  // The only nested URLs we handle are filesystem URLs.
 376     test_url = test.inner_url();
 377   }
 378
 379   if (!MatchesScheme(test_url->scheme()))
 380     return false;
 381
 382   if (match_all_urls_)
 383     return true;
 384
 385   return MatchesSecurityOriginHelper(*test_url);
 386 }
 387
 388 bool URLPattern::MatchesScheme(const std::string& test) const {
 389   if (!IsValidScheme(test))
 390     return false;
 391
 392   return scheme_ == "*" || test == scheme_;
 393 }
 394
 395 bool URLPattern::MatchesHost(const std::string& host) const {
 396   std::string test(url::kHttpScheme);
 397   test += url::kStandardSchemeSeparator;
 398   test += host;
 399   test += "/";
 400   return MatchesHost(GURL(test));
 401 }
 402
 403 bool URLPattern::MatchesHost(const GURL& test) const {
 404   // If the hosts are exactly equal, we have a match.
 405   if (test.host() == host_)
 406     return true;
 407
 408   // If we're matching subdomains, and we have no host in the match pattern,
 409   // that means that we're matching all hosts, which means we have a match no
 410   // matter what the test host is.
 411   if (match_subdomains_ && host_.empty())
 412     return true;
 413
 414   // Otherwise, we can only match if our match pattern matches subdomains.
 415   if (!match_subdomains_)
 416     return false;
 417
 418   // We don't do subdomain matching against IP addresses, so we can give up now
 419   // if the test host is an IP address.
 420   if (test.HostIsIPAddress())
 421     return false;
 422
 423   // Check if the test host is a subdomain of our host.
 424   if (test.host().length() <= (host_.length() + 1))
 425     return false;
 426
 427   if (test.host().compare(test.host().length() - host_.length(),
 428                           host_.length(), host_) != 0)
 429     return false;
 430
 431   return test.host()[test.host().length() - host_.length() - 1] == '.';
 432 }
 433
 434 bool URLPattern::ImpliesAllHosts() const {
 435   // Check if it matches all urls or is a pattern like http://*/*.
 436   if (match_all_urls_ ||
 437       (match_subdomains_ && host_.empty() && port_ == "*" && path_ == "/*")) {
 438     return true;
 439   }
 440
 441   // If this doesn't even match subdomains, it can't possibly imply all hosts.
 442   if (!match_subdomains_)
 443     return false;
 444
 445   // If |host_| is a recognized TLD, this will be 0. We don't include private
 446   // TLDs, so that, e.g., *.appspot.com does not imply all hosts.
 447   size_t registry_length = net::registry_controlled_domains::GetRegistryLength(
 448       host_,
 449       net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
 450       net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
 451   // If there was more than just a TLD in the host (e.g., *.foobar.com), it
 452   // doesn't imply all hosts.
 453   if (registry_length > 0)
 454     return false;
 455
 456   // At this point the host could either be just a TLD ("com") or some unknown
 457   // TLD-like string ("notatld"). To disambiguate between them construct a
 458   // fake URL, and check the registry. This returns 0 if the TLD is
 459   // unrecognized, or the length of the recognized TLD.
 460   registry_length = net::registry_controlled_domains::GetRegistryLength(
 461       base::StringPrintf("foo.%s", host_.c_str()),
 462       net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
 463       net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
 464   // If we recognized this TLD, then this is a pattern like *.com, and it
 465   // should imply all hosts. Otherwise, this doesn't imply all hosts.
 466   return registry_length > 0;
 467 }
 468
 469 bool URLPattern::MatchesSingleOrigin() const {
 470   // Strictly speaking, the port is part of the origin, but in URLPattern it
 471   // defaults to *. It's not very interesting anyway, so leave it out.
 472   return !ImpliesAllHosts() && scheme_ != "*" && !match_subdomains_;
 473 }
 474
 475 bool URLPattern::MatchesPath(const std::string& test) const {
 476   // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
 477   // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
 478   if (test + "/*" == path_escaped_)
 479     return true;
 480
 481   return MatchPattern(test, path_escaped_);
 482 }
 483
 484 const std::string& URLPattern::GetAsString() const {
 485   if (!spec_.empty())
 486     return spec_;
 487
 488   if (match_all_urls_) {
 489     spec_ = kAllUrlsPattern;
 490     return spec_;
 491   }
 492
 493   bool standard_scheme = IsStandardScheme(scheme_);
 494
 495   std::string spec = scheme_ +
 496       (standard_scheme ? url::kStandardSchemeSeparator : ":");
 497
 498   if (scheme_ != url::kFileScheme && standard_scheme) {
 499     if (match_subdomains_) {
 500       spec += "*";
 501       if (!host_.empty())
 502         spec += ".";
 503     }
 504
 505     if (!host_.empty())
 506       spec += host_;
 507
 508     if (port_ != "*") {
 509       spec += ":";
 510       spec += port_;
 511     }
 512   }
 513
 514   if (!path_.empty())
 515     spec += path_;
 516
 517   spec_ = spec;
 518   return spec_;
 519 }
 520
 521 bool URLPattern::OverlapsWith(const URLPattern& other) const {
 522   if (match_all_urls() || other.match_all_urls())
 523     return true;
 524   return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
 525           other.MatchesAnyScheme(GetExplicitSchemes()))
 526       && (MatchesHost(other.host()) || other.MatchesHost(host()))
 527       && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
 528       && (MatchesPath(StripTrailingWildcard(other.path())) ||
 529           other.MatchesPath(StripTrailingWildcard(path())));
 530 }
 531
 532 bool URLPattern::Contains(const URLPattern& other) const {
 533   if (match_all_urls())
 534     return true;
 535   return MatchesAllSchemes(other.GetExplicitSchemes())
 536       && MatchesHost(other.host())
 537       && MatchesPortPattern(other.port())
 538       && MatchesPath(StripTrailingWildcard(other.path()));
 539 }
 540
 541 bool URLPattern::MatchesAnyScheme(
 542     const std::vector<std::string>& schemes) const {
 543   for (std::vector<std::string>::const_iterator i = schemes.begin();
 544        i != schemes.end(); ++i) {
 545     if (MatchesScheme(*i))
 546       return true;
 547   }
 548
 549   return false;
 550 }
 551
 552 bool URLPattern::MatchesAllSchemes(
 553     const std::vector<std::string>& schemes) const {
 554   for (std::vector<std::string>::const_iterator i = schemes.begin();
 555        i != schemes.end(); ++i) {
 556     if (!MatchesScheme(*i))
 557       return false;
 558   }
 559
 560   return true;
 561 }
 562
 563 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
 564   // Ignore hostname if scheme is file://.
 565   if (scheme_ != url::kFileScheme && !MatchesHost(test))
 566     return false;
 567
 568   if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
 569     return false;
 570
 571   return true;
 572 }
 573
 574 bool URLPattern::MatchesPortPattern(const std::string& port) const {
 575   return port_ == "*" || port_ == port;
 576 }
 577
 578 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
 579   std::vector<std::string> result;
 580
 581   if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
 582     result.push_back(scheme_);
 583     return result;
 584   }
 585
 586   for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
 587     if (MatchesScheme(kValidSchemes[i])) {
 588       result.push_back(kValidSchemes[i]);
 589     }
 590   }
 591
 592   return result;
 593 }
 594
 595 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
 596   std::vector<std::string> explicit_schemes = GetExplicitSchemes();
 597   std::vector<URLPattern> result;
 598
 599   for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
 600        i != explicit_schemes.end(); ++i) {
 601     URLPattern temp = *this;
 602     temp.SetScheme(*i);
 603     temp.SetMatchAllURLs(false);
 604     result.push_back(temp);
 605   }
 606
 607   return result;
 608 }
 609
 610 // static
 611 const char* URLPattern::GetParseResultString(
 612     URLPattern::ParseResult parse_result) {
 613   return kParseResultMessages[parse_result];
 614 }