1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "extensions/common/url_pattern.h"
9 #include "base/strings/pattern.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_piece.h"
12 #include "base/strings/string_split.h"
13 #include "base/strings/string_util.h"
14 #include "base/strings/stringprintf.h"
15 #include "content/public/common/url_constants.h"
16 #include "extensions/common/constants.h"
17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
19 #include "url/url_util.h"
21 const char URLPattern::kAllUrlsPattern
[] = "<all_urls>";
25 // TODO(aa): What about more obscure schemes like data: and javascript: ?
26 // Note: keep this array in sync with kValidSchemeMasks.
27 const char* kValidSchemes
[] = {
32 content::kChromeUIScheme
,
33 extensions::kExtensionScheme
,
34 url::kFileSystemScheme
,
37 const int kValidSchemeMasks
[] = {
38 URLPattern::SCHEME_HTTP
,
39 URLPattern::SCHEME_HTTPS
,
40 URLPattern::SCHEME_FILE
,
41 URLPattern::SCHEME_FTP
,
42 URLPattern::SCHEME_CHROMEUI
,
43 URLPattern::SCHEME_EXTENSION
,
44 URLPattern::SCHEME_FILESYSTEM
,
47 static_assert(arraysize(kValidSchemes
) == arraysize(kValidSchemeMasks
),
48 "must keep these arrays in sync");
// Human-readable text for each parse outcome. These are collected into
// kParseResultMessages, so keep the declaration order in step with that table.
const char kParseSuccess[] = "Success.";
const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
const char kParseErrorInvalidScheme[] = "Invalid scheme.";
const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
const char kParseErrorEmptyHost[] = "Host can not be empty.";
const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
const char kParseErrorEmptyPath[] = "Empty path.";
const char kParseErrorInvalidPort[] = "Invalid port.";
const char kParseErrorInvalidHost[] = "Invalid host.";
60 // Message explaining each URLPattern::ParseResult.
61 const char* const kParseResultMessages
[] = {
63 kParseErrorMissingSchemeSeparator
,
64 kParseErrorInvalidScheme
,
65 kParseErrorWrongSchemeType
,
67 kParseErrorInvalidHostWildcard
,
69 kParseErrorInvalidPort
,
70 kParseErrorInvalidHost
,
73 static_assert(URLPattern::NUM_PARSE_RESULTS
== arraysize(kParseResultMessages
),
74 "must add message for each parse result");
// Separator searched for when locating where the host ends and the path
// begins inside a pattern string.
const char kPathSeparator[] = "/";
78 bool IsStandardScheme(const std::string
& scheme
) {
79 // "*" gets the same treatment as a standard scheme.
83 return url::IsStandard(scheme
.c_str(),
84 url::Component(0, static_cast<int>(scheme
.length())));
87 bool IsValidPortForScheme(const std::string
& scheme
, const std::string
& port
) {
91 // Only accept non-wildcard ports if the scheme uses ports.
92 if (url::DefaultPortForScheme(scheme
.c_str(), scheme
.length()) ==
93 url::PORT_UNSPECIFIED
) {
97 int parsed_port
= url::PORT_UNSPECIFIED
;
98 if (!base::StringToInt(port
, &parsed_port
))
100 return (parsed_port
>= 0) && (parsed_port
< 65536);
// Returns |path| with the trailing wildcard stripped if one existed.
//
// The functions that rely on this (OverlapsWith and Contains) are only
// called for the patterns inside URLPatternSet. In those cases, we know that
// the path will have only a single wildcard at the end. This makes figuring
// out overlap much easier. It seems like there is probably a computer-sciency
// way to solve the general case, but we don't need that yet.
std::string StripTrailingWildcard(const std::string& path) {
  // Handle the empty path explicitly instead of relying on size() - 1
  // wrapping around to std::string::npos.
  if (path.empty())
    return path;

  // The wildcard is only stripped when the first '*' in |path| is also its
  // final character; any other shape is returned unchanged.
  const size_t last_index = path.size() - 1;
  if (path.find('*') != last_index)
    return path;

  return path.substr(0, last_index);
}
119 bool URLPattern::IsValidSchemeForExtensions(const std::string
& scheme
) {
120 for (size_t i
= 0; i
< arraysize(kValidSchemes
); ++i
) {
121 if (scheme
== kValidSchemes
[i
])
127 URLPattern::URLPattern()
128 : valid_schemes_(SCHEME_NONE
),
129 match_all_urls_(false),
130 match_subdomains_(false),
133 URLPattern::URLPattern(int valid_schemes
)
134 : valid_schemes_(valid_schemes
),
135 match_all_urls_(false),
136 match_subdomains_(false),
139 URLPattern::URLPattern(int valid_schemes
, const std::string
& pattern
)
140 // Strict error checking is used, because this constructor is only
141 // appropriate when we know |pattern| is valid.
142 : valid_schemes_(valid_schemes
),
143 match_all_urls_(false),
144 match_subdomains_(false),
146 ParseResult result
= Parse(pattern
);
147 if (PARSE_SUCCESS
!= result
)
148 NOTREACHED() << "URLPattern invalid: " << pattern
<< " result " << result
;
151 URLPattern::~URLPattern() {
154 bool URLPattern::operator<(const URLPattern
& other
) const {
155 return GetAsString() < other
.GetAsString();
158 bool URLPattern::operator>(const URLPattern
& other
) const {
159 return GetAsString() > other
.GetAsString();
162 bool URLPattern::operator==(const URLPattern
& other
) const {
163 return GetAsString() == other
.GetAsString();
166 std::ostream
& operator<<(std::ostream
& out
, const URLPattern
& url_pattern
) {
167 return out
<< '"' << url_pattern
.GetAsString() << '"';
170 URLPattern::ParseResult
URLPattern::Parse(const std::string
& pattern
) {
172 SetMatchAllURLs(false);
173 SetMatchSubdomains(false);
176 // Special case pattern to match every valid URL.
177 if (pattern
== kAllUrlsPattern
) {
178 SetMatchAllURLs(true);
179 return PARSE_SUCCESS
;
182 // Parse out the scheme.
183 size_t scheme_end_pos
= pattern
.find(url::kStandardSchemeSeparator
);
184 bool has_standard_scheme_separator
= true;
186 // Some urls also use ':' alone as the scheme separator.
187 if (scheme_end_pos
== std::string::npos
) {
188 scheme_end_pos
= pattern
.find(':');
189 has_standard_scheme_separator
= false;
192 if (scheme_end_pos
== std::string::npos
)
193 return PARSE_ERROR_MISSING_SCHEME_SEPARATOR
;
195 if (!SetScheme(pattern
.substr(0, scheme_end_pos
)))
196 return PARSE_ERROR_INVALID_SCHEME
;
198 bool standard_scheme
= IsStandardScheme(scheme_
);
199 if (standard_scheme
!= has_standard_scheme_separator
)
200 return PARSE_ERROR_WRONG_SCHEME_SEPARATOR
;
202 // Advance past the scheme separator.
204 (standard_scheme
? strlen(url::kStandardSchemeSeparator
) : 1);
205 if (scheme_end_pos
>= pattern
.size())
206 return PARSE_ERROR_EMPTY_HOST
;
208 // Parse out the host and path.
209 size_t host_start_pos
= scheme_end_pos
;
210 size_t path_start_pos
= 0;
212 if (!standard_scheme
) {
213 path_start_pos
= host_start_pos
;
214 } else if (scheme_
== url::kFileScheme
) {
215 size_t host_end_pos
= pattern
.find(kPathSeparator
, host_start_pos
);
216 if (host_end_pos
== std::string::npos
) {
217 // Allow hostname omission.
218 // e.g. file://* is interpreted as file:///*,
219 // file://foo* is interpreted as file:///foo*.
220 path_start_pos
= host_start_pos
- 1;
222 // Ignore hostname if scheme is file://.
223 // e.g. file://localhost/foo is equal to file:///foo.
224 path_start_pos
= host_end_pos
;
227 size_t host_end_pos
= pattern
.find(kPathSeparator
, host_start_pos
);
230 if (host_start_pos
== host_end_pos
)
231 return PARSE_ERROR_EMPTY_HOST
;
233 if (host_end_pos
== std::string::npos
)
234 return PARSE_ERROR_EMPTY_PATH
;
236 host_
= pattern
.substr(host_start_pos
, host_end_pos
- host_start_pos
);
238 // The first component can optionally be '*' to match all subdomains.
239 std::vector
<std::string
> host_components
= base::SplitString(
240 host_
, ".", base::TRIM_WHITESPACE
, base::SPLIT_WANT_ALL
);
242 // Could be empty if the host only consists of whitespace characters.
243 if (host_components
.empty() ||
244 (host_components
.size() == 1 && host_components
[0].empty()))
245 return PARSE_ERROR_EMPTY_HOST
;
247 if (host_components
[0] == "*") {
248 match_subdomains_
= true;
249 host_components
.erase(host_components
.begin(),
250 host_components
.begin() + 1);
252 host_
= base::JoinString(host_components
, ".");
254 path_start_pos
= host_end_pos
;
257 SetPath(pattern
.substr(path_start_pos
));
259 size_t port_pos
= host_
.find(':');
260 if (port_pos
!= std::string::npos
) {
261 if (!SetPort(host_
.substr(port_pos
+ 1)))
262 return PARSE_ERROR_INVALID_PORT
;
263 host_
= host_
.substr(0, port_pos
);
266 // No other '*' can occur in the host, though. This isn't necessary, but is
267 // done as a convenience to developers who might otherwise be confused and
268 // think '*' works as a glob in the host.
269 if (host_
.find('*') != std::string::npos
)
270 return PARSE_ERROR_INVALID_HOST_WILDCARD
;
272 // Null characters are not allowed in hosts.
273 if (host_
.find('\0') != std::string::npos
)
274 return PARSE_ERROR_INVALID_HOST
;
276 return PARSE_SUCCESS
;
279 void URLPattern::SetValidSchemes(int valid_schemes
) {
281 valid_schemes_
= valid_schemes
;
284 void URLPattern::SetHost(const std::string
& host
) {
289 void URLPattern::SetMatchAllURLs(bool val
) {
291 match_all_urls_
= val
;
294 match_subdomains_
= true;
301 void URLPattern::SetMatchSubdomains(bool val
) {
303 match_subdomains_
= val
;
306 bool URLPattern::SetScheme(const std::string
& scheme
) {
309 if (scheme_
== "*") {
310 valid_schemes_
&= (SCHEME_HTTP
| SCHEME_HTTPS
);
311 } else if (!IsValidScheme(scheme_
)) {
317 bool URLPattern::IsValidScheme(const std::string
& scheme
) const {
318 if (valid_schemes_
== SCHEME_ALL
)
321 for (size_t i
= 0; i
< arraysize(kValidSchemes
); ++i
) {
322 if (scheme
== kValidSchemes
[i
] && (valid_schemes_
& kValidSchemeMasks
[i
]))
329 void URLPattern::SetPath(const std::string
& path
) {
332 path_escaped_
= path_
;
333 base::ReplaceSubstringsAfterOffset(&path_escaped_
, 0, "\\", "\\\\");
334 base::ReplaceSubstringsAfterOffset(&path_escaped_
, 0, "?", "\\?");
337 bool URLPattern::SetPort(const std::string
& port
) {
339 if (IsValidPortForScheme(scheme_
, port
)) {
346 bool URLPattern::MatchesURL(const GURL
& test
) const {
347 const GURL
* test_url
= &test
;
348 bool has_inner_url
= test
.inner_url() != NULL
;
351 if (!test
.SchemeIsFileSystem())
352 return false; // The only nested URLs we handle are filesystem URLs.
353 test_url
= test
.inner_url();
356 if (!MatchesScheme(test_url
->scheme()))
362 std::string path_for_request
= test
.PathForRequest();
364 path_for_request
= test_url
->path() + path_for_request
;
366 return MatchesSecurityOriginHelper(*test_url
) &&
367 MatchesPath(path_for_request
);
370 bool URLPattern::MatchesSecurityOrigin(const GURL
& test
) const {
371 const GURL
* test_url
= &test
;
372 bool has_inner_url
= test
.inner_url() != NULL
;
375 if (!test
.SchemeIsFileSystem())
376 return false; // The only nested URLs we handle are filesystem URLs.
377 test_url
= test
.inner_url();
380 if (!MatchesScheme(test_url
->scheme()))
386 return MatchesSecurityOriginHelper(*test_url
);
389 bool URLPattern::MatchesScheme(const std::string
& test
) const {
390 if (!IsValidScheme(test
))
393 return scheme_
== "*" || test
== scheme_
;
396 bool URLPattern::MatchesHost(const std::string
& host
) const {
397 std::string
test(url::kHttpScheme
);
398 test
+= url::kStandardSchemeSeparator
;
401 return MatchesHost(GURL(test
));
404 bool URLPattern::MatchesHost(const GURL
& test
) const {
405 // If the hosts are exactly equal, we have a match.
406 if (test
.host() == host_
)
409 // If we're matching subdomains, and we have no host in the match pattern,
410 // that means that we're matching all hosts, which means we have a match no
411 // matter what the test host is.
412 if (match_subdomains_
&& host_
.empty())
415 // Otherwise, we can only match if our match pattern matches subdomains.
416 if (!match_subdomains_
)
419 // We don't do subdomain matching against IP addresses, so we can give up now
420 // if the test host is an IP address.
421 if (test
.HostIsIPAddress())
424 // Check if the test host is a subdomain of our host.
425 if (test
.host().length() <= (host_
.length() + 1))
428 if (test
.host().compare(test
.host().length() - host_
.length(),
429 host_
.length(), host_
) != 0)
432 return test
.host()[test
.host().length() - host_
.length() - 1] == '.';
435 bool URLPattern::ImpliesAllHosts() const {
436 // Check if it matches all urls or is a pattern like http://*/*.
437 if (match_all_urls_
||
438 (match_subdomains_
&& host_
.empty() && port_
== "*" && path_
== "/*")) {
442 // If this doesn't even match subdomains, it can't possibly imply all hosts.
443 if (!match_subdomains_
)
446 // If |host_| is a recognized TLD, this will be 0. We don't include private
447 // TLDs, so that, e.g., *.appspot.com does not imply all hosts.
448 size_t registry_length
= net::registry_controlled_domains::GetRegistryLength(
450 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES
,
451 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES
);
452 // If there was more than just a TLD in the host (e.g., *.foobar.com), it
453 // doesn't imply all hosts.
454 if (registry_length
> 0)
457 // At this point the host could either be just a TLD ("com") or some unknown
458 // TLD-like string ("notatld"). To disambiguate between them construct a
459 // fake URL, and check the registry. This returns 0 if the TLD is
460 // unrecognized, or the length of the recognized TLD.
461 registry_length
= net::registry_controlled_domains::GetRegistryLength(
462 base::StringPrintf("foo.%s", host_
.c_str()),
463 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES
,
464 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES
);
465 // If we recognized this TLD, then this is a pattern like *.com, and it
466 // should imply all hosts. Otherwise, this doesn't imply all hosts.
467 return registry_length
> 0;
470 bool URLPattern::MatchesSingleOrigin() const {
471 // Strictly speaking, the port is part of the origin, but in URLPattern it
472 // defaults to *. It's not very interesting anyway, so leave it out.
473 return !ImpliesAllHosts() && scheme_
!= "*" && !match_subdomains_
;
476 bool URLPattern::MatchesPath(const std::string
& test
) const {
477 // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
478 // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
479 if (test
+ "/*" == path_escaped_
)
482 return base::MatchPattern(test
, path_escaped_
);
485 const std::string
& URLPattern::GetAsString() const {
489 if (match_all_urls_
) {
490 spec_
= kAllUrlsPattern
;
494 bool standard_scheme
= IsStandardScheme(scheme_
);
496 std::string spec
= scheme_
+
497 (standard_scheme
? url::kStandardSchemeSeparator
: ":");
499 if (scheme_
!= url::kFileScheme
&& standard_scheme
) {
500 if (match_subdomains_
) {
522 bool URLPattern::OverlapsWith(const URLPattern
& other
) const {
523 if (match_all_urls() || other
.match_all_urls())
525 return (MatchesAnyScheme(other
.GetExplicitSchemes()) ||
526 other
.MatchesAnyScheme(GetExplicitSchemes()))
527 && (MatchesHost(other
.host()) || other
.MatchesHost(host()))
528 && (MatchesPortPattern(other
.port()) || other
.MatchesPortPattern(port()))
529 && (MatchesPath(StripTrailingWildcard(other
.path())) ||
530 other
.MatchesPath(StripTrailingWildcard(path())));
533 bool URLPattern::Contains(const URLPattern
& other
) const {
534 if (match_all_urls())
536 return MatchesAllSchemes(other
.GetExplicitSchemes()) &&
537 MatchesHost(other
.host()) &&
538 (!other
.match_subdomains_
|| match_subdomains_
) &&
539 MatchesPortPattern(other
.port()) &&
540 MatchesPath(StripTrailingWildcard(other
.path()));
543 bool URLPattern::MatchesAnyScheme(
544 const std::vector
<std::string
>& schemes
) const {
545 for (std::vector
<std::string
>::const_iterator i
= schemes
.begin();
546 i
!= schemes
.end(); ++i
) {
547 if (MatchesScheme(*i
))
554 bool URLPattern::MatchesAllSchemes(
555 const std::vector
<std::string
>& schemes
) const {
556 for (std::vector
<std::string
>::const_iterator i
= schemes
.begin();
557 i
!= schemes
.end(); ++i
) {
558 if (!MatchesScheme(*i
))
565 bool URLPattern::MatchesSecurityOriginHelper(const GURL
& test
) const {
566 // Ignore hostname if scheme is file://.
567 if (scheme_
!= url::kFileScheme
&& !MatchesHost(test
))
570 if (!MatchesPortPattern(base::IntToString(test
.EffectiveIntPort())))
576 bool URLPattern::MatchesPortPattern(const std::string
& port
) const {
577 return port_
== "*" || port_
== port
;
580 std::vector
<std::string
> URLPattern::GetExplicitSchemes() const {
581 std::vector
<std::string
> result
;
583 if (scheme_
!= "*" && !match_all_urls_
&& IsValidScheme(scheme_
)) {
584 result
.push_back(scheme_
);
588 for (size_t i
= 0; i
< arraysize(kValidSchemes
); ++i
) {
589 if (MatchesScheme(kValidSchemes
[i
])) {
590 result
.push_back(kValidSchemes
[i
]);
597 std::vector
<URLPattern
> URLPattern::ConvertToExplicitSchemes() const {
598 std::vector
<std::string
> explicit_schemes
= GetExplicitSchemes();
599 std::vector
<URLPattern
> result
;
601 for (std::vector
<std::string
>::const_iterator i
= explicit_schemes
.begin();
602 i
!= explicit_schemes
.end(); ++i
) {
603 URLPattern temp
= *this;
605 temp
.SetMatchAllURLs(false);
606 result
.push_back(temp
);
613 const char* URLPattern::GetParseResultString(
614 URLPattern::ParseResult parse_result
) {
615 return kParseResultMessages
[parse_result
];