1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/url_matcher/url_matcher.h"
10 #include "base/logging.h"
11 #include "base/profiler/scoped_profile.h"
13 #include "url/url_canon.h"
15 namespace url_matcher
{
17 // This set of classes implement a mapping of URL Component Patterns, such as
18 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
19 // for use in substring comparisons.
21 // The idea of this mapping is to reduce the problem of comparing many
22 // URL Component Patterns against one URL to the problem of searching many
23 // substrings in one string:
25 // ---------------------- -----------------
26 // | URL Query operator | ----translate----> | StringPattern |
27 // ---------------------- -----------------
33 // ---------------------- -----------------
34 // | URL to compare | | |
35 // | to all URL Query | ----translate----> | String |
37 // ---------------------- -----------------
39 // The reason for this problem reduction is that there are efficient algorithms
40 // for searching many substrings in one string (see Aho-Corasick algorithm).
42 // Additionally, some of the same pieces are reused to implement regular
43 // expression comparisons. The FilteredRE2 implementation for matching many
44 // regular expressions against one string uses prefiltering, in which a set
45 // of substrings (derived from the regexes) are first searched for, to reduce
46 // the number of regular expressions to test; the prefiltering step also
49 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
50 // ==========================================================
52 // For searches in this class, we normalize URLs as follows:
55 // Remove scheme, port and segment from URL:
56 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
57 // www.example.com/index.html?search=foo
59 // We remove the scheme and port number because they can be checked later
60 // in a secondary filter step. We remove the segment (the #... part) because
61 // this is not guaranteed to be ASCII-7 encoded.
64 // Translate URL to String and add the following position markers:
65 // - BU = Beginning of URL
66 // - ED = End of Domain
69 // Furthermore, the hostname is canonicalized to start with a ".".
71 // Position markers are represented as characters >127, which are therefore
72 // guaranteed not to be part of the ASCII-7 encoded URL character set.
74 // -> www.example.com/index.html?search=foo becomes
75 // BU .www.example.com ED /index.html EP ?search=foo EU
77 // -> www.example.com/index.html becomes
78 // BU .www.example.com ED /index.html EP EU
81 // Translate URL Component Patterns as follows:
83 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
84 // -> host_prefix("www.example") = BU .www.example
86 // host_suffix(suffix) = suffix ED
87 // -> host_suffix("example.com") = example.com ED
88 // -> host_suffix(".example.com") = .example.com ED
90 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
91 // -> host_equals("www.example.com") = BU .www.example.com ED
93 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
95 // With this, we can search the StringPatterns in the normalized URL.
98 // Case 2: url_{prefix,suffix,equals,contains} searches.
99 // =====================================================
101 // Step 1: as above, except that
102 // - the scheme is not removed
103 // - the port is not removed if it is specified and does not match the default
104 // port for the given scheme.
107 // Translate URL to String and add the following position markers:
108 // - BU = Beginning of URL
111 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
112 // BU http://www.example.com:8080/index.html?search=foo EU
113 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
114 // BU http://www.example.com/index.html?search=foo EU
116 // url_prefix(prefix) = BU prefix
117 // -> url_prefix("http://www.example") = BU http://www.example
119 // url_contains(substring) = substring
120 // -> url_contains("index") = index
123 // Case 3: {host,path,query}_contains searches.
124 // ============================================
126 // These kinds of searches are not supported directly but can be derived
127 // by a combination of a url_contains() query followed by an explicit test:
129 // host_contains(str) = url_contains(str) followed by test whether str occurs
130 // in host component of original URL.
131 // -> host_contains("example.co") = example.co
132 // followed by gurl.host().find("example.co");
134 // [similarly for path_contains and query_contains].
137 // Regular expression matching (url_matches searches)
138 // ==================================================
140 // This class also supports matching regular expressions (RE2 syntax)
141 // against full URLs, which are transformed as in case 2.
145 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion
) {
146 return criterion
== URLMatcherCondition::URL_MATCHES
;
149 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion
) {
150 return criterion
== URLMatcherCondition::ORIGIN_AND_PATH_MATCHES
;
156 // URLMatcherCondition
159 URLMatcherCondition::URLMatcherCondition()
160 : criterion_(HOST_PREFIX
),
161 string_pattern_(NULL
) {}
163 URLMatcherCondition::~URLMatcherCondition() {}
165 URLMatcherCondition::URLMatcherCondition(
167 const StringPattern
* string_pattern
)
168 : criterion_(criterion
),
169 string_pattern_(string_pattern
) {}
171 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition
& rhs
)
172 : criterion_(rhs
.criterion_
),
173 string_pattern_(rhs
.string_pattern_
) {}
175 URLMatcherCondition
& URLMatcherCondition::operator=(
176 const URLMatcherCondition
& rhs
) {
177 criterion_
= rhs
.criterion_
;
178 string_pattern_
= rhs
.string_pattern_
;
182 bool URLMatcherCondition::operator<(const URLMatcherCondition
& rhs
) const {
183 if (criterion_
< rhs
.criterion_
) return true;
184 if (criterion_
> rhs
.criterion_
) return false;
185 if (string_pattern_
!= NULL
&& rhs
.string_pattern_
!= NULL
)
186 return *string_pattern_
< *rhs
.string_pattern_
;
187 if (string_pattern_
== NULL
&& rhs
.string_pattern_
!= NULL
) return true;
188 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
193 bool URLMatcherCondition::IsFullURLCondition() const {
194 // For these criteria the SubstringMatcher needs to be executed on the
195 // GURL that is canonicalized with
196 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
197 switch (criterion_
) {
212 bool URLMatcherCondition::IsRegexCondition() const {
213 return IsRegexCriterion(criterion_
);
216 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
217 return IsOriginAndPathRegexCriterion(criterion_
);
220 bool URLMatcherCondition::IsMatch(
221 const std::set
<StringPattern::ID
>& matching_patterns
,
222 const GURL
& url
) const {
223 DCHECK(string_pattern_
);
224 if (!ContainsKey(matching_patterns
, string_pattern_
->id()))
226 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
227 // a substring match on the raw URL. In case of a match, we need to verify
228 // that the match was found in the correct component of the URL.
229 switch (criterion_
) {
231 return url
.host().find(string_pattern_
->pattern()) !=
234 return url
.path().find(string_pattern_
->pattern()) !=
237 return url
.query().find(string_pattern_
->pattern()) !=
246 // URLMatcherConditionFactory
250 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
251 const char kBeginningOfURL
[] = {static_cast<char>(-1), 0};
252 const char kEndOfDomain
[] = {static_cast<char>(-2), 0};
253 const char kEndOfPath
[] = {static_cast<char>(-3), 0};
254 const char kQueryComponentDelimiter
[] = {static_cast<char>(-4), 0};
255 const char kEndOfURL
[] = {static_cast<char>(-5), 0};
257 // The delimiter for query parameters
258 const char kQuerySeparator
= '&';
261 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
263 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
264 STLDeleteElements(&substring_pattern_singletons_
);
265 STLDeleteElements(®ex_pattern_singletons_
);
266 STLDeleteElements(&origin_and_path_regex_pattern_singletons_
);
269 std::string
URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
270 const GURL
& url
) const {
271 return kBeginningOfURL
+ CanonicalizeHostname(url
.host()) + kEndOfDomain
+
272 url
.path() + kEndOfPath
+
273 (url
.has_query() ? CanonicalizeQuery(url
.query(), true, true)
278 URLMatcherCondition
URLMatcherConditionFactory::CreateHostPrefixCondition(
279 const std::string
& prefix
) {
280 return CreateCondition(URLMatcherCondition::HOST_PREFIX
,
281 kBeginningOfURL
+ CanonicalizeHostname(prefix
));
284 URLMatcherCondition
URLMatcherConditionFactory::CreateHostSuffixCondition(
285 const std::string
& suffix
) {
286 return CreateCondition(URLMatcherCondition::HOST_SUFFIX
,
287 suffix
+ kEndOfDomain
);
290 URLMatcherCondition
URLMatcherConditionFactory::CreateHostContainsCondition(
291 const std::string
& str
) {
292 return CreateCondition(URLMatcherCondition::HOST_CONTAINS
, str
);
295 URLMatcherCondition
URLMatcherConditionFactory::CreateHostEqualsCondition(
296 const std::string
& str
) {
297 return CreateCondition(URLMatcherCondition::HOST_EQUALS
,
298 kBeginningOfURL
+ CanonicalizeHostname(str
) + kEndOfDomain
);
301 URLMatcherCondition
URLMatcherConditionFactory::CreatePathPrefixCondition(
302 const std::string
& prefix
) {
303 return CreateCondition(URLMatcherCondition::PATH_PREFIX
,
304 kEndOfDomain
+ prefix
);
307 URLMatcherCondition
URLMatcherConditionFactory::CreatePathSuffixCondition(
308 const std::string
& suffix
) {
309 return CreateCondition(URLMatcherCondition::PATH_SUFFIX
, suffix
+ kEndOfPath
);
312 URLMatcherCondition
URLMatcherConditionFactory::CreatePathContainsCondition(
313 const std::string
& str
) {
314 return CreateCondition(URLMatcherCondition::PATH_CONTAINS
, str
);
317 URLMatcherCondition
URLMatcherConditionFactory::CreatePathEqualsCondition(
318 const std::string
& str
) {
319 return CreateCondition(URLMatcherCondition::PATH_EQUALS
,
320 kEndOfDomain
+ str
+ kEndOfPath
);
323 URLMatcherCondition
URLMatcherConditionFactory::CreateQueryPrefixCondition(
324 const std::string
& prefix
) {
326 if (!prefix
.empty() && prefix
[0] == '?')
327 pattern
= kEndOfPath
+ CanonicalizeQuery(prefix
.substr(1), true, false);
329 pattern
= kEndOfPath
+ CanonicalizeQuery(prefix
, true, false);
331 return CreateCondition(URLMatcherCondition::QUERY_PREFIX
, pattern
);
334 URLMatcherCondition
URLMatcherConditionFactory::CreateQuerySuffixCondition(
335 const std::string
& suffix
) {
336 if (!suffix
.empty() && suffix
[0] == '?') {
337 return CreateQueryEqualsCondition(suffix
);
339 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX
,
340 CanonicalizeQuery(suffix
, false, true) + kEndOfURL
);
344 URLMatcherCondition
URLMatcherConditionFactory::CreateQueryContainsCondition(
345 const std::string
& str
) {
346 if (!str
.empty() && str
[0] == '?')
347 return CreateQueryPrefixCondition(str
);
349 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS
, str
);
352 URLMatcherCondition
URLMatcherConditionFactory::CreateQueryEqualsCondition(
353 const std::string
& str
) {
355 if (!str
.empty() && str
[0] == '?')
357 kEndOfPath
+ CanonicalizeQuery(str
.substr(1), true, true) + kEndOfURL
;
359 pattern
= kEndOfPath
+ CanonicalizeQuery(str
, true, true) + kEndOfURL
;
361 return CreateCondition(URLMatcherCondition::QUERY_EQUALS
, pattern
);
365 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
366 const std::string
& host_suffix
,
367 const std::string
& path_prefix
) {
368 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX
,
369 host_suffix
+ kEndOfDomain
+ path_prefix
);
373 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
374 const std::string
& host
,
375 const std::string
& path_prefix
) {
376 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX
,
377 kBeginningOfURL
+ CanonicalizeHostname(host
) + kEndOfDomain
+
381 std::string
URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
382 const GURL
& url
) const {
383 GURL::Replacements replacements
;
384 replacements
.ClearPassword();
385 replacements
.ClearUsername();
386 replacements
.ClearRef();
387 // Clear port if it is implicit from scheme.
388 if (url
.has_port()) {
389 const std::string
& port
= url
.scheme();
390 if (url::DefaultPortForScheme(port
.c_str(), port
.size()) ==
391 url
.EffectiveIntPort()) {
392 replacements
.ClearPort();
395 return kBeginningOfURL
+ url
.ReplaceComponents(replacements
).spec() +
399 static std::string
CanonicalizeURLForRegexSearchesHelper(
402 GURL::Replacements replacements
;
403 replacements
.ClearPassword();
404 replacements
.ClearUsername();
405 replacements
.ClearRef();
407 replacements
.ClearQuery();
408 // Clear port if it is implicit from scheme.
409 if (url
.has_port()) {
410 const std::string
& port
= url
.scheme();
411 if (url::DefaultPortForScheme(port
.c_str(), port
.size()) ==
412 url
.EffectiveIntPort()) {
413 replacements
.ClearPort();
416 return url
.ReplaceComponents(replacements
).spec();
419 std::string
URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
420 const GURL
& url
) const {
421 return CanonicalizeURLForRegexSearchesHelper(url
, false);
425 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
426 const GURL
& url
) const {
427 return CanonicalizeURLForRegexSearchesHelper(url
, true);
430 URLMatcherCondition
URLMatcherConditionFactory::CreateURLPrefixCondition(
431 const std::string
& prefix
) {
432 return CreateCondition(URLMatcherCondition::URL_PREFIX
,
433 kBeginningOfURL
+ prefix
);
436 URLMatcherCondition
URLMatcherConditionFactory::CreateURLSuffixCondition(
437 const std::string
& suffix
) {
438 return CreateCondition(URLMatcherCondition::URL_SUFFIX
, suffix
+ kEndOfURL
);
441 URLMatcherCondition
URLMatcherConditionFactory::CreateURLContainsCondition(
442 const std::string
& str
) {
443 return CreateCondition(URLMatcherCondition::URL_CONTAINS
, str
);
446 URLMatcherCondition
URLMatcherConditionFactory::CreateURLEqualsCondition(
447 const std::string
& str
) {
448 return CreateCondition(URLMatcherCondition::URL_EQUALS
,
449 kBeginningOfURL
+ str
+ kEndOfURL
);
452 URLMatcherCondition
URLMatcherConditionFactory::CreateURLMatchesCondition(
453 const std::string
& regex
) {
454 return CreateCondition(URLMatcherCondition::URL_MATCHES
, regex
);
458 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
459 const std::string
& regex
) {
460 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES
, regex
);
463 void URLMatcherConditionFactory::ForgetUnusedPatterns(
464 const std::set
<StringPattern::ID
>& used_patterns
) {
465 PatternSingletons::iterator i
= substring_pattern_singletons_
.begin();
466 while (i
!= substring_pattern_singletons_
.end()) {
467 if (ContainsKey(used_patterns
, (*i
)->id())) {
471 substring_pattern_singletons_
.erase(i
++);
474 i
= regex_pattern_singletons_
.begin();
475 while (i
!= regex_pattern_singletons_
.end()) {
476 if (ContainsKey(used_patterns
, (*i
)->id())) {
480 regex_pattern_singletons_
.erase(i
++);
483 i
= origin_and_path_regex_pattern_singletons_
.begin();
484 while (i
!= origin_and_path_regex_pattern_singletons_
.end()) {
485 if (ContainsKey(used_patterns
, (*i
)->id())) {
489 origin_and_path_regex_pattern_singletons_
.erase(i
++);
494 bool URLMatcherConditionFactory::IsEmpty() const {
495 return substring_pattern_singletons_
.empty() &&
496 regex_pattern_singletons_
.empty() &&
497 origin_and_path_regex_pattern_singletons_
.empty();
500 URLMatcherCondition
URLMatcherConditionFactory::CreateCondition(
501 URLMatcherCondition::Criterion criterion
,
502 const std::string
& pattern
) {
503 StringPattern
search_pattern(pattern
, 0);
504 PatternSingletons
* pattern_singletons
= NULL
;
505 if (IsRegexCriterion(criterion
))
506 pattern_singletons
= ®ex_pattern_singletons_
;
507 else if (IsOriginAndPathRegexCriterion(criterion
))
508 pattern_singletons
= &origin_and_path_regex_pattern_singletons_
;
510 pattern_singletons
= &substring_pattern_singletons_
;
512 PatternSingletons::const_iterator iter
=
513 pattern_singletons
->find(&search_pattern
);
515 if (iter
!= pattern_singletons
->end()) {
516 return URLMatcherCondition(criterion
, *iter
);
518 StringPattern
* new_pattern
=
519 new StringPattern(pattern
, id_counter_
++);
520 pattern_singletons
->insert(new_pattern
);
521 return URLMatcherCondition(criterion
, new_pattern
);
525 std::string
URLMatcherConditionFactory::CanonicalizeHostname(
526 const std::string
& hostname
) const {
527 if (!hostname
.empty() && hostname
[0] == '.')
530 return "." + hostname
;
533 // This function prepares the query string by replacing query separator with a
534 // magic value (|kQueryComponentDelimiter|). When the boolean
535 // |prepend_beginning_of_query_component| is true the function prepends the
536 // query with the same magic. This is done to locate the start of a key value
537 // pair in the query string. The parameter |query| is passed by value
538 // intentionally, since it is locally modified.
539 std::string
URLMatcherConditionFactory::CanonicalizeQuery(
541 bool prepend_beginning_of_query_component
,
542 bool append_end_of_query_component
) const {
543 for (std::string::iterator it
= query
.begin(); it
!= query
.end(); ++it
) {
544 if (*it
== kQuerySeparator
)
545 *it
= kQueryComponentDelimiter
[0];
547 if (prepend_beginning_of_query_component
)
548 query
= kQueryComponentDelimiter
+ query
;
549 if (append_end_of_query_component
)
550 query
+= kQueryComponentDelimiter
;
554 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
556 StringPattern
* rhs
) const {
557 if (lhs
== NULL
&& rhs
!= NULL
) return true;
558 if (lhs
!= NULL
&& rhs
!= NULL
)
559 return lhs
->pattern() < rhs
->pattern();
560 // Either both are NULL or only rhs is NULL.
565 // URLQueryElementMatcherCondition
568 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
569 const std::string
& key
,
570 const std::string
& value
,
571 QueryValueMatchType query_value_match_type
,
572 QueryElementType query_element_type
,
574 URLMatcherConditionFactory
* factory
) {
575 match_type_
= match_type
;
577 if (query_element_type
== ELEMENT_TYPE_KEY_VALUE
) {
578 key_
= kQueryComponentDelimiter
+ key
+ "=";
581 key_
= kQueryComponentDelimiter
+ key
;
582 value_
= std::string();
585 if (query_value_match_type
== QUERY_VALUE_MATCH_EXACT
)
586 value_
+= kQueryComponentDelimiter
;
588 // If |value_| is empty no need to find the |key_| and verify if the value
589 // matches. Simply checking the presence of key is sufficient, which is done
592 match_type_
= MATCH_ANY
;
594 URLMatcherCondition condition
;
595 // If |match_type_| is MATCH_ANY, then we could simply look for the
596 // combination of |key_| + |value_|, which can be efficiently done by
598 if (match_type_
== MATCH_ANY
)
599 condition
= factory
->CreateQueryContainsCondition(key_
+ value_
);
601 condition
= factory
->CreateQueryContainsCondition(key_
);
602 string_pattern_
= condition
.string_pattern();
604 key_length_
= key_
.length();
605 value_length_
= value_
.length();
608 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}
610 bool URLQueryElementMatcherCondition::operator<(
611 const URLQueryElementMatcherCondition
& rhs
) const {
612 if (match_type_
!= rhs
.match_type_
)
613 return match_type_
< rhs
.match_type_
;
614 if (string_pattern_
!= NULL
&& rhs
.string_pattern_
!= NULL
)
615 return *string_pattern_
< *rhs
.string_pattern_
;
616 if (string_pattern_
== NULL
&& rhs
.string_pattern_
!= NULL
)
618 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
623 bool URLQueryElementMatcherCondition::IsMatch(
624 const std::string
& url_for_component_searches
) const {
625 switch (match_type_
) {
627 // For MATCH_ANY, no additional verification step is needed. We can trust
628 // the SubstringMatcher to do the verification.
635 while ((offset
= url_for_component_searches
.find(key_
, start
)) !=
637 if (url_for_component_searches
.compare(
638 offset
+ key_length_
, value_length_
, value_
) != 0) {
643 start
= offset
+ key_length_
+ value_length_
- 1;
648 size_t offset
= url_for_component_searches
.find(key_
);
649 return url_for_component_searches
.compare(
650 offset
+ key_length_
, value_length_
, value_
) == 0;
653 size_t offset
= url_for_component_searches
.rfind(key_
);
654 return url_for_component_searches
.compare(
655 offset
+ key_length_
, value_length_
, value_
) == 0;
663 // URLMatcherSchemeFilter
666 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string
& filter
)
668 filters_
.push_back(filter
);
671 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
672 const std::vector
<std::string
>& filters
)
673 : filters_(filters
) {}
675 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
677 bool URLMatcherSchemeFilter::IsMatch(const GURL
& url
) const {
678 return std::find(filters_
.begin(), filters_
.end(), url
.scheme()) !=
683 // URLMatcherPortFilter
686 URLMatcherPortFilter::URLMatcherPortFilter(
687 const std::vector
<URLMatcherPortFilter::Range
>& ranges
)
690 URLMatcherPortFilter::~URLMatcherPortFilter() {}
692 bool URLMatcherPortFilter::IsMatch(const GURL
& url
) const {
693 int port
= url
.EffectiveIntPort();
694 for (std::vector
<Range
>::const_iterator i
= ranges_
.begin();
695 i
!= ranges_
.end(); ++i
) {
696 if (i
->first
<= port
&& port
<= i
->second
)
703 URLMatcherPortFilter::Range
URLMatcherPortFilter::CreateRange(int from
,
705 return Range(from
, to
);
709 URLMatcherPortFilter::Range
URLMatcherPortFilter::CreateRange(int port
) {
710 return Range(port
, port
);
714 // URLMatcherConditionSet
717 URLMatcherConditionSet::~URLMatcherConditionSet() {}
719 URLMatcherConditionSet::URLMatcherConditionSet(
721 const Conditions
& conditions
)
723 conditions_(conditions
) {}
725 URLMatcherConditionSet::URLMatcherConditionSet(
727 const Conditions
& conditions
,
728 scoped_ptr
<URLMatcherSchemeFilter
> scheme_filter
,
729 scoped_ptr
<URLMatcherPortFilter
> port_filter
)
731 conditions_(conditions
),
732 scheme_filter_(scheme_filter
.Pass()),
733 port_filter_(port_filter
.Pass()) {}
735 URLMatcherConditionSet::URLMatcherConditionSet(
737 const Conditions
& conditions
,
738 const QueryConditions
& query_conditions
,
739 scoped_ptr
<URLMatcherSchemeFilter
> scheme_filter
,
740 scoped_ptr
<URLMatcherPortFilter
> port_filter
)
742 conditions_(conditions
),
743 query_conditions_(query_conditions
),
744 scheme_filter_(scheme_filter
.Pass()),
745 port_filter_(port_filter
.Pass()) {}
747 bool URLMatcherConditionSet::IsMatch(
748 const std::set
<StringPattern::ID
>& matching_patterns
,
749 const GURL
& url
) const {
750 return IsMatch(matching_patterns
, url
, std::string());
753 bool URLMatcherConditionSet::IsMatch(
754 const std::set
<StringPattern::ID
>& matching_patterns
,
756 const std::string
& url_for_component_searches
) const {
757 for (Conditions::const_iterator i
= conditions_
.begin();
758 i
!= conditions_
.end(); ++i
) {
759 if (!i
->IsMatch(matching_patterns
, url
))
762 if (scheme_filter_
.get() && !scheme_filter_
->IsMatch(url
))
764 if (port_filter_
.get() && !port_filter_
->IsMatch(url
))
766 if (query_conditions_
.empty())
768 // The loop is duplicated below for performance reasons. If not all query
769 // elements are found, no need to verify match that is expected to take more
771 for (QueryConditions::const_iterator i
= query_conditions_
.begin();
772 i
!= query_conditions_
.end();
774 if (!ContainsKey(matching_patterns
, i
->string_pattern()->id()))
777 for (QueryConditions::const_iterator i
= query_conditions_
.begin();
778 i
!= query_conditions_
.end();
780 if (!i
->IsMatch(url_for_component_searches
))
790 URLMatcher::URLMatcher() {}
792 URLMatcher::~URLMatcher() {}
794 void URLMatcher::AddConditionSets(
795 const URLMatcherConditionSet::Vector
& condition_sets
) {
796 for (URLMatcherConditionSet::Vector::const_iterator i
=
797 condition_sets
.begin(); i
!= condition_sets
.end(); ++i
) {
798 DCHECK(url_matcher_condition_sets_
.find((*i
)->id()) ==
799 url_matcher_condition_sets_
.end());
800 url_matcher_condition_sets_
[(*i
)->id()] = *i
;
802 UpdateInternalDatastructures();
805 void URLMatcher::RemoveConditionSets(
806 const std::vector
<URLMatcherConditionSet::ID
>& condition_set_ids
) {
807 for (std::vector
<URLMatcherConditionSet::ID
>::const_iterator i
=
808 condition_set_ids
.begin(); i
!= condition_set_ids
.end(); ++i
) {
809 DCHECK(url_matcher_condition_sets_
.find(*i
) !=
810 url_matcher_condition_sets_
.end());
811 url_matcher_condition_sets_
.erase(*i
);
813 UpdateInternalDatastructures();
816 void URLMatcher::ClearUnusedConditionSets() {
817 UpdateConditionFactory();
820 std::set
<URLMatcherConditionSet::ID
> URLMatcher::MatchURL(
821 const GURL
& url
) const {
822 // Find all IDs of StringPatterns that match |url|.
823 // See URLMatcherConditionFactory for the canonicalization of URLs and the
824 // distinction between full url searches and url component searches.
825 std::set
<StringPattern::ID
> matches
;
826 std::string url_for_component_searches
;
828 if (!full_url_matcher_
.IsEmpty()) {
829 full_url_matcher_
.Match(
830 condition_factory_
.CanonicalizeURLForFullSearches(url
), &matches
);
832 if (!url_component_matcher_
.IsEmpty()) {
833 url_for_component_searches
=
834 condition_factory_
.CanonicalizeURLForComponentSearches(url
);
835 url_component_matcher_
.Match(url_for_component_searches
, &matches
);
837 if (!regex_set_matcher_
.IsEmpty()) {
838 regex_set_matcher_
.Match(
839 condition_factory_
.CanonicalizeURLForRegexSearches(url
), &matches
);
841 if (!origin_and_path_regex_set_matcher_
.IsEmpty()) {
842 origin_and_path_regex_set_matcher_
.Match(
843 condition_factory_
.CanonicalizeURLForOriginAndPathRegexSearches(url
),
847 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
849 std::set
<URLMatcherConditionSet::ID
> result
;
850 for (std::set
<StringPattern::ID
>::const_iterator i
= matches
.begin();
851 i
!= matches
.end(); ++i
) {
852 // For each URLMatcherConditionSet there is exactly one condition
853 // registered in substring_match_triggers_. This means that the following
854 // logic tests each URLMatcherConditionSet exactly once if it can be
855 // completely fulfilled.
856 StringPatternTriggers::const_iterator triggered_condition_sets_iter
=
857 substring_match_triggers_
.find(*i
);
858 if (triggered_condition_sets_iter
== substring_match_triggers_
.end())
859 continue; // Not all substring matches are triggers for a condition set.
860 const std::set
<URLMatcherConditionSet::ID
>& condition_sets
=
861 triggered_condition_sets_iter
->second
;
862 for (std::set
<URLMatcherConditionSet::ID
>::const_iterator j
=
863 condition_sets
.begin(); j
!= condition_sets
.end(); ++j
) {
864 URLMatcherConditionSets::const_iterator condition_set_iter
=
865 url_matcher_condition_sets_
.find(*j
);
866 DCHECK(condition_set_iter
!= url_matcher_condition_sets_
.end());
867 if (condition_set_iter
->second
->IsMatch(
868 matches
, url
, url_for_component_searches
))
876 bool URLMatcher::IsEmpty() const {
877 return condition_factory_
.IsEmpty() &&
878 url_matcher_condition_sets_
.empty() &&
879 substring_match_triggers_
.empty() &&
880 full_url_matcher_
.IsEmpty() &&
881 url_component_matcher_
.IsEmpty() &&
882 regex_set_matcher_
.IsEmpty() &&
883 origin_and_path_regex_set_matcher_
.IsEmpty() &&
884 registered_full_url_patterns_
.empty() &&
885 registered_url_component_patterns_
.empty();
888 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions
) {
889 // The purpose of |full_url_conditions| is just that we need to execute
890 // the same logic once for Full URL searches and once for URL Component
891 // searches (see URLMatcherConditionFactory).
893 // Determine which patterns need to be registered when this function
895 std::set
<const StringPattern
*> new_patterns
;
896 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
897 url_matcher_condition_sets_
.begin();
898 condition_set_iter
!= url_matcher_condition_sets_
.end();
899 ++condition_set_iter
) {
900 const URLMatcherConditionSet::Conditions
& conditions
=
901 condition_set_iter
->second
->conditions();
902 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
903 conditions
.begin(); condition_iter
!= conditions
.end();
905 // If we are called to process Full URL searches, ignore others, and
906 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
907 if (!condition_iter
->IsRegexCondition() &&
908 !condition_iter
->IsOriginAndPathRegexCondition() &&
909 full_url_conditions
== condition_iter
->IsFullURLCondition())
910 new_patterns
.insert(condition_iter
->string_pattern());
913 if (full_url_conditions
)
916 const URLMatcherConditionSet::QueryConditions
& query_conditions
=
917 condition_set_iter
->second
->query_conditions();
918 for (URLMatcherConditionSet::QueryConditions::const_iterator
919 query_condition_iter
= query_conditions
.begin();
920 query_condition_iter
!= query_conditions
.end();
921 ++query_condition_iter
) {
922 new_patterns
.insert(query_condition_iter
->string_pattern());
926 // This is the set of patterns that were registered before this function
928 std::set
<const StringPattern
*>& registered_patterns
=
929 full_url_conditions
? registered_full_url_patterns_
930 : registered_url_component_patterns_
;
932 // Add all patterns that are in new_patterns but not in registered_patterns.
933 std::vector
<const StringPattern
*> patterns_to_register
=
934 base::STLSetDifference
<std::vector
<const StringPattern
*> >(
935 new_patterns
, registered_patterns
);
937 // Remove all patterns that are in registered_patterns but not in
939 std::vector
<const StringPattern
*> patterns_to_unregister
=
940 base::STLSetDifference
<std::vector
<const StringPattern
*> >(
941 registered_patterns
, new_patterns
);
943 // Update the SubstringSetMatcher.
944 SubstringSetMatcher
& url_matcher
=
945 full_url_conditions
? full_url_matcher_
: url_component_matcher_
;
946 url_matcher
.RegisterAndUnregisterPatterns(patterns_to_register
,
947 patterns_to_unregister
);
949 // Update the set of registered_patterns for the next time this function
951 registered_patterns
.swap(new_patterns
);
954 void URLMatcher::UpdateRegexSetMatcher() {
955 std::vector
<const StringPattern
*> new_patterns
;
956 std::vector
<const StringPattern
*> new_origin_and_path_patterns
;
958 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
959 url_matcher_condition_sets_
.begin();
960 condition_set_iter
!= url_matcher_condition_sets_
.end();
961 ++condition_set_iter
) {
962 const URLMatcherConditionSet::Conditions
& conditions
=
963 condition_set_iter
->second
->conditions();
964 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
965 conditions
.begin(); condition_iter
!= conditions
.end();
967 if (condition_iter
->IsRegexCondition()) {
968 new_patterns
.push_back(condition_iter
->string_pattern());
969 } else if (condition_iter
->IsOriginAndPathRegexCondition()) {
970 new_origin_and_path_patterns
.push_back(
971 condition_iter
->string_pattern());
976 // Start over from scratch. We can't really do better than this, since the
977 // FilteredRE2 backend doesn't support incremental updates.
978 regex_set_matcher_
.ClearPatterns();
979 regex_set_matcher_
.AddPatterns(new_patterns
);
980 origin_and_path_regex_set_matcher_
.ClearPatterns();
981 origin_and_path_regex_set_matcher_
.AddPatterns(new_origin_and_path_patterns
);
984 void URLMatcher::UpdateTriggers() {
985 // Count substring pattern frequencies.
986 std::map
<StringPattern::ID
, size_t> substring_pattern_frequencies
;
987 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
988 url_matcher_condition_sets_
.begin();
989 condition_set_iter
!= url_matcher_condition_sets_
.end();
990 ++condition_set_iter
) {
991 const URLMatcherConditionSet::Conditions
& conditions
=
992 condition_set_iter
->second
->conditions();
993 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
994 conditions
.begin(); condition_iter
!= conditions
.end();
996 const StringPattern
* pattern
= condition_iter
->string_pattern();
997 substring_pattern_frequencies
[pattern
->id()]++;
1000 const URLMatcherConditionSet::QueryConditions
& query_conditions
=
1001 condition_set_iter
->second
->query_conditions();
1002 for (URLMatcherConditionSet::QueryConditions::const_iterator
1003 query_condition_iter
= query_conditions
.begin();
1004 query_condition_iter
!= query_conditions
.end();
1005 ++query_condition_iter
) {
1006 const StringPattern
* pattern
= query_condition_iter
->string_pattern();
1007 substring_pattern_frequencies
[pattern
->id()]++;
1011 // Update trigger conditions: Determine for each URLMatcherConditionSet which
1012 // URLMatcherCondition contains a StringPattern that occurs least
1013 // frequently in this URLMatcher. We assume that this condition is very
1014 // specific and occurs rarely in URLs. If a match occurs for this
1015 // URLMatcherCondition, we want to test all other URLMatcherCondition in the
1016 // respective URLMatcherConditionSet as well to see whether the entire
1017 // URLMatcherConditionSet is considered matching.
1018 substring_match_triggers_
.clear();
1019 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
1020 url_matcher_condition_sets_
.begin();
1021 condition_set_iter
!= url_matcher_condition_sets_
.end();
1022 ++condition_set_iter
) {
1023 const URLMatcherConditionSet::Conditions
& conditions
=
1024 condition_set_iter
->second
->conditions();
1025 if (conditions
.empty())
1027 URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
1029 StringPattern::ID trigger
= condition_iter
->string_pattern()->id();
1030 // We skip the first element in the following loop.
1032 for (; condition_iter
!= conditions
.end(); ++condition_iter
) {
1033 StringPattern::ID current_id
=
1034 condition_iter
->string_pattern()->id();
1035 if (substring_pattern_frequencies
[trigger
] >
1036 substring_pattern_frequencies
[current_id
]) {
1037 trigger
= current_id
;
1041 const URLMatcherConditionSet::QueryConditions
& query_conditions
=
1042 condition_set_iter
->second
->query_conditions();
1043 for (URLMatcherConditionSet::QueryConditions::const_iterator
1044 query_condition_iter
= query_conditions
.begin();
1045 query_condition_iter
!= query_conditions
.end();
1046 ++query_condition_iter
) {
1047 StringPattern::ID current_id
=
1048 query_condition_iter
->string_pattern()->id();
1049 if (substring_pattern_frequencies
[trigger
] >
1050 substring_pattern_frequencies
[current_id
]) {
1051 trigger
= current_id
;
1055 substring_match_triggers_
[trigger
].insert(condition_set_iter
->second
->id());
1059 void URLMatcher::UpdateConditionFactory() {
1060 std::set
<StringPattern::ID
> used_patterns
;
1061 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
1062 url_matcher_condition_sets_
.begin();
1063 condition_set_iter
!= url_matcher_condition_sets_
.end();
1064 ++condition_set_iter
) {
1065 const URLMatcherConditionSet::Conditions
& conditions
=
1066 condition_set_iter
->second
->conditions();
1067 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
1068 conditions
.begin(); condition_iter
!= conditions
.end();
1070 used_patterns
.insert(condition_iter
->string_pattern()->id());
1072 const URLMatcherConditionSet::QueryConditions
& query_conditions
=
1073 condition_set_iter
->second
->query_conditions();
1074 for (URLMatcherConditionSet::QueryConditions::const_iterator
1075 query_condition_iter
= query_conditions
.begin();
1076 query_condition_iter
!= query_conditions
.end();
1077 ++query_condition_iter
) {
1078 used_patterns
.insert(query_condition_iter
->string_pattern()->id());
1081 condition_factory_
.ForgetUnusedPatterns(used_patterns
);
1084 void URLMatcher::UpdateInternalDatastructures() {
1085 // TODO(vadimt): Remove ScopedProfile below once crbug.com/417106 is fixed.
1086 tracked_objects::ScopedProfile
tracking_profile(
1087 FROM_HERE_WITH_EXPLICIT_FUNCTION(
1088 "URLMatcher_UpdateInternalDatastructures"));
1089 UpdateSubstringSetMatcher(false);
1090 UpdateSubstringSetMatcher(true);
1091 UpdateRegexSetMatcher();
1093 UpdateConditionFactory();
1096 } // namespace url_matcher