1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/url_matcher/url_matcher.h"
10 #include "base/logging.h"
12 #include "url/url_canon.h"
14 namespace url_matcher
{
16 // This set of classes implement a mapping of URL Component Patterns, such as
17 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
18 // for use in substring comparisons.
20 // The idea of this mapping is to reduce the problem of comparing many
21 // URL Component Patterns against one URL to the problem of searching many
22 // substrings in one string:
24 // ---------------------- -----------------
25 // | URL Query operator | ----translate----> | StringPattern |
26 // ---------------------- -----------------
32 // ---------------------- -----------------
33 // | URL to compare | | |
34 // | to all URL Query | ----translate----> | String |
36 // ---------------------- -----------------
38 // The reason for this problem reduction is that there are efficient algorithms
39 // for searching many substrings in one string (see Aho-Corasick algorithm).
41 // Additionally, some of the same pieces are reused to implement regular
42 // expression comparisons. The FilteredRE2 implementation for matching many
43 // regular expressions against one string uses prefiltering, in which a set
44 // of substrings (derived from the regexes) are first searched for, to reduce
45 // the number of regular expressions to test; the prefiltering step also
48 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
49 // ==========================================================
51 // For searches in this class, we normalize URLs as follows:
54 // Remove scheme, port and segment from URL:
55 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
56 // www.example.com/index.html?search=foo
58 // We remove the scheme and port number because they can be checked later
59 // in a secondary filter step. We remove the segment (the #... part) because
60 // this is not guaranteed to be ASCII-7 encoded.
63 // Translate URL to String and add the following position markers:
64 // - BU = Beginning of URL
65 // - ED = End of Domain
68 // Furthermore, the hostname is canonicalized to start with a ".".
70 // Position markers are represented as characters >127, which are therefore
71 // guaranteed not to be part of the ASCII-7 encoded URL character set.
73 // -> www.example.com/index.html?search=foo becomes
74 // BU .www.example.com ED /index.html EP ?search=foo EU
76 // -> www.example.com/index.html becomes
77 // BU .www.example.com ED /index.html EP EU
80 // Translate URL Component Patterns as follows:
82 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
83 // -> host_prefix("www.example") = BU .www.example
85 // host_suffix(suffix) = suffix ED
86 // -> host_suffix("example.com") = example.com ED
87 // -> host_suffix(".example.com") = .example.com ED
89 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
90 // -> host_equals("www.example.com") = BU .www.example.com ED
92 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
94 // With this, we can search the StringPatterns in the normalized URL.
97 // Case 2: url_{prefix,suffix,equals,contains} searches.
98 // =====================================================
100 // Step 1: as above, except that
101 // - the scheme is not removed
102 // - the port is not removed if it is specified and does not match the default
103 // port for the given scheme.
106 // Translate URL to String and add the following position markers:
107 // - BU = Beginning of URL
110 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
111 // BU http://www.example.com:8080/index.html?search=foo EU
112 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
113 // BU http://www.example.com/index.html?search=foo EU
115 // url_prefix(prefix) = BU prefix
116 // -> url_prefix("http://www.example") = BU http://www.example
118 // url_contains(substring) = substring
119 // -> url_contains("index") = index
122 // Case 3: {host,path,query}_contains searches.
123 // ============================================
125 // These kinds of searches are not supported directly but can be derived
126 // by a combination of a url_contains() query followed by an explicit test:
128 // host_contains(str) = url_contains(str) followed by test whether str occurs
129 // in host component of original URL.
130 // -> host_contains("example.co") = example.co
131 // followed by gurl.host().find("example.co");
133 // [similarly for path_contains and query_contains].
136 // Regular expression matching (url_matches searches)
137 // ==================================================
139 // This class also supports matching regular expressions (RE2 syntax)
140 // against full URLs, which are transformed as in case 2.
144 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion
) {
145 return criterion
== URLMatcherCondition::URL_MATCHES
;
148 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion
) {
149 return criterion
== URLMatcherCondition::ORIGIN_AND_PATH_MATCHES
;
155 // URLMatcherCondition
158 URLMatcherCondition::URLMatcherCondition()
159 : criterion_(HOST_PREFIX
),
160 string_pattern_(NULL
) {}
162 URLMatcherCondition::~URLMatcherCondition() {}
164 URLMatcherCondition::URLMatcherCondition(
166 const StringPattern
* string_pattern
)
167 : criterion_(criterion
),
168 string_pattern_(string_pattern
) {}
170 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition
& rhs
)
171 : criterion_(rhs
.criterion_
),
172 string_pattern_(rhs
.string_pattern_
) {}
174 URLMatcherCondition
& URLMatcherCondition::operator=(
175 const URLMatcherCondition
& rhs
) {
176 criterion_
= rhs
.criterion_
;
177 string_pattern_
= rhs
.string_pattern_
;
181 bool URLMatcherCondition::operator<(const URLMatcherCondition
& rhs
) const {
182 if (criterion_
< rhs
.criterion_
) return true;
183 if (criterion_
> rhs
.criterion_
) return false;
184 if (string_pattern_
!= NULL
&& rhs
.string_pattern_
!= NULL
)
185 return *string_pattern_
< *rhs
.string_pattern_
;
186 if (string_pattern_
== NULL
&& rhs
.string_pattern_
!= NULL
) return true;
187 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
192 bool URLMatcherCondition::IsFullURLCondition() const {
193 // For these criteria the SubstringMatcher needs to be executed on the
194 // GURL that is canonicalized with
195 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
196 switch (criterion_
) {
211 bool URLMatcherCondition::IsRegexCondition() const {
212 return IsRegexCriterion(criterion_
);
215 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
216 return IsOriginAndPathRegexCriterion(criterion_
);
219 bool URLMatcherCondition::IsMatch(
220 const std::set
<StringPattern::ID
>& matching_patterns
,
221 const GURL
& url
) const {
222 DCHECK(string_pattern_
);
223 if (!ContainsKey(matching_patterns
, string_pattern_
->id()))
225 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
226 // a substring match on the raw URL. In case of a match, we need to verify
227 // that the match was found in the correct component of the URL.
228 switch (criterion_
) {
230 return url
.host().find(string_pattern_
->pattern()) !=
233 return url
.path().find(string_pattern_
->pattern()) !=
236 return url
.query().find(string_pattern_
->pattern()) !=
245 // URLMatcherConditionFactory
249 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
250 const char kBeginningOfURL
[] = {static_cast<char>(-1), 0};
251 const char kEndOfDomain
[] = {static_cast<char>(-2), 0};
252 const char kEndOfPath
[] = {static_cast<char>(-3), 0};
253 const char kEndOfURL
[] = {static_cast<char>(-4), 0};
256 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
258 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
259 STLDeleteElements(&substring_pattern_singletons_
);
260 STLDeleteElements(®ex_pattern_singletons_
);
261 STLDeleteElements(&origin_and_path_regex_pattern_singletons_
);
264 std::string
URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
265 const GURL
& url
) const {
266 return kBeginningOfURL
+ CanonicalizeHostname(url
.host()) + kEndOfDomain
+
267 url
.path() + kEndOfPath
+
268 (url
.has_query() ? "?" + url
.query() : std::string()) + kEndOfURL
;
271 URLMatcherCondition
URLMatcherConditionFactory::CreateHostPrefixCondition(
272 const std::string
& prefix
) {
273 return CreateCondition(URLMatcherCondition::HOST_PREFIX
,
274 kBeginningOfURL
+ CanonicalizeHostname(prefix
));
277 URLMatcherCondition
URLMatcherConditionFactory::CreateHostSuffixCondition(
278 const std::string
& suffix
) {
279 return CreateCondition(URLMatcherCondition::HOST_SUFFIX
,
280 suffix
+ kEndOfDomain
);
283 URLMatcherCondition
URLMatcherConditionFactory::CreateHostContainsCondition(
284 const std::string
& str
) {
285 return CreateCondition(URLMatcherCondition::HOST_CONTAINS
, str
);
288 URLMatcherCondition
URLMatcherConditionFactory::CreateHostEqualsCondition(
289 const std::string
& str
) {
290 return CreateCondition(URLMatcherCondition::HOST_EQUALS
,
291 kBeginningOfURL
+ CanonicalizeHostname(str
) + kEndOfDomain
);
294 URLMatcherCondition
URLMatcherConditionFactory::CreatePathPrefixCondition(
295 const std::string
& prefix
) {
296 return CreateCondition(URLMatcherCondition::PATH_PREFIX
,
297 kEndOfDomain
+ prefix
);
300 URLMatcherCondition
URLMatcherConditionFactory::CreatePathSuffixCondition(
301 const std::string
& suffix
) {
302 return CreateCondition(URLMatcherCondition::PATH_SUFFIX
, suffix
+ kEndOfPath
);
305 URLMatcherCondition
URLMatcherConditionFactory::CreatePathContainsCondition(
306 const std::string
& str
) {
307 return CreateCondition(URLMatcherCondition::PATH_CONTAINS
, str
);
310 URLMatcherCondition
URLMatcherConditionFactory::CreatePathEqualsCondition(
311 const std::string
& str
) {
312 return CreateCondition(URLMatcherCondition::PATH_EQUALS
,
313 kEndOfDomain
+ str
+ kEndOfPath
);
316 URLMatcherCondition
URLMatcherConditionFactory::CreateQueryPrefixCondition(
317 const std::string
& prefix
) {
319 if (!prefix
.empty() && prefix
[0] == '?')
320 pattern
= kEndOfPath
+ prefix
;
322 pattern
= kEndOfPath
+ ('?' + prefix
);
324 return CreateCondition(URLMatcherCondition::QUERY_PREFIX
, pattern
);
327 URLMatcherCondition
URLMatcherConditionFactory::CreateQuerySuffixCondition(
328 const std::string
& suffix
) {
329 if (!suffix
.empty() && suffix
[0] == '?') {
330 return CreateQueryEqualsCondition(suffix
);
332 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX
,
337 URLMatcherCondition
URLMatcherConditionFactory::CreateQueryContainsCondition(
338 const std::string
& str
) {
339 if (!str
.empty() && str
[0] == '?')
340 return CreateQueryPrefixCondition(str
);
342 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS
, str
);
345 URLMatcherCondition
URLMatcherConditionFactory::CreateQueryEqualsCondition(
346 const std::string
& str
) {
348 if (!str
.empty() && str
[0] == '?')
349 pattern
= kEndOfPath
+ str
+ kEndOfURL
;
351 pattern
= kEndOfPath
+ ('?' + str
) + kEndOfURL
;
353 return CreateCondition(URLMatcherCondition::QUERY_EQUALS
, pattern
);
357 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
358 const std::string
& host_suffix
,
359 const std::string
& path_prefix
) {
360 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX
,
361 host_suffix
+ kEndOfDomain
+ path_prefix
);
365 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
366 const std::string
& host
,
367 const std::string
& path_prefix
) {
368 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX
,
369 kBeginningOfURL
+ CanonicalizeHostname(host
) + kEndOfDomain
+
373 std::string
URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
374 const GURL
& url
) const {
375 GURL::Replacements replacements
;
376 replacements
.ClearPassword();
377 replacements
.ClearUsername();
378 replacements
.ClearRef();
379 // Clear port if it is implicit from scheme.
380 if (url
.has_port()) {
381 const std::string
& port
= url
.scheme();
382 if (url_canon::DefaultPortForScheme(port
.c_str(), port
.size()) ==
383 url
.EffectiveIntPort()) {
384 replacements
.ClearPort();
387 return kBeginningOfURL
+ url
.ReplaceComponents(replacements
).spec() +
391 static std::string
CanonicalizeURLForRegexSearchesHelper(
394 GURL::Replacements replacements
;
395 replacements
.ClearPassword();
396 replacements
.ClearUsername();
397 replacements
.ClearRef();
399 replacements
.ClearQuery();
400 // Clear port if it is implicit from scheme.
401 if (url
.has_port()) {
402 const std::string
& port
= url
.scheme();
403 if (url_canon::DefaultPortForScheme(port
.c_str(), port
.size()) ==
404 url
.EffectiveIntPort()) {
405 replacements
.ClearPort();
408 return url
.ReplaceComponents(replacements
).spec();
411 std::string
URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
412 const GURL
& url
) const {
413 return CanonicalizeURLForRegexSearchesHelper(url
, false);
417 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
418 const GURL
& url
) const {
419 return CanonicalizeURLForRegexSearchesHelper(url
, true);
422 URLMatcherCondition
URLMatcherConditionFactory::CreateURLPrefixCondition(
423 const std::string
& prefix
) {
424 return CreateCondition(URLMatcherCondition::URL_PREFIX
,
425 kBeginningOfURL
+ prefix
);
428 URLMatcherCondition
URLMatcherConditionFactory::CreateURLSuffixCondition(
429 const std::string
& suffix
) {
430 return CreateCondition(URLMatcherCondition::URL_SUFFIX
, suffix
+ kEndOfURL
);
433 URLMatcherCondition
URLMatcherConditionFactory::CreateURLContainsCondition(
434 const std::string
& str
) {
435 return CreateCondition(URLMatcherCondition::URL_CONTAINS
, str
);
438 URLMatcherCondition
URLMatcherConditionFactory::CreateURLEqualsCondition(
439 const std::string
& str
) {
440 return CreateCondition(URLMatcherCondition::URL_EQUALS
,
441 kBeginningOfURL
+ str
+ kEndOfURL
);
444 URLMatcherCondition
URLMatcherConditionFactory::CreateURLMatchesCondition(
445 const std::string
& regex
) {
446 return CreateCondition(URLMatcherCondition::URL_MATCHES
, regex
);
450 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
451 const std::string
& regex
) {
452 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES
, regex
);
455 void URLMatcherConditionFactory::ForgetUnusedPatterns(
456 const std::set
<StringPattern::ID
>& used_patterns
) {
457 PatternSingletons::iterator i
= substring_pattern_singletons_
.begin();
458 while (i
!= substring_pattern_singletons_
.end()) {
459 if (ContainsKey(used_patterns
, (*i
)->id())) {
463 substring_pattern_singletons_
.erase(i
++);
466 i
= regex_pattern_singletons_
.begin();
467 while (i
!= regex_pattern_singletons_
.end()) {
468 if (ContainsKey(used_patterns
, (*i
)->id())) {
472 regex_pattern_singletons_
.erase(i
++);
475 i
= origin_and_path_regex_pattern_singletons_
.begin();
476 while (i
!= origin_and_path_regex_pattern_singletons_
.end()) {
477 if (ContainsKey(used_patterns
, (*i
)->id())) {
481 origin_and_path_regex_pattern_singletons_
.erase(i
++);
486 bool URLMatcherConditionFactory::IsEmpty() const {
487 return substring_pattern_singletons_
.empty() &&
488 regex_pattern_singletons_
.empty() &&
489 origin_and_path_regex_pattern_singletons_
.empty();
492 URLMatcherCondition
URLMatcherConditionFactory::CreateCondition(
493 URLMatcherCondition::Criterion criterion
,
494 const std::string
& pattern
) {
495 StringPattern
search_pattern(pattern
, 0);
496 PatternSingletons
* pattern_singletons
= NULL
;
497 if (IsRegexCriterion(criterion
))
498 pattern_singletons
= ®ex_pattern_singletons_
;
499 else if (IsOriginAndPathRegexCriterion(criterion
))
500 pattern_singletons
= &origin_and_path_regex_pattern_singletons_
;
502 pattern_singletons
= &substring_pattern_singletons_
;
504 PatternSingletons::const_iterator iter
=
505 pattern_singletons
->find(&search_pattern
);
507 if (iter
!= pattern_singletons
->end()) {
508 return URLMatcherCondition(criterion
, *iter
);
510 StringPattern
* new_pattern
=
511 new StringPattern(pattern
, id_counter_
++);
512 pattern_singletons
->insert(new_pattern
);
513 return URLMatcherCondition(criterion
, new_pattern
);
517 std::string
URLMatcherConditionFactory::CanonicalizeHostname(
518 const std::string
& hostname
) const {
519 if (!hostname
.empty() && hostname
[0] == '.')
522 return "." + hostname
;
525 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
527 StringPattern
* rhs
) const {
528 if (lhs
== NULL
&& rhs
!= NULL
) return true;
529 if (lhs
!= NULL
&& rhs
!= NULL
)
530 return lhs
->pattern() < rhs
->pattern();
531 // Either both are NULL or only rhs is NULL.
536 // URLMatcherSchemeFilter
539 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string
& filter
)
541 filters_
.push_back(filter
);
544 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
545 const std::vector
<std::string
>& filters
)
546 : filters_(filters
) {}
548 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
550 bool URLMatcherSchemeFilter::IsMatch(const GURL
& url
) const {
551 return std::find(filters_
.begin(), filters_
.end(), url
.scheme()) !=
556 // URLMatcherPortFilter
559 URLMatcherPortFilter::URLMatcherPortFilter(
560 const std::vector
<URLMatcherPortFilter::Range
>& ranges
)
563 URLMatcherPortFilter::~URLMatcherPortFilter() {}
565 bool URLMatcherPortFilter::IsMatch(const GURL
& url
) const {
566 int port
= url
.EffectiveIntPort();
567 for (std::vector
<Range
>::const_iterator i
= ranges_
.begin();
568 i
!= ranges_
.end(); ++i
) {
569 if (i
->first
<= port
&& port
<= i
->second
)
576 URLMatcherPortFilter::Range
URLMatcherPortFilter::CreateRange(int from
,
578 return Range(from
, to
);
582 URLMatcherPortFilter::Range
URLMatcherPortFilter::CreateRange(int port
) {
583 return Range(port
, port
);
587 // URLMatcherConditionSet
590 URLMatcherConditionSet::~URLMatcherConditionSet() {}
592 URLMatcherConditionSet::URLMatcherConditionSet(
594 const Conditions
& conditions
)
596 conditions_(conditions
) {}
598 URLMatcherConditionSet::URLMatcherConditionSet(
600 const Conditions
& conditions
,
601 scoped_ptr
<URLMatcherSchemeFilter
> scheme_filter
,
602 scoped_ptr
<URLMatcherPortFilter
> port_filter
)
604 conditions_(conditions
),
605 scheme_filter_(scheme_filter
.Pass()),
606 port_filter_(port_filter
.Pass()) {}
608 bool URLMatcherConditionSet::IsMatch(
609 const std::set
<StringPattern::ID
>& matching_patterns
,
610 const GURL
& url
) const {
611 for (Conditions::const_iterator i
= conditions_
.begin();
612 i
!= conditions_
.end(); ++i
) {
613 if (!i
->IsMatch(matching_patterns
, url
))
616 if (scheme_filter_
.get() && !scheme_filter_
->IsMatch(url
))
618 if (port_filter_
.get() && !port_filter_
->IsMatch(url
))
627 URLMatcher::URLMatcher() {}
629 URLMatcher::~URLMatcher() {}
631 void URLMatcher::AddConditionSets(
632 const URLMatcherConditionSet::Vector
& condition_sets
) {
633 for (URLMatcherConditionSet::Vector::const_iterator i
=
634 condition_sets
.begin(); i
!= condition_sets
.end(); ++i
) {
635 DCHECK(url_matcher_condition_sets_
.find((*i
)->id()) ==
636 url_matcher_condition_sets_
.end());
637 url_matcher_condition_sets_
[(*i
)->id()] = *i
;
639 UpdateInternalDatastructures();
642 void URLMatcher::RemoveConditionSets(
643 const std::vector
<URLMatcherConditionSet::ID
>& condition_set_ids
) {
644 for (std::vector
<URLMatcherConditionSet::ID
>::const_iterator i
=
645 condition_set_ids
.begin(); i
!= condition_set_ids
.end(); ++i
) {
646 DCHECK(url_matcher_condition_sets_
.find(*i
) !=
647 url_matcher_condition_sets_
.end());
648 url_matcher_condition_sets_
.erase(*i
);
650 UpdateInternalDatastructures();
653 void URLMatcher::ClearUnusedConditionSets() {
654 UpdateConditionFactory();
657 std::set
<URLMatcherConditionSet::ID
> URLMatcher::MatchURL(
658 const GURL
& url
) const {
659 // Find all IDs of StringPatterns that match |url|.
660 // See URLMatcherConditionFactory for the canonicalization of URLs and the
661 // distinction between full url searches and url component searches.
662 std::set
<StringPattern::ID
> matches
;
663 if (!full_url_matcher_
.IsEmpty()) {
664 full_url_matcher_
.Match(
665 condition_factory_
.CanonicalizeURLForFullSearches(url
), &matches
);
667 if (!url_component_matcher_
.IsEmpty()) {
668 url_component_matcher_
.Match(
669 condition_factory_
.CanonicalizeURLForComponentSearches(url
), &matches
);
671 if (!regex_set_matcher_
.IsEmpty()) {
672 regex_set_matcher_
.Match(
673 condition_factory_
.CanonicalizeURLForRegexSearches(url
), &matches
);
675 if (!origin_and_path_regex_set_matcher_
.IsEmpty()) {
676 origin_and_path_regex_set_matcher_
.Match(
677 condition_factory_
.CanonicalizeURLForOriginAndPathRegexSearches(url
),
681 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
683 std::set
<URLMatcherConditionSet::ID
> result
;
684 for (std::set
<StringPattern::ID
>::const_iterator i
= matches
.begin();
685 i
!= matches
.end(); ++i
) {
686 // For each URLMatcherConditionSet there is exactly one condition
687 // registered in substring_match_triggers_. This means that the following
688 // logic tests each URLMatcherConditionSet exactly once if it can be
689 // completely fulfilled.
690 StringPatternTriggers::const_iterator triggered_condition_sets_iter
=
691 substring_match_triggers_
.find(*i
);
692 if (triggered_condition_sets_iter
== substring_match_triggers_
.end())
693 continue; // Not all substring matches are triggers for a condition set.
694 const std::set
<URLMatcherConditionSet::ID
>& condition_sets
=
695 triggered_condition_sets_iter
->second
;
696 for (std::set
<URLMatcherConditionSet::ID
>::const_iterator j
=
697 condition_sets
.begin(); j
!= condition_sets
.end(); ++j
) {
698 URLMatcherConditionSets::const_iterator condition_set_iter
=
699 url_matcher_condition_sets_
.find(*j
);
700 DCHECK(condition_set_iter
!= url_matcher_condition_sets_
.end());
701 if (condition_set_iter
->second
->IsMatch(matches
, url
))
709 bool URLMatcher::IsEmpty() const {
710 return condition_factory_
.IsEmpty() &&
711 url_matcher_condition_sets_
.empty() &&
712 substring_match_triggers_
.empty() &&
713 full_url_matcher_
.IsEmpty() &&
714 url_component_matcher_
.IsEmpty() &&
715 regex_set_matcher_
.IsEmpty() &&
716 origin_and_path_regex_set_matcher_
.IsEmpty() &&
717 registered_full_url_patterns_
.empty() &&
718 registered_url_component_patterns_
.empty();
721 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions
) {
722 // The purpose of |full_url_conditions| is just that we need to execute
723 // the same logic once for Full URL searches and once for URL Component
724 // searches (see URLMatcherConditionFactory).
726 // Determine which patterns need to be registered when this function
728 std::set
<const StringPattern
*> new_patterns
;
729 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
730 url_matcher_condition_sets_
.begin();
731 condition_set_iter
!= url_matcher_condition_sets_
.end();
732 ++condition_set_iter
) {
733 const URLMatcherConditionSet::Conditions
& conditions
=
734 condition_set_iter
->second
->conditions();
735 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
736 conditions
.begin(); condition_iter
!= conditions
.end();
738 // If we are called to process Full URL searches, ignore others, and
739 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
740 if (!condition_iter
->IsRegexCondition() &&
741 !condition_iter
->IsOriginAndPathRegexCondition() &&
742 full_url_conditions
== condition_iter
->IsFullURLCondition())
743 new_patterns
.insert(condition_iter
->string_pattern());
747 // This is the set of patterns that were registered before this function
749 std::set
<const StringPattern
*>& registered_patterns
=
750 full_url_conditions
? registered_full_url_patterns_
751 : registered_url_component_patterns_
;
753 // Add all patterns that are in new_patterns but not in registered_patterns.
754 std::vector
<const StringPattern
*> patterns_to_register
=
755 base::STLSetDifference
<std::vector
<const StringPattern
*> >(
756 new_patterns
, registered_patterns
);
758 // Remove all patterns that are in registered_patterns but not in
760 std::vector
<const StringPattern
*> patterns_to_unregister
=
761 base::STLSetDifference
<std::vector
<const StringPattern
*> >(
762 registered_patterns
, new_patterns
);
764 // Update the SubstringSetMatcher.
765 SubstringSetMatcher
& url_matcher
=
766 full_url_conditions
? full_url_matcher_
: url_component_matcher_
;
767 url_matcher
.RegisterAndUnregisterPatterns(patterns_to_register
,
768 patterns_to_unregister
);
770 // Update the set of registered_patterns for the next time this function
772 registered_patterns
.swap(new_patterns
);
775 void URLMatcher::UpdateRegexSetMatcher() {
776 std::vector
<const StringPattern
*> new_patterns
;
777 std::vector
<const StringPattern
*> new_origin_and_path_patterns
;
779 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
780 url_matcher_condition_sets_
.begin();
781 condition_set_iter
!= url_matcher_condition_sets_
.end();
782 ++condition_set_iter
) {
783 const URLMatcherConditionSet::Conditions
& conditions
=
784 condition_set_iter
->second
->conditions();
785 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
786 conditions
.begin(); condition_iter
!= conditions
.end();
788 if (condition_iter
->IsRegexCondition()) {
789 new_patterns
.push_back(condition_iter
->string_pattern());
790 } else if (condition_iter
->IsOriginAndPathRegexCondition()) {
791 new_origin_and_path_patterns
.push_back(
792 condition_iter
->string_pattern());
797 // Start over from scratch. We can't really do better than this, since the
798 // FilteredRE2 backend doesn't support incremental updates.
799 regex_set_matcher_
.ClearPatterns();
800 regex_set_matcher_
.AddPatterns(new_patterns
);
801 origin_and_path_regex_set_matcher_
.ClearPatterns();
802 origin_and_path_regex_set_matcher_
.AddPatterns(new_origin_and_path_patterns
);
805 void URLMatcher::UpdateTriggers() {
806 // Count substring pattern frequencies.
807 std::map
<StringPattern::ID
, size_t> substring_pattern_frequencies
;
808 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
809 url_matcher_condition_sets_
.begin();
810 condition_set_iter
!= url_matcher_condition_sets_
.end();
811 ++condition_set_iter
) {
812 const URLMatcherConditionSet::Conditions
& conditions
=
813 condition_set_iter
->second
->conditions();
814 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
815 conditions
.begin(); condition_iter
!= conditions
.end();
817 const StringPattern
* pattern
= condition_iter
->string_pattern();
818 substring_pattern_frequencies
[pattern
->id()]++;
822 // Update trigger conditions: Determine for each URLMatcherConditionSet which
823 // URLMatcherCondition contains a StringPattern that occurs least
824 // frequently in this URLMatcher. We assume that this condition is very
825 // specific and occurs rarely in URLs. If a match occurs for this
826 // URLMatcherCondition, we want to test all other URLMatcherCondition in the
827 // respective URLMatcherConditionSet as well to see whether the entire
828 // URLMatcherConditionSet is considered matching.
829 substring_match_triggers_
.clear();
830 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
831 url_matcher_condition_sets_
.begin();
832 condition_set_iter
!= url_matcher_condition_sets_
.end();
833 ++condition_set_iter
) {
834 const URLMatcherConditionSet::Conditions
& conditions
=
835 condition_set_iter
->second
->conditions();
836 if (conditions
.empty())
838 URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
840 StringPattern::ID trigger
= condition_iter
->string_pattern()->id();
841 // We skip the first element in the following loop.
843 for (; condition_iter
!= conditions
.end(); ++condition_iter
) {
844 StringPattern::ID current_id
=
845 condition_iter
->string_pattern()->id();
846 if (substring_pattern_frequencies
[trigger
] >
847 substring_pattern_frequencies
[current_id
]) {
848 trigger
= current_id
;
851 substring_match_triggers_
[trigger
].insert(condition_set_iter
->second
->id());
855 void URLMatcher::UpdateConditionFactory() {
856 std::set
<StringPattern::ID
> used_patterns
;
857 for (URLMatcherConditionSets::const_iterator condition_set_iter
=
858 url_matcher_condition_sets_
.begin();
859 condition_set_iter
!= url_matcher_condition_sets_
.end();
860 ++condition_set_iter
) {
861 const URLMatcherConditionSet::Conditions
& conditions
=
862 condition_set_iter
->second
->conditions();
863 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter
=
864 conditions
.begin(); condition_iter
!= conditions
.end();
866 used_patterns
.insert(condition_iter
->string_pattern()->id());
869 condition_factory_
.ForgetUnusedPatterns(used_patterns
);
872 void URLMatcher::UpdateInternalDatastructures() {
873 UpdateSubstringSetMatcher(false);
874 UpdateSubstringSetMatcher(true);
875 UpdateRegexSetMatcher();
877 UpdateConditionFactory();
880 } // namespace url_matcher