Merge Chromium + Blink git repositories
[chromium-blink-merge.git] / components / url_matcher / url_matcher.cc
blob67fc3f6f4abb33af871b60f0505cae8285a4c461
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/url_matcher/url_matcher.h"
7 #include <algorithm>
8 #include <iterator>
10 #include "base/logging.h"
11 #include "base/stl_util.h"
12 #include "url/gurl.h"
13 #include "url/url_canon.h"
15 namespace url_matcher {
17 // This set of classes implement a mapping of URL Component Patterns, such as
18 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
19 // for use in substring comparisons.
21 // The idea of this mapping is to reduce the problem of comparing many
22 // URL Component Patterns against one URL to the problem of searching many
23 // substrings in one string:
25 // ---------------------- -----------------
26 // | URL Query operator | ----translate----> | StringPattern |
27 // ---------------------- -----------------
28 // ^
29 // |
30 // compare
31 // |
32 // v
33 // ---------------------- -----------------
34 // | URL to compare | | |
35 // | to all URL Query | ----translate----> | String |
36 // | operators | | |
37 // ---------------------- -----------------
39 // The reason for this problem reduction is that there are efficient algorithms
40 // for searching many substrings in one string (see Aho-Corasick algorithm).
42 // Additionally, some of the same pieces are reused to implement regular
43 // expression comparisons. The FilteredRE2 implementation for matching many
44 // regular expressions against one string uses prefiltering, in which a set
45 // of substrings (derived from the regexes) are first searched for, to reduce
46 // the number of regular expressions to test; the prefiltering step also
47 // uses Aho-Corasick.
49 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
50 // ==========================================================
52 // For searches in this class, we normalize URLs as follows:
54 // Step 1:
55 // Remove scheme, port and segment from URL:
56 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
57 // www.example.com/index.html?search=foo
59 // We remove the scheme and port number because they can be checked later
60 // in a secondary filter step. We remove the segment (the #... part) because
61 // this is not guaranteed to be ASCII-7 encoded.
63 // Step 2:
64 // Translate URL to String and add the following position markers:
65 // - BU = Beginning of URL
66 // - ED = End of Domain
67 // - EP = End of Path
68 // - EU = End of URL
69 // Furthermore, the hostname is canonicalized to start with a ".".
71 // Position markers are represented as characters >127, which are therefore
72 // guaranteed not to be part of the ASCII-7 encoded URL character set.
74 // -> www.example.com/index.html?search=foo becomes
75 // BU .www.example.com ED /index.html EP ?search=foo EU
77 // -> www.example.com/index.html becomes
78 // BU .www.example.com ED /index.html EP EU
80 // Step 3:
81 // Translate URL Component Patterns as follows:
83 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
84 // -> host_prefix("www.example") = BU .www.example
86 // host_suffix(suffix) = suffix ED
87 // -> host_suffix("example.com") = example.com ED
88 // -> host_suffix(".example.com") = .example.com ED
90 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
91 // -> host_equals("www.example.com") = BU .www.example.com ED
93 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
95 // With this, we can search the StringPatterns in the normalized URL.
98 // Case 2: url_{prefix,suffix,equals,contains} searches.
99 // =====================================================
101 // Step 1: as above, except that
102 // - the scheme is not removed
103 // - the port is not removed if it is specified and does not match the default
104 // port for the given scheme.
106 // Step 2:
107 // Translate URL to String and add the following position markers:
108 // - BU = Beginning of URL
109 // - EU = End of URL
111 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
112 // BU http://www.example.com:8080/index.html?search=foo EU
113 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
114 // BU http://www.example.com/index.html?search=foo EU
116 // url_prefix(prefix) = BU prefix
117 // -> url_prefix("http://www.example") = BU http://www.example
119 // url_contains(substring) = substring
120 // -> url_contains("index") = index
123 // Case 3: {host,path,query}_contains searches.
124 // ============================================
126 // These kinds of searches are not supported directly but can be derived
127 // by a combination of a url_contains() query followed by an explicit test:
129 // host_contains(str) = url_contains(str) followed by test whether str occurs
130 // in host component of original URL.
131 // -> host_contains("example.co") = example.co
132 // followed by gurl.host().find("example.co");
134 // [similarly for path_contains and query_contains].
137 // Regular expression matching (url_matches searches)
138 // ==================================================
140 // This class also supports matching regular expressions (RE2 syntax)
141 // against full URLs, which are transformed as in case 2.
143 namespace {
145 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
146 return criterion == URLMatcherCondition::URL_MATCHES;
149 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
150 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
153 } // namespace
156 // URLMatcherCondition
159 URLMatcherCondition::URLMatcherCondition()
160 : criterion_(HOST_PREFIX),
161 string_pattern_(NULL) {}
163 URLMatcherCondition::~URLMatcherCondition() {}
165 URLMatcherCondition::URLMatcherCondition(
166 Criterion criterion,
167 const StringPattern* string_pattern)
168 : criterion_(criterion),
169 string_pattern_(string_pattern) {}
171 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
172 : criterion_(rhs.criterion_),
173 string_pattern_(rhs.string_pattern_) {}
175 URLMatcherCondition& URLMatcherCondition::operator=(
176 const URLMatcherCondition& rhs) {
177 criterion_ = rhs.criterion_;
178 string_pattern_ = rhs.string_pattern_;
179 return *this;
182 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
183 if (criterion_ < rhs.criterion_) return true;
184 if (criterion_ > rhs.criterion_) return false;
185 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
186 return *string_pattern_ < *rhs.string_pattern_;
187 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
188 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
189 // or both are NULL.
190 return false;
193 bool URLMatcherCondition::IsFullURLCondition() const {
194 // For these criteria the SubstringMatcher needs to be executed on the
195 // GURL that is canonicalized with
196 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
197 switch (criterion_) {
198 case HOST_CONTAINS:
199 case PATH_CONTAINS:
200 case QUERY_CONTAINS:
201 case URL_PREFIX:
202 case URL_SUFFIX:
203 case URL_CONTAINS:
204 case URL_EQUALS:
205 return true;
206 default:
207 break;
209 return false;
212 bool URLMatcherCondition::IsRegexCondition() const {
213 return IsRegexCriterion(criterion_);
216 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
217 return IsOriginAndPathRegexCriterion(criterion_);
220 bool URLMatcherCondition::IsMatch(
221 const std::set<StringPattern::ID>& matching_patterns,
222 const GURL& url) const {
223 DCHECK(string_pattern_);
224 if (!ContainsKey(matching_patterns, string_pattern_->id()))
225 return false;
226 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
227 // a substring match on the raw URL. In case of a match, we need to verify
228 // that the match was found in the correct component of the URL.
229 switch (criterion_) {
230 case HOST_CONTAINS:
231 return url.host().find(string_pattern_->pattern()) !=
232 std::string::npos;
233 case PATH_CONTAINS:
234 return url.path().find(string_pattern_->pattern()) !=
235 std::string::npos;
236 case QUERY_CONTAINS:
237 return url.query().find(string_pattern_->pattern()) !=
238 std::string::npos;
239 default:
240 break;
242 return true;
246 // URLMatcherConditionFactory
249 namespace {
250 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
251 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
252 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
253 const char kEndOfPath[] = {static_cast<char>(-3), 0};
254 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0};
255 const char kEndOfURL[] = {static_cast<char>(-5), 0};
257 // The delimiter for query parameters
258 const char kQuerySeparator = '&';
259 } // namespace
261 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
263 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
264 STLDeleteElements(&substring_pattern_singletons_);
265 STLDeleteElements(&regex_pattern_singletons_);
266 STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
269 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
270 const GURL& url) const {
271 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
272 url.path() + kEndOfPath +
273 (url.has_query() ? CanonicalizeQuery(url.query(), true, true)
274 : std::string()) +
275 kEndOfURL;
278 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
279 const std::string& prefix) {
280 return CreateCondition(URLMatcherCondition::HOST_PREFIX,
281 kBeginningOfURL + CanonicalizeHostPrefix(prefix));
284 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
285 const std::string& suffix) {
286 return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
287 CanonicalizeHostSuffix(suffix) + kEndOfDomain);
290 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
291 const std::string& str) {
292 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
295 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
296 const std::string& str) {
297 return CreateCondition(URLMatcherCondition::HOST_EQUALS,
298 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
301 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
302 const std::string& prefix) {
303 return CreateCondition(URLMatcherCondition::PATH_PREFIX,
304 kEndOfDomain + prefix);
307 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
308 const std::string& suffix) {
309 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
312 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
313 const std::string& str) {
314 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
317 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
318 const std::string& str) {
319 return CreateCondition(URLMatcherCondition::PATH_EQUALS,
320 kEndOfDomain + str + kEndOfPath);
323 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
324 const std::string& prefix) {
325 std::string pattern;
326 if (!prefix.empty() && prefix[0] == '?')
327 pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false);
328 else
329 pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false);
331 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
334 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
335 const std::string& suffix) {
336 if (!suffix.empty() && suffix[0] == '?') {
337 return CreateQueryEqualsCondition(suffix);
338 } else {
339 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
340 CanonicalizeQuery(suffix, false, true) + kEndOfURL);
344 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
345 const std::string& str) {
346 if (!str.empty() && str[0] == '?')
347 return CreateQueryPrefixCondition(str);
348 else
349 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
352 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
353 const std::string& str) {
354 std::string pattern;
355 if (!str.empty() && str[0] == '?')
356 pattern =
357 kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL;
358 else
359 pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL;
361 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
364 URLMatcherCondition
365 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
366 const std::string& host_suffix,
367 const std::string& path_prefix) {
368 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
369 CanonicalizeHostSuffix(host_suffix) + kEndOfDomain + path_prefix);
372 URLMatcherCondition
373 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
374 const std::string& host,
375 const std::string& path_prefix) {
376 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
377 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
378 path_prefix);
381 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
382 const GURL& url) const {
383 GURL::Replacements replacements;
384 replacements.ClearPassword();
385 replacements.ClearUsername();
386 replacements.ClearRef();
387 // Clear port if it is implicit from scheme.
388 if (url.has_port()) {
389 const std::string& port = url.scheme();
390 if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
391 url.EffectiveIntPort()) {
392 replacements.ClearPort();
395 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
396 kEndOfURL;
399 static std::string CanonicalizeURLForRegexSearchesHelper(
400 const GURL& url,
401 bool clear_query) {
402 GURL::Replacements replacements;
403 replacements.ClearPassword();
404 replacements.ClearUsername();
405 replacements.ClearRef();
406 if (clear_query)
407 replacements.ClearQuery();
408 // Clear port if it is implicit from scheme.
409 if (url.has_port()) {
410 const std::string& port = url.scheme();
411 if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
412 url.EffectiveIntPort()) {
413 replacements.ClearPort();
416 return url.ReplaceComponents(replacements).spec();
419 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
420 const GURL& url) const {
421 return CanonicalizeURLForRegexSearchesHelper(url, false);
424 std::string
425 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
426 const GURL& url) const {
427 return CanonicalizeURLForRegexSearchesHelper(url, true);
430 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
431 const std::string& prefix) {
432 return CreateCondition(URLMatcherCondition::URL_PREFIX,
433 kBeginningOfURL + prefix);
436 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
437 const std::string& suffix) {
438 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
441 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
442 const std::string& str) {
443 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
446 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
447 const std::string& str) {
448 return CreateCondition(URLMatcherCondition::URL_EQUALS,
449 kBeginningOfURL + str + kEndOfURL);
452 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
453 const std::string& regex) {
454 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
457 URLMatcherCondition
458 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
459 const std::string& regex) {
460 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
463 void URLMatcherConditionFactory::ForgetUnusedPatterns(
464 const std::set<StringPattern::ID>& used_patterns) {
465 PatternSingletons::iterator i = substring_pattern_singletons_.begin();
466 while (i != substring_pattern_singletons_.end()) {
467 if (ContainsKey(used_patterns, (*i)->id())) {
468 ++i;
469 } else {
470 delete *i;
471 substring_pattern_singletons_.erase(i++);
474 i = regex_pattern_singletons_.begin();
475 while (i != regex_pattern_singletons_.end()) {
476 if (ContainsKey(used_patterns, (*i)->id())) {
477 ++i;
478 } else {
479 delete *i;
480 regex_pattern_singletons_.erase(i++);
483 i = origin_and_path_regex_pattern_singletons_.begin();
484 while (i != origin_and_path_regex_pattern_singletons_.end()) {
485 if (ContainsKey(used_patterns, (*i)->id())) {
486 ++i;
487 } else {
488 delete *i;
489 origin_and_path_regex_pattern_singletons_.erase(i++);
494 bool URLMatcherConditionFactory::IsEmpty() const {
495 return substring_pattern_singletons_.empty() &&
496 regex_pattern_singletons_.empty() &&
497 origin_and_path_regex_pattern_singletons_.empty();
500 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
501 URLMatcherCondition::Criterion criterion,
502 const std::string& pattern) {
503 StringPattern search_pattern(pattern, 0);
504 PatternSingletons* pattern_singletons = NULL;
505 if (IsRegexCriterion(criterion))
506 pattern_singletons = &regex_pattern_singletons_;
507 else if (IsOriginAndPathRegexCriterion(criterion))
508 pattern_singletons = &origin_and_path_regex_pattern_singletons_;
509 else
510 pattern_singletons = &substring_pattern_singletons_;
512 PatternSingletons::const_iterator iter =
513 pattern_singletons->find(&search_pattern);
515 if (iter != pattern_singletons->end()) {
516 return URLMatcherCondition(criterion, *iter);
517 } else {
518 StringPattern* new_pattern =
519 new StringPattern(pattern, id_counter_++);
520 pattern_singletons->insert(new_pattern);
521 return URLMatcherCondition(criterion, new_pattern);
525 std::string URLMatcherConditionFactory::CanonicalizeHostSuffix(
526 const std::string& suffix) const {
527 if (!suffix.empty() && suffix[suffix.size() - 1] == '.')
528 return suffix;
529 else
530 return suffix + ".";
533 std::string URLMatcherConditionFactory::CanonicalizeHostPrefix(
534 const std::string& prefix) const {
535 if (!prefix.empty() && prefix[0] == '.')
536 return prefix;
537 else
538 return "." + prefix;
541 std::string URLMatcherConditionFactory::CanonicalizeHostname(
542 const std::string& hostname) const {
543 return CanonicalizeHostPrefix(CanonicalizeHostSuffix(hostname));
546 // This function prepares the query string by replacing query separator with a
547 // magic value (|kQueryComponentDelimiter|). When the boolean
548 // |prepend_beginning_of_query_component| is true the function prepends the
549 // query with the same magic. This is done to locate the start of a key value
550 // pair in the query string. The parameter |query| is passed by value
551 // intentionally, since it is locally modified.
552 std::string URLMatcherConditionFactory::CanonicalizeQuery(
553 std::string query,
554 bool prepend_beginning_of_query_component,
555 bool append_end_of_query_component) const {
556 for (std::string::iterator it = query.begin(); it != query.end(); ++it) {
557 if (*it == kQuerySeparator)
558 *it = kQueryComponentDelimiter[0];
560 if (prepend_beginning_of_query_component)
561 query = kQueryComponentDelimiter + query;
562 if (append_end_of_query_component)
563 query += kQueryComponentDelimiter;
564 return query;
567 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
568 StringPattern* lhs,
569 StringPattern* rhs) const {
570 if (lhs == NULL && rhs != NULL) return true;
571 if (lhs != NULL && rhs != NULL)
572 return lhs->pattern() < rhs->pattern();
573 // Either both are NULL or only rhs is NULL.
574 return false;
578 // URLQueryElementMatcherCondition
581 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
582 const std::string& key,
583 const std::string& value,
584 QueryValueMatchType query_value_match_type,
585 QueryElementType query_element_type,
586 Type match_type,
587 URLMatcherConditionFactory* factory) {
588 match_type_ = match_type;
590 if (query_element_type == ELEMENT_TYPE_KEY_VALUE) {
591 key_ = kQueryComponentDelimiter + key + "=";
592 value_ = value;
593 } else {
594 key_ = kQueryComponentDelimiter + key;
595 value_ = std::string();
598 if (query_value_match_type == QUERY_VALUE_MATCH_EXACT)
599 value_ += kQueryComponentDelimiter;
601 // If |value_| is empty no need to find the |key_| and verify if the value
602 // matches. Simply checking the presence of key is sufficient, which is done
603 // by MATCH_ANY
604 if (value_.empty())
605 match_type_ = MATCH_ANY;
607 URLMatcherCondition condition;
608 // If |match_type_| is MATCH_ANY, then we could simply look for the
609 // combination of |key_| + |value_|, which can be efficiently done by
610 // SubstringMatcher
611 if (match_type_ == MATCH_ANY)
612 condition = factory->CreateQueryContainsCondition(key_ + value_);
613 else
614 condition = factory->CreateQueryContainsCondition(key_);
615 string_pattern_ = condition.string_pattern();
617 key_length_ = key_.length();
618 value_length_ = value_.length();
621 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}
623 bool URLQueryElementMatcherCondition::operator<(
624 const URLQueryElementMatcherCondition& rhs) const {
625 if (match_type_ != rhs.match_type_)
626 return match_type_ < rhs.match_type_;
627 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
628 return *string_pattern_ < *rhs.string_pattern_;
629 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL)
630 return true;
631 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
632 // or both are NULL.
633 return false;
636 bool URLQueryElementMatcherCondition::IsMatch(
637 const std::string& url_for_component_searches) const {
638 switch (match_type_) {
639 case MATCH_ANY: {
640 // For MATCH_ANY, no additional verification step is needed. We can trust
641 // the SubstringMatcher to do the verification.
642 return true;
644 case MATCH_ALL: {
645 size_t start = 0;
646 int found = 0;
647 size_t offset;
648 while ((offset = url_for_component_searches.find(key_, start)) !=
649 std::string::npos) {
650 if (url_for_component_searches.compare(
651 offset + key_length_, value_length_, value_) != 0) {
652 return false;
653 } else {
654 ++found;
656 start = offset + key_length_ + value_length_ - 1;
658 return !!found;
660 case MATCH_FIRST: {
661 size_t offset = url_for_component_searches.find(key_);
662 return url_for_component_searches.compare(
663 offset + key_length_, value_length_, value_) == 0;
665 case MATCH_LAST: {
666 size_t offset = url_for_component_searches.rfind(key_);
667 return url_for_component_searches.compare(
668 offset + key_length_, value_length_, value_) == 0;
671 NOTREACHED();
672 return false;
676 // URLMatcherSchemeFilter
679 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
680 : filters_(1) {
681 filters_.push_back(filter);
684 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
685 const std::vector<std::string>& filters)
686 : filters_(filters) {}
688 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
690 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
691 return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
692 filters_.end();
696 // URLMatcherPortFilter
699 URLMatcherPortFilter::URLMatcherPortFilter(
700 const std::vector<URLMatcherPortFilter::Range>& ranges)
701 : ranges_(ranges) {}
703 URLMatcherPortFilter::~URLMatcherPortFilter() {}
705 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
706 int port = url.EffectiveIntPort();
707 for (std::vector<Range>::const_iterator i = ranges_.begin();
708 i != ranges_.end(); ++i) {
709 if (i->first <= port && port <= i->second)
710 return true;
712 return false;
715 // static
716 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
717 int to) {
718 return Range(from, to);
721 // static
722 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
723 return Range(port, port);
727 // URLMatcherConditionSet
730 URLMatcherConditionSet::~URLMatcherConditionSet() {}
732 URLMatcherConditionSet::URLMatcherConditionSet(
733 ID id,
734 const Conditions& conditions)
735 : id_(id),
736 conditions_(conditions) {}
738 URLMatcherConditionSet::URLMatcherConditionSet(
739 ID id,
740 const Conditions& conditions,
741 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
742 scoped_ptr<URLMatcherPortFilter> port_filter)
743 : id_(id),
744 conditions_(conditions),
745 scheme_filter_(scheme_filter.Pass()),
746 port_filter_(port_filter.Pass()) {}
748 URLMatcherConditionSet::URLMatcherConditionSet(
749 ID id,
750 const Conditions& conditions,
751 const QueryConditions& query_conditions,
752 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
753 scoped_ptr<URLMatcherPortFilter> port_filter)
754 : id_(id),
755 conditions_(conditions),
756 query_conditions_(query_conditions),
757 scheme_filter_(scheme_filter.Pass()),
758 port_filter_(port_filter.Pass()) {}
760 bool URLMatcherConditionSet::IsMatch(
761 const std::set<StringPattern::ID>& matching_patterns,
762 const GURL& url) const {
763 return IsMatch(matching_patterns, url, std::string());
766 bool URLMatcherConditionSet::IsMatch(
767 const std::set<StringPattern::ID>& matching_patterns,
768 const GURL& url,
769 const std::string& url_for_component_searches) const {
770 for (Conditions::const_iterator i = conditions_.begin();
771 i != conditions_.end(); ++i) {
772 if (!i->IsMatch(matching_patterns, url))
773 return false;
775 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
776 return false;
777 if (port_filter_.get() && !port_filter_->IsMatch(url))
778 return false;
779 if (query_conditions_.empty())
780 return true;
781 // The loop is duplicated below for performance reasons. If not all query
782 // elements are found, no need to verify match that is expected to take more
783 // cycles.
784 for (QueryConditions::const_iterator i = query_conditions_.begin();
785 i != query_conditions_.end();
786 ++i) {
787 if (!ContainsKey(matching_patterns, i->string_pattern()->id()))
788 return false;
790 for (QueryConditions::const_iterator i = query_conditions_.begin();
791 i != query_conditions_.end();
792 ++i) {
793 if (!i->IsMatch(url_for_component_searches))
794 return false;
796 return true;
800 // URLMatcher
803 URLMatcher::URLMatcher() {}
805 URLMatcher::~URLMatcher() {}
807 void URLMatcher::AddConditionSets(
808 const URLMatcherConditionSet::Vector& condition_sets) {
809 for (URLMatcherConditionSet::Vector::const_iterator i =
810 condition_sets.begin(); i != condition_sets.end(); ++i) {
811 DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
812 url_matcher_condition_sets_.end());
813 url_matcher_condition_sets_[(*i)->id()] = *i;
815 UpdateInternalDatastructures();
818 void URLMatcher::RemoveConditionSets(
819 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
820 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
821 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
822 DCHECK(url_matcher_condition_sets_.find(*i) !=
823 url_matcher_condition_sets_.end());
824 url_matcher_condition_sets_.erase(*i);
826 UpdateInternalDatastructures();
829 void URLMatcher::ClearUnusedConditionSets() {
830 UpdateConditionFactory();
833 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
834 const GURL& url) const {
835 // Find all IDs of StringPatterns that match |url|.
836 // See URLMatcherConditionFactory for the canonicalization of URLs and the
837 // distinction between full url searches and url component searches.
838 std::set<StringPattern::ID> matches;
839 std::string url_for_component_searches;
841 if (!full_url_matcher_.IsEmpty()) {
842 full_url_matcher_.Match(
843 condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
845 if (!url_component_matcher_.IsEmpty()) {
846 url_for_component_searches =
847 condition_factory_.CanonicalizeURLForComponentSearches(url);
848 url_component_matcher_.Match(url_for_component_searches, &matches);
850 if (!regex_set_matcher_.IsEmpty()) {
851 regex_set_matcher_.Match(
852 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
854 if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
855 origin_and_path_regex_set_matcher_.Match(
856 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
857 &matches);
860 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
861 // were fulfilled.
862 std::set<URLMatcherConditionSet::ID> result;
863 for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
864 i != matches.end(); ++i) {
865 // For each URLMatcherConditionSet there is exactly one condition
866 // registered in substring_match_triggers_. This means that the following
867 // logic tests each URLMatcherConditionSet exactly once if it can be
868 // completely fulfilled.
869 StringPatternTriggers::const_iterator triggered_condition_sets_iter =
870 substring_match_triggers_.find(*i);
871 if (triggered_condition_sets_iter == substring_match_triggers_.end())
872 continue; // Not all substring matches are triggers for a condition set.
873 const std::set<URLMatcherConditionSet::ID>& condition_sets =
874 triggered_condition_sets_iter->second;
875 for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
876 condition_sets.begin(); j != condition_sets.end(); ++j) {
877 URLMatcherConditionSets::const_iterator condition_set_iter =
878 url_matcher_condition_sets_.find(*j);
879 DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
880 if (condition_set_iter->second->IsMatch(
881 matches, url, url_for_component_searches))
882 result.insert(*j);
886 return result;
889 bool URLMatcher::IsEmpty() const {
890 return condition_factory_.IsEmpty() &&
891 url_matcher_condition_sets_.empty() &&
892 substring_match_triggers_.empty() &&
893 full_url_matcher_.IsEmpty() &&
894 url_component_matcher_.IsEmpty() &&
895 regex_set_matcher_.IsEmpty() &&
896 origin_and_path_regex_set_matcher_.IsEmpty() &&
897 registered_full_url_patterns_.empty() &&
898 registered_url_component_patterns_.empty();
901 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
902 // The purpose of |full_url_conditions| is just that we need to execute
903 // the same logic once for Full URL searches and once for URL Component
904 // searches (see URLMatcherConditionFactory).
906 // Determine which patterns need to be registered when this function
907 // terminates.
908 std::set<const StringPattern*> new_patterns;
909 for (URLMatcherConditionSets::const_iterator condition_set_iter =
910 url_matcher_condition_sets_.begin();
911 condition_set_iter != url_matcher_condition_sets_.end();
912 ++condition_set_iter) {
913 const URLMatcherConditionSet::Conditions& conditions =
914 condition_set_iter->second->conditions();
915 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
916 conditions.begin(); condition_iter != conditions.end();
917 ++condition_iter) {
918 // If we are called to process Full URL searches, ignore others, and
919 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
920 if (!condition_iter->IsRegexCondition() &&
921 !condition_iter->IsOriginAndPathRegexCondition() &&
922 full_url_conditions == condition_iter->IsFullURLCondition())
923 new_patterns.insert(condition_iter->string_pattern());
926 if (full_url_conditions)
927 continue;
929 const URLMatcherConditionSet::QueryConditions& query_conditions =
930 condition_set_iter->second->query_conditions();
931 for (URLMatcherConditionSet::QueryConditions::const_iterator
932 query_condition_iter = query_conditions.begin();
933 query_condition_iter != query_conditions.end();
934 ++query_condition_iter) {
935 new_patterns.insert(query_condition_iter->string_pattern());
939 // This is the set of patterns that were registered before this function
940 // is called.
941 std::set<const StringPattern*>& registered_patterns =
942 full_url_conditions ? registered_full_url_patterns_
943 : registered_url_component_patterns_;
945 // Add all patterns that are in new_patterns but not in registered_patterns.
946 std::vector<const StringPattern*> patterns_to_register =
947 base::STLSetDifference<std::vector<const StringPattern*> >(
948 new_patterns, registered_patterns);
950 // Remove all patterns that are in registered_patterns but not in
951 // new_patterns.
952 std::vector<const StringPattern*> patterns_to_unregister =
953 base::STLSetDifference<std::vector<const StringPattern*> >(
954 registered_patterns, new_patterns);
956 // Update the SubstringSetMatcher.
957 SubstringSetMatcher& url_matcher =
958 full_url_conditions ? full_url_matcher_ : url_component_matcher_;
959 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
960 patterns_to_unregister);
962 // Update the set of registered_patterns for the next time this function
963 // is being called.
964 registered_patterns.swap(new_patterns);
967 void URLMatcher::UpdateRegexSetMatcher() {
968 std::vector<const StringPattern*> new_patterns;
969 std::vector<const StringPattern*> new_origin_and_path_patterns;
971 for (URLMatcherConditionSets::const_iterator condition_set_iter =
972 url_matcher_condition_sets_.begin();
973 condition_set_iter != url_matcher_condition_sets_.end();
974 ++condition_set_iter) {
975 const URLMatcherConditionSet::Conditions& conditions =
976 condition_set_iter->second->conditions();
977 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
978 conditions.begin(); condition_iter != conditions.end();
979 ++condition_iter) {
980 if (condition_iter->IsRegexCondition()) {
981 new_patterns.push_back(condition_iter->string_pattern());
982 } else if (condition_iter->IsOriginAndPathRegexCondition()) {
983 new_origin_and_path_patterns.push_back(
984 condition_iter->string_pattern());
989 // Start over from scratch. We can't really do better than this, since the
990 // FilteredRE2 backend doesn't support incremental updates.
991 regex_set_matcher_.ClearPatterns();
992 regex_set_matcher_.AddPatterns(new_patterns);
993 origin_and_path_regex_set_matcher_.ClearPatterns();
994 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
997 void URLMatcher::UpdateTriggers() {
998 // Count substring pattern frequencies.
999 std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
1000 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1001 url_matcher_condition_sets_.begin();
1002 condition_set_iter != url_matcher_condition_sets_.end();
1003 ++condition_set_iter) {
1004 const URLMatcherConditionSet::Conditions& conditions =
1005 condition_set_iter->second->conditions();
1006 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1007 conditions.begin(); condition_iter != conditions.end();
1008 ++condition_iter) {
1009 const StringPattern* pattern = condition_iter->string_pattern();
1010 substring_pattern_frequencies[pattern->id()]++;
1013 const URLMatcherConditionSet::QueryConditions& query_conditions =
1014 condition_set_iter->second->query_conditions();
1015 for (URLMatcherConditionSet::QueryConditions::const_iterator
1016 query_condition_iter = query_conditions.begin();
1017 query_condition_iter != query_conditions.end();
1018 ++query_condition_iter) {
1019 const StringPattern* pattern = query_condition_iter->string_pattern();
1020 substring_pattern_frequencies[pattern->id()]++;
1024 // Update trigger conditions: Determine for each URLMatcherConditionSet which
1025 // URLMatcherCondition contains a StringPattern that occurs least
1026 // frequently in this URLMatcher. We assume that this condition is very
1027 // specific and occurs rarely in URLs. If a match occurs for this
1028 // URLMatcherCondition, we want to test all other URLMatcherCondition in the
1029 // respective URLMatcherConditionSet as well to see whether the entire
1030 // URLMatcherConditionSet is considered matching.
1031 substring_match_triggers_.clear();
1032 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1033 url_matcher_condition_sets_.begin();
1034 condition_set_iter != url_matcher_condition_sets_.end();
1035 ++condition_set_iter) {
1036 const URLMatcherConditionSet::Conditions& conditions =
1037 condition_set_iter->second->conditions();
1038 if (conditions.empty())
1039 continue;
1040 URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1041 conditions.begin();
1042 StringPattern::ID trigger = condition_iter->string_pattern()->id();
1043 // We skip the first element in the following loop.
1044 ++condition_iter;
1045 for (; condition_iter != conditions.end(); ++condition_iter) {
1046 StringPattern::ID current_id =
1047 condition_iter->string_pattern()->id();
1048 if (substring_pattern_frequencies[trigger] >
1049 substring_pattern_frequencies[current_id]) {
1050 trigger = current_id;
1054 const URLMatcherConditionSet::QueryConditions& query_conditions =
1055 condition_set_iter->second->query_conditions();
1056 for (URLMatcherConditionSet::QueryConditions::const_iterator
1057 query_condition_iter = query_conditions.begin();
1058 query_condition_iter != query_conditions.end();
1059 ++query_condition_iter) {
1060 StringPattern::ID current_id =
1061 query_condition_iter->string_pattern()->id();
1062 if (substring_pattern_frequencies[trigger] >
1063 substring_pattern_frequencies[current_id]) {
1064 trigger = current_id;
1068 substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
1072 void URLMatcher::UpdateConditionFactory() {
1073 std::set<StringPattern::ID> used_patterns;
1074 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1075 url_matcher_condition_sets_.begin();
1076 condition_set_iter != url_matcher_condition_sets_.end();
1077 ++condition_set_iter) {
1078 const URLMatcherConditionSet::Conditions& conditions =
1079 condition_set_iter->second->conditions();
1080 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1081 conditions.begin(); condition_iter != conditions.end();
1082 ++condition_iter) {
1083 used_patterns.insert(condition_iter->string_pattern()->id());
1085 const URLMatcherConditionSet::QueryConditions& query_conditions =
1086 condition_set_iter->second->query_conditions();
1087 for (URLMatcherConditionSet::QueryConditions::const_iterator
1088 query_condition_iter = query_conditions.begin();
1089 query_condition_iter != query_conditions.end();
1090 ++query_condition_iter) {
1091 used_patterns.insert(query_condition_iter->string_pattern()->id());
1094 condition_factory_.ForgetUnusedPatterns(used_patterns);
1097 void URLMatcher::UpdateInternalDatastructures() {
1098 UpdateSubstringSetMatcher(false);
1099 UpdateSubstringSetMatcher(true);
1100 UpdateRegexSetMatcher();
1101 UpdateTriggers();
1102 UpdateConditionFactory();
1105 } // namespace url_matcher