Updating trunk VERSION from 2139.0 to 2140.0
[chromium-blink-merge.git] / components / url_matcher / url_matcher.cc
blob3798f7ebcfde1d6ebd90df41a7d2f496e4390e38
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/url_matcher/url_matcher.h"
7 #include <algorithm>
8 #include <iterator>
10 #include "base/logging.h"
11 #include "url/gurl.h"
12 #include "url/url_canon.h"
14 namespace url_matcher {
16 // This set of classes implement a mapping of URL Component Patterns, such as
17 // host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
18 // for use in substring comparisons.
20 // The idea of this mapping is to reduce the problem of comparing many
21 // URL Component Patterns against one URL to the problem of searching many
22 // substrings in one string:
24 // ---------------------- -----------------
25 // | URL Query operator | ----translate----> | StringPattern |
26 // ---------------------- -----------------
27 // ^
28 // |
29 // compare
30 // |
31 // v
32 // ---------------------- -----------------
33 // | URL to compare | | |
34 // | to all URL Query | ----translate----> | String |
35 // | operators | | |
36 // ---------------------- -----------------
38 // The reason for this problem reduction is that there are efficient algorithms
39 // for searching many substrings in one string (see Aho-Corasick algorithm).
41 // Additionally, some of the same pieces are reused to implement regular
42 // expression comparisons. The FilteredRE2 implementation for matching many
43 // regular expressions against one string uses prefiltering, in which a set
44 // of substrings (derived from the regexes) are first searched for, to reduce
45 // the number of regular expressions to test; the prefiltering step also
46 // uses Aho-Corasick.
48 // Case 1: {host,path,query}_{prefix,suffix,equals} searches.
49 // ==========================================================
51 // For searches in this class, we normalize URLs as follows:
53 // Step 1:
54 // Remove scheme, port and segment from URL:
55 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
56 // www.example.com/index.html?search=foo
58 // We remove the scheme and port number because they can be checked later
59 // in a secondary filter step. We remove the segment (the #... part) because
60 // this is not guaranteed to be ASCII-7 encoded.
62 // Step 2:
63 // Translate URL to String and add the following position markers:
64 // - BU = Beginning of URL
65 // - ED = End of Domain
66 // - EP = End of Path
67 // - EU = End of URL
68 // Furthermore, the hostname is canonicalized to start with a ".".
70 // Position markers are represented as characters >127, which are therefore
71 // guaranteed not to be part of the ASCII-7 encoded URL character set.
73 // -> www.example.com/index.html?search=foo becomes
74 // BU .www.example.com ED /index.html EP ?search=foo EU
76 // -> www.example.com/index.html becomes
77 // BU .www.example.com ED /index.html EP EU
79 // Step 3:
80 // Translate URL Component Patterns as follows:
82 // host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
83 // -> host_prefix("www.example") = BU .www.example
85 // host_suffix(suffix) = suffix ED
86 // -> host_suffix("example.com") = example.com ED
87 // -> host_suffix(".example.com") = .example.com ED
89 // host_equals(domain) = BU add_missing_dot_prefix(domain) ED
90 // -> host_equals("www.example.com") = BU .www.example.com ED
92 // Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
94 // With this, we can search the StringPatterns in the normalized URL.
97 // Case 2: url_{prefix,suffix,equals,contains} searches.
98 // =====================================================
100 // Step 1: as above, except that
101 // - the scheme is not removed
102 // - the port is not removed if it is specified and does not match the default
103 // port for the given scheme.
105 // Step 2:
106 // Translate URL to String and add the following position markers:
107 // - BU = Beginning of URL
108 // - EU = End of URL
110 // -> http://www.example.com:8080/index.html?search=foo#first_match becomes
111 // BU http://www.example.com:8080/index.html?search=foo EU
112 // -> http://www.example.com:80/index.html?search=foo#first_match becomes
113 // BU http://www.example.com/index.html?search=foo EU
115 // url_prefix(prefix) = BU prefix
116 // -> url_prefix("http://www.example") = BU http://www.example
118 // url_contains(substring) = substring
119 // -> url_contains("index") = index
122 // Case 3: {host,path,query}_contains searches.
123 // ============================================
125 // These kinds of searches are not supported directly but can be derived
126 // by a combination of a url_contains() query followed by an explicit test:
128 // host_contains(str) = url_contains(str) followed by test whether str occurs
129 // in host component of original URL.
130 // -> host_contains("example.co") = example.co
131 // followed by gurl.host().find("example.co");
133 // [similarly for path_contains and query_contains].
136 // Regular expression matching (url_matches searches)
137 // ==================================================
139 // This class also supports matching regular expressions (RE2 syntax)
140 // against full URLs, which are transformed as in case 2.
142 namespace {
144 bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
145 return criterion == URLMatcherCondition::URL_MATCHES;
148 bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
149 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
152 } // namespace
155 // URLMatcherCondition
158 URLMatcherCondition::URLMatcherCondition()
159 : criterion_(HOST_PREFIX),
160 string_pattern_(NULL) {}
162 URLMatcherCondition::~URLMatcherCondition() {}
164 URLMatcherCondition::URLMatcherCondition(
165 Criterion criterion,
166 const StringPattern* string_pattern)
167 : criterion_(criterion),
168 string_pattern_(string_pattern) {}
170 URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
171 : criterion_(rhs.criterion_),
172 string_pattern_(rhs.string_pattern_) {}
174 URLMatcherCondition& URLMatcherCondition::operator=(
175 const URLMatcherCondition& rhs) {
176 criterion_ = rhs.criterion_;
177 string_pattern_ = rhs.string_pattern_;
178 return *this;
181 bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
182 if (criterion_ < rhs.criterion_) return true;
183 if (criterion_ > rhs.criterion_) return false;
184 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
185 return *string_pattern_ < *rhs.string_pattern_;
186 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
187 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
188 // or both are NULL.
189 return false;
192 bool URLMatcherCondition::IsFullURLCondition() const {
193 // For these criteria the SubstringMatcher needs to be executed on the
194 // GURL that is canonicalized with
195 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
196 switch (criterion_) {
197 case HOST_CONTAINS:
198 case PATH_CONTAINS:
199 case QUERY_CONTAINS:
200 case URL_PREFIX:
201 case URL_SUFFIX:
202 case URL_CONTAINS:
203 case URL_EQUALS:
204 return true;
205 default:
206 break;
208 return false;
211 bool URLMatcherCondition::IsRegexCondition() const {
212 return IsRegexCriterion(criterion_);
215 bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
216 return IsOriginAndPathRegexCriterion(criterion_);
219 bool URLMatcherCondition::IsMatch(
220 const std::set<StringPattern::ID>& matching_patterns,
221 const GURL& url) const {
222 DCHECK(string_pattern_);
223 if (!ContainsKey(matching_patterns, string_pattern_->id()))
224 return false;
225 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
226 // a substring match on the raw URL. In case of a match, we need to verify
227 // that the match was found in the correct component of the URL.
228 switch (criterion_) {
229 case HOST_CONTAINS:
230 return url.host().find(string_pattern_->pattern()) !=
231 std::string::npos;
232 case PATH_CONTAINS:
233 return url.path().find(string_pattern_->pattern()) !=
234 std::string::npos;
235 case QUERY_CONTAINS:
236 return url.query().find(string_pattern_->pattern()) !=
237 std::string::npos;
238 default:
239 break;
241 return true;
245 // URLMatcherConditionFactory
248 namespace {
249 // These are symbols that are not contained in 7-bit ASCII used in GURLs.
250 const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
251 const char kEndOfDomain[] = {static_cast<char>(-2), 0};
252 const char kEndOfPath[] = {static_cast<char>(-3), 0};
253 const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0};
254 const char kEndOfURL[] = {static_cast<char>(-5), 0};
256 // The delimiter for query parameters
257 const char kQuerySeparator = '&';
258 } // namespace
260 URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
262 URLMatcherConditionFactory::~URLMatcherConditionFactory() {
263 STLDeleteElements(&substring_pattern_singletons_);
264 STLDeleteElements(&regex_pattern_singletons_);
265 STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
268 std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
269 const GURL& url) const {
270 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
271 url.path() + kEndOfPath +
272 (url.has_query() ? CanonicalizeQuery(url.query(), true, true)
273 : std::string()) +
274 kEndOfURL;
277 URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
278 const std::string& prefix) {
279 return CreateCondition(URLMatcherCondition::HOST_PREFIX,
280 kBeginningOfURL + CanonicalizeHostname(prefix));
283 URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
284 const std::string& suffix) {
285 return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
286 suffix + kEndOfDomain);
289 URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
290 const std::string& str) {
291 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
294 URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
295 const std::string& str) {
296 return CreateCondition(URLMatcherCondition::HOST_EQUALS,
297 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
300 URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
301 const std::string& prefix) {
302 return CreateCondition(URLMatcherCondition::PATH_PREFIX,
303 kEndOfDomain + prefix);
306 URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
307 const std::string& suffix) {
308 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
311 URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
312 const std::string& str) {
313 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
316 URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
317 const std::string& str) {
318 return CreateCondition(URLMatcherCondition::PATH_EQUALS,
319 kEndOfDomain + str + kEndOfPath);
322 URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
323 const std::string& prefix) {
324 std::string pattern;
325 if (!prefix.empty() && prefix[0] == '?')
326 pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false);
327 else
328 pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false);
330 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
333 URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
334 const std::string& suffix) {
335 if (!suffix.empty() && suffix[0] == '?') {
336 return CreateQueryEqualsCondition(suffix);
337 } else {
338 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
339 CanonicalizeQuery(suffix, false, true) + kEndOfURL);
343 URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
344 const std::string& str) {
345 if (!str.empty() && str[0] == '?')
346 return CreateQueryPrefixCondition(str);
347 else
348 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
351 URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
352 const std::string& str) {
353 std::string pattern;
354 if (!str.empty() && str[0] == '?')
355 pattern =
356 kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL;
357 else
358 pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL;
360 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
363 URLMatcherCondition
364 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
365 const std::string& host_suffix,
366 const std::string& path_prefix) {
367 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
368 host_suffix + kEndOfDomain + path_prefix);
371 URLMatcherCondition
372 URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
373 const std::string& host,
374 const std::string& path_prefix) {
375 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
376 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
377 path_prefix);
380 std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
381 const GURL& url) const {
382 GURL::Replacements replacements;
383 replacements.ClearPassword();
384 replacements.ClearUsername();
385 replacements.ClearRef();
386 // Clear port if it is implicit from scheme.
387 if (url.has_port()) {
388 const std::string& port = url.scheme();
389 if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
390 url.EffectiveIntPort()) {
391 replacements.ClearPort();
394 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
395 kEndOfURL;
398 static std::string CanonicalizeURLForRegexSearchesHelper(
399 const GURL& url,
400 bool clear_query) {
401 GURL::Replacements replacements;
402 replacements.ClearPassword();
403 replacements.ClearUsername();
404 replacements.ClearRef();
405 if (clear_query)
406 replacements.ClearQuery();
407 // Clear port if it is implicit from scheme.
408 if (url.has_port()) {
409 const std::string& port = url.scheme();
410 if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
411 url.EffectiveIntPort()) {
412 replacements.ClearPort();
415 return url.ReplaceComponents(replacements).spec();
418 std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
419 const GURL& url) const {
420 return CanonicalizeURLForRegexSearchesHelper(url, false);
423 std::string
424 URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
425 const GURL& url) const {
426 return CanonicalizeURLForRegexSearchesHelper(url, true);
429 URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
430 const std::string& prefix) {
431 return CreateCondition(URLMatcherCondition::URL_PREFIX,
432 kBeginningOfURL + prefix);
435 URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
436 const std::string& suffix) {
437 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
440 URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
441 const std::string& str) {
442 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
445 URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
446 const std::string& str) {
447 return CreateCondition(URLMatcherCondition::URL_EQUALS,
448 kBeginningOfURL + str + kEndOfURL);
451 URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
452 const std::string& regex) {
453 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
456 URLMatcherCondition
457 URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
458 const std::string& regex) {
459 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
462 void URLMatcherConditionFactory::ForgetUnusedPatterns(
463 const std::set<StringPattern::ID>& used_patterns) {
464 PatternSingletons::iterator i = substring_pattern_singletons_.begin();
465 while (i != substring_pattern_singletons_.end()) {
466 if (ContainsKey(used_patterns, (*i)->id())) {
467 ++i;
468 } else {
469 delete *i;
470 substring_pattern_singletons_.erase(i++);
473 i = regex_pattern_singletons_.begin();
474 while (i != regex_pattern_singletons_.end()) {
475 if (ContainsKey(used_patterns, (*i)->id())) {
476 ++i;
477 } else {
478 delete *i;
479 regex_pattern_singletons_.erase(i++);
482 i = origin_and_path_regex_pattern_singletons_.begin();
483 while (i != origin_and_path_regex_pattern_singletons_.end()) {
484 if (ContainsKey(used_patterns, (*i)->id())) {
485 ++i;
486 } else {
487 delete *i;
488 origin_and_path_regex_pattern_singletons_.erase(i++);
493 bool URLMatcherConditionFactory::IsEmpty() const {
494 return substring_pattern_singletons_.empty() &&
495 regex_pattern_singletons_.empty() &&
496 origin_and_path_regex_pattern_singletons_.empty();
499 URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
500 URLMatcherCondition::Criterion criterion,
501 const std::string& pattern) {
502 StringPattern search_pattern(pattern, 0);
503 PatternSingletons* pattern_singletons = NULL;
504 if (IsRegexCriterion(criterion))
505 pattern_singletons = &regex_pattern_singletons_;
506 else if (IsOriginAndPathRegexCriterion(criterion))
507 pattern_singletons = &origin_and_path_regex_pattern_singletons_;
508 else
509 pattern_singletons = &substring_pattern_singletons_;
511 PatternSingletons::const_iterator iter =
512 pattern_singletons->find(&search_pattern);
514 if (iter != pattern_singletons->end()) {
515 return URLMatcherCondition(criterion, *iter);
516 } else {
517 StringPattern* new_pattern =
518 new StringPattern(pattern, id_counter_++);
519 pattern_singletons->insert(new_pattern);
520 return URLMatcherCondition(criterion, new_pattern);
524 std::string URLMatcherConditionFactory::CanonicalizeHostname(
525 const std::string& hostname) const {
526 if (!hostname.empty() && hostname[0] == '.')
527 return hostname;
528 else
529 return "." + hostname;
532 // This function prepares the query string by replacing query separator with a
533 // magic value (|kQueryComponentDelimiter|). When the boolean
534 // |prepend_beginning_of_query_component| is true the function prepends the
535 // query with the same magic. This is done to locate the start of a key value
536 // pair in the query string. The parameter |query| is passed by value
537 // intentionally, since it is locally modified.
538 std::string URLMatcherConditionFactory::CanonicalizeQuery(
539 std::string query,
540 bool prepend_beginning_of_query_component,
541 bool append_end_of_query_component) const {
542 for (std::string::iterator it = query.begin(); it != query.end(); ++it) {
543 if (*it == kQuerySeparator)
544 *it = kQueryComponentDelimiter[0];
546 if (prepend_beginning_of_query_component)
547 query = kQueryComponentDelimiter + query;
548 if (append_end_of_query_component)
549 query += kQueryComponentDelimiter;
550 return query;
553 bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
554 StringPattern* lhs,
555 StringPattern* rhs) const {
556 if (lhs == NULL && rhs != NULL) return true;
557 if (lhs != NULL && rhs != NULL)
558 return lhs->pattern() < rhs->pattern();
559 // Either both are NULL or only rhs is NULL.
560 return false;
564 // URLQueryElementMatcherCondition
567 URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
568 const std::string& key,
569 const std::string& value,
570 QueryValueMatchType query_value_match_type,
571 QueryElementType query_element_type,
572 Type match_type,
573 URLMatcherConditionFactory* factory) {
574 match_type_ = match_type;
576 if (query_element_type == ELEMENT_TYPE_KEY_VALUE) {
577 key_ = kQueryComponentDelimiter + key + "=";
578 value_ = value;
579 } else {
580 key_ = kQueryComponentDelimiter + key;
581 value_ = std::string();
584 if (query_value_match_type == QUERY_VALUE_MATCH_EXACT)
585 value_ += kQueryComponentDelimiter;
587 // If |value_| is empty no need to find the |key_| and verify if the value
588 // matches. Simply checking the presence of key is sufficient, which is done
589 // by MATCH_ANY
590 if (value_.empty())
591 match_type_ = MATCH_ANY;
593 URLMatcherCondition condition;
594 // If |match_type_| is MATCH_ANY, then we could simply look for the
595 // combination of |key_| + |value_|, which can be efficiently done by
596 // SubstringMatcher
597 if (match_type_ == MATCH_ANY)
598 condition = factory->CreateQueryContainsCondition(key_ + value_);
599 else
600 condition = factory->CreateQueryContainsCondition(key_);
601 string_pattern_ = condition.string_pattern();
603 key_length_ = key_.length();
604 value_length_ = value_.length();
607 URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}
609 bool URLQueryElementMatcherCondition::operator<(
610 const URLQueryElementMatcherCondition& rhs) const {
611 if (match_type_ != rhs.match_type_)
612 return match_type_ < rhs.match_type_;
613 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
614 return *string_pattern_ < *rhs.string_pattern_;
615 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL)
616 return true;
617 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
618 // or both are NULL.
619 return false;
622 bool URLQueryElementMatcherCondition::IsMatch(
623 const std::string& url_for_component_searches) const {
624 switch (match_type_) {
625 case MATCH_ANY: {
626 // For MATCH_ANY, no additional verification step is needed. We can trust
627 // the SubstringMatcher to do the verification.
628 return true;
630 case MATCH_ALL: {
631 size_t start = 0;
632 int found = 0;
633 size_t offset;
634 while ((offset = url_for_component_searches.find(key_, start)) !=
635 std::string::npos) {
636 if (url_for_component_searches.compare(
637 offset + key_length_, value_length_, value_) != 0) {
638 return false;
639 } else {
640 ++found;
642 start = offset + key_length_ + value_length_ - 1;
644 return !!found;
646 case MATCH_FIRST: {
647 size_t offset = url_for_component_searches.find(key_);
648 return url_for_component_searches.compare(
649 offset + key_length_, value_length_, value_) == 0;
651 case MATCH_LAST: {
652 size_t offset = url_for_component_searches.rfind(key_);
653 return url_for_component_searches.compare(
654 offset + key_length_, value_length_, value_) == 0;
657 NOTREACHED();
658 return false;
662 // URLMatcherSchemeFilter
665 URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
666 : filters_(1) {
667 filters_.push_back(filter);
670 URLMatcherSchemeFilter::URLMatcherSchemeFilter(
671 const std::vector<std::string>& filters)
672 : filters_(filters) {}
674 URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
676 bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
677 return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
678 filters_.end();
682 // URLMatcherPortFilter
685 URLMatcherPortFilter::URLMatcherPortFilter(
686 const std::vector<URLMatcherPortFilter::Range>& ranges)
687 : ranges_(ranges) {}
689 URLMatcherPortFilter::~URLMatcherPortFilter() {}
691 bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
692 int port = url.EffectiveIntPort();
693 for (std::vector<Range>::const_iterator i = ranges_.begin();
694 i != ranges_.end(); ++i) {
695 if (i->first <= port && port <= i->second)
696 return true;
698 return false;
701 // static
702 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
703 int to) {
704 return Range(from, to);
707 // static
708 URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
709 return Range(port, port);
713 // URLMatcherConditionSet
716 URLMatcherConditionSet::~URLMatcherConditionSet() {}
718 URLMatcherConditionSet::URLMatcherConditionSet(
719 ID id,
720 const Conditions& conditions)
721 : id_(id),
722 conditions_(conditions) {}
724 URLMatcherConditionSet::URLMatcherConditionSet(
725 ID id,
726 const Conditions& conditions,
727 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
728 scoped_ptr<URLMatcherPortFilter> port_filter)
729 : id_(id),
730 conditions_(conditions),
731 scheme_filter_(scheme_filter.Pass()),
732 port_filter_(port_filter.Pass()) {}
734 URLMatcherConditionSet::URLMatcherConditionSet(
735 ID id,
736 const Conditions& conditions,
737 const QueryConditions& query_conditions,
738 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
739 scoped_ptr<URLMatcherPortFilter> port_filter)
740 : id_(id),
741 conditions_(conditions),
742 query_conditions_(query_conditions),
743 scheme_filter_(scheme_filter.Pass()),
744 port_filter_(port_filter.Pass()) {}
746 bool URLMatcherConditionSet::IsMatch(
747 const std::set<StringPattern::ID>& matching_patterns,
748 const GURL& url) const {
749 return IsMatch(matching_patterns, url, std::string());
752 bool URLMatcherConditionSet::IsMatch(
753 const std::set<StringPattern::ID>& matching_patterns,
754 const GURL& url,
755 const std::string& url_for_component_searches) const {
756 for (Conditions::const_iterator i = conditions_.begin();
757 i != conditions_.end(); ++i) {
758 if (!i->IsMatch(matching_patterns, url))
759 return false;
761 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
762 return false;
763 if (port_filter_.get() && !port_filter_->IsMatch(url))
764 return false;
765 if (query_conditions_.empty())
766 return true;
767 // The loop is duplicated below for performance reasons. If not all query
768 // elements are found, no need to verify match that is expected to take more
769 // cycles.
770 for (QueryConditions::const_iterator i = query_conditions_.begin();
771 i != query_conditions_.end();
772 ++i) {
773 if (!ContainsKey(matching_patterns, i->string_pattern()->id()))
774 return false;
776 for (QueryConditions::const_iterator i = query_conditions_.begin();
777 i != query_conditions_.end();
778 ++i) {
779 if (!i->IsMatch(url_for_component_searches))
780 return false;
782 return true;
786 // URLMatcher
789 URLMatcher::URLMatcher() {}
791 URLMatcher::~URLMatcher() {}
793 void URLMatcher::AddConditionSets(
794 const URLMatcherConditionSet::Vector& condition_sets) {
795 for (URLMatcherConditionSet::Vector::const_iterator i =
796 condition_sets.begin(); i != condition_sets.end(); ++i) {
797 DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
798 url_matcher_condition_sets_.end());
799 url_matcher_condition_sets_[(*i)->id()] = *i;
801 UpdateInternalDatastructures();
804 void URLMatcher::RemoveConditionSets(
805 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
806 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
807 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
808 DCHECK(url_matcher_condition_sets_.find(*i) !=
809 url_matcher_condition_sets_.end());
810 url_matcher_condition_sets_.erase(*i);
812 UpdateInternalDatastructures();
815 void URLMatcher::ClearUnusedConditionSets() {
816 UpdateConditionFactory();
819 std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
820 const GURL& url) const {
821 // Find all IDs of StringPatterns that match |url|.
822 // See URLMatcherConditionFactory for the canonicalization of URLs and the
823 // distinction between full url searches and url component searches.
824 std::set<StringPattern::ID> matches;
825 std::string url_for_component_searches;
827 if (!full_url_matcher_.IsEmpty()) {
828 full_url_matcher_.Match(
829 condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
831 if (!url_component_matcher_.IsEmpty()) {
832 url_for_component_searches =
833 condition_factory_.CanonicalizeURLForComponentSearches(url);
834 url_component_matcher_.Match(url_for_component_searches, &matches);
836 if (!regex_set_matcher_.IsEmpty()) {
837 regex_set_matcher_.Match(
838 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
840 if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
841 origin_and_path_regex_set_matcher_.Match(
842 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
843 &matches);
846 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
847 // were fulfilled.
848 std::set<URLMatcherConditionSet::ID> result;
849 for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
850 i != matches.end(); ++i) {
851 // For each URLMatcherConditionSet there is exactly one condition
852 // registered in substring_match_triggers_. This means that the following
853 // logic tests each URLMatcherConditionSet exactly once if it can be
854 // completely fulfilled.
855 StringPatternTriggers::const_iterator triggered_condition_sets_iter =
856 substring_match_triggers_.find(*i);
857 if (triggered_condition_sets_iter == substring_match_triggers_.end())
858 continue; // Not all substring matches are triggers for a condition set.
859 const std::set<URLMatcherConditionSet::ID>& condition_sets =
860 triggered_condition_sets_iter->second;
861 for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
862 condition_sets.begin(); j != condition_sets.end(); ++j) {
863 URLMatcherConditionSets::const_iterator condition_set_iter =
864 url_matcher_condition_sets_.find(*j);
865 DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
866 if (condition_set_iter->second->IsMatch(
867 matches, url, url_for_component_searches))
868 result.insert(*j);
872 return result;
875 bool URLMatcher::IsEmpty() const {
876 return condition_factory_.IsEmpty() &&
877 url_matcher_condition_sets_.empty() &&
878 substring_match_triggers_.empty() &&
879 full_url_matcher_.IsEmpty() &&
880 url_component_matcher_.IsEmpty() &&
881 regex_set_matcher_.IsEmpty() &&
882 origin_and_path_regex_set_matcher_.IsEmpty() &&
883 registered_full_url_patterns_.empty() &&
884 registered_url_component_patterns_.empty();
887 void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
888 // The purpose of |full_url_conditions| is just that we need to execute
889 // the same logic once for Full URL searches and once for URL Component
890 // searches (see URLMatcherConditionFactory).
892 // Determine which patterns need to be registered when this function
893 // terminates.
894 std::set<const StringPattern*> new_patterns;
895 for (URLMatcherConditionSets::const_iterator condition_set_iter =
896 url_matcher_condition_sets_.begin();
897 condition_set_iter != url_matcher_condition_sets_.end();
898 ++condition_set_iter) {
899 const URLMatcherConditionSet::Conditions& conditions =
900 condition_set_iter->second->conditions();
901 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
902 conditions.begin(); condition_iter != conditions.end();
903 ++condition_iter) {
904 // If we are called to process Full URL searches, ignore others, and
905 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
906 if (!condition_iter->IsRegexCondition() &&
907 !condition_iter->IsOriginAndPathRegexCondition() &&
908 full_url_conditions == condition_iter->IsFullURLCondition())
909 new_patterns.insert(condition_iter->string_pattern());
912 if (full_url_conditions)
913 continue;
915 const URLMatcherConditionSet::QueryConditions& query_conditions =
916 condition_set_iter->second->query_conditions();
917 for (URLMatcherConditionSet::QueryConditions::const_iterator
918 query_condition_iter = query_conditions.begin();
919 query_condition_iter != query_conditions.end();
920 ++query_condition_iter) {
921 new_patterns.insert(query_condition_iter->string_pattern());
925 // This is the set of patterns that were registered before this function
926 // is called.
927 std::set<const StringPattern*>& registered_patterns =
928 full_url_conditions ? registered_full_url_patterns_
929 : registered_url_component_patterns_;
931 // Add all patterns that are in new_patterns but not in registered_patterns.
932 std::vector<const StringPattern*> patterns_to_register =
933 base::STLSetDifference<std::vector<const StringPattern*> >(
934 new_patterns, registered_patterns);
936 // Remove all patterns that are in registered_patterns but not in
937 // new_patterns.
938 std::vector<const StringPattern*> patterns_to_unregister =
939 base::STLSetDifference<std::vector<const StringPattern*> >(
940 registered_patterns, new_patterns);
942 // Update the SubstringSetMatcher.
943 SubstringSetMatcher& url_matcher =
944 full_url_conditions ? full_url_matcher_ : url_component_matcher_;
945 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
946 patterns_to_unregister);
948 // Update the set of registered_patterns for the next time this function
949 // is being called.
950 registered_patterns.swap(new_patterns);
953 void URLMatcher::UpdateRegexSetMatcher() {
954 std::vector<const StringPattern*> new_patterns;
955 std::vector<const StringPattern*> new_origin_and_path_patterns;
957 for (URLMatcherConditionSets::const_iterator condition_set_iter =
958 url_matcher_condition_sets_.begin();
959 condition_set_iter != url_matcher_condition_sets_.end();
960 ++condition_set_iter) {
961 const URLMatcherConditionSet::Conditions& conditions =
962 condition_set_iter->second->conditions();
963 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
964 conditions.begin(); condition_iter != conditions.end();
965 ++condition_iter) {
966 if (condition_iter->IsRegexCondition()) {
967 new_patterns.push_back(condition_iter->string_pattern());
968 } else if (condition_iter->IsOriginAndPathRegexCondition()) {
969 new_origin_and_path_patterns.push_back(
970 condition_iter->string_pattern());
975 // Start over from scratch. We can't really do better than this, since the
976 // FilteredRE2 backend doesn't support incremental updates.
977 regex_set_matcher_.ClearPatterns();
978 regex_set_matcher_.AddPatterns(new_patterns);
979 origin_and_path_regex_set_matcher_.ClearPatterns();
980 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
983 void URLMatcher::UpdateTriggers() {
984 // Count substring pattern frequencies.
985 std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
986 for (URLMatcherConditionSets::const_iterator condition_set_iter =
987 url_matcher_condition_sets_.begin();
988 condition_set_iter != url_matcher_condition_sets_.end();
989 ++condition_set_iter) {
990 const URLMatcherConditionSet::Conditions& conditions =
991 condition_set_iter->second->conditions();
992 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
993 conditions.begin(); condition_iter != conditions.end();
994 ++condition_iter) {
995 const StringPattern* pattern = condition_iter->string_pattern();
996 substring_pattern_frequencies[pattern->id()]++;
999 const URLMatcherConditionSet::QueryConditions& query_conditions =
1000 condition_set_iter->second->query_conditions();
1001 for (URLMatcherConditionSet::QueryConditions::const_iterator
1002 query_condition_iter = query_conditions.begin();
1003 query_condition_iter != query_conditions.end();
1004 ++query_condition_iter) {
1005 const StringPattern* pattern = query_condition_iter->string_pattern();
1006 substring_pattern_frequencies[pattern->id()]++;
1010 // Update trigger conditions: Determine for each URLMatcherConditionSet which
1011 // URLMatcherCondition contains a StringPattern that occurs least
1012 // frequently in this URLMatcher. We assume that this condition is very
1013 // specific and occurs rarely in URLs. If a match occurs for this
1014 // URLMatcherCondition, we want to test all other URLMatcherCondition in the
1015 // respective URLMatcherConditionSet as well to see whether the entire
1016 // URLMatcherConditionSet is considered matching.
1017 substring_match_triggers_.clear();
1018 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1019 url_matcher_condition_sets_.begin();
1020 condition_set_iter != url_matcher_condition_sets_.end();
1021 ++condition_set_iter) {
1022 const URLMatcherConditionSet::Conditions& conditions =
1023 condition_set_iter->second->conditions();
1024 if (conditions.empty())
1025 continue;
1026 URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1027 conditions.begin();
1028 StringPattern::ID trigger = condition_iter->string_pattern()->id();
1029 // We skip the first element in the following loop.
1030 ++condition_iter;
1031 for (; condition_iter != conditions.end(); ++condition_iter) {
1032 StringPattern::ID current_id =
1033 condition_iter->string_pattern()->id();
1034 if (substring_pattern_frequencies[trigger] >
1035 substring_pattern_frequencies[current_id]) {
1036 trigger = current_id;
1040 const URLMatcherConditionSet::QueryConditions& query_conditions =
1041 condition_set_iter->second->query_conditions();
1042 for (URLMatcherConditionSet::QueryConditions::const_iterator
1043 query_condition_iter = query_conditions.begin();
1044 query_condition_iter != query_conditions.end();
1045 ++query_condition_iter) {
1046 StringPattern::ID current_id =
1047 query_condition_iter->string_pattern()->id();
1048 if (substring_pattern_frequencies[trigger] >
1049 substring_pattern_frequencies[current_id]) {
1050 trigger = current_id;
1054 substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
1058 void URLMatcher::UpdateConditionFactory() {
1059 std::set<StringPattern::ID> used_patterns;
1060 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1061 url_matcher_condition_sets_.begin();
1062 condition_set_iter != url_matcher_condition_sets_.end();
1063 ++condition_set_iter) {
1064 const URLMatcherConditionSet::Conditions& conditions =
1065 condition_set_iter->second->conditions();
1066 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1067 conditions.begin(); condition_iter != conditions.end();
1068 ++condition_iter) {
1069 used_patterns.insert(condition_iter->string_pattern()->id());
1071 const URLMatcherConditionSet::QueryConditions& query_conditions =
1072 condition_set_iter->second->query_conditions();
1073 for (URLMatcherConditionSet::QueryConditions::const_iterator
1074 query_condition_iter = query_conditions.begin();
1075 query_condition_iter != query_conditions.end();
1076 ++query_condition_iter) {
1077 used_patterns.insert(query_condition_iter->string_pattern()->id());
1080 condition_factory_.ForgetUnusedPatterns(used_patterns);
1083 void URLMatcher::UpdateInternalDatastructures() {
1084 UpdateSubstringSetMatcher(false);
1085 UpdateSubstringSetMatcher(true);
1086 UpdateRegexSetMatcher();
1087 UpdateTriggers();
1088 UpdateConditionFactory();
1091 } // namespace url_matcher