Don't preload rarely seen large images
[chromium-blink-merge.git] / components / url_matcher / url_matcher.h
blob a0b13d9889929a7b2a5b62e8f5b4004305ba1f26
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef COMPONENTS_URL_MATCHER_URL_MATCHER_H_
6 #define COMPONENTS_URL_MATCHER_URL_MATCHER_H_
8 #include <set>
9 #include <vector>
11 #include "base/memory/ref_counted.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/memory/scoped_vector.h"
14 #include "components/url_matcher/regex_set_matcher.h"
15 #include "components/url_matcher/substring_set_matcher.h"
16 #include "components/url_matcher/url_matcher_export.h"
18 class GURL;
20 namespace base {
21 class DictionaryValue;
24 namespace url_matcher {
26 // This class represents a single URL matching condition, e.g. a match on the
27 // host suffix or the containment of a string in the query component of a GURL.
29 // The difference from a simple StringPattern is that this also supports
30 // checking whether the {Host, Path, Query} of a URL contains a string. The
31 // reduction of URL matching conditions to StringPatterns conducted by
32 // URLMatcherConditionFactory is not capable of expressing that alone.
34 // Also supported is matching regular expressions against the URL (URL_MATCHES)
// or against the URL minus its query string (ORIGIN_AND_PATH_MATCHES).
//
// Instances are copyable and ordered through operator<, which is what allows
// them to be stored in a std::set (see URLMatcherConditionSet::Conditions).
35 class URL_MATCHER_EXPORT URLMatcherCondition {
36 public:
// The property of a URL that a condition tests. Together with the
// StringPattern it forms the complete condition.
37 enum Criterion {
38 HOST_PREFIX,
39 HOST_SUFFIX,
40 HOST_CONTAINS,
41 HOST_EQUALS,
42 PATH_PREFIX,
43 PATH_SUFFIX,
44 PATH_CONTAINS,
45 PATH_EQUALS,
46 QUERY_PREFIX,
47 QUERY_SUFFIX,
48 QUERY_CONTAINS,
49 QUERY_EQUALS,
50 HOST_SUFFIX_PATH_PREFIX,
51 HOST_EQUALS_PATH_PREFIX,
52 URL_PREFIX,
53 URL_SUFFIX,
54 URL_CONTAINS,
55 URL_EQUALS,
56 URL_MATCHES,
57 ORIGIN_AND_PATH_MATCHES, // Matches the URL minus its query string.
60 URLMatcherCondition();
61 ~URLMatcherCondition();
// |substring_pattern| is stored as a raw pointer and is not owned by this
// object; URLMatcherConditionFactory owns the StringPatterns referenced by the
// conditions it creates and must outlive them.
62 URLMatcherCondition(Criterion criterion,
63 const StringPattern* substring_pattern);
64 URLMatcherCondition(const URLMatcherCondition& rhs);
65 URLMatcherCondition& operator=(const URLMatcherCondition& rhs);
// Ordering required for use as a std::set element.
66 bool operator<(const URLMatcherCondition& rhs) const;
// Accessors for the two members that define this condition.
68 Criterion criterion() const { return criterion_; }
69 const StringPattern* string_pattern() const {
70 return string_pattern_;
73 // Returns whether this URLMatcherCondition needs to be executed on a
74 // full URL rather than the individual components (see
75 // URLMatcherConditionFactory).
76 bool IsFullURLCondition() const;
78 // Returns whether this URLMatcherCondition is a regular expression to be
79 // handled by a regex matcher instead of a substring matcher.
80 bool IsRegexCondition() const;
82 // Returns whether this URLMatcherCondition is a regular expression that shall
83 // be evaluated on the URL without the query parameter.
84 bool IsOriginAndPathRegexCondition() const;
86 // Returns whether this condition is fulfilled according to
87 // |matching_patterns| and |url|.
88 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
89 const GURL& url) const;
91 private:
92 // |criterion_| and |string_pattern_| describe together what property a URL
93 // needs to fulfill to be considered a match.
94 Criterion criterion_;
96 // This is the StringPattern that is used in a SubstringSetMatcher.
// Not owned (see the constructor comment).
97 const StringPattern* string_pattern_;
100 // Class to map the problem of finding {host, path, query} {prefixes, suffixes,
101 // containments, and equality} in GURLs to the substring matching problem.
103 // Say, you want to check whether the path of a URL starts with "/index.html".
104 // This class preprocesses a URL like "www.google.com/index.html" into something
105 // like "www.google.com|/index.html". After preprocessing, you can search for
106 // "|/index.html" in the string and see that this candidate URL actually has
107 // a path that starts with "/index.html". In contrast,
108 // "www.google.com/images/index.html" would be normalized to
109 // "www.google.com|/images/index.html". It is easy to see that it contains
110 // "/index.html" but the path of the URL does not start with "/index.html".
112 // This preprocessing is important if you want to match a URL against many
113 // patterns because it reduces the matching to a "discover all substrings
114 // of a dictionary in a text" problem, which can be solved very efficiently
115 // by the Aho-Corasick algorithm.
117 // IMPORTANT: The URLMatcherConditionFactory owns the StringPatterns
118 // referenced by created URLMatcherConditions. Therefore, it must outlive
119 // all created URLMatcherConditions and the SubstringSetMatcher.
120 class URL_MATCHER_EXPORT URLMatcherConditionFactory {
121 public:
122 URLMatcherConditionFactory();
123 ~URLMatcherConditionFactory();
125 // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches.
126 std::string CanonicalizeURLForComponentSearches(const GURL& url) const;
128 // Factory methods for various condition types.
130 // Note that these methods fill the pattern singleton sets
// (|substring_pattern_singletons_|, |regex_pattern_singletons_| and
// |origin_and_path_regex_pattern_singletons_|). If you create
131 // conditions and don't register them to a URLMatcher, they will continue to
132 // consume memory. You need to call ForgetUnusedPatterns() or
133 // URLMatcher::ClearUnusedConditionSets() in this case.
134 URLMatcherCondition CreateHostPrefixCondition(const std::string& prefix);
135 URLMatcherCondition CreateHostSuffixCondition(const std::string& suffix);
136 URLMatcherCondition CreateHostContainsCondition(const std::string& str);
137 URLMatcherCondition CreateHostEqualsCondition(const std::string& str);
139 URLMatcherCondition CreatePathPrefixCondition(const std::string& prefix);
140 URLMatcherCondition CreatePathSuffixCondition(const std::string& suffix);
141 URLMatcherCondition CreatePathContainsCondition(const std::string& str);
142 URLMatcherCondition CreatePathEqualsCondition(const std::string& str);
144 URLMatcherCondition CreateQueryPrefixCondition(const std::string& prefix);
145 URLMatcherCondition CreateQuerySuffixCondition(const std::string& suffix);
146 URLMatcherCondition CreateQueryContainsCondition(const std::string& str);
147 URLMatcherCondition CreateQueryEqualsCondition(const std::string& str);
149 // This covers the common case, where you don't care whether a domain
150 // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it
151 // should be followed by a given |path_prefix|.
152 URLMatcherCondition CreateHostSuffixPathPrefixCondition(
153 const std::string& host_suffix,
154 const std::string& path_prefix);
// Same as above, but |host| has to match the host exactly rather than as a
// suffix (presumably; mirrors HOST_EQUALS_PATH_PREFIX -- confirm in the .cc).
155 URLMatcherCondition CreateHostEqualsPathPrefixCondition(
156 const std::string& host,
157 const std::string& path_prefix);
159 // Canonicalizes a URL for "CreateURL*Condition" searches.
160 std::string CanonicalizeURLForFullSearches(const GURL& url) const;
162 // Canonicalizes a URL for "CreateURLMatchesCondition" searches.
163 std::string CanonicalizeURLForRegexSearches(const GURL& url) const;
164 // Canonicalizes a URL for "CreateOriginAndPathMatchesCondition" searches.
165 std::string CanonicalizeURLForOriginAndPathRegexSearches(
166 const GURL& url) const;
168 URLMatcherCondition CreateURLPrefixCondition(const std::string& prefix);
169 URLMatcherCondition CreateURLSuffixCondition(const std::string& suffix);
170 URLMatcherCondition CreateURLContainsCondition(const std::string& str);
171 URLMatcherCondition CreateURLEqualsCondition(const std::string& str);
173 URLMatcherCondition CreateURLMatchesCondition(const std::string& regex);
174 URLMatcherCondition CreateOriginAndPathMatchesCondition(
175 const std::string& regex);
177 // Removes all patterns from the pattern singleton sets that are not listed
178 // in |used_patterns|. These patterns are not referenced any more and get
179 // freed.
180 void ForgetUnusedPatterns(
181 const std::set<StringPattern::ID>& used_patterns);
183 // Returns true if this object retains no allocated data. Only for debugging.
184 bool IsEmpty() const;
186 private:
187 // Creates a URLMatcherCondition according to the parameters passed.
188 // The URLMatcherCondition will refer to a StringPattern that is
189 // owned by one of the pattern singleton sets declared below.
190 URLMatcherCondition CreateCondition(URLMatcherCondition::Criterion criterion,
191 const std::string& pattern);
193 // Prepends a "." to the prefix if it does not start with one.
194 std::string CanonicalizeHostPrefix(const std::string& prefix) const;
195 // Appends a "." to the suffix if it does not end with one.
// NOTE(review): this comment previously read "does not start with one", which
// contradicts "appends"; confirm against the implementation.
196 std::string CanonicalizeHostSuffix(const std::string& suffix) const;
197 // Adds "." to either side of the hostname if not present yet.
198 std::string CanonicalizeHostname(const std::string& hostname) const;
200 // Converts the query string to canonical form suitable for key token search.
// NOTE(review): the two flags presumably control whether begin/end-of-query
// markers are added to the result -- confirm in the .cc.
201 std::string CanonicalizeQuery(std::string query,
202 bool prepend_beginning_of_query_component,
203 bool append_end_of_query_component) const;
205 // Counter that ensures that all created StringPatterns have unique IDs.
206 // Note that substring patterns and regex patterns will use different IDs.
207 int id_counter_;
209 // This comparison considers only the pattern() value of the
210 // StringPatterns.
211 struct StringPatternPointerCompare {
212 bool operator()(StringPattern* lhs, StringPattern* rhs) const;
214 // Set to ensure that we generate only one StringPattern for each content
215 // of StringPattern::pattern().
216 typedef std::set<StringPattern*, StringPatternPointerCompare>
217 PatternSingletons;
// One singleton set per kind of pattern: substring, regex, and
// origin-and-path regex.
218 PatternSingletons substring_pattern_singletons_;
219 PatternSingletons regex_pattern_singletons_;
220 PatternSingletons origin_and_path_regex_pattern_singletons_;
222 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory);
225 // This class represents a single URL query matching condition. The query
226 // matching is done as a search for a key and optionally a value.
227 // The matching makes use of CanonicalizeURLForComponentSearches to ensure that
228 // the key starts and ends (optionally) with the right marker.
//
// Instances are ordered through operator<, which allows them to be stored in
// a std::set (see URLMatcherConditionSet::QueryConditions).
229 class URL_MATCHER_EXPORT URLQueryElementMatcherCondition {
230 public:
231 // Multiple occurrences of the same key can happen in a URL query. The type
232 // selects whether every (MATCH_ALL), any (MATCH_ANY), the first (MATCH_FIRST)
233 // or the last (MATCH_LAST) occurrence of the key must match the value.
234 enum Type { MATCH_ANY, MATCH_FIRST, MATCH_LAST, MATCH_ALL };
236 // Allows the match to be exact (QUERY_VALUE_MATCH_EXACT, starts and ends with
237 // a delimiter or a border) or simply a prefix (QUERY_VALUE_MATCH_PREFIX,
238 // starts with a delimiter or a border).
239 enum QueryValueMatchType {
240 QUERY_VALUE_MATCH_EXACT,
241 QUERY_VALUE_MATCH_PREFIX
244 // Used to indicate if the query parameter is of type &key=value&
245 // (ELEMENT_TYPE_KEY_VALUE) or simply &key& (ELEMENT_TYPE_KEY).
246 enum QueryElementType { ELEMENT_TYPE_KEY_VALUE, ELEMENT_TYPE_KEY };
// Builds a condition on |key| (and |value|) using the match options described
// by the enums above.
// NOTE(review): |factory| is presumably used to create, and therefore own,
// the StringPattern returned by string_pattern() -- confirm in the .cc.
248 URLQueryElementMatcherCondition(const std::string& key,
249 const std::string& value,
250 QueryValueMatchType query_value_match_type,
251 QueryElementType query_element_type,
252 Type match_type,
253 URLMatcherConditionFactory* factory);
254 ~URLQueryElementMatcherCondition();
// Ordering required for use as a std::set element.
256 bool operator<(const URLQueryElementMatcherCondition& rhs) const;
258 // Returns whether the URL query satisfies the key value constraint.
259 bool IsMatch(const std::string& canonical_url_query) const;
261 const StringPattern* string_pattern() const { return string_pattern_; }
263 private:
264 Type match_type_;
265 std::string key_;
266 std::string value_;
// NOTE(review): presumably the cached lengths of |key_| and |value_| -- confirm.
267 size_t key_length_;
268 size_t value_length_;
// Raw pointer; presumably not owned by this object -- confirm.
269 const StringPattern* string_pattern_;
272 // This class represents a filter for the URL scheme to be hooked up into a
273 // URLMatcherConditionSet.
274 class URL_MATCHER_EXPORT URLMatcherSchemeFilter {
275 public:
// Constructs a filter from a single scheme string or from a list of them.
276 explicit URLMatcherSchemeFilter(const std::string& filter);
277 explicit URLMatcherSchemeFilter(const std::vector<std::string>& filters);
278 ~URLMatcherSchemeFilter();
// Returns whether |url| passes this filter.
// NOTE(review): presumably true when the URL's scheme equals one of
// |filters_| -- confirm in the .cc.
279 bool IsMatch(const GURL& url) const;
281 private:
// The scheme strings this filter was constructed with.
282 std::vector<std::string> filters_;
284 DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter);
287 // This class represents a filter for port numbers to be hooked up into a
288 // URLMatcherConditionSet.
289 class URL_MATCHER_EXPORT URLMatcherPortFilter {
290 public:
291 // Boundaries of a port range (both ends are included).
292 typedef std::pair<int, int> Range;
// Constructs a filter from a list of port ranges.
293 explicit URLMatcherPortFilter(const std::vector<Range>& ranges);
294 ~URLMatcherPortFilter();
// Returns whether |url| passes this filter.
// NOTE(review): presumably true when the URL's port lies within one of
// |ranges_| -- confirm in the .cc.
295 bool IsMatch(const GURL& url) const;
297 // Creates a port range [from, to]; both ends are included.
298 static Range CreateRange(int from, int to);
299 // Creates a port range containing a single port.
300 static Range CreateRange(int port);
302 private:
// The port ranges this filter was constructed with.
303 std::vector<Range> ranges_;
305 DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter);
308 // This class represents a set of conditions that all need to match on a
309 // given URL in order to be considered a match.
//
// Instances are reference counted (base::RefCounted) and are handled through
// scoped_refptr, see the Vector typedef below.
310 class URL_MATCHER_EXPORT URLMatcherConditionSet
311 : public base::RefCounted<URLMatcherConditionSet> {
312 public:
// Identifier of a condition set.
313 typedef int ID;
// Containers for the conditions of a set and for a list of condition sets.
314 typedef std::set<URLMatcherCondition> Conditions;
315 typedef std::set<URLQueryElementMatcherCondition> QueryConditions;
316 typedef std::vector<scoped_refptr<URLMatcherConditionSet> > Vector;
318 // Matches if all conditions in |conditions| are fulfilled.
319 URLMatcherConditionSet(ID id, const Conditions& conditions);
321 // Matches if all conditions in |conditions|, |scheme_filter| and
322 // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL,
323 // in which case, no restrictions are imposed on the scheme/port of a URL.
// Ownership of the filters is passed to this object (scoped_ptr by value).
324 URLMatcherConditionSet(ID id, const Conditions& conditions,
325 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
326 scoped_ptr<URLMatcherPortFilter> port_filter);
328 // Matches if all conditions in |conditions|, |query_conditions|,
329 // |scheme_filter| and |port_filter| are fulfilled. |scheme_filter| and
330 // |port_filter| may be NULL, in which case, no restrictions are imposed on
331 // the scheme/port of a URL.
332 URLMatcherConditionSet(ID id,
333 const Conditions& conditions,
334 const QueryConditions& query_conditions,
335 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
336 scoped_ptr<URLMatcherPortFilter> port_filter);
// Accessors for the values passed to the constructor.
338 ID id() const { return id_; }
339 const Conditions& conditions() const { return conditions_; }
340 const QueryConditions& query_conditions() const { return query_conditions_; }
// Returns whether this condition set is fulfilled according to
// |matching_patterns| and |url|.
342 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
343 const GURL& url) const;
// Overload that additionally receives |url_for_component_searches|.
// NOTE(review): presumably the result of
// URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(url), used
// for evaluating the query conditions -- confirm in the .cc.
345 bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
346 const GURL& url,
347 const std::string& url_for_component_searches) const;
349 private:
// The destructor is private; base::RefCounted is declared a friend so that it
// has access to it.
350 friend class base::RefCounted<URLMatcherConditionSet>;
351 ~URLMatcherConditionSet();
352 ID id_;
353 Conditions conditions_;
354 QueryConditions query_conditions_;
// Either filter may be NULL, meaning no restriction (see the constructors).
355 scoped_ptr<URLMatcherSchemeFilter> scheme_filter_;
356 scoped_ptr<URLMatcherPortFilter> port_filter_;
358 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet);
361 // This class allows matching one URL against a large set of
362 // URLMatcherConditionSets at the same time.
363 class URL_MATCHER_EXPORT URLMatcher {
364 public:
365 URLMatcher();
366 ~URLMatcher();
368 // Adds new URLMatcherConditionSets to this URLMatcher. Each condition set
369 // must have a unique ID.
370 // This is an expensive operation as it triggers pre-calculations on the
371 // currently registered condition sets. Do not call this operation many
372 // times with a single condition set in each call.
373 void AddConditionSets(const URLMatcherConditionSet::Vector& condition_sets);
375 // Removes the listed condition sets. All |condition_set_ids| must be
376 // currently registered. This function should be called with large batches
377 // of |condition_set_ids| at a time to improve performance.
378 void RemoveConditionSets(
379 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids);
381 // Removes all unused condition sets from the ConditionFactory.
382 void ClearUnusedConditionSets();
384 // Returns the IDs of all URLMatcherConditionSets that match |url|.
385 std::set<URLMatcherConditionSet::ID> MatchURL(const GURL& url) const;
387 // Returns the URLMatcherConditionFactory that must be used to create
388 // URLMatcherConditionSets for this URLMatcher.
// The returned pointer refers to a member of this URLMatcher, so it is valid
// only as long as this object is alive.
389 URLMatcherConditionFactory* condition_factory() {
390 return &condition_factory_;
393 // Returns true if this object retains no allocated data. Only for debugging.
394 bool IsEmpty() const;
396 private:
// Helpers that refresh the internal data structures.
// NOTE(review): presumably invoked after condition sets are added or removed
// (cf. the "pre-calculations" mentioned on AddConditionSets) -- confirm in
// the .cc.
397 void UpdateSubstringSetMatcher(bool full_url_conditions);
398 void UpdateRegexSetMatcher();
399 void UpdateTriggers();
400 void UpdateConditionFactory();
401 void UpdateInternalDatastructures();
// The factory handed out by condition_factory().
403 URLMatcherConditionFactory condition_factory_;
405 // Maps the ID of a URLMatcherConditionSet to the respective
406 // URLMatcherConditionSet.
407 typedef std::map<URLMatcherConditionSet::ID,
408 scoped_refptr<URLMatcherConditionSet> >
409 URLMatcherConditionSets;
410 URLMatcherConditionSets url_matcher_condition_sets_;
412 // Maps a StringPattern ID to the IDs of the URLMatcherConditionSets that need
413 // to be triggered in case of a StringPattern match.
414 typedef std::map<StringPattern::ID, std::set<URLMatcherConditionSet::ID> >
415 StringPatternTriggers;
416 StringPatternTriggers substring_match_triggers_;
// Matchers for full-URL substring patterns, URL-component substring patterns,
// and the two kinds of regular expression patterns.
418 SubstringSetMatcher full_url_matcher_;
419 SubstringSetMatcher url_component_matcher_;
420 RegexSetMatcher regex_set_matcher_;
421 RegexSetMatcher origin_and_path_regex_set_matcher_;
// NOTE(review): presumably the patterns currently registered with
// |full_url_matcher_| and |url_component_matcher_| respectively -- confirm.
422 std::set<const StringPattern*> registered_full_url_patterns_;
423 std::set<const StringPattern*> registered_url_component_patterns_;
425 DISALLOW_COPY_AND_ASSIGN(URLMatcher);
428 } // namespace url_matcher
430 #endif // COMPONENTS_URL_MATCHER_URL_MATCHER_H_