1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef COMPONENTS_URL_MATCHER_URL_MATCHER_H_
6 #define COMPONENTS_URL_MATCHER_URL_MATCHER_H_
11 #include "base/memory/ref_counted.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "components/url_matcher/regex_set_matcher.h"
14 #include "components/url_matcher/substring_set_matcher.h"
15 #include "components/url_matcher/url_matcher_export.h"
20 class DictionaryValue
;
23 namespace url_matcher
{
25 // This class represents a single URL matching condition, e.g. a match on the
26 // host suffix or the containment of a string in the query component of a GURL.
28 // The difference from a simple StringPattern is that this also supports
29 // checking whether the {Host, Path, Query} of a URL contains a string. The
30 // reduction of URL matching conditions to StringPatterns conducted by
31 // URLMatcherConditionFactory is not capable of expressing that alone.
33 // Also supported is matching regular expressions against the URL (URL_MATCHES).
34 class URL_MATCHER_EXPORT URLMatcherCondition
{
// NOTE(review): the names below appear to be enumerators of the Criterion
// type used by the constructor and by criterion(); the enclosing enum
// declaration and its remaining values are not visible in this excerpt.
49 HOST_SUFFIX_PATH_PREFIX
,
50 HOST_EQUALS_PATH_PREFIX
,
56 ORIGIN_AND_PATH_MATCHES
, // Matches the URL minus its query string.
// Default constructor and destructor.
59 URLMatcherCondition();
60 ~URLMatcherCondition();
// Creates a condition from a |criterion| and the StringPattern it refers to.
// For conditions created by a URLMatcherConditionFactory, the factory owns
// the StringPattern (see the IMPORTANT note on that class).
61 URLMatcherCondition(Criterion criterion
,
62 const StringPattern
* substring_pattern
);
// Conditions are copyable and assignable.
63 URLMatcherCondition(const URLMatcherCondition
& rhs
);
64 URLMatcherCondition
& operator=(const URLMatcherCondition
& rhs
);
// Ordering operator; allows conditions to be stored in ordered containers
// such as the std::set used by URLMatcherConditionSet::Conditions.
65 bool operator<(const URLMatcherCondition
& rhs
) const;
// Returns the criterion of this condition.
67 Criterion
criterion() const { return criterion_
; }
// Returns the StringPattern associated with this condition.
68 const StringPattern
* string_pattern() const {
69 return string_pattern_
;
72 // Returns whether this URLMatcherCondition needs to be executed on a
73 // full URL rather than the individual components (see
74 // URLMatcherConditionFactory).
75 bool IsFullURLCondition() const;
77 // Returns whether this URLMatcherCondition is a regular expression to be
78 // handled by a regex matcher instead of a substring matcher.
79 bool IsRegexCondition() const;
81 // Returns whether this URLMatcherCondition is a regular expression that shall
82 // be evaluated on the URL without the query parameter.
83 bool IsOriginAndPathRegexCondition() const;
85 // Returns whether this condition is fulfilled according to
86 // |matching_patterns| and |url|.
87 bool IsMatch(const std::set
<StringPattern::ID
>& matching_patterns
,
88 const GURL
& url
) const;
91 // |criterion_| and |string_pattern_| describe together what property a URL
92 // needs to fulfill to be considered a match.
95 // This is the StringPattern that is used in a SubstringSetMatcher.
96 const StringPattern
* string_pattern_
;
99 // Class to map the problem of finding {host, path, query} {prefixes, suffixes,
100 // containments, and equality} in GURLs to the substring matching problem.
102 // Say, you want to check whether the path of a URL starts with "/index.html".
103 // This class preprocesses a URL like "www.google.com/index.html" into something
104 // like "www.google.com|/index.html". After preprocessing, you can search for
105 // "|/index.html" in the string and see that this candidate URL actually has
106 // a path that starts with "/index.html". On the contrary,
107 // "www.google.com/images/index.html" would be normalized to
108 // "www.google.com|/images/index.html". It is easy to see that it contains
109 // "/index.html" but the path of the URL does not start with "/index.html".
111 // This preprocessing is important if you want to match a URL against many
112 // patterns because it reduces the matching to a "discover all substrings
113 // of a dictionary in a text" problem, which can be solved very efficiently
114 // by the Aho-Corasick algorithm.
116 // IMPORTANT: The URLMatcherConditionFactory owns the StringPattern
117 // referenced by created URLMatcherConditions. Therefore, it must outlive
118 // all created URLMatcherConditions and the SubstringSetMatcher.
119 class URL_MATCHER_EXPORT URLMatcherConditionFactory
{
121 URLMatcherConditionFactory();
122 ~URLMatcherConditionFactory();
124 // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches.
125 std::string
CanonicalizeURLForComponentSearches(const GURL
& url
) const;
127 // Factory methods for various condition types.
129 // Note that these methods fill the pattern_singletons_. If you create
130 // conditions and don't register them to a URLMatcher, they will continue to
131 // consume memory. You need to call ForgetUnusedPatterns() or
132 // URLMatcher::ClearUnusedConditionSets() in this case.
// Conditions on the host component: prefix, suffix, containment, equality.
133 URLMatcherCondition
CreateHostPrefixCondition(const std::string
& prefix
);
134 URLMatcherCondition
CreateHostSuffixCondition(const std::string
& suffix
);
135 URLMatcherCondition
CreateHostContainsCondition(const std::string
& str
);
136 URLMatcherCondition
CreateHostEqualsCondition(const std::string
& str
);
// Conditions on the path component: prefix, suffix, containment, equality.
138 URLMatcherCondition
CreatePathPrefixCondition(const std::string
& prefix
);
139 URLMatcherCondition
CreatePathSuffixCondition(const std::string
& suffix
);
140 URLMatcherCondition
CreatePathContainsCondition(const std::string
& str
);
141 URLMatcherCondition
CreatePathEqualsCondition(const std::string
& str
);
// Conditions on the query component: prefix, suffix, containment, equality.
143 URLMatcherCondition
CreateQueryPrefixCondition(const std::string
& prefix
);
144 URLMatcherCondition
CreateQuerySuffixCondition(const std::string
& suffix
);
145 URLMatcherCondition
CreateQueryContainsCondition(const std::string
& str
);
146 URLMatcherCondition
CreateQueryEqualsCondition(const std::string
& str
);
148 // This covers the common case, where you don't care whether a domain
149 // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it
150 // should be followed by a given |path_prefix|.
151 URLMatcherCondition
CreateHostSuffixPathPrefixCondition(
152 const std::string
& host_suffix
,
153 const std::string
& path_prefix
);
// Variant of the above that takes a |host| instead of a |host_suffix|.
154 URLMatcherCondition
CreateHostEqualsPathPrefixCondition(
155 const std::string
& host
,
156 const std::string
& path_prefix
);
158 // Canonicalizes a URL for "CreateURL*Condition" searches.
159 std::string
CanonicalizeURLForFullSearches(const GURL
& url
) const;
161 // Canonicalizes a URL for "CreateURLMatchesCondition" searches.
162 std::string
CanonicalizeURLForRegexSearches(const GURL
& url
) const;
163 // Canonicalizes a URL for "CreateOriginAndPathMatchesCondition" searches.
164 std::string
CanonicalizeURLForOriginAndPathRegexSearches(
165 const GURL
& url
) const;
// Conditions on the full URL (see CanonicalizeURLForFullSearches): prefix,
// suffix, containment, equality.
167 URLMatcherCondition
CreateURLPrefixCondition(const std::string
& prefix
);
168 URLMatcherCondition
CreateURLSuffixCondition(const std::string
& suffix
);
169 URLMatcherCondition
CreateURLContainsCondition(const std::string
& str
);
170 URLMatcherCondition
CreateURLEqualsCondition(const std::string
& str
);
// Regular-expression conditions. The first is matched against the URL as
// canonicalized by CanonicalizeURLForRegexSearches, the second against the
// URL as canonicalized by CanonicalizeURLForOriginAndPathRegexSearches.
172 URLMatcherCondition
CreateURLMatchesCondition(const std::string
& regex
);
173 URLMatcherCondition
CreateOriginAndPathMatchesCondition(
174 const std::string
& regex
);
176 // Removes all patterns from |pattern_singletons_| that are not listed in
177 // |used_patterns|. These patterns are not referenced any more and get
// released.
179 void ForgetUnusedPatterns(
180 const std::set
<StringPattern::ID
>& used_patterns
);
182 // Returns true if this object retains no allocated data. Only for debugging.
183 bool IsEmpty() const;
186 // Creates a URLMatcherCondition according to the parameters passed.
187 // The URLMatcherCondition will refer to a StringPattern that is
188 // owned by |pattern_singletons_|.
189 URLMatcherCondition
CreateCondition(URLMatcherCondition::Criterion criterion
,
190 const std::string
& pattern
);
192 // Prepends a "." to the prefix if it does not start with one.
193 std::string
CanonicalizeHostPrefix(const std::string
& prefix
) const;
194 // Appends a "." to the suffix if it does not end with one.
195 std::string
CanonicalizeHostSuffix(const std::string
& suffix
) const;
196 // Adds "." to either side of the hostname if not present yet.
197 std::string
CanonicalizeHostname(const std::string
& hostname
) const;
199 // Convert the query string to canonical form suitable for key token search.
// NOTE(review): the two flags presumably control whether markers for the
// beginning and the end of the query component are added -- confirm in the
// implementation.
200 std::string
CanonicalizeQuery(std::string query
,
201 bool prepend_beginning_of_query_component
,
202 bool append_end_of_query_component
) const;
204 // Counter that ensures that all created StringPatterns have unique IDs.
205 // Note that substring patterns and regex patterns will use different IDs.
208 // This comparison considers only the pattern() value of the
// StringPatterns being compared.
210 struct StringPatternPointerCompare
{
211 bool operator()(StringPattern
* lhs
, StringPattern
* rhs
) const;
213 // Set to ensure that we generate only one StringPattern for each content
214 // of StringPattern::pattern().
215 typedef std::set
<StringPattern
*, StringPatternPointerCompare
>
// One singleton set per kind of pattern: substring patterns, regex patterns,
// and origin-and-path regex patterns.
217 PatternSingletons substring_pattern_singletons_
;
218 PatternSingletons regex_pattern_singletons_
;
219 PatternSingletons origin_and_path_regex_pattern_singletons_
;
// Instances of this class cannot be copied or assigned.
221 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory
);
224 // This class represents a single URL query matching condition. The query
225 // matching is done as a search for a key and optionally a value.
226 // The matching makes use of CanonicalizeURLForComponentSearches to ensure that
227 // the key starts and ends (optionally) with the right marker.
228 class URL_MATCHER_EXPORT URLQueryElementMatcherCondition
{
230 // Multiple occurrences of the same key can happen in a URL query. The type
231 // ensures that every (MATCH_ALL), any (MATCH_ANY), first (MATCH_FIRST) or
232 // last (MATCH_LAST) instance of the key occurrence matches the value.
233 enum Type
{ MATCH_ANY
, MATCH_FIRST
, MATCH_LAST
, MATCH_ALL
};
235 // Allows the match to be exact (QUERY_VALUE_MATCH_EXACT, starts and ends with
236 // a delimiter or a border) or simply a prefix (QUERY_VALUE_MATCH_PREFIX,
237 // starts with a delimiter or a border).
238 enum QueryValueMatchType
{
239 QUERY_VALUE_MATCH_EXACT
,
240 QUERY_VALUE_MATCH_PREFIX
243 // Used to indicate if the query parameter is of type &key=value&
244 // (ELEMENT_TYPE_KEY_VALUE) or simply &key& (ELEMENT_TYPE_KEY).
245 enum QueryElementType
{ ELEMENT_TYPE_KEY_VALUE
, ELEMENT_TYPE_KEY
};
// Creates a condition on the query element |key| and, optionally, its
// |value|; |query_value_match_type| and |query_element_type| select the
// matching mode as described on the enums above.
// NOTE(review): |factory| is presumably used to create the StringPattern
// returned by string_pattern() -- confirm in the implementation.
247 URLQueryElementMatcherCondition(const std::string
& key
,
248 const std::string
& value
,
249 QueryValueMatchType query_value_match_type
,
250 QueryElementType query_element_type
,
252 URLMatcherConditionFactory
* factory
);
253 ~URLQueryElementMatcherCondition();
// Ordering operator; allows conditions to be stored in ordered containers
// such as the std::set used by URLMatcherConditionSet::QueryConditions.
255 bool operator<(const URLQueryElementMatcherCondition
& rhs
) const;
257 // Returns whether the URL query satisfies the key value constraint.
258 bool IsMatch(const std::string
& canonical_url_query
) const;
// Returns the StringPattern associated with this condition.
260 const StringPattern
* string_pattern() const { return string_pattern_
; }
// NOTE(review): presumably the length of the value to match -- confirm in
// the implementation.
267 size_t value_length_
;
// StringPattern returned by string_pattern().
268 const StringPattern
* string_pattern_
;
271 // This class represents a filter for the URL scheme to be hooked up into a
272 // URLMatcherConditionSet.
273 class URL_MATCHER_EXPORT URLMatcherSchemeFilter
{
// Creates a scheme filter from a single |filter| string.
275 explicit URLMatcherSchemeFilter(const std::string
& filter
);
// Creates a scheme filter from several |filters|.
276 explicit URLMatcherSchemeFilter(const std::vector
<std::string
>& filters
);
277 ~URLMatcherSchemeFilter();
// Returns whether |url| passes this filter.
278 bool IsMatch(const GURL
& url
) const;
// The filter strings this object was configured with.
281 std::vector
<std::string
> filters_
;
// Instances of this class cannot be copied or assigned.
283 DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter
);
286 // This class represents a filter for port numbers to be hooked up into a
287 // URLMatcherConditionSet.
288 class URL_MATCHER_EXPORT URLMatcherPortFilter
{
290 // Boundaries of a port range (both ends are included).
291 typedef std::pair
<int, int> Range
;
// Creates a port filter from a list of port |ranges|.
292 explicit URLMatcherPortFilter(const std::vector
<Range
>& ranges
);
293 ~URLMatcherPortFilter();
// Returns whether |url| passes this filter.
294 bool IsMatch(const GURL
& url
) const;
296 // Creates a port range [from, to]; both ends are included.
297 static Range
CreateRange(int from
, int to
);
298 // Creates a port range containing a single port.
299 static Range
CreateRange(int port
);
// The port ranges this object was configured with.
302 std::vector
<Range
> ranges_
;
// Instances of this class cannot be copied or assigned.
304 DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter
);
307 // This class represents a set of conditions that all need to match on a
308 // given URL in order to be considered a match.
309 class URL_MATCHER_EXPORT URLMatcherConditionSet
310 : public base::RefCounted
<URLMatcherConditionSet
> {
// Ordered set of URL conditions (relies on URLMatcherCondition::operator<).
313 typedef std::set
<URLMatcherCondition
> Conditions
;
// Ordered set of query element conditions (relies on
// URLQueryElementMatcherCondition::operator<).
314 typedef std::set
<URLQueryElementMatcherCondition
> QueryConditions
;
// List of reference-counted condition sets, as accepted by
// URLMatcher::AddConditionSets().
315 typedef std::vector
<scoped_refptr
<URLMatcherConditionSet
> > Vector
;
317 // Matches if all conditions in |conditions| are fulfilled.
318 URLMatcherConditionSet(ID id
, const Conditions
& conditions
);
320 // Matches if all conditions in |conditions|, |scheme_filter| and
321 // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL,
322 // in which case, no restrictions are imposed on the scheme/port of a URL.
323 URLMatcherConditionSet(ID id
, const Conditions
& conditions
,
324 scoped_ptr
<URLMatcherSchemeFilter
> scheme_filter
,
325 scoped_ptr
<URLMatcherPortFilter
> port_filter
);
327 // Matches if all conditions in |conditions|, |query_conditions|,
328 // |scheme_filter| and |port_filter| are fulfilled. |scheme_filter| and
329 // |port_filter| may be NULL, in which case, no restrictions are imposed on
330 // the scheme/port of a URL.
331 URLMatcherConditionSet(ID id
,
332 const Conditions
& conditions
,
333 const QueryConditions
& query_conditions
,
334 scoped_ptr
<URLMatcherSchemeFilter
> scheme_filter
,
335 scoped_ptr
<URLMatcherPortFilter
> port_filter
);
// Accessors for the ID and the conditions of this set.
337 ID
id() const { return id_
; }
338 const Conditions
& conditions() const { return conditions_
; }
339 const QueryConditions
& query_conditions() const { return query_conditions_
; }
// Returns whether this condition set matches, given the IDs of the
// StringPatterns that matched (|matching_patterns|) and the |url|.
341 bool IsMatch(const std::set
<StringPattern::ID
>& matching_patterns
,
342 const GURL
& url
) const;
// Overload that additionally takes the URL in the form canonicalized for
// component searches (|url_for_component_searches|).
344 bool IsMatch(const std::set
<StringPattern::ID
>& matching_patterns
,
346 const std::string
& url_for_component_searches
) const;
// base::RefCounted is declared a friend right before the destructor.
// NOTE(review): presumably so that it can destroy the object when the last
// reference is released -- the access specifiers are not visible here.
349 friend class base::RefCounted
<URLMatcherConditionSet
>;
350 ~URLMatcherConditionSet();
// Conditions and optional filters that make up this set.
352 Conditions conditions_
;
353 QueryConditions query_conditions_
;
354 scoped_ptr
<URLMatcherSchemeFilter
> scheme_filter_
;
355 scoped_ptr
<URLMatcherPortFilter
> port_filter_
;
// Instances of this class cannot be copied or assigned.
357 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet
);
360 // This class allows matching one URL against a large set of
361 // URLMatcherConditionSets at the same time.
362 class URL_MATCHER_EXPORT URLMatcher
{
367 // Adds new URLMatcherConditionSet to this URL Matcher. Each condition set
368 // must have a unique ID.
369 // This is an expensive operation as it triggers pre-calculations on the
370 // currently registered condition sets. Do not call this operation many
371 // times with a single condition set in each call.
372 void AddConditionSets(const URLMatcherConditionSet::Vector
& condition_sets
);
374 // Removes the listed condition sets. All |condition_set_ids| must be
375 // currently registered. This function should be called with large batches
376 // of |condition_set_ids| at a time to improve performance.
377 void RemoveConditionSets(
378 const std::vector
<URLMatcherConditionSet::ID
>& condition_set_ids
);
380 // Removes all unused condition sets from the ConditionFactory.
381 void ClearUnusedConditionSets();
383 // Returns the IDs of all URLMatcherConditionSets that match |url|.
384 std::set
<URLMatcherConditionSet::ID
> MatchURL(const GURL
& url
) const;
386 // Returns the URLMatcherConditionFactory that must be used to create
387 // URLMatcherConditionSets for this URLMatcher.
388 URLMatcherConditionFactory
* condition_factory() {
389 return &condition_factory_
;
392 // Returns true if this object retains no allocated data. Only for debugging.
393 bool IsEmpty() const;
// Helpers that refresh the internal data structures declared below.
// NOTE(review): presumably invoked after condition sets have been added or
// removed -- confirm in the implementation.
396 void UpdateSubstringSetMatcher(bool full_url_conditions
);
397 void UpdateRegexSetMatcher();
398 void UpdateTriggers();
399 void UpdateConditionFactory();
400 void UpdateInternalDatastructures();
// Factory handed out by condition_factory().
402 URLMatcherConditionFactory condition_factory_
;
404 // Maps the ID of a URLMatcherConditionSet to the respective
405 // URLMatcherConditionSet.
406 typedef std::map
<URLMatcherConditionSet::ID
,
407 scoped_refptr
<URLMatcherConditionSet
> >
408 URLMatcherConditionSets
;
409 URLMatcherConditionSets url_matcher_condition_sets_
;
411 // Maps a StringPattern ID to the URLMatcherConditions that need to
412 // be triggered in case of a StringPattern match.
413 typedef std::map
<StringPattern::ID
, std::set
<URLMatcherConditionSet::ID
> >
414 StringPatternTriggers
;
415 StringPatternTriggers substring_match_triggers_
;
// Substring matchers for full-URL and URL-component patterns, and regex
// matchers for URL and origin-and-path patterns.
417 SubstringSetMatcher full_url_matcher_
;
418 SubstringSetMatcher url_component_matcher_
;
419 RegexSetMatcher regex_set_matcher_
;
420 RegexSetMatcher origin_and_path_regex_set_matcher_
;
// NOTE(review): presumably the patterns currently registered with
// |full_url_matcher_| and |url_component_matcher_| respectively -- confirm in
// the implementation.
421 std::set
<const StringPattern
*> registered_full_url_patterns_
;
422 std::set
<const StringPattern
*> registered_url_component_patterns_
;
// Instances of this class cannot be copied or assigned.
424 DISALLOW_COPY_AND_ASSIGN(URLMatcher
);
427 } // namespace url_matcher
429 #endif // COMPONENTS_URL_MATCHER_URL_MATCHER_H_