1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef COMPONENTS_URL_MATCHER_URL_MATCHER_H_
6 #define COMPONENTS_URL_MATCHER_URL_MATCHER_H_
11 #include "base/memory/ref_counted.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/memory/scoped_vector.h"
14 #include "components/url_matcher/regex_set_matcher.h"
15 #include "components/url_matcher/substring_set_matcher.h"
16 #include "components/url_matcher/url_matcher_export.h"
21 class DictionaryValue
;
24 namespace url_matcher
{
26 // This class represents a single URL matching condition, e.g. a match on the
27 // host suffix or the containment of a string in the query component of a GURL.
29 // The difference from a simple StringPattern is that this also supports
30 // checking whether the {Host, Path, Query} of a URL contains a string. The
31 // reduction of URL matching conditions to StringPatterns conducted by
32 // URLMatcherConditionFactory is not capable of expressing that alone.
34 // Also supported is matching regular expressions against the URL (URL_MATCHES).
//
// Instances are copyable, assignable and ordered via operator<, which allows
// them to be stored by value in a std::set (see
// URLMatcherConditionSet::Conditions).
35 class URL_MATCHER_EXPORT URLMatcherCondition
{
// Values of the Criterion enumeration.
// NOTE(review): only part of the enumeration is visible here; the remaining
// values and the enum header are not shown.
50 HOST_SUFFIX_PATH_PREFIX
,
51 HOST_EQUALS_PATH_PREFIX
,
57 ORIGIN_AND_PATH_MATCHES
, // Matches the URL minus its query string.
60 URLMatcherCondition();
61 ~URLMatcherCondition();
// Creates a condition that tests |criterion| by means of |substring_pattern|.
// The pattern is only referenced, not owned: according to the documentation
// of URLMatcherConditionFactory, the factory owns the StringPatterns of the
// conditions it creates.
62 URLMatcherCondition(Criterion criterion
,
63 const StringPattern
* substring_pattern
);
64 URLMatcherCondition(const URLMatcherCondition
& rhs
);
65 URLMatcherCondition
& operator=(const URLMatcherCondition
& rhs
);
// Ordering relation, required for storing conditions in a std::set.
// NOTE(review): which fields take part in the comparison is defined in the
// implementation file — confirm there before relying on a specific order.
66 bool operator<(const URLMatcherCondition
& rhs
) const;
// Accessor for |criterion_|.
68 Criterion
criterion() const { return criterion_
; }
// Accessor for |string_pattern_| (non-owning pointer).
69 const StringPattern
* string_pattern() const {
70 return string_pattern_
;
73 // Returns whether this URLMatcherCondition needs to be executed on a
74 // full URL rather than the individual components (see
75 // URLMatcherConditionFactory).
76 bool IsFullURLCondition() const;
78 // Returns whether this URLMatcherCondition is a regular expression to be
79 // handled by a regex matcher instead of a substring matcher.
80 bool IsRegexCondition() const;
82 // Returns whether this URLMatcherCondition is a regular expression that shall
83 // be evaluated on the URL without the query parameter.
84 bool IsOriginAndPathRegexCondition() const;
86 // Returns whether this condition is fulfilled according to
87 // |matching_patterns| and |url|.
88 bool IsMatch(const std::set
<StringPattern::ID
>& matching_patterns
,
89 const GURL
& url
) const;
92 // |criterion_| and |string_pattern_| describe together what property a URL
93 // needs to fulfill to be considered a match.
96 // This is the StringPattern that is used in a SubstringSetMatcher.
97 const StringPattern
* string_pattern_
;
100 // Class to map the problem of finding {host, path, query} {prefixes, suffixes,
101 // containments, and equality} in GURLs to the substring matching problem.
103 // Say, you want to check whether the path of a URL starts with "/index.html".
104 // This class preprocesses a URL like "www.google.com/index.html" into something
105 // like "www.google.com|/index.html". After preprocessing, you can search for
106 // "|/index.html" in the string and see that this candidate URL actually has
107 // a path that starts with "/index.html". On the contrary,
108 // "www.google.com/images/index.html" would be normalized to
109 // "www.google.com|/images/index.html". It is easy to see that it contains
110 // "/index.html" but the path of the URL does not start with "/index.html".
112 // This preprocessing is important if you want to match a URL against many
113 // patterns because it reduces the matching to a "discover all substrings
114 // of a dictionary in a text" problem, which can be solved very efficiently
115 // by the Aho-Corasick algorithm.
117 // IMPORTANT: The URLMatcherConditionFactory owns the StringPattern
118 // referenced by created URLMatcherConditions. Therefore, it must outlive
119 // all created URLMatcherCondition and the SubstringSetMatcher.
120 class URL_MATCHER_EXPORT URLMatcherConditionFactory
{
122 URLMatcherConditionFactory();
123 ~URLMatcherConditionFactory();
125 // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches.
126 std::string
CanonicalizeURLForComponentSearches(const GURL
& url
) const;
128 // Factory methods for various condition types.
130 // Note that these methods fill the *_pattern_singletons_ sets. If you create
131 // conditions and don't register them to a URLMatcher, they will continue to
132 // consume memory. You need to call ForgetUnusedPatterns() or
133 // URLMatcher::ClearUnusedConditionSets() in this case.
// Conditions on the host component of a URL.
134 URLMatcherCondition
CreateHostPrefixCondition(const std::string
& prefix
);
135 URLMatcherCondition
CreateHostSuffixCondition(const std::string
& suffix
);
136 URLMatcherCondition
CreateHostContainsCondition(const std::string
& str
);
137 URLMatcherCondition
CreateHostEqualsCondition(const std::string
& str
);
// Conditions on the path component of a URL.
139 URLMatcherCondition
CreatePathPrefixCondition(const std::string
& prefix
);
140 URLMatcherCondition
CreatePathSuffixCondition(const std::string
& suffix
);
141 URLMatcherCondition
CreatePathContainsCondition(const std::string
& str
);
142 URLMatcherCondition
CreatePathEqualsCondition(const std::string
& str
);
// Conditions on the query component of a URL.
144 URLMatcherCondition
CreateQueryPrefixCondition(const std::string
& prefix
);
145 URLMatcherCondition
CreateQuerySuffixCondition(const std::string
& suffix
);
146 URLMatcherCondition
CreateQueryContainsCondition(const std::string
& str
);
147 URLMatcherCondition
CreateQueryEqualsCondition(const std::string
& str
);
149 // This covers the common case, where you don't care whether a domain
150 // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it
151 // should be followed by a given |path_prefix|.
152 URLMatcherCondition
CreateHostSuffixPathPrefixCondition(
153 const std::string
& host_suffix
,
154 const std::string
& path_prefix
);
// NOTE(review): presumably the variant of the method above that requires the
// host to be equal to |host| instead of merely ending with a suffix —
// inferred from the name, confirm against the implementation.
155 URLMatcherCondition
CreateHostEqualsPathPrefixCondition(
156 const std::string
& host
,
157 const std::string
& path_prefix
);
159 // Canonicalizes a URL for "CreateURL*Condition" searches.
160 std::string
CanonicalizeURLForFullSearches(const GURL
& url
) const;
162 // Canonicalizes a URL for "CreateURLMatchesCondition" searches.
163 std::string
CanonicalizeURLForRegexSearches(const GURL
& url
) const;
164 // Canonicalizes a URL for "CreateOriginAndPathMatchesCondition" searches.
165 std::string
CanonicalizeURLForOriginAndPathRegexSearches(
166 const GURL
& url
) const;
// Conditions on the full URL; URLs are canonicalized for these with
// CanonicalizeURLForFullSearches().
168 URLMatcherCondition
CreateURLPrefixCondition(const std::string
& prefix
);
169 URLMatcherCondition
CreateURLSuffixCondition(const std::string
& suffix
);
170 URLMatcherCondition
CreateURLContainsCondition(const std::string
& str
);
171 URLMatcherCondition
CreateURLEqualsCondition(const std::string
& str
);
// Regular expression conditions; URLs are canonicalized for these with
// CanonicalizeURLForRegexSearches() and
// CanonicalizeURLForOriginAndPathRegexSearches() respectively.
173 URLMatcherCondition
CreateURLMatchesCondition(const std::string
& regex
);
174 URLMatcherCondition
CreateOriginAndPathMatchesCondition(
175 const std::string
& regex
);
177 // Removes all patterns from the *_pattern_singletons_ sets that are not listed
178 // in |used_patterns|. These patterns are not referenced any more and get
// released, so that they stop consuming memory (see the note on the factory
// methods above).
180 void ForgetUnusedPatterns(
181 const std::set
<StringPattern::ID
>& used_patterns
);
183 // Returns true if this object retains no allocated data. Only for debugging.
184 bool IsEmpty() const;
187 // Creates a URLMatcherCondition according to the parameters passed.
188 // The URLMatcherCondition will refer to a StringPattern that is
189 // owned by one of the *_pattern_singletons_ sets below.
190 URLMatcherCondition
CreateCondition(URLMatcherCondition::Criterion criterion
,
191 const std::string
& pattern
);
193 // Prepends a "." to the hostname if it does not start with one.
194 std::string
CanonicalizeHostname(const std::string
& hostname
) const;
196 // Counter that ensures that all created StringPatterns have unique IDs.
197 // Note that substring patterns and regex patterns will use different IDs.
// NOTE(review): the counter member described by this comment is not visible
// in this copy of the header.
200 // This comparison considers only the pattern() value of the
// two StringPatterns being compared.
202 struct StringPatternPointerCompare
{
203 bool operator()(StringPattern
* lhs
, StringPattern
* rhs
) const;
205 // Set to ensure that we generate only one StringPattern for each content
206 // of StringPattern::pattern().
207 typedef std::set
<StringPattern
*, StringPatternPointerCompare
>
// One singleton set per kind of pattern: substring patterns, regex patterns
// and origin-and-path regex patterns.
209 PatternSingletons substring_pattern_singletons_
;
210 PatternSingletons regex_pattern_singletons_
;
211 PatternSingletons origin_and_path_regex_pattern_singletons_
;
213 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory
);
216 // This class represents a filter for the URL scheme to be hooked up into a
217 // URLMatcherConditionSet.
218 class URL_MATCHER_EXPORT URLMatcherSchemeFilter
{
// Constructs a filter from a single scheme string.
220 explicit URLMatcherSchemeFilter(const std::string
& filter
);
// Constructs a filter from a list of scheme strings.
221 explicit URLMatcherSchemeFilter(const std::vector
<std::string
>& filters
);
222 ~URLMatcherSchemeFilter();
// Returns whether |url| passes this filter.
// NOTE(review): presumably true when the scheme of |url| is one of
// |filters_| — confirm against the implementation.
223 bool IsMatch(const GURL
& url
) const;
// The scheme strings this filter was constructed with.
// NOTE(review): the single-string constructor presumably stores a
// one-element vector here — confirm.
226 std::vector
<std::string
> filters_
;
228 DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter
);
231 // This class represents a filter for port numbers to be hooked up into a
232 // URLMatcherConditionSet.
233 class URL_MATCHER_EXPORT URLMatcherPortFilter
{
235 // Boundaries of a port range (both ends are included).
236 typedef std::pair
<int, int> Range
;
// Constructs a filter from a list of inclusive port ranges.
237 explicit URLMatcherPortFilter(const std::vector
<Range
>& ranges
);
238 ~URLMatcherPortFilter();
// Returns whether |url| passes this filter.
// NOTE(review): presumably true when the port of |url| lies within one of
// |ranges_|; how a URL without an explicit port is treated is decided in the
// implementation — confirm there.
239 bool IsMatch(const GURL
& url
) const;
241 // Creates a port range [from, to]; both ends are included.
242 static Range
CreateRange(int from
, int to
);
243 // Creates a port range containing a single port.
244 static Range
CreateRange(int port
);
// The inclusive port ranges this filter was constructed with.
247 std::vector
<Range
> ranges_
;
249 DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter
);
252 // This class represents a set of conditions that all need to match on a
253 // given URL in order to be considered a match.
// Instances are reference counted (base::RefCounted) and are handled through
// scoped_refptr, see the Vector typedef below.
254 class URL_MATCHER_EXPORT URLMatcherConditionSet
255 : public base::RefCounted
<URLMatcherConditionSet
> {
// NOTE(review): the declaration of the ID type used below is not visible in
// this copy of the header.
// Conditions are stored by value in an ordered set, which relies on
// URLMatcherCondition::operator<.
258 typedef std::set
<URLMatcherCondition
> Conditions
;
// List of ref-counted condition sets, as accepted by
// URLMatcher::AddConditionSets().
259 typedef std::vector
<scoped_refptr
<URLMatcherConditionSet
> > Vector
;
261 // Matches if all conditions in |conditions| are fulfilled.
262 URLMatcherConditionSet(ID id
, const Conditions
& conditions
);
264 // Matches if all conditions in |conditions|, |scheme_filter| and
265 // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL,
266 // in which case, no restrictions are imposed on the scheme/port of a URL.
// The filters are passed as scoped_ptr by value, i.e. ownership is handed
// over to the condition set.
267 URLMatcherConditionSet(ID id
, const Conditions
& conditions
,
268 scoped_ptr
<URLMatcherSchemeFilter
> scheme_filter
,
269 scoped_ptr
<URLMatcherPortFilter
> port_filter
);
// Accessor for |id_|.
271 ID
id() const { return id_
; }
// Accessor for |conditions_|.
272 const Conditions
& conditions() const { return conditions_
; }
// Returns whether this condition set matches |url|, given the IDs of the
// StringPatterns that matched (|matching_patterns|). As described on the
// constructors, all conditions and — if present — the scheme and port
// filters need to be fulfilled.
274 bool IsMatch(const std::set
<StringPattern::ID
>& matching_patterns
,
275 const GURL
& url
) const;
// base::RefCounted is befriended so that it can invoke the destructor.
// NOTE(review): the destructor is presumably non-public (the access
// specifier is not visible here), so that instances can only be destroyed
// by releasing the last reference — confirm.
278 friend class base::RefCounted
<URLMatcherConditionSet
>;
279 ~URLMatcherConditionSet();
// The conditions that all need to be fulfilled.
281 Conditions conditions_
;
// Optional scheme restriction; NULL means no restriction.
282 scoped_ptr
<URLMatcherSchemeFilter
> scheme_filter_
;
// Optional port restriction; NULL means no restriction.
283 scoped_ptr
<URLMatcherPortFilter
> port_filter_
;
285 DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet
);
288 // This class allows matching one URL against a large set of
289 // URLMatcherConditionSets at the same time.
// The conditions of the registered sets have to be created with the factory
// returned by condition_factory().
290 class URL_MATCHER_EXPORT URLMatcher
{
295 // Adds new URLMatcherConditionSet to this URL Matcher. Each condition set
296 // must have a unique ID.
297 // This is an expensive operation as it triggers pre-calculations on the
298 // currently registered condition sets. Do not call this operation many
299 // times with a single condition set in each call.
300 void AddConditionSets(const URLMatcherConditionSet::Vector
& condition_sets
);
302 // Removes the listed condition sets. All |condition_set_ids| must be
303 // currently registered. This function should be called with large batches
304 // of |condition_set_ids| at a time to improve performance.
305 void RemoveConditionSets(
306 const std::vector
<URLMatcherConditionSet::ID
>& condition_set_ids
);
308 // Removes all unused condition sets from the ConditionFactory.
309 void ClearUnusedConditionSets();
311 // Returns the IDs of all URLMatcherConditionSet that match to this |url|.
312 std::set
<URLMatcherConditionSet::ID
> MatchURL(const GURL
& url
) const;
314 // Returns the URLMatcherConditionFactory that must be used to create
315 // URLMatcherConditionSets for this URLMatcher.
// The returned pointer refers to the |condition_factory_| member and is
// therefore only valid as long as this URLMatcher exists.
316 URLMatcherConditionFactory
* condition_factory() {
317 return &condition_factory_
;
320 // Returns true if this object retains no allocated data. Only for debugging.
321 bool IsEmpty() const;
// Internal helpers that refresh the derived data structures below.
// NOTE(review): the following descriptions are inferred from the names and
// member declarations only — confirm against the implementation.
//  - UpdateSubstringSetMatcher(): presumably refreshes |full_url_matcher_|
//    or |url_component_matcher_|, selected by |full_url_conditions|.
//  - UpdateRegexSetMatcher(): presumably refreshes the regex matchers.
//  - UpdateTriggers(): presumably rebuilds |substring_match_triggers_|.
//  - UpdateConditionFactory(): presumably lets |condition_factory_| forget
//    patterns that are no longer used.
//  - UpdateInternalDatastructures(): presumably runs all of the above.
324 void UpdateSubstringSetMatcher(bool full_url_conditions
);
325 void UpdateRegexSetMatcher();
326 void UpdateTriggers();
327 void UpdateConditionFactory();
328 void UpdateInternalDatastructures();
// Factory exposed via condition_factory(); it is a direct member of this
// URLMatcher.
330 URLMatcherConditionFactory condition_factory_
;
332 // Maps the ID of a URLMatcherConditionSet to the respective
333 // URLMatcherConditionSet.
334 typedef std::map
<URLMatcherConditionSet::ID
,
335 scoped_refptr
<URLMatcherConditionSet
> >
336 URLMatcherConditionSets
;
337 URLMatcherConditionSets url_matcher_condition_sets_
;
339 // Maps a StringPattern ID to the IDs of the URLMatcherConditionSets that need
340 // to be triggered in case of a StringPattern match.
341 typedef std::map
<StringPattern::ID
, std::set
<URLMatcherConditionSet::ID
> >
342 StringPatternTriggers
;
343 StringPatternTriggers substring_match_triggers_
;
// Substring matchers for conditions on the full URL and on the individual
// URL components, and regex matchers for regex conditions on the URL and on
// the URL without its query (origin and path).
345 SubstringSetMatcher full_url_matcher_
;
346 SubstringSetMatcher url_component_matcher_
;
347 RegexSetMatcher regex_set_matcher_
;
348 RegexSetMatcher origin_and_path_regex_set_matcher_
;
// NOTE(review): presumably the patterns currently registered with
// |full_url_matcher_| and |url_component_matcher_| respectively; the
// pointers are non-owning (the condition factory owns the StringPatterns).
349 std::set
<const StringPattern
*> registered_full_url_patterns_
;
350 std::set
<const StringPattern
*> registered_url_component_patterns_
;
352 DISALLOW_COPY_AND_ASSIGN(URLMatcher
);
355 } // namespace url_matcher
357 #endif // COMPONENTS_URL_MATCHER_URL_MATCHER_H_