1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
6 #define COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
14 #include "base/basictypes.h"
15 #include "components/url_matcher/string_pattern.h"
16 #include "components/url_matcher/url_matcher_export.h"
18 namespace url_matcher
{
20 // Class that store a set of string patterns and can find for a string S,
21 // which string patterns occur in S.
22 class URL_MATCHER_EXPORT SubstringSetMatcher
{
24 SubstringSetMatcher();
25 ~SubstringSetMatcher();
27 // Registers all |patterns|. The ownership remains with the caller.
28 // The same pattern cannot be registered twice and each pattern needs to have
30 // Ownership of the patterns remains with the caller.
31 void RegisterPatterns(const std::vector
<const StringPattern
*>& patterns
);
33 // Unregisters the passed |patterns|.
34 void UnregisterPatterns(const std::vector
<const StringPattern
*>& patterns
);
36 // Analogous to RegisterPatterns and UnregisterPatterns but executes both
37 // operations in one step, which is cheaper in the execution.
38 void RegisterAndUnregisterPatterns(
39 const std::vector
<const StringPattern
*>& to_register
,
40 const std::vector
<const StringPattern
*>& to_unregister
);
42 // Matches |text| against all registered StringPatterns. Stores the IDs
43 // of matching patterns in |matches|. |matches| is not cleared before adding
45 bool Match(const std::string
& text
,
46 std::set
<StringPattern::ID
>* matches
) const;
48 // Returns true if this object retains no allocated data. Only for debugging.
52 // A node of an Aho Corasick Tree. This is implemented according to
53 // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
55 // The algorithm is based on the idea of building a trie of all registered
56 // patterns. Each node of the tree is annotated with a set of pattern
57 // IDs that are used to report matches.
59 // The root of the trie represents an empty match. If we were looking whether
60 // any registered pattern matches a text at the beginning of the text (i.e.
61 // whether any pattern is a prefix of the text), we could just follow
62 // nodes in the trie according to the matching characters in the text.
63 // E.g., if text == "foobar", we would follow the trie from the root node
64 // to its child labeled 'f', from there to child 'o', etc. In this process we
65 // would report all pattern IDs associated with the trie nodes as matches.
67 // As we are not looking for all prefix matches but all substring matches,
68 // this algorithm would need to compare text.substr(0), text.substr(1), ...
69 // against the trie, which is in O(|text|^2).
71 // The Aho Corasick algorithm improves this runtime by using failure edges.
72 // In case we have found a partial match of length k in the text
73 // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
74 // a node at depth k, but cannot find a match in the trie for character
75 // text[i + k] at depth k + 1, we follow a failure edge. This edge
76 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
77 // is a prefix of any registered pattern.
79 // If your brain thinks "Forget it, let's go shopping.", don't worry.
80 // Take a nap and read an introductory text on the Aho Corasick algorithm.
81 // It will make sense. Eventually.
82 class AhoCorasickNode
{
84 // Key: label of the edge, value: node index in |tree_| of parent class.
85 typedef std::map
<char, uint32
> Edges
;
86 typedef std::set
<StringPattern::ID
> Matches
;
88 static const uint32 kNoSuchEdge
; // Represents an invalid node index.
92 AhoCorasickNode(const AhoCorasickNode
& other
);
93 AhoCorasickNode
& operator=(const AhoCorasickNode
& other
);
95 uint32
GetEdge(char c
) const;
96 void SetEdge(char c
, uint32 node
);
97 const Edges
& edges() const { return edges_
; }
99 uint32
failure() const { return failure_
; }
100 void set_failure(uint32 failure
) { failure_
= failure
; }
102 void AddMatch(StringPattern::ID id
);
103 void AddMatches(const Matches
& matches
);
104 const Matches
& matches() const { return matches_
; }
107 // Outgoing edges of current node.
110 // Node index that failure edge leads to.
113 // Identifiers of matches.
117 typedef std::map
<StringPattern::ID
, const StringPattern
*> SubstringPatternMap
;
118 typedef std::vector
<const StringPattern
*> SubstringPatternVector
;
120 // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
121 void RebuildAhoCorasickTree(const SubstringPatternVector
& sorted_patterns
);
123 // Inserts a path for |pattern->pattern()| into the tree and adds
124 // |pattern->id()| to the set of matches. Ownership of |pattern| remains with
126 void InsertPatternIntoAhoCorasickTree(const StringPattern
* pattern
);
127 void CreateFailureEdges();
129 // Set of all registered StringPatterns. Used to regenerate the
130 // Aho-Corasick tree in case patterns are registered or unregistered.
131 SubstringPatternMap patterns_
;
133 // The nodes of a Aho-Corasick tree.
134 std::vector
<AhoCorasickNode
> tree_
;
136 DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher
);
139 } // namespace url_matcher
141 #endif // COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_