1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/url_matcher/substring_set_matcher.h"
10 #include "base/logging.h"
11 #include "base/stl_util.h"
13 namespace url_matcher
{
17 // Compare StringPattern instances based on their string patterns.
18 bool ComparePatterns(const StringPattern
* a
, const StringPattern
* b
) {
19 return a
->pattern() < b
->pattern();
22 // Given the set of patterns, compute how many nodes will the corresponding
23 // Aho-Corasick tree have. Note that |patterns| need to be sorted.
24 uint32
TreeSize(const std::vector
<const StringPattern
*>& patterns
) {
25 uint32 result
= 1u; // 1 for the root node.
29 std::vector
<const StringPattern
*>::const_iterator last
= patterns
.begin();
30 std::vector
<const StringPattern
*>::const_iterator current
= last
+ 1;
31 // For the first pattern, each letter is a label of an edge to a new node.
32 result
+= (*last
)->pattern().size();
34 // For the subsequent patterns, only count the edges which were not counted
35 // yet. For this it suffices to test against the previous pattern, because the
36 // patterns are sorted.
37 for (; current
!= patterns
.end(); ++last
, ++current
) {
38 const std::string
& last_pattern
= (*last
)->pattern();
39 const std::string
& current_pattern
= (*current
)->pattern();
40 const uint32 prefix_bound
=
41 std::min(last_pattern
.size(), current_pattern
.size());
43 uint32 common_prefix
= 0;
44 while (common_prefix
< prefix_bound
&&
45 last_pattern
[common_prefix
] == current_pattern
[common_prefix
])
47 result
+= current_pattern
.size() - common_prefix
;
55 // SubstringSetMatcher
58 SubstringSetMatcher::SubstringSetMatcher() {
59 RebuildAhoCorasickTree(SubstringPatternVector());
62 SubstringSetMatcher::~SubstringSetMatcher() {}
64 void SubstringSetMatcher::RegisterPatterns(
65 const std::vector
<const StringPattern
*>& patterns
) {
66 RegisterAndUnregisterPatterns(patterns
,
67 std::vector
<const StringPattern
*>());
70 void SubstringSetMatcher::UnregisterPatterns(
71 const std::vector
<const StringPattern
*>& patterns
) {
72 RegisterAndUnregisterPatterns(std::vector
<const StringPattern
*>(),
76 void SubstringSetMatcher::RegisterAndUnregisterPatterns(
77 const std::vector
<const StringPattern
*>& to_register
,
78 const std::vector
<const StringPattern
*>& to_unregister
) {
80 for (std::vector
<const StringPattern
*>::const_iterator i
=
81 to_register
.begin(); i
!= to_register
.end(); ++i
) {
82 DCHECK(patterns_
.find((*i
)->id()) == patterns_
.end());
83 patterns_
[(*i
)->id()] = *i
;
86 // Unregister patterns
87 for (std::vector
<const StringPattern
*>::const_iterator i
=
88 to_unregister
.begin(); i
!= to_unregister
.end(); ++i
) {
89 patterns_
.erase((*i
)->id());
92 // Now we compute the total number of tree nodes needed.
93 SubstringPatternVector sorted_patterns
;
94 sorted_patterns
.resize(patterns_
.size());
97 for (SubstringPatternMap::const_iterator i
= patterns_
.begin();
100 sorted_patterns
[next
] = i
->second
;
103 std::sort(sorted_patterns
.begin(), sorted_patterns
.end(), ComparePatterns
);
104 tree_
.reserve(TreeSize(sorted_patterns
));
106 RebuildAhoCorasickTree(sorted_patterns
);
109 bool SubstringSetMatcher::Match(const std::string
& text
,
110 std::set
<StringPattern::ID
>* matches
) const {
111 const size_t old_number_of_matches
= matches
->size();
113 // Handle patterns matching the empty string.
114 matches
->insert(tree_
[0].matches().begin(), tree_
[0].matches().end());
116 uint32 current_node
= 0;
117 for (std::string::const_iterator i
= text
.begin(); i
!= text
.end(); ++i
) {
118 uint32 edge_from_current
= tree_
[current_node
].GetEdge(*i
);
119 while (edge_from_current
== AhoCorasickNode::kNoSuchEdge
&&
121 current_node
= tree_
[current_node
].failure();
122 edge_from_current
= tree_
[current_node
].GetEdge(*i
);
124 if (edge_from_current
!= AhoCorasickNode::kNoSuchEdge
) {
125 current_node
= edge_from_current
;
126 matches
->insert(tree_
[current_node
].matches().begin(),
127 tree_
[current_node
].matches().end());
129 DCHECK_EQ(0u, current_node
);
133 return old_number_of_matches
!= matches
->size();
136 bool SubstringSetMatcher::IsEmpty() const {
137 // An empty tree consists of only the root node.
138 return patterns_
.empty() && tree_
.size() == 1u;
141 void SubstringSetMatcher::RebuildAhoCorasickTree(
142 const SubstringPatternVector
& sorted_patterns
) {
145 // Initialize root note of tree.
146 AhoCorasickNode root
;
148 tree_
.push_back(root
);
150 // Insert all patterns.
151 for (SubstringPatternVector::const_iterator i
= sorted_patterns
.begin();
152 i
!= sorted_patterns
.end();
154 InsertPatternIntoAhoCorasickTree(*i
);
157 CreateFailureEdges();
160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
161 const StringPattern
* pattern
) {
162 const std::string
& text
= pattern
->pattern();
163 const std::string::const_iterator text_end
= text
.end();
165 // Iterators on the tree and the text.
166 uint32 current_node
= 0;
167 std::string::const_iterator i
= text
.begin();
169 // Follow existing paths for as long as possible.
170 while (i
!= text_end
) {
171 uint32 edge_from_current
= tree_
[current_node
].GetEdge(*i
);
172 if (edge_from_current
== AhoCorasickNode::kNoSuchEdge
)
174 current_node
= edge_from_current
;
178 // Create new nodes if necessary.
179 while (i
!= text_end
) {
180 tree_
.push_back(AhoCorasickNode());
181 tree_
[current_node
].SetEdge(*i
, tree_
.size() - 1);
182 current_node
= tree_
.size() - 1;
187 tree_
[current_node
].AddMatch(pattern
->id());
190 void SubstringSetMatcher::CreateFailureEdges() {
191 typedef AhoCorasickNode::Edges Edges
;
193 std::queue
<uint32
> queue
;
195 AhoCorasickNode
& root
= tree_
[0];
197 const Edges
& root_edges
= root
.edges();
198 for (Edges::const_iterator e
= root_edges
.begin(); e
!= root_edges
.end();
200 const uint32
& leads_to
= e
->second
;
201 tree_
[leads_to
].set_failure(0);
202 queue
.push(leads_to
);
205 while (!queue
.empty()) {
206 AhoCorasickNode
& current_node
= tree_
[queue
.front()];
208 for (Edges::const_iterator e
= current_node
.edges().begin();
209 e
!= current_node
.edges().end(); ++e
) {
210 const char& edge_label
= e
->first
;
211 const uint32
& leads_to
= e
->second
;
212 queue
.push(leads_to
);
214 uint32 failure
= current_node
.failure();
215 uint32 edge_from_failure
= tree_
[failure
].GetEdge(edge_label
);
216 while (edge_from_failure
== AhoCorasickNode::kNoSuchEdge
&&
218 failure
= tree_
[failure
].failure();
219 edge_from_failure
= tree_
[failure
].GetEdge(edge_label
);
222 const uint32 follow_in_case_of_failure
=
223 edge_from_failure
!= AhoCorasickNode::kNoSuchEdge
226 tree_
[leads_to
].set_failure(follow_in_case_of_failure
);
227 tree_
[leads_to
].AddMatches(tree_
[follow_in_case_of_failure
].matches());
232 const uint32
SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge
= 0xFFFFFFFF;
234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
235 : failure_(kNoSuchEdge
) {}
237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(
240 const SubstringSetMatcher::AhoCorasickNode
& other
)
241 : edges_(other
.edges_
),
242 failure_(other
.failure_
),
243 matches_(other
.matches_
) {}
245 SubstringSetMatcher::AhoCorasickNode
&
246 SubstringSetMatcher::AhoCorasickNode::operator=(
247 const SubstringSetMatcher::AhoCorasickNode
& other
) {
248 edges_
= other
.edges_
;
249 failure_
= other
.failure_
;
250 matches_
= other
.matches_
;
254 uint32
SubstringSetMatcher::AhoCorasickNode::GetEdge(char c
) const {
255 Edges::const_iterator i
= edges_
.find(c
);
256 return i
== edges_
.end() ? kNoSuchEdge
: i
->second
;
259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c
, uint32 node
) {
263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id
) {
267 void SubstringSetMatcher::AhoCorasickNode::AddMatches(
268 const SubstringSetMatcher::AhoCorasickNode::Matches
& matches
) {
269 matches_
.insert(matches
.begin(), matches
.end());
272 } // namespace url_matcher