Roll src/third_party/WebKit c63b89c:29324ab (svn 202546:202547)
[chromium-blink-merge.git] / components / url_matcher / substring_set_matcher.cc
blob848c86300e8482da1afad5962fbedc5ced86129d
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/url_matcher/substring_set_matcher.h"
7 #include <algorithm>
8 #include <queue>
10 #include "base/logging.h"
11 #include "base/stl_util.h"
13 namespace url_matcher {
15 namespace {
17 // Compare StringPattern instances based on their string patterns.
18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
19 return a->pattern() < b->pattern();
22 // Given the set of patterns, compute how many nodes will the corresponding
23 // Aho-Corasick tree have. Note that |patterns| need to be sorted.
24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) {
25 uint32 result = 1u; // 1 for the root node.
26 if (patterns.empty())
27 return result;
29 std::vector<const StringPattern*>::const_iterator last = patterns.begin();
30 std::vector<const StringPattern*>::const_iterator current = last + 1;
31 // For the first pattern, each letter is a label of an edge to a new node.
32 result += (*last)->pattern().size();
34 // For the subsequent patterns, only count the edges which were not counted
35 // yet. For this it suffices to test against the previous pattern, because the
36 // patterns are sorted.
37 for (; current != patterns.end(); ++last, ++current) {
38 const std::string& last_pattern = (*last)->pattern();
39 const std::string& current_pattern = (*current)->pattern();
40 const uint32 prefix_bound =
41 std::min(last_pattern.size(), current_pattern.size());
43 uint32 common_prefix = 0;
44 while (common_prefix < prefix_bound &&
45 last_pattern[common_prefix] == current_pattern[common_prefix])
46 ++common_prefix;
47 result += current_pattern.size() - common_prefix;
49 return result;
52 } // namespace
55 // SubstringSetMatcher
58 SubstringSetMatcher::SubstringSetMatcher() {
59 RebuildAhoCorasickTree(SubstringPatternVector());
62 SubstringSetMatcher::~SubstringSetMatcher() {}
64 void SubstringSetMatcher::RegisterPatterns(
65 const std::vector<const StringPattern*>& patterns) {
66 RegisterAndUnregisterPatterns(patterns,
67 std::vector<const StringPattern*>());
70 void SubstringSetMatcher::UnregisterPatterns(
71 const std::vector<const StringPattern*>& patterns) {
72 RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(),
73 patterns);
76 void SubstringSetMatcher::RegisterAndUnregisterPatterns(
77 const std::vector<const StringPattern*>& to_register,
78 const std::vector<const StringPattern*>& to_unregister) {
79 // Register patterns.
80 for (std::vector<const StringPattern*>::const_iterator i =
81 to_register.begin(); i != to_register.end(); ++i) {
82 DCHECK(patterns_.find((*i)->id()) == patterns_.end());
83 patterns_[(*i)->id()] = *i;
86 // Unregister patterns
87 for (std::vector<const StringPattern*>::const_iterator i =
88 to_unregister.begin(); i != to_unregister.end(); ++i) {
89 patterns_.erase((*i)->id());
92 // Now we compute the total number of tree nodes needed.
93 SubstringPatternVector sorted_patterns;
94 sorted_patterns.resize(patterns_.size());
96 size_t next = 0;
97 for (SubstringPatternMap::const_iterator i = patterns_.begin();
98 i != patterns_.end();
99 ++i, ++next) {
100 sorted_patterns[next] = i->second;
103 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns);
104 tree_.reserve(TreeSize(sorted_patterns));
106 RebuildAhoCorasickTree(sorted_patterns);
109 bool SubstringSetMatcher::Match(const std::string& text,
110 std::set<StringPattern::ID>* matches) const {
111 const size_t old_number_of_matches = matches->size();
113 // Handle patterns matching the empty string.
114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
116 uint32 current_node = 0;
117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) {
118 uint32 edge_from_current = tree_[current_node].GetEdge(*i);
119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge &&
120 current_node != 0) {
121 current_node = tree_[current_node].failure();
122 edge_from_current = tree_[current_node].GetEdge(*i);
124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
125 current_node = edge_from_current;
126 matches->insert(tree_[current_node].matches().begin(),
127 tree_[current_node].matches().end());
128 } else {
129 DCHECK_EQ(0u, current_node);
133 return old_number_of_matches != matches->size();
136 bool SubstringSetMatcher::IsEmpty() const {
137 // An empty tree consists of only the root node.
138 return patterns_.empty() && tree_.size() == 1u;
141 void SubstringSetMatcher::RebuildAhoCorasickTree(
142 const SubstringPatternVector& sorted_patterns) {
143 tree_.clear();
145 // Initialize root note of tree.
146 AhoCorasickNode root;
147 root.set_failure(0);
148 tree_.push_back(root);
150 // Insert all patterns.
151 for (SubstringPatternVector::const_iterator i = sorted_patterns.begin();
152 i != sorted_patterns.end();
153 ++i) {
154 InsertPatternIntoAhoCorasickTree(*i);
157 CreateFailureEdges();
160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
161 const StringPattern* pattern) {
162 const std::string& text = pattern->pattern();
163 const std::string::const_iterator text_end = text.end();
165 // Iterators on the tree and the text.
166 uint32 current_node = 0;
167 std::string::const_iterator i = text.begin();
169 // Follow existing paths for as long as possible.
170 while (i != text_end) {
171 uint32 edge_from_current = tree_[current_node].GetEdge(*i);
172 if (edge_from_current == AhoCorasickNode::kNoSuchEdge)
173 break;
174 current_node = edge_from_current;
175 ++i;
178 // Create new nodes if necessary.
179 while (i != text_end) {
180 tree_.push_back(AhoCorasickNode());
181 tree_[current_node].SetEdge(*i, tree_.size() - 1);
182 current_node = tree_.size() - 1;
183 ++i;
186 // Register match.
187 tree_[current_node].AddMatch(pattern->id());
190 void SubstringSetMatcher::CreateFailureEdges() {
191 typedef AhoCorasickNode::Edges Edges;
193 std::queue<uint32> queue;
195 AhoCorasickNode& root = tree_[0];
196 root.set_failure(0);
197 const Edges& root_edges = root.edges();
198 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end();
199 ++e) {
200 const uint32& leads_to = e->second;
201 tree_[leads_to].set_failure(0);
202 queue.push(leads_to);
205 while (!queue.empty()) {
206 AhoCorasickNode& current_node = tree_[queue.front()];
207 queue.pop();
208 for (Edges::const_iterator e = current_node.edges().begin();
209 e != current_node.edges().end(); ++e) {
210 const char& edge_label = e->first;
211 const uint32& leads_to = e->second;
212 queue.push(leads_to);
214 uint32 failure = current_node.failure();
215 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label);
216 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge &&
217 failure != 0) {
218 failure = tree_[failure].failure();
219 edge_from_failure = tree_[failure].GetEdge(edge_label);
222 const uint32 follow_in_case_of_failure =
223 edge_from_failure != AhoCorasickNode::kNoSuchEdge
224 ? edge_from_failure
225 : 0;
226 tree_[leads_to].set_failure(follow_in_case_of_failure);
227 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF;
234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
235 : failure_(kNoSuchEdge) {}
237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(
240 const SubstringSetMatcher::AhoCorasickNode& other)
241 : edges_(other.edges_),
242 failure_(other.failure_),
243 matches_(other.matches_) {}
245 SubstringSetMatcher::AhoCorasickNode&
246 SubstringSetMatcher::AhoCorasickNode::operator=(
247 const SubstringSetMatcher::AhoCorasickNode& other) {
248 edges_ = other.edges_;
249 failure_ = other.failure_;
250 matches_ = other.matches_;
251 return *this;
254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
255 Edges::const_iterator i = edges_.find(c);
256 return i == edges_.end() ? kNoSuchEdge : i->second;
259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) {
260 edges_[c] = node;
263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) {
264 matches_.insert(id);
267 void SubstringSetMatcher::AhoCorasickNode::AddMatches(
268 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) {
269 matches_.insert(matches.begin(), matches.end());
272 } // namespace url_matcher