1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/bookmarks/browser/bookmark_index.h"
12 #include "base/i18n/case_conversion.h"
13 #include "base/logging.h"
14 #include "base/strings/utf_offset_string_conversions.h"
15 #include "components/bookmarks/browser/bookmark_client.h"
16 #include "components/bookmarks/browser/bookmark_match.h"
17 #include "components/bookmarks/browser/bookmark_node.h"
18 #include "components/bookmarks/browser/bookmark_utils.h"
19 #include "components/query_parser/snippet.h"
20 #include "third_party/icu/source/common/unicode/normalizer2.h"
24 typedef BookmarkClient::NodeTypedCountPair NodeTypedCountPair
;
25 typedef BookmarkClient::NodeTypedCountPairs NodeTypedCountPairs
;
29 // Returns a normalized version of the UTF16 string |text|. If it fails to
30 // normalize the string, returns |text| itself as a best-effort.
31 base::string16
Normalize(const base::string16
& text
) {
32 UErrorCode status
= U_ZERO_ERROR
;
33 const icu::Normalizer2
* normalizer2
=
34 icu::Normalizer2::getInstance(NULL
, "nfkc", UNORM2_COMPOSE
, status
);
35 icu::UnicodeString
unicode_text(
36 text
.data(), static_cast<int32_t>(text
.length()));
37 icu::UnicodeString unicode_normalized_text
;
38 normalizer2
->normalize(unicode_text
, unicode_normalized_text
, status
);
39 if (U_FAILURE(status
))
41 return base::string16(unicode_normalized_text
.getBuffer(),
42 unicode_normalized_text
.length());
45 // Sort functor for NodeTypedCountPairs. We sort in decreasing order of typed
46 // count so that the best matches will always be added to the results.
47 struct NodeTypedCountPairSortFunctor
48 : std::binary_function
<NodeTypedCountPair
, NodeTypedCountPair
, bool> {
49 bool operator()(const NodeTypedCountPair
& a
,
50 const NodeTypedCountPair
& b
) const {
51 return a
.second
> b
.second
;
55 // Extract the const Node* stored in a BookmarkClient::NodeTypedCountPair.
56 struct NodeTypedCountPairExtractNodeFunctor
57 : std::unary_function
<NodeTypedCountPair
, const BookmarkNode
*> {
58 const BookmarkNode
* operator()(const NodeTypedCountPair
& pair
) const {
65 // Used when finding the set of bookmarks that match a query. Each match
66 // represents a set of terms (as an interator into the Index) matching the
67 // query as well as the set of nodes that contain those terms in their titles.
68 struct BookmarkIndex::Match
{
69 // List of terms matching the query.
70 std::list
<Index::const_iterator
> terms
;
72 // The set of nodes matching the terms. As an optimization this is empty
73 // when we match only one term, and is filled in when we get more than one
74 // term. We can do this as when we have only one matching term we know
75 // the set of matching nodes is terms.front()->second.
77 // Use nodes_begin() and nodes_end() to get an iterator over the set as
78 // it handles the necessary switching between nodes and terms.front().
81 // Returns an iterator to the beginning of the matching nodes. See
82 // description of nodes for why this should be used over nodes.begin().
83 NodeSet::const_iterator
nodes_begin() const;
85 // Returns an iterator to the beginning of the matching nodes. See
86 // description of nodes for why this should be used over nodes.end().
87 NodeSet::const_iterator
nodes_end() const;
90 BookmarkIndex::NodeSet::const_iterator
91 BookmarkIndex::Match::nodes_begin() const {
92 return nodes
.empty() ? terms
.front()->second
.begin() : nodes
.begin();
95 BookmarkIndex::NodeSet::const_iterator
BookmarkIndex::Match::nodes_end() const {
96 return nodes
.empty() ? terms
.front()->second
.end() : nodes
.end();
99 BookmarkIndex::BookmarkIndex(BookmarkClient
* client
,
100 const std::string
& languages
)
102 languages_(languages
) {
106 BookmarkIndex::~BookmarkIndex() {
109 void BookmarkIndex::Add(const BookmarkNode
* node
) {
112 std::vector
<base::string16
> terms
=
113 ExtractQueryWords(Normalize(node
->GetTitle()));
114 for (size_t i
= 0; i
< terms
.size(); ++i
)
115 RegisterNode(terms
[i
], node
);
117 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
118 for (size_t i
= 0; i
< terms
.size(); ++i
)
119 RegisterNode(terms
[i
], node
);
122 void BookmarkIndex::Remove(const BookmarkNode
* node
) {
126 std::vector
<base::string16
> terms
=
127 ExtractQueryWords(Normalize(node
->GetTitle()));
128 for (size_t i
= 0; i
< terms
.size(); ++i
)
129 UnregisterNode(terms
[i
], node
);
131 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
132 for (size_t i
= 0; i
< terms
.size(); ++i
)
133 UnregisterNode(terms
[i
], node
);
136 void BookmarkIndex::GetBookmarksMatching(
137 const base::string16
& input_query
,
139 query_parser::MatchingAlgorithm matching_algorithm
,
140 std::vector
<BookmarkMatch
>* results
) {
141 const base::string16 query
= Normalize(input_query
);
142 std::vector
<base::string16
> terms
= ExtractQueryWords(query
);
147 for (size_t i
= 0; i
< terms
.size(); ++i
) {
148 if (!GetBookmarksMatchingTerm(
149 terms
[i
], i
== 0, matching_algorithm
, &matches
)) {
155 SortMatches(matches
, &sorted_nodes
);
157 // We use a QueryParser to fill in match positions for us. It's not the most
158 // efficient way to go about this, but by the time we get here we know what
159 // matches and so this shouldn't be performance critical.
160 query_parser::QueryParser parser
;
161 ScopedVector
<query_parser::QueryNode
> query_nodes
;
162 parser
.ParseQueryNodes(query
, matching_algorithm
, &query_nodes
.get());
164 // The highest typed counts should be at the beginning of the results vector
165 // so that the best matches will always be included in the results. The loop
166 // that calculates result relevance in HistoryContentsProvider::ConvertResults
167 // will run backwards to assure higher relevance will be attributed to the
169 for (Nodes::const_iterator i
= sorted_nodes
.begin();
170 i
!= sorted_nodes
.end() && results
->size() < max_count
;
172 AddMatchToResults(*i
, &parser
, query_nodes
.get(), results
);
175 void BookmarkIndex::SortMatches(const Matches
& matches
,
176 Nodes
* sorted_nodes
) const {
178 for (Matches::const_iterator i
= matches
.begin(); i
!= matches
.end(); ++i
) {
179 #if !defined(OS_ANDROID)
180 nodes
.insert(i
->nodes_begin(), i
->nodes_end());
182 // Work around a bug in the implementation of std::set::insert in the STL
183 // used on android (http://crbug.com/367050).
184 for (NodeSet::const_iterator n
= i
->nodes_begin(); n
!= i
->nodes_end(); ++n
)
185 nodes
.insert(nodes
.end(), *n
);
188 sorted_nodes
->reserve(sorted_nodes
->size() + nodes
.size());
189 if (client_
->SupportsTypedCountForNodes()) {
190 NodeTypedCountPairs node_typed_counts
;
191 client_
->GetTypedCountForNodes(nodes
, &node_typed_counts
);
192 std::sort(node_typed_counts
.begin(),
193 node_typed_counts
.end(),
194 NodeTypedCountPairSortFunctor());
195 std::transform(node_typed_counts
.begin(),
196 node_typed_counts
.end(),
197 std::back_inserter(*sorted_nodes
),
198 NodeTypedCountPairExtractNodeFunctor());
200 sorted_nodes
->insert(sorted_nodes
->end(), nodes
.begin(), nodes
.end());
204 void BookmarkIndex::AddMatchToResults(
205 const BookmarkNode
* node
,
206 query_parser::QueryParser
* parser
,
207 const query_parser::QueryNodeStarVector
& query_nodes
,
208 std::vector
<BookmarkMatch
>* results
) {
209 // Check that the result matches the query. The previous search
210 // was a simple per-word search, while the more complex matching
211 // of QueryParser may filter it out. For example, the query
212 // ["thi"] will match the bookmark titled [Thinking], but since
213 // ["thi"] is quoted we don't want to do a prefix match.
214 query_parser::QueryWordVector title_words
, url_words
;
215 const base::string16 lower_title
=
216 base::i18n::ToLower(Normalize(node
->GetTitle()));
217 parser
->ExtractQueryWords(lower_title
, &title_words
);
218 base::OffsetAdjuster::Adjustments adjustments
;
219 parser
->ExtractQueryWords(
220 CleanUpUrlForMatching(node
->url(), languages_
, &adjustments
),
222 query_parser::Snippet::MatchPositions title_matches
, url_matches
;
223 for (size_t i
= 0; i
< query_nodes
.size(); ++i
) {
224 const bool has_title_matches
=
225 query_nodes
[i
]->HasMatchIn(title_words
, &title_matches
);
226 const bool has_url_matches
=
227 query_nodes
[i
]->HasMatchIn(url_words
, &url_matches
);
228 if (!has_title_matches
&& !has_url_matches
)
230 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches
);
231 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches
);
234 if (lower_title
.length() == node
->GetTitle().length()) {
235 // Only use title matches if the lowercase string is the same length
236 // as the original string, otherwise the matches are meaningless.
237 // TODO(mpearson): revise match positions appropriately.
238 match
.title_match_positions
.swap(title_matches
);
240 // Now that we're done processing this entry, correct the offsets of the
241 // matches in |url_matches| so they point to offsets in the original URL
242 // spec, not the cleaned-up URL string that we used for matching.
243 std::vector
<size_t> offsets
=
244 BookmarkMatch::OffsetsFromMatchPositions(url_matches
);
245 base::OffsetAdjuster::UnadjustOffsets(adjustments
, &offsets
);
247 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches
, offsets
);
248 match
.url_match_positions
.swap(url_matches
);
250 results
->push_back(match
);
253 bool BookmarkIndex::GetBookmarksMatchingTerm(
254 const base::string16
& term
,
256 query_parser::MatchingAlgorithm matching_algorithm
,
258 Index::const_iterator i
= index_
.lower_bound(term
);
259 if (i
== index_
.end())
262 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(
263 term
, matching_algorithm
)) {
264 // Term is too short for prefix match, compare using exact match.
265 if (i
->first
!= term
)
266 return false; // No bookmarks with this term.
270 match
.terms
.push_back(i
);
271 matches
->push_back(match
);
274 CombineMatchesInPlace(i
, matches
);
275 } else if (first_term
) {
276 // This is the first term and we're doing a prefix match. Loop through
277 // index adding all entries that start with term to matches.
278 while (i
!= index_
.end() &&
279 i
->first
.size() >= term
.size() &&
280 term
.compare(0, term
.size(), i
->first
, 0, term
.size()) == 0) {
282 match
.terms
.push_back(i
);
283 matches
->push_back(match
);
287 // Prefix match and not the first term. Loop through index combining
288 // current matches in matches with term, placing result in result.
290 while (i
!= index_
.end() &&
291 i
->first
.size() >= term
.size() &&
292 term
.compare(0, term
.size(), i
->first
, 0, term
.size()) == 0) {
293 CombineMatches(i
, *matches
, &result
);
296 matches
->swap(result
);
298 return !matches
->empty();
301 void BookmarkIndex::CombineMatchesInPlace(const Index::const_iterator
& index_i
,
303 for (size_t i
= 0; i
< matches
->size(); ) {
304 Match
* match
= &((*matches
)[i
]);
305 NodeSet intersection
;
306 std::set_intersection(match
->nodes_begin(), match
->nodes_end(),
307 index_i
->second
.begin(), index_i
->second
.end(),
308 std::inserter(intersection
, intersection
.begin()));
309 if (intersection
.empty()) {
310 matches
->erase(matches
->begin() + i
);
312 match
->terms
.push_back(index_i
);
313 match
->nodes
.swap(intersection
);
319 void BookmarkIndex::CombineMatches(const Index::const_iterator
& index_i
,
320 const Matches
& current_matches
,
322 for (size_t i
= 0; i
< current_matches
.size(); ++i
) {
323 const Match
& match
= current_matches
[i
];
324 NodeSet intersection
;
325 std::set_intersection(match
.nodes_begin(), match
.nodes_end(),
326 index_i
->second
.begin(), index_i
->second
.end(),
327 std::inserter(intersection
, intersection
.begin()));
328 if (!intersection
.empty()) {
329 result
->push_back(Match());
330 Match
& combined_match
= result
->back();
331 combined_match
.terms
= match
.terms
;
332 combined_match
.terms
.push_back(index_i
);
333 combined_match
.nodes
.swap(intersection
);
338 std::vector
<base::string16
> BookmarkIndex::ExtractQueryWords(
339 const base::string16
& query
) {
340 std::vector
<base::string16
> terms
;
342 return std::vector
<base::string16
>();
343 query_parser::QueryParser parser
;
344 parser
.ParseQueryWords(base::i18n::ToLower(query
),
345 query_parser::MatchingAlgorithm::DEFAULT
,
350 void BookmarkIndex::RegisterNode(const base::string16
& term
,
351 const BookmarkNode
* node
) {
352 index_
[term
].insert(node
);
355 void BookmarkIndex::UnregisterNode(const base::string16
& term
,
356 const BookmarkNode
* node
) {
357 Index::iterator i
= index_
.find(term
);
358 if (i
== index_
.end()) {
359 // We can get here if the node has the same term more than once. For
360 // example, a bookmark with the title 'foo foo' would end up here.
363 i
->second
.erase(node
);
364 if (i
->second
.empty())
368 } // namespace bookmarks