1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/bookmarks/browser/bookmark_index.h"
12 #include "base/i18n/case_conversion.h"
13 #include "base/logging.h"
14 #include "base/strings/utf_offset_string_conversions.h"
15 #include "components/bookmarks/browser/bookmark_client.h"
16 #include "components/bookmarks/browser/bookmark_match.h"
17 #include "components/bookmarks/browser/bookmark_node.h"
18 #include "components/bookmarks/browser/bookmark_utils.h"
19 #include "components/query_parser/snippet.h"
20 #include "third_party/icu/source/common/unicode/normalizer2.h"
24 typedef BookmarkClient::NodeTypedCountPair NodeTypedCountPair
;
25 typedef BookmarkClient::NodeTypedCountPairs NodeTypedCountPairs
;
29 // Returns a normalized version of the UTF16 string |text|. If it fails to
30 // normalize the string, returns |text| itself as a best-effort.
31 base::string16
Normalize(const base::string16
& text
) {
32 UErrorCode status
= U_ZERO_ERROR
;
33 const icu::Normalizer2
* normalizer2
=
34 icu::Normalizer2::getInstance(NULL
, "nfkc", UNORM2_COMPOSE
, status
);
35 icu::UnicodeString
unicode_text(
36 text
.data(), static_cast<int32_t>(text
.length()));
37 icu::UnicodeString unicode_normalized_text
;
38 normalizer2
->normalize(unicode_text
, unicode_normalized_text
, status
);
39 if (U_FAILURE(status
))
41 return base::string16(unicode_normalized_text
.getBuffer(),
42 unicode_normalized_text
.length());
45 // Sort functor for NodeTypedCountPairs. We sort in decreasing order of typed
46 // count so that the best matches will always be added to the results.
47 struct NodeTypedCountPairSortFunctor
48 : std::binary_function
<NodeTypedCountPair
, NodeTypedCountPair
, bool> {
49 bool operator()(const NodeTypedCountPair
& a
,
50 const NodeTypedCountPair
& b
) const {
51 return a
.second
> b
.second
;
55 // Extract the const Node* stored in a BookmarkClient::NodeTypedCountPair.
56 struct NodeTypedCountPairExtractNodeFunctor
57 : std::unary_function
<NodeTypedCountPair
, const BookmarkNode
*> {
58 const BookmarkNode
* operator()(const NodeTypedCountPair
& pair
) const {
65 // Used when finding the set of bookmarks that match a query. Each match
66 // represents a set of terms (as an interator into the Index) matching the
67 // query as well as the set of nodes that contain those terms in their titles.
68 struct BookmarkIndex::Match
{
69 // List of terms matching the query.
70 std::list
<Index::const_iterator
> terms
;
72 // The set of nodes matching the terms. As an optimization this is empty
73 // when we match only one term, and is filled in when we get more than one
74 // term. We can do this as when we have only one matching term we know
75 // the set of matching nodes is terms.front()->second.
77 // Use nodes_begin() and nodes_end() to get an iterator over the set as
78 // it handles the necessary switching between nodes and terms.front().
81 // Returns an iterator to the beginning of the matching nodes. See
82 // description of nodes for why this should be used over nodes.begin().
83 NodeSet::const_iterator
nodes_begin() const;
85 // Returns an iterator to the beginning of the matching nodes. See
86 // description of nodes for why this should be used over nodes.end().
87 NodeSet::const_iterator
nodes_end() const;
90 BookmarkIndex::NodeSet::const_iterator
91 BookmarkIndex::Match::nodes_begin() const {
92 return nodes
.empty() ? terms
.front()->second
.begin() : nodes
.begin();
95 BookmarkIndex::NodeSet::const_iterator
BookmarkIndex::Match::nodes_end() const {
96 return nodes
.empty() ? terms
.front()->second
.end() : nodes
.end();
99 BookmarkIndex::BookmarkIndex(BookmarkClient
* client
,
101 const std::string
& languages
)
103 languages_(languages
),
104 index_urls_(index_urls
) {
108 BookmarkIndex::~BookmarkIndex() {
111 void BookmarkIndex::Add(const BookmarkNode
* node
) {
114 std::vector
<base::string16
> terms
=
115 ExtractQueryWords(Normalize(node
->GetTitle()));
116 for (size_t i
= 0; i
< terms
.size(); ++i
)
117 RegisterNode(terms
[i
], node
);
120 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
121 for (size_t i
= 0; i
< terms
.size(); ++i
)
122 RegisterNode(terms
[i
], node
);
126 void BookmarkIndex::Remove(const BookmarkNode
* node
) {
130 std::vector
<base::string16
> terms
=
131 ExtractQueryWords(Normalize(node
->GetTitle()));
132 for (size_t i
= 0; i
< terms
.size(); ++i
)
133 UnregisterNode(terms
[i
], node
);
136 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
137 for (size_t i
= 0; i
< terms
.size(); ++i
)
138 UnregisterNode(terms
[i
], node
);
142 void BookmarkIndex::GetBookmarksMatching(const base::string16
& input_query
,
144 std::vector
<BookmarkMatch
>* results
) {
145 const base::string16 query
= Normalize(input_query
);
146 std::vector
<base::string16
> terms
= ExtractQueryWords(query
);
151 for (size_t i
= 0; i
< terms
.size(); ++i
) {
152 if (!GetBookmarksMatchingTerm(terms
[i
], i
== 0, &matches
))
157 SortMatches(matches
, &sorted_nodes
);
159 // We use a QueryParser to fill in match positions for us. It's not the most
160 // efficient way to go about this, but by the time we get here we know what
161 // matches and so this shouldn't be performance critical.
162 query_parser::QueryParser parser
;
163 ScopedVector
<query_parser::QueryNode
> query_nodes
;
164 parser
.ParseQueryNodes(query
, &query_nodes
.get());
166 // The highest typed counts should be at the beginning of the results vector
167 // so that the best matches will always be included in the results. The loop
168 // that calculates result relevance in HistoryContentsProvider::ConvertResults
169 // will run backwards to assure higher relevance will be attributed to the
171 for (Nodes::const_iterator i
= sorted_nodes
.begin();
172 i
!= sorted_nodes
.end() && results
->size() < max_count
;
174 AddMatchToResults(*i
, &parser
, query_nodes
.get(), results
);
177 void BookmarkIndex::SortMatches(const Matches
& matches
,
178 Nodes
* sorted_nodes
) const {
180 for (Matches::const_iterator i
= matches
.begin(); i
!= matches
.end(); ++i
) {
181 #if !defined(OS_ANDROID)
182 nodes
.insert(i
->nodes_begin(), i
->nodes_end());
184 // Work around a bug in the implementation of std::set::insert in the STL
185 // used on android (http://crbug.com/367050).
186 for (NodeSet::const_iterator n
= i
->nodes_begin(); n
!= i
->nodes_end(); ++n
)
187 nodes
.insert(nodes
.end(), *n
);
190 sorted_nodes
->reserve(sorted_nodes
->size() + nodes
.size());
191 if (client_
->SupportsTypedCountForNodes()) {
192 NodeTypedCountPairs node_typed_counts
;
193 client_
->GetTypedCountForNodes(nodes
, &node_typed_counts
);
194 std::sort(node_typed_counts
.begin(),
195 node_typed_counts
.end(),
196 NodeTypedCountPairSortFunctor());
197 std::transform(node_typed_counts
.begin(),
198 node_typed_counts
.end(),
199 std::back_inserter(*sorted_nodes
),
200 NodeTypedCountPairExtractNodeFunctor());
202 sorted_nodes
->insert(sorted_nodes
->end(), nodes
.begin(), nodes
.end());
206 void BookmarkIndex::AddMatchToResults(
207 const BookmarkNode
* node
,
208 query_parser::QueryParser
* parser
,
209 const query_parser::QueryNodeStarVector
& query_nodes
,
210 std::vector
<BookmarkMatch
>* results
) {
211 // Check that the result matches the query. The previous search
212 // was a simple per-word search, while the more complex matching
213 // of QueryParser may filter it out. For example, the query
214 // ["thi"] will match the bookmark titled [Thinking], but since
215 // ["thi"] is quoted we don't want to do a prefix match.
216 query_parser::QueryWordVector title_words
, url_words
;
217 const base::string16 lower_title
=
218 base::i18n::ToLower(Normalize(node
->GetTitle()));
219 parser
->ExtractQueryWords(lower_title
, &title_words
);
220 base::OffsetAdjuster::Adjustments adjustments
;
222 parser
->ExtractQueryWords(
223 CleanUpUrlForMatching(node
->url(), languages_
, &adjustments
),
226 query_parser::Snippet::MatchPositions title_matches
, url_matches
;
227 for (size_t i
= 0; i
< query_nodes
.size(); ++i
) {
228 const bool has_title_matches
=
229 query_nodes
[i
]->HasMatchIn(title_words
, &title_matches
);
230 const bool has_url_matches
= index_urls_
&&
231 query_nodes
[i
]->HasMatchIn(url_words
, &url_matches
);
232 if (!has_title_matches
&& !has_url_matches
)
234 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches
);
236 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches
);
239 if (lower_title
.length() == node
->GetTitle().length()) {
240 // Only use title matches if the lowercase string is the same length
241 // as the original string, otherwise the matches are meaningless.
242 // TODO(mpearson): revise match positions appropriately.
243 match
.title_match_positions
.swap(title_matches
);
246 // Now that we're done processing this entry, correct the offsets of the
247 // matches in |url_matches| so they point to offsets in the original URL
248 // spec, not the cleaned-up URL string that we used for matching.
249 std::vector
<size_t> offsets
=
250 BookmarkMatch::OffsetsFromMatchPositions(url_matches
);
251 base::OffsetAdjuster::UnadjustOffsets(adjustments
, &offsets
);
253 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches
, offsets
);
254 match
.url_match_positions
.swap(url_matches
);
257 results
->push_back(match
);
260 bool BookmarkIndex::GetBookmarksMatchingTerm(const base::string16
& term
,
263 Index::const_iterator i
= index_
.lower_bound(term
);
264 if (i
== index_
.end())
267 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(term
)) {
268 // Term is too short for prefix match, compare using exact match.
269 if (i
->first
!= term
)
270 return false; // No bookmarks with this term.
274 match
.terms
.push_back(i
);
275 matches
->push_back(match
);
278 CombineMatchesInPlace(i
, matches
);
279 } else if (first_term
) {
280 // This is the first term and we're doing a prefix match. Loop through
281 // index adding all entries that start with term to matches.
282 while (i
!= index_
.end() &&
283 i
->first
.size() >= term
.size() &&
284 term
.compare(0, term
.size(), i
->first
, 0, term
.size()) == 0) {
286 match
.terms
.push_back(i
);
287 matches
->push_back(match
);
291 // Prefix match and not the first term. Loop through index combining
292 // current matches in matches with term, placing result in result.
294 while (i
!= index_
.end() &&
295 i
->first
.size() >= term
.size() &&
296 term
.compare(0, term
.size(), i
->first
, 0, term
.size()) == 0) {
297 CombineMatches(i
, *matches
, &result
);
300 matches
->swap(result
);
302 return !matches
->empty();
305 void BookmarkIndex::CombineMatchesInPlace(const Index::const_iterator
& index_i
,
307 for (size_t i
= 0; i
< matches
->size(); ) {
308 Match
* match
= &((*matches
)[i
]);
309 NodeSet intersection
;
310 std::set_intersection(match
->nodes_begin(), match
->nodes_end(),
311 index_i
->second
.begin(), index_i
->second
.end(),
312 std::inserter(intersection
, intersection
.begin()));
313 if (intersection
.empty()) {
314 matches
->erase(matches
->begin() + i
);
316 match
->terms
.push_back(index_i
);
317 match
->nodes
.swap(intersection
);
323 void BookmarkIndex::CombineMatches(const Index::const_iterator
& index_i
,
324 const Matches
& current_matches
,
326 for (size_t i
= 0; i
< current_matches
.size(); ++i
) {
327 const Match
& match
= current_matches
[i
];
328 NodeSet intersection
;
329 std::set_intersection(match
.nodes_begin(), match
.nodes_end(),
330 index_i
->second
.begin(), index_i
->second
.end(),
331 std::inserter(intersection
, intersection
.begin()));
332 if (!intersection
.empty()) {
333 result
->push_back(Match());
334 Match
& combined_match
= result
->back();
335 combined_match
.terms
= match
.terms
;
336 combined_match
.terms
.push_back(index_i
);
337 combined_match
.nodes
.swap(intersection
);
342 std::vector
<base::string16
> BookmarkIndex::ExtractQueryWords(
343 const base::string16
& query
) {
344 std::vector
<base::string16
> terms
;
346 return std::vector
<base::string16
>();
347 query_parser::QueryParser parser
;
348 parser
.ParseQueryWords(base::i18n::ToLower(query
), &terms
);
352 void BookmarkIndex::RegisterNode(const base::string16
& term
,
353 const BookmarkNode
* node
) {
354 index_
[term
].insert(node
);
357 void BookmarkIndex::UnregisterNode(const base::string16
& term
,
358 const BookmarkNode
* node
) {
359 Index::iterator i
= index_
.find(term
);
360 if (i
== index_
.end()) {
361 // We can get here if the node has the same term more than once. For
362 // example, a bookmark with the title 'foo foo' would end up here.
365 i
->second
.erase(node
);
366 if (i
->second
.empty())
370 } // namespace bookmarks