1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/bookmarks/browser/bookmark_index.h"
12 #include "base/i18n/case_conversion.h"
13 #include "base/logging.h"
14 #include "base/strings/utf_offset_string_conversions.h"
15 #include "components/bookmarks/browser/bookmark_client.h"
16 #include "components/bookmarks/browser/bookmark_match.h"
17 #include "components/bookmarks/browser/bookmark_node.h"
18 #include "components/bookmarks/browser/bookmark_utils.h"
19 #include "components/query_parser/snippet.h"
20 #include "third_party/icu/source/common/unicode/normalizer2.h"
24 typedef BookmarkClient::NodeTypedCountPair NodeTypedCountPair
;
25 typedef BookmarkClient::NodeTypedCountPairs NodeTypedCountPairs
;
29 // Returns a normalized version of the UTF16 string |text|. If it fails to
30 // normalize the string, returns |text| itself as a best-effort.
31 base::string16
Normalize(const base::string16
& text
) {
32 UErrorCode status
= U_ZERO_ERROR
;
33 const icu::Normalizer2
* normalizer2
=
34 icu::Normalizer2::getInstance(NULL
, "nfkc", UNORM2_COMPOSE
, status
);
35 icu::UnicodeString
unicode_text(
36 text
.data(), static_cast<int32_t>(text
.length()));
37 icu::UnicodeString unicode_normalized_text
;
38 normalizer2
->normalize(unicode_text
, unicode_normalized_text
, status
);
39 if (U_FAILURE(status
))
41 return base::string16(unicode_normalized_text
.getBuffer(),
42 unicode_normalized_text
.length());
45 // Sort functor for NodeTypedCountPairs. We sort in decreasing order of typed
46 // count so that the best matches will always be added to the results.
47 struct NodeTypedCountPairSortFunctor
48 : std::binary_function
<NodeTypedCountPair
, NodeTypedCountPair
, bool> {
49 bool operator()(const NodeTypedCountPair
& a
,
50 const NodeTypedCountPair
& b
) const {
51 return a
.second
> b
.second
;
55 // Extract the const Node* stored in a BookmarkClient::NodeTypedCountPair.
56 struct NodeTypedCountPairExtractNodeFunctor
57 : std::unary_function
<NodeTypedCountPair
, const BookmarkNode
*> {
58 const BookmarkNode
* operator()(const NodeTypedCountPair
& pair
) const {
65 // Used when finding the set of bookmarks that match a query. Each match
66 // represents a set of terms (as an interator into the Index) matching the
67 // query as well as the set of nodes that contain those terms in their titles.
68 struct BookmarkIndex::Match
{
69 // List of terms matching the query.
70 std::list
<Index::const_iterator
> terms
;
72 // The set of nodes matching the terms. As an optimization this is empty
73 // when we match only one term, and is filled in when we get more than one
74 // term. We can do this as when we have only one matching term we know
75 // the set of matching nodes is terms.front()->second.
77 // Use nodes_begin() and nodes_end() to get an iterator over the set as
78 // it handles the necessary switching between nodes and terms.front().
81 // Returns an iterator to the beginning of the matching nodes. See
82 // description of nodes for why this should be used over nodes.begin().
83 NodeSet::const_iterator
nodes_begin() const;
85 // Returns an iterator to the beginning of the matching nodes. See
86 // description of nodes for why this should be used over nodes.end().
87 NodeSet::const_iterator
nodes_end() const;
90 BookmarkIndex::NodeSet::const_iterator
91 BookmarkIndex::Match::nodes_begin() const {
92 return nodes
.empty() ? terms
.front()->second
.begin() : nodes
.begin();
95 BookmarkIndex::NodeSet::const_iterator
BookmarkIndex::Match::nodes_end() const {
96 return nodes
.empty() ? terms
.front()->second
.end() : nodes
.end();
99 BookmarkIndex::BookmarkIndex(BookmarkClient
* client
,
100 const std::string
& languages
)
102 languages_(languages
) {
106 BookmarkIndex::~BookmarkIndex() {
109 void BookmarkIndex::Add(const BookmarkNode
* node
) {
112 std::vector
<base::string16
> terms
=
113 ExtractQueryWords(Normalize(node
->GetTitle()));
114 for (size_t i
= 0; i
< terms
.size(); ++i
)
115 RegisterNode(terms
[i
], node
);
117 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
118 for (size_t i
= 0; i
< terms
.size(); ++i
)
119 RegisterNode(terms
[i
], node
);
122 void BookmarkIndex::Remove(const BookmarkNode
* node
) {
126 std::vector
<base::string16
> terms
=
127 ExtractQueryWords(Normalize(node
->GetTitle()));
128 for (size_t i
= 0; i
< terms
.size(); ++i
)
129 UnregisterNode(terms
[i
], node
);
131 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
132 for (size_t i
= 0; i
< terms
.size(); ++i
)
133 UnregisterNode(terms
[i
], node
);
136 void BookmarkIndex::GetBookmarksMatching(const base::string16
& input_query
,
138 std::vector
<BookmarkMatch
>* results
) {
139 const base::string16 query
= Normalize(input_query
);
140 std::vector
<base::string16
> terms
= ExtractQueryWords(query
);
145 for (size_t i
= 0; i
< terms
.size(); ++i
) {
146 if (!GetBookmarksMatchingTerm(terms
[i
], i
== 0, &matches
))
151 SortMatches(matches
, &sorted_nodes
);
153 // We use a QueryParser to fill in match positions for us. It's not the most
154 // efficient way to go about this, but by the time we get here we know what
155 // matches and so this shouldn't be performance critical.
156 query_parser::QueryParser parser
;
157 ScopedVector
<query_parser::QueryNode
> query_nodes
;
158 parser
.ParseQueryNodes(query
, &query_nodes
.get());
160 // The highest typed counts should be at the beginning of the results vector
161 // so that the best matches will always be included in the results. The loop
162 // that calculates result relevance in HistoryContentsProvider::ConvertResults
163 // will run backwards to assure higher relevance will be attributed to the
165 for (Nodes::const_iterator i
= sorted_nodes
.begin();
166 i
!= sorted_nodes
.end() && results
->size() < max_count
;
168 AddMatchToResults(*i
, &parser
, query_nodes
.get(), results
);
171 void BookmarkIndex::SortMatches(const Matches
& matches
,
172 Nodes
* sorted_nodes
) const {
174 for (Matches::const_iterator i
= matches
.begin(); i
!= matches
.end(); ++i
) {
175 #if !defined(OS_ANDROID)
176 nodes
.insert(i
->nodes_begin(), i
->nodes_end());
178 // Work around a bug in the implementation of std::set::insert in the STL
179 // used on android (http://crbug.com/367050).
180 for (NodeSet::const_iterator n
= i
->nodes_begin(); n
!= i
->nodes_end(); ++n
)
181 nodes
.insert(nodes
.end(), *n
);
184 sorted_nodes
->reserve(sorted_nodes
->size() + nodes
.size());
185 if (client_
->SupportsTypedCountForNodes()) {
186 NodeTypedCountPairs node_typed_counts
;
187 client_
->GetTypedCountForNodes(nodes
, &node_typed_counts
);
188 std::sort(node_typed_counts
.begin(),
189 node_typed_counts
.end(),
190 NodeTypedCountPairSortFunctor());
191 std::transform(node_typed_counts
.begin(),
192 node_typed_counts
.end(),
193 std::back_inserter(*sorted_nodes
),
194 NodeTypedCountPairExtractNodeFunctor());
196 sorted_nodes
->insert(sorted_nodes
->end(), nodes
.begin(), nodes
.end());
200 void BookmarkIndex::AddMatchToResults(
201 const BookmarkNode
* node
,
202 query_parser::QueryParser
* parser
,
203 const query_parser::QueryNodeStarVector
& query_nodes
,
204 std::vector
<BookmarkMatch
>* results
) {
205 // Check that the result matches the query. The previous search
206 // was a simple per-word search, while the more complex matching
207 // of QueryParser may filter it out. For example, the query
208 // ["thi"] will match the bookmark titled [Thinking], but since
209 // ["thi"] is quoted we don't want to do a prefix match.
210 query_parser::QueryWordVector title_words
, url_words
;
211 const base::string16 lower_title
=
212 base::i18n::ToLower(Normalize(node
->GetTitle()));
213 parser
->ExtractQueryWords(lower_title
, &title_words
);
214 base::OffsetAdjuster::Adjustments adjustments
;
215 parser
->ExtractQueryWords(
216 CleanUpUrlForMatching(node
->url(), languages_
, &adjustments
),
218 query_parser::Snippet::MatchPositions title_matches
, url_matches
;
219 for (size_t i
= 0; i
< query_nodes
.size(); ++i
) {
220 const bool has_title_matches
=
221 query_nodes
[i
]->HasMatchIn(title_words
, &title_matches
);
222 const bool has_url_matches
=
223 query_nodes
[i
]->HasMatchIn(url_words
, &url_matches
);
224 if (!has_title_matches
&& !has_url_matches
)
226 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches
);
227 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches
);
230 if (lower_title
.length() == node
->GetTitle().length()) {
231 // Only use title matches if the lowercase string is the same length
232 // as the original string, otherwise the matches are meaningless.
233 // TODO(mpearson): revise match positions appropriately.
234 match
.title_match_positions
.swap(title_matches
);
236 // Now that we're done processing this entry, correct the offsets of the
237 // matches in |url_matches| so they point to offsets in the original URL
238 // spec, not the cleaned-up URL string that we used for matching.
239 std::vector
<size_t> offsets
=
240 BookmarkMatch::OffsetsFromMatchPositions(url_matches
);
241 base::OffsetAdjuster::UnadjustOffsets(adjustments
, &offsets
);
243 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches
, offsets
);
244 match
.url_match_positions
.swap(url_matches
);
246 results
->push_back(match
);
249 bool BookmarkIndex::GetBookmarksMatchingTerm(const base::string16
& term
,
252 Index::const_iterator i
= index_
.lower_bound(term
);
253 if (i
== index_
.end())
256 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(term
)) {
257 // Term is too short for prefix match, compare using exact match.
258 if (i
->first
!= term
)
259 return false; // No bookmarks with this term.
263 match
.terms
.push_back(i
);
264 matches
->push_back(match
);
267 CombineMatchesInPlace(i
, matches
);
268 } else if (first_term
) {
269 // This is the first term and we're doing a prefix match. Loop through
270 // index adding all entries that start with term to matches.
271 while (i
!= index_
.end() &&
272 i
->first
.size() >= term
.size() &&
273 term
.compare(0, term
.size(), i
->first
, 0, term
.size()) == 0) {
275 match
.terms
.push_back(i
);
276 matches
->push_back(match
);
280 // Prefix match and not the first term. Loop through index combining
281 // current matches in matches with term, placing result in result.
283 while (i
!= index_
.end() &&
284 i
->first
.size() >= term
.size() &&
285 term
.compare(0, term
.size(), i
->first
, 0, term
.size()) == 0) {
286 CombineMatches(i
, *matches
, &result
);
289 matches
->swap(result
);
291 return !matches
->empty();
294 void BookmarkIndex::CombineMatchesInPlace(const Index::const_iterator
& index_i
,
296 for (size_t i
= 0; i
< matches
->size(); ) {
297 Match
* match
= &((*matches
)[i
]);
298 NodeSet intersection
;
299 std::set_intersection(match
->nodes_begin(), match
->nodes_end(),
300 index_i
->second
.begin(), index_i
->second
.end(),
301 std::inserter(intersection
, intersection
.begin()));
302 if (intersection
.empty()) {
303 matches
->erase(matches
->begin() + i
);
305 match
->terms
.push_back(index_i
);
306 match
->nodes
.swap(intersection
);
312 void BookmarkIndex::CombineMatches(const Index::const_iterator
& index_i
,
313 const Matches
& current_matches
,
315 for (size_t i
= 0; i
< current_matches
.size(); ++i
) {
316 const Match
& match
= current_matches
[i
];
317 NodeSet intersection
;
318 std::set_intersection(match
.nodes_begin(), match
.nodes_end(),
319 index_i
->second
.begin(), index_i
->second
.end(),
320 std::inserter(intersection
, intersection
.begin()));
321 if (!intersection
.empty()) {
322 result
->push_back(Match());
323 Match
& combined_match
= result
->back();
324 combined_match
.terms
= match
.terms
;
325 combined_match
.terms
.push_back(index_i
);
326 combined_match
.nodes
.swap(intersection
);
331 std::vector
<base::string16
> BookmarkIndex::ExtractQueryWords(
332 const base::string16
& query
) {
333 std::vector
<base::string16
> terms
;
335 return std::vector
<base::string16
>();
336 query_parser::QueryParser parser
;
337 parser
.ParseQueryWords(base::i18n::ToLower(query
), &terms
);
341 void BookmarkIndex::RegisterNode(const base::string16
& term
,
342 const BookmarkNode
* node
) {
343 index_
[term
].insert(node
);
346 void BookmarkIndex::UnregisterNode(const base::string16
& term
,
347 const BookmarkNode
* node
) {
348 Index::iterator i
= index_
.find(term
);
349 if (i
== index_
.end()) {
350 // We can get here if the node has the same term more than once. For
351 // example, a bookmark with the title 'foo foo' would end up here.
354 i
->second
.erase(node
);
355 if (i
->second
.empty())
359 } // namespace bookmarks