1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/bookmarks/browser/bookmark_index.h"
12 #include "base/i18n/case_conversion.h"
13 #include "base/logging.h"
14 #include "base/stl_util.h"
15 #include "base/strings/utf_offset_string_conversions.h"
16 #include "components/bookmarks/browser/bookmark_client.h"
17 #include "components/bookmarks/browser/bookmark_match.h"
18 #include "components/bookmarks/browser/bookmark_node.h"
19 #include "components/bookmarks/browser/bookmark_utils.h"
20 #include "components/query_parser/snippet.h"
21 #include "third_party/icu/source/common/unicode/normalizer2.h"
25 typedef BookmarkClient::NodeTypedCountPair NodeTypedCountPair
;
26 typedef BookmarkClient::NodeTypedCountPairs NodeTypedCountPairs
;
30 // Returns a normalized version of the UTF16 string |text|. If it fails to
31 // normalize the string, returns |text| itself as a best-effort.
32 base::string16
Normalize(const base::string16
& text
) {
33 UErrorCode status
= U_ZERO_ERROR
;
34 const icu::Normalizer2
* normalizer2
=
35 icu::Normalizer2::getInstance(NULL
, "nfkc", UNORM2_COMPOSE
, status
);
36 icu::UnicodeString
unicode_text(
37 text
.data(), static_cast<int32_t>(text
.length()));
38 icu::UnicodeString unicode_normalized_text
;
39 normalizer2
->normalize(unicode_text
, unicode_normalized_text
, status
);
40 if (U_FAILURE(status
))
42 return base::string16(unicode_normalized_text
.getBuffer(),
43 unicode_normalized_text
.length());
46 // Sort functor for NodeTypedCountPairs. We sort in decreasing order of typed
47 // count so that the best matches will always be added to the results.
48 struct NodeTypedCountPairSortFunctor
49 : std::binary_function
<NodeTypedCountPair
, NodeTypedCountPair
, bool> {
50 bool operator()(const NodeTypedCountPair
& a
,
51 const NodeTypedCountPair
& b
) const {
52 return a
.second
> b
.second
;
56 // Extract the const Node* stored in a BookmarkClient::NodeTypedCountPair.
57 struct NodeTypedCountPairExtractNodeFunctor
58 : std::unary_function
<NodeTypedCountPair
, const BookmarkNode
*> {
59 const BookmarkNode
* operator()(const NodeTypedCountPair
& pair
) const {
66 BookmarkIndex::BookmarkIndex(BookmarkClient
* client
,
67 const std::string
& languages
)
69 languages_(languages
) {
73 BookmarkIndex::~BookmarkIndex() {
76 void BookmarkIndex::Add(const BookmarkNode
* node
) {
79 std::vector
<base::string16
> terms
=
80 ExtractQueryWords(Normalize(node
->GetTitle()));
81 for (size_t i
= 0; i
< terms
.size(); ++i
)
82 RegisterNode(terms
[i
], node
);
84 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
85 for (size_t i
= 0; i
< terms
.size(); ++i
)
86 RegisterNode(terms
[i
], node
);
89 void BookmarkIndex::Remove(const BookmarkNode
* node
) {
93 std::vector
<base::string16
> terms
=
94 ExtractQueryWords(Normalize(node
->GetTitle()));
95 for (size_t i
= 0; i
< terms
.size(); ++i
)
96 UnregisterNode(terms
[i
], node
);
98 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
99 for (size_t i
= 0; i
< terms
.size(); ++i
)
100 UnregisterNode(terms
[i
], node
);
103 void BookmarkIndex::GetBookmarksMatching(
104 const base::string16
& input_query
,
106 query_parser::MatchingAlgorithm matching_algorithm
,
107 std::vector
<BookmarkMatch
>* results
) {
108 const base::string16 query
= Normalize(input_query
);
109 std::vector
<base::string16
> terms
= ExtractQueryWords(query
);
114 for (size_t i
= 0; i
< terms
.size(); ++i
) {
115 if (!GetBookmarksMatchingTerm(
116 terms
[i
], i
== 0, matching_algorithm
, &matches
)) {
122 SortMatches(matches
, &sorted_nodes
);
124 // We use a QueryParser to fill in match positions for us. It's not the most
125 // efficient way to go about this, but by the time we get here we know what
126 // matches and so this shouldn't be performance critical.
127 query_parser::QueryParser parser
;
128 ScopedVector
<query_parser::QueryNode
> query_nodes
;
129 parser
.ParseQueryNodes(query
, matching_algorithm
, &query_nodes
.get());
131 // The highest typed counts should be at the beginning of the results vector
132 // so that the best matches will always be included in the results. The loop
133 // that calculates result relevance in HistoryContentsProvider::ConvertResults
134 // will run backwards to assure higher relevance will be attributed to the
136 for (Nodes::const_iterator i
= sorted_nodes
.begin();
137 i
!= sorted_nodes
.end() && results
->size() < max_count
;
139 AddMatchToResults(*i
, &parser
, query_nodes
.get(), results
);
142 void BookmarkIndex::SortMatches(const NodeSet
& matches
,
143 Nodes
* sorted_nodes
) const {
144 sorted_nodes
->reserve(matches
.size());
145 if (client_
->SupportsTypedCountForNodes()) {
146 NodeTypedCountPairs node_typed_counts
;
147 client_
->GetTypedCountForNodes(matches
, &node_typed_counts
);
148 std::sort(node_typed_counts
.begin(),
149 node_typed_counts
.end(),
150 NodeTypedCountPairSortFunctor());
151 std::transform(node_typed_counts
.begin(),
152 node_typed_counts
.end(),
153 std::back_inserter(*sorted_nodes
),
154 NodeTypedCountPairExtractNodeFunctor());
156 sorted_nodes
->insert(sorted_nodes
->end(), matches
.begin(), matches
.end());
160 void BookmarkIndex::AddMatchToResults(
161 const BookmarkNode
* node
,
162 query_parser::QueryParser
* parser
,
163 const query_parser::QueryNodeStarVector
& query_nodes
,
164 std::vector
<BookmarkMatch
>* results
) {
165 // Check that the result matches the query. The previous search
166 // was a simple per-word search, while the more complex matching
167 // of QueryParser may filter it out. For example, the query
168 // ["thi"] will match the bookmark titled [Thinking], but since
169 // ["thi"] is quoted we don't want to do a prefix match.
170 query_parser::QueryWordVector title_words
, url_words
;
171 const base::string16 lower_title
=
172 base::i18n::ToLower(Normalize(node
->GetTitle()));
173 parser
->ExtractQueryWords(lower_title
, &title_words
);
174 base::OffsetAdjuster::Adjustments adjustments
;
175 parser
->ExtractQueryWords(
176 CleanUpUrlForMatching(node
->url(), languages_
, &adjustments
),
178 query_parser::Snippet::MatchPositions title_matches
, url_matches
;
179 for (size_t i
= 0; i
< query_nodes
.size(); ++i
) {
180 const bool has_title_matches
=
181 query_nodes
[i
]->HasMatchIn(title_words
, &title_matches
);
182 const bool has_url_matches
=
183 query_nodes
[i
]->HasMatchIn(url_words
, &url_matches
);
184 if (!has_title_matches
&& !has_url_matches
)
186 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches
);
187 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches
);
190 if (lower_title
.length() == node
->GetTitle().length()) {
191 // Only use title matches if the lowercase string is the same length
192 // as the original string, otherwise the matches are meaningless.
193 // TODO(mpearson): revise match positions appropriately.
194 match
.title_match_positions
.swap(title_matches
);
196 // Now that we're done processing this entry, correct the offsets of the
197 // matches in |url_matches| so they point to offsets in the original URL
198 // spec, not the cleaned-up URL string that we used for matching.
199 std::vector
<size_t> offsets
=
200 BookmarkMatch::OffsetsFromMatchPositions(url_matches
);
201 base::OffsetAdjuster::UnadjustOffsets(adjustments
, &offsets
);
203 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches
, offsets
);
204 match
.url_match_positions
.swap(url_matches
);
206 results
->push_back(match
);
209 bool BookmarkIndex::GetBookmarksMatchingTerm(
210 const base::string16
& term
,
212 query_parser::MatchingAlgorithm matching_algorithm
,
214 Index::const_iterator i
= index_
.lower_bound(term
);
215 if (i
== index_
.end())
218 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(
219 term
, matching_algorithm
)) {
220 // Term is too short for prefix match, compare using exact match.
221 if (i
->first
!= term
)
222 return false; // No bookmarks with this term.
225 (*matches
) = i
->second
;
228 *matches
= base::STLSetIntersection
<NodeSet
>(i
->second
, *matches
);
230 // Loop through index adding all entries that start with term to
232 NodeSet tmp_prefix_matches
;
233 // If this is the first term, then store the result directly in |matches|
234 // to avoid calling stl intersection (which requires a copy).
235 NodeSet
* prefix_matches
= first_term
? matches
: &tmp_prefix_matches
;
236 while (i
!= index_
.end() &&
237 i
->first
.size() >= term
.size() &&
238 term
.compare(0, term
.size(), i
->first
, 0, term
.size()) == 0) {
239 #if !defined(OS_ANDROID)
240 prefix_matches
->insert(i
->second
.begin(), i
->second
.end());
242 // Work around a bug in the implementation of std::set::insert in the STL
243 // used on android (http://crbug.com/367050).
244 for (NodeSet::const_iterator n
= i
->second
.begin(); n
!= i
->second
.end();
246 prefix_matches
->insert(prefix_matches
->end(), *n
);
251 *matches
= base::STLSetIntersection
<NodeSet
>(*prefix_matches
, *matches
);
253 return !matches
->empty();
256 std::vector
<base::string16
> BookmarkIndex::ExtractQueryWords(
257 const base::string16
& query
) {
258 std::vector
<base::string16
> terms
;
260 return std::vector
<base::string16
>();
261 query_parser::QueryParser parser
;
262 parser
.ParseQueryWords(base::i18n::ToLower(query
),
263 query_parser::MatchingAlgorithm::DEFAULT
,
268 void BookmarkIndex::RegisterNode(const base::string16
& term
,
269 const BookmarkNode
* node
) {
270 index_
[term
].insert(node
);
273 void BookmarkIndex::UnregisterNode(const base::string16
& term
,
274 const BookmarkNode
* node
) {
275 Index::iterator i
= index_
.find(term
);
276 if (i
== index_
.end()) {
277 // We can get here if the node has the same term more than once. For
278 // example, a bookmark with the title 'foo foo' would end up here.
281 i
->second
.erase(node
);
282 if (i
->second
.empty())
286 } // namespace bookmarks