1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/bookmarks/browser/bookmark_index.h"
12 #include "base/i18n/case_conversion.h"
13 #include "base/logging.h"
14 #include "base/stl_util.h"
15 #include "base/strings/utf_offset_string_conversions.h"
16 #include "components/bookmarks/browser/bookmark_client.h"
17 #include "components/bookmarks/browser/bookmark_match.h"
18 #include "components/bookmarks/browser/bookmark_node.h"
19 #include "components/bookmarks/browser/bookmark_utils.h"
20 #include "components/query_parser/snippet.h"
21 #include "third_party/icu/source/common/unicode/normalizer2.h"
22 #include "third_party/icu/source/common/unicode/utypes.h"
26 typedef BookmarkClient::NodeTypedCountPair NodeTypedCountPair
;
27 typedef BookmarkClient::NodeTypedCountPairs NodeTypedCountPairs
;
31 // Returns a normalized version of the UTF16 string |text|. If it fails to
32 // normalize the string, returns |text| itself as a best-effort.
33 base::string16
Normalize(const base::string16
& text
) {
34 UErrorCode status
= U_ZERO_ERROR
;
35 const icu::Normalizer2
* normalizer2
=
36 icu::Normalizer2::getInstance(NULL
, "nfkc", UNORM2_COMPOSE
, status
);
37 if (U_FAILURE(status
)) {
38 // Log and crash right away to capture the error code in the crash report.
39 LOG(FATAL
) << "failed to create a normalizer: " << u_errorName(status
);
41 icu::UnicodeString
unicode_text(
42 text
.data(), static_cast<int32_t>(text
.length()));
43 icu::UnicodeString unicode_normalized_text
;
44 normalizer2
->normalize(unicode_text
, unicode_normalized_text
, status
);
45 if (U_FAILURE(status
)) {
46 // This should not happen. Log the error and fall back.
47 LOG(ERROR
) << "normalization failed: " << u_errorName(status
);
50 return base::string16(unicode_normalized_text
.getBuffer(),
51 unicode_normalized_text
.length());
54 // Sort functor for NodeTypedCountPairs. We sort in decreasing order of typed
55 // count so that the best matches will always be added to the results.
56 struct NodeTypedCountPairSortFunctor
57 : std::binary_function
<NodeTypedCountPair
, NodeTypedCountPair
, bool> {
58 bool operator()(const NodeTypedCountPair
& a
,
59 const NodeTypedCountPair
& b
) const {
60 return a
.second
> b
.second
;
64 // Extract the const Node* stored in a BookmarkClient::NodeTypedCountPair.
65 struct NodeTypedCountPairExtractNodeFunctor
66 : std::unary_function
<NodeTypedCountPair
, const BookmarkNode
*> {
67 const BookmarkNode
* operator()(const NodeTypedCountPair
& pair
) const {
74 BookmarkIndex::BookmarkIndex(BookmarkClient
* client
,
75 const std::string
& languages
)
77 languages_(languages
) {
81 BookmarkIndex::~BookmarkIndex() {
84 void BookmarkIndex::Add(const BookmarkNode
* node
) {
87 std::vector
<base::string16
> terms
=
88 ExtractQueryWords(Normalize(node
->GetTitle()));
89 for (size_t i
= 0; i
< terms
.size(); ++i
)
90 RegisterNode(terms
[i
], node
);
92 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
93 for (size_t i
= 0; i
< terms
.size(); ++i
)
94 RegisterNode(terms
[i
], node
);
97 void BookmarkIndex::Remove(const BookmarkNode
* node
) {
101 std::vector
<base::string16
> terms
=
102 ExtractQueryWords(Normalize(node
->GetTitle()));
103 for (size_t i
= 0; i
< terms
.size(); ++i
)
104 UnregisterNode(terms
[i
], node
);
106 ExtractQueryWords(CleanUpUrlForMatching(node
->url(), languages_
, NULL
));
107 for (size_t i
= 0; i
< terms
.size(); ++i
)
108 UnregisterNode(terms
[i
], node
);
111 void BookmarkIndex::GetBookmarksMatching(
112 const base::string16
& input_query
,
114 query_parser::MatchingAlgorithm matching_algorithm
,
115 std::vector
<BookmarkMatch
>* results
) {
116 const base::string16 query
= Normalize(input_query
);
117 std::vector
<base::string16
> terms
= ExtractQueryWords(query
);
122 for (size_t i
= 0; i
< terms
.size(); ++i
) {
123 if (!GetBookmarksMatchingTerm(
124 terms
[i
], i
== 0, matching_algorithm
, &matches
)) {
130 SortMatches(matches
, &sorted_nodes
);
132 // We use a QueryParser to fill in match positions for us. It's not the most
133 // efficient way to go about this, but by the time we get here we know what
134 // matches and so this shouldn't be performance critical.
135 query_parser::QueryParser parser
;
136 ScopedVector
<query_parser::QueryNode
> query_nodes
;
137 parser
.ParseQueryNodes(query
, matching_algorithm
, &query_nodes
.get());
139 // The highest typed counts should be at the beginning of the results vector
140 // so that the best matches will always be included in the results. The loop
141 // that calculates result relevance in HistoryContentsProvider::ConvertResults
142 // will run backwards to assure higher relevance will be attributed to the
144 for (Nodes::const_iterator i
= sorted_nodes
.begin();
145 i
!= sorted_nodes
.end() && results
->size() < max_count
;
147 AddMatchToResults(*i
, &parser
, query_nodes
.get(), results
);
150 void BookmarkIndex::SortMatches(const NodeSet
& matches
,
151 Nodes
* sorted_nodes
) const {
152 sorted_nodes
->reserve(matches
.size());
153 if (client_
->SupportsTypedCountForNodes()) {
154 NodeTypedCountPairs node_typed_counts
;
155 client_
->GetTypedCountForNodes(matches
, &node_typed_counts
);
156 std::sort(node_typed_counts
.begin(),
157 node_typed_counts
.end(),
158 NodeTypedCountPairSortFunctor());
159 std::transform(node_typed_counts
.begin(),
160 node_typed_counts
.end(),
161 std::back_inserter(*sorted_nodes
),
162 NodeTypedCountPairExtractNodeFunctor());
164 sorted_nodes
->insert(sorted_nodes
->end(), matches
.begin(), matches
.end());
168 void BookmarkIndex::AddMatchToResults(
169 const BookmarkNode
* node
,
170 query_parser::QueryParser
* parser
,
171 const query_parser::QueryNodeStarVector
& query_nodes
,
172 std::vector
<BookmarkMatch
>* results
) {
173 // Check that the result matches the query. The previous search
174 // was a simple per-word search, while the more complex matching
175 // of QueryParser may filter it out. For example, the query
176 // ["thi"] will match the bookmark titled [Thinking], but since
177 // ["thi"] is quoted we don't want to do a prefix match.
178 query_parser::QueryWordVector title_words
, url_words
;
179 const base::string16 lower_title
=
180 base::i18n::ToLower(Normalize(node
->GetTitle()));
181 parser
->ExtractQueryWords(lower_title
, &title_words
);
182 base::OffsetAdjuster::Adjustments adjustments
;
183 parser
->ExtractQueryWords(
184 CleanUpUrlForMatching(node
->url(), languages_
, &adjustments
),
186 query_parser::Snippet::MatchPositions title_matches
, url_matches
;
187 for (size_t i
= 0; i
< query_nodes
.size(); ++i
) {
188 const bool has_title_matches
=
189 query_nodes
[i
]->HasMatchIn(title_words
, &title_matches
);
190 const bool has_url_matches
=
191 query_nodes
[i
]->HasMatchIn(url_words
, &url_matches
);
192 if (!has_title_matches
&& !has_url_matches
)
194 query_parser::QueryParser::SortAndCoalesceMatchPositions(&title_matches
);
195 query_parser::QueryParser::SortAndCoalesceMatchPositions(&url_matches
);
198 if (lower_title
.length() == node
->GetTitle().length()) {
199 // Only use title matches if the lowercase string is the same length
200 // as the original string, otherwise the matches are meaningless.
201 // TODO(mpearson): revise match positions appropriately.
202 match
.title_match_positions
.swap(title_matches
);
204 // Now that we're done processing this entry, correct the offsets of the
205 // matches in |url_matches| so they point to offsets in the original URL
206 // spec, not the cleaned-up URL string that we used for matching.
207 std::vector
<size_t> offsets
=
208 BookmarkMatch::OffsetsFromMatchPositions(url_matches
);
209 base::OffsetAdjuster::UnadjustOffsets(adjustments
, &offsets
);
211 BookmarkMatch::ReplaceOffsetsInMatchPositions(url_matches
, offsets
);
212 match
.url_match_positions
.swap(url_matches
);
214 results
->push_back(match
);
217 bool BookmarkIndex::GetBookmarksMatchingTerm(
218 const base::string16
& term
,
220 query_parser::MatchingAlgorithm matching_algorithm
,
222 Index::const_iterator i
= index_
.lower_bound(term
);
223 if (i
== index_
.end())
226 if (!query_parser::QueryParser::IsWordLongEnoughForPrefixSearch(
227 term
, matching_algorithm
)) {
228 // Term is too short for prefix match, compare using exact match.
229 if (i
->first
!= term
)
230 return false; // No bookmarks with this term.
233 (*matches
) = i
->second
;
236 *matches
= base::STLSetIntersection
<NodeSet
>(i
->second
, *matches
);
238 // Loop through index adding all entries that start with term to
240 NodeSet tmp_prefix_matches
;
241 // If this is the first term, then store the result directly in |matches|
242 // to avoid calling stl intersection (which requires a copy).
243 NodeSet
* prefix_matches
= first_term
? matches
: &tmp_prefix_matches
;
244 while (i
!= index_
.end() &&
245 i
->first
.size() >= term
.size() &&
246 term
.compare(0, term
.size(), i
->first
, 0, term
.size()) == 0) {
247 #if !defined(OS_ANDROID)
248 prefix_matches
->insert(i
->second
.begin(), i
->second
.end());
250 // Work around a bug in the implementation of std::set::insert in the STL
251 // used on android (http://crbug.com/367050).
252 for (NodeSet::const_iterator n
= i
->second
.begin(); n
!= i
->second
.end();
254 prefix_matches
->insert(prefix_matches
->end(), *n
);
259 *matches
= base::STLSetIntersection
<NodeSet
>(*prefix_matches
, *matches
);
261 return !matches
->empty();
264 std::vector
<base::string16
> BookmarkIndex::ExtractQueryWords(
265 const base::string16
& query
) {
266 std::vector
<base::string16
> terms
;
268 return std::vector
<base::string16
>();
269 query_parser::QueryParser parser
;
270 parser
.ParseQueryWords(base::i18n::ToLower(query
),
271 query_parser::MatchingAlgorithm::DEFAULT
,
276 void BookmarkIndex::RegisterNode(const base::string16
& term
,
277 const BookmarkNode
* node
) {
278 index_
[term
].insert(node
);
281 void BookmarkIndex::UnregisterNode(const base::string16
& term
,
282 const BookmarkNode
* node
) {
283 Index::iterator i
= index_
.find(term
);
284 if (i
== index_
.end()) {
285 // We can get here if the node has the same term more than once. For
286 // example, a bookmark with the title 'foo foo' would end up here.
289 i
->second
.erase(node
);
290 if (i
->second
.empty())
294 } // namespace bookmarks