1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/autocomplete/url_index_private_data.h"
14 #include "base/basictypes.h"
15 #include "base/files/file_util.h"
16 #include "base/i18n/break_iterator.h"
17 #include "base/i18n/case_conversion.h"
18 #include "base/metrics/histogram.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/time/time.h"
22 #include "chrome/browser/autocomplete/in_memory_url_index.h"
23 #include "components/bookmarks/browser/bookmark_model.h"
24 #include "components/bookmarks/browser/bookmark_utils.h"
25 #include "components/history/core/browser/history_database.h"
26 #include "components/history/core/browser/history_db_task.h"
27 #include "components/history/core/browser/history_service.h"
28 #include "net/base/net_util.h"
30 #if defined(USE_SYSTEM_PROTOBUF)
31 #include <google/protobuf/repeated_field.h>
33 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
36 using google::protobuf::RepeatedField
;
37 using google::protobuf::RepeatedPtrField
;
38 using in_memory_url_index::InMemoryURLIndexCacheItem
;
41 static const size_t kMaxVisitsToStoreInCache
= 10u;
44 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordListItem
46 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry
48 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem WordMapItem
;
49 typedef in_memory_url_index::InMemoryURLIndexCacheItem_CharWordMapItem
51 typedef in_memory_url_index::
52 InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry CharWordMapEntry
;
53 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
55 typedef in_memory_url_index::
56 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
57 WordIDHistoryMapEntry
;
58 typedef in_memory_url_index::InMemoryURLIndexCacheItem_HistoryInfoMapItem
60 typedef in_memory_url_index::
61 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
63 typedef in_memory_url_index::
64 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
65 HistoryInfoMapEntry_VisitInfo
;
66 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordStartsMapItem
68 typedef in_memory_url_index::
69 InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
72 // Algorithm Functions ---------------------------------------------------------
74 // Comparison function for sorting search terms by descending length.
75 bool LengthGreater(const base::string16
& string_a
,
76 const base::string16
& string_b
) {
77 return string_a
.length() > string_b
.length();
81 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
83 // HistoryDBTask used to update the recent visit data for a particular
84 // row from the history database.
85 class UpdateRecentVisitsFromHistoryDBTask
: public history::HistoryDBTask
{
87 explicit UpdateRecentVisitsFromHistoryDBTask(
88 URLIndexPrivateData
* private_data
,
89 history::URLID url_id
);
91 bool RunOnDBThread(history::HistoryBackend
* backend
,
92 history::HistoryDatabase
* db
) override
;
93 void DoneRunOnMainThread() override
;
96 ~UpdateRecentVisitsFromHistoryDBTask() override
;
98 // The URLIndexPrivateData that gets updated after the historyDB
100 URLIndexPrivateData
* private_data_
;
101 // The ID of the URL to get visits for and then update.
102 history::URLID url_id_
;
103 // Whether fetching the recent visits for the URL succeeded.
105 // The awaited data that's shown to private_data_ for it to copy and
107 history::VisitVector recent_visits_
;
109 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask
);
112 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
113 URLIndexPrivateData
* private_data
,
114 history::URLID url_id
)
115 : private_data_(private_data
), url_id_(url_id
), succeeded_(false) {
118 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
119 history::HistoryBackend
* backend
,
120 history::HistoryDatabase
* db
) {
121 // Make sure the private data is going to get as many recent visits as
122 // ScoredHistoryMatch::GetFrequency() hopes to use.
123 DCHECK_GE(kMaxVisitsToStoreInCache
, ScoredHistoryMatch::kMaxVisitsToScore
);
124 succeeded_
= db
->GetMostRecentVisitsForURL(url_id_
,
125 kMaxVisitsToStoreInCache
,
128 recent_visits_
.clear();
129 return true; // Always claim to be done; do not retry failures.
132 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
134 private_data_
->UpdateRecentVisits(url_id_
, recent_visits_
);
137 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
141 // URLIndexPrivateData ---------------------------------------------------------
143 URLIndexPrivateData::URLIndexPrivateData()
144 : restored_cache_version_(0),
145 saved_cache_version_(kCurrentCacheFileVersion
),
146 pre_filter_item_count_(0),
147 post_filter_item_count_(0),
148 post_scoring_item_count_(0) {
151 ScoredHistoryMatches
URLIndexPrivateData::HistoryItemsForTerms(
152 base::string16 search_string
,
153 size_t cursor_position
,
155 const std::string
& languages
,
156 bookmarks::BookmarkModel
* bookmark_model
) {
157 // If cursor position is set and useful (not at either end of the
158 // string), allow the search string to be broken at cursor position.
159 // We do this by pretending there's a space where the cursor is.
160 if ((cursor_position
!= base::string16::npos
) &&
161 (cursor_position
< search_string
.length()) &&
162 (cursor_position
> 0)) {
163 search_string
.insert(cursor_position
, base::ASCIIToUTF16(" "));
165 pre_filter_item_count_
= 0;
166 post_filter_item_count_
= 0;
167 post_scoring_item_count_
= 0;
168 // The search string we receive may contain escaped characters. For reducing
169 // the index we need individual, lower-cased words, ignoring escapings. For
170 // the final filtering we need whitespace separated substrings possibly
171 // containing escaped characters.
172 base::string16
lower_raw_string(base::i18n::ToLower(search_string
));
173 base::string16 lower_unescaped_string
=
174 net::UnescapeURLComponent(lower_raw_string
,
175 net::UnescapeRule::SPACES
| net::UnescapeRule::URL_SPECIAL_CHARS
);
176 // Extract individual 'words' (as opposed to 'terms'; see below) from the
177 // search string. When the user types "colspec=ID%20Mstone Release" we get
178 // four 'words': "colspec", "id", "mstone" and "release".
179 String16Vector
lower_words(
180 String16VectorFromString16(lower_unescaped_string
, false, NULL
));
181 ScoredHistoryMatches scored_items
;
183 // Do nothing if we have indexed no words (probably because we've not been
184 // initialized yet) or the search string has no words.
185 if (word_list_
.empty() || lower_words
.empty()) {
186 search_term_cache_
.clear(); // Invalidate the term cache.
190 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
192 ResetSearchTermCache();
194 HistoryIDSet history_id_set
= HistoryIDSetFromWords(lower_words
);
196 // Trim the candidate pool if it is large. Note that we do not filter out
197 // items that do not contain the search terms as proper substrings -- doing
198 // so is the performance-costly operation we are trying to avoid in order
199 // to maintain omnibox responsiveness.
200 const size_t kItemsToScoreLimit
= 500;
201 pre_filter_item_count_
= history_id_set
.size();
202 // If we trim the results set we do not want to cache the results for next
203 // time as the user's ultimately desired result could easily be eliminated
204 // in this early rough filter.
205 bool was_trimmed
= (pre_filter_item_count_
> kItemsToScoreLimit
);
207 HistoryIDVector history_ids
;
208 std::copy(history_id_set
.begin(), history_id_set
.end(),
209 std::back_inserter(history_ids
));
210 // Trim down the set by sorting by typed-count, visit-count, and last
212 HistoryItemFactorGreater
213 item_factor_functor(history_info_map_
);
214 std::partial_sort(history_ids
.begin(),
215 history_ids
.begin() + kItemsToScoreLimit
,
217 item_factor_functor
);
218 history_id_set
.clear();
219 std::copy(history_ids
.begin(), history_ids
.begin() + kItemsToScoreLimit
,
220 std::inserter(history_id_set
, history_id_set
.end()));
221 post_filter_item_count_
= history_id_set
.size();
224 // Pass over all of the candidates filtering out any without a proper
225 // substring match, inserting those which pass in order by score. Note that
226 // in this step we are using the raw search string complete with escaped
227 // URL elements. When the user has specifically typed something akin to
228 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
229 // specific substring appears in the URL or page title.
231 // We call these 'terms' (as opposed to 'words'; see above) as in this case
232 // we only want to break up the search string on 'true' whitespace rather than
233 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
234 // get two 'terms': "colspec=id%20mstone" and "release".
235 String16Vector lower_raw_terms
;
236 if (Tokenize(lower_raw_string
, base::kWhitespaceUTF16
,
237 &lower_raw_terms
) == 0) {
238 // Don't score matches when there are no terms to score against. (It's
239 // possible that the word break iterater that extracts words to search
240 // for in the database allows some whitespace "words" whereas Tokenize
241 // excludes a long list of whitespace.) One could write a scoring
242 // function that gives a reasonable order to matches when there
243 // are no terms (i.e., all the words are some form of whitespace),
244 // but this is such a rare edge case that it's not worth the time.
249 history_id_set
.begin(), history_id_set
.end(),
250 AddHistoryMatch(bookmark_model
, *this, languages
, lower_raw_string
,
251 lower_raw_terms
, base::Time::Now())).ScoredMatches();
253 // Select and sort only the top |max_matches| results.
254 if (scored_items
.size() > max_matches
) {
255 std::partial_sort(scored_items
.begin(),
256 scored_items
.begin() +
259 ScoredHistoryMatch::MatchScoreGreater
);
260 scored_items
.resize(max_matches
);
262 std::sort(scored_items
.begin(), scored_items
.end(),
263 ScoredHistoryMatch::MatchScoreGreater
);
265 post_scoring_item_count_
= scored_items
.size();
268 search_term_cache_
.clear(); // Invalidate the term cache.
270 // Remove any stale SearchTermCacheItems.
271 for (SearchTermCacheMap::iterator cache_iter
= search_term_cache_
.begin();
272 cache_iter
!= search_term_cache_
.end(); ) {
273 if (!cache_iter
->second
.used_
)
274 search_term_cache_
.erase(cache_iter
++);
283 bool URLIndexPrivateData::UpdateURL(
284 history::HistoryService
* history_service
,
285 const history::URLRow
& row
,
286 const std::string
& languages
,
287 const std::set
<std::string
>& scheme_whitelist
,
288 base::CancelableTaskTracker
* tracker
) {
289 // The row may or may not already be in our index. If it is not already
290 // indexed and it qualifies then it gets indexed. If it is already
291 // indexed and still qualifies then it gets updated, otherwise it
292 // is deleted from the index.
293 bool row_was_updated
= false;
294 history::URLID row_id
= row
.id();
295 HistoryInfoMap::iterator row_pos
= history_info_map_
.find(row_id
);
296 if (row_pos
== history_info_map_
.end()) {
297 // This new row should be indexed if it qualifies.
298 history::URLRow
new_row(row
);
299 new_row
.set_id(row_id
);
300 row_was_updated
= RowQualifiesAsSignificant(new_row
, base::Time()) &&
307 } else if (RowQualifiesAsSignificant(row
, base::Time())) {
308 // This indexed row still qualifies and will be re-indexed.
309 // The url won't have changed but the title, visit count, etc.
310 // might have changed.
311 history::URLRow
& row_to_update
= row_pos
->second
.url_row
;
312 bool title_updated
= row_to_update
.title() != row
.title();
313 if (row_to_update
.visit_count() != row
.visit_count() ||
314 row_to_update
.typed_count() != row
.typed_count() ||
315 row_to_update
.last_visit() != row
.last_visit() || title_updated
) {
316 row_to_update
.set_visit_count(row
.visit_count());
317 row_to_update
.set_typed_count(row
.typed_count());
318 row_to_update
.set_last_visit(row
.last_visit());
319 // If something appears to have changed, update the recent visits
321 ScheduleUpdateRecentVisits(history_service
, row_id
, tracker
);
322 // While the URL is guaranteed to remain stable, the title may have
323 // changed. If so, then update the index with the changed words.
325 // Clear all words associated with this row and re-index both the
327 RemoveRowWordsFromIndex(row_to_update
);
328 row_to_update
.set_title(row
.title());
329 RowWordStarts word_starts
;
330 AddRowWordsToIndex(row_to_update
, &word_starts
, languages
);
331 word_starts_map_
[row_id
] = word_starts
;
333 row_was_updated
= true;
336 // This indexed row no longer qualifies and will be de-indexed by
337 // clearing all words associated with this row.
338 RemoveRowFromIndex(row
);
339 row_was_updated
= true;
342 search_term_cache_
.clear(); // This invalidates the cache.
343 return row_was_updated
;
346 void URLIndexPrivateData::UpdateRecentVisits(
347 history::URLID url_id
,
348 const history::VisitVector
& recent_visits
) {
349 HistoryInfoMap::iterator row_pos
= history_info_map_
.find(url_id
);
350 if (row_pos
!= history_info_map_
.end()) {
351 VisitInfoVector
* visits
= &row_pos
->second
.visits
;
354 std::min(recent_visits
.size(), kMaxVisitsToStoreInCache
);
355 visits
->reserve(size
);
356 for (size_t i
= 0; i
< size
; i
++) {
357 // Copy from the history::VisitVector the only fields visits needs.
358 visits
->push_back(std::make_pair(recent_visits
[i
].visit_time
,
359 recent_visits
[i
].transition
));
362 // Else: Oddly, the URL doesn't seem to exist in the private index.
363 // Ignore this update. This can happen if, for instance, the user
364 // removes the URL from URLIndexPrivateData before the historyDB call
368 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
369 history::HistoryService
* history_service
,
370 history::URLID url_id
,
371 base::CancelableTaskTracker
* tracker
) {
372 history_service
->ScheduleDBTask(
373 scoped_ptr
<history::HistoryDBTask
>(
374 new UpdateRecentVisitsFromHistoryDBTask(this, url_id
)), tracker
);
377 // Helper functor for DeleteURL.
378 class HistoryInfoMapItemHasURL
{
380 explicit HistoryInfoMapItemHasURL(const GURL
& url
): url_(url
) {}
382 bool operator()(const std::pair
<const HistoryID
, HistoryInfoMapValue
>& item
) {
383 return item
.second
.url_row
.url() == url_
;
390 bool URLIndexPrivateData::DeleteURL(const GURL
& url
) {
391 // Find the matching entry in the history_info_map_.
392 HistoryInfoMap::iterator pos
= std::find_if(
393 history_info_map_
.begin(),
394 history_info_map_
.end(),
395 HistoryInfoMapItemHasURL(url
));
396 if (pos
== history_info_map_
.end())
398 RemoveRowFromIndex(pos
->second
.url_row
);
399 search_term_cache_
.clear(); // This invalidates the cache.
404 scoped_refptr
<URLIndexPrivateData
> URLIndexPrivateData::RestoreFromFile(
405 const base::FilePath
& file_path
,
406 const std::string
& languages
) {
407 base::TimeTicks beginning_time
= base::TimeTicks::Now();
408 if (!base::PathExists(file_path
))
411 // If there is no cache file then simply give up. This will cause us to
412 // attempt to rebuild from the history database.
413 if (!base::ReadFileToString(file_path
, &data
))
416 scoped_refptr
<URLIndexPrivateData
> restored_data(new URLIndexPrivateData
);
417 InMemoryURLIndexCacheItem index_cache
;
418 if (!index_cache
.ParseFromArray(data
.c_str(), data
.size())) {
419 LOG(WARNING
) << "Failed to parse URLIndexPrivateData cache data read from "
420 << file_path
.value();
421 return restored_data
;
424 if (!restored_data
->RestorePrivateData(index_cache
, languages
))
427 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
428 base::TimeTicks::Now() - beginning_time
);
429 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
430 restored_data
->history_id_word_map_
.size());
431 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data
.size());
432 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
433 restored_data
->word_map_
.size());
434 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
435 restored_data
->char_word_map_
.size());
436 if (restored_data
->Empty())
437 return NULL
; // 'No data' is the same as a failed reload.
438 return restored_data
;
442 scoped_refptr
<URLIndexPrivateData
> URLIndexPrivateData::RebuildFromHistory(
443 history::HistoryDatabase
* history_db
,
444 const std::string
& languages
,
445 const std::set
<std::string
>& scheme_whitelist
) {
449 base::TimeTicks beginning_time
= base::TimeTicks::Now();
451 scoped_refptr
<URLIndexPrivateData
>
452 rebuilt_data(new URLIndexPrivateData
);
453 history::URLDatabase::URLEnumerator history_enum
;
454 if (!history_db
->InitURLEnumeratorForSignificant(&history_enum
))
456 rebuilt_data
->last_time_rebuilt_from_history_
= base::Time::Now();
457 for (history::URLRow row
; history_enum
.GetNextURL(&row
);) {
458 rebuilt_data
->IndexRow(
459 history_db
, NULL
, row
, languages
, scheme_whitelist
, NULL
);
462 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
463 base::TimeTicks::Now() - beginning_time
);
464 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
465 rebuilt_data
->history_id_word_map_
.size());
466 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
467 rebuilt_data
->word_map_
.size());
468 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
469 rebuilt_data
->char_word_map_
.size());
474 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
475 scoped_refptr
<URLIndexPrivateData
> private_data
,
476 const base::FilePath
& file_path
) {
477 DCHECK(private_data
.get());
478 DCHECK(!file_path
.empty());
479 return private_data
->SaveToFile(file_path
);
482 scoped_refptr
<URLIndexPrivateData
> URLIndexPrivateData::Duplicate() const {
483 scoped_refptr
<URLIndexPrivateData
> data_copy
= new URLIndexPrivateData
;
484 data_copy
->last_time_rebuilt_from_history_
= last_time_rebuilt_from_history_
;
485 data_copy
->word_list_
= word_list_
;
486 data_copy
->available_words_
= available_words_
;
487 data_copy
->word_map_
= word_map_
;
488 data_copy
->char_word_map_
= char_word_map_
;
489 data_copy
->word_id_history_map_
= word_id_history_map_
;
490 data_copy
->history_id_word_map_
= history_id_word_map_
;
491 data_copy
->history_info_map_
= history_info_map_
;
492 data_copy
->word_starts_map_
= word_starts_map_
;
495 // search_term_cache_
496 // pre_filter_item_count_
497 // post_filter_item_count_
498 // post_scoring_item_count_
501 bool URLIndexPrivateData::Empty() const {
502 return history_info_map_
.empty();
505 void URLIndexPrivateData::Clear() {
506 last_time_rebuilt_from_history_
= base::Time();
508 available_words_
.clear();
510 char_word_map_
.clear();
511 word_id_history_map_
.clear();
512 history_id_word_map_
.clear();
513 history_info_map_
.clear();
514 word_starts_map_
.clear();
517 URLIndexPrivateData::~URLIndexPrivateData() {}
519 HistoryIDSet
URLIndexPrivateData::HistoryIDSetFromWords(
520 const String16Vector
& unsorted_words
) {
521 // Break the terms down into individual terms (words), get the candidate
522 // set for each term, and intersect each to get a final candidate list.
523 // Note that a single 'term' from the user's perspective might be
524 // a string like "http://www.somewebsite.com" which, from our perspective,
525 // is four words: 'http', 'www', 'somewebsite', and 'com'.
526 HistoryIDSet history_id_set
;
527 String16Vector
words(unsorted_words
);
528 // Sort the words into the longest first as such are likely to narrow down
529 // the results quicker. Also, single character words are the most expensive
530 // to process so save them for last.
531 std::sort(words
.begin(), words
.end(), LengthGreater
);
532 for (String16Vector::iterator iter
= words
.begin(); iter
!= words
.end();
534 base::string16 uni_word
= *iter
;
535 HistoryIDSet term_history_set
= HistoryIDsForTerm(uni_word
);
536 if (term_history_set
.empty()) {
537 history_id_set
.clear();
540 if (iter
== words
.begin()) {
541 history_id_set
.swap(term_history_set
);
543 HistoryIDSet new_history_id_set
= base::STLSetIntersection
<HistoryIDSet
>(
544 history_id_set
, term_history_set
);
545 history_id_set
.swap(new_history_id_set
);
548 return history_id_set
;
551 HistoryIDSet
URLIndexPrivateData::HistoryIDsForTerm(
552 const base::string16
& term
) {
554 return HistoryIDSet();
556 // TODO(mrossetti): Consider optimizing for very common terms such as
557 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
558 // occuring words in the user's searches.
560 size_t term_length
= term
.length();
561 WordIDSet word_id_set
;
562 if (term_length
> 1) {
563 // See if this term or a prefix thereof is present in the cache.
564 SearchTermCacheMap::iterator
best_prefix(search_term_cache_
.end());
565 for (SearchTermCacheMap::iterator cache_iter
= search_term_cache_
.begin();
566 cache_iter
!= search_term_cache_
.end(); ++cache_iter
) {
567 if (StartsWith(term
, cache_iter
->first
, false) &&
568 (best_prefix
== search_term_cache_
.end() ||
569 cache_iter
->first
.length() > best_prefix
->first
.length()))
570 best_prefix
= cache_iter
;
573 // If a prefix was found then determine the leftover characters to be used
574 // for further refining the results from that prefix.
575 Char16Set prefix_chars
;
576 base::string16
leftovers(term
);
577 if (best_prefix
!= search_term_cache_
.end()) {
578 // If the prefix is an exact match for the term then grab the cached
579 // results and we're done.
580 size_t prefix_length
= best_prefix
->first
.length();
581 if (prefix_length
== term_length
) {
582 best_prefix
->second
.used_
= true;
583 return best_prefix
->second
.history_id_set_
;
586 // Otherwise we have a handy starting point.
587 // If there are no history results for this prefix then we can bail early
588 // as there will be no history results for the full term.
589 if (best_prefix
->second
.history_id_set_
.empty()) {
590 search_term_cache_
[term
] = SearchTermCacheItem();
591 return HistoryIDSet();
593 word_id_set
= best_prefix
->second
.word_id_set_
;
594 prefix_chars
= Char16SetFromString16(best_prefix
->first
);
595 leftovers
= term
.substr(prefix_length
);
598 // Filter for each remaining, unique character in the term.
599 Char16Set leftover_chars
= Char16SetFromString16(leftovers
);
600 Char16Set unique_chars
=
601 base::STLSetDifference
<Char16Set
>(leftover_chars
, prefix_chars
);
603 // Reduce the word set with any leftover, unprocessed characters.
604 if (!unique_chars
.empty()) {
605 WordIDSet
leftover_set(WordIDSetForTermChars(unique_chars
));
606 // We might come up empty on the leftovers.
607 if (leftover_set
.empty()) {
608 search_term_cache_
[term
] = SearchTermCacheItem();
609 return HistoryIDSet();
611 // Or there may not have been a prefix from which to start.
612 if (prefix_chars
.empty()) {
613 word_id_set
.swap(leftover_set
);
615 WordIDSet new_word_id_set
= base::STLSetIntersection
<WordIDSet
>(
616 word_id_set
, leftover_set
);
617 word_id_set
.swap(new_word_id_set
);
621 // We must filter the word list because the resulting word set surely
622 // contains words which do not have the search term as a proper subset.
623 for (WordIDSet::iterator word_set_iter
= word_id_set
.begin();
624 word_set_iter
!= word_id_set
.end(); ) {
625 if (word_list_
[*word_set_iter
].find(term
) == base::string16::npos
)
626 word_id_set
.erase(word_set_iter
++);
631 word_id_set
= WordIDSetForTermChars(Char16SetFromString16(term
));
634 // If any words resulted then we can compose a set of history IDs by unioning
635 // the sets from each word.
636 HistoryIDSet history_id_set
;
637 if (!word_id_set
.empty()) {
638 for (WordIDSet::iterator word_id_iter
= word_id_set
.begin();
639 word_id_iter
!= word_id_set
.end(); ++word_id_iter
) {
640 WordID word_id
= *word_id_iter
;
641 WordIDHistoryMap::iterator word_iter
= word_id_history_map_
.find(word_id
);
642 if (word_iter
!= word_id_history_map_
.end()) {
643 HistoryIDSet
& word_history_id_set(word_iter
->second
);
644 history_id_set
.insert(word_history_id_set
.begin(),
645 word_history_id_set
.end());
650 // Record a new cache entry for this word if the term is longer than
651 // a single character.
653 search_term_cache_
[term
] = SearchTermCacheItem(word_id_set
, history_id_set
);
655 return history_id_set
;
658 WordIDSet
URLIndexPrivateData::WordIDSetForTermChars(
659 const Char16Set
& term_chars
) {
660 WordIDSet word_id_set
;
661 for (Char16Set::const_iterator c_iter
= term_chars
.begin();
662 c_iter
!= term_chars
.end(); ++c_iter
) {
663 CharWordIDMap::iterator char_iter
= char_word_map_
.find(*c_iter
);
664 if (char_iter
== char_word_map_
.end()) {
665 // A character was not found so there are no matching results: bail.
669 WordIDSet
& char_word_id_set(char_iter
->second
);
670 // It is possible for there to no longer be any words associated with
671 // a particular character. Give up in that case.
672 if (char_word_id_set
.empty()) {
677 if (c_iter
== term_chars
.begin()) {
678 // First character results becomes base set of results.
679 word_id_set
= char_word_id_set
;
681 // Subsequent character results get intersected in.
682 WordIDSet new_word_id_set
= base::STLSetIntersection
<WordIDSet
>(
683 word_id_set
, char_word_id_set
);
684 word_id_set
.swap(new_word_id_set
);
690 bool URLIndexPrivateData::IndexRow(
691 history::HistoryDatabase
* history_db
,
692 history::HistoryService
* history_service
,
693 const history::URLRow
& row
,
694 const std::string
& languages
,
695 const std::set
<std::string
>& scheme_whitelist
,
696 base::CancelableTaskTracker
* tracker
) {
697 const GURL
& gurl(row
.url());
699 // Index only URLs with a whitelisted scheme.
700 if (!URLSchemeIsWhitelisted(gurl
, scheme_whitelist
))
703 history::URLID row_id
= row
.id();
704 // Strip out username and password before saving and indexing.
705 base::string16
url(net::FormatUrl(gurl
, languages
,
706 net::kFormatUrlOmitUsernamePassword
,
707 net::UnescapeRule::NONE
,
710 HistoryID history_id
= static_cast<HistoryID
>(row_id
);
711 DCHECK_LT(history_id
, std::numeric_limits
<HistoryID
>::max());
713 // Add the row for quick lookup in the history info store.
714 history::URLRow
new_row(GURL(url
), row_id
);
715 new_row
.set_visit_count(row
.visit_count());
716 new_row
.set_typed_count(row
.typed_count());
717 new_row
.set_last_visit(row
.last_visit());
718 new_row
.set_title(row
.title());
719 history_info_map_
[history_id
].url_row
= new_row
;
721 // Index the words contained in the URL and title of the row.
722 RowWordStarts word_starts
;
723 AddRowWordsToIndex(new_row
, &word_starts
, languages
);
724 word_starts_map_
[history_id
] = word_starts
;
726 // Update the recent visits information or schedule the update
729 // We'd like to check that we're on the history DB thread.
730 // However, unittest code actually calls this on the UI thread.
731 // So we don't do any thread checks.
732 history::VisitVector recent_visits
;
733 // Make sure the private data is going to get as many recent visits as
734 // ScoredHistoryMatch::GetFrequency() hopes to use.
735 DCHECK_GE(kMaxVisitsToStoreInCache
, ScoredHistoryMatch::kMaxVisitsToScore
);
736 if (history_db
->GetMostRecentVisitsForURL(row_id
,
737 kMaxVisitsToStoreInCache
,
739 UpdateRecentVisits(row_id
, recent_visits
);
742 DCHECK(history_service
);
743 ScheduleUpdateRecentVisits(history_service
, row_id
, tracker
);
749 void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow
& row
,
750 RowWordStarts
* word_starts
,
751 const std::string
& languages
) {
752 HistoryID history_id
= static_cast<HistoryID
>(row
.id());
753 // Split URL into individual, unique words then add in the title words.
754 const GURL
& gurl(row
.url());
755 const base::string16
& url
=
756 bookmarks::CleanUpUrlForMatching(gurl
, languages
, NULL
);
757 String16Set url_words
= String16SetFromString16(url
,
758 word_starts
? &word_starts
->url_word_starts_
: NULL
);
759 const base::string16
& title
= bookmarks::CleanUpTitleForMatching(row
.title());
760 String16Set title_words
= String16SetFromString16(title
,
761 word_starts
? &word_starts
->title_word_starts_
: NULL
);
762 String16Set words
= base::STLSetUnion
<String16Set
>(url_words
, title_words
);
763 for (String16Set::iterator word_iter
= words
.begin();
764 word_iter
!= words
.end(); ++word_iter
)
765 AddWordToIndex(*word_iter
, history_id
);
767 search_term_cache_
.clear(); // Invalidate the term cache.
770 void URLIndexPrivateData::AddWordToIndex(const base::string16
& term
,
771 HistoryID history_id
) {
772 WordMap::iterator word_pos
= word_map_
.find(term
);
773 if (word_pos
!= word_map_
.end())
774 UpdateWordHistory(word_pos
->second
, history_id
);
776 AddWordHistory(term
, history_id
);
779 void URLIndexPrivateData::AddWordHistory(const base::string16
& term
,
780 HistoryID history_id
) {
781 WordID word_id
= word_list_
.size();
782 if (available_words_
.empty()) {
783 word_list_
.push_back(term
);
785 word_id
= *(available_words_
.begin());
786 word_list_
[word_id
] = term
;
787 available_words_
.erase(word_id
);
789 word_map_
[term
] = word_id
;
791 HistoryIDSet history_id_set
;
792 history_id_set
.insert(history_id
);
793 word_id_history_map_
[word_id
] = history_id_set
;
794 AddToHistoryIDWordMap(history_id
, word_id
);
796 // For each character in the newly added word (i.e. a word that is not
797 // already in the word index), add the word to the character index.
798 Char16Set characters
= Char16SetFromString16(term
);
799 for (Char16Set::iterator uni_char_iter
= characters
.begin();
800 uni_char_iter
!= characters
.end(); ++uni_char_iter
) {
801 base::char16 uni_char
= *uni_char_iter
;
802 CharWordIDMap::iterator char_iter
= char_word_map_
.find(uni_char
);
803 if (char_iter
!= char_word_map_
.end()) {
804 // Update existing entry in the char/word index.
805 WordIDSet
& word_id_set(char_iter
->second
);
806 word_id_set
.insert(word_id
);
808 // Create a new entry in the char/word index.
809 WordIDSet word_id_set
;
810 word_id_set
.insert(word_id
);
811 char_word_map_
[uni_char
] = word_id_set
;
816 void URLIndexPrivateData::UpdateWordHistory(WordID word_id
,
817 HistoryID history_id
) {
818 WordIDHistoryMap::iterator history_pos
= word_id_history_map_
.find(word_id
);
819 DCHECK(history_pos
!= word_id_history_map_
.end());
820 HistoryIDSet
& history_id_set(history_pos
->second
);
821 history_id_set
.insert(history_id
);
822 AddToHistoryIDWordMap(history_id
, word_id
);
825 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id
,
827 HistoryIDWordMap::iterator iter
= history_id_word_map_
.find(history_id
);
828 if (iter
!= history_id_word_map_
.end()) {
829 WordIDSet
& word_id_set(iter
->second
);
830 word_id_set
.insert(word_id
);
832 WordIDSet word_id_set
;
833 word_id_set
.insert(word_id
);
834 history_id_word_map_
[history_id
] = word_id_set
;
838 void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow
& row
) {
839 RemoveRowWordsFromIndex(row
);
840 HistoryID history_id
= static_cast<HistoryID
>(row
.id());
841 history_info_map_
.erase(history_id
);
842 word_starts_map_
.erase(history_id
);
845 void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow
& row
) {
846 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
848 HistoryID history_id
= static_cast<HistoryID
>(row
.id());
849 WordIDSet word_id_set
= history_id_word_map_
[history_id
];
850 history_id_word_map_
.erase(history_id
);
852 // Reconcile any changes to word usage.
853 for (WordIDSet::iterator word_id_iter
= word_id_set
.begin();
854 word_id_iter
!= word_id_set
.end(); ++word_id_iter
) {
855 WordID word_id
= *word_id_iter
;
856 word_id_history_map_
[word_id
].erase(history_id
);
857 if (!word_id_history_map_
[word_id
].empty())
858 continue; // The word is still in use.
860 // The word is no longer in use. Reconcile any changes to character usage.
861 base::string16 word
= word_list_
[word_id
];
862 Char16Set characters
= Char16SetFromString16(word
);
863 for (Char16Set::iterator uni_char_iter
= characters
.begin();
864 uni_char_iter
!= characters
.end(); ++uni_char_iter
) {
865 base::char16 uni_char
= *uni_char_iter
;
866 char_word_map_
[uni_char
].erase(word_id
);
867 if (char_word_map_
[uni_char
].empty())
868 char_word_map_
.erase(uni_char
); // No longer in use.
871 // Complete the removal of references to the word.
872 word_id_history_map_
.erase(word_id
);
873 word_map_
.erase(word
);
874 word_list_
[word_id
] = base::string16();
875 available_words_
.insert(word_id
);
879 void URLIndexPrivateData::ResetSearchTermCache() {
880 for (SearchTermCacheMap::iterator iter
= search_term_cache_
.begin();
881 iter
!= search_term_cache_
.end(); ++iter
)
882 iter
->second
.used_
= false;
885 bool URLIndexPrivateData::SaveToFile(const base::FilePath
& file_path
) {
886 base::TimeTicks beginning_time
= base::TimeTicks::Now();
887 InMemoryURLIndexCacheItem index_cache
;
888 SavePrivateData(&index_cache
);
890 if (!index_cache
.SerializeToString(&data
)) {
891 LOG(WARNING
) << "Failed to serialize the InMemoryURLIndex cache.";
895 int size
= data
.size();
896 if (base::WriteFile(file_path
, data
.c_str(), size
) != size
) {
897 LOG(WARNING
) << "Failed to write " << file_path
.value();
900 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
901 base::TimeTicks::Now() - beginning_time
);
905 void URLIndexPrivateData::SavePrivateData(
906 InMemoryURLIndexCacheItem
* cache
) const {
908 cache
->set_last_rebuild_timestamp(
909 last_time_rebuilt_from_history_
.ToInternalValue());
910 cache
->set_version(saved_cache_version_
);
911 // history_item_count_ is no longer used but rather than change the protobuf
912 // definition use a placeholder. This will go away with the switch to SQLite.
913 cache
->set_history_item_count(0);
916 SaveCharWordMap(cache
);
917 SaveWordIDHistoryMap(cache
);
918 SaveHistoryInfoMap(cache
);
919 SaveWordStartsMap(cache
);
922 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem
* cache
) const {
923 if (word_list_
.empty())
925 WordListItem
* list_item
= cache
->mutable_word_list();
926 list_item
->set_word_count(word_list_
.size());
927 for (String16Vector::const_iterator iter
= word_list_
.begin();
928 iter
!= word_list_
.end(); ++iter
)
929 list_item
->add_word(base::UTF16ToUTF8(*iter
));
932 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem
* cache
) const {
933 if (word_map_
.empty())
935 WordMapItem
* map_item
= cache
->mutable_word_map();
936 map_item
->set_item_count(word_map_
.size());
937 for (WordMap::const_iterator iter
= word_map_
.begin();
938 iter
!= word_map_
.end(); ++iter
) {
939 WordMapEntry
* map_entry
= map_item
->add_word_map_entry();
940 map_entry
->set_word(base::UTF16ToUTF8(iter
->first
));
941 map_entry
->set_word_id(iter
->second
);
945 void URLIndexPrivateData::SaveCharWordMap(
946 InMemoryURLIndexCacheItem
* cache
) const {
947 if (char_word_map_
.empty())
949 CharWordMapItem
* map_item
= cache
->mutable_char_word_map();
950 map_item
->set_item_count(char_word_map_
.size());
951 for (CharWordIDMap::const_iterator iter
= char_word_map_
.begin();
952 iter
!= char_word_map_
.end(); ++iter
) {
953 CharWordMapEntry
* map_entry
= map_item
->add_char_word_map_entry();
954 map_entry
->set_char_16(iter
->first
);
955 const WordIDSet
& word_id_set(iter
->second
);
956 map_entry
->set_item_count(word_id_set
.size());
957 for (WordIDSet::const_iterator set_iter
= word_id_set
.begin();
958 set_iter
!= word_id_set
.end(); ++set_iter
)
959 map_entry
->add_word_id(*set_iter
);
963 void URLIndexPrivateData::SaveWordIDHistoryMap(
964 InMemoryURLIndexCacheItem
* cache
) const {
965 if (word_id_history_map_
.empty())
967 WordIDHistoryMapItem
* map_item
= cache
->mutable_word_id_history_map();
968 map_item
->set_item_count(word_id_history_map_
.size());
969 for (WordIDHistoryMap::const_iterator iter
= word_id_history_map_
.begin();
970 iter
!= word_id_history_map_
.end(); ++iter
) {
971 WordIDHistoryMapEntry
* map_entry
=
972 map_item
->add_word_id_history_map_entry();
973 map_entry
->set_word_id(iter
->first
);
974 const HistoryIDSet
& history_id_set(iter
->second
);
975 map_entry
->set_item_count(history_id_set
.size());
976 for (HistoryIDSet::const_iterator set_iter
= history_id_set
.begin();
977 set_iter
!= history_id_set
.end(); ++set_iter
)
978 map_entry
->add_history_id(*set_iter
);
982 void URLIndexPrivateData::SaveHistoryInfoMap(
983 InMemoryURLIndexCacheItem
* cache
) const {
984 if (history_info_map_
.empty())
986 HistoryInfoMapItem
* map_item
= cache
->mutable_history_info_map();
987 map_item
->set_item_count(history_info_map_
.size());
988 for (HistoryInfoMap::const_iterator iter
= history_info_map_
.begin();
989 iter
!= history_info_map_
.end(); ++iter
) {
990 HistoryInfoMapEntry
* map_entry
= map_item
->add_history_info_map_entry();
991 map_entry
->set_history_id(iter
->first
);
992 const history::URLRow
& url_row(iter
->second
.url_row
);
993 // Note: We only save information that contributes to the index so there
994 // is no need to save search_term_cache_ (not persistent).
995 map_entry
->set_visit_count(url_row
.visit_count());
996 map_entry
->set_typed_count(url_row
.typed_count());
997 map_entry
->set_last_visit(url_row
.last_visit().ToInternalValue());
998 map_entry
->set_url(url_row
.url().spec());
999 map_entry
->set_title(base::UTF16ToUTF8(url_row
.title()));
1000 const VisitInfoVector
& visits(iter
->second
.visits
);
1001 for (VisitInfoVector::const_iterator visit_iter
= visits
.begin();
1002 visit_iter
!= visits
.end(); ++visit_iter
) {
1003 HistoryInfoMapEntry_VisitInfo
* visit_info
= map_entry
->add_visits();
1004 visit_info
->set_visit_time(visit_iter
->first
.ToInternalValue());
1005 visit_info
->set_transition_type(visit_iter
->second
);
1010 void URLIndexPrivateData::SaveWordStartsMap(
1011 InMemoryURLIndexCacheItem
* cache
) const {
1012 if (word_starts_map_
.empty())
1014 // For unit testing: Enable saving of the cache as an earlier version to
1015 // allow testing of cache file upgrading in ReadFromFile().
1016 // TODO(mrossetti): Instead of intruding on production code with this kind of
1017 // test harness, save a copy of an older version cache with known results.
1018 // Implement this when switching the caching over to SQLite.
1019 if (saved_cache_version_
< 1)
1022 WordStartsMapItem
* map_item
= cache
->mutable_word_starts_map();
1023 map_item
->set_item_count(word_starts_map_
.size());
1024 for (WordStartsMap::const_iterator iter
= word_starts_map_
.begin();
1025 iter
!= word_starts_map_
.end(); ++iter
) {
1026 WordStartsMapEntry
* map_entry
= map_item
->add_word_starts_map_entry();
1027 map_entry
->set_history_id(iter
->first
);
1028 const RowWordStarts
& word_starts(iter
->second
);
1029 for (WordStarts::const_iterator i
= word_starts
.url_word_starts_
.begin();
1030 i
!= word_starts
.url_word_starts_
.end(); ++i
)
1031 map_entry
->add_url_word_starts(*i
);
1032 for (WordStarts::const_iterator i
= word_starts
.title_word_starts_
.begin();
1033 i
!= word_starts
.title_word_starts_
.end(); ++i
)
1034 map_entry
->add_title_word_starts(*i
);
1038 bool URLIndexPrivateData::RestorePrivateData(
1039 const InMemoryURLIndexCacheItem
& cache
,
1040 const std::string
& languages
) {
1041 last_time_rebuilt_from_history_
=
1042 base::Time::FromInternalValue(cache
.last_rebuild_timestamp());
1043 const base::TimeDelta rebuilt_ago
=
1044 base::Time::Now() - last_time_rebuilt_from_history_
;
1045 if ((rebuilt_ago
> base::TimeDelta::FromDays(7)) ||
1046 (rebuilt_ago
< base::TimeDelta::FromDays(-1))) {
1047 // Cache is more than a week old or, somehow, from some time in the future.
1048 // It's probably a good time to rebuild the index from history to
1049 // allow synced entries to now appear, expired entries to disappear, etc.
1050 // Allow one day in the future to make the cache not rebuild on simple
1051 // system clock changes such as time zone changes.
1054 if (cache
.has_version()) {
1055 if (cache
.version() < kCurrentCacheFileVersion
) {
1056 // Don't try to restore an old format cache file. (This will cause
1057 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1061 restored_cache_version_
= cache
.version();
1063 return RestoreWordList(cache
) && RestoreWordMap(cache
) &&
1064 RestoreCharWordMap(cache
) && RestoreWordIDHistoryMap(cache
) &&
1065 RestoreHistoryInfoMap(cache
) && RestoreWordStartsMap(cache
, languages
);
1068 bool URLIndexPrivateData::RestoreWordList(
1069 const InMemoryURLIndexCacheItem
& cache
) {
1070 if (!cache
.has_word_list())
1072 const WordListItem
& list_item(cache
.word_list());
1073 uint32 expected_item_count
= list_item
.word_count();
1074 uint32 actual_item_count
= list_item
.word_size();
1075 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1077 const RepeatedPtrField
<std::string
>& words(list_item
.word());
1078 for (RepeatedPtrField
<std::string
>::const_iterator iter
= words
.begin();
1079 iter
!= words
.end(); ++iter
)
1080 word_list_
.push_back(base::UTF8ToUTF16(*iter
));
1084 bool URLIndexPrivateData::RestoreWordMap(
1085 const InMemoryURLIndexCacheItem
& cache
) {
1086 if (!cache
.has_word_map())
1088 const WordMapItem
& list_item(cache
.word_map());
1089 uint32 expected_item_count
= list_item
.item_count();
1090 uint32 actual_item_count
= list_item
.word_map_entry_size();
1091 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1093 const RepeatedPtrField
<WordMapEntry
>& entries(list_item
.word_map_entry());
1094 for (RepeatedPtrField
<WordMapEntry
>::const_iterator iter
= entries
.begin();
1095 iter
!= entries
.end(); ++iter
)
1096 word_map_
[base::UTF8ToUTF16(iter
->word())] = iter
->word_id();
1100 bool URLIndexPrivateData::RestoreCharWordMap(
1101 const InMemoryURLIndexCacheItem
& cache
) {
1102 if (!cache
.has_char_word_map())
1104 const CharWordMapItem
& list_item(cache
.char_word_map());
1105 uint32 expected_item_count
= list_item
.item_count();
1106 uint32 actual_item_count
= list_item
.char_word_map_entry_size();
1107 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1109 const RepeatedPtrField
<CharWordMapEntry
>&
1110 entries(list_item
.char_word_map_entry());
1111 for (RepeatedPtrField
<CharWordMapEntry
>::const_iterator iter
=
1112 entries
.begin(); iter
!= entries
.end(); ++iter
) {
1113 expected_item_count
= iter
->item_count();
1114 actual_item_count
= iter
->word_id_size();
1115 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1117 base::char16 uni_char
= static_cast<base::char16
>(iter
->char_16());
1118 WordIDSet word_id_set
;
1119 const RepeatedField
<int32
>& word_ids(iter
->word_id());
1120 for (RepeatedField
<int32
>::const_iterator jiter
= word_ids
.begin();
1121 jiter
!= word_ids
.end(); ++jiter
)
1122 word_id_set
.insert(*jiter
);
1123 char_word_map_
[uni_char
] = word_id_set
;
1128 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1129 const InMemoryURLIndexCacheItem
& cache
) {
1130 if (!cache
.has_word_id_history_map())
1132 const WordIDHistoryMapItem
& list_item(cache
.word_id_history_map());
1133 uint32 expected_item_count
= list_item
.item_count();
1134 uint32 actual_item_count
= list_item
.word_id_history_map_entry_size();
1135 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1137 const RepeatedPtrField
<WordIDHistoryMapEntry
>&
1138 entries(list_item
.word_id_history_map_entry());
1139 for (RepeatedPtrField
<WordIDHistoryMapEntry
>::const_iterator iter
=
1140 entries
.begin(); iter
!= entries
.end(); ++iter
) {
1141 expected_item_count
= iter
->item_count();
1142 actual_item_count
= iter
->history_id_size();
1143 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1145 WordID word_id
= iter
->word_id();
1146 HistoryIDSet history_id_set
;
1147 const RepeatedField
<int64
>& history_ids(iter
->history_id());
1148 for (RepeatedField
<int64
>::const_iterator jiter
= history_ids
.begin();
1149 jiter
!= history_ids
.end(); ++jiter
) {
1150 history_id_set
.insert(*jiter
);
1151 AddToHistoryIDWordMap(*jiter
, word_id
);
1153 word_id_history_map_
[word_id
] = history_id_set
;
1158 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1159 const InMemoryURLIndexCacheItem
& cache
) {
1160 if (!cache
.has_history_info_map())
1162 const HistoryInfoMapItem
& list_item(cache
.history_info_map());
1163 uint32 expected_item_count
= list_item
.item_count();
1164 uint32 actual_item_count
= list_item
.history_info_map_entry_size();
1165 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1167 const RepeatedPtrField
<HistoryInfoMapEntry
>&
1168 entries(list_item
.history_info_map_entry());
1169 for (RepeatedPtrField
<HistoryInfoMapEntry
>::const_iterator iter
=
1170 entries
.begin(); iter
!= entries
.end(); ++iter
) {
1171 HistoryID history_id
= iter
->history_id();
1172 GURL
url(iter
->url());
1173 history::URLRow
url_row(url
, history_id
);
1174 url_row
.set_visit_count(iter
->visit_count());
1175 url_row
.set_typed_count(iter
->typed_count());
1176 url_row
.set_last_visit(base::Time::FromInternalValue(iter
->last_visit()));
1177 if (iter
->has_title()) {
1178 base::string16
title(base::UTF8ToUTF16(iter
->title()));
1179 url_row
.set_title(title
);
1181 history_info_map_
[history_id
].url_row
= url_row
;
1183 // Restore visits list.
1184 VisitInfoVector visits
;
1185 visits
.reserve(iter
->visits_size());
1186 for (int i
= 0; i
< iter
->visits_size(); ++i
) {
1187 visits
.push_back(std::make_pair(
1188 base::Time::FromInternalValue(iter
->visits(i
).visit_time()),
1189 ui::PageTransitionFromInt(iter
->visits(i
).transition_type())));
1191 history_info_map_
[history_id
].visits
= visits
;
1196 bool URLIndexPrivateData::RestoreWordStartsMap(
1197 const InMemoryURLIndexCacheItem
& cache
,
1198 const std::string
& languages
) {
1199 // Note that this function must be called after RestoreHistoryInfoMap() has
1200 // been run as the word starts may have to be recalculated from the urls and
1202 if (cache
.has_word_starts_map()) {
1203 const WordStartsMapItem
& list_item(cache
.word_starts_map());
1204 uint32 expected_item_count
= list_item
.item_count();
1205 uint32 actual_item_count
= list_item
.word_starts_map_entry_size();
1206 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1208 const RepeatedPtrField
<WordStartsMapEntry
>&
1209 entries(list_item
.word_starts_map_entry());
1210 for (RepeatedPtrField
<WordStartsMapEntry
>::const_iterator iter
=
1211 entries
.begin(); iter
!= entries
.end(); ++iter
) {
1212 HistoryID history_id
= iter
->history_id();
1213 RowWordStarts word_starts
;
1214 // Restore the URL word starts.
1215 const RepeatedField
<int32
>& url_starts(iter
->url_word_starts());
1216 for (RepeatedField
<int32
>::const_iterator jiter
= url_starts
.begin();
1217 jiter
!= url_starts
.end(); ++jiter
)
1218 word_starts
.url_word_starts_
.push_back(*jiter
);
1219 // Restore the page title word starts.
1220 const RepeatedField
<int32
>& title_starts(iter
->title_word_starts());
1221 for (RepeatedField
<int32
>::const_iterator jiter
= title_starts
.begin();
1222 jiter
!= title_starts
.end(); ++jiter
)
1223 word_starts
.title_word_starts_
.push_back(*jiter
);
1224 word_starts_map_
[history_id
] = word_starts
;
1227 // Since the cache did not contain any word starts we must rebuild then from
1228 // the URL and page titles.
1229 for (HistoryInfoMap::const_iterator iter
= history_info_map_
.begin();
1230 iter
!= history_info_map_
.end(); ++iter
) {
1231 RowWordStarts word_starts
;
1232 const history::URLRow
& row(iter
->second
.url_row
);
1233 const base::string16
& url
=
1234 bookmarks::CleanUpUrlForMatching(row
.url(), languages
, NULL
);
1235 String16VectorFromString16(url
, false, &word_starts
.url_word_starts_
);
1236 const base::string16
& title
=
1237 bookmarks::CleanUpTitleForMatching(row
.title());
1238 String16VectorFromString16(title
, false, &word_starts
.title_word_starts_
);
1239 word_starts_map_
[iter
->first
] = word_starts
;
1246 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1248 const std::set
<std::string
>& whitelist
) {
1249 return whitelist
.find(gurl
.scheme()) != whitelist
.end();
1253 // SearchTermCacheItem ---------------------------------------------------------
1255 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1256 const WordIDSet
& word_id_set
,
1257 const HistoryIDSet
& history_id_set
)
1258 : word_id_set_(word_id_set
), history_id_set_(history_id_set
), used_(true) {
1261 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {
1264 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {
1267 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1269 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1270 bookmarks::BookmarkModel
* bookmark_model
,
1271 const URLIndexPrivateData
& private_data
,
1272 const std::string
& languages
,
1273 const base::string16
& lower_string
,
1274 const String16Vector
& lower_terms
,
1275 const base::Time now
)
1276 : bookmark_model_(bookmark_model
),
1277 private_data_(private_data
),
1278 languages_(languages
),
1279 lower_string_(lower_string
),
1280 lower_terms_(lower_terms
),
1282 // Calculate offsets for each term. For instance, the offset for
1283 // ".net" should be 1, indicating that the actual word-part of the term
1284 // starts at offset 1.
1285 lower_terms_to_word_starts_offsets_
.resize(lower_terms_
.size(), 0u);
1286 for (size_t i
= 0; i
< lower_terms_
.size(); ++i
) {
1287 base::i18n::BreakIterator
iter(lower_terms_
[i
],
1288 base::i18n::BreakIterator::BREAK_WORD
);
1289 // If the iterator doesn't work, assume an offset of 0.
1292 // Find the first word start.
1293 while (iter
.Advance() && !iter
.IsWord()) {}
1295 lower_terms_to_word_starts_offsets_
[i
] = iter
.prev();
1296 // Else: the iterator didn't find a word break. Assume an offset of 0.
1300 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {
1303 void URLIndexPrivateData::AddHistoryMatch::operator()(
1304 const HistoryID history_id
) {
1305 HistoryInfoMap::const_iterator hist_pos
=
1306 private_data_
.history_info_map_
.find(history_id
);
1307 if (hist_pos
!= private_data_
.history_info_map_
.end()) {
1308 const history::URLRow
& hist_item
= hist_pos
->second
.url_row
;
1309 const VisitInfoVector
& visits
= hist_pos
->second
.visits
;
1310 WordStartsMap::const_iterator starts_pos
=
1311 private_data_
.word_starts_map_
.find(history_id
);
1312 DCHECK(starts_pos
!= private_data_
.word_starts_map_
.end());
1313 ScoredHistoryMatch
match(
1314 hist_item
, visits
, languages_
, lower_string_
, lower_terms_
,
1315 lower_terms_to_word_starts_offsets_
, starts_pos
->second
,
1316 bookmark_model_
&& bookmark_model_
->IsBookmarked(hist_item
.url()),
1318 if (match
.raw_score
> 0)
1319 scored_matches_
.push_back(match
);
1324 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1326 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1327 const HistoryInfoMap
& history_info_map
)
1328 : history_info_map_(history_info_map
) {
1331 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {
1334 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1336 const HistoryID h2
) {
1337 HistoryInfoMap::const_iterator
entry1(history_info_map_
.find(h1
));
1338 if (entry1
== history_info_map_
.end())
1340 HistoryInfoMap::const_iterator
entry2(history_info_map_
.find(h2
));
1341 if (entry2
== history_info_map_
.end())
1343 const history::URLRow
& r1(entry1
->second
.url_row
);
1344 const history::URLRow
& r2(entry2
->second
.url_row
);
1345 // First cut: typed count, visit count, recency.
1346 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1347 // recently visited (within the last 12/24 hours) as highly important. Get
1348 // input from mpearson.
1349 if (r1
.typed_count() != r2
.typed_count())
1350 return (r1
.typed_count() > r2
.typed_count());
1351 if (r1
.visit_count() != r2
.visit_count())
1352 return (r1
.visit_count() > r2
.visit_count());
1353 return (r1
.last_visit() > r2
.last_visit());