Add ENABLE_MEDIA_ROUTER define to builds other than Android and iOS.
[chromium-blink-merge.git] / chrome / browser / autocomplete / url_index_private_data.cc
blob2b1138003c83206c9044e2b228579104a3ebeee2
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/autocomplete/url_index_private_data.h"
7 #include <functional>
8 #include <iterator>
9 #include <limits>
10 #include <numeric>
11 #include <string>
12 #include <vector>
14 #include "base/basictypes.h"
15 #include "base/files/file_util.h"
16 #include "base/i18n/break_iterator.h"
17 #include "base/i18n/case_conversion.h"
18 #include "base/metrics/histogram.h"
19 #include "base/strings/string_util.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/time/time.h"
22 #include "chrome/browser/autocomplete/in_memory_url_index.h"
23 #include "components/bookmarks/browser/bookmark_model.h"
24 #include "components/bookmarks/browser/bookmark_utils.h"
25 #include "components/history/core/browser/history_database.h"
26 #include "components/history/core/browser/history_db_task.h"
27 #include "components/history/core/browser/history_service.h"
28 #include "net/base/net_util.h"
30 #if defined(USE_SYSTEM_PROTOBUF)
31 #include <google/protobuf/repeated_field.h>
32 #else
33 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
34 #endif
36 using google::protobuf::RepeatedField;
37 using google::protobuf::RepeatedPtrField;
38 using in_memory_url_index::InMemoryURLIndexCacheItem;
40 namespace {
41 static const size_t kMaxVisitsToStoreInCache = 10u;
44 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordListItem
45 WordListItem;
46 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry
47 WordMapEntry;
48 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
49 typedef in_memory_url_index::InMemoryURLIndexCacheItem_CharWordMapItem
50 CharWordMapItem;
51 typedef in_memory_url_index::
52 InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry CharWordMapEntry;
53 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
54 WordIDHistoryMapItem;
55 typedef in_memory_url_index::
56 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
57 WordIDHistoryMapEntry;
58 typedef in_memory_url_index::InMemoryURLIndexCacheItem_HistoryInfoMapItem
59 HistoryInfoMapItem;
60 typedef in_memory_url_index::
61 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
62 HistoryInfoMapEntry;
63 typedef in_memory_url_index::
64 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
65 HistoryInfoMapEntry_VisitInfo;
66 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordStartsMapItem
67 WordStartsMapItem;
68 typedef in_memory_url_index::
69 InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
70 WordStartsMapEntry;
72 // Algorithm Functions ---------------------------------------------------------
74 // Comparison function for sorting search terms by descending length.
75 bool LengthGreater(const base::string16& string_a,
76 const base::string16& string_b) {
77 return string_a.length() > string_b.length();
81 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
83 // HistoryDBTask used to update the recent visit data for a particular
84 // row from the history database.
85 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {
86 public:
87 explicit UpdateRecentVisitsFromHistoryDBTask(
88 URLIndexPrivateData* private_data,
89 history::URLID url_id);
91 bool RunOnDBThread(history::HistoryBackend* backend,
92 history::HistoryDatabase* db) override;
93 void DoneRunOnMainThread() override;
95 private:
96 ~UpdateRecentVisitsFromHistoryDBTask() override;
98 // The URLIndexPrivateData that gets updated after the historyDB
99 // task returns.
100 URLIndexPrivateData* private_data_;
101 // The ID of the URL to get visits for and then update.
102 history::URLID url_id_;
103 // Whether fetching the recent visits for the URL succeeded.
104 bool succeeded_;
105 // The awaited data that's shown to private_data_ for it to copy and
106 // store.
107 history::VisitVector recent_visits_;
109 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
112 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
113 URLIndexPrivateData* private_data,
114 history::URLID url_id)
115 : private_data_(private_data), url_id_(url_id), succeeded_(false) {
118 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
119 history::HistoryBackend* backend,
120 history::HistoryDatabase* db) {
121 // Make sure the private data is going to get as many recent visits as
122 // ScoredHistoryMatch::GetFrequency() hopes to use.
123 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
124 succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
125 kMaxVisitsToStoreInCache,
126 &recent_visits_);
127 if (!succeeded_)
128 recent_visits_.clear();
129 return true; // Always claim to be done; do not retry failures.
132 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
133 if (succeeded_)
134 private_data_->UpdateRecentVisits(url_id_, recent_visits_);
137 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
141 // URLIndexPrivateData ---------------------------------------------------------
143 URLIndexPrivateData::URLIndexPrivateData()
144 : restored_cache_version_(0),
145 saved_cache_version_(kCurrentCacheFileVersion),
146 pre_filter_item_count_(0),
147 post_filter_item_count_(0),
148 post_scoring_item_count_(0) {
151 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
152 base::string16 search_string,
153 size_t cursor_position,
154 size_t max_matches,
155 const std::string& languages,
156 bookmarks::BookmarkModel* bookmark_model) {
157 // If cursor position is set and useful (not at either end of the
158 // string), allow the search string to be broken at cursor position.
159 // We do this by pretending there's a space where the cursor is.
160 if ((cursor_position != base::string16::npos) &&
161 (cursor_position < search_string.length()) &&
162 (cursor_position > 0)) {
163 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
165 pre_filter_item_count_ = 0;
166 post_filter_item_count_ = 0;
167 post_scoring_item_count_ = 0;
168 // The search string we receive may contain escaped characters. For reducing
169 // the index we need individual, lower-cased words, ignoring escapings. For
170 // the final filtering we need whitespace separated substrings possibly
171 // containing escaped characters.
172 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
173 base::string16 lower_unescaped_string =
174 net::UnescapeURLComponent(lower_raw_string,
175 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
176 // Extract individual 'words' (as opposed to 'terms'; see below) from the
177 // search string. When the user types "colspec=ID%20Mstone Release" we get
178 // four 'words': "colspec", "id", "mstone" and "release".
179 String16Vector lower_words(
180 String16VectorFromString16(lower_unescaped_string, false, NULL));
181 ScoredHistoryMatches scored_items;
183 // Do nothing if we have indexed no words (probably because we've not been
184 // initialized yet) or the search string has no words.
185 if (word_list_.empty() || lower_words.empty()) {
186 search_term_cache_.clear(); // Invalidate the term cache.
187 return scored_items;
190 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
191 // approach.
192 ResetSearchTermCache();
194 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
196 // Trim the candidate pool if it is large. Note that we do not filter out
197 // items that do not contain the search terms as proper substrings -- doing
198 // so is the performance-costly operation we are trying to avoid in order
199 // to maintain omnibox responsiveness.
200 const size_t kItemsToScoreLimit = 500;
201 pre_filter_item_count_ = history_id_set.size();
202 // If we trim the results set we do not want to cache the results for next
203 // time as the user's ultimately desired result could easily be eliminated
204 // in this early rough filter.
205 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
206 if (was_trimmed) {
207 HistoryIDVector history_ids;
208 std::copy(history_id_set.begin(), history_id_set.end(),
209 std::back_inserter(history_ids));
210 // Trim down the set by sorting by typed-count, visit-count, and last
211 // visit.
212 HistoryItemFactorGreater
213 item_factor_functor(history_info_map_);
214 std::partial_sort(history_ids.begin(),
215 history_ids.begin() + kItemsToScoreLimit,
216 history_ids.end(),
217 item_factor_functor);
218 history_id_set.clear();
219 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
220 std::inserter(history_id_set, history_id_set.end()));
221 post_filter_item_count_ = history_id_set.size();
224 // Pass over all of the candidates filtering out any without a proper
225 // substring match, inserting those which pass in order by score. Note that
226 // in this step we are using the raw search string complete with escaped
227 // URL elements. When the user has specifically typed something akin to
228 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
229 // specific substring appears in the URL or page title.
231 // We call these 'terms' (as opposed to 'words'; see above) as in this case
232 // we only want to break up the search string on 'true' whitespace rather than
233 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
234 // get two 'terms': "colspec=id%20mstone" and "release".
235 String16Vector lower_raw_terms;
236 if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
237 &lower_raw_terms) == 0) {
238 // Don't score matches when there are no terms to score against. (It's
239 // possible that the word break iterater that extracts words to search
240 // for in the database allows some whitespace "words" whereas Tokenize
241 // excludes a long list of whitespace.) One could write a scoring
242 // function that gives a reasonable order to matches when there
243 // are no terms (i.e., all the words are some form of whitespace),
244 // but this is such a rare edge case that it's not worth the time.
245 return scored_items;
247 scored_items =
248 std::for_each(
249 history_id_set.begin(), history_id_set.end(),
250 AddHistoryMatch(bookmark_model, *this, languages, lower_raw_string,
251 lower_raw_terms, base::Time::Now())).ScoredMatches();
253 // Select and sort only the top |max_matches| results.
254 if (scored_items.size() > max_matches) {
255 std::partial_sort(scored_items.begin(),
256 scored_items.begin() +
257 max_matches,
258 scored_items.end(),
259 ScoredHistoryMatch::MatchScoreGreater);
260 scored_items.resize(max_matches);
261 } else {
262 std::sort(scored_items.begin(), scored_items.end(),
263 ScoredHistoryMatch::MatchScoreGreater);
265 post_scoring_item_count_ = scored_items.size();
267 if (was_trimmed) {
268 search_term_cache_.clear(); // Invalidate the term cache.
269 } else {
270 // Remove any stale SearchTermCacheItems.
271 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
272 cache_iter != search_term_cache_.end(); ) {
273 if (!cache_iter->second.used_)
274 search_term_cache_.erase(cache_iter++);
275 else
276 ++cache_iter;
280 return scored_items;
283 bool URLIndexPrivateData::UpdateURL(
284 history::HistoryService* history_service,
285 const history::URLRow& row,
286 const std::string& languages,
287 const std::set<std::string>& scheme_whitelist,
288 base::CancelableTaskTracker* tracker) {
289 // The row may or may not already be in our index. If it is not already
290 // indexed and it qualifies then it gets indexed. If it is already
291 // indexed and still qualifies then it gets updated, otherwise it
292 // is deleted from the index.
293 bool row_was_updated = false;
294 history::URLID row_id = row.id();
295 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
296 if (row_pos == history_info_map_.end()) {
297 // This new row should be indexed if it qualifies.
298 history::URLRow new_row(row);
299 new_row.set_id(row_id);
300 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
301 IndexRow(NULL,
302 history_service,
303 new_row,
304 languages,
305 scheme_whitelist,
306 tracker);
307 } else if (RowQualifiesAsSignificant(row, base::Time())) {
308 // This indexed row still qualifies and will be re-indexed.
309 // The url won't have changed but the title, visit count, etc.
310 // might have changed.
311 history::URLRow& row_to_update = row_pos->second.url_row;
312 bool title_updated = row_to_update.title() != row.title();
313 if (row_to_update.visit_count() != row.visit_count() ||
314 row_to_update.typed_count() != row.typed_count() ||
315 row_to_update.last_visit() != row.last_visit() || title_updated) {
316 row_to_update.set_visit_count(row.visit_count());
317 row_to_update.set_typed_count(row.typed_count());
318 row_to_update.set_last_visit(row.last_visit());
319 // If something appears to have changed, update the recent visits
320 // information.
321 ScheduleUpdateRecentVisits(history_service, row_id, tracker);
322 // While the URL is guaranteed to remain stable, the title may have
323 // changed. If so, then update the index with the changed words.
324 if (title_updated) {
325 // Clear all words associated with this row and re-index both the
326 // URL and title.
327 RemoveRowWordsFromIndex(row_to_update);
328 row_to_update.set_title(row.title());
329 RowWordStarts word_starts;
330 AddRowWordsToIndex(row_to_update, &word_starts, languages);
331 word_starts_map_[row_id] = word_starts;
333 row_was_updated = true;
335 } else {
336 // This indexed row no longer qualifies and will be de-indexed by
337 // clearing all words associated with this row.
338 RemoveRowFromIndex(row);
339 row_was_updated = true;
341 if (row_was_updated)
342 search_term_cache_.clear(); // This invalidates the cache.
343 return row_was_updated;
346 void URLIndexPrivateData::UpdateRecentVisits(
347 history::URLID url_id,
348 const history::VisitVector& recent_visits) {
349 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
350 if (row_pos != history_info_map_.end()) {
351 VisitInfoVector* visits = &row_pos->second.visits;
352 visits->clear();
353 const size_t size =
354 std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
355 visits->reserve(size);
356 for (size_t i = 0; i < size; i++) {
357 // Copy from the history::VisitVector the only fields visits needs.
358 visits->push_back(std::make_pair(recent_visits[i].visit_time,
359 recent_visits[i].transition));
362 // Else: Oddly, the URL doesn't seem to exist in the private index.
363 // Ignore this update. This can happen if, for instance, the user
364 // removes the URL from URLIndexPrivateData before the historyDB call
365 // returns.
368 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
369 history::HistoryService* history_service,
370 history::URLID url_id,
371 base::CancelableTaskTracker* tracker) {
372 history_service->ScheduleDBTask(
373 scoped_ptr<history::HistoryDBTask>(
374 new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
377 // Helper functor for DeleteURL.
378 class HistoryInfoMapItemHasURL {
379 public:
380 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
382 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
383 return item.second.url_row.url() == url_;
386 private:
387 const GURL& url_;
390 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
391 // Find the matching entry in the history_info_map_.
392 HistoryInfoMap::iterator pos = std::find_if(
393 history_info_map_.begin(),
394 history_info_map_.end(),
395 HistoryInfoMapItemHasURL(url));
396 if (pos == history_info_map_.end())
397 return false;
398 RemoveRowFromIndex(pos->second.url_row);
399 search_term_cache_.clear(); // This invalidates the cache.
400 return true;
403 // static
404 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
405 const base::FilePath& file_path,
406 const std::string& languages) {
407 base::TimeTicks beginning_time = base::TimeTicks::Now();
408 if (!base::PathExists(file_path))
409 return NULL;
410 std::string data;
411 // If there is no cache file then simply give up. This will cause us to
412 // attempt to rebuild from the history database.
413 if (!base::ReadFileToString(file_path, &data))
414 return NULL;
416 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
417 InMemoryURLIndexCacheItem index_cache;
418 if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
419 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
420 << file_path.value();
421 return restored_data;
424 if (!restored_data->RestorePrivateData(index_cache, languages))
425 return NULL;
427 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
428 base::TimeTicks::Now() - beginning_time);
429 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
430 restored_data->history_id_word_map_.size());
431 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
432 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
433 restored_data->word_map_.size());
434 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
435 restored_data->char_word_map_.size());
436 if (restored_data->Empty())
437 return NULL; // 'No data' is the same as a failed reload.
438 return restored_data;
441 // static
442 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
443 history::HistoryDatabase* history_db,
444 const std::string& languages,
445 const std::set<std::string>& scheme_whitelist) {
446 if (!history_db)
447 return NULL;
449 base::TimeTicks beginning_time = base::TimeTicks::Now();
451 scoped_refptr<URLIndexPrivateData>
452 rebuilt_data(new URLIndexPrivateData);
453 history::URLDatabase::URLEnumerator history_enum;
454 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
455 return NULL;
456 rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
457 for (history::URLRow row; history_enum.GetNextURL(&row);) {
458 rebuilt_data->IndexRow(
459 history_db, NULL, row, languages, scheme_whitelist, NULL);
462 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
463 base::TimeTicks::Now() - beginning_time);
464 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
465 rebuilt_data->history_id_word_map_.size());
466 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
467 rebuilt_data->word_map_.size());
468 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
469 rebuilt_data->char_word_map_.size());
470 return rebuilt_data;
473 // static
474 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
475 scoped_refptr<URLIndexPrivateData> private_data,
476 const base::FilePath& file_path) {
477 DCHECK(private_data.get());
478 DCHECK(!file_path.empty());
479 return private_data->SaveToFile(file_path);
482 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
483 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
484 data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
485 data_copy->word_list_ = word_list_;
486 data_copy->available_words_ = available_words_;
487 data_copy->word_map_ = word_map_;
488 data_copy->char_word_map_ = char_word_map_;
489 data_copy->word_id_history_map_ = word_id_history_map_;
490 data_copy->history_id_word_map_ = history_id_word_map_;
491 data_copy->history_info_map_ = history_info_map_;
492 data_copy->word_starts_map_ = word_starts_map_;
493 return data_copy;
494 // Not copied:
495 // search_term_cache_
496 // pre_filter_item_count_
497 // post_filter_item_count_
498 // post_scoring_item_count_
501 bool URLIndexPrivateData::Empty() const {
502 return history_info_map_.empty();
505 void URLIndexPrivateData::Clear() {
506 last_time_rebuilt_from_history_ = base::Time();
507 word_list_.clear();
508 available_words_.clear();
509 word_map_.clear();
510 char_word_map_.clear();
511 word_id_history_map_.clear();
512 history_id_word_map_.clear();
513 history_info_map_.clear();
514 word_starts_map_.clear();
517 URLIndexPrivateData::~URLIndexPrivateData() {}
519 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
520 const String16Vector& unsorted_words) {
521 // Break the terms down into individual terms (words), get the candidate
522 // set for each term, and intersect each to get a final candidate list.
523 // Note that a single 'term' from the user's perspective might be
524 // a string like "http://www.somewebsite.com" which, from our perspective,
525 // is four words: 'http', 'www', 'somewebsite', and 'com'.
526 HistoryIDSet history_id_set;
527 String16Vector words(unsorted_words);
528 // Sort the words into the longest first as such are likely to narrow down
529 // the results quicker. Also, single character words are the most expensive
530 // to process so save them for last.
531 std::sort(words.begin(), words.end(), LengthGreater);
532 for (String16Vector::iterator iter = words.begin(); iter != words.end();
533 ++iter) {
534 base::string16 uni_word = *iter;
535 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
536 if (term_history_set.empty()) {
537 history_id_set.clear();
538 break;
540 if (iter == words.begin()) {
541 history_id_set.swap(term_history_set);
542 } else {
543 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
544 history_id_set, term_history_set);
545 history_id_set.swap(new_history_id_set);
548 return history_id_set;
551 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
552 const base::string16& term) {
553 if (term.empty())
554 return HistoryIDSet();
556 // TODO(mrossetti): Consider optimizing for very common terms such as
557 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
558 // occuring words in the user's searches.
560 size_t term_length = term.length();
561 WordIDSet word_id_set;
562 if (term_length > 1) {
563 // See if this term or a prefix thereof is present in the cache.
564 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
565 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
566 cache_iter != search_term_cache_.end(); ++cache_iter) {
567 if (StartsWith(term, cache_iter->first, false) &&
568 (best_prefix == search_term_cache_.end() ||
569 cache_iter->first.length() > best_prefix->first.length()))
570 best_prefix = cache_iter;
573 // If a prefix was found then determine the leftover characters to be used
574 // for further refining the results from that prefix.
575 Char16Set prefix_chars;
576 base::string16 leftovers(term);
577 if (best_prefix != search_term_cache_.end()) {
578 // If the prefix is an exact match for the term then grab the cached
579 // results and we're done.
580 size_t prefix_length = best_prefix->first.length();
581 if (prefix_length == term_length) {
582 best_prefix->second.used_ = true;
583 return best_prefix->second.history_id_set_;
586 // Otherwise we have a handy starting point.
587 // If there are no history results for this prefix then we can bail early
588 // as there will be no history results for the full term.
589 if (best_prefix->second.history_id_set_.empty()) {
590 search_term_cache_[term] = SearchTermCacheItem();
591 return HistoryIDSet();
593 word_id_set = best_prefix->second.word_id_set_;
594 prefix_chars = Char16SetFromString16(best_prefix->first);
595 leftovers = term.substr(prefix_length);
598 // Filter for each remaining, unique character in the term.
599 Char16Set leftover_chars = Char16SetFromString16(leftovers);
600 Char16Set unique_chars =
601 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
603 // Reduce the word set with any leftover, unprocessed characters.
604 if (!unique_chars.empty()) {
605 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
606 // We might come up empty on the leftovers.
607 if (leftover_set.empty()) {
608 search_term_cache_[term] = SearchTermCacheItem();
609 return HistoryIDSet();
611 // Or there may not have been a prefix from which to start.
612 if (prefix_chars.empty()) {
613 word_id_set.swap(leftover_set);
614 } else {
615 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
616 word_id_set, leftover_set);
617 word_id_set.swap(new_word_id_set);
621 // We must filter the word list because the resulting word set surely
622 // contains words which do not have the search term as a proper subset.
623 for (WordIDSet::iterator word_set_iter = word_id_set.begin();
624 word_set_iter != word_id_set.end(); ) {
625 if (word_list_[*word_set_iter].find(term) == base::string16::npos)
626 word_id_set.erase(word_set_iter++);
627 else
628 ++word_set_iter;
630 } else {
631 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
634 // If any words resulted then we can compose a set of history IDs by unioning
635 // the sets from each word.
636 HistoryIDSet history_id_set;
637 if (!word_id_set.empty()) {
638 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
639 word_id_iter != word_id_set.end(); ++word_id_iter) {
640 WordID word_id = *word_id_iter;
641 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
642 if (word_iter != word_id_history_map_.end()) {
643 HistoryIDSet& word_history_id_set(word_iter->second);
644 history_id_set.insert(word_history_id_set.begin(),
645 word_history_id_set.end());
650 // Record a new cache entry for this word if the term is longer than
651 // a single character.
652 if (term_length > 1)
653 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
655 return history_id_set;
658 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
659 const Char16Set& term_chars) {
660 WordIDSet word_id_set;
661 for (Char16Set::const_iterator c_iter = term_chars.begin();
662 c_iter != term_chars.end(); ++c_iter) {
663 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
664 if (char_iter == char_word_map_.end()) {
665 // A character was not found so there are no matching results: bail.
666 word_id_set.clear();
667 break;
669 WordIDSet& char_word_id_set(char_iter->second);
670 // It is possible for there to no longer be any words associated with
671 // a particular character. Give up in that case.
672 if (char_word_id_set.empty()) {
673 word_id_set.clear();
674 break;
677 if (c_iter == term_chars.begin()) {
678 // First character results becomes base set of results.
679 word_id_set = char_word_id_set;
680 } else {
681 // Subsequent character results get intersected in.
682 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
683 word_id_set, char_word_id_set);
684 word_id_set.swap(new_word_id_set);
687 return word_id_set;
690 bool URLIndexPrivateData::IndexRow(
691 history::HistoryDatabase* history_db,
692 history::HistoryService* history_service,
693 const history::URLRow& row,
694 const std::string& languages,
695 const std::set<std::string>& scheme_whitelist,
696 base::CancelableTaskTracker* tracker) {
697 const GURL& gurl(row.url());
699 // Index only URLs with a whitelisted scheme.
700 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
701 return false;
703 history::URLID row_id = row.id();
704 // Strip out username and password before saving and indexing.
705 base::string16 url(net::FormatUrl(gurl, languages,
706 net::kFormatUrlOmitUsernamePassword,
707 net::UnescapeRule::NONE,
708 NULL, NULL, NULL));
710 HistoryID history_id = static_cast<HistoryID>(row_id);
711 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
713 // Add the row for quick lookup in the history info store.
714 history::URLRow new_row(GURL(url), row_id);
715 new_row.set_visit_count(row.visit_count());
716 new_row.set_typed_count(row.typed_count());
717 new_row.set_last_visit(row.last_visit());
718 new_row.set_title(row.title());
719 history_info_map_[history_id].url_row = new_row;
721 // Index the words contained in the URL and title of the row.
722 RowWordStarts word_starts;
723 AddRowWordsToIndex(new_row, &word_starts, languages);
724 word_starts_map_[history_id] = word_starts;
726 // Update the recent visits information or schedule the update
727 // as appropriate.
728 if (history_db) {
729 // We'd like to check that we're on the history DB thread.
730 // However, unittest code actually calls this on the UI thread.
731 // So we don't do any thread checks.
732 history::VisitVector recent_visits;
733 // Make sure the private data is going to get as many recent visits as
734 // ScoredHistoryMatch::GetFrequency() hopes to use.
735 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
736 if (history_db->GetMostRecentVisitsForURL(row_id,
737 kMaxVisitsToStoreInCache,
738 &recent_visits))
739 UpdateRecentVisits(row_id, recent_visits);
740 } else {
741 DCHECK(tracker);
742 DCHECK(history_service);
743 ScheduleUpdateRecentVisits(history_service, row_id, tracker);
746 return true;
749 void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow& row,
750 RowWordStarts* word_starts,
751 const std::string& languages) {
752 HistoryID history_id = static_cast<HistoryID>(row.id());
753 // Split URL into individual, unique words then add in the title words.
754 const GURL& gurl(row.url());
755 const base::string16& url =
756 bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
757 String16Set url_words = String16SetFromString16(url,
758 word_starts ? &word_starts->url_word_starts_ : NULL);
759 const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
760 String16Set title_words = String16SetFromString16(title,
761 word_starts ? &word_starts->title_word_starts_ : NULL);
762 String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
763 for (String16Set::iterator word_iter = words.begin();
764 word_iter != words.end(); ++word_iter)
765 AddWordToIndex(*word_iter, history_id);
767 search_term_cache_.clear(); // Invalidate the term cache.
770 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
771 HistoryID history_id) {
772 WordMap::iterator word_pos = word_map_.find(term);
773 if (word_pos != word_map_.end())
774 UpdateWordHistory(word_pos->second, history_id);
775 else
776 AddWordHistory(term, history_id);
779 void URLIndexPrivateData::AddWordHistory(const base::string16& term,
780 HistoryID history_id) {
781 WordID word_id = word_list_.size();
782 if (available_words_.empty()) {
783 word_list_.push_back(term);
784 } else {
785 word_id = *(available_words_.begin());
786 word_list_[word_id] = term;
787 available_words_.erase(word_id);
789 word_map_[term] = word_id;
791 HistoryIDSet history_id_set;
792 history_id_set.insert(history_id);
793 word_id_history_map_[word_id] = history_id_set;
794 AddToHistoryIDWordMap(history_id, word_id);
796 // For each character in the newly added word (i.e. a word that is not
797 // already in the word index), add the word to the character index.
798 Char16Set characters = Char16SetFromString16(term);
799 for (Char16Set::iterator uni_char_iter = characters.begin();
800 uni_char_iter != characters.end(); ++uni_char_iter) {
801 base::char16 uni_char = *uni_char_iter;
802 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
803 if (char_iter != char_word_map_.end()) {
804 // Update existing entry in the char/word index.
805 WordIDSet& word_id_set(char_iter->second);
806 word_id_set.insert(word_id);
807 } else {
808 // Create a new entry in the char/word index.
809 WordIDSet word_id_set;
810 word_id_set.insert(word_id);
811 char_word_map_[uni_char] = word_id_set;
816 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
817 HistoryID history_id) {
818 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
819 DCHECK(history_pos != word_id_history_map_.end());
820 HistoryIDSet& history_id_set(history_pos->second);
821 history_id_set.insert(history_id);
822 AddToHistoryIDWordMap(history_id, word_id);
825 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
826 WordID word_id) {
827 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
828 if (iter != history_id_word_map_.end()) {
829 WordIDSet& word_id_set(iter->second);
830 word_id_set.insert(word_id);
831 } else {
832 WordIDSet word_id_set;
833 word_id_set.insert(word_id);
834 history_id_word_map_[history_id] = word_id_set;
838 void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow& row) {
839 RemoveRowWordsFromIndex(row);
840 HistoryID history_id = static_cast<HistoryID>(row.id());
841 history_info_map_.erase(history_id);
842 word_starts_map_.erase(history_id);
845 void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow& row) {
846 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
847 // this row.
848 HistoryID history_id = static_cast<HistoryID>(row.id());
849 WordIDSet word_id_set = history_id_word_map_[history_id];
850 history_id_word_map_.erase(history_id);
852 // Reconcile any changes to word usage.
853 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
854 word_id_iter != word_id_set.end(); ++word_id_iter) {
855 WordID word_id = *word_id_iter;
856 word_id_history_map_[word_id].erase(history_id);
857 if (!word_id_history_map_[word_id].empty())
858 continue; // The word is still in use.
860 // The word is no longer in use. Reconcile any changes to character usage.
861 base::string16 word = word_list_[word_id];
862 Char16Set characters = Char16SetFromString16(word);
863 for (Char16Set::iterator uni_char_iter = characters.begin();
864 uni_char_iter != characters.end(); ++uni_char_iter) {
865 base::char16 uni_char = *uni_char_iter;
866 char_word_map_[uni_char].erase(word_id);
867 if (char_word_map_[uni_char].empty())
868 char_word_map_.erase(uni_char); // No longer in use.
871 // Complete the removal of references to the word.
872 word_id_history_map_.erase(word_id);
873 word_map_.erase(word);
874 word_list_[word_id] = base::string16();
875 available_words_.insert(word_id);
879 void URLIndexPrivateData::ResetSearchTermCache() {
880 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
881 iter != search_term_cache_.end(); ++iter)
882 iter->second.used_ = false;
885 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
886 base::TimeTicks beginning_time = base::TimeTicks::Now();
887 InMemoryURLIndexCacheItem index_cache;
888 SavePrivateData(&index_cache);
889 std::string data;
890 if (!index_cache.SerializeToString(&data)) {
891 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
892 return false;
895 int size = data.size();
896 if (base::WriteFile(file_path, data.c_str(), size) != size) {
897 LOG(WARNING) << "Failed to write " << file_path.value();
898 return false;
900 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
901 base::TimeTicks::Now() - beginning_time);
902 return true;
905 void URLIndexPrivateData::SavePrivateData(
906 InMemoryURLIndexCacheItem* cache) const {
907 DCHECK(cache);
908 cache->set_last_rebuild_timestamp(
909 last_time_rebuilt_from_history_.ToInternalValue());
910 cache->set_version(saved_cache_version_);
911 // history_item_count_ is no longer used but rather than change the protobuf
912 // definition use a placeholder. This will go away with the switch to SQLite.
913 cache->set_history_item_count(0);
914 SaveWordList(cache);
915 SaveWordMap(cache);
916 SaveCharWordMap(cache);
917 SaveWordIDHistoryMap(cache);
918 SaveHistoryInfoMap(cache);
919 SaveWordStartsMap(cache);
922 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
923 if (word_list_.empty())
924 return;
925 WordListItem* list_item = cache->mutable_word_list();
926 list_item->set_word_count(word_list_.size());
927 for (String16Vector::const_iterator iter = word_list_.begin();
928 iter != word_list_.end(); ++iter)
929 list_item->add_word(base::UTF16ToUTF8(*iter));
932 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
933 if (word_map_.empty())
934 return;
935 WordMapItem* map_item = cache->mutable_word_map();
936 map_item->set_item_count(word_map_.size());
937 for (WordMap::const_iterator iter = word_map_.begin();
938 iter != word_map_.end(); ++iter) {
939 WordMapEntry* map_entry = map_item->add_word_map_entry();
940 map_entry->set_word(base::UTF16ToUTF8(iter->first));
941 map_entry->set_word_id(iter->second);
945 void URLIndexPrivateData::SaveCharWordMap(
946 InMemoryURLIndexCacheItem* cache) const {
947 if (char_word_map_.empty())
948 return;
949 CharWordMapItem* map_item = cache->mutable_char_word_map();
950 map_item->set_item_count(char_word_map_.size());
951 for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
952 iter != char_word_map_.end(); ++iter) {
953 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
954 map_entry->set_char_16(iter->first);
955 const WordIDSet& word_id_set(iter->second);
956 map_entry->set_item_count(word_id_set.size());
957 for (WordIDSet::const_iterator set_iter = word_id_set.begin();
958 set_iter != word_id_set.end(); ++set_iter)
959 map_entry->add_word_id(*set_iter);
963 void URLIndexPrivateData::SaveWordIDHistoryMap(
964 InMemoryURLIndexCacheItem* cache) const {
965 if (word_id_history_map_.empty())
966 return;
967 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
968 map_item->set_item_count(word_id_history_map_.size());
969 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
970 iter != word_id_history_map_.end(); ++iter) {
971 WordIDHistoryMapEntry* map_entry =
972 map_item->add_word_id_history_map_entry();
973 map_entry->set_word_id(iter->first);
974 const HistoryIDSet& history_id_set(iter->second);
975 map_entry->set_item_count(history_id_set.size());
976 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
977 set_iter != history_id_set.end(); ++set_iter)
978 map_entry->add_history_id(*set_iter);
982 void URLIndexPrivateData::SaveHistoryInfoMap(
983 InMemoryURLIndexCacheItem* cache) const {
984 if (history_info_map_.empty())
985 return;
986 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
987 map_item->set_item_count(history_info_map_.size());
988 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
989 iter != history_info_map_.end(); ++iter) {
990 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
991 map_entry->set_history_id(iter->first);
992 const history::URLRow& url_row(iter->second.url_row);
993 // Note: We only save information that contributes to the index so there
994 // is no need to save search_term_cache_ (not persistent).
995 map_entry->set_visit_count(url_row.visit_count());
996 map_entry->set_typed_count(url_row.typed_count());
997 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
998 map_entry->set_url(url_row.url().spec());
999 map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
1000 const VisitInfoVector& visits(iter->second.visits);
1001 for (VisitInfoVector::const_iterator visit_iter = visits.begin();
1002 visit_iter != visits.end(); ++visit_iter) {
1003 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1004 visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1005 visit_info->set_transition_type(visit_iter->second);
1010 void URLIndexPrivateData::SaveWordStartsMap(
1011 InMemoryURLIndexCacheItem* cache) const {
1012 if (word_starts_map_.empty())
1013 return;
1014 // For unit testing: Enable saving of the cache as an earlier version to
1015 // allow testing of cache file upgrading in ReadFromFile().
1016 // TODO(mrossetti): Instead of intruding on production code with this kind of
1017 // test harness, save a copy of an older version cache with known results.
1018 // Implement this when switching the caching over to SQLite.
1019 if (saved_cache_version_ < 1)
1020 return;
1022 WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1023 map_item->set_item_count(word_starts_map_.size());
1024 for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1025 iter != word_starts_map_.end(); ++iter) {
1026 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1027 map_entry->set_history_id(iter->first);
1028 const RowWordStarts& word_starts(iter->second);
1029 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1030 i != word_starts.url_word_starts_.end(); ++i)
1031 map_entry->add_url_word_starts(*i);
1032 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1033 i != word_starts.title_word_starts_.end(); ++i)
1034 map_entry->add_title_word_starts(*i);
1038 bool URLIndexPrivateData::RestorePrivateData(
1039 const InMemoryURLIndexCacheItem& cache,
1040 const std::string& languages) {
1041 last_time_rebuilt_from_history_ =
1042 base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1043 const base::TimeDelta rebuilt_ago =
1044 base::Time::Now() - last_time_rebuilt_from_history_;
1045 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1046 (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1047 // Cache is more than a week old or, somehow, from some time in the future.
1048 // It's probably a good time to rebuild the index from history to
1049 // allow synced entries to now appear, expired entries to disappear, etc.
1050 // Allow one day in the future to make the cache not rebuild on simple
1051 // system clock changes such as time zone changes.
1052 return false;
1054 if (cache.has_version()) {
1055 if (cache.version() < kCurrentCacheFileVersion) {
1056 // Don't try to restore an old format cache file. (This will cause
1057 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1058 // from history.)
1059 return false;
1061 restored_cache_version_ = cache.version();
1063 return RestoreWordList(cache) && RestoreWordMap(cache) &&
1064 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1065 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
1068 bool URLIndexPrivateData::RestoreWordList(
1069 const InMemoryURLIndexCacheItem& cache) {
1070 if (!cache.has_word_list())
1071 return false;
1072 const WordListItem& list_item(cache.word_list());
1073 uint32 expected_item_count = list_item.word_count();
1074 uint32 actual_item_count = list_item.word_size();
1075 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1076 return false;
1077 const RepeatedPtrField<std::string>& words(list_item.word());
1078 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1079 iter != words.end(); ++iter)
1080 word_list_.push_back(base::UTF8ToUTF16(*iter));
1081 return true;
1084 bool URLIndexPrivateData::RestoreWordMap(
1085 const InMemoryURLIndexCacheItem& cache) {
1086 if (!cache.has_word_map())
1087 return false;
1088 const WordMapItem& list_item(cache.word_map());
1089 uint32 expected_item_count = list_item.item_count();
1090 uint32 actual_item_count = list_item.word_map_entry_size();
1091 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1092 return false;
1093 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1094 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1095 iter != entries.end(); ++iter)
1096 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
1097 return true;
1100 bool URLIndexPrivateData::RestoreCharWordMap(
1101 const InMemoryURLIndexCacheItem& cache) {
1102 if (!cache.has_char_word_map())
1103 return false;
1104 const CharWordMapItem& list_item(cache.char_word_map());
1105 uint32 expected_item_count = list_item.item_count();
1106 uint32 actual_item_count = list_item.char_word_map_entry_size();
1107 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1108 return false;
1109 const RepeatedPtrField<CharWordMapEntry>&
1110 entries(list_item.char_word_map_entry());
1111 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1112 entries.begin(); iter != entries.end(); ++iter) {
1113 expected_item_count = iter->item_count();
1114 actual_item_count = iter->word_id_size();
1115 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1116 return false;
1117 base::char16 uni_char = static_cast<base::char16>(iter->char_16());
1118 WordIDSet word_id_set;
1119 const RepeatedField<int32>& word_ids(iter->word_id());
1120 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1121 jiter != word_ids.end(); ++jiter)
1122 word_id_set.insert(*jiter);
1123 char_word_map_[uni_char] = word_id_set;
1125 return true;
1128 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1129 const InMemoryURLIndexCacheItem& cache) {
1130 if (!cache.has_word_id_history_map())
1131 return false;
1132 const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1133 uint32 expected_item_count = list_item.item_count();
1134 uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1135 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1136 return false;
1137 const RepeatedPtrField<WordIDHistoryMapEntry>&
1138 entries(list_item.word_id_history_map_entry());
1139 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1140 entries.begin(); iter != entries.end(); ++iter) {
1141 expected_item_count = iter->item_count();
1142 actual_item_count = iter->history_id_size();
1143 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1144 return false;
1145 WordID word_id = iter->word_id();
1146 HistoryIDSet history_id_set;
1147 const RepeatedField<int64>& history_ids(iter->history_id());
1148 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1149 jiter != history_ids.end(); ++jiter) {
1150 history_id_set.insert(*jiter);
1151 AddToHistoryIDWordMap(*jiter, word_id);
1153 word_id_history_map_[word_id] = history_id_set;
1155 return true;
1158 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1159 const InMemoryURLIndexCacheItem& cache) {
1160 if (!cache.has_history_info_map())
1161 return false;
1162 const HistoryInfoMapItem& list_item(cache.history_info_map());
1163 uint32 expected_item_count = list_item.item_count();
1164 uint32 actual_item_count = list_item.history_info_map_entry_size();
1165 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1166 return false;
1167 const RepeatedPtrField<HistoryInfoMapEntry>&
1168 entries(list_item.history_info_map_entry());
1169 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1170 entries.begin(); iter != entries.end(); ++iter) {
1171 HistoryID history_id = iter->history_id();
1172 GURL url(iter->url());
1173 history::URLRow url_row(url, history_id);
1174 url_row.set_visit_count(iter->visit_count());
1175 url_row.set_typed_count(iter->typed_count());
1176 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1177 if (iter->has_title()) {
1178 base::string16 title(base::UTF8ToUTF16(iter->title()));
1179 url_row.set_title(title);
1181 history_info_map_[history_id].url_row = url_row;
1183 // Restore visits list.
1184 VisitInfoVector visits;
1185 visits.reserve(iter->visits_size());
1186 for (int i = 0; i < iter->visits_size(); ++i) {
1187 visits.push_back(std::make_pair(
1188 base::Time::FromInternalValue(iter->visits(i).visit_time()),
1189 ui::PageTransitionFromInt(iter->visits(i).transition_type())));
1191 history_info_map_[history_id].visits = visits;
1193 return true;
1196 bool URLIndexPrivateData::RestoreWordStartsMap(
1197 const InMemoryURLIndexCacheItem& cache,
1198 const std::string& languages) {
1199 // Note that this function must be called after RestoreHistoryInfoMap() has
1200 // been run as the word starts may have to be recalculated from the urls and
1201 // page titles.
1202 if (cache.has_word_starts_map()) {
1203 const WordStartsMapItem& list_item(cache.word_starts_map());
1204 uint32 expected_item_count = list_item.item_count();
1205 uint32 actual_item_count = list_item.word_starts_map_entry_size();
1206 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1207 return false;
1208 const RepeatedPtrField<WordStartsMapEntry>&
1209 entries(list_item.word_starts_map_entry());
1210 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1211 entries.begin(); iter != entries.end(); ++iter) {
1212 HistoryID history_id = iter->history_id();
1213 RowWordStarts word_starts;
1214 // Restore the URL word starts.
1215 const RepeatedField<int32>& url_starts(iter->url_word_starts());
1216 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1217 jiter != url_starts.end(); ++jiter)
1218 word_starts.url_word_starts_.push_back(*jiter);
1219 // Restore the page title word starts.
1220 const RepeatedField<int32>& title_starts(iter->title_word_starts());
1221 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1222 jiter != title_starts.end(); ++jiter)
1223 word_starts.title_word_starts_.push_back(*jiter);
1224 word_starts_map_[history_id] = word_starts;
1226 } else {
1227 // Since the cache did not contain any word starts we must rebuild then from
1228 // the URL and page titles.
1229 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1230 iter != history_info_map_.end(); ++iter) {
1231 RowWordStarts word_starts;
1232 const history::URLRow& row(iter->second.url_row);
1233 const base::string16& url =
1234 bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
1235 String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1236 const base::string16& title =
1237 bookmarks::CleanUpTitleForMatching(row.title());
1238 String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1239 word_starts_map_[iter->first] = word_starts;
1242 return true;
1245 // static
1246 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1247 const GURL& gurl,
1248 const std::set<std::string>& whitelist) {
1249 return whitelist.find(gurl.scheme()) != whitelist.end();
1253 // SearchTermCacheItem ---------------------------------------------------------
1255 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1256 const WordIDSet& word_id_set,
1257 const HistoryIDSet& history_id_set)
1258 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {
1261 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {
1264 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {
1267 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1269 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1270 bookmarks::BookmarkModel* bookmark_model,
1271 const URLIndexPrivateData& private_data,
1272 const std::string& languages,
1273 const base::string16& lower_string,
1274 const String16Vector& lower_terms,
1275 const base::Time now)
1276 : bookmark_model_(bookmark_model),
1277 private_data_(private_data),
1278 languages_(languages),
1279 lower_string_(lower_string),
1280 lower_terms_(lower_terms),
1281 now_(now) {
1282 // Calculate offsets for each term. For instance, the offset for
1283 // ".net" should be 1, indicating that the actual word-part of the term
1284 // starts at offset 1.
1285 lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
1286 for (size_t i = 0; i < lower_terms_.size(); ++i) {
1287 base::i18n::BreakIterator iter(lower_terms_[i],
1288 base::i18n::BreakIterator::BREAK_WORD);
1289 // If the iterator doesn't work, assume an offset of 0.
1290 if (!iter.Init())
1291 continue;
1292 // Find the first word start.
1293 while (iter.Advance() && !iter.IsWord()) {}
1294 if (iter.IsWord())
1295 lower_terms_to_word_starts_offsets_[i] = iter.prev();
1296 // Else: the iterator didn't find a word break. Assume an offset of 0.
1300 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {
1303 void URLIndexPrivateData::AddHistoryMatch::operator()(
1304 const HistoryID history_id) {
1305 HistoryInfoMap::const_iterator hist_pos =
1306 private_data_.history_info_map_.find(history_id);
1307 if (hist_pos != private_data_.history_info_map_.end()) {
1308 const history::URLRow& hist_item = hist_pos->second.url_row;
1309 const VisitInfoVector& visits = hist_pos->second.visits;
1310 WordStartsMap::const_iterator starts_pos =
1311 private_data_.word_starts_map_.find(history_id);
1312 DCHECK(starts_pos != private_data_.word_starts_map_.end());
1313 ScoredHistoryMatch match(
1314 hist_item, visits, languages_, lower_string_, lower_terms_,
1315 lower_terms_to_word_starts_offsets_, starts_pos->second,
1316 bookmark_model_ && bookmark_model_->IsBookmarked(hist_item.url()),
1317 now_);
1318 if (match.raw_score > 0)
1319 scored_matches_.push_back(match);
1324 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1326 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1327 const HistoryInfoMap& history_info_map)
1328 : history_info_map_(history_info_map) {
1331 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {
1334 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1335 const HistoryID h1,
1336 const HistoryID h2) {
1337 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1338 if (entry1 == history_info_map_.end())
1339 return false;
1340 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1341 if (entry2 == history_info_map_.end())
1342 return true;
1343 const history::URLRow& r1(entry1->second.url_row);
1344 const history::URLRow& r2(entry2->second.url_row);
1345 // First cut: typed count, visit count, recency.
1346 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1347 // recently visited (within the last 12/24 hours) as highly important. Get
1348 // input from mpearson.
1349 if (r1.typed_count() != r2.typed_count())
1350 return (r1.typed_count() > r2.typed_count());
1351 if (r1.visit_count() != r2.visit_count())
1352 return (r1.visit_count() > r2.visit_count());
1353 return (r1.last_visit() > r2.last_visit());