components/omnibox/browser/url_index_private_data.cc
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/omnibox/browser/url_index_private_data.h"
7 #include <functional>
8 #include <iterator>
9 #include <limits>
10 #include <numeric>
11 #include <string>
12 #include <vector>
14 #include "base/basictypes.h"
15 #include "base/files/file_util.h"
16 #include "base/i18n/break_iterator.h"
17 #include "base/i18n/case_conversion.h"
18 #include "base/metrics/histogram.h"
19 #include "base/strings/string_split.h"
20 #include "base/strings/string_util.h"
21 #include "base/strings/utf_string_conversions.h"
22 #include "base/time/time.h"
23 #include "components/bookmarks/browser/bookmark_model.h"
24 #include "components/bookmarks/browser/bookmark_utils.h"
25 #include "components/history/core/browser/history_database.h"
26 #include "components/history/core/browser/history_db_task.h"
27 #include "components/history/core/browser/history_service.h"
28 #include "components/omnibox/browser/in_memory_url_index.h"
29 #include "components/url_formatter/url_formatter.h"
31 #if defined(USE_SYSTEM_PROTOBUF)
32 #include <google/protobuf/repeated_field.h>
33 #else
34 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
35 #endif
37 using google::protobuf::RepeatedField;
38 using google::protobuf::RepeatedPtrField;
39 using in_memory_url_index::InMemoryURLIndexCacheItem;
41 namespace {
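// Maximum number of recent visits retained per URL in the in-memory cache;
// kept at least as large as ScoredHistoryMatch::kMaxVisitsToScore (see the
// DCHECKs where visits are fetched below).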
42 static const size_t kMaxVisitsToStoreInCache = 10u;
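// Convenient shorthand for the nested message types generated from the
// InMemoryURLIndexCacheItem protobuf definition.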
45 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordListItem
46 WordListItem;
47 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry
48 WordMapEntry;
49 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
50 typedef in_memory_url_index::InMemoryURLIndexCacheItem_CharWordMapItem
51 CharWordMapItem;
52 typedef in_memory_url_index::
53 InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry CharWordMapEntry;
54 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
55 WordIDHistoryMapItem;
56 typedef in_memory_url_index::
57 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
58 WordIDHistoryMapEntry;
59 typedef in_memory_url_index::InMemoryURLIndexCacheItem_HistoryInfoMapItem
60 HistoryInfoMapItem;
61 typedef in_memory_url_index::
62 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
63 HistoryInfoMapEntry;
64 typedef in_memory_url_index::
65 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
66 HistoryInfoMapEntry_VisitInfo;
67 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordStartsMapItem
68 WordStartsMapItem;
69 typedef in_memory_url_index::
70 InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
71 WordStartsMapEntry;
73 // Algorithm Functions ---------------------------------------------------------
75 // Comparison function for sorting search terms by descending length.
76 bool LengthGreater(const base::string16& string_a,
77 const base::string16& string_b) {
78 return string_a.length() > string_b.length();
82 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
84 // HistoryDBTask used to update the recent visit data for a particular
85 // row from the history database.
86 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {
87 public:
88 explicit UpdateRecentVisitsFromHistoryDBTask(
89 URLIndexPrivateData* private_data,
90 history::URLID url_id);
92 bool RunOnDBThread(history::HistoryBackend* backend,
93 history::HistoryDatabase* db) override;
94 void DoneRunOnMainThread() override;
96 private:
97 ~UpdateRecentVisitsFromHistoryDBTask() override;
99 // The URLIndexPrivateData that gets updated after the historyDB
100 // task returns.
101 URLIndexPrivateData* private_data_;
102 // The ID of the URL to get visits for and then update.
103 history::URLID url_id_;
104 // Whether fetching the recent visits for the URL succeeded.
105 bool succeeded_;
106 // The fetched recent visit data, handed to private_data_ for it to copy
107 // and store.
108 history::VisitVector recent_visits_;
110 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
113 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
114 URLIndexPrivateData* private_data,
115 history::URLID url_id)
116 : private_data_(private_data), url_id_(url_id), succeeded_(false) {
119 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
120 history::HistoryBackend* backend,
121 history::HistoryDatabase* db) {
122 // Make sure the private data is going to get as many recent visits as
123 // ScoredHistoryMatch::GetFrequency() hopes to use.
124 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
125 succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
126 kMaxVisitsToStoreInCache,
127 &recent_visits_);
128 if (!succeeded_)
129 recent_visits_.clear();
130 return true; // Always claim to be done; do not retry failures.
133 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
134 if (succeeded_)
135 private_data_->UpdateRecentVisits(url_id_, recent_visits_);
138 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
142 // URLIndexPrivateData ---------------------------------------------------------
144 URLIndexPrivateData::URLIndexPrivateData()
145 : restored_cache_version_(0),
146 saved_cache_version_(kCurrentCacheFileVersion),
147 pre_filter_item_count_(0),
148 post_filter_item_count_(0),
149 post_scoring_item_count_(0) {
152 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
153 base::string16 search_string,
154 size_t cursor_position,
155 size_t max_matches,
156 const std::string& languages,
157 bookmarks::BookmarkModel* bookmark_model) {
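// Overall flow: break the search string into words, intersect the per-word
// history ID sets, trim the candidate pool if it is large, then score the
// survivors against the raw search terms and keep the top |max_matches|.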
158 // If cursor position is set and useful (not at either end of the
159 // string), allow the search string to be broken at cursor position.
160 // We do this by pretending there's a space where the cursor is.
161 if ((cursor_position != base::string16::npos) &&
162 (cursor_position < search_string.length()) &&
163 (cursor_position > 0)) {
164 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
166 pre_filter_item_count_ = 0;
167 post_filter_item_count_ = 0;
168 post_scoring_item_count_ = 0;
169 // The search string we receive may contain escaped characters. For reducing
170 // the index we need individual, lower-cased words, ignoring escapings. For
171 // the final filtering we need whitespace separated substrings possibly
172 // containing escaped characters.
173 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
174 base::string16 lower_unescaped_string =
175 net::UnescapeURLComponent(lower_raw_string,
176 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
177 // Extract individual 'words' (as opposed to 'terms'; see below) from the
178 // search string. When the user types "colspec=ID%20Mstone Release" we get
179 // four 'words': "colspec", "id", "mstone" and "release".
180 String16Vector lower_words(
181 String16VectorFromString16(lower_unescaped_string, false, NULL));
182 ScoredHistoryMatches scored_items;
184 // Do nothing if we have indexed no words (probably because we've not been
185 // initialized yet) or the search string has no words.
186 if (word_list_.empty() || lower_words.empty()) {
187 search_term_cache_.clear(); // Invalidate the term cache.
188 return scored_items;
191 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
192 // approach.
193 ResetSearchTermCache();
195 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
197 // Trim the candidate pool if it is large. Note that we do not filter out
198 // items that do not contain the search terms as proper substrings -- doing
199 // so is the performance-costly operation we are trying to avoid in order
200 // to maintain omnibox responsiveness.
201 const size_t kItemsToScoreLimit = 500;
202 pre_filter_item_count_ = history_id_set.size();
203 // If we trim the results set we do not want to cache the results for next
204 // time as the user's ultimately desired result could easily be eliminated
205 // in this early rough filter.
206 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
207 if (was_trimmed) {
208 HistoryIDVector history_ids;
209 std::copy(history_id_set.begin(), history_id_set.end(),
210 std::back_inserter(history_ids));
211 // Trim down the set by sorting by typed-count, visit-count, and last
212 // visit.
213 HistoryItemFactorGreater
214 item_factor_functor(history_info_map_);
215 std::partial_sort(history_ids.begin(),
216 history_ids.begin() + kItemsToScoreLimit,
217 history_ids.end(),
218 item_factor_functor);
219 history_id_set.clear();
220 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
221 std::inserter(history_id_set, history_id_set.end()));
222 post_filter_item_count_ = history_id_set.size();
225 // Pass over all of the candidates filtering out any without a proper
226 // substring match, inserting those which pass in order by score. Note that
227 // in this step we are using the raw search string complete with escaped
228 // URL elements. When the user has specifically typed something akin to
229 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
230 // specific substring appears in the URL or page title.
232 // We call these 'terms' (as opposed to 'words'; see above) as in this case
233 // we only want to break up the search string on 'true' whitespace rather than
234 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
235 // get two 'terms': "colspec=id%20mstone" and "release".
236 String16Vector lower_raw_terms = base::SplitString(
237 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,
238 base::SPLIT_WANT_NONEMPTY);
239 if (lower_raw_terms.empty()) {
240 // Don't score matches when there are no terms to score against. (It's
241 // possible that the word break iterator that extracts words to search
242 // for in the database allows some whitespace "words" whereas SplitString
243 // excludes a long list of whitespace.) One could write a scoring
244 // function that gives a reasonable order to matches when there
245 // are no terms (i.e., all the words are some form of whitespace),
246 // but this is such a rare edge case that it's not worth the time.
247 return scored_items;
249 scored_items =
250 std::for_each(
251 history_id_set.begin(), history_id_set.end(),
252 AddHistoryMatch(bookmark_model, *this, languages, lower_raw_string,
253 lower_raw_terms, base::Time::Now())).ScoredMatches();
255 // Select and sort only the top |max_matches| results.
256 if (scored_items.size() > max_matches) {
257 std::partial_sort(scored_items.begin(),
258 scored_items.begin() +
259 max_matches,
260 scored_items.end(),
261 ScoredHistoryMatch::MatchScoreGreater);
262 scored_items.resize(max_matches);
263 } else {
264 std::sort(scored_items.begin(), scored_items.end(),
265 ScoredHistoryMatch::MatchScoreGreater);
267 post_scoring_item_count_ = scored_items.size();
269 if (was_trimmed) {
270 search_term_cache_.clear(); // Invalidate the term cache.
271 } else {
272 // Remove any stale SearchTermCacheItems.
273 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
274 cache_iter != search_term_cache_.end(); ) {
275 if (!cache_iter->second.used_)
276 search_term_cache_.erase(cache_iter++);
277 else
278 ++cache_iter;
282 return scored_items;
285 bool URLIndexPrivateData::UpdateURL(
286 history::HistoryService* history_service,
287 const history::URLRow& row,
288 const std::string& languages,
289 const std::set<std::string>& scheme_whitelist,
290 base::CancelableTaskTracker* tracker) {
291 // The row may or may not already be in our index. If it is not already
292 // indexed and it qualifies then it gets indexed. If it is already
293 // indexed and still qualifies then it gets updated, otherwise it
294 // is deleted from the index.
295 bool row_was_updated = false;
296 history::URLID row_id = row.id();
297 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
298 if (row_pos == history_info_map_.end()) {
299 // This new row should be indexed if it qualifies.
300 history::URLRow new_row(row);
301 new_row.set_id(row_id);
302 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
303 IndexRow(NULL,
304 history_service,
305 new_row,
306 languages,
307 scheme_whitelist,
308 tracker);
309 } else if (RowQualifiesAsSignificant(row, base::Time())) {
310 // This indexed row still qualifies and will be re-indexed.
311 // The url won't have changed but the title, visit count, etc.
312 // might have changed.
313 history::URLRow& row_to_update = row_pos->second.url_row;
314 bool title_updated = row_to_update.title() != row.title();
315 if (row_to_update.visit_count() != row.visit_count() ||
316 row_to_update.typed_count() != row.typed_count() ||
317 row_to_update.last_visit() != row.last_visit() || title_updated) {
318 row_to_update.set_visit_count(row.visit_count());
319 row_to_update.set_typed_count(row.typed_count());
320 row_to_update.set_last_visit(row.last_visit());
321 // If something appears to have changed, update the recent visits
322 // information.
323 ScheduleUpdateRecentVisits(history_service, row_id, tracker);
324 // While the URL is guaranteed to remain stable, the title may have
325 // changed. If so, then update the index with the changed words.
326 if (title_updated) {
327 // Clear all words associated with this row and re-index both the
328 // URL and title.
329 RemoveRowWordsFromIndex(row_to_update);
330 row_to_update.set_title(row.title());
331 RowWordStarts word_starts;
332 AddRowWordsToIndex(row_to_update, &word_starts, languages);
333 word_starts_map_[row_id] = word_starts;
335 row_was_updated = true;
337 } else {
338 // This indexed row no longer qualifies and will be de-indexed by
339 // clearing all words associated with this row.
340 RemoveRowFromIndex(row);
341 row_was_updated = true;
343 if (row_was_updated)
344 search_term_cache_.clear(); // This invalidates the cache.
345 return row_was_updated;
348 void URLIndexPrivateData::UpdateRecentVisits(
349 history::URLID url_id,
350 const history::VisitVector& recent_visits) {
351 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
352 if (row_pos != history_info_map_.end()) {
353 VisitInfoVector* visits = &row_pos->second.visits;
354 visits->clear();
355 const size_t size =
356 std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
357 visits->reserve(size);
358 for (size_t i = 0; i < size; i++) {
359 // Copy from the history::VisitVector only the fields that |visits| needs.
360 visits->push_back(std::make_pair(recent_visits[i].visit_time,
361 recent_visits[i].transition));
364 // Else: Oddly, the URL doesn't seem to exist in the private index.
365 // Ignore this update. This can happen if, for instance, the user
366 // removes the URL from URLIndexPrivateData before the historyDB call
367 // returns.
370 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
371 history::HistoryService* history_service,
372 history::URLID url_id,
373 base::CancelableTaskTracker* tracker) {
374 history_service->ScheduleDBTask(
375 scoped_ptr<history::HistoryDBTask>(
376 new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
379 // Helper functor for DeleteURL.
380 class HistoryInfoMapItemHasURL {
381 public:
382 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
384 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
385 return item.second.url_row.url() == url_;
388 private:
389 const GURL& url_;
392 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
393 // Find the matching entry in the history_info_map_.
394 HistoryInfoMap::iterator pos = std::find_if(
395 history_info_map_.begin(),
396 history_info_map_.end(),
397 HistoryInfoMapItemHasURL(url));
398 if (pos == history_info_map_.end())
399 return false;
400 RemoveRowFromIndex(pos->second.url_row);
401 search_term_cache_.clear(); // This invalidates the cache.
402 return true;
405 // static
406 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
407 const base::FilePath& file_path,
408 const std::string& languages) {
409 base::TimeTicks beginning_time = base::TimeTicks::Now();
410 if (!base::PathExists(file_path))
411 return NULL;
412 std::string data;
413 // If there is no cache file then simply give up. This will cause us to
414 // attempt to rebuild from the history database.
415 if (!base::ReadFileToString(file_path, &data))
416 return NULL;
418 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
419 InMemoryURLIndexCacheItem index_cache;
420 if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
421 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
422 << file_path.value();
423 return restored_data;
426 if (!restored_data->RestorePrivateData(index_cache, languages))
427 return NULL;
429 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
430 base::TimeTicks::Now() - beginning_time);
431 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
432 restored_data->history_id_word_map_.size());
433 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
434 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
435 restored_data->word_map_.size());
436 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
437 restored_data->char_word_map_.size());
438 if (restored_data->Empty())
439 return NULL; // 'No data' is the same as a failed reload.
440 return restored_data;
443 // static
444 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
445 history::HistoryDatabase* history_db,
446 const std::string& languages,
447 const std::set<std::string>& scheme_whitelist) {
448 if (!history_db)
449 return NULL;
451 base::TimeTicks beginning_time = base::TimeTicks::Now();
453 scoped_refptr<URLIndexPrivateData>
454 rebuilt_data(new URLIndexPrivateData);
455 history::URLDatabase::URLEnumerator history_enum;
456 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
457 return NULL;
458 rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
459 for (history::URLRow row; history_enum.GetNextURL(&row);) {
460 rebuilt_data->IndexRow(
461 history_db, NULL, row, languages, scheme_whitelist, NULL);
464 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
465 base::TimeTicks::Now() - beginning_time);
466 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
467 rebuilt_data->history_id_word_map_.size());
468 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
469 rebuilt_data->word_map_.size());
470 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
471 rebuilt_data->char_word_map_.size());
472 return rebuilt_data;
475 // static
476 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
477 scoped_refptr<URLIndexPrivateData> private_data,
478 const base::FilePath& file_path) {
479 DCHECK(private_data.get());
480 DCHECK(!file_path.empty());
481 return private_data->SaveToFile(file_path);
484 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
485 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
486 data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
487 data_copy->word_list_ = word_list_;
488 data_copy->available_words_ = available_words_;
489 data_copy->word_map_ = word_map_;
490 data_copy->char_word_map_ = char_word_map_;
491 data_copy->word_id_history_map_ = word_id_history_map_;
492 data_copy->history_id_word_map_ = history_id_word_map_;
493 data_copy->history_info_map_ = history_info_map_;
494 data_copy->word_starts_map_ = word_starts_map_;
495 return data_copy;
496 // Not copied:
497 // search_term_cache_
498 // pre_filter_item_count_
499 // post_filter_item_count_
500 // post_scoring_item_count_
503 bool URLIndexPrivateData::Empty() const {
504 return history_info_map_.empty();
507 void URLIndexPrivateData::Clear() {
508 last_time_rebuilt_from_history_ = base::Time();
509 word_list_.clear();
510 available_words_.clear();
511 word_map_.clear();
512 char_word_map_.clear();
513 word_id_history_map_.clear();
514 history_id_word_map_.clear();
515 history_info_map_.clear();
516 word_starts_map_.clear();
519 URLIndexPrivateData::~URLIndexPrivateData() {}
521 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
522 const String16Vector& unsorted_words) {
523 // Break the terms down into individual terms (words), get the candidate
524 // set for each term, and intersect each to get a final candidate list.
525 // Note that a single 'term' from the user's perspective might be
526 // a string like "http://www.somewebsite.com" which, from our perspective,
527 // is four words: 'http', 'www', 'somewebsite', and 'com'.
528 HistoryIDSet history_id_set;
529 String16Vector words(unsorted_words);
530 // Sort the words longest first, as longer words are likely to narrow down
531 // the results more quickly. Also, single-character words are the most
532 // expensive to process, so save them for last.
533 std::sort(words.begin(), words.end(), LengthGreater);
534 for (String16Vector::iterator iter = words.begin(); iter != words.end();
535 ++iter) {
536 base::string16 uni_word = *iter;
537 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
538 if (term_history_set.empty()) {
539 history_id_set.clear();
540 break;
542 if (iter == words.begin()) {
543 history_id_set.swap(term_history_set);
544 } else {
545 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
546 history_id_set, term_history_set);
547 history_id_set.swap(new_history_id_set);
550 return history_id_set;
553 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
554 const base::string16& term) {
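// Returns the set of history IDs whose indexed words contain |term| as a
// substring. Results for multi-character terms are cached so that later
// lookups can start from the results of a previously seen prefix.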
555 if (term.empty())
556 return HistoryIDSet();
558 // TODO(mrossetti): Consider optimizing for very common terms such as
559 // 'http[s]', 'www', 'com', etc. Or collect the top 100 most frequently
560 // occurring words in the user's searches.
562 size_t term_length = term.length();
563 WordIDSet word_id_set;
564 if (term_length > 1) {
565 // See if this term or a prefix thereof is present in the cache.
566 base::string16 term_lower = base::i18n::ToLower(term);
567 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
568 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
569 cache_iter != search_term_cache_.end(); ++cache_iter) {
570 if (base::StartsWith(term_lower,
571 base::i18n::ToLower(cache_iter->first),
572 base::CompareCase::SENSITIVE) &&
573 (best_prefix == search_term_cache_.end() ||
574 cache_iter->first.length() > best_prefix->first.length()))
575 best_prefix = cache_iter;
578 // If a prefix was found then determine the leftover characters to be used
579 // for further refining the results from that prefix.
580 Char16Set prefix_chars;
581 base::string16 leftovers(term);
582 if (best_prefix != search_term_cache_.end()) {
583 // If the prefix is an exact match for the term then grab the cached
584 // results and we're done.
585 size_t prefix_length = best_prefix->first.length();
586 if (prefix_length == term_length) {
587 best_prefix->second.used_ = true;
588 return best_prefix->second.history_id_set_;
591 // Otherwise we have a handy starting point.
592 // If there are no history results for this prefix then we can bail early
593 // as there will be no history results for the full term.
594 if (best_prefix->second.history_id_set_.empty()) {
595 search_term_cache_[term] = SearchTermCacheItem();
596 return HistoryIDSet();
598 word_id_set = best_prefix->second.word_id_set_;
599 prefix_chars = Char16SetFromString16(best_prefix->first);
600 leftovers = term.substr(prefix_length);
603 // Filter for each remaining, unique character in the term.
604 Char16Set leftover_chars = Char16SetFromString16(leftovers);
605 Char16Set unique_chars =
606 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
608 // Reduce the word set with any leftover, unprocessed characters.
609 if (!unique_chars.empty()) {
610 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
611 // We might come up empty on the leftovers.
612 if (leftover_set.empty()) {
613 search_term_cache_[term] = SearchTermCacheItem();
614 return HistoryIDSet();
616 // Or there may not have been a prefix from which to start.
617 if (prefix_chars.empty()) {
618 word_id_set.swap(leftover_set);
619 } else {
620 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
621 word_id_set, leftover_set);
622 word_id_set.swap(new_word_id_set);
626 // We must filter the word list because the resulting word set surely
627 // contains words which do not contain the search term as a substring.
628 for (WordIDSet::iterator word_set_iter = word_id_set.begin();
629 word_set_iter != word_id_set.end(); ) {
630 if (word_list_[*word_set_iter].find(term) == base::string16::npos)
631 word_id_set.erase(word_set_iter++);
632 else
633 ++word_set_iter;
635 } else {
636 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
639 // If any words resulted then we can compose a set of history IDs by unioning
640 // the sets from each word.
641 HistoryIDSet history_id_set;
642 if (!word_id_set.empty()) {
643 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
644 word_id_iter != word_id_set.end(); ++word_id_iter) {
645 WordID word_id = *word_id_iter;
646 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
647 if (word_iter != word_id_history_map_.end()) {
648 HistoryIDSet& word_history_id_set(word_iter->second);
649 history_id_set.insert(word_history_id_set.begin(),
650 word_history_id_set.end());
655 // Record a new cache entry for this term if it is longer than
656 // a single character.
657 if (term_length > 1)
658 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
660 return history_id_set;
663 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
664 const Char16Set& term_chars) {
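// Intersect the word-ID sets registered for each character in |term_chars|;
// if any character has no associated words, the result is empty.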
665 WordIDSet word_id_set;
666 for (Char16Set::const_iterator c_iter = term_chars.begin();
667 c_iter != term_chars.end(); ++c_iter) {
668 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
669 if (char_iter == char_word_map_.end()) {
670 // A character was not found so there are no matching results: bail.
671 word_id_set.clear();
672 break;
674 WordIDSet& char_word_id_set(char_iter->second);
675 // It is possible for there to no longer be any words associated with
676 // a particular character. Give up in that case.
677 if (char_word_id_set.empty()) {
678 word_id_set.clear();
679 break;
682 if (c_iter == term_chars.begin()) {
684 // The first character's results become the base set of results.
684 word_id_set = char_word_id_set;
685 } else {
686 // Subsequent character results get intersected in.
687 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
688 word_id_set, char_word_id_set);
689 word_id_set.swap(new_word_id_set);
692 return word_id_set;
695 bool URLIndexPrivateData::IndexRow(
696 history::HistoryDatabase* history_db,
697 history::HistoryService* history_service,
698 const history::URLRow& row,
699 const std::string& languages,
700 const std::set<std::string>& scheme_whitelist,
701 base::CancelableTaskTracker* tracker) {
702 const GURL& gurl(row.url());
704 // Index only URLs with a whitelisted scheme.
705 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
706 return false;
708 history::URLID row_id = row.id();
709 // Strip out username and password before saving and indexing.
710 base::string16 url(url_formatter::FormatUrl(
711 gurl, languages, url_formatter::kFormatUrlOmitUsernamePassword,
712 net::UnescapeRule::NONE, nullptr, nullptr, nullptr));
714 HistoryID history_id = static_cast<HistoryID>(row_id);
715 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
717 // Add the row for quick lookup in the history info store.
718 history::URLRow new_row(GURL(url), row_id);
719 new_row.set_visit_count(row.visit_count());
720 new_row.set_typed_count(row.typed_count());
721 new_row.set_last_visit(row.last_visit());
722 new_row.set_title(row.title());
723 history_info_map_[history_id].url_row = new_row;
725 // Index the words contained in the URL and title of the row.
726 RowWordStarts word_starts;
727 AddRowWordsToIndex(new_row, &word_starts, languages);
728 word_starts_map_[history_id] = word_starts;
730 // Update the recent visits information or schedule the update
731 // as appropriate.
732 if (history_db) {
733 // We'd like to check that we're on the history DB thread.
734 // However, unittest code actually calls this on the UI thread.
735 // So we don't do any thread checks.
736 history::VisitVector recent_visits;
737 // Make sure the private data is going to get as many recent visits as
738 // ScoredHistoryMatch::GetFrequency() hopes to use.
739 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
740 if (history_db->GetMostRecentVisitsForURL(row_id,
741 kMaxVisitsToStoreInCache,
742 &recent_visits))
743 UpdateRecentVisits(row_id, recent_visits);
744 } else {
745 DCHECK(tracker);
746 DCHECK(history_service);
747 ScheduleUpdateRecentVisits(history_service, row_id, tracker);
750 return true;
753 void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow& row,
754 RowWordStarts* word_starts,
755 const std::string& languages) {
756 HistoryID history_id = static_cast<HistoryID>(row.id());
757 // Split URL into individual, unique words then add in the title words.
758 const GURL& gurl(row.url());
759 const base::string16& url =
760 bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
761 String16Set url_words = String16SetFromString16(url,
762 word_starts ? &word_starts->url_word_starts_ : NULL);
763 const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
764 String16Set title_words = String16SetFromString16(title,
765 word_starts ? &word_starts->title_word_starts_ : NULL);
766 String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
767 for (String16Set::iterator word_iter = words.begin();
768 word_iter != words.end(); ++word_iter)
769 AddWordToIndex(*word_iter, history_id);
771 search_term_cache_.clear(); // Invalidate the term cache.
774 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
775 HistoryID history_id) {
776 WordMap::iterator word_pos = word_map_.find(term);
777 if (word_pos != word_map_.end())
778 UpdateWordHistory(word_pos->second, history_id);
779 else
780 AddWordHistory(term, history_id);
783 void URLIndexPrivateData::AddWordHistory(const base::string16& term,
784 HistoryID history_id) {
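// Assign the new word an ID, reusing a slot freed by a previously removed
// word when one is available; otherwise append to word_list_.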
785 WordID word_id = word_list_.size();
786 if (available_words_.empty()) {
787 word_list_.push_back(term);
788 } else {
789 word_id = *(available_words_.begin());
790 word_list_[word_id] = term;
791 available_words_.erase(word_id);
793 word_map_[term] = word_id;
795 HistoryIDSet history_id_set;
796 history_id_set.insert(history_id);
797 word_id_history_map_[word_id] = history_id_set;
798 AddToHistoryIDWordMap(history_id, word_id);
800 // For each character in the newly added word (i.e. a word that is not
801 // already in the word index), add the word to the character index.
802 Char16Set characters = Char16SetFromString16(term);
803 for (Char16Set::iterator uni_char_iter = characters.begin();
804 uni_char_iter != characters.end(); ++uni_char_iter) {
805 base::char16 uni_char = *uni_char_iter;
806 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
807 if (char_iter != char_word_map_.end()) {
808 // Update existing entry in the char/word index.
809 WordIDSet& word_id_set(char_iter->second);
810 word_id_set.insert(word_id);
811 } else {
812 // Create a new entry in the char/word index.
813 WordIDSet word_id_set;
814 word_id_set.insert(word_id);
815 char_word_map_[uni_char] = word_id_set;
820 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
821 HistoryID history_id) {
822 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
823 DCHECK(history_pos != word_id_history_map_.end());
824 HistoryIDSet& history_id_set(history_pos->second);
825 history_id_set.insert(history_id);
826 AddToHistoryIDWordMap(history_id, word_id);
829 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
830 WordID word_id) {
831 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
832 if (iter != history_id_word_map_.end()) {
833 WordIDSet& word_id_set(iter->second);
834 word_id_set.insert(word_id);
835 } else {
836 WordIDSet word_id_set;
837 word_id_set.insert(word_id);
838 history_id_word_map_[history_id] = word_id_set;
842 void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow& row) {
843 RemoveRowWordsFromIndex(row);
844 HistoryID history_id = static_cast<HistoryID>(row.id());
845 history_info_map_.erase(history_id);
846 word_starts_map_.erase(history_id);
849 void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow& row) {
850 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
851 // this row.
852 HistoryID history_id = static_cast<HistoryID>(row.id());
853 WordIDSet word_id_set = history_id_word_map_[history_id];
854 history_id_word_map_.erase(history_id);
856 // Reconcile any changes to word usage.
857 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
858 word_id_iter != word_id_set.end(); ++word_id_iter) {
859 WordID word_id = *word_id_iter;
860 word_id_history_map_[word_id].erase(history_id);
861 if (!word_id_history_map_[word_id].empty())
862 continue; // The word is still in use.
864 // The word is no longer in use. Reconcile any changes to character usage.
865 base::string16 word = word_list_[word_id];
866 Char16Set characters = Char16SetFromString16(word);
867 for (Char16Set::iterator uni_char_iter = characters.begin();
868 uni_char_iter != characters.end(); ++uni_char_iter) {
869 base::char16 uni_char = *uni_char_iter;
870 char_word_map_[uni_char].erase(word_id);
871 if (char_word_map_[uni_char].empty())
872 char_word_map_.erase(uni_char); // No longer in use.
875 // Complete the removal of references to the word.
876 word_id_history_map_.erase(word_id);
877 word_map_.erase(word);
878 word_list_[word_id] = base::string16();
879 available_words_.insert(word_id);
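// Marks every cached search term as unused; entries still unused after the
// current query completes are swept out in HistoryItemsForTerms().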
883 void URLIndexPrivateData::ResetSearchTermCache() {
884 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
885 iter != search_term_cache_.end(); ++iter)
886 iter->second.used_ = false;
889 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
890 base::TimeTicks beginning_time = base::TimeTicks::Now();
891 InMemoryURLIndexCacheItem index_cache;
892 SavePrivateData(&index_cache);
893 std::string data;
894 if (!index_cache.SerializeToString(&data)) {
895 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
896 return false;
899 int size = data.size();
900 if (base::WriteFile(file_path, data.c_str(), size) != size) {
901 LOG(WARNING) << "Failed to write " << file_path.value();
902 return false;
904 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
905 base::TimeTicks::Now() - beginning_time);
906 return true;
909 void URLIndexPrivateData::SavePrivateData(
910 InMemoryURLIndexCacheItem* cache) const {
911 DCHECK(cache);
912 cache->set_last_rebuild_timestamp(
913 last_time_rebuilt_from_history_.ToInternalValue());
914 cache->set_version(saved_cache_version_);
915 // history_item_count_ is no longer used but rather than change the protobuf
916 // definition use a placeholder. This will go away with the switch to SQLite.
917 cache->set_history_item_count(0);
918 SaveWordList(cache);
919 SaveWordMap(cache);
920 SaveCharWordMap(cache);
921 SaveWordIDHistoryMap(cache);
922 SaveHistoryInfoMap(cache);
923 SaveWordStartsMap(cache);
926 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
927 if (word_list_.empty())
928 return;
929 WordListItem* list_item = cache->mutable_word_list();
930 list_item->set_word_count(word_list_.size());
931 for (String16Vector::const_iterator iter = word_list_.begin();
932 iter != word_list_.end(); ++iter)
933 list_item->add_word(base::UTF16ToUTF8(*iter));
936 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
937 if (word_map_.empty())
938 return;
939 WordMapItem* map_item = cache->mutable_word_map();
940 map_item->set_item_count(word_map_.size());
941 for (WordMap::const_iterator iter = word_map_.begin();
942 iter != word_map_.end(); ++iter) {
943 WordMapEntry* map_entry = map_item->add_word_map_entry();
944 map_entry->set_word(base::UTF16ToUTF8(iter->first));
945 map_entry->set_word_id(iter->second);
949 void URLIndexPrivateData::SaveCharWordMap(
950 InMemoryURLIndexCacheItem* cache) const {
951 if (char_word_map_.empty())
952 return;
953 CharWordMapItem* map_item = cache->mutable_char_word_map();
954 map_item->set_item_count(char_word_map_.size());
955 for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
956 iter != char_word_map_.end(); ++iter) {
957 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
958 map_entry->set_char_16(iter->first);
959 const WordIDSet& word_id_set(iter->second);
960 map_entry->set_item_count(word_id_set.size());
961 for (WordIDSet::const_iterator set_iter = word_id_set.begin();
962 set_iter != word_id_set.end(); ++set_iter)
963 map_entry->add_word_id(*set_iter);
967 void URLIndexPrivateData::SaveWordIDHistoryMap(
968 InMemoryURLIndexCacheItem* cache) const {
969 if (word_id_history_map_.empty())
970 return;
971 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
972 map_item->set_item_count(word_id_history_map_.size());
973 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
974 iter != word_id_history_map_.end(); ++iter) {
975 WordIDHistoryMapEntry* map_entry =
976 map_item->add_word_id_history_map_entry();
977 map_entry->set_word_id(iter->first);
978 const HistoryIDSet& history_id_set(iter->second);
979 map_entry->set_item_count(history_id_set.size());
980 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
981 set_iter != history_id_set.end(); ++set_iter)
982 map_entry->add_history_id(*set_iter);
986 void URLIndexPrivateData::SaveHistoryInfoMap(
987 InMemoryURLIndexCacheItem* cache) const {
988 if (history_info_map_.empty())
989 return;
990 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
991 map_item->set_item_count(history_info_map_.size());
992 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
993 iter != history_info_map_.end(); ++iter) {
994 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
995 map_entry->set_history_id(iter->first);
996 const history::URLRow& url_row(iter->second.url_row);
997 // Note: We only save information that contributes to the index so there
998 // is no need to save search_term_cache_ (not persistent).
999 map_entry->set_visit_count(url_row.visit_count());
1000 map_entry->set_typed_count(url_row.typed_count());
1001 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
1002 map_entry->set_url(url_row.url().spec());
1003 map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
1004 const VisitInfoVector& visits(iter->second.visits);
1005 for (VisitInfoVector::const_iterator visit_iter = visits.begin();
1006 visit_iter != visits.end(); ++visit_iter) {
1007 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1008 visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1009 visit_info->set_transition_type(visit_iter->second);
1014 void URLIndexPrivateData::SaveWordStartsMap(
1015 InMemoryURLIndexCacheItem* cache) const {
1016 if (word_starts_map_.empty())
1017 return;
1018 // For unit testing: Enable saving of the cache as an earlier version to
1019 // allow testing of cache file upgrading in ReadFromFile().
1020 // TODO(mrossetti): Instead of intruding on production code with this kind of
1021 // test harness, save a copy of an older version cache with known results.
1022 // Implement this when switching the caching over to SQLite.
1023 if (saved_cache_version_ < 1)
1024 return;
1026 WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1027 map_item->set_item_count(word_starts_map_.size());
1028 for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1029 iter != word_starts_map_.end(); ++iter) {
1030 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1031 map_entry->set_history_id(iter->first);
1032 const RowWordStarts& word_starts(iter->second);
1033 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1034 i != word_starts.url_word_starts_.end(); ++i)
1035 map_entry->add_url_word_starts(*i);
1036 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1037 i != word_starts.title_word_starts_.end(); ++i)
1038 map_entry->add_title_word_starts(*i);
1042 bool URLIndexPrivateData::RestorePrivateData(
1043 const InMemoryURLIndexCacheItem& cache,
1044 const std::string& languages) {
1045 last_time_rebuilt_from_history_ =
1046 base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1047 const base::TimeDelta rebuilt_ago =
1048 base::Time::Now() - last_time_rebuilt_from_history_;
1049 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1050 (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1051 // Cache is more than a week old or, somehow, from some time in the future.
1052 // It's probably a good time to rebuild the index from history to
1053 // allow synced entries to now appear, expired entries to disappear, etc.
1054 // Allow one day in the future to make the cache not rebuild on simple
1055 // system clock changes such as time zone changes.
1056 return false;
1058 if (cache.has_version()) {
1059 if (cache.version() < kCurrentCacheFileVersion) {
1060 // Don't try to restore an old format cache file. (This will cause
1061 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1062 // from history.)
1063 return false;
1065 restored_cache_version_ = cache.version();
1067 return RestoreWordList(cache) && RestoreWordMap(cache) &&
1068 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1069 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
1072 bool URLIndexPrivateData::RestoreWordList(
1073 const InMemoryURLIndexCacheItem& cache) {
1074 if (!cache.has_word_list())
1075 return false;
1076 const WordListItem& list_item(cache.word_list());
1077 uint32 expected_item_count = list_item.word_count();
1078 uint32 actual_item_count = list_item.word_size();
1079 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1080 return false;
1081 const RepeatedPtrField<std::string>& words(list_item.word());
1082 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1083 iter != words.end(); ++iter)
1084 word_list_.push_back(base::UTF8ToUTF16(*iter));
1085 return true;
1088 bool URLIndexPrivateData::RestoreWordMap(
1089 const InMemoryURLIndexCacheItem& cache) {
1090 if (!cache.has_word_map())
1091 return false;
1092 const WordMapItem& list_item(cache.word_map());
1093 uint32 expected_item_count = list_item.item_count();
1094 uint32 actual_item_count = list_item.word_map_entry_size();
1095 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1096 return false;
1097 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1098 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1099 iter != entries.end(); ++iter)
1100 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
1101 return true;
1104 bool URLIndexPrivateData::RestoreCharWordMap(
1105 const InMemoryURLIndexCacheItem& cache) {
1106 if (!cache.has_char_word_map())
1107 return false;
1108 const CharWordMapItem& list_item(cache.char_word_map());
1109 uint32 expected_item_count = list_item.item_count();
1110 uint32 actual_item_count = list_item.char_word_map_entry_size();
1111 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1112 return false;
1113 const RepeatedPtrField<CharWordMapEntry>&
1114 entries(list_item.char_word_map_entry());
1115 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1116 entries.begin(); iter != entries.end(); ++iter) {
1117 expected_item_count = iter->item_count();
1118 actual_item_count = iter->word_id_size();
1119 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1120 return false;
1121 base::char16 uni_char = static_cast<base::char16>(iter->char_16());
1122 WordIDSet word_id_set;
1123 const RepeatedField<int32>& word_ids(iter->word_id());
1124 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1125 jiter != word_ids.end(); ++jiter)
1126 word_id_set.insert(*jiter);
1127 char_word_map_[uni_char] = word_id_set;
1129 return true;
1132 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1133 const InMemoryURLIndexCacheItem& cache) {
1134 if (!cache.has_word_id_history_map())
1135 return false;
1136 const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1137 uint32 expected_item_count = list_item.item_count();
1138 uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1139 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1140 return false;
1141 const RepeatedPtrField<WordIDHistoryMapEntry>&
1142 entries(list_item.word_id_history_map_entry());
1143 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1144 entries.begin(); iter != entries.end(); ++iter) {
1145 expected_item_count = iter->item_count();
1146 actual_item_count = iter->history_id_size();
1147 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1148 return false;
1149 WordID word_id = iter->word_id();
1150 HistoryIDSet history_id_set;
1151 const RepeatedField<int64>& history_ids(iter->history_id());
1152 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1153 jiter != history_ids.end(); ++jiter) {
1154 history_id_set.insert(*jiter);
1155 AddToHistoryIDWordMap(*jiter, word_id);
1157 word_id_history_map_[word_id] = history_id_set;
1159 return true;
1162 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1163 const InMemoryURLIndexCacheItem& cache) {
1164 if (!cache.has_history_info_map())
1165 return false;
1166 const HistoryInfoMapItem& list_item(cache.history_info_map());
1167 uint32 expected_item_count = list_item.item_count();
1168 uint32 actual_item_count = list_item.history_info_map_entry_size();
1169 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1170 return false;
1171 const RepeatedPtrField<HistoryInfoMapEntry>&
1172 entries(list_item.history_info_map_entry());
1173 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1174 entries.begin(); iter != entries.end(); ++iter) {
1175 HistoryID history_id = iter->history_id();
1176 GURL url(iter->url());
1177 history::URLRow url_row(url, history_id);
1178 url_row.set_visit_count(iter->visit_count());
1179 url_row.set_typed_count(iter->typed_count());
1180 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1181 if (iter->has_title()) {
1182 base::string16 title(base::UTF8ToUTF16(iter->title()));
1183 url_row.set_title(title);
1185 history_info_map_[history_id].url_row = url_row;
1187 // Restore visits list.
1188 VisitInfoVector visits;
1189 visits.reserve(iter->visits_size());
1190 for (int i = 0; i < iter->visits_size(); ++i) {
1191 visits.push_back(std::make_pair(
1192 base::Time::FromInternalValue(iter->visits(i).visit_time()),
1193 ui::PageTransitionFromInt(iter->visits(i).transition_type())));
1195 history_info_map_[history_id].visits = visits;
1197 return true;
1200 bool URLIndexPrivateData::RestoreWordStartsMap(
1201 const InMemoryURLIndexCacheItem& cache,
1202 const std::string& languages) {
1203 // Note that this function must be called after RestoreHistoryInfoMap() has
1204 // been run as the word starts may have to be recalculated from the urls and
1205 // page titles.
1206 if (cache.has_word_starts_map()) {
1207 const WordStartsMapItem& list_item(cache.word_starts_map());
1208 uint32 expected_item_count = list_item.item_count();
1209 uint32 actual_item_count = list_item.word_starts_map_entry_size();
1210 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1211 return false;
1212 const RepeatedPtrField<WordStartsMapEntry>&
1213 entries(list_item.word_starts_map_entry());
1214 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1215 entries.begin(); iter != entries.end(); ++iter) {
1216 HistoryID history_id = iter->history_id();
1217 RowWordStarts word_starts;
1218 // Restore the URL word starts.
1219 const RepeatedField<int32>& url_starts(iter->url_word_starts());
1220 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1221 jiter != url_starts.end(); ++jiter)
1222 word_starts.url_word_starts_.push_back(*jiter);
1223 // Restore the page title word starts.
1224 const RepeatedField<int32>& title_starts(iter->title_word_starts());
1225 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1226 jiter != title_starts.end(); ++jiter)
1227 word_starts.title_word_starts_.push_back(*jiter);
1228 word_starts_map_[history_id] = word_starts;
1230 } else {
1231 // Since the cache did not contain any word starts we must rebuild them from
1232 // the URLs and page titles.
1233 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1234 iter != history_info_map_.end(); ++iter) {
1235 RowWordStarts word_starts;
1236 const history::URLRow& row(iter->second.url_row);
1237 const base::string16& url =
1238 bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
1239 String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1240 const base::string16& title =
1241 bookmarks::CleanUpTitleForMatching(row.title());
1242 String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1243 word_starts_map_[iter->first] = word_starts;
1246 return true;
1249 // static
1250 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1251 const GURL& gurl,
1252 const std::set<std::string>& whitelist) {
1253 return whitelist.find(gurl.scheme()) != whitelist.end();
1257 // SearchTermCacheItem ---------------------------------------------------------
1259 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1260 const WordIDSet& word_id_set,
1261 const HistoryIDSet& history_id_set)
1262 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {
1265 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {
1268 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {
1271 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1273 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1274 bookmarks::BookmarkModel* bookmark_model,
1275 const URLIndexPrivateData& private_data,
1276 const std::string& languages,
1277 const base::string16& lower_string,
1278 const String16Vector& lower_terms,
1279 const base::Time now)
1280 : bookmark_model_(bookmark_model),
1281 private_data_(private_data),
1282 languages_(languages),
1283 lower_string_(lower_string),
1284 lower_terms_(lower_terms),
1285 now_(now) {
1286 // Calculate offsets for each term. For instance, the offset for
1287 // ".net" should be 1, indicating that the actual word-part of the term
1288 // starts at offset 1.
1289 lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
1290 for (size_t i = 0; i < lower_terms_.size(); ++i) {
1291 base::i18n::BreakIterator iter(lower_terms_[i],
1292 base::i18n::BreakIterator::BREAK_WORD);
1293 // If the iterator doesn't work, assume an offset of 0.
1294 if (!iter.Init())
1295 continue;
1296 // Find the first word start. If the iterator didn't find a word break, set
1297 // an offset of term size. For example, the offset for "://" should be 3,
1298 // indicating that the word-part is missing.
1299 while (iter.Advance() && !iter.IsWord()) {}
1301 lower_terms_to_word_starts_offsets_[i] = iter.prev();
1305 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {
1308 void URLIndexPrivateData::AddHistoryMatch::operator()(
1309 const HistoryID history_id) {
1310 HistoryInfoMap::const_iterator hist_pos =
1311 private_data_.history_info_map_.find(history_id);
1312 if (hist_pos != private_data_.history_info_map_.end()) {
1313 const history::URLRow& hist_item = hist_pos->second.url_row;
1314 const VisitInfoVector& visits = hist_pos->second.visits;
1315 WordStartsMap::const_iterator starts_pos =
1316 private_data_.word_starts_map_.find(history_id);
1317 DCHECK(starts_pos != private_data_.word_starts_map_.end());
1318 ScoredHistoryMatch match(
1319 hist_item, visits, languages_, lower_string_, lower_terms_,
1320 lower_terms_to_word_starts_offsets_, starts_pos->second,
1321 bookmark_model_ && bookmark_model_->IsBookmarked(hist_item.url()),
1322 now_);
1323 if (match.raw_score > 0)
1324 scored_matches_.push_back(match);
1329 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1331 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1332 const HistoryInfoMap& history_info_map)
1333 : history_info_map_(history_info_map) {
1336 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {
1339 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1340 const HistoryID h1,
1341 const HistoryID h2) {
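// IDs missing from history_info_map_ are treated as ranking below any ID
// that is present.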
1342 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1343 if (entry1 == history_info_map_.end())
1344 return false;
1345 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1346 if (entry2 == history_info_map_.end())
1347 return true;
1348 const history::URLRow& r1(entry1->second.url_row);
1349 const history::URLRow& r2(entry2->second.url_row);
1350 // First cut: typed count, visit count, recency.
1351 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1352 // recently visited (within the last 12/24 hours) as highly important. Get
1353 // input from mpearson.
1354 if (r1.typed_count() != r2.typed_count())
1355 return (r1.typed_count() > r2.typed_count());
1356 if (r1.visit_count() != r2.visit_count())
1357 return (r1.visit_count() > r2.visit_count());
1358 return (r1.last_visit() > r2.last_visit());