chrome/browser/autocomplete/url_index_private_data.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/browser/autocomplete/url_index_private_data.h"
   6
   7 #include <functional>
   8 #include <iterator>
   9 #include <limits>
  10 #include <numeric>
  11 #include <string>
  12 #include <vector>
  13
  14 #include "base/basictypes.h"
  15 #include "base/files/file_util.h"
  16 #include "base/i18n/break_iterator.h"
  17 #include "base/i18n/case_conversion.h"
  18 #include "base/metrics/histogram.h"
  19 #include "base/strings/string_util.h"
  20 #include "base/strings/utf_string_conversions.h"
  21 #include "base/time/time.h"
  22 #include "chrome/browser/autocomplete/in_memory_url_index.h"
  23 #include "components/bookmarks/browser/bookmark_model.h"
  24 #include "components/bookmarks/browser/bookmark_utils.h"
  25 #include "components/history/core/browser/history_database.h"
  26 #include "components/history/core/browser/history_db_task.h"
  27 #include "components/history/core/browser/history_service.h"
  28 #include "net/base/net_util.h"
  29
  30 #if defined(USE_SYSTEM_PROTOBUF)
  31 #include <google/protobuf/repeated_field.h>
  32 #else
  33 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
  34 #endif
  35
  36 using google::protobuf::RepeatedField;
  37 using google::protobuf::RepeatedPtrField;
  38 using in_memory_url_index::InMemoryURLIndexCacheItem;
  39
  40 namespace {
  41 static const size_t kMaxVisitsToStoreInCache = 10u;
  42 }
  43
  44 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordListItem
  45     WordListItem;
  46 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry
  47     WordMapEntry;
  48 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
  49 typedef in_memory_url_index::InMemoryURLIndexCacheItem_CharWordMapItem
  50     CharWordMapItem;
  51 typedef in_memory_url_index::
  52     InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry CharWordMapEntry;
  53 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
  54     WordIDHistoryMapItem;
  55 typedef in_memory_url_index::
  56     InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
  57         WordIDHistoryMapEntry;
  58 typedef in_memory_url_index::InMemoryURLIndexCacheItem_HistoryInfoMapItem
  59     HistoryInfoMapItem;
  60 typedef in_memory_url_index::
  61     InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
  62         HistoryInfoMapEntry;
  63 typedef in_memory_url_index::
  64     InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
  65         HistoryInfoMapEntry_VisitInfo;
  66 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordStartsMapItem
  67     WordStartsMapItem;
  68 typedef in_memory_url_index::
  69     InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
  70         WordStartsMapEntry;
  71
  72 // Algorithm Functions ---------------------------------------------------------
  73
  74 // Comparison function for sorting search terms by descending length.
  75 bool LengthGreater(const base::string16& string_a,
  76                    const base::string16& string_b) {
  77   return string_a.length() > string_b.length();
  78 }
  79
  80
  81 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
  82
  83 // HistoryDBTask used to update the recent visit data for a particular
  84 // row from the history database.
  85 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {
  86  public:
  87   explicit UpdateRecentVisitsFromHistoryDBTask(
  88       URLIndexPrivateData* private_data,
  89       history::URLID url_id);
  90
  91   bool RunOnDBThread(history::HistoryBackend* backend,
  92                      history::HistoryDatabase* db) override;
  93   void DoneRunOnMainThread() override;
  94
  95  private:
  96   ~UpdateRecentVisitsFromHistoryDBTask() override;
  97
  98   // The URLIndexPrivateData that gets updated after the historyDB
  99   // task returns.
 100   URLIndexPrivateData* private_data_;
 101   // The ID of the URL to get visits for and then update.
 102   history::URLID url_id_;
 103   // Whether fetching the recent visits for the URL succeeded.
 104   bool succeeded_;
 105   // The awaited data that's shown to private_data_ for it to copy and
 106   // store.
 107   history::VisitVector recent_visits_;
 108
 109   DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
 110 };
 111
 112 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
 113     URLIndexPrivateData* private_data,
 114     history::URLID url_id)
 115     : private_data_(private_data), url_id_(url_id), succeeded_(false) {
 116 }
 117
 118 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
 119     history::HistoryBackend* backend,
 120     history::HistoryDatabase* db) {
 121   // Make sure the private data is going to get as many recent visits as
 122   // ScoredHistoryMatch::GetFrequency() hopes to use.
 123   DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
 124   succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
 125                                              kMaxVisitsToStoreInCache,
 126                                              &recent_visits_);
 127   if (!succeeded_)
 128     recent_visits_.clear();
 129   return true;  // Always claim to be done; do not retry failures.
 130 }
 131
 132 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
 133   if (succeeded_)
 134     private_data_->UpdateRecentVisits(url_id_, recent_visits_);
 135 }
 136
 137 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
 138 }
 139
 140
 141 // URLIndexPrivateData ---------------------------------------------------------
 142
 143 URLIndexPrivateData::URLIndexPrivateData()
 144     : restored_cache_version_(0),
 145       saved_cache_version_(kCurrentCacheFileVersion),
 146       pre_filter_item_count_(0),
 147       post_filter_item_count_(0),
 148       post_scoring_item_count_(0) {
 149 }
 150
 151 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
 152     base::string16 search_string,
 153     size_t cursor_position,
 154     size_t max_matches,
 155     const std::string& languages,
 156     bookmarks::BookmarkModel* bookmark_model) {
 157   // If cursor position is set and useful (not at either end of the
 158   // string), allow the search string to be broken at cursor position.
 159   // We do this by pretending there's a space where the cursor is.
 160   if ((cursor_position != base::string16::npos) &&
 161       (cursor_position < search_string.length()) &&
 162       (cursor_position > 0)) {
 163     search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
 164   }
 165   pre_filter_item_count_ = 0;
 166   post_filter_item_count_ = 0;
 167   post_scoring_item_count_ = 0;
 168   // The search string we receive may contain escaped characters. For reducing
 169   // the index we need individual, lower-cased words, ignoring escapings. For
 170   // the final filtering we need whitespace separated substrings possibly
 171   // containing escaped characters.
 172   base::string16 lower_raw_string(base::i18n::ToLower(search_string));
 173   base::string16 lower_unescaped_string =
 174       net::UnescapeURLComponent(lower_raw_string,
 175           net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
 176   // Extract individual 'words' (as opposed to 'terms'; see below) from the
 177   // search string. When the user types "colspec=ID%20Mstone Release" we get
 178   // four 'words': "colspec", "id", "mstone" and "release".
 179   String16Vector lower_words(
 180       String16VectorFromString16(lower_unescaped_string, false, NULL));
 181   ScoredHistoryMatches scored_items;
 182
 183   // Do nothing if we have indexed no words (probably because we've not been
 184   // initialized yet) or the search string has no words.
 185   if (word_list_.empty() || lower_words.empty()) {
 186     search_term_cache_.clear();  // Invalidate the term cache.
 187     return scored_items;
 188   }
 189
 190   // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
 191   // approach.
 192   ResetSearchTermCache();
 193
 194   HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
 195
 196   // Trim the candidate pool if it is large. Note that we do not filter out
 197   // items that do not contain the search terms as proper substrings -- doing
 198   // so is the performance-costly operation we are trying to avoid in order
 199   // to maintain omnibox responsiveness.
 200   const size_t kItemsToScoreLimit = 500;
 201   pre_filter_item_count_ = history_id_set.size();
 202   // If we trim the results set we do not want to cache the results for next
 203   // time as the user's ultimately desired result could easily be eliminated
 204   // in this early rough filter.
 205   bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
 206   if (was_trimmed) {
 207     HistoryIDVector history_ids;
 208     std::copy(history_id_set.begin(), history_id_set.end(),
 209               std::back_inserter(history_ids));
 210     // Trim down the set by sorting by typed-count, visit-count, and last
 211     // visit.
 212     HistoryItemFactorGreater
 213         item_factor_functor(history_info_map_);
 214     std::partial_sort(history_ids.begin(),
 215                       history_ids.begin() + kItemsToScoreLimit,
 216                       history_ids.end(),
 217                       item_factor_functor);
 218     history_id_set.clear();
 219     std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
 220               std::inserter(history_id_set, history_id_set.end()));
 221     post_filter_item_count_ = history_id_set.size();
 222   }
 223
 224   // Pass over all of the candidates filtering out any without a proper
 225   // substring match, inserting those which pass in order by score. Note that
 226   // in this step we are using the raw search string complete with escaped
 227   // URL elements. When the user has specifically typed something akin to
 228   // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
 229   // specific substring appears in the URL or page title.
 230
 231   // We call these 'terms' (as opposed to 'words'; see above) as in this case
 232   // we only want to break up the search string on 'true' whitespace rather than
 233   // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
 234   // get two 'terms': "colspec=id%20mstone" and "release".
 235   String16Vector lower_raw_terms;
 236   if (Tokenize(lower_raw_string, base::kWhitespaceUTF16,
 237                &lower_raw_terms) == 0) {
 238     // Don't score matches when there are no terms to score against.  (It's
 239     // possible that the word break iterater that extracts words to search
 240     // for in the database allows some whitespace "words" whereas Tokenize
 241     // excludes a long list of whitespace.)  One could write a scoring
 242     // function that gives a reasonable order to matches when there
 243     // are no terms (i.e., all the words are some form of whitespace),
 244     // but this is such a rare edge case that it's not worth the time.
 245     return scored_items;
 246   }
 247   scored_items =
 248       std::for_each(
 249           history_id_set.begin(), history_id_set.end(),
 250           AddHistoryMatch(bookmark_model, *this, languages, lower_raw_string,
 251                           lower_raw_terms, base::Time::Now())).ScoredMatches();
 252
 253   // Select and sort only the top |max_matches| results.
 254   if (scored_items.size() > max_matches) {
 255     std::partial_sort(scored_items.begin(),
 256                       scored_items.begin() +
 257                           max_matches,
 258                       scored_items.end(),
 259                       ScoredHistoryMatch::MatchScoreGreater);
 260       scored_items.resize(max_matches);
 261   } else {
 262     std::sort(scored_items.begin(), scored_items.end(),
 263               ScoredHistoryMatch::MatchScoreGreater);
 264   }
 265   post_scoring_item_count_ = scored_items.size();
 266
 267   if (was_trimmed) {
 268     search_term_cache_.clear();  // Invalidate the term cache.
 269   } else {
 270     // Remove any stale SearchTermCacheItems.
 271     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
 272          cache_iter != search_term_cache_.end(); ) {
 273       if (!cache_iter->second.used_)
 274         search_term_cache_.erase(cache_iter++);
 275       else
 276         ++cache_iter;
 277     }
 278   }
 279
 280   return scored_items;
 281 }
 282
 283 bool URLIndexPrivateData::UpdateURL(
 284     history::HistoryService* history_service,
 285     const history::URLRow& row,
 286     const std::string& languages,
 287     const std::set<std::string>& scheme_whitelist,
 288     base::CancelableTaskTracker* tracker) {
 289   // The row may or may not already be in our index. If it is not already
 290   // indexed and it qualifies then it gets indexed. If it is already
 291   // indexed and still qualifies then it gets updated, otherwise it
 292   // is deleted from the index.
 293   bool row_was_updated = false;
 294   history::URLID row_id = row.id();
 295   HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
 296   if (row_pos == history_info_map_.end()) {
 297     // This new row should be indexed if it qualifies.
 298     history::URLRow new_row(row);
 299     new_row.set_id(row_id);
 300     row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
 301                       IndexRow(NULL,
 302                                history_service,
 303                                new_row,
 304                                languages,
 305                                scheme_whitelist,
 306                                tracker);
 307   } else if (RowQualifiesAsSignificant(row, base::Time())) {
 308     // This indexed row still qualifies and will be re-indexed.
 309     // The url won't have changed but the title, visit count, etc.
 310     // might have changed.
 311     history::URLRow& row_to_update = row_pos->second.url_row;
 312     bool title_updated = row_to_update.title() != row.title();
 313     if (row_to_update.visit_count() != row.visit_count() ||
 314         row_to_update.typed_count() != row.typed_count() ||
 315         row_to_update.last_visit() != row.last_visit() || title_updated) {
 316       row_to_update.set_visit_count(row.visit_count());
 317       row_to_update.set_typed_count(row.typed_count());
 318       row_to_update.set_last_visit(row.last_visit());
 319       // If something appears to have changed, update the recent visits
 320       // information.
 321       ScheduleUpdateRecentVisits(history_service, row_id, tracker);
 322       // While the URL is guaranteed to remain stable, the title may have
 323       // changed. If so, then update the index with the changed words.
 324       if (title_updated) {
 325         // Clear all words associated with this row and re-index both the
 326         // URL and title.
 327         RemoveRowWordsFromIndex(row_to_update);
 328         row_to_update.set_title(row.title());
 329         RowWordStarts word_starts;
 330         AddRowWordsToIndex(row_to_update, &word_starts, languages);
 331         word_starts_map_[row_id] = word_starts;
 332       }
 333       row_was_updated = true;
 334     }
 335   } else {
 336     // This indexed row no longer qualifies and will be de-indexed by
 337     // clearing all words associated with this row.
 338     RemoveRowFromIndex(row);
 339     row_was_updated = true;
 340   }
 341   if (row_was_updated)
 342     search_term_cache_.clear();  // This invalidates the cache.
 343   return row_was_updated;
 344 }
 345
 346 void URLIndexPrivateData::UpdateRecentVisits(
 347     history::URLID url_id,
 348     const history::VisitVector& recent_visits) {
 349   HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
 350   if (row_pos != history_info_map_.end()) {
 351     VisitInfoVector* visits = &row_pos->second.visits;
 352     visits->clear();
 353     const size_t size =
 354         std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
 355     visits->reserve(size);
 356     for (size_t i = 0; i < size; i++) {
 357       // Copy from the history::VisitVector the only fields visits needs.
 358       visits->push_back(std::make_pair(recent_visits[i].visit_time,
 359                                        recent_visits[i].transition));
 360     }
 361   }
 362   // Else: Oddly, the URL doesn't seem to exist in the private index.
 363   // Ignore this update.  This can happen if, for instance, the user
 364   // removes the URL from URLIndexPrivateData before the historyDB call
 365   // returns.
 366 }
 367
 368 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
 369     history::HistoryService* history_service,
 370     history::URLID url_id,
 371     base::CancelableTaskTracker* tracker) {
 372   history_service->ScheduleDBTask(
 373       scoped_ptr<history::HistoryDBTask>(
 374           new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
 375 }
 376
 377 // Helper functor for DeleteURL.
 378 class HistoryInfoMapItemHasURL {
 379  public:
 380   explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
 381
 382   bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
 383     return item.second.url_row.url() == url_;
 384   }
 385
 386  private:
 387   const GURL& url_;
 388 };
 389
 390 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
 391   // Find the matching entry in the history_info_map_.
 392   HistoryInfoMap::iterator pos = std::find_if(
 393       history_info_map_.begin(),
 394       history_info_map_.end(),
 395       HistoryInfoMapItemHasURL(url));
 396   if (pos == history_info_map_.end())
 397     return false;
 398   RemoveRowFromIndex(pos->second.url_row);
 399   search_term_cache_.clear();  // This invalidates the cache.
 400   return true;
 401 }
 402
 403 // static
 404 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
 405     const base::FilePath& file_path,
 406     const std::string& languages) {
 407   base::TimeTicks beginning_time = base::TimeTicks::Now();
 408   if (!base::PathExists(file_path))
 409     return NULL;
 410   std::string data;
 411   // If there is no cache file then simply give up. This will cause us to
 412   // attempt to rebuild from the history database.
 413   if (!base::ReadFileToString(file_path, &data))
 414     return NULL;
 415
 416   scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
 417   InMemoryURLIndexCacheItem index_cache;
 418   if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
 419     LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
 420                  << file_path.value();
 421     return restored_data;
 422   }
 423
 424   if (!restored_data->RestorePrivateData(index_cache, languages))
 425     return NULL;
 426
 427   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
 428                       base::TimeTicks::Now() - beginning_time);
 429   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
 430                        restored_data->history_id_word_map_.size());
 431   UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
 432   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
 433                              restored_data->word_map_.size());
 434   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
 435                              restored_data->char_word_map_.size());
 436   if (restored_data->Empty())
 437     return NULL;  // 'No data' is the same as a failed reload.
 438   return restored_data;
 439 }
 440
 441 // static
 442 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
 443     history::HistoryDatabase* history_db,
 444     const std::string& languages,
 445     const std::set<std::string>& scheme_whitelist) {
 446   if (!history_db)
 447     return NULL;
 448
 449   base::TimeTicks beginning_time = base::TimeTicks::Now();
 450
 451   scoped_refptr<URLIndexPrivateData>
 452       rebuilt_data(new URLIndexPrivateData);
 453   history::URLDatabase::URLEnumerator history_enum;
 454   if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
 455     return NULL;
 456   rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
 457   for (history::URLRow row; history_enum.GetNextURL(&row);) {
 458     rebuilt_data->IndexRow(
 459         history_db, NULL, row, languages, scheme_whitelist, NULL);
 460   }
 461
 462   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
 463                       base::TimeTicks::Now() - beginning_time);
 464   UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
 465                        rebuilt_data->history_id_word_map_.size());
 466   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
 467                              rebuilt_data->word_map_.size());
 468   UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
 469                              rebuilt_data->char_word_map_.size());
 470   return rebuilt_data;
 471 }
 472
 473 // static
 474 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
 475     scoped_refptr<URLIndexPrivateData> private_data,
 476     const base::FilePath& file_path) {
 477   DCHECK(private_data.get());
 478   DCHECK(!file_path.empty());
 479   return private_data->SaveToFile(file_path);
 480 }
 481
 482 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
 483   scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
 484   data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
 485   data_copy->word_list_ = word_list_;
 486   data_copy->available_words_ = available_words_;
 487   data_copy->word_map_ = word_map_;
 488   data_copy->char_word_map_ = char_word_map_;
 489   data_copy->word_id_history_map_ = word_id_history_map_;
 490   data_copy->history_id_word_map_ = history_id_word_map_;
 491   data_copy->history_info_map_ = history_info_map_;
 492   data_copy->word_starts_map_ = word_starts_map_;
 493   return data_copy;
 494   // Not copied:
 495   //    search_term_cache_
 496   //    pre_filter_item_count_
 497   //    post_filter_item_count_
 498   //    post_scoring_item_count_
 499 }
 500
 501 bool URLIndexPrivateData::Empty() const {
 502   return history_info_map_.empty();
 503 }
 504
 505 void URLIndexPrivateData::Clear() {
 506   last_time_rebuilt_from_history_ = base::Time();
 507   word_list_.clear();
 508   available_words_.clear();
 509   word_map_.clear();
 510   char_word_map_.clear();
 511   word_id_history_map_.clear();
 512   history_id_word_map_.clear();
 513   history_info_map_.clear();
 514   word_starts_map_.clear();
 515 }
 516
 517 URLIndexPrivateData::~URLIndexPrivateData() {}
 518
 519 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
 520     const String16Vector& unsorted_words) {
 521   // Break the terms down into individual terms (words), get the candidate
 522   // set for each term, and intersect each to get a final candidate list.
 523   // Note that a single 'term' from the user's perspective might be
 524   // a string like "http://www.somewebsite.com" which, from our perspective,
 525   // is four words: 'http', 'www', 'somewebsite', and 'com'.
 526   HistoryIDSet history_id_set;
 527   String16Vector words(unsorted_words);
 528   // Sort the words into the longest first as such are likely to narrow down
 529   // the results quicker. Also, single character words are the most expensive
 530   // to process so save them for last.
 531   std::sort(words.begin(), words.end(), LengthGreater);
 532   for (String16Vector::iterator iter = words.begin(); iter != words.end();
 533        ++iter) {
 534     base::string16 uni_word = *iter;
 535     HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
 536     if (term_history_set.empty()) {
 537       history_id_set.clear();
 538       break;
 539     }
 540     if (iter == words.begin()) {
 541       history_id_set.swap(term_history_set);
 542     } else {
 543       HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
 544           history_id_set, term_history_set);
 545       history_id_set.swap(new_history_id_set);
 546     }
 547   }
 548   return history_id_set;
 549 }
 550
 551 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
 552     const base::string16& term) {
 553   if (term.empty())
 554     return HistoryIDSet();
 555
 556   // TODO(mrossetti): Consider optimizing for very common terms such as
 557   // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
 558   // occuring words in the user's searches.
 559
 560   size_t term_length = term.length();
 561   WordIDSet word_id_set;
 562   if (term_length > 1) {
 563     // See if this term or a prefix thereof is present in the cache.
 564     SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
 565     for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
 566          cache_iter != search_term_cache_.end(); ++cache_iter) {
 567       if (StartsWith(term, cache_iter->first, false) &&
 568           (best_prefix == search_term_cache_.end() ||
 569            cache_iter->first.length() > best_prefix->first.length()))
 570         best_prefix = cache_iter;
 571     }
 572
 573     // If a prefix was found then determine the leftover characters to be used
 574     // for further refining the results from that prefix.
 575     Char16Set prefix_chars;
 576     base::string16 leftovers(term);
 577     if (best_prefix != search_term_cache_.end()) {
 578       // If the prefix is an exact match for the term then grab the cached
 579       // results and we're done.
 580       size_t prefix_length = best_prefix->first.length();
 581       if (prefix_length == term_length) {
 582         best_prefix->second.used_ = true;
 583         return best_prefix->second.history_id_set_;
 584       }
 585
 586       // Otherwise we have a handy starting point.
 587       // If there are no history results for this prefix then we can bail early
 588       // as there will be no history results for the full term.
 589       if (best_prefix->second.history_id_set_.empty()) {
 590         search_term_cache_[term] = SearchTermCacheItem();
 591         return HistoryIDSet();
 592       }
 593       word_id_set = best_prefix->second.word_id_set_;
 594       prefix_chars = Char16SetFromString16(best_prefix->first);
 595       leftovers = term.substr(prefix_length);
 596     }
 597
 598     // Filter for each remaining, unique character in the term.
 599     Char16Set leftover_chars = Char16SetFromString16(leftovers);
 600     Char16Set unique_chars =
 601         base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
 602
 603     // Reduce the word set with any leftover, unprocessed characters.
 604     if (!unique_chars.empty()) {
 605       WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
 606       // We might come up empty on the leftovers.
 607       if (leftover_set.empty()) {
 608         search_term_cache_[term] = SearchTermCacheItem();
 609         return HistoryIDSet();
 610       }
 611       // Or there may not have been a prefix from which to start.
 612       if (prefix_chars.empty()) {
 613         word_id_set.swap(leftover_set);
 614       } else {
 615         WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
 616             word_id_set, leftover_set);
 617         word_id_set.swap(new_word_id_set);
 618       }
 619     }
 620
 621     // We must filter the word list because the resulting word set surely
 622     // contains words which do not have the search term as a proper subset.
 623     for (WordIDSet::iterator word_set_iter = word_id_set.begin();
 624          word_set_iter != word_id_set.end(); ) {
 625       if (word_list_[*word_set_iter].find(term) == base::string16::npos)
 626         word_id_set.erase(word_set_iter++);
 627       else
 628         ++word_set_iter;
 629     }
 630   } else {
 631     word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
 632   }
 633
 634   // If any words resulted then we can compose a set of history IDs by unioning
 635   // the sets from each word.
 636   HistoryIDSet history_id_set;
 637   if (!word_id_set.empty()) {
 638     for (WordIDSet::iterator word_id_iter = word_id_set.begin();
 639          word_id_iter != word_id_set.end(); ++word_id_iter) {
 640       WordID word_id = *word_id_iter;
 641       WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
 642       if (word_iter != word_id_history_map_.end()) {
 643         HistoryIDSet& word_history_id_set(word_iter->second);
 644         history_id_set.insert(word_history_id_set.begin(),
 645                               word_history_id_set.end());
 646       }
 647     }
 648   }
 649
 650   // Record a new cache entry for this word if the term is longer than
 651   // a single character.
 652   if (term_length > 1)
 653     search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
 654
 655   return history_id_set;
 656 }
 657
 658 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
 659     const Char16Set& term_chars) {
 660   WordIDSet word_id_set;
 661   for (Char16Set::const_iterator c_iter = term_chars.begin();
 662        c_iter != term_chars.end(); ++c_iter) {
 663     CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
 664     if (char_iter == char_word_map_.end()) {
 665       // A character was not found so there are no matching results: bail.
 666       word_id_set.clear();
 667       break;
 668     }
 669     WordIDSet& char_word_id_set(char_iter->second);
 670     // It is possible for there to no longer be any words associated with
 671     // a particular character. Give up in that case.
 672     if (char_word_id_set.empty()) {
 673       word_id_set.clear();
 674       break;
 675     }
 676
 677     if (c_iter == term_chars.begin()) {
 678       // First character results becomes base set of results.
 679       word_id_set = char_word_id_set;
 680     } else {
 681       // Subsequent character results get intersected in.
 682       WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
 683           word_id_set, char_word_id_set);
 684       word_id_set.swap(new_word_id_set);
 685     }
 686   }
 687   return word_id_set;
 688 }
 689
 690 bool URLIndexPrivateData::IndexRow(
 691     history::HistoryDatabase* history_db,
 692     history::HistoryService* history_service,
 693     const history::URLRow& row,
 694     const std::string& languages,
 695     const std::set<std::string>& scheme_whitelist,
 696     base::CancelableTaskTracker* tracker) {
 697   const GURL& gurl(row.url());
 698
 699   // Index only URLs with a whitelisted scheme.
 700   if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
 701     return false;
 702
 703   history::URLID row_id = row.id();
 704   // Strip out username and password before saving and indexing.
 705   base::string16 url(net::FormatUrl(gurl, languages,
 706       net::kFormatUrlOmitUsernamePassword,
 707       net::UnescapeRule::NONE,
 708       NULL, NULL, NULL));
 709
 710   HistoryID history_id = static_cast<HistoryID>(row_id);
 711   DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
 712
 713   // Add the row for quick lookup in the history info store.
 714   history::URLRow new_row(GURL(url), row_id);
 715   new_row.set_visit_count(row.visit_count());
 716   new_row.set_typed_count(row.typed_count());
 717   new_row.set_last_visit(row.last_visit());
 718   new_row.set_title(row.title());
 719   history_info_map_[history_id].url_row = new_row;
 720
 721   // Index the words contained in the URL and title of the row.
 722   RowWordStarts word_starts;
 723   AddRowWordsToIndex(new_row, &word_starts, languages);
 724   word_starts_map_[history_id] = word_starts;
 725
 726   // Update the recent visits information or schedule the update
 727   // as appropriate.
 728   if (history_db) {
 729     // We'd like to check that we're on the history DB thread.
 730     // However, unittest code actually calls this on the UI thread.
 731     // So we don't do any thread checks.
 732     history::VisitVector recent_visits;
 733     // Make sure the private data is going to get as many recent visits as
 734     // ScoredHistoryMatch::GetFrequency() hopes to use.
 735     DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
 736     if (history_db->GetMostRecentVisitsForURL(row_id,
 737                                               kMaxVisitsToStoreInCache,
 738                                               &recent_visits))
 739       UpdateRecentVisits(row_id, recent_visits);
 740   } else {
 741     DCHECK(tracker);
 742     DCHECK(history_service);
 743     ScheduleUpdateRecentVisits(history_service, row_id, tracker);
 744   }
 745
 746   return true;
 747 }
 748
 749 void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow& row,
 750                                              RowWordStarts* word_starts,
 751                                              const std::string& languages) {
 752   HistoryID history_id = static_cast<HistoryID>(row.id());
 753   // Split URL into individual, unique words then add in the title words.
 754   const GURL& gurl(row.url());
 755   const base::string16& url =
 756       bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
 757   String16Set url_words = String16SetFromString16(url,
 758       word_starts ? &word_starts->url_word_starts_ : NULL);
 759   const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
 760   String16Set title_words = String16SetFromString16(title,
 761       word_starts ? &word_starts->title_word_starts_ : NULL);
 762   String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
 763   for (String16Set::iterator word_iter = words.begin();
 764        word_iter != words.end(); ++word_iter)
 765     AddWordToIndex(*word_iter, history_id);
 766
 767   search_term_cache_.clear();  // Invalidate the term cache.
 768 }
 769
 770 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
 771                                          HistoryID history_id) {
 772   WordMap::iterator word_pos = word_map_.find(term);
 773   if (word_pos != word_map_.end())
 774     UpdateWordHistory(word_pos->second, history_id);
 775   else
 776     AddWordHistory(term, history_id);
 777 }
 778
 779 void URLIndexPrivateData::AddWordHistory(const base::string16& term,
 780                                          HistoryID history_id) {
 781   WordID word_id = word_list_.size();
 782   if (available_words_.empty()) {
 783     word_list_.push_back(term);
 784   } else {
 785     word_id = *(available_words_.begin());
 786     word_list_[word_id] = term;
 787     available_words_.erase(word_id);
 788   }
 789   word_map_[term] = word_id;
 790
 791   HistoryIDSet history_id_set;
 792   history_id_set.insert(history_id);
 793   word_id_history_map_[word_id] = history_id_set;
 794   AddToHistoryIDWordMap(history_id, word_id);
 795
 796   // For each character in the newly added word (i.e. a word that is not
 797   // already in the word index), add the word to the character index.
 798   Char16Set characters = Char16SetFromString16(term);
 799   for (Char16Set::iterator uni_char_iter = characters.begin();
 800        uni_char_iter != characters.end(); ++uni_char_iter) {
 801     base::char16 uni_char = *uni_char_iter;
 802     CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
 803     if (char_iter != char_word_map_.end()) {
 804       // Update existing entry in the char/word index.
 805       WordIDSet& word_id_set(char_iter->second);
 806       word_id_set.insert(word_id);
 807     } else {
 808       // Create a new entry in the char/word index.
 809       WordIDSet word_id_set;
 810       word_id_set.insert(word_id);
 811       char_word_map_[uni_char] = word_id_set;
 812     }
 813   }
 814 }
 815
 816 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
 817                                             HistoryID history_id) {
 818   WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
 819   DCHECK(history_pos != word_id_history_map_.end());
 820   HistoryIDSet& history_id_set(history_pos->second);
 821   history_id_set.insert(history_id);
 822   AddToHistoryIDWordMap(history_id, word_id);
 823 }
 824
 825 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
 826                                                 WordID word_id) {
 827   HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
 828   if (iter != history_id_word_map_.end()) {
 829     WordIDSet& word_id_set(iter->second);
 830     word_id_set.insert(word_id);
 831   } else {
 832     WordIDSet word_id_set;
 833     word_id_set.insert(word_id);
 834     history_id_word_map_[history_id] = word_id_set;
 835   }
 836 }
 837
 838 void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow& row) {
 839   RemoveRowWordsFromIndex(row);
 840   HistoryID history_id = static_cast<HistoryID>(row.id());
 841   history_info_map_.erase(history_id);
 842   word_starts_map_.erase(history_id);
 843 }
 844
 845 void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow& row) {
 846   // Remove the entries in history_id_word_map_ and word_id_history_map_ for
 847   // this row.
 848   HistoryID history_id = static_cast<HistoryID>(row.id());
 849   WordIDSet word_id_set = history_id_word_map_[history_id];
 850   history_id_word_map_.erase(history_id);
 851
 852   // Reconcile any changes to word usage.
 853   for (WordIDSet::iterator word_id_iter = word_id_set.begin();
 854        word_id_iter != word_id_set.end(); ++word_id_iter) {
 855     WordID word_id = *word_id_iter;
 856     word_id_history_map_[word_id].erase(history_id);
 857     if (!word_id_history_map_[word_id].empty())
 858       continue;  // The word is still in use.
 859
 860     // The word is no longer in use. Reconcile any changes to character usage.
 861     base::string16 word = word_list_[word_id];
 862     Char16Set characters = Char16SetFromString16(word);
 863     for (Char16Set::iterator uni_char_iter = characters.begin();
 864          uni_char_iter != characters.end(); ++uni_char_iter) {
 865       base::char16 uni_char = *uni_char_iter;
 866       char_word_map_[uni_char].erase(word_id);
 867       if (char_word_map_[uni_char].empty())
 868         char_word_map_.erase(uni_char);  // No longer in use.
 869     }
 870
 871     // Complete the removal of references to the word.
 872     word_id_history_map_.erase(word_id);
 873     word_map_.erase(word);
 874     word_list_[word_id] = base::string16();
 875     available_words_.insert(word_id);
 876   }
 877 }
 878
 879 void URLIndexPrivateData::ResetSearchTermCache() {
 880   for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
 881        iter != search_term_cache_.end(); ++iter)
 882     iter->second.used_ = false;
 883 }
 884
 885 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
 886   base::TimeTicks beginning_time = base::TimeTicks::Now();
 887   InMemoryURLIndexCacheItem index_cache;
 888   SavePrivateData(&index_cache);
 889   std::string data;
 890   if (!index_cache.SerializeToString(&data)) {
 891     LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
 892     return false;
 893   }
 894
 895   int size = data.size();
 896   if (base::WriteFile(file_path, data.c_str(), size) != size) {
 897     LOG(WARNING) << "Failed to write " << file_path.value();
 898     return false;
 899   }
 900   UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
 901                       base::TimeTicks::Now() - beginning_time);
 902   return true;
 903 }
 904
 905 void URLIndexPrivateData::SavePrivateData(
 906     InMemoryURLIndexCacheItem* cache) const {
 907   DCHECK(cache);
 908   cache->set_last_rebuild_timestamp(
 909       last_time_rebuilt_from_history_.ToInternalValue());
 910   cache->set_version(saved_cache_version_);
 911   // history_item_count_ is no longer used but rather than change the protobuf
 912   // definition use a placeholder. This will go away with the switch to SQLite.
 913   cache->set_history_item_count(0);
 914   SaveWordList(cache);
 915   SaveWordMap(cache);
 916   SaveCharWordMap(cache);
 917   SaveWordIDHistoryMap(cache);
 918   SaveHistoryInfoMap(cache);
 919   SaveWordStartsMap(cache);
 920 }
 921
 922 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
 923   if (word_list_.empty())
 924     return;
 925   WordListItem* list_item = cache->mutable_word_list();
 926   list_item->set_word_count(word_list_.size());
 927   for (String16Vector::const_iterator iter = word_list_.begin();
 928        iter != word_list_.end(); ++iter)
 929     list_item->add_word(base::UTF16ToUTF8(*iter));
 930 }
 931
 932 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
 933   if (word_map_.empty())
 934     return;
 935   WordMapItem* map_item = cache->mutable_word_map();
 936   map_item->set_item_count(word_map_.size());
 937   for (WordMap::const_iterator iter = word_map_.begin();
 938        iter != word_map_.end(); ++iter) {
 939     WordMapEntry* map_entry = map_item->add_word_map_entry();
 940     map_entry->set_word(base::UTF16ToUTF8(iter->first));
 941     map_entry->set_word_id(iter->second);
 942   }
 943 }
 944
 945 void URLIndexPrivateData::SaveCharWordMap(
 946     InMemoryURLIndexCacheItem* cache) const {
 947   if (char_word_map_.empty())
 948     return;
 949   CharWordMapItem* map_item = cache->mutable_char_word_map();
 950   map_item->set_item_count(char_word_map_.size());
 951   for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
 952        iter != char_word_map_.end(); ++iter) {
 953     CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
 954     map_entry->set_char_16(iter->first);
 955     const WordIDSet& word_id_set(iter->second);
 956     map_entry->set_item_count(word_id_set.size());
 957     for (WordIDSet::const_iterator set_iter = word_id_set.begin();
 958          set_iter != word_id_set.end(); ++set_iter)
 959       map_entry->add_word_id(*set_iter);
 960   }
 961 }
 962
 963 void URLIndexPrivateData::SaveWordIDHistoryMap(
 964     InMemoryURLIndexCacheItem* cache) const {
 965   if (word_id_history_map_.empty())
 966     return;
 967   WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
 968   map_item->set_item_count(word_id_history_map_.size());
 969   for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
 970        iter != word_id_history_map_.end(); ++iter) {
 971     WordIDHistoryMapEntry* map_entry =
 972         map_item->add_word_id_history_map_entry();
 973     map_entry->set_word_id(iter->first);
 974     const HistoryIDSet& history_id_set(iter->second);
 975     map_entry->set_item_count(history_id_set.size());
 976     for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
 977          set_iter != history_id_set.end(); ++set_iter)
 978       map_entry->add_history_id(*set_iter);
 979   }
 980 }
 981
 982 void URLIndexPrivateData::SaveHistoryInfoMap(
 983     InMemoryURLIndexCacheItem* cache) const {
 984   if (history_info_map_.empty())
 985     return;
 986   HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
 987   map_item->set_item_count(history_info_map_.size());
 988   for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
 989        iter != history_info_map_.end(); ++iter) {
 990     HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
 991     map_entry->set_history_id(iter->first);
 992     const history::URLRow& url_row(iter->second.url_row);
 993     // Note: We only save information that contributes to the index so there
 994     // is no need to save search_term_cache_ (not persistent).
 995     map_entry->set_visit_count(url_row.visit_count());
 996     map_entry->set_typed_count(url_row.typed_count());
 997     map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
 998     map_entry->set_url(url_row.url().spec());
 999     map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
1000     const VisitInfoVector& visits(iter->second.visits);
1001     for (VisitInfoVector::const_iterator visit_iter = visits.begin();
1002          visit_iter != visits.end(); ++visit_iter) {
1003       HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1004       visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1005       visit_info->set_transition_type(visit_iter->second);
1006     }
1007   }
1008 }
1009
1010 void URLIndexPrivateData::SaveWordStartsMap(
1011     InMemoryURLIndexCacheItem* cache) const {
1012   if (word_starts_map_.empty())
1013     return;
1014   // For unit testing: Enable saving of the cache as an earlier version to
1015   // allow testing of cache file upgrading in ReadFromFile().
1016   // TODO(mrossetti): Instead of intruding on production code with this kind of
1017   // test harness, save a copy of an older version cache with known results.
1018   // Implement this when switching the caching over to SQLite.
1019   if (saved_cache_version_ < 1)
1020     return;
1021
1022   WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1023   map_item->set_item_count(word_starts_map_.size());
1024   for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1025        iter != word_starts_map_.end(); ++iter) {
1026     WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1027     map_entry->set_history_id(iter->first);
1028     const RowWordStarts& word_starts(iter->second);
1029     for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1030          i != word_starts.url_word_starts_.end(); ++i)
1031       map_entry->add_url_word_starts(*i);
1032     for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1033          i != word_starts.title_word_starts_.end(); ++i)
1034       map_entry->add_title_word_starts(*i);
1035   }
1036 }
1037
1038 bool URLIndexPrivateData::RestorePrivateData(
1039     const InMemoryURLIndexCacheItem& cache,
1040     const std::string& languages) {
1041   last_time_rebuilt_from_history_ =
1042       base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1043   const base::TimeDelta rebuilt_ago =
1044       base::Time::Now() - last_time_rebuilt_from_history_;
1045   if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1046       (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1047     // Cache is more than a week old or, somehow, from some time in the future.
1048     // It's probably a good time to rebuild the index from history to
1049     // allow synced entries to now appear, expired entries to disappear, etc.
1050     // Allow one day in the future to make the cache not rebuild on simple
1051     // system clock changes such as time zone changes.
1052     return false;
1053   }
1054   if (cache.has_version()) {
1055     if (cache.version() < kCurrentCacheFileVersion) {
1056       // Don't try to restore an old format cache file.  (This will cause
1057       // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1058       // from history.)
1059       return false;
1060     }
1061     restored_cache_version_ = cache.version();
1062   }
1063   return RestoreWordList(cache) && RestoreWordMap(cache) &&
1064       RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1065       RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
1066 }
1067
1068 bool URLIndexPrivateData::RestoreWordList(
1069     const InMemoryURLIndexCacheItem& cache) {
1070   if (!cache.has_word_list())
1071     return false;
1072   const WordListItem& list_item(cache.word_list());
1073   uint32 expected_item_count = list_item.word_count();
1074   uint32 actual_item_count = list_item.word_size();
1075   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1076     return false;
1077   const RepeatedPtrField<std::string>& words(list_item.word());
1078   for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1079        iter != words.end(); ++iter)
1080     word_list_.push_back(base::UTF8ToUTF16(*iter));
1081   return true;
1082 }
1083
1084 bool URLIndexPrivateData::RestoreWordMap(
1085     const InMemoryURLIndexCacheItem& cache) {
1086   if (!cache.has_word_map())
1087     return false;
1088   const WordMapItem& list_item(cache.word_map());
1089   uint32 expected_item_count = list_item.item_count();
1090   uint32 actual_item_count = list_item.word_map_entry_size();
1091   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1092     return false;
1093   const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1094   for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1095        iter != entries.end(); ++iter)
1096     word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
1097   return true;
1098 }
1099
1100 bool URLIndexPrivateData::RestoreCharWordMap(
1101     const InMemoryURLIndexCacheItem& cache) {
1102   if (!cache.has_char_word_map())
1103     return false;
1104   const CharWordMapItem& list_item(cache.char_word_map());
1105   uint32 expected_item_count = list_item.item_count();
1106   uint32 actual_item_count = list_item.char_word_map_entry_size();
1107   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1108     return false;
1109   const RepeatedPtrField<CharWordMapEntry>&
1110       entries(list_item.char_word_map_entry());
1111   for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1112        entries.begin(); iter != entries.end(); ++iter) {
1113     expected_item_count = iter->item_count();
1114     actual_item_count = iter->word_id_size();
1115     if (actual_item_count == 0 || actual_item_count != expected_item_count)
1116       return false;
1117     base::char16 uni_char = static_cast<base::char16>(iter->char_16());
1118     WordIDSet word_id_set;
1119     const RepeatedField<int32>& word_ids(iter->word_id());
1120     for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1121          jiter != word_ids.end(); ++jiter)
1122       word_id_set.insert(*jiter);
1123     char_word_map_[uni_char] = word_id_set;
1124   }
1125   return true;
1126 }
1127
1128 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1129     const InMemoryURLIndexCacheItem& cache) {
1130   if (!cache.has_word_id_history_map())
1131     return false;
1132   const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1133   uint32 expected_item_count = list_item.item_count();
1134   uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1135   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1136     return false;
1137   const RepeatedPtrField<WordIDHistoryMapEntry>&
1138       entries(list_item.word_id_history_map_entry());
1139   for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1140        entries.begin(); iter != entries.end(); ++iter) {
1141     expected_item_count = iter->item_count();
1142     actual_item_count = iter->history_id_size();
1143     if (actual_item_count == 0 || actual_item_count != expected_item_count)
1144       return false;
1145     WordID word_id = iter->word_id();
1146     HistoryIDSet history_id_set;
1147     const RepeatedField<int64>& history_ids(iter->history_id());
1148     for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1149          jiter != history_ids.end(); ++jiter) {
1150       history_id_set.insert(*jiter);
1151       AddToHistoryIDWordMap(*jiter, word_id);
1152     }
1153     word_id_history_map_[word_id] = history_id_set;
1154   }
1155   return true;
1156 }
1157
1158 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1159     const InMemoryURLIndexCacheItem& cache) {
1160   if (!cache.has_history_info_map())
1161     return false;
1162   const HistoryInfoMapItem& list_item(cache.history_info_map());
1163   uint32 expected_item_count = list_item.item_count();
1164   uint32 actual_item_count = list_item.history_info_map_entry_size();
1165   if (actual_item_count == 0 || actual_item_count != expected_item_count)
1166     return false;
1167   const RepeatedPtrField<HistoryInfoMapEntry>&
1168       entries(list_item.history_info_map_entry());
1169   for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1170        entries.begin(); iter != entries.end(); ++iter) {
1171     HistoryID history_id = iter->history_id();
1172     GURL url(iter->url());
1173     history::URLRow url_row(url, history_id);
1174     url_row.set_visit_count(iter->visit_count());
1175     url_row.set_typed_count(iter->typed_count());
1176     url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1177     if (iter->has_title()) {
1178       base::string16 title(base::UTF8ToUTF16(iter->title()));
1179       url_row.set_title(title);
1180     }
1181     history_info_map_[history_id].url_row = url_row;
1182
1183     // Restore visits list.
1184     VisitInfoVector visits;
1185     visits.reserve(iter->visits_size());
1186     for (int i = 0; i < iter->visits_size(); ++i) {
1187       visits.push_back(std::make_pair(
1188           base::Time::FromInternalValue(iter->visits(i).visit_time()),
1189           ui::PageTransitionFromInt(iter->visits(i).transition_type())));
1190     }
1191     history_info_map_[history_id].visits = visits;
1192   }
1193   return true;
1194 }
1195
1196 bool URLIndexPrivateData::RestoreWordStartsMap(
1197     const InMemoryURLIndexCacheItem& cache,
1198     const std::string& languages) {
1199   // Note that this function must be called after RestoreHistoryInfoMap() has
1200   // been run as the word starts may have to be recalculated from the urls and
1201   // page titles.
1202   if (cache.has_word_starts_map()) {
1203     const WordStartsMapItem& list_item(cache.word_starts_map());
1204     uint32 expected_item_count = list_item.item_count();
1205     uint32 actual_item_count = list_item.word_starts_map_entry_size();
1206     if (actual_item_count == 0 || actual_item_count != expected_item_count)
1207       return false;
1208     const RepeatedPtrField<WordStartsMapEntry>&
1209         entries(list_item.word_starts_map_entry());
1210     for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1211          entries.begin(); iter != entries.end(); ++iter) {
1212       HistoryID history_id = iter->history_id();
1213       RowWordStarts word_starts;
1214       // Restore the URL word starts.
1215       const RepeatedField<int32>& url_starts(iter->url_word_starts());
1216       for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1217            jiter != url_starts.end(); ++jiter)
1218         word_starts.url_word_starts_.push_back(*jiter);
1219       // Restore the page title word starts.
1220       const RepeatedField<int32>& title_starts(iter->title_word_starts());
1221       for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1222            jiter != title_starts.end(); ++jiter)
1223         word_starts.title_word_starts_.push_back(*jiter);
1224       word_starts_map_[history_id] = word_starts;
1225     }
1226   } else {
1227     // Since the cache did not contain any word starts we must rebuild then from
1228     // the URL and page titles.
1229     for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1230          iter != history_info_map_.end(); ++iter) {
1231       RowWordStarts word_starts;
1232       const history::URLRow& row(iter->second.url_row);
1233       const base::string16& url =
1234           bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
1235       String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1236       const base::string16& title =
1237           bookmarks::CleanUpTitleForMatching(row.title());
1238       String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1239       word_starts_map_[iter->first] = word_starts;
1240     }
1241   }
1242   return true;
1243 }
1244
1245 // static
1246 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1247     const GURL& gurl,
1248     const std::set<std::string>& whitelist) {
1249   return whitelist.find(gurl.scheme()) != whitelist.end();
1250 }
1251
1252
1253 // SearchTermCacheItem ---------------------------------------------------------
1254
1255 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1256     const WordIDSet& word_id_set,
1257     const HistoryIDSet& history_id_set)
1258     : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {
1259 }
1260
1261 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {
1262 }
1263
1264 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {
1265 }
1266
1267 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1268
1269 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1270     bookmarks::BookmarkModel* bookmark_model,
1271     const URLIndexPrivateData& private_data,
1272     const std::string& languages,
1273     const base::string16& lower_string,
1274     const String16Vector& lower_terms,
1275     const base::Time now)
1276     : bookmark_model_(bookmark_model),
1277       private_data_(private_data),
1278       languages_(languages),
1279       lower_string_(lower_string),
1280       lower_terms_(lower_terms),
1281       now_(now) {
1282   // Calculate offsets for each term.  For instance, the offset for
1283   // ".net" should be 1, indicating that the actual word-part of the term
1284   // starts at offset 1.
1285   lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
1286   for (size_t i = 0; i < lower_terms_.size(); ++i) {
1287     base::i18n::BreakIterator iter(lower_terms_[i],
1288                                    base::i18n::BreakIterator::BREAK_WORD);
1289     // If the iterator doesn't work, assume an offset of 0.
1290     if (!iter.Init())
1291       continue;
1292     // Find the first word start.
1293     while (iter.Advance() && !iter.IsWord()) {}
1294     if (iter.IsWord())
1295       lower_terms_to_word_starts_offsets_[i] = iter.prev();
1296     // Else: the iterator didn't find a word break.  Assume an offset of 0.
1297   }
1298 }
1299
1300 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {
1301 }
1302
1303 void URLIndexPrivateData::AddHistoryMatch::operator()(
1304     const HistoryID history_id) {
1305   HistoryInfoMap::const_iterator hist_pos =
1306       private_data_.history_info_map_.find(history_id);
1307   if (hist_pos != private_data_.history_info_map_.end()) {
1308     const history::URLRow& hist_item = hist_pos->second.url_row;
1309     const VisitInfoVector& visits = hist_pos->second.visits;
1310     WordStartsMap::const_iterator starts_pos =
1311         private_data_.word_starts_map_.find(history_id);
1312     DCHECK(starts_pos != private_data_.word_starts_map_.end());
1313     ScoredHistoryMatch match(
1314         hist_item, visits, languages_, lower_string_, lower_terms_,
1315         lower_terms_to_word_starts_offsets_, starts_pos->second,
1316         bookmark_model_ && bookmark_model_->IsBookmarked(hist_item.url()),
1317         now_);
1318     if (match.raw_score > 0)
1319       scored_matches_.push_back(match);
1320   }
1321 }
1322
1323
1324 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1325
1326 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1327     const HistoryInfoMap& history_info_map)
1328     : history_info_map_(history_info_map) {
1329 }
1330
1331 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {
1332 }
1333
1334 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1335     const HistoryID h1,
1336     const HistoryID h2) {
1337   HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1338   if (entry1 == history_info_map_.end())
1339     return false;
1340   HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1341   if (entry2 == history_info_map_.end())
1342     return true;
1343   const history::URLRow& r1(entry1->second.url_row);
1344   const history::URLRow& r2(entry2->second.url_row);
1345   // First cut: typed count, visit count, recency.
1346   // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1347   // recently visited (within the last 12/24 hours) as highly important. Get
1348   // input from mpearson.
1349   if (r1.typed_count() != r2.typed_count())
1350     return (r1.typed_count() > r2.typed_count());
1351   if (r1.visit_count() != r2.visit_count())
1352     return (r1.visit_count() > r2.visit_count());
1353   return (r1.last_visit() > r2.last_visit());
1354 }