components/omnibox/browser/url_index_private_data.h

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_
   6 #define COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_
   7
   8 #include <set>
   9 #include <string>
  10
  11 #include "base/files/file_path.h"
  12 #include "base/gtest_prod_util.h"
  13 #include "base/memory/ref_counted.h"
  14 #include "components/history/core/browser/history_service.h"
  15 #include "components/omnibox/browser/in_memory_url_index_cache.pb.h"
  16 #include "components/omnibox/browser/in_memory_url_index_types.h"
  17 #include "components/omnibox/browser/scored_history_match.h"
  18
  19 class HistoryQuickProviderTest;
  20
  21 namespace bookmarks {
  22 class BookmarkModel;
  23 }
  24
  25 namespace in_memory_url_index {
  26 class InMemoryURLIndexCacheItem;
  27 }
  28
  29 namespace history {
  30 class HistoryDatabase;
  31 class InMemoryURLIndex;
  32 class RefCountedBool;
  33 }
  34
  35 // Current version of the cache file.
  36 static const int kCurrentCacheFileVersion = 5;
  37
  38 // A structure private to InMemoryURLIndex describing its internal data and
  39 // providing for restoring, rebuilding and updating that internal data. As
  40 // this class is for exclusive use by the InMemoryURLIndex class there should
  41 // be no calls from any other class.
  42 //
  43 // All public member functions are called on the main thread unless otherwise
  44 // annotated.
  45 class URLIndexPrivateData
  46     : public base::RefCountedThreadSafe<URLIndexPrivateData> {
  47  public:
  48   URLIndexPrivateData();
  49
  50   // Given a base::string16 in |term_string|, scans the history index and
  51   // returns a vector with all scored, matching history items. The
  52   // |term_string| is broken down into individual terms (words), each of which
  53   // must occur in the candidate history item's URL or page title for the item
  54   // to qualify; however, the terms do not necessarily have to be adjacent. We
  55   // also allow breaking |term_string| at |cursor_position| (if
  56   // set). Once we have a set of candidates, they are filtered to ensure
  57   // that all |term_string| terms, as separated by whitespace and the
  58   // cursor (if set), occur within the candidate's URL or page title.
  59   // Scores are then calculated on no more than |kItemsToScoreLimit|
  60   // candidates, as the scoring of such a large number of candidates may
  61   // cause perceptible typing response delays in the omnibox. This is
  62   // likely to occur for short omnibox terms such as 'h' and 'w' which
  63   // will be found in nearly all history candidates. Results are sorted by
  64   // descending score. The full results set (i.e. beyond the
  65   // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls
  66   // to this function. |languages| is used to help parse/format the URLs in the
  67   // history index.  In total, |max_matches| of items will be returned in the
  68   // |ScoredHistoryMatches| vector.
  69   ScoredHistoryMatches HistoryItemsForTerms(
  70       base::string16 term_string,
  71       size_t cursor_position,
  72       size_t max_matches,
  73       const std::string& languages,
  74       bookmarks::BookmarkModel* bookmark_model);
  75
  76   // Adds the history item in |row| to the index if it does not already already
  77   // exist and it meets the minimum 'quick' criteria. If the row already exists
  78   // in the index then the index will be updated if the row still meets the
  79   // criteria, otherwise the row will be removed from the index. Returns true
  80   // if the index was actually updated. |languages| gives a list of language
  81   // encodings by which the URLs and page titles are broken down into words and
  82   // characters. |scheme_whitelist| is used to filter non-qualifying schemes.
  83   // |history_service| is used to schedule an update to the recent visits
  84   // component of this URL's entry in the index.
  85   bool UpdateURL(history::HistoryService* history_service,
  86                  const history::URLRow& row,
  87                  const std::string& languages,
  88                  const std::set<std::string>& scheme_whitelist,
  89                  base::CancelableTaskTracker* tracker);
  90
  91   // Updates the entry for |url_id| in the index, replacing its
  92   // recent visits information with |recent_visits|.  If |url_id|
  93   // is not in the index, does nothing.
  94   void UpdateRecentVisits(history::URLID url_id,
  95                           const history::VisitVector& recent_visits);
  96
  97   // Using |history_service| schedules an update (using the historyDB
  98   // thread) for the recent visits information for |url_id|.  Unless
  99   // something unexpectedly goes wrong, UdpateRecentVisits() should
 100   // eventually be called from a callback.
 101   void ScheduleUpdateRecentVisits(history::HistoryService* history_service,
 102                                   history::URLID url_id,
 103                                   base::CancelableTaskTracker* tracker);
 104
 105   // Deletes index data for the history item with the given |url|.
 106   // The item may not have actually been indexed, which is the case if it did
 107   // not previously meet minimum 'quick' criteria. Returns true if the index
 108   // was actually updated.
 109   bool DeleteURL(const GURL& url);
 110
 111   // Constructs a new object by restoring its contents from the cache file
 112   // at |path|. Returns the new URLIndexPrivateData which on success will
 113   // contain the restored data but upon failure will be empty.  |languages|
 114   // is used to break URLs and page titles into words.  This function
 115   // should be run on the the file thread.
 116   static scoped_refptr<URLIndexPrivateData> RestoreFromFile(
 117       const base::FilePath& path,
 118       const std::string& languages);
 119
 120   // Constructs a new object by rebuilding its contents from the history
 121   // database in |history_db|. Returns the new URLIndexPrivateData which on
 122   // success will contain the rebuilt data but upon failure will be empty.
 123   // |languages| gives a list of language encodings by which the URLs and page
 124   // titles are broken down into words and characters.
 125   static scoped_refptr<URLIndexPrivateData> RebuildFromHistory(
 126       history::HistoryDatabase* history_db,
 127       const std::string& languages,
 128       const std::set<std::string>& scheme_whitelist);
 129
 130   // Writes |private_data| as a cache file to |file_path| and returns success.
 131   static bool WritePrivateDataToCacheFileTask(
 132       scoped_refptr<URLIndexPrivateData> private_data,
 133       const base::FilePath& file_path);
 134
 135   // Creates a copy of ourself.
 136   scoped_refptr<URLIndexPrivateData> Duplicate() const;
 137
 138   // Returns true if there is no data in the index.
 139   bool Empty() const;
 140
 141   // Initializes all index data members in preparation for restoring the index
 142   // from the cache or a complete rebuild from the history database.
 143   void Clear();
 144
 145  private:
 146   friend class base::RefCountedThreadSafe<URLIndexPrivateData>;
 147   ~URLIndexPrivateData();
 148
 149   friend class AddHistoryMatch;
 150   friend class ::HistoryQuickProviderTest;
 151   friend class InMemoryURLIndexTest;
 152   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, AddHistoryMatch);
 153   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore);
 154   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet);
 155   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory);
 156   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld);
 157   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring);
 158   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch);
 159   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching);
 160   FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs);
 161   FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization);
 162
 163   // Support caching of term results so that we can optimize searches which
 164   // build upon a previous search. Each entry in this map represents one
 165   // search term from the most recent search. For example, if the user had
 166   // typed "google blog trans" and then typed an additional 'l' (at the end,
 167   // of course) then there would be four items in the cache: 'blog', 'google',
 168   // 'trans', and 'transl'. All would be marked as being in use except for the
 169   // 'trans' item; its cached data would have been used when optimizing the
 170   // construction of the search results candidates for 'transl' but then would
 171   // no longer needed.
 172   //
 173   // Items stored in the search term cache. If a search term exactly matches one
 174   // in the cache then we can quickly supply the proper |history_id_set_| (and
 175   // marking the cache item as being |used_|. If we find a prefix for a search
 176   // term in the cache (which is very likely to occur as the user types each
 177   // term into the omnibox) then we can short-circuit the index search for those
 178   // characters in the prefix by returning the |word_id_set|. In that case we do
 179   // not mark the item as being |used_|.
 180   struct SearchTermCacheItem {
 181     SearchTermCacheItem(const WordIDSet& word_id_set,
 182                         const HistoryIDSet& history_id_set);
 183     // Creates a cache item for a term which has no results.
 184     SearchTermCacheItem();
 185
 186     ~SearchTermCacheItem();
 187
 188     WordIDSet word_id_set_;
 189     HistoryIDSet history_id_set_;
 190     bool used_;  // True if this item has been used for the current term search.
 191   };
 192   typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap;
 193
 194   // A helper class which performs the final filter on each candidate
 195   // history URL match, inserting accepted matches into |scored_matches_|.
 196   class AddHistoryMatch : public std::unary_function<HistoryID, void> {
 197    public:
 198     AddHistoryMatch(bookmarks::BookmarkModel* bookmark_model,
 199                     const URLIndexPrivateData& private_data,
 200                     const std::string& languages,
 201                     const base::string16& lower_string,
 202                     const String16Vector& lower_terms,
 203                     const base::Time now);
 204     ~AddHistoryMatch();
 205
 206     void operator()(const HistoryID history_id);
 207
 208     ScoredHistoryMatches ScoredMatches() const { return scored_matches_; }
 209
 210    private:
 211     friend class InMemoryURLIndexTest;
 212     FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, AddHistoryMatch);
 213     bookmarks::BookmarkModel* bookmark_model_;
 214     const URLIndexPrivateData& private_data_;
 215     const std::string& languages_;
 216     ScoredHistoryMatches scored_matches_;
 217     const base::string16& lower_string_;
 218     const String16Vector& lower_terms_;
 219     WordStarts lower_terms_to_word_starts_offsets_;
 220     const base::Time now_;
 221   };
 222
 223   // A helper predicate class used to filter excess history items when the
 224   // candidate results set is too large.
 225   class HistoryItemFactorGreater
 226       : public std::binary_function<HistoryID, HistoryID, void> {
 227    public:
 228     explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map);
 229     ~HistoryItemFactorGreater();
 230
 231     bool operator()(const HistoryID h1, const HistoryID h2);
 232
 233    private:
 234     const HistoryInfoMap& history_info_map_;
 235   };
 236
 237   // URL History indexing support functions.
 238
 239   // Composes a set of history item IDs by intersecting the set for each word
 240   // in |unsorted_words|.
 241   HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words);
 242
 243   // Helper function to HistoryIDSetFromWords which composes a set of history
 244   // ids for the given term given in |term|.
 245   HistoryIDSet HistoryIDsForTerm(const base::string16& term);
 246
 247   // Given a set of Char16s, finds words containing those characters.
 248   WordIDSet WordIDSetForTermChars(const Char16Set& term_chars);
 249
 250   // Indexes one URL history item as described by |row|. Returns true if the
 251   // row was actually indexed. |languages| gives a list of language encodings by
 252   // which the URLs and page titles are broken down into words and characters.
 253   // |scheme_whitelist| is used to filter non-qualifying schemes.  If
 254   // |history_db| is not NULL then this function uses the history database
 255   // synchronously to get the URL's recent visits information.  This mode should
 256   // only be used on the historyDB thread.  If |history_db| is NULL, then
 257   // this function uses |history_service| to schedule a task on the
 258   // historyDB thread to fetch and update the recent visits
 259   // information.
 260   bool IndexRow(history::HistoryDatabase* history_db,
 261                 history::HistoryService* history_service,
 262                 const history::URLRow& row,
 263                 const std::string& languages,
 264                 const std::set<std::string>& scheme_whitelist,
 265                 base::CancelableTaskTracker* tracker);
 266
 267   // Parses and indexes the words in the URL and page title of |row| and
 268   // calculate the word starts in each, saving the starts in |word_starts|.
 269   // |languages| gives a list of language encodings by which the URLs and page
 270   // titles are broken down into words and characters.
 271   void AddRowWordsToIndex(const history::URLRow& row,
 272                           RowWordStarts* word_starts,
 273                           const std::string& languages);
 274
 275   // Given a single word in |uni_word|, adds a reference for the containing
 276   // history item identified by |history_id| to the index.
 277   void AddWordToIndex(const base::string16& uni_word, HistoryID history_id);
 278
 279   // Creates a new entry in the word/history map for |word_id| and add
 280   // |history_id| as the initial element of the word's set.
 281   void AddWordHistory(const base::string16& uni_word, HistoryID history_id);
 282
 283   // Updates an existing entry in the word/history index by adding the
 284   // |history_id| to set for |word_id| in the word_id_history_map_.
 285   void UpdateWordHistory(WordID word_id, HistoryID history_id);
 286
 287   // Adds |word_id| to |history_id|'s entry in the history/word map,
 288   // creating a new entry if one does not already exist.
 289   void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id);
 290
 291   // Removes |row| and all associated words and characters from the index.
 292   void RemoveRowFromIndex(const history::URLRow& row);
 293
 294   // Removes all words and characters associated with |row| from the index.
 295   void RemoveRowWordsFromIndex(const history::URLRow& row);
 296
 297   // Clears |used_| for each item in the search term cache.
 298   void ResetSearchTermCache();
 299
 300   // Caches the index private data and writes the cache file to the profile
 301   // directory.  Called by WritePrivateDataToCacheFileTask.
 302   bool SaveToFile(const base::FilePath& file_path);
 303
 304   // Encode a data structure into the protobuf |cache|.
 305   void SavePrivateData(
 306       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
 307   void SaveWordList(
 308       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
 309   void SaveWordMap(in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
 310   void SaveCharWordMap(
 311       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
 312   void SaveWordIDHistoryMap(
 313       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
 314   void SaveHistoryInfoMap(
 315       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
 316   void SaveWordStartsMap(
 317       in_memory_url_index::InMemoryURLIndexCacheItem* cache) const;
 318
 319   // Decode a data structure from the protobuf |cache|. Return false if there
 320   // is any kind of failure. |languages| will be used to break URLs and page
 321   // titles into words
 322   bool RestorePrivateData(
 323       const in_memory_url_index::InMemoryURLIndexCacheItem& cache,
 324       const std::string& languages);
 325   bool RestoreWordList(
 326       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
 327   bool RestoreWordMap(
 328       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
 329   bool RestoreCharWordMap(
 330       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
 331   bool RestoreWordIDHistoryMap(
 332       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
 333   bool RestoreHistoryInfoMap(
 334       const in_memory_url_index::InMemoryURLIndexCacheItem& cache);
 335   bool RestoreWordStartsMap(
 336       const in_memory_url_index::InMemoryURLIndexCacheItem& cache,
 337       const std::string& languages);
 338
 339   // Determines if |gurl| has a whitelisted scheme and returns true if so.
 340   static bool URLSchemeIsWhitelisted(const GURL& gurl,
 341                                      const std::set<std::string>& whitelist);
 342
 343   // Cache of search terms.
 344   SearchTermCacheMap search_term_cache_;
 345
 346   // Start of data members that are cached -------------------------------------
 347
 348   // The version of the cache file most recently used to restore this instance
 349   // of the private data. If the private data was rebuilt from the history
 350   // database this will be 0.
 351   int restored_cache_version_;
 352
 353   // The last time the data was rebuilt from the history database.
 354   base::Time last_time_rebuilt_from_history_;
 355
 356   // A list of all of indexed words. The index of a word in this list is the
 357   // ID of the word in the word_map_. It reduces the memory overhead by
 358   // replacing a potentially long and repeated string with a simple index.
 359   String16Vector word_list_;
 360
 361   // A list of available words slots in |word_list_|. An available word slot
 362   // is the index of a unused word in word_list_ vector, also referred to as
 363   // a WordID. As URL visits are added or modified new words may be added to
 364   // the index, in which case any available words are used, if any, and then
 365   // words are added to the end of the word_list_. When URL visits are
 366   // modified or deleted old words may be removed from the index, in which
 367   // case the slots for those words are added to available_words_ for resuse
 368   // by future URL updates.
 369   WordIDSet available_words_;
 370
 371   // A one-to-one mapping from the a word string to its slot number (i.e.
 372   // WordID) in the |word_list_|.
 373   WordMap word_map_;
 374
 375   // A one-to-many mapping from a single character to all WordIDs of words
 376   // containing that character.
 377   CharWordIDMap char_word_map_;
 378
 379   // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as
 380   // used in the history database) of history items in which the word occurs.
 381   WordIDHistoryMap word_id_history_map_;
 382
 383   // A one-to-many mapping from a HistoryID to all WordIDs of words that occur
 384   // in the URL and/or page title of the history item referenced by that
 385   // HistoryID.
 386   HistoryIDWordMap history_id_word_map_;
 387
 388   // A one-to-one mapping from HistoryID to the history item data governing
 389   // index inclusion and relevance scoring.
 390   HistoryInfoMap history_info_map_;
 391
 392   // A one-to-one mapping from HistoryID to the word starts detected in each
 393   // item's URL and page title.
 394   WordStartsMap word_starts_map_;
 395
 396   // End of data members that are cached ---------------------------------------
 397
 398   // For unit testing only. Specifies the version of the cache file to be saved.
 399   // Used only for testing upgrading of an older version of the cache upon
 400   // restore.
 401   int saved_cache_version_;
 402
 403   // Used for unit testing only. Records the number of candidate history items
 404   // at three stages in the index searching process.
 405   size_t pre_filter_item_count_;    // After word index is queried.
 406   size_t post_filter_item_count_;   // After trimming large result set.
 407   size_t post_scoring_item_count_;  // After performing final filter/scoring.
 408 };
 409
 410 #endif  // COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_