1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_
6 #define COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_
11 #include "base/files/file_path.h"
12 #include "base/gtest_prod_util.h"
13 #include "base/memory/ref_counted.h"
14 #include "components/history/core/browser/history_service.h"
15 #include "components/omnibox/browser/in_memory_url_index_cache.pb.h"
16 #include "components/omnibox/browser/in_memory_url_index_types.h"
17 #include "components/omnibox/browser/scored_history_match.h"
19 class HistoryQuickProviderTest
;
25 namespace in_memory_url_index
{
26 class InMemoryURLIndexCacheItem
;
30 class HistoryDatabase
;
31 class InMemoryURLIndex
;
35 // Current version of the cache file.
36 static const int kCurrentCacheFileVersion
= 5;
38 // A structure private to InMemoryURLIndex describing its internal data and
39 // providing for restoring, rebuilding and updating that internal data. As
40 // this class is for exclusive use by the InMemoryURLIndex class there should
41 // be no calls from any other class.
43 // All public member functions are called on the main thread unless otherwise
45 class URLIndexPrivateData
46 : public base::RefCountedThreadSafe
<URLIndexPrivateData
> {
48 URLIndexPrivateData();
50 // Given a base::string16 in |term_string|, scans the history index and
51 // returns a vector with all scored, matching history items. The
52 // |term_string| is broken down into individual terms (words), each of which
53 // must occur in the candidate history item's URL or page title for the item
54 // to qualify; however, the terms do not necessarily have to be adjacent. We
55 // also allow breaking |term_string| at |cursor_position| (if
56 // set). Once we have a set of candidates, they are filtered to ensure
57 // that all |term_string| terms, as separated by whitespace and the
58 // cursor (if set), occur within the candidate's URL or page title.
59 // Scores are then calculated on no more than |kItemsToScoreLimit|
60 // candidates, as the scoring of such a large number of candidates may
61 // cause perceptible typing response delays in the omnibox. This is
62 // likely to occur for short omnibox terms such as 'h' and 'w' which
63 // will be found in nearly all history candidates. Results are sorted by
64 // descending score. The full results set (i.e. beyond the
65 // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls
66 // to this function. |languages| is used to help parse/format the URLs in the
67 // history index. In total, |max_matches| of items will be returned in the
68 // |ScoredHistoryMatches| vector.
69 ScoredHistoryMatches
HistoryItemsForTerms(
70 base::string16 term_string
,
71 size_t cursor_position
,
73 const std::string
& languages
,
74 bookmarks::BookmarkModel
* bookmark_model
);
76 // Adds the history item in |row| to the index if it does not already already
77 // exist and it meets the minimum 'quick' criteria. If the row already exists
78 // in the index then the index will be updated if the row still meets the
79 // criteria, otherwise the row will be removed from the index. Returns true
80 // if the index was actually updated. |languages| gives a list of language
81 // encodings by which the URLs and page titles are broken down into words and
82 // characters. |scheme_whitelist| is used to filter non-qualifying schemes.
83 // |history_service| is used to schedule an update to the recent visits
84 // component of this URL's entry in the index.
85 bool UpdateURL(history::HistoryService
* history_service
,
86 const history::URLRow
& row
,
87 const std::string
& languages
,
88 const std::set
<std::string
>& scheme_whitelist
,
89 base::CancelableTaskTracker
* tracker
);
91 // Updates the entry for |url_id| in the index, replacing its
92 // recent visits information with |recent_visits|. If |url_id|
93 // is not in the index, does nothing.
94 void UpdateRecentVisits(history::URLID url_id
,
95 const history::VisitVector
& recent_visits
);
97 // Using |history_service| schedules an update (using the historyDB
98 // thread) for the recent visits information for |url_id|. Unless
99 // something unexpectedly goes wrong, UdpateRecentVisits() should
100 // eventually be called from a callback.
101 void ScheduleUpdateRecentVisits(history::HistoryService
* history_service
,
102 history::URLID url_id
,
103 base::CancelableTaskTracker
* tracker
);
105 // Deletes index data for the history item with the given |url|.
106 // The item may not have actually been indexed, which is the case if it did
107 // not previously meet minimum 'quick' criteria. Returns true if the index
108 // was actually updated.
109 bool DeleteURL(const GURL
& url
);
111 // Constructs a new object by restoring its contents from the cache file
112 // at |path|. Returns the new URLIndexPrivateData which on success will
113 // contain the restored data but upon failure will be empty. |languages|
114 // is used to break URLs and page titles into words. This function
115 // should be run on the the file thread.
116 static scoped_refptr
<URLIndexPrivateData
> RestoreFromFile(
117 const base::FilePath
& path
,
118 const std::string
& languages
);
120 // Constructs a new object by rebuilding its contents from the history
121 // database in |history_db|. Returns the new URLIndexPrivateData which on
122 // success will contain the rebuilt data but upon failure will be empty.
123 // |languages| gives a list of language encodings by which the URLs and page
124 // titles are broken down into words and characters.
125 static scoped_refptr
<URLIndexPrivateData
> RebuildFromHistory(
126 history::HistoryDatabase
* history_db
,
127 const std::string
& languages
,
128 const std::set
<std::string
>& scheme_whitelist
);
130 // Writes |private_data| as a cache file to |file_path| and returns success.
131 static bool WritePrivateDataToCacheFileTask(
132 scoped_refptr
<URLIndexPrivateData
> private_data
,
133 const base::FilePath
& file_path
);
135 // Creates a copy of ourself.
136 scoped_refptr
<URLIndexPrivateData
> Duplicate() const;
138 // Returns true if there is no data in the index.
141 // Initializes all index data members in preparation for restoring the index
142 // from the cache or a complete rebuild from the history database.
146 friend class base::RefCountedThreadSafe
<URLIndexPrivateData
>;
147 ~URLIndexPrivateData();
149 friend class AddHistoryMatch
;
150 friend class ::HistoryQuickProviderTest
;
151 friend class InMemoryURLIndexTest
;
152 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, AddHistoryMatch
);
153 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, CacheSaveRestore
);
154 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, HugeResultSet
);
155 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, ReadVisitsFromHistory
);
156 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, RebuildFromHistoryIfCacheOld
);
157 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, Scoring
);
158 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, TitleSearch
);
159 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, TypedCharacterCaching
);
160 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, WhitelistedURLs
);
161 FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest
, Initialization
);
163 // Support caching of term results so that we can optimize searches which
164 // build upon a previous search. Each entry in this map represents one
165 // search term from the most recent search. For example, if the user had
166 // typed "google blog trans" and then typed an additional 'l' (at the end,
167 // of course) then there would be four items in the cache: 'blog', 'google',
168 // 'trans', and 'transl'. All would be marked as being in use except for the
169 // 'trans' item; its cached data would have been used when optimizing the
170 // construction of the search results candidates for 'transl' but then would
173 // Items stored in the search term cache. If a search term exactly matches one
174 // in the cache then we can quickly supply the proper |history_id_set_| (and
175 // marking the cache item as being |used_|. If we find a prefix for a search
176 // term in the cache (which is very likely to occur as the user types each
177 // term into the omnibox) then we can short-circuit the index search for those
178 // characters in the prefix by returning the |word_id_set|. In that case we do
179 // not mark the item as being |used_|.
180 struct SearchTermCacheItem
{
181 SearchTermCacheItem(const WordIDSet
& word_id_set
,
182 const HistoryIDSet
& history_id_set
);
183 // Creates a cache item for a term which has no results.
184 SearchTermCacheItem();
186 ~SearchTermCacheItem();
188 WordIDSet word_id_set_
;
189 HistoryIDSet history_id_set_
;
190 bool used_
; // True if this item has been used for the current term search.
192 typedef std::map
<base::string16
, SearchTermCacheItem
> SearchTermCacheMap
;
194 // A helper class which performs the final filter on each candidate
195 // history URL match, inserting accepted matches into |scored_matches_|.
196 class AddHistoryMatch
: public std::unary_function
<HistoryID
, void> {
198 AddHistoryMatch(bookmarks::BookmarkModel
* bookmark_model
,
199 const URLIndexPrivateData
& private_data
,
200 const std::string
& languages
,
201 const base::string16
& lower_string
,
202 const String16Vector
& lower_terms
,
203 const base::Time now
);
206 void operator()(const HistoryID history_id
);
208 ScoredHistoryMatches
ScoredMatches() const { return scored_matches_
; }
211 friend class InMemoryURLIndexTest
;
212 FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest
, AddHistoryMatch
);
213 bookmarks::BookmarkModel
* bookmark_model_
;
214 const URLIndexPrivateData
& private_data_
;
215 const std::string
& languages_
;
216 ScoredHistoryMatches scored_matches_
;
217 const base::string16
& lower_string_
;
218 const String16Vector
& lower_terms_
;
219 WordStarts lower_terms_to_word_starts_offsets_
;
220 const base::Time now_
;
223 // A helper predicate class used to filter excess history items when the
224 // candidate results set is too large.
225 class HistoryItemFactorGreater
226 : public std::binary_function
<HistoryID
, HistoryID
, void> {
228 explicit HistoryItemFactorGreater(const HistoryInfoMap
& history_info_map
);
229 ~HistoryItemFactorGreater();
231 bool operator()(const HistoryID h1
, const HistoryID h2
);
234 const HistoryInfoMap
& history_info_map_
;
237 // URL History indexing support functions.
239 // Composes a set of history item IDs by intersecting the set for each word
240 // in |unsorted_words|.
241 HistoryIDSet
HistoryIDSetFromWords(const String16Vector
& unsorted_words
);
243 // Helper function to HistoryIDSetFromWords which composes a set of history
244 // ids for the given term given in |term|.
245 HistoryIDSet
HistoryIDsForTerm(const base::string16
& term
);
247 // Given a set of Char16s, finds words containing those characters.
248 WordIDSet
WordIDSetForTermChars(const Char16Set
& term_chars
);
250 // Indexes one URL history item as described by |row|. Returns true if the
251 // row was actually indexed. |languages| gives a list of language encodings by
252 // which the URLs and page titles are broken down into words and characters.
253 // |scheme_whitelist| is used to filter non-qualifying schemes. If
254 // |history_db| is not NULL then this function uses the history database
255 // synchronously to get the URL's recent visits information. This mode should
256 // only be used on the historyDB thread. If |history_db| is NULL, then
257 // this function uses |history_service| to schedule a task on the
258 // historyDB thread to fetch and update the recent visits
260 bool IndexRow(history::HistoryDatabase
* history_db
,
261 history::HistoryService
* history_service
,
262 const history::URLRow
& row
,
263 const std::string
& languages
,
264 const std::set
<std::string
>& scheme_whitelist
,
265 base::CancelableTaskTracker
* tracker
);
267 // Parses and indexes the words in the URL and page title of |row| and
268 // calculate the word starts in each, saving the starts in |word_starts|.
269 // |languages| gives a list of language encodings by which the URLs and page
270 // titles are broken down into words and characters.
271 void AddRowWordsToIndex(const history::URLRow
& row
,
272 RowWordStarts
* word_starts
,
273 const std::string
& languages
);
275 // Given a single word in |uni_word|, adds a reference for the containing
276 // history item identified by |history_id| to the index.
277 void AddWordToIndex(const base::string16
& uni_word
, HistoryID history_id
);
279 // Creates a new entry in the word/history map for |word_id| and add
280 // |history_id| as the initial element of the word's set.
281 void AddWordHistory(const base::string16
& uni_word
, HistoryID history_id
);
283 // Updates an existing entry in the word/history index by adding the
284 // |history_id| to set for |word_id| in the word_id_history_map_.
285 void UpdateWordHistory(WordID word_id
, HistoryID history_id
);
287 // Adds |word_id| to |history_id|'s entry in the history/word map,
288 // creating a new entry if one does not already exist.
289 void AddToHistoryIDWordMap(HistoryID history_id
, WordID word_id
);
291 // Removes |row| and all associated words and characters from the index.
292 void RemoveRowFromIndex(const history::URLRow
& row
);
294 // Removes all words and characters associated with |row| from the index.
295 void RemoveRowWordsFromIndex(const history::URLRow
& row
);
297 // Clears |used_| for each item in the search term cache.
298 void ResetSearchTermCache();
300 // Caches the index private data and writes the cache file to the profile
301 // directory. Called by WritePrivateDataToCacheFileTask.
302 bool SaveToFile(const base::FilePath
& file_path
);
304 // Encode a data structure into the protobuf |cache|.
305 void SavePrivateData(
306 in_memory_url_index::InMemoryURLIndexCacheItem
* cache
) const;
308 in_memory_url_index::InMemoryURLIndexCacheItem
* cache
) const;
309 void SaveWordMap(in_memory_url_index::InMemoryURLIndexCacheItem
* cache
) const;
310 void SaveCharWordMap(
311 in_memory_url_index::InMemoryURLIndexCacheItem
* cache
) const;
312 void SaveWordIDHistoryMap(
313 in_memory_url_index::InMemoryURLIndexCacheItem
* cache
) const;
314 void SaveHistoryInfoMap(
315 in_memory_url_index::InMemoryURLIndexCacheItem
* cache
) const;
316 void SaveWordStartsMap(
317 in_memory_url_index::InMemoryURLIndexCacheItem
* cache
) const;
319 // Decode a data structure from the protobuf |cache|. Return false if there
320 // is any kind of failure. |languages| will be used to break URLs and page
322 bool RestorePrivateData(
323 const in_memory_url_index::InMemoryURLIndexCacheItem
& cache
,
324 const std::string
& languages
);
325 bool RestoreWordList(
326 const in_memory_url_index::InMemoryURLIndexCacheItem
& cache
);
328 const in_memory_url_index::InMemoryURLIndexCacheItem
& cache
);
329 bool RestoreCharWordMap(
330 const in_memory_url_index::InMemoryURLIndexCacheItem
& cache
);
331 bool RestoreWordIDHistoryMap(
332 const in_memory_url_index::InMemoryURLIndexCacheItem
& cache
);
333 bool RestoreHistoryInfoMap(
334 const in_memory_url_index::InMemoryURLIndexCacheItem
& cache
);
335 bool RestoreWordStartsMap(
336 const in_memory_url_index::InMemoryURLIndexCacheItem
& cache
,
337 const std::string
& languages
);
339 // Determines if |gurl| has a whitelisted scheme and returns true if so.
340 static bool URLSchemeIsWhitelisted(const GURL
& gurl
,
341 const std::set
<std::string
>& whitelist
);
343 // Cache of search terms.
344 SearchTermCacheMap search_term_cache_
;
346 // Start of data members that are cached -------------------------------------
348 // The version of the cache file most recently used to restore this instance
349 // of the private data. If the private data was rebuilt from the history
350 // database this will be 0.
351 int restored_cache_version_
;
353 // The last time the data was rebuilt from the history database.
354 base::Time last_time_rebuilt_from_history_
;
356 // A list of all of indexed words. The index of a word in this list is the
357 // ID of the word in the word_map_. It reduces the memory overhead by
358 // replacing a potentially long and repeated string with a simple index.
359 String16Vector word_list_
;
361 // A list of available words slots in |word_list_|. An available word slot
362 // is the index of a unused word in word_list_ vector, also referred to as
363 // a WordID. As URL visits are added or modified new words may be added to
364 // the index, in which case any available words are used, if any, and then
365 // words are added to the end of the word_list_. When URL visits are
366 // modified or deleted old words may be removed from the index, in which
367 // case the slots for those words are added to available_words_ for resuse
368 // by future URL updates.
369 WordIDSet available_words_
;
371 // A one-to-one mapping from the a word string to its slot number (i.e.
372 // WordID) in the |word_list_|.
375 // A one-to-many mapping from a single character to all WordIDs of words
376 // containing that character.
377 CharWordIDMap char_word_map_
;
379 // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as
380 // used in the history database) of history items in which the word occurs.
381 WordIDHistoryMap word_id_history_map_
;
383 // A one-to-many mapping from a HistoryID to all WordIDs of words that occur
384 // in the URL and/or page title of the history item referenced by that
386 HistoryIDWordMap history_id_word_map_
;
388 // A one-to-one mapping from HistoryID to the history item data governing
389 // index inclusion and relevance scoring.
390 HistoryInfoMap history_info_map_
;
392 // A one-to-one mapping from HistoryID to the word starts detected in each
393 // item's URL and page title.
394 WordStartsMap word_starts_map_
;
396 // End of data members that are cached ---------------------------------------
398 // For unit testing only. Specifies the version of the cache file to be saved.
399 // Used only for testing upgrading of an older version of the cache upon
401 int saved_cache_version_
;
403 // Used for unit testing only. Records the number of candidate history items
404 // at three stages in the index searching process.
405 size_t pre_filter_item_count_
; // After word index is queried.
406 size_t post_filter_item_count_
; // After trimming large result set.
407 size_t post_scoring_item_count_
; // After performing final filter/scoring.
410 #endif // COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_