Roll src/third_party/WebKit 3aea697:d9c6159 (svn 201973:201974)
[chromium-blink-merge.git] / components / omnibox / browser / url_index_private_data.cc
bloba468d2b6624e5258da96bf2c31ff165de849f909
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/omnibox/browser/url_index_private_data.h"
7 #include <functional>
8 #include <iterator>
9 #include <limits>
10 #include <numeric>
11 #include <string>
12 #include <vector>
14 #include "base/basictypes.h"
15 #include "base/files/file_util.h"
16 #include "base/i18n/break_iterator.h"
17 #include "base/i18n/case_conversion.h"
18 #include "base/metrics/histogram.h"
19 #include "base/strings/string_split.h"
20 #include "base/strings/string_util.h"
21 #include "base/strings/utf_string_conversions.h"
22 #include "base/time/time.h"
23 #include "components/bookmarks/browser/bookmark_model.h"
24 #include "components/bookmarks/browser/bookmark_utils.h"
25 #include "components/history/core/browser/history_database.h"
26 #include "components/history/core/browser/history_db_task.h"
27 #include "components/history/core/browser/history_service.h"
28 #include "components/omnibox/browser/in_memory_url_index.h"
29 #include "components/url_formatter/url_formatter.h"
31 #if defined(USE_SYSTEM_PROTOBUF)
32 #include <google/protobuf/repeated_field.h>
33 #else
34 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
35 #endif
37 using google::protobuf::RepeatedField;
38 using google::protobuf::RepeatedPtrField;
39 using in_memory_url_index::InMemoryURLIndexCacheItem;
41 namespace {
42 static const size_t kMaxVisitsToStoreInCache = 10u;
45 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordListItem
46 WordListItem;
47 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry
48 WordMapEntry;
49 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem WordMapItem;
50 typedef in_memory_url_index::InMemoryURLIndexCacheItem_CharWordMapItem
51 CharWordMapItem;
52 typedef in_memory_url_index::
53 InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry CharWordMapEntry;
54 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
55 WordIDHistoryMapItem;
56 typedef in_memory_url_index::
57 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
58 WordIDHistoryMapEntry;
59 typedef in_memory_url_index::InMemoryURLIndexCacheItem_HistoryInfoMapItem
60 HistoryInfoMapItem;
61 typedef in_memory_url_index::
62 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
63 HistoryInfoMapEntry;
64 typedef in_memory_url_index::
65 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
66 HistoryInfoMapEntry_VisitInfo;
67 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordStartsMapItem
68 WordStartsMapItem;
69 typedef in_memory_url_index::
70 InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
71 WordStartsMapEntry;
73 // Algorithm Functions ---------------------------------------------------------
75 // Comparison function for sorting search terms by descending length.
76 bool LengthGreater(const base::string16& string_a,
77 const base::string16& string_b) {
78 return string_a.length() > string_b.length();
82 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
84 // HistoryDBTask used to update the recent visit data for a particular
85 // row from the history database.
86 class UpdateRecentVisitsFromHistoryDBTask : public history::HistoryDBTask {
87 public:
88 explicit UpdateRecentVisitsFromHistoryDBTask(
89 URLIndexPrivateData* private_data,
90 history::URLID url_id);
92 bool RunOnDBThread(history::HistoryBackend* backend,
93 history::HistoryDatabase* db) override;
94 void DoneRunOnMainThread() override;
96 private:
97 ~UpdateRecentVisitsFromHistoryDBTask() override;
99 // The URLIndexPrivateData that gets updated after the historyDB
100 // task returns.
101 URLIndexPrivateData* private_data_;
102 // The ID of the URL to get visits for and then update.
103 history::URLID url_id_;
104 // Whether fetching the recent visits for the URL succeeded.
105 bool succeeded_;
106 // The awaited data that's shown to private_data_ for it to copy and
107 // store.
108 history::VisitVector recent_visits_;
110 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask);
113 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
114 URLIndexPrivateData* private_data,
115 history::URLID url_id)
116 : private_data_(private_data), url_id_(url_id), succeeded_(false) {
119 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
120 history::HistoryBackend* backend,
121 history::HistoryDatabase* db) {
122 // Make sure the private data is going to get as many recent visits as
123 // ScoredHistoryMatch::GetFrequency() hopes to use.
124 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
125 succeeded_ = db->GetMostRecentVisitsForURL(url_id_,
126 kMaxVisitsToStoreInCache,
127 &recent_visits_);
128 if (!succeeded_)
129 recent_visits_.clear();
130 return true; // Always claim to be done; do not retry failures.
133 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
134 if (succeeded_)
135 private_data_->UpdateRecentVisits(url_id_, recent_visits_);
138 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
142 // URLIndexPrivateData ---------------------------------------------------------
144 URLIndexPrivateData::URLIndexPrivateData()
145 : restored_cache_version_(0),
146 saved_cache_version_(kCurrentCacheFileVersion),
147 pre_filter_item_count_(0),
148 post_filter_item_count_(0),
149 post_scoring_item_count_(0) {
152 ScoredHistoryMatches URLIndexPrivateData::HistoryItemsForTerms(
153 base::string16 search_string,
154 size_t cursor_position,
155 size_t max_matches,
156 const std::string& languages,
157 bookmarks::BookmarkModel* bookmark_model) {
158 // If cursor position is set and useful (not at either end of the
159 // string), allow the search string to be broken at cursor position.
160 // We do this by pretending there's a space where the cursor is.
161 if ((cursor_position != base::string16::npos) &&
162 (cursor_position < search_string.length()) &&
163 (cursor_position > 0)) {
164 search_string.insert(cursor_position, base::ASCIIToUTF16(" "));
166 pre_filter_item_count_ = 0;
167 post_filter_item_count_ = 0;
168 post_scoring_item_count_ = 0;
169 // The search string we receive may contain escaped characters. For reducing
170 // the index we need individual, lower-cased words, ignoring escapings. For
171 // the final filtering we need whitespace separated substrings possibly
172 // containing escaped characters.
173 base::string16 lower_raw_string(base::i18n::ToLower(search_string));
174 base::string16 lower_unescaped_string =
175 net::UnescapeURLComponent(lower_raw_string,
176 net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
177 // Extract individual 'words' (as opposed to 'terms'; see below) from the
178 // search string. When the user types "colspec=ID%20Mstone Release" we get
179 // four 'words': "colspec", "id", "mstone" and "release".
180 String16Vector lower_words(
181 String16VectorFromString16(lower_unescaped_string, false, NULL));
182 ScoredHistoryMatches scored_items;
184 // Do nothing if we have indexed no words (probably because we've not been
185 // initialized yet) or the search string has no words.
186 if (word_list_.empty() || lower_words.empty()) {
187 search_term_cache_.clear(); // Invalidate the term cache.
188 return scored_items;
191 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
192 // approach.
193 ResetSearchTermCache();
195 HistoryIDSet history_id_set = HistoryIDSetFromWords(lower_words);
197 // Trim the candidate pool if it is large. Note that we do not filter out
198 // items that do not contain the search terms as proper substrings -- doing
199 // so is the performance-costly operation we are trying to avoid in order
200 // to maintain omnibox responsiveness.
201 const size_t kItemsToScoreLimit = 500;
202 pre_filter_item_count_ = history_id_set.size();
203 // If we trim the results set we do not want to cache the results for next
204 // time as the user's ultimately desired result could easily be eliminated
205 // in this early rough filter.
206 bool was_trimmed = (pre_filter_item_count_ > kItemsToScoreLimit);
207 if (was_trimmed) {
208 HistoryIDVector history_ids;
209 std::copy(history_id_set.begin(), history_id_set.end(),
210 std::back_inserter(history_ids));
211 // Trim down the set by sorting by typed-count, visit-count, and last
212 // visit.
213 HistoryItemFactorGreater
214 item_factor_functor(history_info_map_);
215 std::partial_sort(history_ids.begin(),
216 history_ids.begin() + kItemsToScoreLimit,
217 history_ids.end(),
218 item_factor_functor);
219 history_id_set.clear();
220 std::copy(history_ids.begin(), history_ids.begin() + kItemsToScoreLimit,
221 std::inserter(history_id_set, history_id_set.end()));
222 post_filter_item_count_ = history_id_set.size();
225 // Pass over all of the candidates filtering out any without a proper
226 // substring match, inserting those which pass in order by score. Note that
227 // in this step we are using the raw search string complete with escaped
228 // URL elements. When the user has specifically typed something akin to
229 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
230 // specific substring appears in the URL or page title.
232 // We call these 'terms' (as opposed to 'words'; see above) as in this case
233 // we only want to break up the search string on 'true' whitespace rather than
234 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
235 // get two 'terms': "colspec=id%20mstone" and "release".
236 String16Vector lower_raw_terms = base::SplitString(
237 lower_raw_string, base::kWhitespaceUTF16, base::KEEP_WHITESPACE,
238 base::SPLIT_WANT_NONEMPTY);
239 if (lower_raw_terms.empty()) {
240 // Don't score matches when there are no terms to score against. (It's
241 // possible that the word break iterater that extracts words to search
242 // for in the database allows some whitespace "words" whereas SplitString
243 // excludes a long list of whitespace.) One could write a scoring
244 // function that gives a reasonable order to matches when there
245 // are no terms (i.e., all the words are some form of whitespace),
246 // but this is such a rare edge case that it's not worth the time.
247 return scored_items;
249 scored_items =
250 std::for_each(
251 history_id_set.begin(), history_id_set.end(),
252 AddHistoryMatch(bookmark_model, *this, languages, lower_raw_string,
253 lower_raw_terms, base::Time::Now())).ScoredMatches();
255 // Select and sort only the top |max_matches| results.
256 if (scored_items.size() > max_matches) {
257 std::partial_sort(scored_items.begin(),
258 scored_items.begin() +
259 max_matches,
260 scored_items.end(),
261 ScoredHistoryMatch::MatchScoreGreater);
262 scored_items.resize(max_matches);
263 } else {
264 std::sort(scored_items.begin(), scored_items.end(),
265 ScoredHistoryMatch::MatchScoreGreater);
267 post_scoring_item_count_ = scored_items.size();
269 if (was_trimmed) {
270 search_term_cache_.clear(); // Invalidate the term cache.
271 } else {
272 // Remove any stale SearchTermCacheItems.
273 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
274 cache_iter != search_term_cache_.end(); ) {
275 if (!cache_iter->second.used_)
276 search_term_cache_.erase(cache_iter++);
277 else
278 ++cache_iter;
282 return scored_items;
285 bool URLIndexPrivateData::UpdateURL(
286 history::HistoryService* history_service,
287 const history::URLRow& row,
288 const std::string& languages,
289 const std::set<std::string>& scheme_whitelist,
290 base::CancelableTaskTracker* tracker) {
291 // The row may or may not already be in our index. If it is not already
292 // indexed and it qualifies then it gets indexed. If it is already
293 // indexed and still qualifies then it gets updated, otherwise it
294 // is deleted from the index.
295 bool row_was_updated = false;
296 history::URLID row_id = row.id();
297 HistoryInfoMap::iterator row_pos = history_info_map_.find(row_id);
298 if (row_pos == history_info_map_.end()) {
299 // This new row should be indexed if it qualifies.
300 history::URLRow new_row(row);
301 new_row.set_id(row_id);
302 row_was_updated = RowQualifiesAsSignificant(new_row, base::Time()) &&
303 IndexRow(NULL,
304 history_service,
305 new_row,
306 languages,
307 scheme_whitelist,
308 tracker);
309 } else if (RowQualifiesAsSignificant(row, base::Time())) {
310 // This indexed row still qualifies and will be re-indexed.
311 // The url won't have changed but the title, visit count, etc.
312 // might have changed.
313 history::URLRow& row_to_update = row_pos->second.url_row;
314 bool title_updated = row_to_update.title() != row.title();
315 if (row_to_update.visit_count() != row.visit_count() ||
316 row_to_update.typed_count() != row.typed_count() ||
317 row_to_update.last_visit() != row.last_visit() || title_updated) {
318 row_to_update.set_visit_count(row.visit_count());
319 row_to_update.set_typed_count(row.typed_count());
320 row_to_update.set_last_visit(row.last_visit());
321 // If something appears to have changed, update the recent visits
322 // information.
323 ScheduleUpdateRecentVisits(history_service, row_id, tracker);
324 // While the URL is guaranteed to remain stable, the title may have
325 // changed. If so, then update the index with the changed words.
326 if (title_updated) {
327 // Clear all words associated with this row and re-index both the
328 // URL and title.
329 RemoveRowWordsFromIndex(row_to_update);
330 row_to_update.set_title(row.title());
331 RowWordStarts word_starts;
332 AddRowWordsToIndex(row_to_update, &word_starts, languages);
333 word_starts_map_[row_id] = word_starts;
335 row_was_updated = true;
337 } else {
338 // This indexed row no longer qualifies and will be de-indexed by
339 // clearing all words associated with this row.
340 RemoveRowFromIndex(row);
341 row_was_updated = true;
343 if (row_was_updated)
344 search_term_cache_.clear(); // This invalidates the cache.
345 return row_was_updated;
348 void URLIndexPrivateData::UpdateRecentVisits(
349 history::URLID url_id,
350 const history::VisitVector& recent_visits) {
351 HistoryInfoMap::iterator row_pos = history_info_map_.find(url_id);
352 if (row_pos != history_info_map_.end()) {
353 VisitInfoVector* visits = &row_pos->second.visits;
354 visits->clear();
355 const size_t size =
356 std::min(recent_visits.size(), kMaxVisitsToStoreInCache);
357 visits->reserve(size);
358 for (size_t i = 0; i < size; i++) {
359 // Copy from the history::VisitVector the only fields visits needs.
360 visits->push_back(std::make_pair(recent_visits[i].visit_time,
361 recent_visits[i].transition));
364 // Else: Oddly, the URL doesn't seem to exist in the private index.
365 // Ignore this update. This can happen if, for instance, the user
366 // removes the URL from URLIndexPrivateData before the historyDB call
367 // returns.
370 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
371 history::HistoryService* history_service,
372 history::URLID url_id,
373 base::CancelableTaskTracker* tracker) {
374 history_service->ScheduleDBTask(
375 scoped_ptr<history::HistoryDBTask>(
376 new UpdateRecentVisitsFromHistoryDBTask(this, url_id)), tracker);
379 // Helper functor for DeleteURL.
380 class HistoryInfoMapItemHasURL {
381 public:
382 explicit HistoryInfoMapItemHasURL(const GURL& url): url_(url) {}
384 bool operator()(const std::pair<const HistoryID, HistoryInfoMapValue>& item) {
385 return item.second.url_row.url() == url_;
388 private:
389 const GURL& url_;
392 bool URLIndexPrivateData::DeleteURL(const GURL& url) {
393 // Find the matching entry in the history_info_map_.
394 HistoryInfoMap::iterator pos = std::find_if(
395 history_info_map_.begin(),
396 history_info_map_.end(),
397 HistoryInfoMapItemHasURL(url));
398 if (pos == history_info_map_.end())
399 return false;
400 RemoveRowFromIndex(pos->second.url_row);
401 search_term_cache_.clear(); // This invalidates the cache.
402 return true;
405 // static
406 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RestoreFromFile(
407 const base::FilePath& file_path,
408 const std::string& languages) {
409 base::TimeTicks beginning_time = base::TimeTicks::Now();
410 if (!base::PathExists(file_path))
411 return NULL;
412 std::string data;
413 // If there is no cache file then simply give up. This will cause us to
414 // attempt to rebuild from the history database.
415 if (!base::ReadFileToString(file_path, &data))
416 return NULL;
418 scoped_refptr<URLIndexPrivateData> restored_data(new URLIndexPrivateData);
419 InMemoryURLIndexCacheItem index_cache;
420 if (!index_cache.ParseFromArray(data.c_str(), data.size())) {
421 LOG(WARNING) << "Failed to parse URLIndexPrivateData cache data read from "
422 << file_path.value();
423 return restored_data;
426 if (!restored_data->RestorePrivateData(index_cache, languages))
427 return NULL;
429 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
430 base::TimeTicks::Now() - beginning_time);
431 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
432 restored_data->history_id_word_map_.size());
433 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data.size());
434 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
435 restored_data->word_map_.size());
436 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
437 restored_data->char_word_map_.size());
438 if (restored_data->Empty())
439 return NULL; // 'No data' is the same as a failed reload.
440 return restored_data;
443 // static
444 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::RebuildFromHistory(
445 history::HistoryDatabase* history_db,
446 const std::string& languages,
447 const std::set<std::string>& scheme_whitelist) {
448 if (!history_db)
449 return NULL;
451 base::TimeTicks beginning_time = base::TimeTicks::Now();
453 scoped_refptr<URLIndexPrivateData>
454 rebuilt_data(new URLIndexPrivateData);
455 history::URLDatabase::URLEnumerator history_enum;
456 if (!history_db->InitURLEnumeratorForSignificant(&history_enum))
457 return NULL;
458 rebuilt_data->last_time_rebuilt_from_history_ = base::Time::Now();
459 for (history::URLRow row; history_enum.GetNextURL(&row);) {
460 rebuilt_data->IndexRow(
461 history_db, NULL, row, languages, scheme_whitelist, NULL);
464 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
465 base::TimeTicks::Now() - beginning_time);
466 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
467 rebuilt_data->history_id_word_map_.size());
468 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
469 rebuilt_data->word_map_.size());
470 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
471 rebuilt_data->char_word_map_.size());
472 return rebuilt_data;
475 // static
476 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
477 scoped_refptr<URLIndexPrivateData> private_data,
478 const base::FilePath& file_path) {
479 DCHECK(private_data.get());
480 DCHECK(!file_path.empty());
481 return private_data->SaveToFile(file_path);
484 scoped_refptr<URLIndexPrivateData> URLIndexPrivateData::Duplicate() const {
485 scoped_refptr<URLIndexPrivateData> data_copy = new URLIndexPrivateData;
486 data_copy->last_time_rebuilt_from_history_ = last_time_rebuilt_from_history_;
487 data_copy->word_list_ = word_list_;
488 data_copy->available_words_ = available_words_;
489 data_copy->word_map_ = word_map_;
490 data_copy->char_word_map_ = char_word_map_;
491 data_copy->word_id_history_map_ = word_id_history_map_;
492 data_copy->history_id_word_map_ = history_id_word_map_;
493 data_copy->history_info_map_ = history_info_map_;
494 data_copy->word_starts_map_ = word_starts_map_;
495 return data_copy;
496 // Not copied:
497 // search_term_cache_
498 // pre_filter_item_count_
499 // post_filter_item_count_
500 // post_scoring_item_count_
503 bool URLIndexPrivateData::Empty() const {
504 return history_info_map_.empty();
507 void URLIndexPrivateData::Clear() {
508 last_time_rebuilt_from_history_ = base::Time();
509 word_list_.clear();
510 available_words_.clear();
511 word_map_.clear();
512 char_word_map_.clear();
513 word_id_history_map_.clear();
514 history_id_word_map_.clear();
515 history_info_map_.clear();
516 word_starts_map_.clear();
519 URLIndexPrivateData::~URLIndexPrivateData() {}
521 HistoryIDSet URLIndexPrivateData::HistoryIDSetFromWords(
522 const String16Vector& unsorted_words) {
523 // Break the terms down into individual terms (words), get the candidate
524 // set for each term, and intersect each to get a final candidate list.
525 // Note that a single 'term' from the user's perspective might be
526 // a string like "http://www.somewebsite.com" which, from our perspective,
527 // is four words: 'http', 'www', 'somewebsite', and 'com'.
528 HistoryIDSet history_id_set;
529 String16Vector words(unsorted_words);
530 // Sort the words into the longest first as such are likely to narrow down
531 // the results quicker. Also, single character words are the most expensive
532 // to process so save them for last.
533 std::sort(words.begin(), words.end(), LengthGreater);
534 for (String16Vector::iterator iter = words.begin(); iter != words.end();
535 ++iter) {
536 base::string16 uni_word = *iter;
537 HistoryIDSet term_history_set = HistoryIDsForTerm(uni_word);
538 if (term_history_set.empty()) {
539 history_id_set.clear();
540 break;
542 if (iter == words.begin()) {
543 history_id_set.swap(term_history_set);
544 } else {
545 HistoryIDSet new_history_id_set = base::STLSetIntersection<HistoryIDSet>(
546 history_id_set, term_history_set);
547 history_id_set.swap(new_history_id_set);
550 return history_id_set;
553 HistoryIDSet URLIndexPrivateData::HistoryIDsForTerm(
554 const base::string16& term) {
555 if (term.empty())
556 return HistoryIDSet();
558 // TODO(mrossetti): Consider optimizing for very common terms such as
559 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
560 // occuring words in the user's searches.
562 size_t term_length = term.length();
563 WordIDSet word_id_set;
564 if (term_length > 1) {
565 // See if this term or a prefix thereof is present in the cache.
566 base::string16 term_lower = base::i18n::ToLower(term);
567 SearchTermCacheMap::iterator best_prefix(search_term_cache_.end());
568 for (SearchTermCacheMap::iterator cache_iter = search_term_cache_.begin();
569 cache_iter != search_term_cache_.end(); ++cache_iter) {
570 if (base::StartsWith(term_lower,
571 base::i18n::ToLower(cache_iter->first),
572 base::CompareCase::SENSITIVE) &&
573 (best_prefix == search_term_cache_.end() ||
574 cache_iter->first.length() > best_prefix->first.length()))
575 best_prefix = cache_iter;
578 // If a prefix was found then determine the leftover characters to be used
579 // for further refining the results from that prefix.
580 Char16Set prefix_chars;
581 base::string16 leftovers(term);
582 if (best_prefix != search_term_cache_.end()) {
583 // If the prefix is an exact match for the term then grab the cached
584 // results and we're done.
585 size_t prefix_length = best_prefix->first.length();
586 if (prefix_length == term_length) {
587 best_prefix->second.used_ = true;
588 return best_prefix->second.history_id_set_;
591 // Otherwise we have a handy starting point.
592 // If there are no history results for this prefix then we can bail early
593 // as there will be no history results for the full term.
594 if (best_prefix->second.history_id_set_.empty()) {
595 search_term_cache_[term] = SearchTermCacheItem();
596 return HistoryIDSet();
598 word_id_set = best_prefix->second.word_id_set_;
599 prefix_chars = Char16SetFromString16(best_prefix->first);
600 leftovers = term.substr(prefix_length);
603 // Filter for each remaining, unique character in the term.
604 Char16Set leftover_chars = Char16SetFromString16(leftovers);
605 Char16Set unique_chars =
606 base::STLSetDifference<Char16Set>(leftover_chars, prefix_chars);
608 // Reduce the word set with any leftover, unprocessed characters.
609 if (!unique_chars.empty()) {
610 WordIDSet leftover_set(WordIDSetForTermChars(unique_chars));
611 // We might come up empty on the leftovers.
612 if (leftover_set.empty()) {
613 search_term_cache_[term] = SearchTermCacheItem();
614 return HistoryIDSet();
616 // Or there may not have been a prefix from which to start.
617 if (prefix_chars.empty()) {
618 word_id_set.swap(leftover_set);
619 } else {
620 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
621 word_id_set, leftover_set);
622 word_id_set.swap(new_word_id_set);
626 // We must filter the word list because the resulting word set surely
627 // contains words which do not have the search term as a proper subset.
628 for (WordIDSet::iterator word_set_iter = word_id_set.begin();
629 word_set_iter != word_id_set.end(); ) {
630 if (word_list_[*word_set_iter].find(term) == base::string16::npos)
631 word_id_set.erase(word_set_iter++);
632 else
633 ++word_set_iter;
635 } else {
636 word_id_set = WordIDSetForTermChars(Char16SetFromString16(term));
639 // If any words resulted then we can compose a set of history IDs by unioning
640 // the sets from each word.
641 HistoryIDSet history_id_set;
642 if (!word_id_set.empty()) {
643 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
644 word_id_iter != word_id_set.end(); ++word_id_iter) {
645 WordID word_id = *word_id_iter;
646 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id);
647 if (word_iter != word_id_history_map_.end()) {
648 HistoryIDSet& word_history_id_set(word_iter->second);
649 history_id_set.insert(word_history_id_set.begin(),
650 word_history_id_set.end());
655 // Record a new cache entry for this word if the term is longer than
656 // a single character.
657 if (term_length > 1)
658 search_term_cache_[term] = SearchTermCacheItem(word_id_set, history_id_set);
660 return history_id_set;
663 WordIDSet URLIndexPrivateData::WordIDSetForTermChars(
664 const Char16Set& term_chars) {
665 WordIDSet word_id_set;
666 for (Char16Set::const_iterator c_iter = term_chars.begin();
667 c_iter != term_chars.end(); ++c_iter) {
668 CharWordIDMap::iterator char_iter = char_word_map_.find(*c_iter);
669 if (char_iter == char_word_map_.end()) {
670 // A character was not found so there are no matching results: bail.
671 word_id_set.clear();
672 break;
674 WordIDSet& char_word_id_set(char_iter->second);
675 // It is possible for there to no longer be any words associated with
676 // a particular character. Give up in that case.
677 if (char_word_id_set.empty()) {
678 word_id_set.clear();
679 break;
682 if (c_iter == term_chars.begin()) {
683 // First character results becomes base set of results.
684 word_id_set = char_word_id_set;
685 } else {
686 // Subsequent character results get intersected in.
687 WordIDSet new_word_id_set = base::STLSetIntersection<WordIDSet>(
688 word_id_set, char_word_id_set);
689 word_id_set.swap(new_word_id_set);
692 return word_id_set;
695 bool URLIndexPrivateData::IndexRow(
696 history::HistoryDatabase* history_db,
697 history::HistoryService* history_service,
698 const history::URLRow& row,
699 const std::string& languages,
700 const std::set<std::string>& scheme_whitelist,
701 base::CancelableTaskTracker* tracker) {
702 const GURL& gurl(row.url());
704 // Index only URLs with a whitelisted scheme.
705 if (!URLSchemeIsWhitelisted(gurl, scheme_whitelist))
706 return false;
708 history::URLID row_id = row.id();
709 // Strip out username and password before saving and indexing.
710 base::string16 url(url_formatter::FormatUrl(
711 gurl, languages, url_formatter::kFormatUrlOmitUsernamePassword,
712 net::UnescapeRule::NONE, nullptr, nullptr, nullptr));
714 HistoryID history_id = static_cast<HistoryID>(row_id);
715 DCHECK_LT(history_id, std::numeric_limits<HistoryID>::max());
717 // Add the row for quick lookup in the history info store.
718 history::URLRow new_row(GURL(url), row_id);
719 new_row.set_visit_count(row.visit_count());
720 new_row.set_typed_count(row.typed_count());
721 new_row.set_last_visit(row.last_visit());
722 new_row.set_title(row.title());
723 history_info_map_[history_id].url_row = new_row;
725 // Index the words contained in the URL and title of the row.
726 RowWordStarts word_starts;
727 AddRowWordsToIndex(new_row, &word_starts, languages);
728 word_starts_map_[history_id] = word_starts;
730 // Update the recent visits information or schedule the update
731 // as appropriate.
732 if (history_db) {
733 // We'd like to check that we're on the history DB thread.
734 // However, unittest code actually calls this on the UI thread.
735 // So we don't do any thread checks.
736 history::VisitVector recent_visits;
737 // Make sure the private data is going to get as many recent visits as
738 // ScoredHistoryMatch::GetFrequency() hopes to use.
739 DCHECK_GE(kMaxVisitsToStoreInCache, ScoredHistoryMatch::kMaxVisitsToScore);
740 if (history_db->GetMostRecentVisitsForURL(row_id,
741 kMaxVisitsToStoreInCache,
742 &recent_visits))
743 UpdateRecentVisits(row_id, recent_visits);
744 } else {
745 DCHECK(tracker);
746 DCHECK(history_service);
747 ScheduleUpdateRecentVisits(history_service, row_id, tracker);
750 return true;
753 void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow& row,
754 RowWordStarts* word_starts,
755 const std::string& languages) {
756 HistoryID history_id = static_cast<HistoryID>(row.id());
757 // Split URL into individual, unique words then add in the title words.
758 const GURL& gurl(row.url());
759 const base::string16& url =
760 bookmarks::CleanUpUrlForMatching(gurl, languages, NULL);
761 String16Set url_words = String16SetFromString16(url,
762 word_starts ? &word_starts->url_word_starts_ : NULL);
763 const base::string16& title = bookmarks::CleanUpTitleForMatching(row.title());
764 String16Set title_words = String16SetFromString16(title,
765 word_starts ? &word_starts->title_word_starts_ : NULL);
766 String16Set words = base::STLSetUnion<String16Set>(url_words, title_words);
767 for (String16Set::iterator word_iter = words.begin();
768 word_iter != words.end(); ++word_iter)
769 AddWordToIndex(*word_iter, history_id);
771 search_term_cache_.clear(); // Invalidate the term cache.
774 void URLIndexPrivateData::AddWordToIndex(const base::string16& term,
775 HistoryID history_id) {
776 WordMap::iterator word_pos = word_map_.find(term);
777 if (word_pos != word_map_.end())
778 UpdateWordHistory(word_pos->second, history_id);
779 else
780 AddWordHistory(term, history_id);
783 void URLIndexPrivateData::AddWordHistory(const base::string16& term,
784 HistoryID history_id) {
785 WordID word_id = word_list_.size();
786 if (available_words_.empty()) {
787 word_list_.push_back(term);
788 } else {
789 word_id = *(available_words_.begin());
790 word_list_[word_id] = term;
791 available_words_.erase(word_id);
793 word_map_[term] = word_id;
795 HistoryIDSet history_id_set;
796 history_id_set.insert(history_id);
797 word_id_history_map_[word_id] = history_id_set;
798 AddToHistoryIDWordMap(history_id, word_id);
800 // For each character in the newly added word (i.e. a word that is not
801 // already in the word index), add the word to the character index.
802 Char16Set characters = Char16SetFromString16(term);
803 for (Char16Set::iterator uni_char_iter = characters.begin();
804 uni_char_iter != characters.end(); ++uni_char_iter) {
805 base::char16 uni_char = *uni_char_iter;
806 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char);
807 if (char_iter != char_word_map_.end()) {
808 // Update existing entry in the char/word index.
809 WordIDSet& word_id_set(char_iter->second);
810 word_id_set.insert(word_id);
811 } else {
812 // Create a new entry in the char/word index.
813 WordIDSet word_id_set;
814 word_id_set.insert(word_id);
815 char_word_map_[uni_char] = word_id_set;
820 void URLIndexPrivateData::UpdateWordHistory(WordID word_id,
821 HistoryID history_id) {
822 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id);
823 DCHECK(history_pos != word_id_history_map_.end());
824 HistoryIDSet& history_id_set(history_pos->second);
825 history_id_set.insert(history_id);
826 AddToHistoryIDWordMap(history_id, word_id);
829 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id,
830 WordID word_id) {
831 HistoryIDWordMap::iterator iter = history_id_word_map_.find(history_id);
832 if (iter != history_id_word_map_.end()) {
833 WordIDSet& word_id_set(iter->second);
834 word_id_set.insert(word_id);
835 } else {
836 WordIDSet word_id_set;
837 word_id_set.insert(word_id);
838 history_id_word_map_[history_id] = word_id_set;
842 void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow& row) {
843 RemoveRowWordsFromIndex(row);
844 HistoryID history_id = static_cast<HistoryID>(row.id());
845 history_info_map_.erase(history_id);
846 word_starts_map_.erase(history_id);
849 void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow& row) {
850 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
851 // this row.
852 HistoryID history_id = static_cast<HistoryID>(row.id());
853 WordIDSet word_id_set = history_id_word_map_[history_id];
854 history_id_word_map_.erase(history_id);
856 // Reconcile any changes to word usage.
857 for (WordIDSet::iterator word_id_iter = word_id_set.begin();
858 word_id_iter != word_id_set.end(); ++word_id_iter) {
859 WordID word_id = *word_id_iter;
860 word_id_history_map_[word_id].erase(history_id);
861 if (!word_id_history_map_[word_id].empty())
862 continue; // The word is still in use.
864 // The word is no longer in use. Reconcile any changes to character usage.
865 base::string16 word = word_list_[word_id];
866 Char16Set characters = Char16SetFromString16(word);
867 for (Char16Set::iterator uni_char_iter = characters.begin();
868 uni_char_iter != characters.end(); ++uni_char_iter) {
869 base::char16 uni_char = *uni_char_iter;
870 char_word_map_[uni_char].erase(word_id);
871 if (char_word_map_[uni_char].empty())
872 char_word_map_.erase(uni_char); // No longer in use.
875 // Complete the removal of references to the word.
876 word_id_history_map_.erase(word_id);
877 word_map_.erase(word);
878 word_list_[word_id] = base::string16();
879 available_words_.insert(word_id);
883 void URLIndexPrivateData::ResetSearchTermCache() {
884 for (SearchTermCacheMap::iterator iter = search_term_cache_.begin();
885 iter != search_term_cache_.end(); ++iter)
886 iter->second.used_ = false;
889 bool URLIndexPrivateData::SaveToFile(const base::FilePath& file_path) {
890 base::TimeTicks beginning_time = base::TimeTicks::Now();
891 InMemoryURLIndexCacheItem index_cache;
892 SavePrivateData(&index_cache);
893 std::string data;
894 if (!index_cache.SerializeToString(&data)) {
895 LOG(WARNING) << "Failed to serialize the InMemoryURLIndex cache.";
896 return false;
899 int size = data.size();
900 if (base::WriteFile(file_path, data.c_str(), size) != size) {
901 LOG(WARNING) << "Failed to write " << file_path.value();
902 return false;
904 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
905 base::TimeTicks::Now() - beginning_time);
906 return true;
909 void URLIndexPrivateData::SavePrivateData(
910 InMemoryURLIndexCacheItem* cache) const {
911 DCHECK(cache);
912 cache->set_last_rebuild_timestamp(
913 last_time_rebuilt_from_history_.ToInternalValue());
914 cache->set_version(saved_cache_version_);
915 // history_item_count_ is no longer used but rather than change the protobuf
916 // definition use a placeholder. This will go away with the switch to SQLite.
917 cache->set_history_item_count(0);
918 SaveWordList(cache);
919 SaveWordMap(cache);
920 SaveCharWordMap(cache);
921 SaveWordIDHistoryMap(cache);
922 SaveHistoryInfoMap(cache);
923 SaveWordStartsMap(cache);
926 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem* cache) const {
927 if (word_list_.empty())
928 return;
929 WordListItem* list_item = cache->mutable_word_list();
930 list_item->set_word_count(word_list_.size());
931 for (String16Vector::const_iterator iter = word_list_.begin();
932 iter != word_list_.end(); ++iter)
933 list_item->add_word(base::UTF16ToUTF8(*iter));
936 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem* cache) const {
937 if (word_map_.empty())
938 return;
939 WordMapItem* map_item = cache->mutable_word_map();
940 map_item->set_item_count(word_map_.size());
941 for (WordMap::const_iterator iter = word_map_.begin();
942 iter != word_map_.end(); ++iter) {
943 WordMapEntry* map_entry = map_item->add_word_map_entry();
944 map_entry->set_word(base::UTF16ToUTF8(iter->first));
945 map_entry->set_word_id(iter->second);
949 void URLIndexPrivateData::SaveCharWordMap(
950 InMemoryURLIndexCacheItem* cache) const {
951 if (char_word_map_.empty())
952 return;
953 CharWordMapItem* map_item = cache->mutable_char_word_map();
954 map_item->set_item_count(char_word_map_.size());
955 for (CharWordIDMap::const_iterator iter = char_word_map_.begin();
956 iter != char_word_map_.end(); ++iter) {
957 CharWordMapEntry* map_entry = map_item->add_char_word_map_entry();
958 map_entry->set_char_16(iter->first);
959 const WordIDSet& word_id_set(iter->second);
960 map_entry->set_item_count(word_id_set.size());
961 for (WordIDSet::const_iterator set_iter = word_id_set.begin();
962 set_iter != word_id_set.end(); ++set_iter)
963 map_entry->add_word_id(*set_iter);
967 void URLIndexPrivateData::SaveWordIDHistoryMap(
968 InMemoryURLIndexCacheItem* cache) const {
969 if (word_id_history_map_.empty())
970 return;
971 WordIDHistoryMapItem* map_item = cache->mutable_word_id_history_map();
972 map_item->set_item_count(word_id_history_map_.size());
973 for (WordIDHistoryMap::const_iterator iter = word_id_history_map_.begin();
974 iter != word_id_history_map_.end(); ++iter) {
975 WordIDHistoryMapEntry* map_entry =
976 map_item->add_word_id_history_map_entry();
977 map_entry->set_word_id(iter->first);
978 const HistoryIDSet& history_id_set(iter->second);
979 map_entry->set_item_count(history_id_set.size());
980 for (HistoryIDSet::const_iterator set_iter = history_id_set.begin();
981 set_iter != history_id_set.end(); ++set_iter)
982 map_entry->add_history_id(*set_iter);
986 void URLIndexPrivateData::SaveHistoryInfoMap(
987 InMemoryURLIndexCacheItem* cache) const {
988 if (history_info_map_.empty())
989 return;
990 HistoryInfoMapItem* map_item = cache->mutable_history_info_map();
991 map_item->set_item_count(history_info_map_.size());
992 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
993 iter != history_info_map_.end(); ++iter) {
994 HistoryInfoMapEntry* map_entry = map_item->add_history_info_map_entry();
995 map_entry->set_history_id(iter->first);
996 const history::URLRow& url_row(iter->second.url_row);
997 // Note: We only save information that contributes to the index so there
998 // is no need to save search_term_cache_ (not persistent).
999 map_entry->set_visit_count(url_row.visit_count());
1000 map_entry->set_typed_count(url_row.typed_count());
1001 map_entry->set_last_visit(url_row.last_visit().ToInternalValue());
1002 map_entry->set_url(url_row.url().spec());
1003 map_entry->set_title(base::UTF16ToUTF8(url_row.title()));
1004 const VisitInfoVector& visits(iter->second.visits);
1005 for (VisitInfoVector::const_iterator visit_iter = visits.begin();
1006 visit_iter != visits.end(); ++visit_iter) {
1007 HistoryInfoMapEntry_VisitInfo* visit_info = map_entry->add_visits();
1008 visit_info->set_visit_time(visit_iter->first.ToInternalValue());
1009 visit_info->set_transition_type(visit_iter->second);
1014 void URLIndexPrivateData::SaveWordStartsMap(
1015 InMemoryURLIndexCacheItem* cache) const {
1016 if (word_starts_map_.empty())
1017 return;
1018 // For unit testing: Enable saving of the cache as an earlier version to
1019 // allow testing of cache file upgrading in ReadFromFile().
1020 // TODO(mrossetti): Instead of intruding on production code with this kind of
1021 // test harness, save a copy of an older version cache with known results.
1022 // Implement this when switching the caching over to SQLite.
1023 if (saved_cache_version_ < 1)
1024 return;
1026 WordStartsMapItem* map_item = cache->mutable_word_starts_map();
1027 map_item->set_item_count(word_starts_map_.size());
1028 for (WordStartsMap::const_iterator iter = word_starts_map_.begin();
1029 iter != word_starts_map_.end(); ++iter) {
1030 WordStartsMapEntry* map_entry = map_item->add_word_starts_map_entry();
1031 map_entry->set_history_id(iter->first);
1032 const RowWordStarts& word_starts(iter->second);
1033 for (WordStarts::const_iterator i = word_starts.url_word_starts_.begin();
1034 i != word_starts.url_word_starts_.end(); ++i)
1035 map_entry->add_url_word_starts(*i);
1036 for (WordStarts::const_iterator i = word_starts.title_word_starts_.begin();
1037 i != word_starts.title_word_starts_.end(); ++i)
1038 map_entry->add_title_word_starts(*i);
1042 bool URLIndexPrivateData::RestorePrivateData(
1043 const InMemoryURLIndexCacheItem& cache,
1044 const std::string& languages) {
1045 last_time_rebuilt_from_history_ =
1046 base::Time::FromInternalValue(cache.last_rebuild_timestamp());
1047 const base::TimeDelta rebuilt_ago =
1048 base::Time::Now() - last_time_rebuilt_from_history_;
1049 if ((rebuilt_ago > base::TimeDelta::FromDays(7)) ||
1050 (rebuilt_ago < base::TimeDelta::FromDays(-1))) {
1051 // Cache is more than a week old or, somehow, from some time in the future.
1052 // It's probably a good time to rebuild the index from history to
1053 // allow synced entries to now appear, expired entries to disappear, etc.
1054 // Allow one day in the future to make the cache not rebuild on simple
1055 // system clock changes such as time zone changes.
1056 return false;
1058 if (cache.has_version()) {
1059 if (cache.version() < kCurrentCacheFileVersion) {
1060 // Don't try to restore an old format cache file. (This will cause
1061 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1062 // from history.)
1063 return false;
1065 restored_cache_version_ = cache.version();
1067 return RestoreWordList(cache) && RestoreWordMap(cache) &&
1068 RestoreCharWordMap(cache) && RestoreWordIDHistoryMap(cache) &&
1069 RestoreHistoryInfoMap(cache) && RestoreWordStartsMap(cache, languages);
1072 bool URLIndexPrivateData::RestoreWordList(
1073 const InMemoryURLIndexCacheItem& cache) {
1074 if (!cache.has_word_list())
1075 return false;
1076 const WordListItem& list_item(cache.word_list());
1077 uint32 expected_item_count = list_item.word_count();
1078 uint32 actual_item_count = list_item.word_size();
1079 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1080 return false;
1081 const RepeatedPtrField<std::string>& words(list_item.word());
1082 for (RepeatedPtrField<std::string>::const_iterator iter = words.begin();
1083 iter != words.end(); ++iter)
1084 word_list_.push_back(base::UTF8ToUTF16(*iter));
1085 return true;
1088 bool URLIndexPrivateData::RestoreWordMap(
1089 const InMemoryURLIndexCacheItem& cache) {
1090 if (!cache.has_word_map())
1091 return false;
1092 const WordMapItem& list_item(cache.word_map());
1093 uint32 expected_item_count = list_item.item_count();
1094 uint32 actual_item_count = list_item.word_map_entry_size();
1095 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1096 return false;
1097 const RepeatedPtrField<WordMapEntry>& entries(list_item.word_map_entry());
1098 for (RepeatedPtrField<WordMapEntry>::const_iterator iter = entries.begin();
1099 iter != entries.end(); ++iter)
1100 word_map_[base::UTF8ToUTF16(iter->word())] = iter->word_id();
1101 return true;
1104 bool URLIndexPrivateData::RestoreCharWordMap(
1105 const InMemoryURLIndexCacheItem& cache) {
1106 if (!cache.has_char_word_map())
1107 return false;
1108 const CharWordMapItem& list_item(cache.char_word_map());
1109 uint32 expected_item_count = list_item.item_count();
1110 uint32 actual_item_count = list_item.char_word_map_entry_size();
1111 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1112 return false;
1113 const RepeatedPtrField<CharWordMapEntry>&
1114 entries(list_item.char_word_map_entry());
1115 for (RepeatedPtrField<CharWordMapEntry>::const_iterator iter =
1116 entries.begin(); iter != entries.end(); ++iter) {
1117 expected_item_count = iter->item_count();
1118 actual_item_count = iter->word_id_size();
1119 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1120 return false;
1121 base::char16 uni_char = static_cast<base::char16>(iter->char_16());
1122 WordIDSet word_id_set;
1123 const RepeatedField<int32>& word_ids(iter->word_id());
1124 for (RepeatedField<int32>::const_iterator jiter = word_ids.begin();
1125 jiter != word_ids.end(); ++jiter)
1126 word_id_set.insert(*jiter);
1127 char_word_map_[uni_char] = word_id_set;
1129 return true;
1132 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1133 const InMemoryURLIndexCacheItem& cache) {
1134 if (!cache.has_word_id_history_map())
1135 return false;
1136 const WordIDHistoryMapItem& list_item(cache.word_id_history_map());
1137 uint32 expected_item_count = list_item.item_count();
1138 uint32 actual_item_count = list_item.word_id_history_map_entry_size();
1139 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1140 return false;
1141 const RepeatedPtrField<WordIDHistoryMapEntry>&
1142 entries(list_item.word_id_history_map_entry());
1143 for (RepeatedPtrField<WordIDHistoryMapEntry>::const_iterator iter =
1144 entries.begin(); iter != entries.end(); ++iter) {
1145 expected_item_count = iter->item_count();
1146 actual_item_count = iter->history_id_size();
1147 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1148 return false;
1149 WordID word_id = iter->word_id();
1150 HistoryIDSet history_id_set;
1151 const RepeatedField<int64>& history_ids(iter->history_id());
1152 for (RepeatedField<int64>::const_iterator jiter = history_ids.begin();
1153 jiter != history_ids.end(); ++jiter) {
1154 history_id_set.insert(*jiter);
1155 AddToHistoryIDWordMap(*jiter, word_id);
1157 word_id_history_map_[word_id] = history_id_set;
1159 return true;
1162 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1163 const InMemoryURLIndexCacheItem& cache) {
1164 if (!cache.has_history_info_map())
1165 return false;
1166 const HistoryInfoMapItem& list_item(cache.history_info_map());
1167 uint32 expected_item_count = list_item.item_count();
1168 uint32 actual_item_count = list_item.history_info_map_entry_size();
1169 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1170 return false;
1171 const RepeatedPtrField<HistoryInfoMapEntry>&
1172 entries(list_item.history_info_map_entry());
1173 for (RepeatedPtrField<HistoryInfoMapEntry>::const_iterator iter =
1174 entries.begin(); iter != entries.end(); ++iter) {
1175 HistoryID history_id = iter->history_id();
1176 GURL url(iter->url());
1177 history::URLRow url_row(url, history_id);
1178 url_row.set_visit_count(iter->visit_count());
1179 url_row.set_typed_count(iter->typed_count());
1180 url_row.set_last_visit(base::Time::FromInternalValue(iter->last_visit()));
1181 if (iter->has_title()) {
1182 base::string16 title(base::UTF8ToUTF16(iter->title()));
1183 url_row.set_title(title);
1185 history_info_map_[history_id].url_row = url_row;
1187 // Restore visits list.
1188 VisitInfoVector visits;
1189 visits.reserve(iter->visits_size());
1190 for (int i = 0; i < iter->visits_size(); ++i) {
1191 visits.push_back(std::make_pair(
1192 base::Time::FromInternalValue(iter->visits(i).visit_time()),
1193 ui::PageTransitionFromInt(iter->visits(i).transition_type())));
1195 history_info_map_[history_id].visits = visits;
1197 return true;
1200 bool URLIndexPrivateData::RestoreWordStartsMap(
1201 const InMemoryURLIndexCacheItem& cache,
1202 const std::string& languages) {
1203 // Note that this function must be called after RestoreHistoryInfoMap() has
1204 // been run as the word starts may have to be recalculated from the urls and
1205 // page titles.
1206 if (cache.has_word_starts_map()) {
1207 const WordStartsMapItem& list_item(cache.word_starts_map());
1208 uint32 expected_item_count = list_item.item_count();
1209 uint32 actual_item_count = list_item.word_starts_map_entry_size();
1210 if (actual_item_count == 0 || actual_item_count != expected_item_count)
1211 return false;
1212 const RepeatedPtrField<WordStartsMapEntry>&
1213 entries(list_item.word_starts_map_entry());
1214 for (RepeatedPtrField<WordStartsMapEntry>::const_iterator iter =
1215 entries.begin(); iter != entries.end(); ++iter) {
1216 HistoryID history_id = iter->history_id();
1217 RowWordStarts word_starts;
1218 // Restore the URL word starts.
1219 const RepeatedField<int32>& url_starts(iter->url_word_starts());
1220 for (RepeatedField<int32>::const_iterator jiter = url_starts.begin();
1221 jiter != url_starts.end(); ++jiter)
1222 word_starts.url_word_starts_.push_back(*jiter);
1223 // Restore the page title word starts.
1224 const RepeatedField<int32>& title_starts(iter->title_word_starts());
1225 for (RepeatedField<int32>::const_iterator jiter = title_starts.begin();
1226 jiter != title_starts.end(); ++jiter)
1227 word_starts.title_word_starts_.push_back(*jiter);
1228 word_starts_map_[history_id] = word_starts;
1230 } else {
1231 // Since the cache did not contain any word starts we must rebuild then from
1232 // the URL and page titles.
1233 for (HistoryInfoMap::const_iterator iter = history_info_map_.begin();
1234 iter != history_info_map_.end(); ++iter) {
1235 RowWordStarts word_starts;
1236 const history::URLRow& row(iter->second.url_row);
1237 const base::string16& url =
1238 bookmarks::CleanUpUrlForMatching(row.url(), languages, NULL);
1239 String16VectorFromString16(url, false, &word_starts.url_word_starts_);
1240 const base::string16& title =
1241 bookmarks::CleanUpTitleForMatching(row.title());
1242 String16VectorFromString16(title, false, &word_starts.title_word_starts_);
1243 word_starts_map_[iter->first] = word_starts;
1246 return true;
1249 // static
1250 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1251 const GURL& gurl,
1252 const std::set<std::string>& whitelist) {
1253 return whitelist.find(gurl.scheme()) != whitelist.end();
1257 // SearchTermCacheItem ---------------------------------------------------------
1259 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1260 const WordIDSet& word_id_set,
1261 const HistoryIDSet& history_id_set)
1262 : word_id_set_(word_id_set), history_id_set_(history_id_set), used_(true) {
1265 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {
1268 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {
1271 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1273 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1274 bookmarks::BookmarkModel* bookmark_model,
1275 const URLIndexPrivateData& private_data,
1276 const std::string& languages,
1277 const base::string16& lower_string,
1278 const String16Vector& lower_terms,
1279 const base::Time now)
1280 : bookmark_model_(bookmark_model),
1281 private_data_(private_data),
1282 languages_(languages),
1283 lower_string_(lower_string),
1284 lower_terms_(lower_terms),
1285 now_(now) {
1286 // Calculate offsets for each term. For instance, the offset for
1287 // ".net" should be 1, indicating that the actual word-part of the term
1288 // starts at offset 1.
1289 lower_terms_to_word_starts_offsets_.resize(lower_terms_.size(), 0u);
1290 for (size_t i = 0; i < lower_terms_.size(); ++i) {
1291 base::i18n::BreakIterator iter(lower_terms_[i],
1292 base::i18n::BreakIterator::BREAK_WORD);
1293 // If the iterator doesn't work, assume an offset of 0.
1294 if (!iter.Init())
1295 continue;
1296 // Find the first word start. If the iterator didn't find a word break, set
1297 // an offset of term size. For example, the offset for "://" should be 3,
1298 // indicating that the word-part is missing.
1299 while (iter.Advance() && !iter.IsWord()) {}
1301 lower_terms_to_word_starts_offsets_[i] = iter.prev();
1305 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {
1308 void URLIndexPrivateData::AddHistoryMatch::operator()(
1309 const HistoryID history_id) {
1310 HistoryInfoMap::const_iterator hist_pos =
1311 private_data_.history_info_map_.find(history_id);
1312 if (hist_pos != private_data_.history_info_map_.end()) {
1313 const history::URLRow& hist_item = hist_pos->second.url_row;
1314 const VisitInfoVector& visits = hist_pos->second.visits;
1315 WordStartsMap::const_iterator starts_pos =
1316 private_data_.word_starts_map_.find(history_id);
1317 DCHECK(starts_pos != private_data_.word_starts_map_.end());
1318 ScoredHistoryMatch match(
1319 hist_item, visits, languages_, lower_string_, lower_terms_,
1320 lower_terms_to_word_starts_offsets_, starts_pos->second,
1321 bookmark_model_ && bookmark_model_->IsBookmarked(hist_item.url()),
1322 now_);
1323 if (match.raw_score > 0)
1324 scored_matches_.push_back(match);
1329 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1331 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1332 const HistoryInfoMap& history_info_map)
1333 : history_info_map_(history_info_map) {
1336 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {
1339 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1340 const HistoryID h1,
1341 const HistoryID h2) {
1342 HistoryInfoMap::const_iterator entry1(history_info_map_.find(h1));
1343 if (entry1 == history_info_map_.end())
1344 return false;
1345 HistoryInfoMap::const_iterator entry2(history_info_map_.find(h2));
1346 if (entry2 == history_info_map_.end())
1347 return true;
1348 const history::URLRow& r1(entry1->second.url_row);
1349 const history::URLRow& r2(entry2->second.url_row);
1350 // First cut: typed count, visit count, recency.
1351 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1352 // recently visited (within the last 12/24 hours) as highly important. Get
1353 // input from mpearson.
1354 if (r1.typed_count() != r2.typed_count())
1355 return (r1.typed_count() > r2.typed_count());
1356 if (r1.visit_count() != r2.visit_count())
1357 return (r1.visit_count() > r2.visit_count());
1358 return (r1.last_visit() > r2.last_visit());