1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
#include "components/omnibox/browser/url_index_private_data.h"

#include <algorithm>
#include <functional>
#include <iterator>
#include <limits>
#include <numeric>
#include <string>
#include <vector>

#include "base/basictypes.h"
#include "base/files/file_util.h"
#include "base/i18n/break_iterator.h"
#include "base/i18n/case_conversion.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "components/bookmarks/browser/bookmark_model.h"
#include "components/bookmarks/browser/bookmark_utils.h"
#include "components/history/core/browser/history_database.h"
#include "components/history/core/browser/history_db_task.h"
#include "components/history/core/browser/history_service.h"
#include "components/omnibox/browser/in_memory_url_index.h"
#include "components/url_formatter/url_formatter.h"
#include "net/base/escape.h"
31 #if defined(USE_SYSTEM_PROTOBUF)
32 #include <google/protobuf/repeated_field.h>
34 #include "third_party/protobuf/src/google/protobuf/repeated_field.h"
37 using google::protobuf::RepeatedField
;
38 using google::protobuf::RepeatedPtrField
;
39 using in_memory_url_index::InMemoryURLIndexCacheItem
;
42 static const size_t kMaxVisitsToStoreInCache
= 10u;
45 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordListItem
47 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem_WordMapEntry
49 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordMapItem WordMapItem
;
50 typedef in_memory_url_index::InMemoryURLIndexCacheItem_CharWordMapItem
52 typedef in_memory_url_index::
53 InMemoryURLIndexCacheItem_CharWordMapItem_CharWordMapEntry CharWordMapEntry
;
54 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordIDHistoryMapItem
56 typedef in_memory_url_index::
57 InMemoryURLIndexCacheItem_WordIDHistoryMapItem_WordIDHistoryMapEntry
58 WordIDHistoryMapEntry
;
59 typedef in_memory_url_index::InMemoryURLIndexCacheItem_HistoryInfoMapItem
61 typedef in_memory_url_index::
62 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry
64 typedef in_memory_url_index::
65 InMemoryURLIndexCacheItem_HistoryInfoMapItem_HistoryInfoMapEntry_VisitInfo
66 HistoryInfoMapEntry_VisitInfo
;
67 typedef in_memory_url_index::InMemoryURLIndexCacheItem_WordStartsMapItem
69 typedef in_memory_url_index::
70 InMemoryURLIndexCacheItem_WordStartsMapItem_WordStartsMapEntry
73 // Algorithm Functions ---------------------------------------------------------
75 // Comparison function for sorting search terms by descending length.
76 bool LengthGreater(const base::string16
& string_a
,
77 const base::string16
& string_b
) {
78 return string_a
.length() > string_b
.length();
82 // UpdateRecentVisitsFromHistoryDBTask -----------------------------------------
84 // HistoryDBTask used to update the recent visit data for a particular
85 // row from the history database.
86 class UpdateRecentVisitsFromHistoryDBTask
: public history::HistoryDBTask
{
88 explicit UpdateRecentVisitsFromHistoryDBTask(
89 URLIndexPrivateData
* private_data
,
90 history::URLID url_id
);
92 bool RunOnDBThread(history::HistoryBackend
* backend
,
93 history::HistoryDatabase
* db
) override
;
94 void DoneRunOnMainThread() override
;
97 ~UpdateRecentVisitsFromHistoryDBTask() override
;
99 // The URLIndexPrivateData that gets updated after the historyDB
101 URLIndexPrivateData
* private_data_
;
102 // The ID of the URL to get visits for and then update.
103 history::URLID url_id_
;
104 // Whether fetching the recent visits for the URL succeeded.
106 // The awaited data that's shown to private_data_ for it to copy and
108 history::VisitVector recent_visits_
;
110 DISALLOW_COPY_AND_ASSIGN(UpdateRecentVisitsFromHistoryDBTask
);
113 UpdateRecentVisitsFromHistoryDBTask::UpdateRecentVisitsFromHistoryDBTask(
114 URLIndexPrivateData
* private_data
,
115 history::URLID url_id
)
116 : private_data_(private_data
), url_id_(url_id
), succeeded_(false) {
119 bool UpdateRecentVisitsFromHistoryDBTask::RunOnDBThread(
120 history::HistoryBackend
* backend
,
121 history::HistoryDatabase
* db
) {
122 // Make sure the private data is going to get as many recent visits as
123 // ScoredHistoryMatch::GetFrequency() hopes to use.
124 DCHECK_GE(kMaxVisitsToStoreInCache
, ScoredHistoryMatch::kMaxVisitsToScore
);
125 succeeded_
= db
->GetMostRecentVisitsForURL(url_id_
,
126 kMaxVisitsToStoreInCache
,
129 recent_visits_
.clear();
130 return true; // Always claim to be done; do not retry failures.
133 void UpdateRecentVisitsFromHistoryDBTask::DoneRunOnMainThread() {
135 private_data_
->UpdateRecentVisits(url_id_
, recent_visits_
);
138 UpdateRecentVisitsFromHistoryDBTask::~UpdateRecentVisitsFromHistoryDBTask() {
142 // URLIndexPrivateData ---------------------------------------------------------
144 URLIndexPrivateData::URLIndexPrivateData()
145 : restored_cache_version_(0),
146 saved_cache_version_(kCurrentCacheFileVersion
),
147 pre_filter_item_count_(0),
148 post_filter_item_count_(0),
149 post_scoring_item_count_(0) {
152 ScoredHistoryMatches
URLIndexPrivateData::HistoryItemsForTerms(
153 base::string16 search_string
,
154 size_t cursor_position
,
156 const std::string
& languages
,
157 bookmarks::BookmarkModel
* bookmark_model
) {
158 // If cursor position is set and useful (not at either end of the
159 // string), allow the search string to be broken at cursor position.
160 // We do this by pretending there's a space where the cursor is.
161 if ((cursor_position
!= base::string16::npos
) &&
162 (cursor_position
< search_string
.length()) &&
163 (cursor_position
> 0)) {
164 search_string
.insert(cursor_position
, base::ASCIIToUTF16(" "));
166 pre_filter_item_count_
= 0;
167 post_filter_item_count_
= 0;
168 post_scoring_item_count_
= 0;
169 // The search string we receive may contain escaped characters. For reducing
170 // the index we need individual, lower-cased words, ignoring escapings. For
171 // the final filtering we need whitespace separated substrings possibly
172 // containing escaped characters.
173 base::string16
lower_raw_string(base::i18n::ToLower(search_string
));
174 base::string16 lower_unescaped_string
=
175 net::UnescapeURLComponent(lower_raw_string
,
176 net::UnescapeRule::SPACES
| net::UnescapeRule::URL_SPECIAL_CHARS
);
177 // Extract individual 'words' (as opposed to 'terms'; see below) from the
178 // search string. When the user types "colspec=ID%20Mstone Release" we get
179 // four 'words': "colspec", "id", "mstone" and "release".
180 String16Vector
lower_words(
181 String16VectorFromString16(lower_unescaped_string
, false, NULL
));
182 ScoredHistoryMatches scored_items
;
184 // Do nothing if we have indexed no words (probably because we've not been
185 // initialized yet) or the search string has no words.
186 if (word_list_
.empty() || lower_words
.empty()) {
187 search_term_cache_
.clear(); // Invalidate the term cache.
191 // Reset used_ flags for search_term_cache_. We use a basic mark-and-sweep
193 ResetSearchTermCache();
195 HistoryIDSet history_id_set
= HistoryIDSetFromWords(lower_words
);
197 // Trim the candidate pool if it is large. Note that we do not filter out
198 // items that do not contain the search terms as proper substrings -- doing
199 // so is the performance-costly operation we are trying to avoid in order
200 // to maintain omnibox responsiveness.
201 const size_t kItemsToScoreLimit
= 500;
202 pre_filter_item_count_
= history_id_set
.size();
203 // If we trim the results set we do not want to cache the results for next
204 // time as the user's ultimately desired result could easily be eliminated
205 // in this early rough filter.
206 bool was_trimmed
= (pre_filter_item_count_
> kItemsToScoreLimit
);
208 HistoryIDVector history_ids
;
209 std::copy(history_id_set
.begin(), history_id_set
.end(),
210 std::back_inserter(history_ids
));
211 // Trim down the set by sorting by typed-count, visit-count, and last
213 HistoryItemFactorGreater
214 item_factor_functor(history_info_map_
);
215 std::partial_sort(history_ids
.begin(),
216 history_ids
.begin() + kItemsToScoreLimit
,
218 item_factor_functor
);
219 history_id_set
.clear();
220 std::copy(history_ids
.begin(), history_ids
.begin() + kItemsToScoreLimit
,
221 std::inserter(history_id_set
, history_id_set
.end()));
222 post_filter_item_count_
= history_id_set
.size();
225 // Pass over all of the candidates filtering out any without a proper
226 // substring match, inserting those which pass in order by score. Note that
227 // in this step we are using the raw search string complete with escaped
228 // URL elements. When the user has specifically typed something akin to
229 // "sort=pri&colspec=ID%20Mstone%20Release" we want to make sure that that
230 // specific substring appears in the URL or page title.
232 // We call these 'terms' (as opposed to 'words'; see above) as in this case
233 // we only want to break up the search string on 'true' whitespace rather than
234 // escaped whitespace. When the user types "colspec=ID%20Mstone Release" we
235 // get two 'terms': "colspec=id%20mstone" and "release".
236 String16Vector lower_raw_terms
= base::SplitString(
237 lower_raw_string
, base::kWhitespaceUTF16
, base::KEEP_WHITESPACE
,
238 base::SPLIT_WANT_NONEMPTY
);
239 if (lower_raw_terms
.empty()) {
240 // Don't score matches when there are no terms to score against. (It's
241 // possible that the word break iterater that extracts words to search
242 // for in the database allows some whitespace "words" whereas SplitString
243 // excludes a long list of whitespace.) One could write a scoring
244 // function that gives a reasonable order to matches when there
245 // are no terms (i.e., all the words are some form of whitespace),
246 // but this is such a rare edge case that it's not worth the time.
251 history_id_set
.begin(), history_id_set
.end(),
252 AddHistoryMatch(bookmark_model
, *this, languages
, lower_raw_string
,
253 lower_raw_terms
, base::Time::Now())).ScoredMatches();
255 // Select and sort only the top |max_matches| results.
256 if (scored_items
.size() > max_matches
) {
257 std::partial_sort(scored_items
.begin(),
258 scored_items
.begin() +
261 ScoredHistoryMatch::MatchScoreGreater
);
262 scored_items
.resize(max_matches
);
264 std::sort(scored_items
.begin(), scored_items
.end(),
265 ScoredHistoryMatch::MatchScoreGreater
);
267 post_scoring_item_count_
= scored_items
.size();
270 search_term_cache_
.clear(); // Invalidate the term cache.
272 // Remove any stale SearchTermCacheItems.
273 for (SearchTermCacheMap::iterator cache_iter
= search_term_cache_
.begin();
274 cache_iter
!= search_term_cache_
.end(); ) {
275 if (!cache_iter
->second
.used_
)
276 search_term_cache_
.erase(cache_iter
++);
285 bool URLIndexPrivateData::UpdateURL(
286 history::HistoryService
* history_service
,
287 const history::URLRow
& row
,
288 const std::string
& languages
,
289 const std::set
<std::string
>& scheme_whitelist
,
290 base::CancelableTaskTracker
* tracker
) {
291 // The row may or may not already be in our index. If it is not already
292 // indexed and it qualifies then it gets indexed. If it is already
293 // indexed and still qualifies then it gets updated, otherwise it
294 // is deleted from the index.
295 bool row_was_updated
= false;
296 history::URLID row_id
= row
.id();
297 HistoryInfoMap::iterator row_pos
= history_info_map_
.find(row_id
);
298 if (row_pos
== history_info_map_
.end()) {
299 // This new row should be indexed if it qualifies.
300 history::URLRow
new_row(row
);
301 new_row
.set_id(row_id
);
302 row_was_updated
= RowQualifiesAsSignificant(new_row
, base::Time()) &&
309 } else if (RowQualifiesAsSignificant(row
, base::Time())) {
310 // This indexed row still qualifies and will be re-indexed.
311 // The url won't have changed but the title, visit count, etc.
312 // might have changed.
313 history::URLRow
& row_to_update
= row_pos
->second
.url_row
;
314 bool title_updated
= row_to_update
.title() != row
.title();
315 if (row_to_update
.visit_count() != row
.visit_count() ||
316 row_to_update
.typed_count() != row
.typed_count() ||
317 row_to_update
.last_visit() != row
.last_visit() || title_updated
) {
318 row_to_update
.set_visit_count(row
.visit_count());
319 row_to_update
.set_typed_count(row
.typed_count());
320 row_to_update
.set_last_visit(row
.last_visit());
321 // If something appears to have changed, update the recent visits
323 ScheduleUpdateRecentVisits(history_service
, row_id
, tracker
);
324 // While the URL is guaranteed to remain stable, the title may have
325 // changed. If so, then update the index with the changed words.
327 // Clear all words associated with this row and re-index both the
329 RemoveRowWordsFromIndex(row_to_update
);
330 row_to_update
.set_title(row
.title());
331 RowWordStarts word_starts
;
332 AddRowWordsToIndex(row_to_update
, &word_starts
, languages
);
333 word_starts_map_
[row_id
] = word_starts
;
335 row_was_updated
= true;
338 // This indexed row no longer qualifies and will be de-indexed by
339 // clearing all words associated with this row.
340 RemoveRowFromIndex(row
);
341 row_was_updated
= true;
344 search_term_cache_
.clear(); // This invalidates the cache.
345 return row_was_updated
;
348 void URLIndexPrivateData::UpdateRecentVisits(
349 history::URLID url_id
,
350 const history::VisitVector
& recent_visits
) {
351 HistoryInfoMap::iterator row_pos
= history_info_map_
.find(url_id
);
352 if (row_pos
!= history_info_map_
.end()) {
353 VisitInfoVector
* visits
= &row_pos
->second
.visits
;
356 std::min(recent_visits
.size(), kMaxVisitsToStoreInCache
);
357 visits
->reserve(size
);
358 for (size_t i
= 0; i
< size
; i
++) {
359 // Copy from the history::VisitVector the only fields visits needs.
360 visits
->push_back(std::make_pair(recent_visits
[i
].visit_time
,
361 recent_visits
[i
].transition
));
364 // Else: Oddly, the URL doesn't seem to exist in the private index.
365 // Ignore this update. This can happen if, for instance, the user
366 // removes the URL from URLIndexPrivateData before the historyDB call
370 void URLIndexPrivateData::ScheduleUpdateRecentVisits(
371 history::HistoryService
* history_service
,
372 history::URLID url_id
,
373 base::CancelableTaskTracker
* tracker
) {
374 history_service
->ScheduleDBTask(
375 scoped_ptr
<history::HistoryDBTask
>(
376 new UpdateRecentVisitsFromHistoryDBTask(this, url_id
)), tracker
);
379 // Helper functor for DeleteURL.
380 class HistoryInfoMapItemHasURL
{
382 explicit HistoryInfoMapItemHasURL(const GURL
& url
): url_(url
) {}
384 bool operator()(const std::pair
<const HistoryID
, HistoryInfoMapValue
>& item
) {
385 return item
.second
.url_row
.url() == url_
;
392 bool URLIndexPrivateData::DeleteURL(const GURL
& url
) {
393 // Find the matching entry in the history_info_map_.
394 HistoryInfoMap::iterator pos
= std::find_if(
395 history_info_map_
.begin(),
396 history_info_map_
.end(),
397 HistoryInfoMapItemHasURL(url
));
398 if (pos
== history_info_map_
.end())
400 RemoveRowFromIndex(pos
->second
.url_row
);
401 search_term_cache_
.clear(); // This invalidates the cache.
406 scoped_refptr
<URLIndexPrivateData
> URLIndexPrivateData::RestoreFromFile(
407 const base::FilePath
& file_path
,
408 const std::string
& languages
) {
409 base::TimeTicks beginning_time
= base::TimeTicks::Now();
410 if (!base::PathExists(file_path
))
413 // If there is no cache file then simply give up. This will cause us to
414 // attempt to rebuild from the history database.
415 if (!base::ReadFileToString(file_path
, &data
))
418 scoped_refptr
<URLIndexPrivateData
> restored_data(new URLIndexPrivateData
);
419 InMemoryURLIndexCacheItem index_cache
;
420 if (!index_cache
.ParseFromArray(data
.c_str(), data
.size())) {
421 LOG(WARNING
) << "Failed to parse URLIndexPrivateData cache data read from "
422 << file_path
.value();
423 return restored_data
;
426 if (!restored_data
->RestorePrivateData(index_cache
, languages
))
429 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexRestoreCacheTime",
430 base::TimeTicks::Now() - beginning_time
);
431 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
432 restored_data
->history_id_word_map_
.size());
433 UMA_HISTOGRAM_COUNTS("History.InMemoryURLCacheSize", data
.size());
434 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
435 restored_data
->word_map_
.size());
436 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
437 restored_data
->char_word_map_
.size());
438 if (restored_data
->Empty())
439 return NULL
; // 'No data' is the same as a failed reload.
440 return restored_data
;
444 scoped_refptr
<URLIndexPrivateData
> URLIndexPrivateData::RebuildFromHistory(
445 history::HistoryDatabase
* history_db
,
446 const std::string
& languages
,
447 const std::set
<std::string
>& scheme_whitelist
) {
451 base::TimeTicks beginning_time
= base::TimeTicks::Now();
453 scoped_refptr
<URLIndexPrivateData
>
454 rebuilt_data(new URLIndexPrivateData
);
455 history::URLDatabase::URLEnumerator history_enum
;
456 if (!history_db
->InitURLEnumeratorForSignificant(&history_enum
))
458 rebuilt_data
->last_time_rebuilt_from_history_
= base::Time::Now();
459 for (history::URLRow row
; history_enum
.GetNextURL(&row
);) {
460 rebuilt_data
->IndexRow(
461 history_db
, NULL
, row
, languages
, scheme_whitelist
, NULL
);
464 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexingTime",
465 base::TimeTicks::Now() - beginning_time
);
466 UMA_HISTOGRAM_COUNTS("History.InMemoryURLHistoryItems",
467 rebuilt_data
->history_id_word_map_
.size());
468 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLWords",
469 rebuilt_data
->word_map_
.size());
470 UMA_HISTOGRAM_COUNTS_10000("History.InMemoryURLChars",
471 rebuilt_data
->char_word_map_
.size());
476 bool URLIndexPrivateData::WritePrivateDataToCacheFileTask(
477 scoped_refptr
<URLIndexPrivateData
> private_data
,
478 const base::FilePath
& file_path
) {
479 DCHECK(private_data
.get());
480 DCHECK(!file_path
.empty());
481 return private_data
->SaveToFile(file_path
);
484 scoped_refptr
<URLIndexPrivateData
> URLIndexPrivateData::Duplicate() const {
485 scoped_refptr
<URLIndexPrivateData
> data_copy
= new URLIndexPrivateData
;
486 data_copy
->last_time_rebuilt_from_history_
= last_time_rebuilt_from_history_
;
487 data_copy
->word_list_
= word_list_
;
488 data_copy
->available_words_
= available_words_
;
489 data_copy
->word_map_
= word_map_
;
490 data_copy
->char_word_map_
= char_word_map_
;
491 data_copy
->word_id_history_map_
= word_id_history_map_
;
492 data_copy
->history_id_word_map_
= history_id_word_map_
;
493 data_copy
->history_info_map_
= history_info_map_
;
494 data_copy
->word_starts_map_
= word_starts_map_
;
497 // search_term_cache_
498 // pre_filter_item_count_
499 // post_filter_item_count_
500 // post_scoring_item_count_
503 bool URLIndexPrivateData::Empty() const {
504 return history_info_map_
.empty();
507 void URLIndexPrivateData::Clear() {
508 last_time_rebuilt_from_history_
= base::Time();
510 available_words_
.clear();
512 char_word_map_
.clear();
513 word_id_history_map_
.clear();
514 history_id_word_map_
.clear();
515 history_info_map_
.clear();
516 word_starts_map_
.clear();
519 URLIndexPrivateData::~URLIndexPrivateData() {}
521 HistoryIDSet
URLIndexPrivateData::HistoryIDSetFromWords(
522 const String16Vector
& unsorted_words
) {
523 // Break the terms down into individual terms (words), get the candidate
524 // set for each term, and intersect each to get a final candidate list.
525 // Note that a single 'term' from the user's perspective might be
526 // a string like "http://www.somewebsite.com" which, from our perspective,
527 // is four words: 'http', 'www', 'somewebsite', and 'com'.
528 HistoryIDSet history_id_set
;
529 String16Vector
words(unsorted_words
);
530 // Sort the words into the longest first as such are likely to narrow down
531 // the results quicker. Also, single character words are the most expensive
532 // to process so save them for last.
533 std::sort(words
.begin(), words
.end(), LengthGreater
);
534 for (String16Vector::iterator iter
= words
.begin(); iter
!= words
.end();
536 base::string16 uni_word
= *iter
;
537 HistoryIDSet term_history_set
= HistoryIDsForTerm(uni_word
);
538 if (term_history_set
.empty()) {
539 history_id_set
.clear();
542 if (iter
== words
.begin()) {
543 history_id_set
.swap(term_history_set
);
545 HistoryIDSet new_history_id_set
= base::STLSetIntersection
<HistoryIDSet
>(
546 history_id_set
, term_history_set
);
547 history_id_set
.swap(new_history_id_set
);
550 return history_id_set
;
553 HistoryIDSet
URLIndexPrivateData::HistoryIDsForTerm(
554 const base::string16
& term
) {
556 return HistoryIDSet();
558 // TODO(mrossetti): Consider optimizing for very common terms such as
559 // 'http[s]', 'www', 'com', etc. Or collect the top 100 more frequently
560 // occuring words in the user's searches.
562 size_t term_length
= term
.length();
563 WordIDSet word_id_set
;
564 if (term_length
> 1) {
565 // See if this term or a prefix thereof is present in the cache.
566 base::string16 term_lower
= base::i18n::ToLower(term
);
567 SearchTermCacheMap::iterator
best_prefix(search_term_cache_
.end());
568 for (SearchTermCacheMap::iterator cache_iter
= search_term_cache_
.begin();
569 cache_iter
!= search_term_cache_
.end(); ++cache_iter
) {
570 if (base::StartsWith(term_lower
,
571 base::i18n::ToLower(cache_iter
->first
),
572 base::CompareCase::SENSITIVE
) &&
573 (best_prefix
== search_term_cache_
.end() ||
574 cache_iter
->first
.length() > best_prefix
->first
.length()))
575 best_prefix
= cache_iter
;
578 // If a prefix was found then determine the leftover characters to be used
579 // for further refining the results from that prefix.
580 Char16Set prefix_chars
;
581 base::string16
leftovers(term
);
582 if (best_prefix
!= search_term_cache_
.end()) {
583 // If the prefix is an exact match for the term then grab the cached
584 // results and we're done.
585 size_t prefix_length
= best_prefix
->first
.length();
586 if (prefix_length
== term_length
) {
587 best_prefix
->second
.used_
= true;
588 return best_prefix
->second
.history_id_set_
;
591 // Otherwise we have a handy starting point.
592 // If there are no history results for this prefix then we can bail early
593 // as there will be no history results for the full term.
594 if (best_prefix
->second
.history_id_set_
.empty()) {
595 search_term_cache_
[term
] = SearchTermCacheItem();
596 return HistoryIDSet();
598 word_id_set
= best_prefix
->second
.word_id_set_
;
599 prefix_chars
= Char16SetFromString16(best_prefix
->first
);
600 leftovers
= term
.substr(prefix_length
);
603 // Filter for each remaining, unique character in the term.
604 Char16Set leftover_chars
= Char16SetFromString16(leftovers
);
605 Char16Set unique_chars
=
606 base::STLSetDifference
<Char16Set
>(leftover_chars
, prefix_chars
);
608 // Reduce the word set with any leftover, unprocessed characters.
609 if (!unique_chars
.empty()) {
610 WordIDSet
leftover_set(WordIDSetForTermChars(unique_chars
));
611 // We might come up empty on the leftovers.
612 if (leftover_set
.empty()) {
613 search_term_cache_
[term
] = SearchTermCacheItem();
614 return HistoryIDSet();
616 // Or there may not have been a prefix from which to start.
617 if (prefix_chars
.empty()) {
618 word_id_set
.swap(leftover_set
);
620 WordIDSet new_word_id_set
= base::STLSetIntersection
<WordIDSet
>(
621 word_id_set
, leftover_set
);
622 word_id_set
.swap(new_word_id_set
);
626 // We must filter the word list because the resulting word set surely
627 // contains words which do not have the search term as a proper subset.
628 for (WordIDSet::iterator word_set_iter
= word_id_set
.begin();
629 word_set_iter
!= word_id_set
.end(); ) {
630 if (word_list_
[*word_set_iter
].find(term
) == base::string16::npos
)
631 word_id_set
.erase(word_set_iter
++);
636 word_id_set
= WordIDSetForTermChars(Char16SetFromString16(term
));
639 // If any words resulted then we can compose a set of history IDs by unioning
640 // the sets from each word.
641 HistoryIDSet history_id_set
;
642 if (!word_id_set
.empty()) {
643 for (WordIDSet::iterator word_id_iter
= word_id_set
.begin();
644 word_id_iter
!= word_id_set
.end(); ++word_id_iter
) {
645 WordID word_id
= *word_id_iter
;
646 WordIDHistoryMap::iterator word_iter
= word_id_history_map_
.find(word_id
);
647 if (word_iter
!= word_id_history_map_
.end()) {
648 HistoryIDSet
& word_history_id_set(word_iter
->second
);
649 history_id_set
.insert(word_history_id_set
.begin(),
650 word_history_id_set
.end());
655 // Record a new cache entry for this word if the term is longer than
656 // a single character.
658 search_term_cache_
[term
] = SearchTermCacheItem(word_id_set
, history_id_set
);
660 return history_id_set
;
663 WordIDSet
URLIndexPrivateData::WordIDSetForTermChars(
664 const Char16Set
& term_chars
) {
665 WordIDSet word_id_set
;
666 for (Char16Set::const_iterator c_iter
= term_chars
.begin();
667 c_iter
!= term_chars
.end(); ++c_iter
) {
668 CharWordIDMap::iterator char_iter
= char_word_map_
.find(*c_iter
);
669 if (char_iter
== char_word_map_
.end()) {
670 // A character was not found so there are no matching results: bail.
674 WordIDSet
& char_word_id_set(char_iter
->second
);
675 // It is possible for there to no longer be any words associated with
676 // a particular character. Give up in that case.
677 if (char_word_id_set
.empty()) {
682 if (c_iter
== term_chars
.begin()) {
683 // First character results becomes base set of results.
684 word_id_set
= char_word_id_set
;
686 // Subsequent character results get intersected in.
687 WordIDSet new_word_id_set
= base::STLSetIntersection
<WordIDSet
>(
688 word_id_set
, char_word_id_set
);
689 word_id_set
.swap(new_word_id_set
);
695 bool URLIndexPrivateData::IndexRow(
696 history::HistoryDatabase
* history_db
,
697 history::HistoryService
* history_service
,
698 const history::URLRow
& row
,
699 const std::string
& languages
,
700 const std::set
<std::string
>& scheme_whitelist
,
701 base::CancelableTaskTracker
* tracker
) {
702 const GURL
& gurl(row
.url());
704 // Index only URLs with a whitelisted scheme.
705 if (!URLSchemeIsWhitelisted(gurl
, scheme_whitelist
))
708 history::URLID row_id
= row
.id();
709 // Strip out username and password before saving and indexing.
710 base::string16
url(url_formatter::FormatUrl(
711 gurl
, languages
, url_formatter::kFormatUrlOmitUsernamePassword
,
712 net::UnescapeRule::NONE
, nullptr, nullptr, nullptr));
714 HistoryID history_id
= static_cast<HistoryID
>(row_id
);
715 DCHECK_LT(history_id
, std::numeric_limits
<HistoryID
>::max());
717 // Add the row for quick lookup in the history info store.
718 history::URLRow
new_row(GURL(url
), row_id
);
719 new_row
.set_visit_count(row
.visit_count());
720 new_row
.set_typed_count(row
.typed_count());
721 new_row
.set_last_visit(row
.last_visit());
722 new_row
.set_title(row
.title());
723 history_info_map_
[history_id
].url_row
= new_row
;
725 // Index the words contained in the URL and title of the row.
726 RowWordStarts word_starts
;
727 AddRowWordsToIndex(new_row
, &word_starts
, languages
);
728 word_starts_map_
[history_id
] = word_starts
;
730 // Update the recent visits information or schedule the update
733 // We'd like to check that we're on the history DB thread.
734 // However, unittest code actually calls this on the UI thread.
735 // So we don't do any thread checks.
736 history::VisitVector recent_visits
;
737 // Make sure the private data is going to get as many recent visits as
738 // ScoredHistoryMatch::GetFrequency() hopes to use.
739 DCHECK_GE(kMaxVisitsToStoreInCache
, ScoredHistoryMatch::kMaxVisitsToScore
);
740 if (history_db
->GetMostRecentVisitsForURL(row_id
,
741 kMaxVisitsToStoreInCache
,
743 UpdateRecentVisits(row_id
, recent_visits
);
746 DCHECK(history_service
);
747 ScheduleUpdateRecentVisits(history_service
, row_id
, tracker
);
753 void URLIndexPrivateData::AddRowWordsToIndex(const history::URLRow
& row
,
754 RowWordStarts
* word_starts
,
755 const std::string
& languages
) {
756 HistoryID history_id
= static_cast<HistoryID
>(row
.id());
757 // Split URL into individual, unique words then add in the title words.
758 const GURL
& gurl(row
.url());
759 const base::string16
& url
=
760 bookmarks::CleanUpUrlForMatching(gurl
, languages
, NULL
);
761 String16Set url_words
= String16SetFromString16(url
,
762 word_starts
? &word_starts
->url_word_starts_
: NULL
);
763 const base::string16
& title
= bookmarks::CleanUpTitleForMatching(row
.title());
764 String16Set title_words
= String16SetFromString16(title
,
765 word_starts
? &word_starts
->title_word_starts_
: NULL
);
766 String16Set words
= base::STLSetUnion
<String16Set
>(url_words
, title_words
);
767 for (String16Set::iterator word_iter
= words
.begin();
768 word_iter
!= words
.end(); ++word_iter
)
769 AddWordToIndex(*word_iter
, history_id
);
771 search_term_cache_
.clear(); // Invalidate the term cache.
774 void URLIndexPrivateData::AddWordToIndex(const base::string16
& term
,
775 HistoryID history_id
) {
776 WordMap::iterator word_pos
= word_map_
.find(term
);
777 if (word_pos
!= word_map_
.end())
778 UpdateWordHistory(word_pos
->second
, history_id
);
780 AddWordHistory(term
, history_id
);
783 void URLIndexPrivateData::AddWordHistory(const base::string16
& term
,
784 HistoryID history_id
) {
785 WordID word_id
= word_list_
.size();
786 if (available_words_
.empty()) {
787 word_list_
.push_back(term
);
789 word_id
= *(available_words_
.begin());
790 word_list_
[word_id
] = term
;
791 available_words_
.erase(word_id
);
793 word_map_
[term
] = word_id
;
795 HistoryIDSet history_id_set
;
796 history_id_set
.insert(history_id
);
797 word_id_history_map_
[word_id
] = history_id_set
;
798 AddToHistoryIDWordMap(history_id
, word_id
);
800 // For each character in the newly added word (i.e. a word that is not
801 // already in the word index), add the word to the character index.
802 Char16Set characters
= Char16SetFromString16(term
);
803 for (Char16Set::iterator uni_char_iter
= characters
.begin();
804 uni_char_iter
!= characters
.end(); ++uni_char_iter
) {
805 base::char16 uni_char
= *uni_char_iter
;
806 CharWordIDMap::iterator char_iter
= char_word_map_
.find(uni_char
);
807 if (char_iter
!= char_word_map_
.end()) {
808 // Update existing entry in the char/word index.
809 WordIDSet
& word_id_set(char_iter
->second
);
810 word_id_set
.insert(word_id
);
812 // Create a new entry in the char/word index.
813 WordIDSet word_id_set
;
814 word_id_set
.insert(word_id
);
815 char_word_map_
[uni_char
] = word_id_set
;
820 void URLIndexPrivateData::UpdateWordHistory(WordID word_id
,
821 HistoryID history_id
) {
822 WordIDHistoryMap::iterator history_pos
= word_id_history_map_
.find(word_id
);
823 DCHECK(history_pos
!= word_id_history_map_
.end());
824 HistoryIDSet
& history_id_set(history_pos
->second
);
825 history_id_set
.insert(history_id
);
826 AddToHistoryIDWordMap(history_id
, word_id
);
829 void URLIndexPrivateData::AddToHistoryIDWordMap(HistoryID history_id
,
831 HistoryIDWordMap::iterator iter
= history_id_word_map_
.find(history_id
);
832 if (iter
!= history_id_word_map_
.end()) {
833 WordIDSet
& word_id_set(iter
->second
);
834 word_id_set
.insert(word_id
);
836 WordIDSet word_id_set
;
837 word_id_set
.insert(word_id
);
838 history_id_word_map_
[history_id
] = word_id_set
;
842 void URLIndexPrivateData::RemoveRowFromIndex(const history::URLRow
& row
) {
843 RemoveRowWordsFromIndex(row
);
844 HistoryID history_id
= static_cast<HistoryID
>(row
.id());
845 history_info_map_
.erase(history_id
);
846 word_starts_map_
.erase(history_id
);
849 void URLIndexPrivateData::RemoveRowWordsFromIndex(const history::URLRow
& row
) {
850 // Remove the entries in history_id_word_map_ and word_id_history_map_ for
852 HistoryID history_id
= static_cast<HistoryID
>(row
.id());
853 WordIDSet word_id_set
= history_id_word_map_
[history_id
];
854 history_id_word_map_
.erase(history_id
);
856 // Reconcile any changes to word usage.
857 for (WordIDSet::iterator word_id_iter
= word_id_set
.begin();
858 word_id_iter
!= word_id_set
.end(); ++word_id_iter
) {
859 WordID word_id
= *word_id_iter
;
860 word_id_history_map_
[word_id
].erase(history_id
);
861 if (!word_id_history_map_
[word_id
].empty())
862 continue; // The word is still in use.
864 // The word is no longer in use. Reconcile any changes to character usage.
865 base::string16 word
= word_list_
[word_id
];
866 Char16Set characters
= Char16SetFromString16(word
);
867 for (Char16Set::iterator uni_char_iter
= characters
.begin();
868 uni_char_iter
!= characters
.end(); ++uni_char_iter
) {
869 base::char16 uni_char
= *uni_char_iter
;
870 char_word_map_
[uni_char
].erase(word_id
);
871 if (char_word_map_
[uni_char
].empty())
872 char_word_map_
.erase(uni_char
); // No longer in use.
875 // Complete the removal of references to the word.
876 word_id_history_map_
.erase(word_id
);
877 word_map_
.erase(word
);
878 word_list_
[word_id
] = base::string16();
879 available_words_
.insert(word_id
);
883 void URLIndexPrivateData::ResetSearchTermCache() {
884 for (SearchTermCacheMap::iterator iter
= search_term_cache_
.begin();
885 iter
!= search_term_cache_
.end(); ++iter
)
886 iter
->second
.used_
= false;
889 bool URLIndexPrivateData::SaveToFile(const base::FilePath
& file_path
) {
890 base::TimeTicks beginning_time
= base::TimeTicks::Now();
891 InMemoryURLIndexCacheItem index_cache
;
892 SavePrivateData(&index_cache
);
894 if (!index_cache
.SerializeToString(&data
)) {
895 LOG(WARNING
) << "Failed to serialize the InMemoryURLIndex cache.";
899 int size
= data
.size();
900 if (base::WriteFile(file_path
, data
.c_str(), size
) != size
) {
901 LOG(WARNING
) << "Failed to write " << file_path
.value();
904 UMA_HISTOGRAM_TIMES("History.InMemoryURLIndexSaveCacheTime",
905 base::TimeTicks::Now() - beginning_time
);
909 void URLIndexPrivateData::SavePrivateData(
910 InMemoryURLIndexCacheItem
* cache
) const {
912 cache
->set_last_rebuild_timestamp(
913 last_time_rebuilt_from_history_
.ToInternalValue());
914 cache
->set_version(saved_cache_version_
);
915 // history_item_count_ is no longer used but rather than change the protobuf
916 // definition use a placeholder. This will go away with the switch to SQLite.
917 cache
->set_history_item_count(0);
920 SaveCharWordMap(cache
);
921 SaveWordIDHistoryMap(cache
);
922 SaveHistoryInfoMap(cache
);
923 SaveWordStartsMap(cache
);
926 void URLIndexPrivateData::SaveWordList(InMemoryURLIndexCacheItem
* cache
) const {
927 if (word_list_
.empty())
929 WordListItem
* list_item
= cache
->mutable_word_list();
930 list_item
->set_word_count(word_list_
.size());
931 for (String16Vector::const_iterator iter
= word_list_
.begin();
932 iter
!= word_list_
.end(); ++iter
)
933 list_item
->add_word(base::UTF16ToUTF8(*iter
));
936 void URLIndexPrivateData::SaveWordMap(InMemoryURLIndexCacheItem
* cache
) const {
937 if (word_map_
.empty())
939 WordMapItem
* map_item
= cache
->mutable_word_map();
940 map_item
->set_item_count(word_map_
.size());
941 for (WordMap::const_iterator iter
= word_map_
.begin();
942 iter
!= word_map_
.end(); ++iter
) {
943 WordMapEntry
* map_entry
= map_item
->add_word_map_entry();
944 map_entry
->set_word(base::UTF16ToUTF8(iter
->first
));
945 map_entry
->set_word_id(iter
->second
);
949 void URLIndexPrivateData::SaveCharWordMap(
950 InMemoryURLIndexCacheItem
* cache
) const {
951 if (char_word_map_
.empty())
953 CharWordMapItem
* map_item
= cache
->mutable_char_word_map();
954 map_item
->set_item_count(char_word_map_
.size());
955 for (CharWordIDMap::const_iterator iter
= char_word_map_
.begin();
956 iter
!= char_word_map_
.end(); ++iter
) {
957 CharWordMapEntry
* map_entry
= map_item
->add_char_word_map_entry();
958 map_entry
->set_char_16(iter
->first
);
959 const WordIDSet
& word_id_set(iter
->second
);
960 map_entry
->set_item_count(word_id_set
.size());
961 for (WordIDSet::const_iterator set_iter
= word_id_set
.begin();
962 set_iter
!= word_id_set
.end(); ++set_iter
)
963 map_entry
->add_word_id(*set_iter
);
967 void URLIndexPrivateData::SaveWordIDHistoryMap(
968 InMemoryURLIndexCacheItem
* cache
) const {
969 if (word_id_history_map_
.empty())
971 WordIDHistoryMapItem
* map_item
= cache
->mutable_word_id_history_map();
972 map_item
->set_item_count(word_id_history_map_
.size());
973 for (WordIDHistoryMap::const_iterator iter
= word_id_history_map_
.begin();
974 iter
!= word_id_history_map_
.end(); ++iter
) {
975 WordIDHistoryMapEntry
* map_entry
=
976 map_item
->add_word_id_history_map_entry();
977 map_entry
->set_word_id(iter
->first
);
978 const HistoryIDSet
& history_id_set(iter
->second
);
979 map_entry
->set_item_count(history_id_set
.size());
980 for (HistoryIDSet::const_iterator set_iter
= history_id_set
.begin();
981 set_iter
!= history_id_set
.end(); ++set_iter
)
982 map_entry
->add_history_id(*set_iter
);
986 void URLIndexPrivateData::SaveHistoryInfoMap(
987 InMemoryURLIndexCacheItem
* cache
) const {
988 if (history_info_map_
.empty())
990 HistoryInfoMapItem
* map_item
= cache
->mutable_history_info_map();
991 map_item
->set_item_count(history_info_map_
.size());
992 for (HistoryInfoMap::const_iterator iter
= history_info_map_
.begin();
993 iter
!= history_info_map_
.end(); ++iter
) {
994 HistoryInfoMapEntry
* map_entry
= map_item
->add_history_info_map_entry();
995 map_entry
->set_history_id(iter
->first
);
996 const history::URLRow
& url_row(iter
->second
.url_row
);
997 // Note: We only save information that contributes to the index so there
998 // is no need to save search_term_cache_ (not persistent).
999 map_entry
->set_visit_count(url_row
.visit_count());
1000 map_entry
->set_typed_count(url_row
.typed_count());
1001 map_entry
->set_last_visit(url_row
.last_visit().ToInternalValue());
1002 map_entry
->set_url(url_row
.url().spec());
1003 map_entry
->set_title(base::UTF16ToUTF8(url_row
.title()));
1004 const VisitInfoVector
& visits(iter
->second
.visits
);
1005 for (VisitInfoVector::const_iterator visit_iter
= visits
.begin();
1006 visit_iter
!= visits
.end(); ++visit_iter
) {
1007 HistoryInfoMapEntry_VisitInfo
* visit_info
= map_entry
->add_visits();
1008 visit_info
->set_visit_time(visit_iter
->first
.ToInternalValue());
1009 visit_info
->set_transition_type(visit_iter
->second
);
1014 void URLIndexPrivateData::SaveWordStartsMap(
1015 InMemoryURLIndexCacheItem
* cache
) const {
1016 if (word_starts_map_
.empty())
1018 // For unit testing: Enable saving of the cache as an earlier version to
1019 // allow testing of cache file upgrading in ReadFromFile().
1020 // TODO(mrossetti): Instead of intruding on production code with this kind of
1021 // test harness, save a copy of an older version cache with known results.
1022 // Implement this when switching the caching over to SQLite.
1023 if (saved_cache_version_
< 1)
1026 WordStartsMapItem
* map_item
= cache
->mutable_word_starts_map();
1027 map_item
->set_item_count(word_starts_map_
.size());
1028 for (WordStartsMap::const_iterator iter
= word_starts_map_
.begin();
1029 iter
!= word_starts_map_
.end(); ++iter
) {
1030 WordStartsMapEntry
* map_entry
= map_item
->add_word_starts_map_entry();
1031 map_entry
->set_history_id(iter
->first
);
1032 const RowWordStarts
& word_starts(iter
->second
);
1033 for (WordStarts::const_iterator i
= word_starts
.url_word_starts_
.begin();
1034 i
!= word_starts
.url_word_starts_
.end(); ++i
)
1035 map_entry
->add_url_word_starts(*i
);
1036 for (WordStarts::const_iterator i
= word_starts
.title_word_starts_
.begin();
1037 i
!= word_starts
.title_word_starts_
.end(); ++i
)
1038 map_entry
->add_title_word_starts(*i
);
1042 bool URLIndexPrivateData::RestorePrivateData(
1043 const InMemoryURLIndexCacheItem
& cache
,
1044 const std::string
& languages
) {
1045 last_time_rebuilt_from_history_
=
1046 base::Time::FromInternalValue(cache
.last_rebuild_timestamp());
1047 const base::TimeDelta rebuilt_ago
=
1048 base::Time::Now() - last_time_rebuilt_from_history_
;
1049 if ((rebuilt_ago
> base::TimeDelta::FromDays(7)) ||
1050 (rebuilt_ago
< base::TimeDelta::FromDays(-1))) {
1051 // Cache is more than a week old or, somehow, from some time in the future.
1052 // It's probably a good time to rebuild the index from history to
1053 // allow synced entries to now appear, expired entries to disappear, etc.
1054 // Allow one day in the future to make the cache not rebuild on simple
1055 // system clock changes such as time zone changes.
1058 if (cache
.has_version()) {
1059 if (cache
.version() < kCurrentCacheFileVersion
) {
1060 // Don't try to restore an old format cache file. (This will cause
1061 // the InMemoryURLIndex to schedule rebuilding the URLIndexPrivateData
1065 restored_cache_version_
= cache
.version();
1067 return RestoreWordList(cache
) && RestoreWordMap(cache
) &&
1068 RestoreCharWordMap(cache
) && RestoreWordIDHistoryMap(cache
) &&
1069 RestoreHistoryInfoMap(cache
) && RestoreWordStartsMap(cache
, languages
);
1072 bool URLIndexPrivateData::RestoreWordList(
1073 const InMemoryURLIndexCacheItem
& cache
) {
1074 if (!cache
.has_word_list())
1076 const WordListItem
& list_item(cache
.word_list());
1077 uint32 expected_item_count
= list_item
.word_count();
1078 uint32 actual_item_count
= list_item
.word_size();
1079 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1081 const RepeatedPtrField
<std::string
>& words(list_item
.word());
1082 for (RepeatedPtrField
<std::string
>::const_iterator iter
= words
.begin();
1083 iter
!= words
.end(); ++iter
)
1084 word_list_
.push_back(base::UTF8ToUTF16(*iter
));
1088 bool URLIndexPrivateData::RestoreWordMap(
1089 const InMemoryURLIndexCacheItem
& cache
) {
1090 if (!cache
.has_word_map())
1092 const WordMapItem
& list_item(cache
.word_map());
1093 uint32 expected_item_count
= list_item
.item_count();
1094 uint32 actual_item_count
= list_item
.word_map_entry_size();
1095 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1097 const RepeatedPtrField
<WordMapEntry
>& entries(list_item
.word_map_entry());
1098 for (RepeatedPtrField
<WordMapEntry
>::const_iterator iter
= entries
.begin();
1099 iter
!= entries
.end(); ++iter
)
1100 word_map_
[base::UTF8ToUTF16(iter
->word())] = iter
->word_id();
1104 bool URLIndexPrivateData::RestoreCharWordMap(
1105 const InMemoryURLIndexCacheItem
& cache
) {
1106 if (!cache
.has_char_word_map())
1108 const CharWordMapItem
& list_item(cache
.char_word_map());
1109 uint32 expected_item_count
= list_item
.item_count();
1110 uint32 actual_item_count
= list_item
.char_word_map_entry_size();
1111 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1113 const RepeatedPtrField
<CharWordMapEntry
>&
1114 entries(list_item
.char_word_map_entry());
1115 for (RepeatedPtrField
<CharWordMapEntry
>::const_iterator iter
=
1116 entries
.begin(); iter
!= entries
.end(); ++iter
) {
1117 expected_item_count
= iter
->item_count();
1118 actual_item_count
= iter
->word_id_size();
1119 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1121 base::char16 uni_char
= static_cast<base::char16
>(iter
->char_16());
1122 WordIDSet word_id_set
;
1123 const RepeatedField
<int32
>& word_ids(iter
->word_id());
1124 for (RepeatedField
<int32
>::const_iterator jiter
= word_ids
.begin();
1125 jiter
!= word_ids
.end(); ++jiter
)
1126 word_id_set
.insert(*jiter
);
1127 char_word_map_
[uni_char
] = word_id_set
;
1132 bool URLIndexPrivateData::RestoreWordIDHistoryMap(
1133 const InMemoryURLIndexCacheItem
& cache
) {
1134 if (!cache
.has_word_id_history_map())
1136 const WordIDHistoryMapItem
& list_item(cache
.word_id_history_map());
1137 uint32 expected_item_count
= list_item
.item_count();
1138 uint32 actual_item_count
= list_item
.word_id_history_map_entry_size();
1139 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1141 const RepeatedPtrField
<WordIDHistoryMapEntry
>&
1142 entries(list_item
.word_id_history_map_entry());
1143 for (RepeatedPtrField
<WordIDHistoryMapEntry
>::const_iterator iter
=
1144 entries
.begin(); iter
!= entries
.end(); ++iter
) {
1145 expected_item_count
= iter
->item_count();
1146 actual_item_count
= iter
->history_id_size();
1147 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1149 WordID word_id
= iter
->word_id();
1150 HistoryIDSet history_id_set
;
1151 const RepeatedField
<int64
>& history_ids(iter
->history_id());
1152 for (RepeatedField
<int64
>::const_iterator jiter
= history_ids
.begin();
1153 jiter
!= history_ids
.end(); ++jiter
) {
1154 history_id_set
.insert(*jiter
);
1155 AddToHistoryIDWordMap(*jiter
, word_id
);
1157 word_id_history_map_
[word_id
] = history_id_set
;
1162 bool URLIndexPrivateData::RestoreHistoryInfoMap(
1163 const InMemoryURLIndexCacheItem
& cache
) {
1164 if (!cache
.has_history_info_map())
1166 const HistoryInfoMapItem
& list_item(cache
.history_info_map());
1167 uint32 expected_item_count
= list_item
.item_count();
1168 uint32 actual_item_count
= list_item
.history_info_map_entry_size();
1169 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1171 const RepeatedPtrField
<HistoryInfoMapEntry
>&
1172 entries(list_item
.history_info_map_entry());
1173 for (RepeatedPtrField
<HistoryInfoMapEntry
>::const_iterator iter
=
1174 entries
.begin(); iter
!= entries
.end(); ++iter
) {
1175 HistoryID history_id
= iter
->history_id();
1176 GURL
url(iter
->url());
1177 history::URLRow
url_row(url
, history_id
);
1178 url_row
.set_visit_count(iter
->visit_count());
1179 url_row
.set_typed_count(iter
->typed_count());
1180 url_row
.set_last_visit(base::Time::FromInternalValue(iter
->last_visit()));
1181 if (iter
->has_title()) {
1182 base::string16
title(base::UTF8ToUTF16(iter
->title()));
1183 url_row
.set_title(title
);
1185 history_info_map_
[history_id
].url_row
= url_row
;
1187 // Restore visits list.
1188 VisitInfoVector visits
;
1189 visits
.reserve(iter
->visits_size());
1190 for (int i
= 0; i
< iter
->visits_size(); ++i
) {
1191 visits
.push_back(std::make_pair(
1192 base::Time::FromInternalValue(iter
->visits(i
).visit_time()),
1193 ui::PageTransitionFromInt(iter
->visits(i
).transition_type())));
1195 history_info_map_
[history_id
].visits
= visits
;
1200 bool URLIndexPrivateData::RestoreWordStartsMap(
1201 const InMemoryURLIndexCacheItem
& cache
,
1202 const std::string
& languages
) {
1203 // Note that this function must be called after RestoreHistoryInfoMap() has
1204 // been run as the word starts may have to be recalculated from the urls and
1206 if (cache
.has_word_starts_map()) {
1207 const WordStartsMapItem
& list_item(cache
.word_starts_map());
1208 uint32 expected_item_count
= list_item
.item_count();
1209 uint32 actual_item_count
= list_item
.word_starts_map_entry_size();
1210 if (actual_item_count
== 0 || actual_item_count
!= expected_item_count
)
1212 const RepeatedPtrField
<WordStartsMapEntry
>&
1213 entries(list_item
.word_starts_map_entry());
1214 for (RepeatedPtrField
<WordStartsMapEntry
>::const_iterator iter
=
1215 entries
.begin(); iter
!= entries
.end(); ++iter
) {
1216 HistoryID history_id
= iter
->history_id();
1217 RowWordStarts word_starts
;
1218 // Restore the URL word starts.
1219 const RepeatedField
<int32
>& url_starts(iter
->url_word_starts());
1220 for (RepeatedField
<int32
>::const_iterator jiter
= url_starts
.begin();
1221 jiter
!= url_starts
.end(); ++jiter
)
1222 word_starts
.url_word_starts_
.push_back(*jiter
);
1223 // Restore the page title word starts.
1224 const RepeatedField
<int32
>& title_starts(iter
->title_word_starts());
1225 for (RepeatedField
<int32
>::const_iterator jiter
= title_starts
.begin();
1226 jiter
!= title_starts
.end(); ++jiter
)
1227 word_starts
.title_word_starts_
.push_back(*jiter
);
1228 word_starts_map_
[history_id
] = word_starts
;
1231 // Since the cache did not contain any word starts we must rebuild then from
1232 // the URL and page titles.
1233 for (HistoryInfoMap::const_iterator iter
= history_info_map_
.begin();
1234 iter
!= history_info_map_
.end(); ++iter
) {
1235 RowWordStarts word_starts
;
1236 const history::URLRow
& row(iter
->second
.url_row
);
1237 const base::string16
& url
=
1238 bookmarks::CleanUpUrlForMatching(row
.url(), languages
, NULL
);
1239 String16VectorFromString16(url
, false, &word_starts
.url_word_starts_
);
1240 const base::string16
& title
=
1241 bookmarks::CleanUpTitleForMatching(row
.title());
1242 String16VectorFromString16(title
, false, &word_starts
.title_word_starts_
);
1243 word_starts_map_
[iter
->first
] = word_starts
;
1250 bool URLIndexPrivateData::URLSchemeIsWhitelisted(
1252 const std::set
<std::string
>& whitelist
) {
1253 return whitelist
.find(gurl
.scheme()) != whitelist
.end();
1257 // SearchTermCacheItem ---------------------------------------------------------
1259 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem(
1260 const WordIDSet
& word_id_set
,
1261 const HistoryIDSet
& history_id_set
)
1262 : word_id_set_(word_id_set
), history_id_set_(history_id_set
), used_(true) {
1265 URLIndexPrivateData::SearchTermCacheItem::SearchTermCacheItem() : used_(true) {
1268 URLIndexPrivateData::SearchTermCacheItem::~SearchTermCacheItem() {
1271 // URLIndexPrivateData::AddHistoryMatch ----------------------------------------
1273 URLIndexPrivateData::AddHistoryMatch::AddHistoryMatch(
1274 bookmarks::BookmarkModel
* bookmark_model
,
1275 const URLIndexPrivateData
& private_data
,
1276 const std::string
& languages
,
1277 const base::string16
& lower_string
,
1278 const String16Vector
& lower_terms
,
1279 const base::Time now
)
1280 : bookmark_model_(bookmark_model
),
1281 private_data_(private_data
),
1282 languages_(languages
),
1283 lower_string_(lower_string
),
1284 lower_terms_(lower_terms
),
1286 // Calculate offsets for each term. For instance, the offset for
1287 // ".net" should be 1, indicating that the actual word-part of the term
1288 // starts at offset 1.
1289 lower_terms_to_word_starts_offsets_
.resize(lower_terms_
.size(), 0u);
1290 for (size_t i
= 0; i
< lower_terms_
.size(); ++i
) {
1291 base::i18n::BreakIterator
iter(lower_terms_
[i
],
1292 base::i18n::BreakIterator::BREAK_WORD
);
1293 // If the iterator doesn't work, assume an offset of 0.
1296 // Find the first word start. If the iterator didn't find a word break, set
1297 // an offset of term size. For example, the offset for "://" should be 3,
1298 // indicating that the word-part is missing.
1299 while (iter
.Advance() && !iter
.IsWord()) {}
1301 lower_terms_to_word_starts_offsets_
[i
] = iter
.prev();
1305 URLIndexPrivateData::AddHistoryMatch::~AddHistoryMatch() {
1308 void URLIndexPrivateData::AddHistoryMatch::operator()(
1309 const HistoryID history_id
) {
1310 HistoryInfoMap::const_iterator hist_pos
=
1311 private_data_
.history_info_map_
.find(history_id
);
1312 if (hist_pos
!= private_data_
.history_info_map_
.end()) {
1313 const history::URLRow
& hist_item
= hist_pos
->second
.url_row
;
1314 const VisitInfoVector
& visits
= hist_pos
->second
.visits
;
1315 WordStartsMap::const_iterator starts_pos
=
1316 private_data_
.word_starts_map_
.find(history_id
);
1317 DCHECK(starts_pos
!= private_data_
.word_starts_map_
.end());
1318 ScoredHistoryMatch
match(
1319 hist_item
, visits
, languages_
, lower_string_
, lower_terms_
,
1320 lower_terms_to_word_starts_offsets_
, starts_pos
->second
,
1321 bookmark_model_
&& bookmark_model_
->IsBookmarked(hist_item
.url()),
1323 if (match
.raw_score
> 0)
1324 scored_matches_
.push_back(match
);
1329 // URLIndexPrivateData::HistoryItemFactorGreater -------------------------------
1331 URLIndexPrivateData::HistoryItemFactorGreater::HistoryItemFactorGreater(
1332 const HistoryInfoMap
& history_info_map
)
1333 : history_info_map_(history_info_map
) {
1336 URLIndexPrivateData::HistoryItemFactorGreater::~HistoryItemFactorGreater() {
1339 bool URLIndexPrivateData::HistoryItemFactorGreater::operator()(
1341 const HistoryID h2
) {
1342 HistoryInfoMap::const_iterator
entry1(history_info_map_
.find(h1
));
1343 if (entry1
== history_info_map_
.end())
1345 HistoryInfoMap::const_iterator
entry2(history_info_map_
.find(h2
));
1346 if (entry2
== history_info_map_
.end())
1348 const history::URLRow
& r1(entry1
->second
.url_row
);
1349 const history::URLRow
& r2(entry2
->second
.url_row
);
1350 // First cut: typed count, visit count, recency.
1351 // TODO(mrossetti): This is too simplistic. Consider an approach which ranks
1352 // recently visited (within the last 12/24 hours) as highly important. Get
1353 // input from mpearson.
1354 if (r1
.typed_count() != r2
.typed_count())
1355 return (r1
.typed_count() > r2
.typed_count());
1356 if (r1
.visit_count() != r2
.visit_count())
1357 return (r1
.visit_count() > r2
.visit_count());
1358 return (r1
.last_visit() > r2
.last_visit());