1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
10 #include "base/bind.h"
11 #include "base/compiler_specific.h"
12 #include "base/i18n/break_iterator.h"
13 #include "base/i18n/case_conversion.h"
14 #include "base/logging.h"
15 #include "base/memory/scoped_ptr.h"
16 #include "base/message_loop/message_loop.h"
17 #include "base/metrics/histogram.h"
18 #include "base/strings/utf_string_conversions.h"
19 #include "base/time/time.h"
20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
21 #include "chrome/renderer/safe_browsing/features.h"
22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
23 #include "crypto/sha2.h"
24 #include "ui/base/l10n/l10n_util.h"
26 namespace safe_browsing
{
28 // This time should be short enough that it doesn't noticeably disrupt the
29 // user's interaction with the page.
30 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs
= 10;
32 // Experimenting shows that we get a reasonable gain in performance by
33 // increasing this up to around 10, but there's not much benefit in
34 // increasing it past that.
35 const int PhishingTermFeatureExtractor::kClockCheckGranularity
= 5;
37 // This should be longer than we expect feature extraction to take on any
38 // actual phishing page.
39 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs
= 500;
41 // All of the state pertaining to the current feature extraction.
42 struct PhishingTermFeatureExtractor::ExtractionState
{
43 // Stores up to max_words_per_term_ previous words separated by spaces.
44 std::string previous_words
;
46 // Stores the current shingle after a new word is processed and added in.
47 std::string current_shingle
;
49 // Stores the sizes of the words in current_shingle. Note: the size includes
50 // the space after each word. In other words, the sum of all sizes in this
51 // list is equal to the length of current_shingle.
52 std::list
<size_t> shingle_word_sizes
;
54 // Stores the sizes of the words in previous_words. Note: the size includes
55 // the space after each word. In other words, the sum of all sizes in this
56 // list is equal to the length of previous_words.
57 std::list
<size_t> previous_word_sizes
;
59 // An iterator for word breaking.
60 scoped_ptr
<base::i18n::BreakIterator
> iterator
;
62 // The time at which we started feature extraction for the current page.
63 base::TimeTicks start_time
;
65 // The number of iterations we've done for the current extraction.
68 ExtractionState(const base::string16
& text
, base::TimeTicks start_time_ticks
)
69 : start_time(start_time_ticks
),
72 scoped_ptr
<base::i18n::BreakIterator
> i(
73 new base::i18n::BreakIterator(
74 text
, base::i18n::BreakIterator::BREAK_WORD
));
79 DLOG(ERROR
) << "failed to open iterator";
84 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
85 const base::hash_set
<std::string
>* page_term_hashes
,
86 const base::hash_set
<uint32
>* page_word_hashes
,
87 size_t max_words_per_term
,
88 uint32 murmurhash3_seed
,
89 size_t max_shingles_per_page
,
91 FeatureExtractorClock
* clock
)
92 : page_term_hashes_(page_term_hashes
),
93 page_word_hashes_(page_word_hashes
),
94 max_words_per_term_(max_words_per_term
),
95 murmurhash3_seed_(murmurhash3_seed
),
96 max_shingles_per_page_(max_shingles_per_page
),
97 shingle_size_(shingle_size
),
103 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
104 // The RenderView should have called CancelPendingExtraction() before
106 CheckNoPendingExtraction();
109 void PhishingTermFeatureExtractor::ExtractFeatures(
110 const base::string16
* page_text
,
111 FeatureMap
* features
,
112 std::set
<uint32
>* shingle_hashes
,
113 const DoneCallback
& done_callback
) {
114 // The RenderView should have called CancelPendingExtraction() before
115 // starting a new extraction, so DCHECK this.
116 CheckNoPendingExtraction();
117 // However, in an opt build, we will go ahead and clean up the pending
118 // extraction so that we can start in a known state.
119 CancelPendingExtraction();
121 page_text_
= page_text
;
122 features_
= features
;
123 shingle_hashes_
= shingle_hashes
,
124 done_callback_
= done_callback
;
126 state_
.reset(new ExtractionState(*page_text_
, clock_
->Now()));
127 base::MessageLoop::current()->PostTask(
129 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout
,
130 weak_factory_
.GetWeakPtr()));
133 void PhishingTermFeatureExtractor::CancelPendingExtraction() {
134 // Cancel any pending callbacks, and clear our state.
135 weak_factory_
.InvalidateWeakPtrs();
139 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
140 DCHECK(state_
.get());
141 ++state_
->num_iterations
;
142 base::TimeTicks current_chunk_start_time
= clock_
->Now();
144 if (!state_
->iterator
.get()) {
145 // We failed to initialize the break iterator, so stop now.
146 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
152 while (state_
->iterator
->Advance()) {
153 if (state_
->iterator
->IsWord()) {
154 const size_t start
= state_
->iterator
->prev();
155 const size_t length
= state_
->iterator
->pos() - start
;
156 HandleWord(base::StringPiece16(page_text_
->data() + start
, length
));
160 if (num_words
>= kClockCheckGranularity
) {
162 base::TimeTicks now
= clock_
->Now();
163 if (now
- state_
->start_time
>=
164 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs
)) {
165 DLOG(ERROR
) << "Feature extraction took too long, giving up";
166 // We expect this to happen infrequently, so record when it does.
167 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
171 base::TimeDelta chunk_elapsed
= now
- current_chunk_start_time
;
173 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs
)) {
174 // The time limit for the current chunk is up, so post a task to
175 // continue extraction.
177 // Record how much time we actually spent on the chunk. If this is
178 // much higher than kMaxTimePerChunkMs, we may need to adjust the
179 // clock granularity.
180 UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
182 base::MessageLoop::current()->PostTask(
185 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout
,
186 weak_factory_
.GetWeakPtr()));
189 // Otherwise, continue.
195 void PhishingTermFeatureExtractor::HandleWord(
196 const base::StringPiece16
& word
) {
197 // First, extract shingle hashes.
198 const std::string
& word_lower
= base::UTF16ToUTF8(base::i18n::ToLower(word
));
199 state_
->current_shingle
.append(word_lower
+ " ");
200 state_
->shingle_word_sizes
.push_back(word_lower
.size() + 1);
201 if (state_
->shingle_word_sizes
.size() == shingle_size_
) {
202 shingle_hashes_
->insert(
203 MurmurHash3String(state_
->current_shingle
, murmurhash3_seed_
));
204 state_
->current_shingle
.erase(0, state_
->shingle_word_sizes
.front());
205 state_
->shingle_word_sizes
.pop_front();
207 // Check if the size of shingle hashes is over the limit.
208 if (shingle_hashes_
->size() > max_shingles_per_page_
) {
209 // Pop the largest one.
210 std::set
<uint32
>::iterator it
= shingle_hashes_
->end();
211 shingle_hashes_
->erase(--it
);
214 // Next, extract page terms.
215 uint32 word_hash
= MurmurHash3String(word_lower
, murmurhash3_seed_
);
217 // Quick out if the word is not part of any term, which is the common case.
218 if (page_word_hashes_
->find(word_hash
) == page_word_hashes_
->end()) {
219 // Word doesn't exist in our terms so we can clear the n-gram state.
220 state_
->previous_words
.clear();
221 state_
->previous_word_sizes
.clear();
225 // Find all of the n-grams that we need to check and compute their SHA-256
227 std::map
<std::string
/* hash */, std::string
/* plaintext */>
229 hashes_to_check
[crypto::SHA256HashString(word_lower
)] = word_lower
;
231 // Combine the new word with the previous words to find additional n-grams.
232 // Note that we don't yet add the new word length to previous_word_sizes,
233 // since we don't want to compute the hash for the word by itself again.
235 state_
->previous_words
.append(word_lower
);
236 std::string current_term
= state_
->previous_words
;
237 for (std::list
<size_t>::iterator it
= state_
->previous_word_sizes
.begin();
238 it
!= state_
->previous_word_sizes
.end(); ++it
) {
239 hashes_to_check
[crypto::SHA256HashString(current_term
)] = current_term
;
240 current_term
.erase(0, *it
);
243 // Add features for any hashes that match page_term_hashes_.
244 for (std::map
<std::string
, std::string
>::iterator it
=
245 hashes_to_check
.begin();
246 it
!= hashes_to_check
.end(); ++it
) {
247 if (page_term_hashes_
->find(it
->first
) != page_term_hashes_
->end()) {
248 features_
->AddBooleanFeature(features::kPageTerm
+ it
->second
);
252 // Now that we have handled the current word, we have to add a space at the
253 // end of it, and add the new word's size (including the space) to
254 // previous_word_sizes. Note: it's possible that the document language
255 // doesn't use ASCII spaces to separate words. That's fine though, we just
256 // need to be consistent with how the model is generated.
257 state_
->previous_words
.append(" ");
258 state_
->previous_word_sizes
.push_back(word_lower
.size() + 1);
260 // Cap the number of previous words.
261 if (state_
->previous_word_sizes
.size() >= max_words_per_term_
) {
262 state_
->previous_words
.erase(0, state_
->previous_word_sizes
.front());
263 state_
->previous_word_sizes
.pop_front();
267 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
268 DCHECK(done_callback_
.is_null());
269 DCHECK(!state_
.get());
270 if (!done_callback_
.is_null() || state_
.get()) {
271 LOG(ERROR
) << "Extraction in progress, missing call to "
272 << "CancelPendingExtraction";
276 void PhishingTermFeatureExtractor::RunCallback(bool success
) {
277 // Record some timing stats that we can use to evaluate feature extraction
278 // performance. These include both successful and failed extractions.
279 DCHECK(state_
.get());
280 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
281 state_
->num_iterations
);
282 UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
283 clock_
->Now() - state_
->start_time
);
285 DCHECK(!done_callback_
.is_null());
286 done_callback_
.Run(success
);
290 void PhishingTermFeatureExtractor::Clear() {
293 shingle_hashes_
= NULL
;
294 done_callback_
.Reset();
298 } // namespace safe_browsing