// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

#include <list>
#include <map>
#include <set>
#include <string>

#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/i18n/break_iterator.h"
#include "base/i18n/case_conversion.h"
#include "base/location.h"
#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/metrics/histogram.h"
#include "base/single_thread_task_runner.h"
#include "base/strings/utf_string_conversions.h"
#include "base/thread_task_runner_handle.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "crypto/sha2.h"

namespace safe_browsing {

// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// This should be longer than we expect feature extraction to take on any
// actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;
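
// Taken together: the extractor checks the clock only after every
// kClockCheckGranularity words, yields back to the message loop once a chunk
// has used up kMaxTimePerChunkMs, and abandons extraction entirely once
// kMaxTotalTimeMs has elapsed for the page.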

// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_term_ previous words separated by spaces.
  std::string previous_words;

  // Stores the current shingle after a new word is processed and added in.
  std::string current_shingle;

  // Stores the sizes of the words in current_shingle. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of current_shingle.
  std::list<size_t> shingle_word_sizes;

  // Stores the sizes of the words in previous_words. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An iterator for word breaking.
  scoped_ptr<base::i18n::BreakIterator> iterator;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
      : start_time(start_time_ticks),
        num_iterations(0) {
    scoped_ptr<base::i18n::BreakIterator> i(
        new base::i18n::BreakIterator(
            text, base::i18n::BreakIterator::BREAK_WORD));
    if (i->Init()) {
      iterator.swap(i);
    } else {
      DLOG(ERROR) << "failed to open iterator";
    }
  }
};

PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32>* page_word_hashes,
    size_t max_words_per_term,
    uint32 murmurhash3_seed,
    size_t max_shingles_per_page,
    size_t shingle_size,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      max_shingles_per_page_(max_shingles_per_page),
      shingle_size_(shingle_size),
      clock_(clock),
      weak_factory_(this) {
  Clear();
}

PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}

void PhishingTermFeatureExtractor::ExtractFeatures(
    const base::string16* page_text,
    FeatureMap* features,
    std::set<uint32>* shingle_hashes,
    const DoneCallback& done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  shingle_hashes_ = shingle_hashes;
  done_callback_ = done_callback;

  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  base::ThreadTaskRunnerHandle::Get()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}
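
// Note that extraction is asynchronous: the posted task does the actual work
// in time-sliced chunks on this thread, and |done_callback| is run from
// RunCallback() once extraction succeeds, times out, or fails.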

void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}

void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator.get()) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  int num_words = 0;
  while (state_->iterator->Advance()) {
    if (state_->iterator->IsWord()) {
      const size_t start = state_->iterator->prev();
      const size_t length = state_->iterator->pos() - start;
      HandleWord(base::StringPiece16(page_text_->data() + start, length));
      ++num_words;
    }

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk. If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::ThreadTaskRunnerHandle::Get()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  RunCallback(true);
}
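
// Because the BreakIterator is owned by |state_| and is not reset between
// chunks, each run of ExtractFeaturesWithTimeout() resumes the word loop
// where the previous chunk left off rather than rescanning the page.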

void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // First, extract shingle hashes.
  const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
  state_->current_shingle.append(word_lower + " ");
  state_->shingle_word_sizes.push_back(word_lower.size() + 1);
  if (state_->shingle_word_sizes.size() == shingle_size_) {
    shingle_hashes_->insert(
        MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
    state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
    state_->shingle_word_sizes.pop_front();
  }
  // Check if the size of shingle hashes is over the limit.
  if (shingle_hashes_->size() > max_shingles_per_page_) {
    // Pop the largest one.
    std::set<uint32>::iterator it = shingle_hashes_->end();
    shingle_hashes_->erase(--it);
  }
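
  // For example (hypothetical input, shingle_size_ == 3): the words
  // "secure login page update" produce the shingles "secure login page " and
  // "login page update ", each hashed with MurmurHash3 before insertion.
  // Erasing via --end() above keeps only the numerically smallest hashes once
  // the limit is hit.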

  // Next, extract page terms.
  uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }
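
  // For example (hypothetical input, with previous_words == "good morning "):
  // processing the word "everyone" ends up checking the SHA-256 hashes of
  // "everyone", "good morning everyone", and "morning everyone" below.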

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes. Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words. That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);
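
  // Because the cap below uses ">=", at most max_words_per_term_ - 1 previous
  // words are retained here, so terms formed with the next word contain at
  // most max_words_per_term_ words.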
  // Cap the number of previous words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}

void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
  DCHECK(done_callback_.is_null());
  DCHECK(!state_.get());
  if (!done_callback_.is_null() || state_.get()) {
    LOG(ERROR) << "Extraction in progress, missing call to "
               << "CancelPendingExtraction";
  }
}

void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance. These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
                       state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(!done_callback_.is_null());
  done_callback_.Run(success);
  Clear();
}

void PhishingTermFeatureExtractor::Clear() {
  page_text_ = NULL;
  features_ = NULL;
  shingle_hashes_ = NULL;
  done_callback_.Reset();
  state_.reset(NULL);
}

}  // namespace safe_browsing