// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

#include <list>
#include <map>

#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/i18n/break_iterator.h"
#include "base/i18n/case_conversion.h"
#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/message_loop/message_loop.h"
#include "base/metrics/histogram.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "crypto/sha2.h"

namespace safe_browsing {

// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// Experimenting shows that we get a reasonable gain in performance by
// increasing this up to around 10, but there's not much benefit in
// increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// This should be longer than we expect feature extraction to take on any
// actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_term_ previous words separated by spaces.
  std::string previous_words;

  // Stores the current shingle after a new word is processed and added in.
  std::string current_shingle;

  // Stores the sizes of the words in current_shingle. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of current_shingle.
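  // For example, after the words "dear", "valued", and "customer" have been
  // processed with a shingle size of 3, current_shingle is
  // "dear valued customer " and shingle_word_sizes is {5, 7, 9}.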
  std::list<size_t> shingle_word_sizes;

  // Stores the sizes of the words in previous_words. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An iterator for word breaking.
  scoped_ptr<base::i18n::BreakIterator> iterator;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
      : start_time(start_time_ticks),
        num_iterations(0) {
    scoped_ptr<base::i18n::BreakIterator> i(
        new base::i18n::BreakIterator(
            text, base::i18n::BreakIterator::BREAK_WORD));

    if (i->Init()) {
      iterator = i.Pass();
    } else {
      DLOG(ERROR) << "failed to open iterator";
    }
  }
};

PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32>* page_word_hashes,
    size_t max_words_per_term,
    uint32 murmurhash3_seed,
    size_t max_shingles_per_page,
    size_t shingle_size,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      max_shingles_per_page_(max_shingles_per_page),
      shingle_size_(shingle_size),
      clock_(clock),
      weak_factory_(this) {
  Clear();
}

PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}

void PhishingTermFeatureExtractor::ExtractFeatures(
    const base::string16* page_text,
    FeatureMap* features,
    std::set<uint32>* shingle_hashes,
    const DoneCallback& done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  shingle_hashes_ = shingle_hashes;
  done_callback_ = done_callback;

  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  base::MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}

void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}
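
// Extraction runs on the renderer thread, so it is done in small chunks:
// after every kClockCheckGranularity words the clock is checked, and once a
// chunk has used up kMaxTimePerChunkMs the remaining work is posted as a new
// task. If the whole extraction exceeds kMaxTotalTimeMs it is abandoned.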
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator.get()) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  int num_words = 0;
  while (state_->iterator->Advance()) {
    if (state_->iterator->IsWord()) {
      const size_t start = state_->iterator->prev();
      const size_t length = state_->iterator->pos() - start;
      HandleWord(base::StringPiece16(page_text_->data() + start, length));
      ++num_words;
    }

    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk. If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::MessageLoop::current()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  RunCallback(true);
}

void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // First, extract shingle hashes.
  const std::string& word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
  state_->current_shingle.append(word_lower + " ");
  state_->shingle_word_sizes.push_back(word_lower.size() + 1);
  if (state_->shingle_word_sizes.size() == shingle_size_) {
    shingle_hashes_->insert(
        MurmurHash3String(state_->current_shingle, murmurhash3_seed_));
    state_->current_shingle.erase(0, state_->shingle_word_sizes.front());
    state_->shingle_word_sizes.pop_front();
  }
  // Check if the size of shingle hashes is over the limit.
  if (shingle_hashes_->size() > max_shingles_per_page_) {
    // Pop the largest one.
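    // Since shingle_hashes_ is a sorted std::set, erasing the last element
    // keeps the numerically smallest max_shingles_per_page_ hashes.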
    std::set<uint32>::iterator it = shingle_hashes_->end();
    shingle_hashes_->erase(--it);
  }

  // Next, extract page terms.
  uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.
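  // Term matching is done on SHA-256 hashes of whole n-grams; the quick out
  // above used the much cheaper MurmurHash3 of the single word.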
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes. Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words. That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words.
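  // previous_words only needs to hold up to max_words_per_term_ - 1 words,
  // since the next word will be combined with them to form terms of at most
  // max_words_per_term_ words; hence the >= below.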
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}

void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
  DCHECK(done_callback_.is_null());
  DCHECK(!state_.get());
  if (!done_callback_.is_null() || state_.get()) {
    LOG(ERROR) << "Extraction in progress, missing call to "
               << "CancelPendingExtraction";
  }
}

void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance. These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
                       state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(!done_callback_.is_null());
  done_callback_.Run(success);
  Clear();
}

void PhishingTermFeatureExtractor::Clear() {
  page_text_ = NULL;
  features_ = NULL;
  shingle_hashes_ = NULL;
  done_callback_.Reset();
  state_.reset(NULL);
}

}  // namespace safe_browsing