1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"

#include <list>
#include <map>
#include <string>

#include "base/bind.h"
#include "base/compiler_specific.h"
#include "base/i18n/case_conversion.h"
#include "base/location.h"
#include "base/logging.h"
#include "base/message_loop/message_loop.h"
#include "base/metrics/histogram.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "crypto/sha2.h"
#include "third_party/icu/source/common/unicode/ubrk.h"
#include "ui/base/l10n/l10n_util.h"

25 namespace safe_browsing
{
27 // This time should be short enough that it doesn't noticeably disrupt the
28 // user's interaction with the page.
29 const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs
= 10;
31 // Experimenting shows that we get a reasonable gain in performance by
32 // increasing this up to around 10, but there's not much benefit in
33 // increasing it past that.
34 const int PhishingTermFeatureExtractor::kClockCheckGranularity
= 5;
36 // This should be longer than we expect feature extraction to take on any
37 // actual phishing page.
38 const int PhishingTermFeatureExtractor::kMaxTotalTimeMs
= 500;
40 // The maximum size of the negative word cache.
41 const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize
= 1000;
43 // All of the state pertaining to the current feature extraction.
44 struct PhishingTermFeatureExtractor::ExtractionState
{
45 // Stores up to max_words_per_term_ previous words separated by spaces.
46 std::string previous_words
;
48 // Stores the sizes of the words in previous_words. Note: the size includes
49 // the space after each word. In other words, the sum of all sizes in this
50 // list is equal to the length of previous_words.
51 std::list
<size_t> previous_word_sizes
;
53 // An iterator for word breaking.
54 UBreakIterator
* iterator
;
56 // Our current position in the text that was passed to the ExtractionState
57 // constructor, speciailly, the most recent break position returned by our
61 // True if position has been initialized.
62 bool position_initialized
;
64 // The time at which we started feature extraction for the current page.
65 base::TimeTicks start_time
;
67 // The number of iterations we've done for the current extraction.
70 ExtractionState(const base::string16
& text
, base::TimeTicks start_time_ticks
)
72 position_initialized(false),
73 start_time(start_time_ticks
),
75 UErrorCode status
= U_ZERO_ERROR
;
76 // TODO(bryner): We should pass in the language for the document.
77 iterator
= ubrk_open(UBRK_WORD
, NULL
,
78 text
.data(), text
.size(),
80 if (U_FAILURE(status
)) {
81 DLOG(ERROR
) << "ubrk_open failed: " << status
;
93 PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
94 const base::hash_set
<std::string
>* page_term_hashes
,
95 const base::hash_set
<uint32
>* page_word_hashes
,
96 size_t max_words_per_term
,
97 uint32 murmurhash3_seed
,
98 FeatureExtractorClock
* clock
)
99 : page_term_hashes_(page_term_hashes
),
100 page_word_hashes_(page_word_hashes
),
101 max_words_per_term_(max_words_per_term
),
102 murmurhash3_seed_(murmurhash3_seed
),
103 negative_word_cache_(kMaxNegativeWordCacheSize
),
105 weak_factory_(this) {
109 PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
110 // The RenderView should have called CancelPendingExtraction() before
112 CheckNoPendingExtraction();
115 void PhishingTermFeatureExtractor::ExtractFeatures(
116 const base::string16
* page_text
,
117 FeatureMap
* features
,
118 const DoneCallback
& done_callback
) {
119 // The RenderView should have called CancelPendingExtraction() before
120 // starting a new extraction, so DCHECK this.
121 CheckNoPendingExtraction();
122 // However, in an opt build, we will go ahead and clean up the pending
123 // extraction so that we can start in a known state.
124 CancelPendingExtraction();
126 page_text_
= page_text
;
127 features_
= features
;
128 done_callback_
= done_callback
;
130 state_
.reset(new ExtractionState(*page_text_
, clock_
->Now()));
131 base::MessageLoop::current()->PostTask(
133 base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout
,
134 weak_factory_
.GetWeakPtr()));
137 void PhishingTermFeatureExtractor::CancelPendingExtraction() {
138 // Cancel any pending callbacks, and clear our state.
139 weak_factory_
.InvalidateWeakPtrs();
143 void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
144 DCHECK(state_
.get());
145 ++state_
->num_iterations
;
146 base::TimeTicks current_chunk_start_time
= clock_
->Now();
148 if (!state_
->iterator
) {
149 // We failed to initialize the break iterator, so stop now.
150 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
155 if (!state_
->position_initialized
) {
156 state_
->position
= ubrk_first(state_
->iterator
);
157 if (state_
->position
== UBRK_DONE
) {
158 // No words present, so we're done.
162 state_
->position_initialized
= true;
166 for (int next
= ubrk_next(state_
->iterator
);
167 next
!= UBRK_DONE
; next
= ubrk_next(state_
->iterator
)) {
168 if (ubrk_getRuleStatus(state_
->iterator
) != UBRK_WORD_NONE
) {
169 // next is now positioned at the end of a word.
170 HandleWord(base::StringPiece16(page_text_
->data() + state_
->position
,
171 next
- state_
->position
));
174 state_
->position
= next
;
176 if (num_words
>= kClockCheckGranularity
) {
178 base::TimeTicks now
= clock_
->Now();
179 if (now
- state_
->start_time
>=
180 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs
)) {
181 DLOG(ERROR
) << "Feature extraction took too long, giving up";
182 // We expect this to happen infrequently, so record when it does.
183 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
187 base::TimeDelta chunk_elapsed
= now
- current_chunk_start_time
;
189 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs
)) {
190 // The time limit for the current chunk is up, so post a task to
191 // continue extraction.
193 // Record how much time we actually spent on the chunk. If this is
194 // much higher than kMaxTimePerChunkMs, we may need to adjust the
195 // clock granularity.
196 UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
198 base::MessageLoop::current()->PostTask(
201 &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout
,
202 weak_factory_
.GetWeakPtr()));
205 // Otherwise, continue.
211 void PhishingTermFeatureExtractor::HandleWord(
212 const base::StringPiece16
& word
) {
213 // Quickest out if we have seen this word before and know that it's not
214 // part of any term. This avoids the lowercasing and UTF conversion, both of
215 // which are relatively expensive.
216 if (negative_word_cache_
.Get(word
) != negative_word_cache_
.end()) {
217 // We know we're no longer in a possible n-gram, so clear the previous word
219 state_
->previous_words
.clear();
220 state_
->previous_word_sizes
.clear();
224 std::string word_lower
= base::UTF16ToUTF8(base::i18n::ToLower(word
));
225 uint32 word_hash
= MurmurHash3String(word_lower
, murmurhash3_seed_
);
227 // Quick out if the word is not part of any term, which is the common case.
228 if (page_word_hashes_
->find(word_hash
) == page_word_hashes_
->end()) {
229 // Word doesn't exist in our terms so we can clear the n-gram state.
230 state_
->previous_words
.clear();
231 state_
->previous_word_sizes
.clear();
232 // Insert into negative cache so that we don't try this again.
233 negative_word_cache_
.Put(word
, true);
237 // Find all of the n-grams that we need to check and compute their SHA-256
239 std::map
<std::string
/* hash */, std::string
/* plaintext */>
241 hashes_to_check
[crypto::SHA256HashString(word_lower
)] = word_lower
;
243 // Combine the new word with the previous words to find additional n-grams.
244 // Note that we don't yet add the new word length to previous_word_sizes,
245 // since we don't want to compute the hash for the word by itself again.
247 state_
->previous_words
.append(word_lower
);
248 std::string current_term
= state_
->previous_words
;
249 for (std::list
<size_t>::iterator it
= state_
->previous_word_sizes
.begin();
250 it
!= state_
->previous_word_sizes
.end(); ++it
) {
251 hashes_to_check
[crypto::SHA256HashString(current_term
)] = current_term
;
252 current_term
.erase(0, *it
);
255 // Add features for any hashes that match page_term_hashes_.
256 for (std::map
<std::string
, std::string
>::iterator it
=
257 hashes_to_check
.begin();
258 it
!= hashes_to_check
.end(); ++it
) {
259 if (page_term_hashes_
->find(it
->first
) != page_term_hashes_
->end()) {
260 features_
->AddBooleanFeature(features::kPageTerm
+ it
->second
);
264 // Now that we have handled the current word, we have to add a space at the
265 // end of it, and add the new word's size (including the space) to
266 // previous_word_sizes. Note: it's possible that the document language
267 // doesn't use ASCII spaces to separate words. That's fine though, we just
268 // need to be consistent with how the model is generated.
269 state_
->previous_words
.append(" ");
270 state_
->previous_word_sizes
.push_back(word_lower
.size() + 1);
272 // Cap the number of previous words.
273 if (state_
->previous_word_sizes
.size() >= max_words_per_term_
) {
274 state_
->previous_words
.erase(0, state_
->previous_word_sizes
.front());
275 state_
->previous_word_sizes
.pop_front();
279 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
280 DCHECK(done_callback_
.is_null());
281 DCHECK(!state_
.get());
282 if (!done_callback_
.is_null() || state_
.get()) {
283 LOG(ERROR
) << "Extraction in progress, missing call to "
284 << "CancelPendingExtraction";
288 void PhishingTermFeatureExtractor::RunCallback(bool success
) {
289 // Record some timing stats that we can use to evaluate feature extraction
290 // performance. These include both successful and failed extractions.
291 DCHECK(state_
.get());
292 UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
293 state_
->num_iterations
);
294 UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
295 clock_
->Now() - state_
->start_time
);
297 DCHECK(!done_callback_
.is_null());
298 done_callback_
.Run(success
);
302 void PhishingTermFeatureExtractor::Clear() {
305 done_callback_
.Reset();
307 negative_word_cache_
.Clear();
310 } // namespace safe_browsing