Version 7 golden file for safe-browsing test.
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_term_feature_extractor.cc
blob89994dfd04cf4488d4f4a87689cff92bc8760bb8
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
7 #include <list>
8 #include <map>
10 #include "base/bind.h"
11 #include "base/compiler_specific.h"
12 #include "base/i18n/case_conversion.h"
13 #include "base/logging.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/metrics/histogram.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19 #include "chrome/renderer/safe_browsing/features.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "crypto/sha2.h"
22 #include "third_party/icu/source/common/unicode/ubrk.h"
23 #include "ui/base/l10n/l10n_util.h"
25 namespace safe_browsing {
// The maximum time to spend on a single chunk of work before yielding.
// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// The number of words to process between clock checks; checking the clock on
// every word would be too expensive. Experimenting shows that we get a
// reasonable gain in performance by increasing this up to around 10, but
// there's not much benefit in increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// Total time budget for one extraction; if exceeded we give up and report
// failure. This should be longer than we expect feature extraction to take
// on any actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

// The maximum size of the negative word cache (words known not to be part of
// any term).
const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_term_ previous words separated by spaces.
  std::string previous_words;

  // Stores the sizes of the words in previous_words. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An ICU iterator for word breaking, or NULL if ubrk_open() failed in the
  // constructor.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by our
  // iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  // Opens a word-break iterator over |text|. On failure, |iterator| is left
  // NULL and the caller is expected to abort extraction.
  ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    // Release the ICU iterator if we successfully opened one.
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};
// Note: raw pointers to the hash sets and the clock are retained for the
// extractor's lifetime; the caller keeps ownership — TODO confirm against the
// header's ownership comments.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32>* page_word_hashes,
    size_t max_words_per_term,
    uint32 murmurhash3_seed,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      negative_word_cache_(kMaxNegativeWordCacheSize),
      clock_(clock),
      weak_factory_(this) {
  // Start in the idle (no extraction pending) state.
  Clear();
}
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}
// Begins chunked term-feature extraction over |page_text|. Matching features
// are added to |features|, and |done_callback| is run with a success flag
// when extraction finishes, fails, or times out. |page_text| and |features|
// are stored as raw pointers and used from posted tasks, so they must remain
// valid until the callback runs or CancelPendingExtraction() is called.
void PhishingTermFeatureExtractor::ExtractFeatures(
    const base::string16* page_text,
    FeatureMap* features,
    const DoneCallback& done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  done_callback_ = done_callback;

  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  // Run asynchronously so we never block the caller; the weak pointer lets
  // CancelPendingExtraction() invalidate the task.
  base::MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state. Invalidating the weak
  // pointers first guarantees no already-posted task can run afterwards.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}
// Processes words from the current break position until either the text is
// exhausted (success), the per-chunk time budget expires (a continuation
// task is posted), or the total time budget expires (failure).
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
    // Skip breaks that do not end a word (punctuation, whitespace, etc.).
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word.
      HandleWord(base::StringPiece16(page_text_->data() + state_->position,
                                     next - state_->position));
      ++num_words;
    }
    state_->position = next;

    // Checking the clock is comparatively expensive, so only do it once
    // every kClockCheckGranularity words.
    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk. If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::MessageLoop::current()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  // Reached the end of the text without timing out.
  RunCallback(true);
}
// Updates term features for a single word. |word| is a view into the page
// text being extracted. Maintains the rolling n-gram state in |state_| and
// the negative word cache.
void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // Quickest out if we have seen this word before and know that it's not
  // part of any term. This avoids the lowercasing and UTF conversion, both of
  // which are relatively expensive.
  if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
    // We know we're no longer in a possible n-gram, so clear the previous word
    // state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
  uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    // Insert into negative cache so that we don't try this again.
    negative_word_cache_.Put(word, true);
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    // Each pass hashes the current suffix, then drops the oldest word to form
    // the next (shorter) n-gram ending at the new word.
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes. Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words. That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
279 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
280 DCHECK(done_callback_.is_null());
281 DCHECK(!state_.get());
282 if (!done_callback_.is_null() || state_.get()) {
283 LOG(ERROR) << "Extraction in progress, missing call to "
284 << "CancelPendingExtraction";
// Reports timing metrics, invokes the done callback with |success|, and
// returns the extractor to the idle state.
void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance. These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
                       state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(!done_callback_.is_null());
  done_callback_.Run(success);
  // Clear after running the callback so state is valid while it executes.
  Clear();
}
302 void PhishingTermFeatureExtractor::Clear() {
303 page_text_ = NULL;
304 features_ = NULL;
305 done_callback_.Reset();
306 state_.reset(NULL);
307 negative_word_cache_.Clear();
310 } // namespace safe_browsing