Version 7 golden file for safe-browsing test.
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_term_feature_extractor.cc
blob89994dfd04cf4488d4f4a87689cff92bc8760bb8
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
7 #include <list>
8 #include <map>
10 #include "base/bind.h"
11 #include "base/compiler_specific.h"
12 #include "base/i18n/case_conversion.h"
13 #include "base/logging.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/metrics/histogram.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19 #include "chrome/renderer/safe_browsing/features.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "crypto/sha2.h"
22 #include "third_party/icu/source/common/unicode/ubrk.h"
23 #include "ui/base/l10n/l10n_util.h"
25 namespace safe_browsing {
// The maximum time to spend on a single chunk of work before yielding.
// This time should be short enough that it doesn't noticeably disrupt the
// user's interaction with the page.
const int PhishingTermFeatureExtractor::kMaxTimePerChunkMs = 10;

// The number of words to process between clock checks; checking the clock on
// every word would be too expensive. Experimenting shows that we get a
// reasonable gain in performance by increasing this up to around 10, but
// there's not much benefit in increasing it past that.
const int PhishingTermFeatureExtractor::kClockCheckGranularity = 5;

// Total time budget for one extraction; if exceeded we give up and report
// failure. This should be longer than we expect feature extraction to take
// on any actual phishing page.
const int PhishingTermFeatureExtractor::kMaxTotalTimeMs = 500;

// The maximum size of the negative word cache (words known not to be part of
// any term).
const int PhishingTermFeatureExtractor::kMaxNegativeWordCacheSize = 1000;
// All of the state pertaining to the current feature extraction.
struct PhishingTermFeatureExtractor::ExtractionState {
  // Stores up to max_words_per_term_ previous words separated by spaces.
  std::string previous_words;

  // Stores the sizes of the words in previous_words. Note: the size includes
  // the space after each word. In other words, the sum of all sizes in this
  // list is equal to the length of previous_words.
  std::list<size_t> previous_word_sizes;

  // An ICU iterator for word breaking, or NULL if ubrk_open() failed in the
  // constructor.
  UBreakIterator* iterator;

  // Our current position in the text that was passed to the ExtractionState
  // constructor, specifically, the most recent break position returned by our
  // iterator.
  int position;

  // True if position has been initialized.
  bool position_initialized;

  // The time at which we started feature extraction for the current page.
  base::TimeTicks start_time;

  // The number of iterations we've done for the current extraction.
  int num_iterations;

  // Opens a word-break iterator over |text|. On failure, |iterator| is left
  // NULL and the caller is expected to abort extraction.
  ExtractionState(const base::string16& text, base::TimeTicks start_time_ticks)
      : position(-1),
        position_initialized(false),
        start_time(start_time_ticks),
        num_iterations(0) {
    UErrorCode status = U_ZERO_ERROR;
    // TODO(bryner): We should pass in the language for the document.
    iterator = ubrk_open(UBRK_WORD, NULL,
                         text.data(), text.size(),
                         &status);
    if (U_FAILURE(status)) {
      DLOG(ERROR) << "ubrk_open failed: " << status;
      iterator = NULL;
    }
  }

  ~ExtractionState() {
    // Release the ICU iterator if we successfully opened one.
    if (iterator) {
      ubrk_close(iterator);
    }
  }
};
// Note: raw pointers to the hash sets and the clock are retained for the
// extractor's lifetime; the caller keeps ownership — TODO confirm against the
// header's ownership comments.
PhishingTermFeatureExtractor::PhishingTermFeatureExtractor(
    const base::hash_set<std::string>* page_term_hashes,
    const base::hash_set<uint32>* page_word_hashes,
    size_t max_words_per_term,
    uint32 murmurhash3_seed,
    FeatureExtractorClock* clock)
    : page_term_hashes_(page_term_hashes),
      page_word_hashes_(page_word_hashes),
      max_words_per_term_(max_words_per_term),
      murmurhash3_seed_(murmurhash3_seed),
      negative_word_cache_(kMaxNegativeWordCacheSize),
      clock_(clock),
      weak_factory_(this) {
  // Start in the idle (no extraction pending) state.
  Clear();
}
PhishingTermFeatureExtractor::~PhishingTermFeatureExtractor() {
  // The RenderView should have called CancelPendingExtraction() before
  // we are destroyed.
  CheckNoPendingExtraction();
}
// Begins chunked term-feature extraction over |page_text|. Matching features
// are added to |features|, and |done_callback| is run with a success flag
// when extraction finishes, fails, or times out. |page_text| and |features|
// are stored as raw pointers and used from posted tasks, so they must remain
// valid until the callback runs or CancelPendingExtraction() is called.
void PhishingTermFeatureExtractor::ExtractFeatures(
    const base::string16* page_text,
    FeatureMap* features,
    const DoneCallback& done_callback) {
  // The RenderView should have called CancelPendingExtraction() before
  // starting a new extraction, so DCHECK this.
  CheckNoPendingExtraction();
  // However, in an opt build, we will go ahead and clean up the pending
  // extraction so that we can start in a known state.
  CancelPendingExtraction();

  page_text_ = page_text;
  features_ = features;
  done_callback_ = done_callback;

  state_.reset(new ExtractionState(*page_text_, clock_->Now()));
  // Run asynchronously so we never block the caller; the weak pointer lets
  // CancelPendingExtraction() invalidate the task.
  base::MessageLoop::current()->PostTask(
      FROM_HERE,
      base::Bind(&PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                 weak_factory_.GetWeakPtr()));
}
void PhishingTermFeatureExtractor::CancelPendingExtraction() {
  // Cancel any pending callbacks, and clear our state. Invalidating the weak
  // pointers first guarantees no already-posted task can run afterwards.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}
// Processes words from the current break position until either the text is
// exhausted (success), the per-chunk time budget expires (a continuation
// task is posted), or the total time budget expires (failure).
void PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout() {
  DCHECK(state_.get());
  ++state_->num_iterations;
  base::TimeTicks current_chunk_start_time = clock_->Now();

  if (!state_->iterator) {
    // We failed to initialize the break iterator, so stop now.
    UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureBreakIterError", 1);
    RunCallback(false);
    return;
  }

  if (!state_->position_initialized) {
    state_->position = ubrk_first(state_->iterator);
    if (state_->position == UBRK_DONE) {
      // No words present, so we're done.
      RunCallback(true);
      return;
    }
    state_->position_initialized = true;
  }

  int num_words = 0;
  for (int next = ubrk_next(state_->iterator);
       next != UBRK_DONE; next = ubrk_next(state_->iterator)) {
    // Skip breaks that do not end a word (punctuation, whitespace, etc.).
    if (ubrk_getRuleStatus(state_->iterator) != UBRK_WORD_NONE) {
      // next is now positioned at the end of a word.
      HandleWord(base::StringPiece16(page_text_->data() + state_->position,
                                     next - state_->position));
      ++num_words;
    }
    state_->position = next;

    // Checking the clock is comparatively expensive, so only do it once
    // every kClockCheckGranularity words.
    if (num_words >= kClockCheckGranularity) {
      num_words = 0;
      base::TimeTicks now = clock_->Now();
      if (now - state_->start_time >=
          base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
        DLOG(ERROR) << "Feature extraction took too long, giving up";
        // We expect this to happen infrequently, so record when it does.
        UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureTimeout", 1);
        RunCallback(false);
        return;
      }
      base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
      if (chunk_elapsed >=
          base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
        // The time limit for the current chunk is up, so post a task to
        // continue extraction.
        //
        // Record how much time we actually spent on the chunk. If this is
        // much higher than kMaxTimePerChunkMs, we may need to adjust the
        // clock granularity.
        UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureChunkTime",
                            chunk_elapsed);
        base::MessageLoop::current()->PostTask(
            FROM_HERE,
            base::Bind(
                &PhishingTermFeatureExtractor::ExtractFeaturesWithTimeout,
                weak_factory_.GetWeakPtr()));
        return;
      }
      // Otherwise, continue.
    }
  }
  // Reached the end of the text without timing out.
  RunCallback(true);
}
// Updates term features for a single word. |word| is a view into the page
// text being extracted. Maintains the rolling n-gram state in |state_| and
// the negative word cache.
void PhishingTermFeatureExtractor::HandleWord(
    const base::StringPiece16& word) {
  // Quickest out if we have seen this word before and know that it's not
  // part of any term. This avoids the lowercasing and UTF conversion, both of
  // which are relatively expensive.
  if (negative_word_cache_.Get(word) != negative_word_cache_.end()) {
    // We know we're no longer in a possible n-gram, so clear the previous word
    // state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    return;
  }

  std::string word_lower = base::UTF16ToUTF8(base::i18n::ToLower(word));
  uint32 word_hash = MurmurHash3String(word_lower, murmurhash3_seed_);

  // Quick out if the word is not part of any term, which is the common case.
  if (page_word_hashes_->find(word_hash) == page_word_hashes_->end()) {
    // Word doesn't exist in our terms so we can clear the n-gram state.
    state_->previous_words.clear();
    state_->previous_word_sizes.clear();
    // Insert into negative cache so that we don't try this again.
    negative_word_cache_.Put(word, true);
    return;
  }

  // Find all of the n-grams that we need to check and compute their SHA-256
  // hashes.
  std::map<std::string /* hash */, std::string /* plaintext */>
      hashes_to_check;
  hashes_to_check[crypto::SHA256HashString(word_lower)] = word_lower;

  // Combine the new word with the previous words to find additional n-grams.
  // Note that we don't yet add the new word length to previous_word_sizes,
  // since we don't want to compute the hash for the word by itself again.
  state_->previous_words.append(word_lower);
  std::string current_term = state_->previous_words;
  for (std::list<size_t>::iterator it = state_->previous_word_sizes.begin();
       it != state_->previous_word_sizes.end(); ++it) {
    // Each pass hashes the current suffix, then drops the oldest word to form
    // the next (shorter) n-gram ending at the new word.
    hashes_to_check[crypto::SHA256HashString(current_term)] = current_term;
    current_term.erase(0, *it);
  }

  // Add features for any hashes that match page_term_hashes_.
  for (std::map<std::string, std::string>::iterator it =
           hashes_to_check.begin();
       it != hashes_to_check.end(); ++it) {
    if (page_term_hashes_->find(it->first) != page_term_hashes_->end()) {
      features_->AddBooleanFeature(features::kPageTerm + it->second);
    }
  }

  // Now that we have handled the current word, we have to add a space at the
  // end of it, and add the new word's size (including the space) to
  // previous_word_sizes. Note: it's possible that the document language
  // doesn't use ASCII spaces to separate words. That's fine though, we just
  // need to be consistent with how the model is generated.
  state_->previous_words.append(" ");
  state_->previous_word_sizes.push_back(word_lower.size() + 1);

  // Cap the number of previous words.
  if (state_->previous_word_sizes.size() >= max_words_per_term_) {
    state_->previous_words.erase(0, state_->previous_word_sizes.front());
    state_->previous_word_sizes.pop_front();
  }
}
279 void PhishingTermFeatureExtractor::CheckNoPendingExtraction() {
280 DCHECK(done_callback_.is_null());
281 DCHECK(!state_.get());
282 if (!done_callback_.is_null() || state_.get()) {
283 LOG(ERROR) << "Extraction in progress, missing call to "
284 << "CancelPendingExtraction";
// Reports timing metrics, invokes the done callback with |success|, and
// returns the extractor to the idle state.
void PhishingTermFeatureExtractor::RunCallback(bool success) {
  // Record some timing stats that we can use to evaluate feature extraction
  // performance. These include both successful and failed extractions.
  DCHECK(state_.get());
  UMA_HISTOGRAM_COUNTS("SBClientPhishing.TermFeatureIterations",
                       state_->num_iterations);
  UMA_HISTOGRAM_TIMES("SBClientPhishing.TermFeatureTotalTime",
                      clock_->Now() - state_->start_time);

  DCHECK(!done_callback_.is_null());
  done_callback_.Run(success);
  // Clear after running the callback so state is valid while it executes.
  Clear();
}
302 void PhishingTermFeatureExtractor::Clear() {
303 page_text_ = NULL;
304 features_ = NULL;
305 done_callback_.Reset();
306 state_.reset(NULL);
307 negative_word_cache_.Clear();
310 } // namespace safe_browsing