chrome/renderer/safe_browsing/phishing_term_feature_extractor.h

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4 //
   5 // PhishingTermFeatureExtractor handles computing term features from the text
   6 // of a web page for the client-side phishing detection model.  To do this, it
   7 // takes a list of terms that appear in the model, and scans through the page
   8 // text looking for them.  Any terms that appear will cause a corresponding
   9 // features::kPageTerm feature to be added to the FeatureMap.
  10 //
  11 // To make it harder for a phisher to enumerate all of the relevant terms in
  12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.
  13 //
  14 // There is one PhishingTermFeatureExtractor per RenderView.
  15
  16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
  17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
  18
  19 #include <string>
  20
  21 #include "base/basictypes.h"
  22 #include "base/callback.h"
  23 #include "base/containers/hash_tables.h"
  24 #include "base/containers/mru_cache.h"
  25 #include "base/memory/scoped_ptr.h"
  26 #include "base/memory/weak_ptr.h"
  27 #include "base/strings/string16.h"
  28 #include "base/strings/string_piece.h"
  29
  30 namespace safe_browsing {
  31 class FeatureExtractorClock;
  32 class FeatureMap;
  33
  34 class PhishingTermFeatureExtractor {
  35  public:
  36   // Callback to be run when feature extraction finishes.  The callback
  37   // argument is true if extraction was successful, false otherwise.
  38   typedef base::Callback<void(bool)> DoneCallback;
  39
  40   // Creates a PhishingTermFeatureExtractor which will extract features for
  41   // all of the terms whose SHA-256 hashes are in |page_term_hashes|.  These
  42   // terms may be multi-word n-grams, with at most |max_words_per_term| words.
  43   //
  44   // |page_word_hashes| contains the murmur3 hashes for all of the individual
  45   // words that make up the terms; |murmurhash3_seed| is the seed for
  46   // murmurhash3.  The terms and words are UTF-8 encoded and lowercased prior
  47   // to hashing.  The caller owns both hash sets, and must ensure that they
  48   // are valid until the PhishingTermFeatureExtractor is destroyed.
  49   //
  50   // |clock| is used for timing feature extractor operations, and may be mocked
  51   // for testing.  The caller keeps ownership of the clock.
  52   PhishingTermFeatureExtractor(
  53       const base::hash_set<std::string>* page_term_hashes,
  54       const base::hash_set<uint32>* page_word_hashes,
  55       size_t max_words_per_term,
  56       uint32 murmurhash3_seed,
  57       FeatureExtractorClock* clock);
  58   ~PhishingTermFeatureExtractor();
  59
  60   // Begins extracting features from |page_text| into the given FeatureMap.
  61   // |page_text| should contain the plain text of a web page, including any
  62   // subframes, as returned by RenderView::CaptureText().
  63   //
  64   // To avoid blocking the render thread for too long, the feature extractor
  65   // may run in several chunks of work, posting a task to the current
  66   // MessageLoop to continue processing.  Once feature extraction is complete,
  67   // |done_callback| is run on the current thread.
  68   // PhishingTermFeatureExtractor takes ownership of the callback.
  69   //
  70   // |page_text| and |features| are owned by the caller, and must not be
  71   // destroyed until either |done_callback| is run or
  72   // CancelPendingExtraction() is called.
  73   void ExtractFeatures(const base::string16* page_text,
  74                        FeatureMap* features,
  75                        const DoneCallback& done_callback);
  76
  77   // Cancels any pending feature extraction.  The DoneCallback will not be run.
  78   // Must be called if there is a feature extraction in progress when the page
  79   // is unloaded or the PhishingTermFeatureExtractor is destroyed.
  80   void CancelPendingExtraction();
  81
  82  private:
  83   struct ExtractionState;  // Forward declaration; see |state_| below.
  84
  85   // The maximum amount of wall time that we will spend on a single extraction
  86   // iteration before pausing to let other MessageLoop tasks run.
  87   static const int kMaxTimePerChunkMs;
  88
  89   // The number of words that we will process before checking to see whether
  90   // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
  91   // slow, we don't do this on every word processed.
  92   static const int kClockCheckGranularity;
  93
  94   // The maximum total amount of time that the feature extractor will run
  95   // before giving up on the current page.
  96   static const int kMaxTotalTimeMs;
  97
  98   // The size of the cache that we use to determine if we can avoid
  99   // lowercasing, hashing, and UTF conversion of a word.
 100   static const int kMaxNegativeWordCacheSize;
 101
 102   // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
 103   // until a predefined maximum amount of time has elapsed, then posts a task
 104   // to the current MessageLoop to continue extraction.  When extraction
 105   // finishes, calls RunCallback().
 106   void ExtractFeaturesWithTimeout();
 107
 108   // Handles a single word in the page text.
 109   void HandleWord(const base::StringPiece16& word);
 110
 111   // Helper to verify that there is no pending feature extraction.  Dies in
 112   // debug builds if the state is not as expected.  This is a no-op in release
 113   // builds.
 114   void CheckNoPendingExtraction();
 115
 116   // Runs |done_callback_| and then clears all internal state.
 117   void RunCallback(bool success);
 118
 119   // Clears all internal feature extraction state.
 120   void Clear();
 121
 122   // All of the term hashes that we are looking for in the page.
 123   const base::hash_set<std::string>* page_term_hashes_;
 124
 125   // Murmur3 hashes of all the individual words in page_term_hashes_.  If
 126   // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
 127   // would contain (hashed) "one" and "two".  We do this so that we can have a
 128   // quick out in the common case that the current word we are processing
 129   // doesn't contain any part of one of our terms.
 130   const base::hash_set<uint32>* page_word_hashes_;
 131
 132   // The maximum number of words in an n-gram.
 133   const size_t max_words_per_term_;
 134
 135   // The seed for murmurhash3.
 136   const uint32 murmurhash3_seed_;
 137
 138   // This cache is used to see if we need to check the word at all, as
 139   // converting to UTF8, lowercasing, and hashing are all relatively expensive
 140   // operations. Though this is called an MRU cache, it seems to behave like
 141   // an LRU cache (i.e. it evicts the oldest accesses first).
 142   typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;
 143   WordCache negative_word_cache_;
 144
 145   // Non-owned pointer to our clock.
 146   FeatureExtractorClock* clock_;
 147
 148   // The arguments passed to the most recent call to ExtractFeatures().
 149   const base::string16* page_text_;  // The caller keeps ownership of this.
 150   FeatureMap* features_;  // The caller keeps ownership of this.
 151   DoneCallback done_callback_;
 152
 153   // Stores the current state of term extraction from |page_text_|.
 154   scoped_ptr<ExtractionState> state_;
 155
 156   // Used in scheduling ExtractFeaturesWithTimeout tasks.
 157   // These pointers are invalidated if extraction is cancelled.
 158   base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;
 159
 160   DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
 161 };
 162
 163 }  // namespace safe_browsing
 164
 165 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_