chrome/renderer/safe_browsing/phishing_classifier.h

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4 //
   5 // This class handles the process of extracting all of the features from a
   6 // page and computing a phishyness score.  The basic steps are:
   7 //  - Run each feature extractor over the page, building up a FeatureMap of
   8 //    feature -> value.
   9 //  - SHA-256 hash all of the feature names in the map so that they match the
  10 //    supplied model.
  11 //  - Hand the hashed map off to a Scorer, which computes the probability that
  12 //    the page is phishy.
  13 //  - Run the supplied callback with the resulting verdict (phishy or not).
  14 //
  15 // For more details, see phishing_*_feature_extractor.h, scorer.h, and
  16 // client_model.proto.
  17
  18 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
  19 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
  20
  21 #include "base/basictypes.h"
  22 #include "base/callback.h"
  23 #include "base/memory/scoped_ptr.h"
  24 #include "base/memory/weak_ptr.h"
  25 #include "base/strings/string16.h"
  26
  27 namespace content {
  28 class RenderView;
  29 }  // namespace content
  30
  31 namespace safe_browsing {
  32 class ClientPhishingRequest;  // Verdict passed to DoneCallback.
  33 class FeatureExtractorClock;  // Times feature extractor operations.
  34 class FeatureMap;  // feature -> value map built during extraction.
  35 class PhishingDOMFeatureExtractor;  // DOM feature extraction step.
  36 class PhishingTermFeatureExtractor;  // Term feature extraction step.
  37 class PhishingUrlFeatureExtractor;  // URL feature extraction step.
  38 class Scorer;  // Computes the probability that the page is phishy.
  39 // Drives feature extraction and phishiness scoring for a RenderView's page.
  40 class PhishingClassifier {
  41  public:
  42   // Callback run when phishing classification finishes.  |verdict| is a
  43   // ClientPhishingRequest holding the verdict computed by the classifier as
  44   // well as the extracted features.  If verdict.is_phishing() is true, the
  45   // page is considered phishy by the client-side model, and the browser
  46   // should ping back to get a final verdict.  verdict.client_score() is set
  47   // to kInvalidScore if classification failed.
  48   typedef base::Callback<void(const ClientPhishingRequest& /* verdict */)>
  49       DoneCallback;
  50
  51   static const float kInvalidScore;  // client_score() value on failure.
  52
  53   // Creates a new PhishingClassifier object that will operate on
  54   // |render_view|.  |clock| is used to time feature extractor operations, and
  55   // the PhishingClassifier takes ownership of |clock|.  Note that the
  56   // classifier will not be 'ready' until set_phishing_scorer() is called.
  57   PhishingClassifier(content::RenderView* render_view,
  58                      FeatureExtractorClock* clock);
  59   virtual ~PhishingClassifier();
  60
  61   // Sets the scorer the classifier uses to compute the phishiness score.
  62   // |scorer| stays owned by the caller and must live at least as long as the
  63   // PhishingClassifier.  The caller is expected to cancel any pending
  64   // classification before setting a phishing scorer.
  65   void set_phishing_scorer(const Scorer* scorer);
  66
  67   // Returns true if the classifier is ready to classify pages, i.e. it
  68   // has had a scorer set via set_phishing_scorer().
  69   bool is_ready() const;
  70
  71   // Called by the RenderView when a page has finished loading.  This begins
  72   // the feature extraction and scoring process.  |page_text| should contain
  73   // the plain text of a web page, including any subframes, as returned by
  74   // RenderView::CaptureText().  |page_text| is owned by the caller, and must
  75   // not be destroyed until either |callback| is run or
  76   // CancelPendingClassification() is called.
  77   //
  78   // To avoid blocking the render thread for too long, phishing classification
  79   // may run in several chunks of work, posting a task to the current
  80   // MessageLoop to continue processing.  Once the scoring process is complete,
  81   // |callback| is run on the current thread.  PhishingClassifier takes
  82   // ownership of the callback.
  83   //
  84   // It is an error to call BeginClassification if the classifier is not yet
  85   // ready (see is_ready()).
  86   virtual void BeginClassification(const base::string16* page_text,
  87                                    const DoneCallback& callback);
  88
  89   // Called by the RenderView (on the render thread) when a page is unloading
  90   // or the RenderView is being destroyed.  This cancels any extraction that
  91   // is in progress.  It is an error to call CancelPendingClassification if
  92   // the classifier is not yet ready.
  93   virtual void CancelPendingClassification();
  94
  95  private:
  96   // Any score equal to or above this value is considered phishy.
  97   static const float kPhishyThreshold;
  98
  99   // Begins the feature extraction process, by extracting URL features and
 100   // beginning DOM feature extraction.
 101   void BeginFeatureExtraction();
 102
 103   // Callback to be run when DOM feature extraction is complete.
 104   // If |success| is true, begins term feature extraction; otherwise
 105   // runs the DoneCallback with a non-phishy verdict.
 106   void DOMExtractionFinished(bool success);
 107
 108   // Callback to be run when term feature extraction is complete.
 109   // If |success| is true, computes a score and runs the DoneCallback.
 110   // If |success| is false, runs the DoneCallback with a non-phishy
 111   // verdict.
 112   void TermExtractionFinished(bool success);
 113
 114   // Helper to verify that there is no pending phishing classification.  Dies
 115   // in debug builds if the state is not as expected.  This is a no-op in
 116   // release builds.
 117   void CheckNoPendingClassification();
 118
 119   // Helper method to run the DoneCallback with |verdict| and clear the state.
 120   void RunCallback(const ClientPhishingRequest& verdict);
 121
 122   // Helper to run the DoneCallback when feature extraction has failed.
 123   // This always signals a non-phishy verdict for the page, with kInvalidScore.
 124   void RunFailureCallback();
 125
 126   // Clears the current state of the PhishingClassifier.
 127   void Clear();
 128
 129   content::RenderView* render_view_;  // owns us
 130   const Scorer* scorer_;  // owned by the caller
 131   scoped_ptr<FeatureExtractorClock> clock_;  // times feature extraction
 132   scoped_ptr<PhishingUrlFeatureExtractor> url_extractor_;
 133   scoped_ptr<PhishingDOMFeatureExtractor> dom_extractor_;
 134   scoped_ptr<PhishingTermFeatureExtractor> term_extractor_;
 135
 136   // State for any in-progress extraction.
 137   scoped_ptr<FeatureMap> features_;
 138   const base::string16* page_text_;  // owned by the caller
 139   DoneCallback done_callback_;  // callback given to BeginClassification()
 140
 141   // Used in scheduling BeginFeatureExtraction tasks.
 142   // These pointers are invalidated if classification is cancelled.
 143   base::WeakPtrFactory<PhishingClassifier> weak_factory_;
 144
 145   DISALLOW_COPY_AND_ASSIGN(PhishingClassifier);
 146 };
 147
 148 }  // namespace safe_browsing
 149
 150 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_