chrome/renderer/safe_browsing/phishing_classifier.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
   6
   7 #include <string>
   8
   9 #include "base/bind.h"
  10 #include "base/callback.h"
  11 #include "base/compiler_specific.h"
  12 #include "base/location.h"
  13 #include "base/logging.h"
  14 #include "base/metrics/histogram.h"
  15 #include "base/single_thread_task_runner.h"
  16 #include "base/strings/string_util.h"
  17 #include "base/thread_task_runner_handle.h"
  18 #include "chrome/common/safe_browsing/csd.pb.h"
  19 #include "chrome/common/url_constants.h"
  20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
  21 #include "chrome/renderer/safe_browsing/features.h"
  22 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
  23 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
  24 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
  25 #include "chrome/renderer/safe_browsing/scorer.h"
  26 #include "content/public/renderer/render_view.h"
  27 #include "crypto/sha2.h"
  28 #include "third_party/WebKit/public/platform/WebURL.h"
  29 #include "third_party/WebKit/public/platform/WebURLRequest.h"
  30 #include "third_party/WebKit/public/web/WebDataSource.h"
  31 #include "third_party/WebKit/public/web/WebDocument.h"
  32 #include "third_party/WebKit/public/web/WebFrame.h"
  33 #include "third_party/WebKit/public/web/WebView.h"
  34 #include "url/gurl.h"
  35
  36 namespace safe_browsing {
  37
  38 const float PhishingClassifier::kInvalidScore = -1.0;
  39 const float PhishingClassifier::kPhishyThreshold = 0.5;
  40
  41 PhishingClassifier::PhishingClassifier(content::RenderView* render_view,
  42                                        FeatureExtractorClock* clock)
  43     : render_view_(render_view),
  44       scorer_(NULL),
  45       clock_(clock),
  46       weak_factory_(this) {
  47   Clear();
  48 }
  49
  50 PhishingClassifier::~PhishingClassifier() {
  51   // The RenderView should have called CancelPendingClassification() before
  52   // we are destroyed.
  53   CheckNoPendingClassification();
  54 }
  55
  56 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
  57   CheckNoPendingClassification();
  58   scorer_ = scorer;
  59   if (scorer_) {
  60     url_extractor_.reset(new PhishingUrlFeatureExtractor);
  61     dom_extractor_.reset(new PhishingDOMFeatureExtractor(clock_.get()));
  62     term_extractor_.reset(new PhishingTermFeatureExtractor(
  63         &scorer_->page_terms(),
  64         &scorer_->page_words(),
  65         scorer_->max_words_per_term(),
  66         scorer_->murmurhash3_seed(),
  67         scorer_->max_shingles_per_page(),
  68         scorer_->shingle_size(),
  69         clock_.get()));
  70   } else {
  71     // We're disabling client-side phishing detection, so tear down all
  72     // of the relevant objects.
  73     url_extractor_.reset();
  74     dom_extractor_.reset();
  75     term_extractor_.reset();
  76   }
  77 }
  78
  79 bool PhishingClassifier::is_ready() const {
  80   return scorer_ != NULL;
  81 }
  82
  83 void PhishingClassifier::BeginClassification(
  84     const base::string16* page_text,
  85     const DoneCallback& done_callback) {
  86   DCHECK(is_ready());
  87
  88   // The RenderView should have called CancelPendingClassification() before
  89   // starting a new classification, so DCHECK this.
  90   CheckNoPendingClassification();
  91   // However, in an opt build, we will go ahead and clean up the pending
  92   // classification so that we can start in a known state.
  93   CancelPendingClassification();
  94
  95   page_text_ = page_text;
  96   done_callback_ = done_callback;
  97
  98   // For consistency, we always want to invoke the DoneCallback
  99   // asynchronously, rather than directly from this method.  To ensure that
 100   // this is the case, post a task to begin feature extraction on the next
 101   // iteration of the message loop.
 102   base::ThreadTaskRunnerHandle::Get()->PostTask(
 103       FROM_HERE, base::Bind(&PhishingClassifier::BeginFeatureExtraction,
 104                             weak_factory_.GetWeakPtr()));
 105 }
 106
 107 void PhishingClassifier::BeginFeatureExtraction() {
 108   blink::WebView* web_view = render_view_->GetWebView();
 109   if (!web_view) {
 110     RunFailureCallback();
 111     return;
 112   }
 113
 114   blink::WebFrame* frame = web_view->mainFrame();
 115   if (!frame) {
 116     RunFailureCallback();
 117     return;
 118   }
 119
 120   // Check whether the URL is one that we should classify.
 121   // Currently, we only classify http: URLs that are GET requests.
 122   GURL url(frame->document().url());
 123   if (!url.SchemeIs(url::kHttpScheme)) {
 124     RunFailureCallback();
 125     return;
 126   }
 127
 128   blink::WebDataSource* ds = frame->dataSource();
 129   if (!ds ||
 130       !base::EqualsASCII(base::StringPiece16(ds->request().httpMethod()),
 131                          "GET")) {
 132     RunFailureCallback();
 133     return;
 134   }
 135
 136   features_.reset(new FeatureMap);
 137   if (!url_extractor_->ExtractFeatures(url, features_.get())) {
 138     RunFailureCallback();
 139     return;
 140   }
 141
 142   // DOM feature extraction can take awhile, so it runs asynchronously
 143   // in several chunks of work and invokes the callback when finished.
 144   dom_extractor_->ExtractFeatures(
 145       frame->document(), features_.get(),
 146       base::Bind(&PhishingClassifier::DOMExtractionFinished,
 147                  base::Unretained(this)));
 148 }
 149
 150 void PhishingClassifier::CancelPendingClassification() {
 151   // Note that cancelling the feature extractors is simply a no-op if they
 152   // were not running.
 153   DCHECK(is_ready());
 154   dom_extractor_->CancelPendingExtraction();
 155   term_extractor_->CancelPendingExtraction();
 156   weak_factory_.InvalidateWeakPtrs();
 157   Clear();
 158 }
 159
 160 void PhishingClassifier::DOMExtractionFinished(bool success) {
 161   shingle_hashes_.reset(new std::set<uint32>);
 162   if (success) {
 163     // Term feature extraction can take awhile, so it runs asynchronously
 164     // in several chunks of work and invokes the callback when finished.
 165     term_extractor_->ExtractFeatures(
 166         page_text_,
 167         features_.get(),
 168         shingle_hashes_.get(),
 169         base::Bind(&PhishingClassifier::TermExtractionFinished,
 170                    base::Unretained(this)));
 171   } else {
 172     RunFailureCallback();
 173   }
 174 }
 175
 176 void PhishingClassifier::TermExtractionFinished(bool success) {
 177   if (success) {
 178     blink::WebView* web_view = render_view_->GetWebView();
 179     if (!web_view) {
 180       RunFailureCallback();
 181       return;
 182     }
 183     blink::WebFrame* main_frame = web_view->mainFrame();
 184     if (!main_frame) {
 185       RunFailureCallback();
 186       return;
 187     }
 188
 189     // Hash all of the features so that they match the model, then compute
 190     // the score.
 191     FeatureMap hashed_features;
 192     ClientPhishingRequest verdict;
 193     verdict.set_model_version(scorer_->model_version());
 194     verdict.set_url(main_frame->document().url().spec());
 195     for (base::hash_map<std::string, double>::const_iterator it =
 196              features_->features().begin();
 197          it != features_->features().end(); ++it) {
 198       DVLOG(2) << "Feature: " << it->first << " = " << it->second;
 199       bool result = hashed_features.AddRealFeature(
 200           crypto::SHA256HashString(it->first), it->second);
 201       DCHECK(result);
 202       ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
 203       feature->set_name(it->first);
 204       feature->set_value(it->second);
 205     }
 206     for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();
 207          it != shingle_hashes_->end(); ++it) {
 208       verdict.add_shingle_hashes(*it);
 209     }
 210     float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
 211     verdict.set_client_score(score);
 212     verdict.set_is_phishing(score >= kPhishyThreshold);
 213     RunCallback(verdict);
 214   } else {
 215     RunFailureCallback();
 216   }
 217 }
 218
 219 void PhishingClassifier::CheckNoPendingClassification() {
 220   DCHECK(done_callback_.is_null());
 221   DCHECK(!page_text_);
 222   if (!done_callback_.is_null() || page_text_) {
 223     LOG(ERROR) << "Classification in progress, missing call to "
 224                << "CancelPendingClassification";
 225     UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed",
 226                          1);
 227   }
 228 }
 229
 230 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
 231   done_callback_.Run(verdict);
 232   Clear();
 233 }
 234
 235 void PhishingClassifier::RunFailureCallback() {
 236   ClientPhishingRequest verdict;
 237   // In this case we're not guaranteed to have a valid URL.  Just set it
 238   // to the empty string to make sure we have a valid protocol buffer.
 239   verdict.set_url("");
 240   verdict.set_client_score(kInvalidScore);
 241   verdict.set_is_phishing(false);
 242   RunCallback(verdict);
 243 }
 244
 245 void PhishingClassifier::Clear() {
 246   page_text_ = NULL;
 247   done_callback_.Reset();
 248   features_.reset(NULL);
 249   shingle_hashes_.reset(NULL);
 250 }
 251
 252 }  // namespace safe_browsing