chrome/renderer/safe_browsing/phishing_classifier.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
   6
   7 #include <string>
   8
   9 #include "base/bind.h"
  10 #include "base/callback.h"
  11 #include "base/compiler_specific.h"
  12 #include "base/location.h"
  13 #include "base/logging.h"
  14 #include "base/metrics/histogram.h"
  15 #include "base/single_thread_task_runner.h"
  16 #include "base/strings/string_util.h"
  17 #include "base/thread_task_runner_handle.h"
  18 #include "chrome/common/safe_browsing/csd.pb.h"
  19 #include "chrome/common/url_constants.h"
  20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
  21 #include "chrome/renderer/safe_browsing/features.h"
  22 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
  23 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
  24 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
  25 #include "chrome/renderer/safe_browsing/scorer.h"
  26 #include "content/public/renderer/render_view.h"
  27 #include "crypto/sha2.h"
  28 #include "third_party/WebKit/public/platform/WebURL.h"
  29 #include "third_party/WebKit/public/platform/WebURLRequest.h"
  30 #include "third_party/WebKit/public/web/WebDataSource.h"
  31 #include "third_party/WebKit/public/web/WebDocument.h"
  32 #include "third_party/WebKit/public/web/WebFrame.h"
  33 #include "third_party/WebKit/public/web/WebView.h"
  34 #include "url/gurl.h"
  35
  36 namespace safe_browsing {
  37
  38 const float PhishingClassifier::kInvalidScore = -1.0;
  39 const float PhishingClassifier::kPhishyThreshold = 0.5;
  40
// Creates a classifier for |render_view|.  |clock| is later handed to the DOM
// and term feature extractors built in set_phishing_scorer().  No scorer is
// installed at construction, so is_ready() returns false until
// set_phishing_scorer() is called with a non-NULL scorer.
PhishingClassifier::PhishingClassifier(content::RenderView* render_view,
                                       FeatureExtractorClock* clock)
    : render_view_(render_view),
      scorer_(NULL),
      clock_(clock),
      weak_factory_(this) {
  // Put the per-classification members (page text, callback, feature map,
  // shingle hashes) into their "nothing pending" state.
  Clear();
}
  49
// Destroys the classifier.  No classification may be in flight at this point.
PhishingClassifier::~PhishingClassifier() {
  // The RenderView should have called CancelPendingClassification() before
  // we are destroyed.  CheckNoPendingClassification() DCHECKs, and otherwise
  // logs and records a histogram sample, if that expectation was violated.
  CheckNoPendingClassification();
}
  55
  56 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
  57   CheckNoPendingClassification();
  58   scorer_ = scorer;
  59   if (scorer_) {
  60     url_extractor_.reset(new PhishingUrlFeatureExtractor);
  61     dom_extractor_.reset(
  62         new PhishingDOMFeatureExtractor(render_view_, clock_.get()));
  63     term_extractor_.reset(new PhishingTermFeatureExtractor(
  64         &scorer_->page_terms(),
  65         &scorer_->page_words(),
  66         scorer_->max_words_per_term(),
  67         scorer_->murmurhash3_seed(),
  68         scorer_->max_shingles_per_page(),
  69         scorer_->shingle_size(),
  70         clock_.get()));
  71   } else {
  72     // We're disabling client-side phishing detection, so tear down all
  73     // of the relevant objects.
  74     url_extractor_.reset();
  75     dom_extractor_.reset();
  76     term_extractor_.reset();
  77   }
  78 }
  79
  80 bool PhishingClassifier::is_ready() const {
  81   return scorer_ != NULL;
  82 }
  83
  84 void PhishingClassifier::BeginClassification(
  85     const base::string16* page_text,
  86     const DoneCallback& done_callback) {
  87   DCHECK(is_ready());
  88
  89   // The RenderView should have called CancelPendingClassification() before
  90   // starting a new classification, so DCHECK this.
  91   CheckNoPendingClassification();
  92   // However, in an opt build, we will go ahead and clean up the pending
  93   // classification so that we can start in a known state.
  94   CancelPendingClassification();
  95
  96   page_text_ = page_text;
  97   done_callback_ = done_callback;
  98
  99   // For consistency, we always want to invoke the DoneCallback
 100   // asynchronously, rather than directly from this method.  To ensure that
 101   // this is the case, post a task to begin feature extraction on the next
 102   // iteration of the message loop.
 103   base::ThreadTaskRunnerHandle::Get()->PostTask(
 104       FROM_HERE, base::Bind(&PhishingClassifier::BeginFeatureExtraction,
 105                             weak_factory_.GetWeakPtr()));
 106 }
 107
 108 void PhishingClassifier::BeginFeatureExtraction() {
 109   blink::WebView* web_view = render_view_->GetWebView();
 110   if (!web_view) {
 111     RunFailureCallback();
 112     return;
 113   }
 114
 115   blink::WebFrame* frame = web_view->mainFrame();
 116   if (!frame) {
 117     RunFailureCallback();
 118     return;
 119   }
 120
 121   // Check whether the URL is one that we should classify.
 122   // Currently, we only classify http: URLs that are GET requests.
 123   GURL url(frame->document().url());
 124   if (!url.SchemeIs(url::kHttpScheme)) {
 125     RunFailureCallback();
 126     return;
 127   }
 128
 129   blink::WebDataSource* ds = frame->dataSource();
 130   if (!ds ||
 131       !base::EqualsASCII(base::StringPiece16(ds->request().httpMethod()),
 132                          "GET")) {
 133     RunFailureCallback();
 134     return;
 135   }
 136
 137   features_.reset(new FeatureMap);
 138   if (!url_extractor_->ExtractFeatures(url, features_.get())) {
 139     RunFailureCallback();
 140     return;
 141   }
 142
 143   // DOM feature extraction can take awhile, so it runs asynchronously
 144   // in several chunks of work and invokes the callback when finished.
 145   dom_extractor_->ExtractFeatures(
 146       features_.get(),
 147       base::Bind(&PhishingClassifier::DOMExtractionFinished,
 148                  base::Unretained(this)));
 149 }
 150
// Aborts any classification that is in progress and resets all
// per-classification state.  Requires is_ready(), because the feature
// extractors dereferenced below only exist while a scorer is installed.
void PhishingClassifier::CancelPendingClassification() {
  // Note that cancelling the feature extractors is simply a no-op if they
  // were not running.
  DCHECK(is_ready());
  dom_extractor_->CancelPendingExtraction();
  term_extractor_->CancelPendingExtraction();
  // Drops a BeginFeatureExtraction task that was posted by
  // BeginClassification() but has not run yet; it is bound to a weak pointer.
  weak_factory_.InvalidateWeakPtrs();
  Clear();
}
 160
 161 void PhishingClassifier::DOMExtractionFinished(bool success) {
 162   shingle_hashes_.reset(new std::set<uint32>);
 163   if (success) {
 164     // Term feature extraction can take awhile, so it runs asynchronously
 165     // in several chunks of work and invokes the callback when finished.
 166     term_extractor_->ExtractFeatures(
 167         page_text_,
 168         features_.get(),
 169         shingle_hashes_.get(),
 170         base::Bind(&PhishingClassifier::TermExtractionFinished,
 171                    base::Unretained(this)));
 172   } else {
 173     RunFailureCallback();
 174   }
 175 }
 176
 177 void PhishingClassifier::TermExtractionFinished(bool success) {
 178   if (success) {
 179     blink::WebView* web_view = render_view_->GetWebView();
 180     if (!web_view) {
 181       RunFailureCallback();
 182       return;
 183     }
 184     blink::WebFrame* main_frame = web_view->mainFrame();
 185     if (!main_frame) {
 186       RunFailureCallback();
 187       return;
 188     }
 189
 190     // Hash all of the features so that they match the model, then compute
 191     // the score.
 192     FeatureMap hashed_features;
 193     ClientPhishingRequest verdict;
 194     verdict.set_model_version(scorer_->model_version());
 195     verdict.set_url(main_frame->document().url().spec());
 196     for (base::hash_map<std::string, double>::const_iterator it =
 197              features_->features().begin();
 198          it != features_->features().end(); ++it) {
 199       DVLOG(2) << "Feature: " << it->first << " = " << it->second;
 200       bool result = hashed_features.AddRealFeature(
 201           crypto::SHA256HashString(it->first), it->second);
 202       DCHECK(result);
 203       ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
 204       feature->set_name(it->first);
 205       feature->set_value(it->second);
 206     }
 207     for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();
 208          it != shingle_hashes_->end(); ++it) {
 209       verdict.add_shingle_hashes(*it);
 210     }
 211     float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
 212     verdict.set_client_score(score);
 213     verdict.set_is_phishing(score >= kPhishyThreshold);
 214     RunCallback(verdict);
 215   } else {
 216     RunFailureCallback();
 217   }
 218 }
 219
 220 void PhishingClassifier::CheckNoPendingClassification() {
 221   DCHECK(done_callback_.is_null());
 222   DCHECK(!page_text_);
 223   if (!done_callback_.is_null() || page_text_) {
 224     LOG(ERROR) << "Classification in progress, missing call to "
 225                << "CancelPendingClassification";
 226     UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed",
 227                          1);
 228   }
 229 }
 230
 231 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
 232   done_callback_.Run(verdict);
 233   Clear();
 234 }
 235
 236 void PhishingClassifier::RunFailureCallback() {
 237   ClientPhishingRequest verdict;
 238   // In this case we're not guaranteed to have a valid URL.  Just set it
 239   // to the empty string to make sure we have a valid protocol buffer.
 240   verdict.set_url("");
 241   verdict.set_client_score(kInvalidScore);
 242   verdict.set_is_phishing(false);
 243   RunCallback(verdict);
 244 }
 245
 246 void PhishingClassifier::Clear() {
 247   page_text_ = NULL;
 248   done_callback_.Reset();
 249   features_.reset(NULL);
 250   shingle_hashes_.reset(NULL);
 251 }
 252
 253 }  // namespace safe_browsing