chrome/renderer/safe_browsing/phishing_classifier_delegate.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/renderer/safe_browsing/phishing_classifier_delegate.h"
   6
   7 #include <set>
   8
   9 #include "base/bind.h"
  10 #include "base/callback.h"
  11 #include "base/lazy_instance.h"
  12 #include "base/logging.h"
  13 #include "base/metrics/histogram.h"
  14 #include "chrome/common/safe_browsing/csd.pb.h"
  15 #include "chrome/common/safe_browsing/safebrowsing_messages.h"
  16 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
  17 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
  18 #include "chrome/renderer/safe_browsing/scorer.h"
  19 #include "content/public/renderer/document_state.h"
  20 #include "content/public/renderer/navigation_state.h"
  21 #include "content/public/renderer/render_thread.h"
  22 #include "content/public/renderer/render_view.h"
  23 #include "third_party/WebKit/public/platform/WebURL.h"
  24 #include "third_party/WebKit/public/web/WebDocument.h"
  25 #include "third_party/WebKit/public/web/WebLocalFrame.h"
  26 #include "third_party/WebKit/public/web/WebView.h"
  27
  28 using content::DocumentState;
  29 using content::NavigationState;
  30 using content::RenderThread;
  31
  32 namespace safe_browsing {
  33
  34 static GURL StripRef(const GURL& url) {
  35   GURL::Replacements replacements;
  36   replacements.ClearRef();
  37   return url.ReplaceComponents(replacements);
  38 }
  39
  40 typedef std::set<PhishingClassifierDelegate*> PhishingClassifierDelegates;
  41 static base::LazyInstance<PhishingClassifierDelegates>
  42     g_delegates = LAZY_INSTANCE_INITIALIZER;
  43
  44 static base::LazyInstance<scoped_ptr<const safe_browsing::Scorer> >
  45     g_phishing_scorer = LAZY_INSTANCE_INITIALIZER;
  46
  47 // static
  48 PhishingClassifierFilter* PhishingClassifierFilter::Create() {
  49   // Private constructor and public static Create() method to facilitate
  50   // stubbing out this class for binary-size reduction purposes.
  51   return new PhishingClassifierFilter();
  52 }
  53
  54 PhishingClassifierFilter::PhishingClassifierFilter()
  55     : RenderProcessObserver() {}
  56
  57 PhishingClassifierFilter::~PhishingClassifierFilter() {}
  58
  59 bool PhishingClassifierFilter::OnControlMessageReceived(
  60     const IPC::Message& message) {
  61   bool handled = true;
  62   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierFilter, message)
  63     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_SetPhishingModel, OnSetPhishingModel)
  64     IPC_MESSAGE_UNHANDLED(handled = false)
  65   IPC_END_MESSAGE_MAP()
  66   return handled;
  67 }
  68
  69 void PhishingClassifierFilter::OnSetPhishingModel(const std::string& model) {
  70   safe_browsing::Scorer* scorer = NULL;
  71   // An empty model string means we should disable client-side phishing
  72   // detection.
  73   if (!model.empty()) {
  74     scorer = safe_browsing::Scorer::Create(model);
  75     if (!scorer) {
  76       DLOG(ERROR) << "Unable to create a PhishingScorer - corrupt model?";
  77       return;
  78     }
  79   }
  80   PhishingClassifierDelegates::iterator i;
  81   for (i = g_delegates.Get().begin(); i != g_delegates.Get().end(); ++i) {
  82     (*i)->SetPhishingScorer(scorer);
  83   }
  84   g_phishing_scorer.Get().reset(scorer);
  85 }
  86
  87 // static
  88 PhishingClassifierDelegate* PhishingClassifierDelegate::Create(
  89     content::RenderView* render_view, PhishingClassifier* classifier) {
  90   // Private constructor and public static Create() method to facilitate
  91   // stubbing out this class for binary-size reduction purposes.
  92   return new PhishingClassifierDelegate(render_view, classifier);
  93 }
  94
  95 PhishingClassifierDelegate::PhishingClassifierDelegate(
  96     content::RenderView* render_view,
  97     PhishingClassifier* classifier)
  98     : content::RenderViewObserver(render_view),
  99       last_main_frame_transition_(ui::PAGE_TRANSITION_LINK),
 100       have_page_text_(false),
 101       is_classifying_(false) {
 102   g_delegates.Get().insert(this);
 103   if (!classifier) {
 104     classifier = new PhishingClassifier(render_view,
 105                                         new FeatureExtractorClock());
 106   }
 107
 108   classifier_.reset(classifier);
 109
 110   if (g_phishing_scorer.Get().get())
 111     SetPhishingScorer(g_phishing_scorer.Get().get());
 112 }
 113
 114 PhishingClassifierDelegate::~PhishingClassifierDelegate() {
 115   CancelPendingClassification(SHUTDOWN);
 116   g_delegates.Get().erase(this);
 117 }
 118
 119 void PhishingClassifierDelegate::SetPhishingScorer(
 120     const safe_browsing::Scorer* scorer) {
 121   if (!render_view()->GetWebView())
 122     return;  // RenderView is tearing down.
 123   if (is_classifying_) {
 124     // If there is a classification going on right now it means we're
 125     // actually replacing an existing scorer with a new model.  In
 126     // this case we simply cancel the current classification.
 127     // TODO(noelutz): if this happens too frequently we could also
 128     // replace the old scorer with the new one once classification is done
 129     // but this would complicate the code somewhat.
 130     CancelPendingClassification(NEW_PHISHING_SCORER);
 131   }
 132   classifier_->set_phishing_scorer(scorer);
 133   // Start classifying the current page if all conditions are met.
 134   // See MaybeStartClassification() for details.
 135   MaybeStartClassification();
 136 }
 137
 138 void PhishingClassifierDelegate::OnStartPhishingDetection(const GURL& url) {
 139   last_url_received_from_browser_ = StripRef(url);
 140   // Start classifying the current page if all conditions are met.
 141   // See MaybeStartClassification() for details.
 142   MaybeStartClassification();
 143 }
 144
 145 void PhishingClassifierDelegate::DidCommitProvisionalLoad(
 146     blink::WebLocalFrame* frame, bool is_new_navigation) {
 147   // A new page is starting to load, so cancel classificaiton.
 148   //
 149   // TODO(bryner): We shouldn't need to cancel classification if the navigation
 150   // is within the same page.  However, if we let classification continue in
 151   // this case, we need to properly deal with the fact that PageCaptured will
 152   // be called again for the in-page navigation.  We need to be sure not to
 153   // swap out the page text while the term feature extractor is still running.
 154   DocumentState* document_state = DocumentState::FromDataSource(
 155       frame->dataSource());
 156   NavigationState* navigation_state = document_state->navigation_state();
 157   CancelPendingClassification(navigation_state->WasWithinSamePage()
 158                                   ? NAVIGATE_WITHIN_PAGE
 159                                   : NAVIGATE_AWAY);
 160   if (frame == render_view()->GetWebView()->mainFrame()) {
 161     last_main_frame_transition_ = navigation_state->GetTransitionType();
 162   }
 163 }
 164
 165 void PhishingClassifierDelegate::PageCaptured(base::string16* page_text,
 166                                               bool preliminary_capture) {
 167   if (preliminary_capture) {
 168     return;
 169   }
 170   // Make sure there's no classification in progress.  We don't want to swap
 171   // out the page text string from underneath the term feature extractor.
 172   //
 173   // Note: Currently, if the url hasn't changed, we won't restart
 174   // classification in this case.  We may want to adjust this.
 175   CancelPendingClassification(PAGE_RECAPTURED);
 176   last_finished_load_url_ = GetToplevelUrl();
 177   classifier_page_text_.swap(*page_text);
 178   have_page_text_ = true;
 179   MaybeStartClassification();
 180 }
 181
 182 void PhishingClassifierDelegate::CancelPendingClassification(
 183     CancelClassificationReason reason) {
 184   if (is_classifying_) {
 185     UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.CancelClassificationReason",
 186                               reason,
 187                               CANCEL_CLASSIFICATION_MAX);
 188     is_classifying_ = false;
 189   }
 190   if (classifier_->is_ready()) {
 191     classifier_->CancelPendingClassification();
 192   }
 193   classifier_page_text_.clear();
 194   have_page_text_ = false;
 195 }
 196
 197 bool PhishingClassifierDelegate::OnMessageReceived(
 198     const IPC::Message& message) {
 199   bool handled = true;
 200   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierDelegate, message)
 201     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_StartPhishingDetection,
 202                         OnStartPhishingDetection)
 203     IPC_MESSAGE_UNHANDLED(handled = false)
 204   IPC_END_MESSAGE_MAP()
 205   return handled;
 206 }
 207
 208 void PhishingClassifierDelegate::ClassificationDone(
 209     const ClientPhishingRequest& verdict) {
 210   // We no longer need the page text.
 211   classifier_page_text_.clear();
 212   DVLOG(2) << "Phishy verdict = " << verdict.is_phishing()
 213            << " score = " << verdict.client_score();
 214   if (verdict.client_score() != PhishingClassifier::kInvalidScore) {
 215     DCHECK_EQ(last_url_sent_to_classifier_.spec(), verdict.url());
 216     RenderThread::Get()->Send(new SafeBrowsingHostMsg_PhishingDetectionDone(
 217         routing_id(), verdict.SerializeAsString()));
 218   }
 219 }
 220
 221 GURL PhishingClassifierDelegate::GetToplevelUrl() {
 222   return render_view()->GetWebView()->mainFrame()->document().url();
 223 }
 224
 225 void PhishingClassifierDelegate::MaybeStartClassification() {
 226   // We can begin phishing classification when the following conditions are
 227   // met:
 228   //  1. A Scorer has been created
 229   //  2. The browser has sent a StartPhishingDetection message for the current
 230   //     toplevel URL.
 231   //  3. The page has finished loading and the page text has been extracted.
 232   //  4. The load is a new navigation (not a session history navigation).
 233   //  5. The toplevel URL has not already been classified.
 234   //
 235   // Note that if we determine that this particular navigation should not be
 236   // classified at all (as opposed to deferring it until we get an IPC or the
 237   // load completes), we discard the page text since it won't be needed.
 238   if (!classifier_->is_ready()) {
 239     DVLOG(2) << "Not starting classification, no Scorer created.";
 240     // Keep classifier_page_text_, in case a Scorer is set later.
 241     return;
 242   }
 243
 244   if (last_main_frame_transition_ & ui::PAGE_TRANSITION_FORWARD_BACK) {
 245     // Skip loads from session history navigation.  However, update the
 246     // last URL sent to the classifier, so that we'll properly detect
 247     // in-page navigations.
 248     DVLOG(2) << "Not starting classification for back/forward navigation";
 249     last_url_sent_to_classifier_ = last_finished_load_url_;
 250     classifier_page_text_.clear();  // we won't need this.
 251     have_page_text_ = false;
 252     return;
 253   }
 254
 255   GURL stripped_last_load_url(StripRef(last_finished_load_url_));
 256   if (stripped_last_load_url == StripRef(last_url_sent_to_classifier_)) {
 257     // We've already classified this toplevel URL, so this was likely an
 258     // in-page navigation or a subframe navigation.  The browser should not
 259     // send a StartPhishingDetection IPC in this case.
 260     DVLOG(2) << "Toplevel URL is unchanged, not starting classification.";
 261     classifier_page_text_.clear();  // we won't need this.
 262     have_page_text_ = false;
 263     return;
 264   }
 265
 266   if (!have_page_text_) {
 267     DVLOG(2) << "Not starting classification, there is no page text ready.";
 268     return;
 269   }
 270
 271   if (last_url_received_from_browser_ != stripped_last_load_url) {
 272     // The browser has not yet confirmed that this URL should be classified,
 273     // so defer classification for now.  Note: the ref does not affect
 274     // any of the browser's preclassification checks, so we don't require it
 275     // to match.
 276     DVLOG(2) << "Not starting classification, last url from browser is "
 277              << last_url_received_from_browser_ << ", last finished load is "
 278              << last_finished_load_url_;
 279     // Keep classifier_page_text_, in case the browser notifies us later that
 280     // we should classify the URL.
 281     return;
 282   }
 283
 284   DVLOG(2) << "Starting classification for " << last_finished_load_url_;
 285   last_url_sent_to_classifier_ = last_finished_load_url_;
 286   is_classifying_ = true;
 287   classifier_->BeginClassification(
 288       &classifier_page_text_,
 289       base::Bind(&PhishingClassifierDelegate::ClassificationDone,
 290                  base::Unretained(this)));
 291 }
 292
 293 }  // namespace safe_browsing