Added documentation to web_view.js/web_view_experimental.js regarding the webview...
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_classifier.cc
blobf48e6c138c74e444e79c58d8edb32809d2b6daa7
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
7 #include <string>
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/compiler_specific.h"
12 #include "base/logging.h"
13 #include "base/message_loop/message_loop.h"
14 #include "base/metrics/histogram.h"
15 #include "base/strings/string_util.h"
16 #include "chrome/common/safe_browsing/csd.pb.h"
17 #include "chrome/common/url_constants.h"
18 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
19 #include "chrome/renderer/safe_browsing/features.h"
20 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
21 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
22 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
23 #include "chrome/renderer/safe_browsing/scorer.h"
24 #include "content/public/renderer/render_view.h"
25 #include "crypto/sha2.h"
26 #include "third_party/WebKit/public/platform/WebURL.h"
27 #include "third_party/WebKit/public/platform/WebURLRequest.h"
28 #include "third_party/WebKit/public/web/WebDataSource.h"
29 #include "third_party/WebKit/public/web/WebDocument.h"
30 #include "third_party/WebKit/public/web/WebFrame.h"
31 #include "third_party/WebKit/public/web/WebView.h"
32 #include "url/gurl.h"
34 namespace safe_browsing {
36 const float PhishingClassifier::kInvalidScore = -1.0;
37 const float PhishingClassifier::kPhishyThreshold = 0.5;
39 PhishingClassifier::PhishingClassifier(content::RenderView* render_view,
40 FeatureExtractorClock* clock)
41 : render_view_(render_view),
42 scorer_(NULL),
43 clock_(clock),
44 weak_factory_(this) {
45 Clear();
48 PhishingClassifier::~PhishingClassifier() {
49 // The RenderView should have called CancelPendingClassification() before
50 // we are destroyed.
51 CheckNoPendingClassification();
54 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
55 CheckNoPendingClassification();
56 scorer_ = scorer;
57 if (scorer_) {
58 url_extractor_.reset(new PhishingUrlFeatureExtractor);
59 dom_extractor_.reset(
60 new PhishingDOMFeatureExtractor(render_view_, clock_.get()));
61 term_extractor_.reset(new PhishingTermFeatureExtractor(
62 &scorer_->page_terms(),
63 &scorer_->page_words(),
64 scorer_->max_words_per_term(),
65 scorer_->murmurhash3_seed(),
66 clock_.get()));
67 } else {
68 // We're disabling client-side phishing detection, so tear down all
69 // of the relevant objects.
70 url_extractor_.reset();
71 dom_extractor_.reset();
72 term_extractor_.reset();
76 bool PhishingClassifier::is_ready() const {
77 return scorer_ != NULL;
80 void PhishingClassifier::BeginClassification(
81 const base::string16* page_text,
82 const DoneCallback& done_callback) {
83 DCHECK(is_ready());
85 // The RenderView should have called CancelPendingClassification() before
86 // starting a new classification, so DCHECK this.
87 CheckNoPendingClassification();
88 // However, in an opt build, we will go ahead and clean up the pending
89 // classification so that we can start in a known state.
90 CancelPendingClassification();
92 page_text_ = page_text;
93 done_callback_ = done_callback;
95 // For consistency, we always want to invoke the DoneCallback
96 // asynchronously, rather than directly from this method. To ensure that
97 // this is the case, post a task to begin feature extraction on the next
98 // iteration of the message loop.
99 base::MessageLoop::current()->PostTask(
100 FROM_HERE,
101 base::Bind(&PhishingClassifier::BeginFeatureExtraction,
102 weak_factory_.GetWeakPtr()));
105 void PhishingClassifier::BeginFeatureExtraction() {
106 blink::WebView* web_view = render_view_->GetWebView();
107 if (!web_view) {
108 RunFailureCallback();
109 return;
112 blink::WebFrame* frame = web_view->mainFrame();
113 if (!frame) {
114 RunFailureCallback();
115 return;
118 // Check whether the URL is one that we should classify.
119 // Currently, we only classify http: URLs that are GET requests.
120 GURL url(frame->document().url());
121 if (!url.SchemeIs(content::kHttpScheme)) {
122 RunFailureCallback();
123 return;
126 blink::WebDataSource* ds = frame->dataSource();
127 if (!ds || !EqualsASCII(ds->request().httpMethod(), "GET")) {
128 RunFailureCallback();
129 return;
132 features_.reset(new FeatureMap);
133 if (!url_extractor_->ExtractFeatures(url, features_.get())) {
134 RunFailureCallback();
135 return;
138 // DOM feature extraction can take awhile, so it runs asynchronously
139 // in several chunks of work and invokes the callback when finished.
140 dom_extractor_->ExtractFeatures(
141 features_.get(),
142 base::Bind(&PhishingClassifier::DOMExtractionFinished,
143 base::Unretained(this)));
146 void PhishingClassifier::CancelPendingClassification() {
147 // Note that cancelling the feature extractors is simply a no-op if they
148 // were not running.
149 DCHECK(is_ready());
150 dom_extractor_->CancelPendingExtraction();
151 term_extractor_->CancelPendingExtraction();
152 weak_factory_.InvalidateWeakPtrs();
153 Clear();
156 void PhishingClassifier::DOMExtractionFinished(bool success) {
157 if (success) {
158 // Term feature extraction can take awhile, so it runs asynchronously
159 // in several chunks of work and invokes the callback when finished.
160 term_extractor_->ExtractFeatures(
161 page_text_,
162 features_.get(),
163 base::Bind(&PhishingClassifier::TermExtractionFinished,
164 base::Unretained(this)));
165 } else {
166 RunFailureCallback();
170 void PhishingClassifier::TermExtractionFinished(bool success) {
171 if (success) {
172 blink::WebView* web_view = render_view_->GetWebView();
173 if (!web_view) {
174 RunFailureCallback();
175 return;
177 blink::WebFrame* main_frame = web_view->mainFrame();
178 if (!main_frame) {
179 RunFailureCallback();
180 return;
183 // Hash all of the features so that they match the model, then compute
184 // the score.
185 FeatureMap hashed_features;
186 ClientPhishingRequest verdict;
187 verdict.set_model_version(scorer_->model_version());
188 verdict.set_url(main_frame->document().url().spec());
189 for (base::hash_map<std::string, double>::const_iterator it =
190 features_->features().begin();
191 it != features_->features().end(); ++it) {
192 VLOG(2) << "Feature: " << it->first << " = " << it->second;
193 bool result = hashed_features.AddRealFeature(
194 crypto::SHA256HashString(it->first), it->second);
195 DCHECK(result);
196 ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
197 feature->set_name(it->first);
198 feature->set_value(it->second);
200 float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
201 verdict.set_client_score(score);
202 verdict.set_is_phishing(score >= kPhishyThreshold);
203 RunCallback(verdict);
204 } else {
205 RunFailureCallback();
// Verifies that no classification is pending, i.e. that no done-callback and
// no page text are currently stored. A violation trips the DCHECKs below and
// is additionally logged and recorded in the
// SBClientPhishing.CheckNoPendingClassificationFailed histogram.
209 void PhishingClassifier::CheckNoPendingClassification() {
210 DCHECK(done_callback_.is_null());
211 DCHECK(!page_text_);
// The same condition is tested again with a plain |if| — presumably so the
// failure is still reported in builds where DCHECK does not fire.
212 if (!done_callback_.is_null() || page_text_) {
213 LOG(ERROR) << "Classification in progress, missing call to "
214 << "CancelPendingClassification";
// NOTE(review): the sample argument and closing of this histogram call are
// not visible in the text under review — confirm against the full file.
215 UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed",
220 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
221 done_callback_.Run(verdict);
222 Clear();
225 void PhishingClassifier::RunFailureCallback() {
226 ClientPhishingRequest verdict;
227 // In this case we're not guaranteed to have a valid URL. Just set it
228 // to the empty string to make sure we have a valid protocol buffer.
229 verdict.set_url("");
230 verdict.set_client_score(kInvalidScore);
231 verdict.set_is_phishing(false);
232 RunCallback(verdict);
235 void PhishingClassifier::Clear() {
236 page_text_ = NULL;
237 done_callback_.Reset();
238 features_.reset(NULL);
241 } // namespace safe_browsing