Pin Chrome's shortcut to the Win10 Start menu on install and OS upgrade.
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_classifier.cc
blob 91fa58915d879958da8d3a0a66394c5f008ddaf3
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
7 #include <string>
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/compiler_specific.h"
12 #include "base/location.h"
13 #include "base/logging.h"
14 #include "base/metrics/histogram.h"
15 #include "base/single_thread_task_runner.h"
16 #include "base/strings/string_util.h"
17 #include "base/thread_task_runner_handle.h"
18 #include "chrome/common/safe_browsing/csd.pb.h"
19 #include "chrome/common/url_constants.h"
20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
21 #include "chrome/renderer/safe_browsing/features.h"
22 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
23 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
24 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
25 #include "chrome/renderer/safe_browsing/scorer.h"
26 #include "content/public/renderer/render_view.h"
27 #include "crypto/sha2.h"
28 #include "third_party/WebKit/public/platform/WebURL.h"
29 #include "third_party/WebKit/public/platform/WebURLRequest.h"
30 #include "third_party/WebKit/public/web/WebDataSource.h"
31 #include "third_party/WebKit/public/web/WebDocument.h"
32 #include "third_party/WebKit/public/web/WebFrame.h"
33 #include "third_party/WebKit/public/web/WebView.h"
34 #include "url/gurl.h"
36 namespace safe_browsing {
38 const float PhishingClassifier::kInvalidScore = -1.0;
39 const float PhishingClassifier::kPhishyThreshold = 0.5;
41 PhishingClassifier::PhishingClassifier(content::RenderView* render_view,
42 FeatureExtractorClock* clock)
43 : render_view_(render_view),
44 scorer_(NULL),
45 clock_(clock),
46 weak_factory_(this) {
47 Clear();
50 PhishingClassifier::~PhishingClassifier() {
51 // The RenderView should have called CancelPendingClassification() before
52 // we are destroyed.
53 CheckNoPendingClassification();
56 void PhishingClassifier::set_phishing_scorer(const Scorer* scorer) {
57 CheckNoPendingClassification();
58 scorer_ = scorer;
59 if (scorer_) {
60 url_extractor_.reset(new PhishingUrlFeatureExtractor);
61 dom_extractor_.reset(
62 new PhishingDOMFeatureExtractor(render_view_, clock_.get()));
63 term_extractor_.reset(new PhishingTermFeatureExtractor(
64 &scorer_->page_terms(),
65 &scorer_->page_words(),
66 scorer_->max_words_per_term(),
67 scorer_->murmurhash3_seed(),
68 scorer_->max_shingles_per_page(),
69 scorer_->shingle_size(),
70 clock_.get()));
71 } else {
72 // We're disabling client-side phishing detection, so tear down all
73 // of the relevant objects.
74 url_extractor_.reset();
75 dom_extractor_.reset();
76 term_extractor_.reset();
80 bool PhishingClassifier::is_ready() const {
81 return scorer_ != NULL;
84 void PhishingClassifier::BeginClassification(
85 const base::string16* page_text,
86 const DoneCallback& done_callback) {
87 DCHECK(is_ready());
89 // The RenderView should have called CancelPendingClassification() before
90 // starting a new classification, so DCHECK this.
91 CheckNoPendingClassification();
92 // However, in an opt build, we will go ahead and clean up the pending
93 // classification so that we can start in a known state.
94 CancelPendingClassification();
96 page_text_ = page_text;
97 done_callback_ = done_callback;
99 // For consistency, we always want to invoke the DoneCallback
100 // asynchronously, rather than directly from this method. To ensure that
101 // this is the case, post a task to begin feature extraction on the next
102 // iteration of the message loop.
103 base::ThreadTaskRunnerHandle::Get()->PostTask(
104 FROM_HERE, base::Bind(&PhishingClassifier::BeginFeatureExtraction,
105 weak_factory_.GetWeakPtr()));
108 void PhishingClassifier::BeginFeatureExtraction() {
109 blink::WebView* web_view = render_view_->GetWebView();
110 if (!web_view) {
111 RunFailureCallback();
112 return;
115 blink::WebFrame* frame = web_view->mainFrame();
116 if (!frame) {
117 RunFailureCallback();
118 return;
121 // Check whether the URL is one that we should classify.
122 // Currently, we only classify http: URLs that are GET requests.
123 GURL url(frame->document().url());
124 if (!url.SchemeIs(url::kHttpScheme)) {
125 RunFailureCallback();
126 return;
129 blink::WebDataSource* ds = frame->dataSource();
130 if (!ds ||
131 !base::EqualsASCII(base::StringPiece16(ds->request().httpMethod()),
132 "GET")) {
133 RunFailureCallback();
134 return;
137 features_.reset(new FeatureMap);
138 if (!url_extractor_->ExtractFeatures(url, features_.get())) {
139 RunFailureCallback();
140 return;
143 // DOM feature extraction can take awhile, so it runs asynchronously
144 // in several chunks of work and invokes the callback when finished.
145 dom_extractor_->ExtractFeatures(
146 features_.get(),
147 base::Bind(&PhishingClassifier::DOMExtractionFinished,
148 base::Unretained(this)));
151 void PhishingClassifier::CancelPendingClassification() {
152 // Note that cancelling the feature extractors is simply a no-op if they
153 // were not running.
154 DCHECK(is_ready());
155 dom_extractor_->CancelPendingExtraction();
156 term_extractor_->CancelPendingExtraction();
157 weak_factory_.InvalidateWeakPtrs();
158 Clear();
161 void PhishingClassifier::DOMExtractionFinished(bool success) {
162 shingle_hashes_.reset(new std::set<uint32>);
163 if (success) {
164 // Term feature extraction can take awhile, so it runs asynchronously
165 // in several chunks of work and invokes the callback when finished.
166 term_extractor_->ExtractFeatures(
167 page_text_,
168 features_.get(),
169 shingle_hashes_.get(),
170 base::Bind(&PhishingClassifier::TermExtractionFinished,
171 base::Unretained(this)));
172 } else {
173 RunFailureCallback();
177 void PhishingClassifier::TermExtractionFinished(bool success) {
178 if (success) {
179 blink::WebView* web_view = render_view_->GetWebView();
180 if (!web_view) {
181 RunFailureCallback();
182 return;
184 blink::WebFrame* main_frame = web_view->mainFrame();
185 if (!main_frame) {
186 RunFailureCallback();
187 return;
190 // Hash all of the features so that they match the model, then compute
191 // the score.
192 FeatureMap hashed_features;
193 ClientPhishingRequest verdict;
194 verdict.set_model_version(scorer_->model_version());
195 verdict.set_url(main_frame->document().url().spec());
196 for (base::hash_map<std::string, double>::const_iterator it =
197 features_->features().begin();
198 it != features_->features().end(); ++it) {
199 DVLOG(2) << "Feature: " << it->first << " = " << it->second;
200 bool result = hashed_features.AddRealFeature(
201 crypto::SHA256HashString(it->first), it->second);
202 DCHECK(result);
203 ClientPhishingRequest::Feature* feature = verdict.add_feature_map();
204 feature->set_name(it->first);
205 feature->set_value(it->second);
207 for (std::set<uint32>::const_iterator it = shingle_hashes_->begin();
208 it != shingle_hashes_->end(); ++it) {
209 verdict.add_shingle_hashes(*it);
211 float score = static_cast<float>(scorer_->ComputeScore(hashed_features));
212 verdict.set_client_score(score);
213 verdict.set_is_phishing(score >= kPhishyThreshold);
214 RunCallback(verdict);
215 } else {
216 RunFailureCallback();
220 void PhishingClassifier::CheckNoPendingClassification() {
221 DCHECK(done_callback_.is_null());
222 DCHECK(!page_text_);
223 if (!done_callback_.is_null() || page_text_) {
224 LOG(ERROR) << "Classification in progress, missing call to "
225 << "CancelPendingClassification";
226 UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed",
231 void PhishingClassifier::RunCallback(const ClientPhishingRequest& verdict) {
232 done_callback_.Run(verdict);
233 Clear();
236 void PhishingClassifier::RunFailureCallback() {
237 ClientPhishingRequest verdict;
238 // In this case we're not guaranteed to have a valid URL. Just set it
239 // to the empty string to make sure we have a valid protocol buffer.
240 verdict.set_url("");
241 verdict.set_client_score(kInvalidScore);
242 verdict.set_is_phishing(false);
243 RunCallback(verdict);
246 void PhishingClassifier::Clear() {
247 page_text_ = NULL;
248 done_callback_.Reset();
249 features_.reset(NULL);
250 shingle_hashes_.reset(NULL);
253 } // namespace safe_browsing