1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
10 #include "base/callback.h"
11 #include "base/compiler_specific.h"
12 #include "base/location.h"
13 #include "base/logging.h"
14 #include "base/metrics/histogram.h"
15 #include "base/single_thread_task_runner.h"
16 #include "base/strings/string_util.h"
17 #include "base/thread_task_runner_handle.h"
18 #include "chrome/common/safe_browsing/csd.pb.h"
19 #include "chrome/common/url_constants.h"
20 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
21 #include "chrome/renderer/safe_browsing/features.h"
22 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
23 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
24 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
25 #include "chrome/renderer/safe_browsing/scorer.h"
26 #include "content/public/renderer/render_view.h"
27 #include "crypto/sha2.h"
28 #include "third_party/WebKit/public/platform/WebURL.h"
29 #include "third_party/WebKit/public/platform/WebURLRequest.h"
30 #include "third_party/WebKit/public/web/WebDataSource.h"
31 #include "third_party/WebKit/public/web/WebDocument.h"
32 #include "third_party/WebKit/public/web/WebFrame.h"
33 #include "third_party/WebKit/public/web/WebView.h"
36 namespace safe_browsing
{
// Out-of-class definitions of the classifier's score constants.
// kInvalidScore is the client score written into the verdict by
// RunFailureCallback(). kPhishyThreshold is the cutoff used in
// TermExtractionFinished(): a verdict is marked phishing when
// score >= kPhishyThreshold.
38 const float PhishingClassifier::kInvalidScore
= -1.0;
39 const float PhishingClassifier::kPhishyThreshold
= 0.5;
// Constructs a classifier bound to |render_view|, which is stored in
// render_view_. A FeatureExtractorClock pointer |clock| is also accepted.
// NOTE(review): the rest of the initializer list and the constructor body
// are not visible in this listing, so how |clock| and the other members are
// initialized cannot be confirmed from here.
41 PhishingClassifier::PhishingClassifier(content::RenderView
* render_view
,
42 FeatureExtractorClock
* clock
)
43 : render_view_(render_view
),
// Destructor. Verifies (via CheckNoPendingClassification()) that no
// classification is still in flight when the object goes away.
50 PhishingClassifier::~PhishingClassifier() {
51 // The RenderView should have called CancelPendingClassification() before
// this destructor runs, so check that nothing is still pending.
53 CheckNoPendingClassification();
// Installs (or removes) the Scorer used for classification and rebuilds the
// feature extractors accordingly.
//
// Two paths are visible below:
//   * (re)creating the URL, DOM and term extractors, with the term extractor
//     configured from scorer_ (page terms, page words, max words per term,
//     murmurhash3 seed, max shingles per page, shingle size);
//   * tearing all three extractors down when detection is being disabled.
//
// NOTE(review): the assignment of |scorer| to scorer_ and the condition that
// selects between the two paths are not visible in this listing; presumably
// a null |scorer| selects the tear-down path -- confirm against the full
// file. The final argument(s) of the PhishingTermFeatureExtractor
// constructor call are also cut off here.
56 void PhishingClassifier::set_phishing_scorer(const Scorer
* scorer
) {
// Changing the scorer while a classification is in flight is not expected.
57 CheckNoPendingClassification();
60 url_extractor_
.reset(new PhishingUrlFeatureExtractor
);
61 dom_extractor_
.reset(new PhishingDOMFeatureExtractor(clock_
.get()));
62 term_extractor_
.reset(new PhishingTermFeatureExtractor(
63 &scorer_
->page_terms(),
64 &scorer_
->page_words(),
65 scorer_
->max_words_per_term(),
66 scorer_
->murmurhash3_seed(),
67 scorer_
->max_shingles_per_page(),
68 scorer_
->shingle_size(),
71 // We're disabling client-side phishing detection, so tear down all
72 // of the relevant objects.
73 url_extractor_
.reset();
74 dom_extractor_
.reset();
75 term_extractor_
.reset();
// Returns true when a scorer has been installed, i.e. scorer_ is non-null.
79 bool PhishingClassifier::is_ready() const {
80 return scorer_
!= NULL
;
// Starts classifying the current page.
//
// |page_text| is stored as a raw pointer in page_text_ (no copy is made
// here), and |done_callback| is stored in done_callback_ to be run with the
// resulting verdict. The actual work is not started synchronously: a task
// bound to a weak pointer of this object is posted to the current thread's
// task runner and runs BeginFeatureExtraction() later.
// NOTE(review): because only the pointer is kept, |page_text| presumably
// must outlive the classification -- confirm against the header's contract.
83 void PhishingClassifier::BeginClassification(
84 const base::string16
* page_text
,
85 const DoneCallback
& done_callback
) {
88 // The RenderView should have called CancelPendingClassification() before
89 // starting a new classification, so DCHECK this.
90 CheckNoPendingClassification();
91 // However, in an opt build, we will go ahead and clean up the pending
92 // classification so that we can start in a known state.
93 CancelPendingClassification();
95 page_text_
= page_text
;
96 done_callback_
= done_callback
;
98 // For consistency, we always want to invoke the DoneCallback
99 // asynchronously, rather than directly from this method. To ensure that
100 // this is the case, post a task to begin feature extraction on the next
101 // iteration of the message loop.
// The task is bound to a WeakPtr, so InvalidateWeakPtrs() in
// CancelPendingClassification() prevents it from running after a cancel.
102 base::ThreadTaskRunnerHandle::Get()->PostTask(
103 FROM_HERE
, base::Bind(&PhishingClassifier::BeginFeatureExtraction
,
104 weak_factory_
.GetWeakPtr()));
// First stage of classification, run from the task posted by
// BeginClassification().
//
// Visible steps: look up the WebView and its main frame, check the document
// URL's scheme (http only), inspect the data source's request method, create
// a fresh FeatureMap in features_, run URL feature extraction synchronously,
// and finally kick off asynchronous DOM feature extraction whose completion
// is reported to DOMExtractionFinished().
//
// NOTE(review): the guard conditions around most RunFailureCallback() calls
// (and the early returns that presumably follow them) are not visible in
// this listing; presumably each call is the failure branch of a null/validity
// check on the value obtained just before it -- confirm against the full
// file.
107 void PhishingClassifier::BeginFeatureExtraction() {
108 blink::WebView
* web_view
= render_view_
->GetWebView();
110 RunFailureCallback();
114 blink::WebFrame
* frame
= web_view
->mainFrame();
116 RunFailureCallback();
120 // Check whether the URL is one that we should classify.
121 // Currently, we only classify http: URLs that are GET requests.
122 GURL
url(frame
->document().url());
123 if (!url
.SchemeIs(url::kHttpScheme
)) {
124 RunFailureCallback();
128 blink::WebDataSource
* ds
= frame
->dataSource();
// The request's HTTP method is compared as ASCII; the string it is compared
// against is not visible in this listing (the comment above says GET).
130 !base::EqualsASCII(base::StringPiece16(ds
->request().httpMethod()),
132 RunFailureCallback();
136 features_
.reset(new FeatureMap
);
137 if (!url_extractor_
->ExtractFeatures(url
, features_
.get())) {
138 RunFailureCallback();
142 // DOM feature extraction can take awhile, so it runs asynchronously
143 // in several chunks of work and invokes the callback when finished.
// The completion callback is bound with base::Unretained(this), i.e. it
// holds a raw pointer to this object rather than a weak pointer.
// NOTE(review): presumably safe because dom_extractor_ is held by this
// object and is cancelled in CancelPendingClassification() -- confirm.
144 dom_extractor_
->ExtractFeatures(
145 frame
->document(), features_
.get(),
146 base::Bind(&PhishingClassifier::DOMExtractionFinished
,
147 base::Unretained(this)));
// Cancels any classification in progress: asks the DOM and term extractors
// to cancel pending extraction and invalidates outstanding weak pointers, so
// a BeginFeatureExtraction() task posted by BeginClassification() will not
// run.
// NOTE(review): dom_extractor_ and term_extractor_ are dereferenced without
// a visible null check, and set_phishing_scorer() can reset them; any guard
// or precondition is not visible in this listing -- confirm against the full
// file. Any trailing state reset is likewise not visible here.
150 void PhishingClassifier::CancelPendingClassification() {
151 // Note that cancelling the feature extractors is simply a no-op if they
// (the remainder of the comment above is cut off in this listing).
154 dom_extractor_
->CancelPendingExtraction();
155 term_extractor_
->CancelPendingExtraction();
156 weak_factory_
.InvalidateWeakPtrs();
// Completion callback for DOM feature extraction (bound in
// BeginFeatureExtraction()). Allocates a fresh, empty set for the shingle
// hashes, then either starts asynchronous term feature extraction -- whose
// completion is reported to TermExtractionFinished() -- or reports failure
// via RunFailureCallback().
// NOTE(review): the branch on |success| that selects between those two
// outcomes, and the leading arguments of the term ExtractFeatures() call,
// are not visible in this listing -- confirm against the full file.
160 void PhishingClassifier::DOMExtractionFinished(bool success
) {
161 shingle_hashes_
.reset(new std::set
<uint32
>);
163 // Term feature extraction can take awhile, so it runs asynchronously
164 // in several chunks of work and invokes the callback when finished.
// As with DOM extraction, the callback holds a raw (Unretained) pointer to
// this object.
165 term_extractor_
->ExtractFeatures(
168 shingle_hashes_
.get(),
169 base::Bind(&PhishingClassifier::TermExtractionFinished
,
170 base::Unretained(this)));
172 RunFailureCallback();
// Completion callback for term feature extraction (bound in
// DOMExtractionFinished()); final stage of classification.
//
// The visible verdict-building path:
//   1. re-fetches the WebView and its main frame;
//   2. fills a ClientPhishingRequest with the scorer's model version and the
//      main frame's document URL;
//   3. for every extracted feature, adds it to a local FeatureMap keyed by
//      the SHA-256 hash of its name (the form passed to the scorer) and also
//      records the unhashed name/value pair in the verdict;
//   4. copies every shingle hash into the verdict;
//   5. computes the score, stores it, sets is_phishing to
//      (score >= kPhishyThreshold), and passes the verdict to RunCallback().
//
// NOTE(review): the conditions guarding the RunFailureCallback() calls
// (presumably null checks on web_view / main_frame and the !success case)
// and the closing braces of the loops are not visible in this listing --
// confirm against the full file.
176 void PhishingClassifier::TermExtractionFinished(bool success
) {
178 blink::WebView
* web_view
= render_view_
->GetWebView();
180 RunFailureCallback();
183 blink::WebFrame
* main_frame
= web_view
->mainFrame();
185 RunFailureCallback();
189 // Hash all of the features so that they match the model, then compute
// the score from the hashed features (see ComputeScore() below).
191 FeatureMap hashed_features
;
192 ClientPhishingRequest verdict
;
193 verdict
.set_model_version(scorer_
->model_version());
194 verdict
.set_url(main_frame
->document().url().spec());
195 for (base::hash_map
<std::string
, double>::const_iterator it
=
196 features_
->features().begin();
197 it
!= features_
->features().end(); ++it
) {
198 DVLOG(2) << "Feature: " << it
->first
<< " = " << it
->second
;
// NOTE(review): no use of |result| is visible in this listing; presumably it
// is checked on a line that is missing here -- confirm.
199 bool result
= hashed_features
.AddRealFeature(
200 crypto::SHA256HashString(it
->first
), it
->second
);
// The verdict carries the original (unhashed) feature name and its value.
202 ClientPhishingRequest::Feature
* feature
= verdict
.add_feature_map();
203 feature
->set_name(it
->first
);
204 feature
->set_value(it
->second
);
206 for (std::set
<uint32
>::const_iterator it
= shingle_hashes_
->begin();
207 it
!= shingle_hashes_
->end(); ++it
) {
208 verdict
.add_shingle_hashes(*it
);
// ComputeScore()'s result is narrowed to float before being stored and
// compared against the threshold.
210 float score
= static_cast<float>(scorer_
->ComputeScore(hashed_features
));
211 verdict
.set_client_score(score
);
212 verdict
.set_is_phishing(score
>= kPhishyThreshold
);
213 RunCallback(verdict
);
215 RunFailureCallback();
// Sanity check that no classification is currently in progress.
// DCHECKs that done_callback_ is null. In addition, whenever done_callback_
// is set or page_text_ is non-null, it logs an error and records the
// "SBClientPhishing.CheckNoPendingClassificationFailed" histogram, so the
// condition is also reported where DCHECK is compiled out.
// NOTE(review): the remaining argument(s) of the histogram macro are cut off
// in this listing.
219 void PhishingClassifier::CheckNoPendingClassification() {
220 DCHECK(done_callback_
.is_null());
222 if (!done_callback_
.is_null() || page_text_
) {
223 LOG(ERROR
) << "Classification in progress, missing call to "
224 << "CancelPendingClassification";
225 UMA_HISTOGRAM_COUNTS("SBClientPhishing.CheckNoPendingClassificationFailed",
// Delivers |verdict| to the stored done_callback_.
// NOTE(review): any state reset after the callback runs is not visible in
// this listing -- confirm against the full file.
230 void PhishingClassifier::RunCallback(const ClientPhishingRequest
& verdict
) {
231 done_callback_
.Run(verdict
);
// Reports a failed classification: builds a verdict whose client score is
// kInvalidScore and whose is_phishing flag is false, then delivers it via
// RunCallback().
235 void PhishingClassifier::RunFailureCallback() {
236 ClientPhishingRequest verdict
;
237 // In this case we're not guaranteed to have a valid URL. Just set it
238 // to the empty string to make sure we have a valid protocol buffer.
// NOTE(review): the statement that sets the URL is not visible in this
// listing.
240 verdict
.set_client_score(kInvalidScore
);
241 verdict
.set_is_phishing(false);
242 RunCallback(verdict
);
// Drops per-classification state: resets done_callback_ and releases the
// objects held by features_ and shingle_hashes_.
// NOTE(review): CheckNoPendingClassification() also inspects page_text_;
// whether it is cleared here is not visible in this listing -- confirm.
245 void PhishingClassifier::Clear() {
247 done_callback_
.Reset();
248 features_
.reset(NULL
);
249 shingle_hashes_
.reset(NULL
);
252 } // namespace safe_browsing