1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
8 #include "base/compiler_specific.h"
9 #include "base/containers/hash_tables.h"
10 #include "base/location.h"
11 #include "base/logging.h"
12 #include "base/metrics/histogram.h"
13 #include "base/single_thread_task_runner.h"
14 #include "base/strings/string_util.h"
15 #include "base/thread_task_runner_handle.h"
16 #include "base/time/time.h"
17 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
18 #include "chrome/renderer/safe_browsing/features.h"
19 #include "content/public/renderer/render_view.h"
20 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
21 #include "third_party/WebKit/public/platform/WebString.h"
22 #include "third_party/WebKit/public/web/WebElement.h"
23 #include "third_party/WebKit/public/web/WebElementCollection.h"
24 #include "third_party/WebKit/public/web/WebLocalFrame.h"
25 #include "third_party/WebKit/public/web/WebView.h"
27 namespace safe_browsing
{
29 // This time should be short enough that it doesn't noticeably disrupt the
30 // user's interaction with the page.
31 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs
= 10;
33 // Experimenting shows that we get a reasonable gain in performance by
34 // increasing this up to around 10, but there's not much benefit in
35 // increasing it past that.
36 const int PhishingDOMFeatureExtractor::kClockCheckGranularity
= 10;
38 // This should be longer than we expect feature extraction to take on any
39 // actual phishing page.
40 const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs
= 500;
42 // Intermediate state used for computing features. See features.h for
43 // descriptions of the DOM features that are computed.
44 struct PhishingDOMFeatureExtractor::PageFeatureState
{
45 // Link related features
47 base::hash_set
<std::string
> external_domains
;
51 // Form related features
57 int action_other_domain
;
59 base::hash_set
<std::string
> page_action_urls
;
61 // Image related features
65 // How many script tags
68 // The time at which we started feature extraction for the current page.
69 base::TimeTicks start_time
;
71 // The number of iterations we've done for the current extraction.
74 explicit PageFeatureState(base::TimeTicks start_time_ticks
)
83 action_other_domain(0),
88 start_time(start_time_ticks
),
91 ~PageFeatureState() {}
95 struct PhishingDOMFeatureExtractor::FrameData
{
96 // This is our reference to document.all, which is an iterator over all
97 // of the elements in the document. It keeps track of our current position.
98 blink::WebElementCollection elements
;
99 // The domain of the document URL, stored here so that we don't need to
100 // recompute it every time it's needed.
104 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
105 content::RenderView
* render_view
,
106 FeatureExtractorClock
* clock
)
107 : render_view_(render_view
),
109 weak_factory_(this) {
113 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
114 // The RenderView should have called CancelPendingExtraction() before
116 CheckNoPendingExtraction();
119 void PhishingDOMFeatureExtractor::ExtractFeatures(
120 FeatureMap
* features
,
121 const DoneCallback
& done_callback
) {
122 // The RenderView should have called CancelPendingExtraction() before
123 // starting a new extraction, so DCHECK this.
124 CheckNoPendingExtraction();
125 // However, in an opt build, we will go ahead and clean up the pending
126 // extraction so that we can start in a known state.
127 CancelPendingExtraction();
129 features_
= features
;
130 done_callback_
= done_callback
;
132 page_feature_state_
.reset(new PageFeatureState(clock_
->Now()));
133 blink::WebView
* web_view
= render_view_
->GetWebView();
134 if (web_view
&& web_view
->mainFrame()) {
135 cur_document_
= web_view
->mainFrame()->document();
138 base::ThreadTaskRunnerHandle::Get()->PostTask(
140 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout
,
141 weak_factory_
.GetWeakPtr()));
144 void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
145 // Cancel any pending callbacks, and clear our state.
146 weak_factory_
.InvalidateWeakPtrs();
150 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
151 DCHECK(page_feature_state_
.get());
152 ++page_feature_state_
->num_iterations
;
153 base::TimeTicks current_chunk_start_time
= clock_
->Now();
155 if (cur_document_
.isNull()) {
156 // This will only happen if we weren't able to get the document for the
157 // main frame. We'll treat this as an extraction failure.
162 int num_elements
= 0;
163 for (; !cur_document_
.isNull(); cur_document_
= GetNextDocument()) {
164 blink::WebElement cur_element
;
165 if (cur_frame_data_
.get()) {
166 // We're resuming traversal of a frame, so just advance to the next
168 cur_element
= cur_frame_data_
->elements
.nextItem();
169 // When we resume the traversal, the first call to nextItem() potentially
170 // has to walk through the document again from the beginning, if it was
171 // modified between our chunks of work. Log how long this takes, so we
172 // can tell if it's too slow.
173 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
174 clock_
->Now() - current_chunk_start_time
);
176 // We just moved to a new frame, so update our frame state
177 // and advance to the first element.
179 cur_element
= cur_frame_data_
->elements
.firstItem();
182 for (; !cur_element
.isNull();
183 cur_element
= cur_frame_data_
->elements
.nextItem()) {
184 if (cur_element
.hasHTMLTagName("a")) {
185 HandleLink(cur_element
);
186 } else if (cur_element
.hasHTMLTagName("form")) {
187 HandleForm(cur_element
);
188 } else if (cur_element
.hasHTMLTagName("img")) {
189 HandleImage(cur_element
);
190 } else if (cur_element
.hasHTMLTagName("input")) {
191 HandleInput(cur_element
);
192 } else if (cur_element
.hasHTMLTagName("script")) {
193 HandleScript(cur_element
);
196 if (++num_elements
>= kClockCheckGranularity
) {
198 base::TimeTicks now
= clock_
->Now();
199 if (now
- page_feature_state_
->start_time
>=
200 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs
)) {
201 DLOG(ERROR
) << "Feature extraction took too long, giving up";
202 // We expect this to happen infrequently, so record when it does.
203 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
207 base::TimeDelta chunk_elapsed
= now
- current_chunk_start_time
;
209 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs
)) {
210 // The time limit for the current chunk is up, so post a task to
211 // continue extraction.
213 // Record how much time we actually spent on the chunk. If this is
214 // much higher than kMaxTimePerChunkMs, we may need to adjust the
215 // clock granularity.
216 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
218 base::ThreadTaskRunnerHandle::Get()->PostTask(
221 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout
,
222 weak_factory_
.GetWeakPtr()));
225 // Otherwise, continue.
229 // We're done with this frame, recalculate the FrameData when we
230 // advance to the next frame.
231 cur_frame_data_
.reset();
238 void PhishingDOMFeatureExtractor::HandleLink(
239 const blink::WebElement
& element
) {
240 // Count the number of times we link to a different host.
241 if (!element
.hasAttribute("href")) {
242 DVLOG(1) << "Skipping anchor tag with no href";
246 // Retrieve the link and resolve the link in case it's relative.
247 blink::WebURL full_url
= element
.document().completeURL(
248 element
.getAttribute("href"));
251 bool is_external
= IsExternalDomain(full_url
, &domain
);
252 if (domain
.empty()) {
253 DVLOG(1) << "Could not extract domain from link: " << full_url
;
258 ++page_feature_state_
->external_links
;
260 // Record each unique domain that we link to.
261 page_feature_state_
->external_domains
.insert(domain
);
264 // Check how many are https links.
265 if (GURL(full_url
).SchemeIs("https")) {
266 ++page_feature_state_
->secure_links
;
269 ++page_feature_state_
->total_links
;
272 void PhishingDOMFeatureExtractor::HandleForm(
273 const blink::WebElement
& element
) {
274 // Increment the number of forms on this page.
275 ++page_feature_state_
->num_forms
;
277 // Record whether the action points to a different domain.
278 if (!element
.hasAttribute("action")) {
282 blink::WebURL full_url
= element
.document().completeURL(
283 element
.getAttribute("action"));
285 page_feature_state_
->page_action_urls
.insert(full_url
.string().utf8());
288 bool is_external
= IsExternalDomain(full_url
, &domain
);
289 if (domain
.empty()) {
290 DVLOG(1) << "Could not extract domain from form action: " << full_url
;
295 ++page_feature_state_
->action_other_domain
;
297 ++page_feature_state_
->total_actions
;
300 void PhishingDOMFeatureExtractor::HandleImage(
301 const blink::WebElement
& element
) {
302 if (!element
.hasAttribute("src")) {
303 DVLOG(1) << "Skipping img tag with no src";
306 // Record whether the image points to a different domain.
307 blink::WebURL full_url
= element
.document().completeURL(
308 element
.getAttribute("src"));
310 bool is_external
= IsExternalDomain(full_url
, &domain
);
311 if (domain
.empty()) {
312 DVLOG(1) << "Could not extract domain from image src: " << full_url
;
317 ++page_feature_state_
->img_other_domain
;
319 ++page_feature_state_
->total_imgs
;
322 void PhishingDOMFeatureExtractor::HandleInput(
323 const blink::WebElement
& element
) {
324 // The HTML spec says that if the type is unspecified, it defaults to text.
325 // In addition, any unrecognized type will be treated as a text input.
327 // Note that we use the attribute value rather than
328 // WebFormControlElement::formControlType() for consistency with the
329 // way the phishing classification model is created.
330 std::string type
= base::ToLowerASCII(element
.getAttribute("type").utf8());
331 if (type
== "password") {
332 ++page_feature_state_
->num_pswd_inputs
;
333 } else if (type
== "radio") {
334 ++page_feature_state_
->num_radio_inputs
;
335 } else if (type
== "checkbox") {
336 ++page_feature_state_
->num_check_inputs
;
337 } else if (type
!= "submit" && type
!= "reset" && type
!= "file" &&
338 type
!= "hidden" && type
!= "image" && type
!= "button") {
339 // Note that there are a number of new input types in HTML5 that are not
340 // handled above. For now, we will consider these as text inputs since
341 // they could be used to capture user input.
342 ++page_feature_state_
->num_text_inputs
;
346 void PhishingDOMFeatureExtractor::HandleScript(
347 const blink::WebElement
& element
) {
348 ++page_feature_state_
->num_script_tags
;
351 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
352 DCHECK(done_callback_
.is_null());
353 DCHECK(!cur_frame_data_
.get());
354 DCHECK(cur_document_
.isNull());
355 if (!done_callback_
.is_null() || cur_frame_data_
.get() ||
356 !cur_document_
.isNull()) {
357 LOG(ERROR
) << "Extraction in progress, missing call to "
358 << "CancelPendingExtraction";
362 void PhishingDOMFeatureExtractor::RunCallback(bool success
) {
363 // Record some timing stats that we can use to evaluate feature extraction
364 // performance. These include both successful and failed extractions.
365 DCHECK(page_feature_state_
.get());
366 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
367 page_feature_state_
->num_iterations
);
368 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
369 clock_
->Now() - page_feature_state_
->start_time
);
371 DCHECK(!done_callback_
.is_null());
372 done_callback_
.Run(success
);
376 void PhishingDOMFeatureExtractor::Clear() {
378 done_callback_
.Reset();
379 cur_frame_data_
.reset(NULL
);
380 cur_document_
.reset();
383 void PhishingDOMFeatureExtractor::ResetFrameData() {
384 DCHECK(!cur_document_
.isNull());
385 DCHECK(!cur_frame_data_
.get());
387 cur_frame_data_
.reset(new FrameData());
388 cur_frame_data_
->elements
= cur_document_
.all();
389 cur_frame_data_
->domain
=
390 net::registry_controlled_domains::GetDomainAndRegistry(
392 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES
);
395 blink::WebDocument
PhishingDOMFeatureExtractor::GetNextDocument() {
396 DCHECK(!cur_document_
.isNull());
397 blink::WebFrame
* frame
= cur_document_
.frame();
398 // Advance to the next frame that contains a document, with no wrapping.
400 for (frame
= frame
->traverseNext(false); frame
;
401 frame
= frame
->traverseNext(false)) {
402 if (!frame
->document().isNull()) {
403 return frame
->document();
407 // Keep track of how often frame traversal got "stuck" due to the
408 // current subdocument getting removed from the frame tree.
409 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
411 return blink::WebDocument();
414 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL
& url
,
415 std::string
* domain
) const {
417 DCHECK(cur_frame_data_
.get());
419 if (cur_frame_data_
->domain
.empty()) {
423 // TODO(bryner): Ensure that the url encoding is consistent with the features
425 if (url
.HostIsIPAddress()) {
426 domain
->assign(url
.host());
428 domain
->assign(net::registry_controlled_domains::GetDomainAndRegistry(
429 url
, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES
));
432 return !domain
->empty() && *domain
!= cur_frame_data_
->domain
;
435 void PhishingDOMFeatureExtractor::InsertFeatures() {
436 DCHECK(page_feature_state_
.get());
438 if (page_feature_state_
->total_links
> 0) {
439 // Add a feature for the fraction of times the page links to an external
440 // domain vs. an internal domain.
441 double link_freq
= static_cast<double>(
442 page_feature_state_
->external_links
) /
443 page_feature_state_
->total_links
;
444 features_
->AddRealFeature(features::kPageExternalLinksFreq
, link_freq
);
446 // Add a feature for each unique domain that we're linking to
447 for (const auto& domain
: page_feature_state_
->external_domains
) {
448 features_
->AddBooleanFeature(features::kPageLinkDomain
+ domain
);
451 // Fraction of links that use https.
452 double secure_freq
= static_cast<double>(
453 page_feature_state_
->secure_links
) / page_feature_state_
->total_links
;
454 features_
->AddRealFeature(features::kPageSecureLinksFreq
, secure_freq
);
457 // Record whether forms appear and whether various form elements appear.
458 if (page_feature_state_
->num_forms
> 0) {
459 features_
->AddBooleanFeature(features::kPageHasForms
);
461 if (page_feature_state_
->num_text_inputs
> 0) {
462 features_
->AddBooleanFeature(features::kPageHasTextInputs
);
464 if (page_feature_state_
->num_pswd_inputs
> 0) {
465 features_
->AddBooleanFeature(features::kPageHasPswdInputs
);
467 if (page_feature_state_
->num_radio_inputs
> 0) {
468 features_
->AddBooleanFeature(features::kPageHasRadioInputs
);
470 if (page_feature_state_
->num_check_inputs
> 0) {
471 features_
->AddBooleanFeature(features::kPageHasCheckInputs
);
474 // Record fraction of form actions that point to a different domain.
475 if (page_feature_state_
->total_actions
> 0) {
476 double action_freq
= static_cast<double>(
477 page_feature_state_
->action_other_domain
) /
478 page_feature_state_
->total_actions
;
479 features_
->AddRealFeature(features::kPageActionOtherDomainFreq
,
483 // Add a feature for each unique external action url.
484 for (const auto& url
: page_feature_state_
->page_action_urls
) {
485 features_
->AddBooleanFeature(features::kPageActionURL
+ url
);
488 // Record how many image src attributes point to a different domain.
489 if (page_feature_state_
->total_imgs
> 0) {
490 double img_freq
= static_cast<double>(
491 page_feature_state_
->img_other_domain
) /
492 page_feature_state_
->total_imgs
;
493 features_
->AddRealFeature(features::kPageImgOtherDomainFreq
, img_freq
);
496 // Record number of script tags (discretized for numerical stability.)
497 if (page_feature_state_
->num_script_tags
> 1) {
498 features_
->AddBooleanFeature(features::kPageNumScriptTagsGTOne
);
499 if (page_feature_state_
->num_script_tags
> 6) {
500 features_
->AddBooleanFeature(features::kPageNumScriptTagsGTSix
);
505 } // namespace safe_browsing