Enable Cast in ChromePublic
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_dom_feature_extractor.cc
blobf47a186a098cf937f290461cd67fb7db55c6580a
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
7 #include "base/bind.h"
8 #include "base/compiler_specific.h"
9 #include "base/containers/hash_tables.h"
10 #include "base/location.h"
11 #include "base/logging.h"
12 #include "base/metrics/histogram.h"
13 #include "base/single_thread_task_runner.h"
14 #include "base/strings/string_util.h"
15 #include "base/thread_task_runner_handle.h"
16 #include "base/time/time.h"
17 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
18 #include "chrome/renderer/safe_browsing/features.h"
19 #include "content/public/renderer/render_view.h"
20 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
21 #include "third_party/WebKit/public/platform/WebString.h"
22 #include "third_party/WebKit/public/web/WebElement.h"
23 #include "third_party/WebKit/public/web/WebElementCollection.h"
24 #include "third_party/WebKit/public/web/WebLocalFrame.h"
25 #include "third_party/WebKit/public/web/WebView.h"
27 namespace safe_browsing {
29 // This time should be short enough that it doesn't noticeably disrupt the
30 // user's interaction with the page.
31 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
33 // Experimenting shows that we get a reasonable gain in performance by
34 // increasing this up to around 10, but there's not much benefit in
35 // increasing it past that.
36 const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
38 // This should be longer than we expect feature extraction to take on any
39 // actual phishing page.
40 const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
42 // Intermediate state used for computing features. See features.h for
43 // descriptions of the DOM features that are computed.
44 struct PhishingDOMFeatureExtractor::PageFeatureState {
45 // Link related features
46 int external_links;
47 base::hash_set<std::string> external_domains;
48 int secure_links;
49 int total_links;
51 // Form related features
52 int num_forms;
53 int num_text_inputs;
54 int num_pswd_inputs;
55 int num_radio_inputs;
56 int num_check_inputs;
57 int action_other_domain;
58 int total_actions;
59 base::hash_set<std::string> page_action_urls;
61 // Image related features
62 int img_other_domain;
63 int total_imgs;
65 // How many script tags
66 int num_script_tags;
68 // The time at which we started feature extraction for the current page.
69 base::TimeTicks start_time;
71 // The number of iterations we've done for the current extraction.
72 int num_iterations;
74 explicit PageFeatureState(base::TimeTicks start_time_ticks)
75 : external_links(0),
76 secure_links(0),
77 total_links(0),
78 num_forms(0),
79 num_text_inputs(0),
80 num_pswd_inputs(0),
81 num_radio_inputs(0),
82 num_check_inputs(0),
83 action_other_domain(0),
84 total_actions(0),
85 img_other_domain(0),
86 total_imgs(0),
87 num_script_tags(0),
88 start_time(start_time_ticks),
89 num_iterations(0) {}
91 ~PageFeatureState() {}
94 // Per-frame state
95 struct PhishingDOMFeatureExtractor::FrameData {
96 // This is our reference to document.all, which is an iterator over all
97 // of the elements in the document. It keeps track of our current position.
98 blink::WebElementCollection elements;
99 // The domain of the document URL, stored here so that we don't need to
100 // recompute it every time it's needed.
101 std::string domain;
104 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
105 content::RenderView* render_view,
106 FeatureExtractorClock* clock)
107 : render_view_(render_view),
108 clock_(clock),
109 weak_factory_(this) {
110 Clear();
113 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
114 // The RenderView should have called CancelPendingExtraction() before
115 // we are destroyed.
116 CheckNoPendingExtraction();
119 void PhishingDOMFeatureExtractor::ExtractFeatures(
120 FeatureMap* features,
121 const DoneCallback& done_callback) {
122 // The RenderView should have called CancelPendingExtraction() before
123 // starting a new extraction, so DCHECK this.
124 CheckNoPendingExtraction();
125 // However, in an opt build, we will go ahead and clean up the pending
126 // extraction so that we can start in a known state.
127 CancelPendingExtraction();
129 features_ = features;
130 done_callback_ = done_callback;
132 page_feature_state_.reset(new PageFeatureState(clock_->Now()));
133 blink::WebView* web_view = render_view_->GetWebView();
134 if (web_view && web_view->mainFrame()) {
135 cur_document_ = web_view->mainFrame()->document();
138 base::ThreadTaskRunnerHandle::Get()->PostTask(
139 FROM_HERE,
140 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
141 weak_factory_.GetWeakPtr()));
144 void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
145 // Cancel any pending callbacks, and clear our state.
146 weak_factory_.InvalidateWeakPtrs();
147 Clear();
150 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
151 DCHECK(page_feature_state_.get());
152 ++page_feature_state_->num_iterations;
153 base::TimeTicks current_chunk_start_time = clock_->Now();
155 if (cur_document_.isNull()) {
156 // This will only happen if we weren't able to get the document for the
157 // main frame. We'll treat this as an extraction failure.
158 RunCallback(false);
159 return;
162 int num_elements = 0;
163 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
164 blink::WebElement cur_element;
165 if (cur_frame_data_.get()) {
166 // We're resuming traversal of a frame, so just advance to the next
167 // element.
168 cur_element = cur_frame_data_->elements.nextItem();
169 // When we resume the traversal, the first call to nextItem() potentially
170 // has to walk through the document again from the beginning, if it was
171 // modified between our chunks of work. Log how long this takes, so we
172 // can tell if it's too slow.
173 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
174 clock_->Now() - current_chunk_start_time);
175 } else {
176 // We just moved to a new frame, so update our frame state
177 // and advance to the first element.
178 ResetFrameData();
179 cur_element = cur_frame_data_->elements.firstItem();
182 for (; !cur_element.isNull();
183 cur_element = cur_frame_data_->elements.nextItem()) {
184 if (cur_element.hasHTMLTagName("a")) {
185 HandleLink(cur_element);
186 } else if (cur_element.hasHTMLTagName("form")) {
187 HandleForm(cur_element);
188 } else if (cur_element.hasHTMLTagName("img")) {
189 HandleImage(cur_element);
190 } else if (cur_element.hasHTMLTagName("input")) {
191 HandleInput(cur_element);
192 } else if (cur_element.hasHTMLTagName("script")) {
193 HandleScript(cur_element);
196 if (++num_elements >= kClockCheckGranularity) {
197 num_elements = 0;
198 base::TimeTicks now = clock_->Now();
199 if (now - page_feature_state_->start_time >=
200 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
201 DLOG(ERROR) << "Feature extraction took too long, giving up";
202 // We expect this to happen infrequently, so record when it does.
203 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
204 RunCallback(false);
205 return;
207 base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
208 if (chunk_elapsed >=
209 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
210 // The time limit for the current chunk is up, so post a task to
211 // continue extraction.
213 // Record how much time we actually spent on the chunk. If this is
214 // much higher than kMaxTimePerChunkMs, we may need to adjust the
215 // clock granularity.
216 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
217 chunk_elapsed);
218 base::ThreadTaskRunnerHandle::Get()->PostTask(
219 FROM_HERE,
220 base::Bind(
221 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
222 weak_factory_.GetWeakPtr()));
223 return;
225 // Otherwise, continue.
229 // We're done with this frame, recalculate the FrameData when we
230 // advance to the next frame.
231 cur_frame_data_.reset();
234 InsertFeatures();
235 RunCallback(true);
238 void PhishingDOMFeatureExtractor::HandleLink(
239 const blink::WebElement& element) {
240 // Count the number of times we link to a different host.
241 if (!element.hasAttribute("href")) {
242 DVLOG(1) << "Skipping anchor tag with no href";
243 return;
246 // Retrieve the link and resolve the link in case it's relative.
247 blink::WebURL full_url = element.document().completeURL(
248 element.getAttribute("href"));
250 std::string domain;
251 bool is_external = IsExternalDomain(full_url, &domain);
252 if (domain.empty()) {
253 DVLOG(1) << "Could not extract domain from link: " << full_url;
254 return;
257 if (is_external) {
258 ++page_feature_state_->external_links;
260 // Record each unique domain that we link to.
261 page_feature_state_->external_domains.insert(domain);
264 // Check how many are https links.
265 if (GURL(full_url).SchemeIs("https")) {
266 ++page_feature_state_->secure_links;
269 ++page_feature_state_->total_links;
272 void PhishingDOMFeatureExtractor::HandleForm(
273 const blink::WebElement& element) {
274 // Increment the number of forms on this page.
275 ++page_feature_state_->num_forms;
277 // Record whether the action points to a different domain.
278 if (!element.hasAttribute("action")) {
279 return;
282 blink::WebURL full_url = element.document().completeURL(
283 element.getAttribute("action"));
285 page_feature_state_->page_action_urls.insert(full_url.string().utf8());
287 std::string domain;
288 bool is_external = IsExternalDomain(full_url, &domain);
289 if (domain.empty()) {
290 DVLOG(1) << "Could not extract domain from form action: " << full_url;
291 return;
294 if (is_external) {
295 ++page_feature_state_->action_other_domain;
297 ++page_feature_state_->total_actions;
300 void PhishingDOMFeatureExtractor::HandleImage(
301 const blink::WebElement& element) {
302 if (!element.hasAttribute("src")) {
303 DVLOG(1) << "Skipping img tag with no src";
306 // Record whether the image points to a different domain.
307 blink::WebURL full_url = element.document().completeURL(
308 element.getAttribute("src"));
309 std::string domain;
310 bool is_external = IsExternalDomain(full_url, &domain);
311 if (domain.empty()) {
312 DVLOG(1) << "Could not extract domain from image src: " << full_url;
313 return;
316 if (is_external) {
317 ++page_feature_state_->img_other_domain;
319 ++page_feature_state_->total_imgs;
322 void PhishingDOMFeatureExtractor::HandleInput(
323 const blink::WebElement& element) {
324 // The HTML spec says that if the type is unspecified, it defaults to text.
325 // In addition, any unrecognized type will be treated as a text input.
327 // Note that we use the attribute value rather than
328 // WebFormControlElement::formControlType() for consistency with the
329 // way the phishing classification model is created.
330 std::string type = element.getAttribute("type").utf8();
331 base::StringToLowerASCII(&type);
332 if (type == "password") {
333 ++page_feature_state_->num_pswd_inputs;
334 } else if (type == "radio") {
335 ++page_feature_state_->num_radio_inputs;
336 } else if (type == "checkbox") {
337 ++page_feature_state_->num_check_inputs;
338 } else if (type != "submit" && type != "reset" && type != "file" &&
339 type != "hidden" && type != "image" && type != "button") {
340 // Note that there are a number of new input types in HTML5 that are not
341 // handled above. For now, we will consider these as text inputs since
342 // they could be used to capture user input.
343 ++page_feature_state_->num_text_inputs;
347 void PhishingDOMFeatureExtractor::HandleScript(
348 const blink::WebElement& element) {
349 ++page_feature_state_->num_script_tags;
352 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
353 DCHECK(done_callback_.is_null());
354 DCHECK(!cur_frame_data_.get());
355 DCHECK(cur_document_.isNull());
356 if (!done_callback_.is_null() || cur_frame_data_.get() ||
357 !cur_document_.isNull()) {
358 LOG(ERROR) << "Extraction in progress, missing call to "
359 << "CancelPendingExtraction";
363 void PhishingDOMFeatureExtractor::RunCallback(bool success) {
364 // Record some timing stats that we can use to evaluate feature extraction
365 // performance. These include both successful and failed extractions.
366 DCHECK(page_feature_state_.get());
367 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
368 page_feature_state_->num_iterations);
369 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
370 clock_->Now() - page_feature_state_->start_time);
372 DCHECK(!done_callback_.is_null());
373 done_callback_.Run(success);
374 Clear();
377 void PhishingDOMFeatureExtractor::Clear() {
378 features_ = NULL;
379 done_callback_.Reset();
380 cur_frame_data_.reset(NULL);
381 cur_document_.reset();
384 void PhishingDOMFeatureExtractor::ResetFrameData() {
385 DCHECK(!cur_document_.isNull());
386 DCHECK(!cur_frame_data_.get());
388 cur_frame_data_.reset(new FrameData());
389 cur_frame_data_->elements = cur_document_.all();
390 cur_frame_data_->domain =
391 net::registry_controlled_domains::GetDomainAndRegistry(
392 cur_document_.url(),
393 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
396 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
397 DCHECK(!cur_document_.isNull());
398 blink::WebFrame* frame = cur_document_.frame();
399 // Advance to the next frame that contains a document, with no wrapping.
400 if (frame) {
401 for (frame = frame->traverseNext(false); frame;
402 frame = frame->traverseNext(false)) {
403 if (!frame->document().isNull()) {
404 return frame->document();
407 } else {
408 // Keep track of how often frame traversal got "stuck" due to the
409 // current subdocument getting removed from the frame tree.
410 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
412 return blink::WebDocument();
415 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
416 std::string* domain) const {
417 DCHECK(domain);
418 DCHECK(cur_frame_data_.get());
420 if (cur_frame_data_->domain.empty()) {
421 return false;
424 // TODO(bryner): Ensure that the url encoding is consistent with the features
425 // in the model.
426 if (url.HostIsIPAddress()) {
427 domain->assign(url.host());
428 } else {
429 domain->assign(net::registry_controlled_domains::GetDomainAndRegistry(
430 url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES));
433 return !domain->empty() && *domain != cur_frame_data_->domain;
436 void PhishingDOMFeatureExtractor::InsertFeatures() {
437 DCHECK(page_feature_state_.get());
439 if (page_feature_state_->total_links > 0) {
440 // Add a feature for the fraction of times the page links to an external
441 // domain vs. an internal domain.
442 double link_freq = static_cast<double>(
443 page_feature_state_->external_links) /
444 page_feature_state_->total_links;
445 features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
447 // Add a feature for each unique domain that we're linking to
448 for (const auto& domain : page_feature_state_->external_domains) {
449 features_->AddBooleanFeature(features::kPageLinkDomain + domain);
452 // Fraction of links that use https.
453 double secure_freq = static_cast<double>(
454 page_feature_state_->secure_links) / page_feature_state_->total_links;
455 features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
458 // Record whether forms appear and whether various form elements appear.
459 if (page_feature_state_->num_forms > 0) {
460 features_->AddBooleanFeature(features::kPageHasForms);
462 if (page_feature_state_->num_text_inputs > 0) {
463 features_->AddBooleanFeature(features::kPageHasTextInputs);
465 if (page_feature_state_->num_pswd_inputs > 0) {
466 features_->AddBooleanFeature(features::kPageHasPswdInputs);
468 if (page_feature_state_->num_radio_inputs > 0) {
469 features_->AddBooleanFeature(features::kPageHasRadioInputs);
471 if (page_feature_state_->num_check_inputs > 0) {
472 features_->AddBooleanFeature(features::kPageHasCheckInputs);
475 // Record fraction of form actions that point to a different domain.
476 if (page_feature_state_->total_actions > 0) {
477 double action_freq = static_cast<double>(
478 page_feature_state_->action_other_domain) /
479 page_feature_state_->total_actions;
480 features_->AddRealFeature(features::kPageActionOtherDomainFreq,
481 action_freq);
484 // Add a feature for each unique external action url.
485 for (const auto& url : page_feature_state_->page_action_urls) {
486 features_->AddBooleanFeature(features::kPageActionURL + url);
489 // Record how many image src attributes point to a different domain.
490 if (page_feature_state_->total_imgs > 0) {
491 double img_freq = static_cast<double>(
492 page_feature_state_->img_other_domain) /
493 page_feature_state_->total_imgs;
494 features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
497 // Record number of script tags (discretized for numerical stability.)
498 if (page_feature_state_->num_script_tags > 1) {
499 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
500 if (page_feature_state_->num_script_tags > 6) {
501 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
506 } // namespace safe_browsing