This flag is obsolete, since a duplicate of it already exists as:
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_dom_feature_extractor.cc
blobae5ad651aabfc555f8fed57aa85fbfbd2f5abb3a
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
7 #include "base/bind.h"
8 #include "base/compiler_specific.h"
9 #include "base/containers/hash_tables.h"
10 #include "base/logging.h"
11 #include "base/message_loop/message_loop.h"
12 #include "base/metrics/histogram.h"
13 #include "base/strings/string_util.h"
14 #include "base/time/time.h"
15 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
16 #include "chrome/renderer/safe_browsing/features.h"
17 #include "content/public/renderer/render_view.h"
18 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
19 #include "third_party/WebKit/public/platform/WebString.h"
20 #include "third_party/WebKit/public/web/WebElement.h"
21 #include "third_party/WebKit/public/web/WebElementCollection.h"
22 #include "third_party/WebKit/public/web/WebLocalFrame.h"
23 #include "third_party/WebKit/public/web/WebView.h"
25 namespace safe_browsing {
27 // This time should be short enough that it doesn't noticeably disrupt the
28 // user's interaction with the page.
29 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10;
31 // Experimenting shows that we get a reasonable gain in performance by
32 // increasing this up to around 10, but there's not much benefit in
33 // increasing it past that.
34 const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10;
36 // This should be longer than we expect feature extraction to take on any
37 // actual phishing page.
38 const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500;
40 // Intermediate state used for computing features. See features.h for
41 // descriptions of the DOM features that are computed.
42 struct PhishingDOMFeatureExtractor::PageFeatureState {
43 // Link related features
44 int external_links;
45 base::hash_set<std::string> external_domains;
46 int secure_links;
47 int total_links;
49 // Form related features
50 int num_forms;
51 int num_text_inputs;
52 int num_pswd_inputs;
53 int num_radio_inputs;
54 int num_check_inputs;
55 int action_other_domain;
56 int total_actions;
58 // Image related features
59 int img_other_domain;
60 int total_imgs;
62 // How many script tags
63 int num_script_tags;
65 // The time at which we started feature extraction for the current page.
66 base::TimeTicks start_time;
68 // The number of iterations we've done for the current extraction.
69 int num_iterations;
71 explicit PageFeatureState(base::TimeTicks start_time_ticks)
72 : external_links(0),
73 secure_links(0),
74 total_links(0),
75 num_forms(0),
76 num_text_inputs(0),
77 num_pswd_inputs(0),
78 num_radio_inputs(0),
79 num_check_inputs(0),
80 action_other_domain(0),
81 total_actions(0),
82 img_other_domain(0),
83 total_imgs(0),
84 num_script_tags(0),
85 start_time(start_time_ticks),
86 num_iterations(0) {}
88 ~PageFeatureState() {}
91 // Per-frame state
92 struct PhishingDOMFeatureExtractor::FrameData {
93 // This is our reference to document.all, which is an iterator over all
94 // of the elements in the document. It keeps track of our current position.
95 blink::WebElementCollection elements;
96 // The domain of the document URL, stored here so that we don't need to
97 // recompute it every time it's needed.
98 std::string domain;
101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
102 content::RenderView* render_view,
103 FeatureExtractorClock* clock)
104 : render_view_(render_view),
105 clock_(clock),
106 weak_factory_(this) {
107 Clear();
110 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
111 // The RenderView should have called CancelPendingExtraction() before
112 // we are destroyed.
113 CheckNoPendingExtraction();
116 void PhishingDOMFeatureExtractor::ExtractFeatures(
117 FeatureMap* features,
118 const DoneCallback& done_callback) {
119 // The RenderView should have called CancelPendingExtraction() before
120 // starting a new extraction, so DCHECK this.
121 CheckNoPendingExtraction();
122 // However, in an opt build, we will go ahead and clean up the pending
123 // extraction so that we can start in a known state.
124 CancelPendingExtraction();
126 features_ = features;
127 done_callback_ = done_callback;
129 page_feature_state_.reset(new PageFeatureState(clock_->Now()));
130 blink::WebView* web_view = render_view_->GetWebView();
131 if (web_view && web_view->mainFrame()) {
132 cur_document_ = web_view->mainFrame()->document();
135 base::MessageLoop::current()->PostTask(
136 FROM_HERE,
137 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
138 weak_factory_.GetWeakPtr()));
141 void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
142 // Cancel any pending callbacks, and clear our state.
143 weak_factory_.InvalidateWeakPtrs();
144 Clear();
147 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
148 DCHECK(page_feature_state_.get());
149 ++page_feature_state_->num_iterations;
150 base::TimeTicks current_chunk_start_time = clock_->Now();
152 if (cur_document_.isNull()) {
153 // This will only happen if we weren't able to get the document for the
154 // main frame. We'll treat this as an extraction failure.
155 RunCallback(false);
156 return;
159 int num_elements = 0;
160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
161 blink::WebElement cur_element;
162 if (cur_frame_data_.get()) {
163 // We're resuming traversal of a frame, so just advance to the next
164 // element.
165 cur_element = cur_frame_data_->elements.nextItem();
166 // When we resume the traversal, the first call to nextItem() potentially
167 // has to walk through the document again from the beginning, if it was
168 // modified between our chunks of work. Log how long this takes, so we
169 // can tell if it's too slow.
170 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
171 clock_->Now() - current_chunk_start_time);
172 } else {
173 // We just moved to a new frame, so update our frame state
174 // and advance to the first element.
175 ResetFrameData();
176 cur_element = cur_frame_data_->elements.firstItem();
179 for (; !cur_element.isNull();
180 cur_element = cur_frame_data_->elements.nextItem()) {
181 if (cur_element.hasHTMLTagName("a")) {
182 HandleLink(cur_element);
183 } else if (cur_element.hasHTMLTagName("form")) {
184 HandleForm(cur_element);
185 } else if (cur_element.hasHTMLTagName("img")) {
186 HandleImage(cur_element);
187 } else if (cur_element.hasHTMLTagName("input")) {
188 HandleInput(cur_element);
189 } else if (cur_element.hasHTMLTagName("script")) {
190 HandleScript(cur_element);
193 if (++num_elements >= kClockCheckGranularity) {
194 num_elements = 0;
195 base::TimeTicks now = clock_->Now();
196 if (now - page_feature_state_->start_time >=
197 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) {
198 DLOG(ERROR) << "Feature extraction took too long, giving up";
199 // We expect this to happen infrequently, so record when it does.
200 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1);
201 RunCallback(false);
202 return;
204 base::TimeDelta chunk_elapsed = now - current_chunk_start_time;
205 if (chunk_elapsed >=
206 base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) {
207 // The time limit for the current chunk is up, so post a task to
208 // continue extraction.
210 // Record how much time we actually spent on the chunk. If this is
211 // much higher than kMaxTimePerChunkMs, we may need to adjust the
212 // clock granularity.
213 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime",
214 chunk_elapsed);
215 base::MessageLoop::current()->PostTask(
216 FROM_HERE,
217 base::Bind(
218 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
219 weak_factory_.GetWeakPtr()));
220 return;
222 // Otherwise, continue.
226 // We're done with this frame, recalculate the FrameData when we
227 // advance to the next frame.
228 cur_frame_data_.reset();
231 InsertFeatures();
232 RunCallback(true);
235 void PhishingDOMFeatureExtractor::HandleLink(
236 const blink::WebElement& element) {
237 // Count the number of times we link to a different host.
238 if (!element.hasAttribute("href")) {
239 DVLOG(1) << "Skipping anchor tag with no href";
240 return;
243 // Retrieve the link and resolve the link in case it's relative.
244 blink::WebURL full_url = element.document().completeURL(
245 element.getAttribute("href"));
247 std::string domain;
248 bool is_external = IsExternalDomain(full_url, &domain);
249 if (domain.empty()) {
250 DVLOG(1) << "Could not extract domain from link: " << full_url;
251 return;
254 if (is_external) {
255 ++page_feature_state_->external_links;
257 // Record each unique domain that we link to.
258 page_feature_state_->external_domains.insert(domain);
261 // Check how many are https links.
262 if (GURL(full_url).SchemeIs("https")) {
263 ++page_feature_state_->secure_links;
266 ++page_feature_state_->total_links;
269 void PhishingDOMFeatureExtractor::HandleForm(
270 const blink::WebElement& element) {
271 // Increment the number of forms on this page.
272 ++page_feature_state_->num_forms;
274 // Record whether the action points to a different domain.
275 if (!element.hasAttribute("action")) {
276 return;
279 blink::WebURL full_url = element.document().completeURL(
280 element.getAttribute("action"));
282 std::string domain;
283 bool is_external = IsExternalDomain(full_url, &domain);
284 if (domain.empty()) {
285 DVLOG(1) << "Could not extract domain from form action: " << full_url;
286 return;
289 if (is_external) {
290 ++page_feature_state_->action_other_domain;
292 ++page_feature_state_->total_actions;
295 void PhishingDOMFeatureExtractor::HandleImage(
296 const blink::WebElement& element) {
297 if (!element.hasAttribute("src")) {
298 DVLOG(1) << "Skipping img tag with no src";
301 // Record whether the image points to a different domain.
302 blink::WebURL full_url = element.document().completeURL(
303 element.getAttribute("src"));
304 std::string domain;
305 bool is_external = IsExternalDomain(full_url, &domain);
306 if (domain.empty()) {
307 DVLOG(1) << "Could not extract domain from image src: " << full_url;
308 return;
311 if (is_external) {
312 ++page_feature_state_->img_other_domain;
314 ++page_feature_state_->total_imgs;
317 void PhishingDOMFeatureExtractor::HandleInput(
318 const blink::WebElement& element) {
319 // The HTML spec says that if the type is unspecified, it defaults to text.
320 // In addition, any unrecognized type will be treated as a text input.
322 // Note that we use the attribute value rather than
323 // WebFormControlElement::formControlType() for consistency with the
324 // way the phishing classification model is created.
325 std::string type = element.getAttribute("type").utf8();
326 base::StringToLowerASCII(&type);
327 if (type == "password") {
328 ++page_feature_state_->num_pswd_inputs;
329 } else if (type == "radio") {
330 ++page_feature_state_->num_radio_inputs;
331 } else if (type == "checkbox") {
332 ++page_feature_state_->num_check_inputs;
333 } else if (type != "submit" && type != "reset" && type != "file" &&
334 type != "hidden" && type != "image" && type != "button") {
335 // Note that there are a number of new input types in HTML5 that are not
336 // handled above. For now, we will consider these as text inputs since
337 // they could be used to capture user input.
338 ++page_feature_state_->num_text_inputs;
342 void PhishingDOMFeatureExtractor::HandleScript(
343 const blink::WebElement& element) {
344 ++page_feature_state_->num_script_tags;
347 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
348 DCHECK(done_callback_.is_null());
349 DCHECK(!cur_frame_data_.get());
350 DCHECK(cur_document_.isNull());
351 if (!done_callback_.is_null() || cur_frame_data_.get() ||
352 !cur_document_.isNull()) {
353 LOG(ERROR) << "Extraction in progress, missing call to "
354 << "CancelPendingExtraction";
358 void PhishingDOMFeatureExtractor::RunCallback(bool success) {
359 // Record some timing stats that we can use to evaluate feature extraction
360 // performance. These include both successful and failed extractions.
361 DCHECK(page_feature_state_.get());
362 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
363 page_feature_state_->num_iterations);
364 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
365 clock_->Now() - page_feature_state_->start_time);
367 DCHECK(!done_callback_.is_null());
368 done_callback_.Run(success);
369 Clear();
372 void PhishingDOMFeatureExtractor::Clear() {
373 features_ = NULL;
374 done_callback_.Reset();
375 cur_frame_data_.reset(NULL);
376 cur_document_.reset();
379 void PhishingDOMFeatureExtractor::ResetFrameData() {
380 DCHECK(!cur_document_.isNull());
381 DCHECK(!cur_frame_data_.get());
383 cur_frame_data_.reset(new FrameData());
384 cur_frame_data_->elements = cur_document_.all();
385 cur_frame_data_->domain =
386 net::registry_controlled_domains::GetDomainAndRegistry(
387 cur_document_.url(),
388 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
391 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
392 DCHECK(!cur_document_.isNull());
393 blink::WebFrame* frame = cur_document_.frame();
394 // Advance to the next frame that contains a document, with no wrapping.
395 if (frame) {
396 for (frame = frame->traverseNext(false); frame;
397 frame = frame->traverseNext(false)) {
398 if (!frame->document().isNull()) {
399 return frame->document();
402 } else {
403 // Keep track of how often frame traversal got "stuck" due to the
404 // current subdocument getting removed from the frame tree.
405 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
407 return blink::WebDocument();
410 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
411 std::string* domain) const {
412 DCHECK(domain);
413 DCHECK(cur_frame_data_.get());
415 if (cur_frame_data_->domain.empty()) {
416 return false;
419 // TODO(bryner): Ensure that the url encoding is consistent with the features
420 // in the model.
421 if (url.HostIsIPAddress()) {
422 domain->assign(url.host());
423 } else {
424 domain->assign(net::registry_controlled_domains::GetDomainAndRegistry(
425 url, net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES));
428 return !domain->empty() && *domain != cur_frame_data_->domain;
431 void PhishingDOMFeatureExtractor::InsertFeatures() {
432 DCHECK(page_feature_state_.get());
434 if (page_feature_state_->total_links > 0) {
435 // Add a feature for the fraction of times the page links to an external
436 // domain vs. an internal domain.
437 double link_freq = static_cast<double>(
438 page_feature_state_->external_links) /
439 page_feature_state_->total_links;
440 features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
442 // Add a feature for each unique domain that we're linking to
443 for (base::hash_set<std::string>::iterator it =
444 page_feature_state_->external_domains.begin();
445 it != page_feature_state_->external_domains.end(); ++it) {
446 features_->AddBooleanFeature(features::kPageLinkDomain + *it);
449 // Fraction of links that use https.
450 double secure_freq = static_cast<double>(
451 page_feature_state_->secure_links) / page_feature_state_->total_links;
452 features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
455 // Record whether forms appear and whether various form elements appear.
456 if (page_feature_state_->num_forms > 0) {
457 features_->AddBooleanFeature(features::kPageHasForms);
459 if (page_feature_state_->num_text_inputs > 0) {
460 features_->AddBooleanFeature(features::kPageHasTextInputs);
462 if (page_feature_state_->num_pswd_inputs > 0) {
463 features_->AddBooleanFeature(features::kPageHasPswdInputs);
465 if (page_feature_state_->num_radio_inputs > 0) {
466 features_->AddBooleanFeature(features::kPageHasRadioInputs);
468 if (page_feature_state_->num_check_inputs > 0) {
469 features_->AddBooleanFeature(features::kPageHasCheckInputs);
472 // Record fraction of form actions that point to a different domain.
473 if (page_feature_state_->total_actions > 0) {
474 double action_freq = static_cast<double>(
475 page_feature_state_->action_other_domain) /
476 page_feature_state_->total_actions;
477 features_->AddRealFeature(features::kPageActionOtherDomainFreq,
478 action_freq);
481 // Record how many image src attributes point to a different domain.
482 if (page_feature_state_->total_imgs > 0) {
483 double img_freq = static_cast<double>(
484 page_feature_state_->img_other_domain) /
485 page_feature_state_->total_imgs;
486 features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
489 // Record number of script tags (discretized for numerical stability.)
490 if (page_feature_state_->num_script_tags > 1) {
491 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
492 if (page_feature_state_->num_script_tags > 6) {
493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
498 } // namespace safe_browsing