Revert of Linux MSan: enable swarming/sharding for browser_tests. (patchset #1 id...
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_dom_feature_extractor.h
blob b2de3f7ee4080073f894c85e3a4d8eff73b51df8
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // PhishingDOMFeatureExtractor handles computing DOM-based features for the
6 // client-side phishing detection model. These include the presence of various
7 // types of elements, ratios of external and secure links, and tokens for
8 // external domains linked to.
10 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
11 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
13 #include <string>
15 #include "base/basictypes.h"
16 #include "base/callback.h"
17 #include "base/memory/scoped_ptr.h"
18 #include "base/memory/weak_ptr.h"
19 #include "third_party/WebKit/public/web/WebDocument.h"
21 class GURL;  // Forward declaration; this header only uses GURL by const reference.
23 namespace blink {
24 class WebElement;  // Forward declaration; passed by const reference to the Handle*() methods.
27 namespace content {
28 class RenderView;  // Forward declaration; this header only holds a RenderView pointer.
31 namespace safe_browsing {
32 class FeatureExtractorClock;  // Forward declaration; this header only holds a pointer to it.
33 class FeatureMap;  // Forward declaration; this header only holds a pointer to it.
// Extracts DOM-based features from the page loaded in a RenderView. Extraction
// may be split into several chunks of work; completion is reported through a
// DoneCallback. Copying and assignment are disallowed (see the macro at the
// end of the class).
35 class PhishingDOMFeatureExtractor {
36 public:
37 // Callback to be run when feature extraction finishes. The callback
38 // argument is true if extraction was successful, false otherwise.
39 typedef base::Callback<void(bool)> DoneCallback;
41 // Creates a PhishingDOMFeatureExtractor for the specified RenderView.
42 // The PhishingDOMFeatureExtractor should be destroyed prior to destroying
43 // the RenderView. |clock| is used for timing feature extractor operations,
44 // and may be mocked for testing. The caller maintains ownership of the
45 // clock. Neither pointer is owned by this object.
46 PhishingDOMFeatureExtractor(content::RenderView* render_view,
47 FeatureExtractorClock* clock);
48 ~PhishingDOMFeatureExtractor();
50 // Begins extracting features into the given FeatureMap for the page
51 // currently loaded in this object's RenderView. To avoid blocking the
52 // render thread for too long, the feature extractor may run in several
53 // chunks of work, posting a task to the current MessageLoop to continue
54 // processing. Once feature extraction is complete, |done_callback|
55 // is run on the current thread. PhishingDOMFeatureExtractor takes
56 // ownership of the callback. The caller keeps ownership of |features|.
// NOTE(review): |done_callback| is passed by const reference, so presumably a
// copy is stored in |done_callback_| -- confirm in the .cc file.
57 void ExtractFeatures(FeatureMap* features, const DoneCallback& done_callback);
59 // Cancels any pending feature extraction. The DoneCallback will not be run.
60 // Must be called if there is a feature extraction in progress when the page
61 // is unloaded or the PhishingDOMFeatureExtractor is destroyed.
62 void CancelPendingExtraction();
64 private:
// Private helper structs, declared here but defined outside this header. They
// are held through |cur_frame_data_| and |page_feature_state_| respectively.
65 struct FrameData;
66 struct PageFeatureState;
68 // The maximum amount of wall time that we will spend on a single extraction
69 // iteration before pausing to let other MessageLoop tasks run. The value is
// not visible in this header; it is declared here without an initializer.
70 static const int kMaxTimePerChunkMs;
72 // The number of elements that we will process before checking to see whether
73 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be
74 // slow, we don't do this on every element processed.
75 static const int kClockCheckGranularity;
77 // The maximum total amount of time that the feature extractor will run
78 // before giving up on the current page.
79 static const int kMaxTotalTimeMs;
81 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
82 // until a predefined maximum amount of time has elapsed, then posts a task
83 // to the current MessageLoop to continue extraction. When extraction
84 // finishes, calls RunCallback().
85 void ExtractFeaturesWithTimeout();
87 // Handlers for the various HTML elements that we compute features for.
88 // Since some of the features (such as ratios) cannot be computed until
89 // feature extraction is finished, these handlers do not add to the feature
90 // map directly. Instead, they update the values in the PageFeatureState.
91 void HandleLink(const blink::WebElement& element);
92 void HandleForm(const blink::WebElement& element);
93 void HandleImage(const blink::WebElement& element);
94 void HandleInput(const blink::WebElement& element);
95 void HandleScript(const blink::WebElement& element);
97 // Helper to verify that there is no pending feature extraction. Dies in
98 // debug builds if the state is not as expected. This is a no-op in release
99 // builds.
100 void CheckNoPendingExtraction();
102 // Runs |done_callback_| with |success| and then clears all internal state.
103 void RunCallback(bool success);
105 // Clears all internal feature extraction state.
106 void Clear();
108 // Called after advancing |cur_document_| to update the state in
109 // |cur_frame_data_|.
110 void ResetFrameData();
112 // Returns the next document in frame-traversal order from |cur_document_|.
113 // If there are no more documents, returns a null WebDocument.
114 blink::WebDocument GetNextDocument();
116 // Given a URL, checks whether the domain is different from the domain of
117 // the current frame's URL. If so, stores the domain in |domain| (an output
118 // parameter) and returns true, otherwise returns false.
119 bool IsExternalDomain(const GURL& url, std::string* domain) const;
121 // Called once all frames have been processed to compute features from the
122 // PageFeatureState and add them to |features_|. See features.h for a
123 // description of which features are computed.
124 void InsertFeatures();
126 // Non-owned pointer to the view that we will extract features from.
127 content::RenderView* render_view_;
129 // Non-owned pointer to our clock.
130 FeatureExtractorClock* clock_;
132 // The output parameters from the most recent call to ExtractFeatures().
133 FeatureMap* features_; // The caller keeps ownership of this.
134 DoneCallback done_callback_;
136 // The current (sub-)document that we are processing. May be a null document
137 // (isNull()) if we are not currently extracting features.
138 blink::WebDocument cur_document_;
140 // Stores extra state for |cur_document_| that will be persisted until we
141 // advance to the next frame.
142 scoped_ptr<FrameData> cur_frame_data_;
144 // Stores the intermediate data used to create features. This data is
145 // accumulated across all frames in the RenderView.
146 scoped_ptr<PageFeatureState> page_feature_state_;
148 // Used in scheduling ExtractFeaturesWithTimeout tasks.
149 // These pointers are invalidated if extraction is cancelled.
150 base::WeakPtrFactory<PhishingDOMFeatureExtractor> weak_factory_;
// NOTE(review): macro defined outside this file; by its name it disables copy
// construction and assignment for this class -- confirm against its definition.
152 DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor);
155 } // namespace safe_browsing
157 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_