// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// PhishingDOMFeatureExtractor handles computing DOM-based features for the
// client-side phishing detection model. These include the presence of various
// types of elements, ratios of external and secure links, and tokens for
// external domains linked to.

#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_

#include <string>

#include "base/basictypes.h"
#include "base/callback.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/weak_ptr.h"
#include "third_party/WebKit/public/web/WebDocument.h"

class GURL;

namespace blink {
class WebElement;
}

namespace content {
class RenderView;
}

31 namespace safe_browsing
{
32 class FeatureExtractorClock
;
35 class PhishingDOMFeatureExtractor
{
37 // Callback to be run when feature extraction finishes. The callback
38 // argument is true if extraction was successful, false otherwise.
39 typedef base::Callback
<void(bool)> DoneCallback
;
41 // Creates a PhishingDOMFeatureExtractor for the specified RenderView.
42 // The PhishingDOMFeatureExtrator should be destroyed prior to destroying
43 // the RenderView. |clock| is used for timing feature extractor operations,
44 // and may be mocked for testing. The caller maintains ownership of the
46 PhishingDOMFeatureExtractor(content::RenderView
* render_view
,
47 FeatureExtractorClock
* clock
);
48 ~PhishingDOMFeatureExtractor();
50 // Begins extracting features into the given FeatureMap for the page
51 // currently loaded in this object's RenderView. To avoid blocking the
52 // render thread for too long, the feature extractor may run in several
53 // chunks of work, posting a task to the current MessageLoop to continue
54 // processing. Once feature extraction is complete, |done_callback|
55 // is run on the current thread. PhishingDOMFeatureExtractor takes
56 // ownership of the callback.
57 void ExtractFeatures(FeatureMap
* features
, const DoneCallback
& done_callback
);
59 // Cancels any pending feature extraction. The DoneCallback will not be run.
60 // Must be called if there is a feature extraction in progress when the page
61 // is unloaded or the PhishingDOMFeatureExtractor is destroyed.
62 void CancelPendingExtraction();
66 struct PageFeatureState
;
68 // The maximum amount of wall time that we will spend on a single extraction
69 // iteration before pausing to let other MessageLoop tasks run.
70 static const int kMaxTimePerChunkMs
;
72 // The number of elements that we will process before checking to see whether
73 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be
74 // slow, we don't do this on every element processed.
75 static const int kClockCheckGranularity
;
77 // The maximum total amount of time that the feature extractor will run
78 // before giving up on the current page.
79 static const int kMaxTotalTimeMs
;
81 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
82 // until a predefined maximum amount of time has elapsed, then posts a task
83 // to the current MessageLoop to continue extraction. When extraction
84 // finishes, calls RunCallback().
85 void ExtractFeaturesWithTimeout();
87 // Handlers for the various HTML elements that we compute features for.
88 // Since some of the features (such as ratios) cannot be computed until
89 // feature extraction is finished, these handlers do not add to the feature
90 // map directly. Instead, they update the values in the PageFeatureState.
91 void HandleLink(const blink::WebElement
& element
);
92 void HandleForm(const blink::WebElement
& element
);
93 void HandleImage(const blink::WebElement
& element
);
94 void HandleInput(const blink::WebElement
& element
);
95 void HandleScript(const blink::WebElement
& element
);
97 // Helper to verify that there is no pending feature extraction. Dies in
98 // debug builds if the state is not as expected. This is a no-op in release
100 void CheckNoPendingExtraction();
102 // Runs |done_callback_| and then clears all internal state.
103 void RunCallback(bool success
);
105 // Clears all internal feature extraction state.
108 // Called after advancing |cur_document_| to update the state in
109 // |cur_frame_data_|.
110 void ResetFrameData();
112 // Returns the next document in frame-traversal order from cur_document_.
113 // If there are no more documents, returns a null WebDocument.
114 blink::WebDocument
GetNextDocument();
116 // Given a URL, checks whether the domain is different from the domain of
117 // the current frame's URL. If so, stores the domain in |domain| and returns
118 // true, otherwise returns false.
119 bool IsExternalDomain(const GURL
& url
, std::string
* domain
) const;
121 // Called once all frames have been processed to compute features from the
122 // PageFeatureState and add them to |features_|. See features.h for a
123 // description of which features are computed.
124 void InsertFeatures();
126 // Non-owned pointer to the view that we will extract features from.
127 content::RenderView
* render_view_
;
129 // Non-owned pointer to our clock.
130 FeatureExtractorClock
* clock_
;
132 // The output parameters from the most recent call to ExtractFeatures().
133 FeatureMap
* features_
; // The caller keeps ownership of this.
134 DoneCallback done_callback_
;
136 // The current (sub-)document that we are processing. May be a null document
137 // (isNull()) if we are not currently extracting features.
138 blink::WebDocument cur_document_
;
140 // Stores extra state for |cur_document_| that will be persisted until we
141 // advance to the next frame.
142 scoped_ptr
<FrameData
> cur_frame_data_
;
144 // Stores the intermediate data used to create features. This data is
145 // accumulated across all frames in the RenderView.
146 scoped_ptr
<PageFeatureState
> page_feature_state_
;
148 // Used in scheduling ExtractFeaturesWithTimeout tasks.
149 // These pointers are invalidated if extraction is cancelled.
150 base::WeakPtrFactory
<PhishingDOMFeatureExtractor
> weak_factory_
;
152 DISALLOW_COPY_AND_ASSIGN(PhishingDOMFeatureExtractor
);
155 } // namespace safe_browsing
#endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_DOM_FEATURE_EXTRACTOR_H_