1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
// This class handles the process of extracting all of the features from a
// page and computing a phishiness score.  The basic steps are:
//  - Run each feature extractor over the page, building up a FeatureMap of
//    feature names to values.
//  - SHA-256 hash all of the feature names in the map so that they match the
//    hashed names used by the supplied model.
//  - Hand the hashed map off to a Scorer, which computes the probability that
//    the page is phishy.
//  - If the page is phishy, run the supplied callback.
//
// For more details, see phishing_*_feature_extractor.h, scorer.h, and
// client_model.proto.
18 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
19 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_
21 #include "base/basictypes.h"
22 #include "base/callback.h"
23 #include "base/memory/scoped_ptr.h"
24 #include "base/memory/weak_ptr.h"
25 #include "base/strings/string16.h"
31 namespace safe_browsing
{
32 class ClientPhishingRequest
;
33 class FeatureExtractorClock
;
35 class PhishingDOMFeatureExtractor
;
36 class PhishingTermFeatureExtractor
;
37 class PhishingUrlFeatureExtractor
;
40 class PhishingClassifier
{
42 // Callback to be run when phishing classification finishes. The verdict
43 // is a ClientPhishingRequest which contains the verdict computed by the
44 // classifier as well as the extracted features. If the verdict.is_phishing()
45 // is true, the page is considered phishy by the client-side model,
46 // and the browser should ping back to get a final verdict. The
47 // verdict.client_score() is set to kInvalidScore if classification failed.
48 typedef base::Callback
<void(const ClientPhishingRequest
& /* verdict */)>
51 static const float kInvalidScore
;
53 // Creates a new PhishingClassifier object that will operate on
54 // |render_view|. |clock| is used to time feature extractor operations, and
55 // the PhishingClassifier takes ownership of this object. Note that the
56 // classifier will not be 'ready' until set_phishing_scorer() is called.
57 PhishingClassifier(content::RenderView
* render_view
,
58 FeatureExtractorClock
* clock
);
59 virtual ~PhishingClassifier();
61 // Sets a scorer for the classifier to use in computing the phishiness score.
62 // This must live at least as long as the PhishingClassifier. The caller is
63 // expected to cancel any pending classification before setting a phishing
65 void set_phishing_scorer(const Scorer
* scorer
);
67 // Returns true if the classifier is ready to classify pages, i.e. it
68 // has had a scorer set via set_phishing_scorer().
69 bool is_ready() const;
71 // Called by the RenderView when a page has finished loading. This begins
72 // the feature extraction and scoring process. |page_text| should contain
73 // the plain text of a web page, including any subframes, as returned by
74 // RenderView::CaptureText(). |page_text| is owned by the caller, and must
75 // not be destroyed until either |done_callback| is run or
76 // CancelPendingClassification() is called.
78 // To avoid blocking the render thread for too long, phishing classification
79 // may run in several chunks of work, posting a task to the current
80 // MessageLoop to continue processing. Once the scoring process is complete,
81 // |done_callback| is run on the current thread. PhishingClassifier takes
82 // ownership of the callback.
84 // It is an error to call BeginClassification if the classifier is not yet
86 virtual void BeginClassification(const base::string16
* page_text
,
87 const DoneCallback
& callback
);
89 // Called by the RenderView (on the render thread) when a page is unloading
90 // or the RenderView is being destroyed. This cancels any extraction that
91 // is in progress. It is an error to call CancelPendingClassification if
92 // the classifier is not yet ready.
93 virtual void CancelPendingClassification();
96 // Any score equal to or above this value is considered phishy.
97 static const float kPhishyThreshold
;
99 // Begins the feature extraction process, by extracting URL features and
100 // beginning DOM feature extraction.
101 void BeginFeatureExtraction();
103 // Callback to be run when DOM feature extraction is complete.
104 // If it was successful, begins term feature extraction, otherwise
105 // runs the DoneCallback with a non-phishy verdict.
106 void DOMExtractionFinished(bool success
);
108 // Callback to be run when term feature extraction is complete.
109 // If it was successful, computes a score and runs the DoneCallback.
110 // If extraction was unsuccessful, runs the DoneCallback with a
111 // non-phishy verdict.
112 void TermExtractionFinished(bool success
);
114 // Helper to verify that there is no pending phishing classification. Dies
115 // in debug builds if the state is not as expected. This is a no-op in
117 void CheckNoPendingClassification();
119 // Helper method to run the DoneCallback and clear the state.
120 void RunCallback(const ClientPhishingRequest
& verdict
);
122 // Helper to run the DoneCallback when feature extraction has failed.
123 // This always signals a non-phishy verdict for the page, with kInvalidScore.
124 void RunFailureCallback();
126 // Clears the current state of the PhishingClassifier.
129 content::RenderView
* render_view_
; // owns us
130 const Scorer
* scorer_
; // owned by the caller
131 scoped_ptr
<FeatureExtractorClock
> clock_
;
132 scoped_ptr
<PhishingUrlFeatureExtractor
> url_extractor_
;
133 scoped_ptr
<PhishingDOMFeatureExtractor
> dom_extractor_
;
134 scoped_ptr
<PhishingTermFeatureExtractor
> term_extractor_
;
136 // State for any in-progress extraction.
137 scoped_ptr
<FeatureMap
> features_
;
138 const base::string16
* page_text_
; // owned by the caller
139 DoneCallback done_callback_
;
141 // Used in scheduling BeginFeatureExtraction tasks.
142 // These pointers are invalidated if classification is cancelled.
143 base::WeakPtrFactory
<PhishingClassifier
> weak_factory_
;
145 DISALLOW_COPY_AND_ASSIGN(PhishingClassifier
);
148 } // namespace safe_browsing
150 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_CLASSIFIER_H_