1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // PhishingTermFeatureExtractor handles computing term features from the text
6 // of a web page for the client-side phishing detection model. To do this, it
7 // takes a list of terms that appear in the model, and scans through the page
8 // text looking for them. Any terms that appear will cause a corresponding
9 // features::kPageTerm feature to be added to the FeatureMap.
11 // To make it harder for a phisher to enumerate all of the relevant terms in
12 // the model, the terms are provided as SHA-256 hashes, rather than plain text.
14 // There is one PhishingTermFeatureExtractor per RenderView.
16 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
17 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
21 #include "base/basictypes.h"
22 #include "base/callback.h"
23 #include "base/containers/hash_tables.h"
24 #include "base/containers/mru_cache.h"
25 #include "base/memory/scoped_ptr.h"
26 #include "base/memory/weak_ptr.h"
27 #include "base/strings/string16.h"
28 #include "base/strings/string_piece.h"
30 namespace safe_browsing
{
31 class FeatureExtractorClock
;
34 class PhishingTermFeatureExtractor
{
36 // Callback to be run when feature extraction finishes. The callback
37 // argument is true if extraction was successful, false otherwise.
38 typedef base::Callback
<void(bool)> DoneCallback
;
40 // Creates a PhishingTermFeatureExtractor which will extract features for
41 // all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
42 // terms may be multi-word n-grams, with at most |max_words_per_term| words.
44 // |page_word_hashes| contains the murmur3 hashes for all of the individual
45 // words that make up the terms. Both sets of strings are UTF-8 encoded and
46 // lowercased prior to hashing. The caller owns both sets of strings, and
47 // must ensure that they are valid until the PhishingTermFeatureExtractor is
50 // |clock| is used for timing feature extractor operations, and may be mocked
51 // for testing. The caller keeps ownership of the clock.
52 PhishingTermFeatureExtractor(
53 const base::hash_set
<std::string
>* page_term_hashes
,
54 const base::hash_set
<uint32
>* page_word_hashes
,
55 size_t max_words_per_term
,
56 uint32 murmurhash3_seed
,
57 FeatureExtractorClock
* clock
);
58 ~PhishingTermFeatureExtractor();
60 // Begins extracting features from |page_text| into the given FeatureMap.
61 // |page_text| should contain the plain text of a web page, including any
62 // subframes, as returned by RenderView::CaptureText().
64 // To avoid blocking the render thread for too long, the feature extractor
65 // may run in several chunks of work, posting a task to the current
66 // MessageLoop to continue processing. Once feature extraction is complete,
67 // |done_callback| is run on the current thread.
68 // PhishingTermFeatureExtractor takes ownership of the callback.
70 // |page_text| and |features| are owned by the caller, and must not be
71 // destroyed until either |done_callback| is run or
72 // CancelPendingExtraction() is called.
73 void ExtractFeatures(const base::string16
* page_text
,
75 const DoneCallback
& done_callback
);
77 // Cancels any pending feature extraction. The DoneCallback will not be run.
78 // Must be called if there is a feature extraction in progress when the page
79 // is unloaded or the PhishingTermFeatureExtractor is destroyed.
80 void CancelPendingExtraction();
83 struct ExtractionState
;
85 // The maximum amount of wall time that we will spend on a single extraction
86 // iteration before pausing to let other MessageLoop tasks run.
87 static const int kMaxTimePerChunkMs
;
89 // The number of words that we will process before checking to see whether
90 // kMaxTimePerChunkMs has elapsed. Since checking the current time can be
91 // slow, we don't do this on every word processed.
92 static const int kClockCheckGranularity
;
94 // The maximum total amount of time that the feature extractor will run
95 // before giving up on the current page.
96 static const int kMaxTotalTimeMs
;
98 // The size of the cache that we use to determine if we can avoid lower
99 // casing, hashing, and UTF conversion.
100 static const int kMaxNegativeWordCacheSize
;
102 // Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
103 // until a predefined maximum amount of time has elapsed, then posts a task
104 // to the current MessageLoop to continue extraction. When extraction
105 // finishes, calls RunCallback().
106 void ExtractFeaturesWithTimeout();
108 // Handles a single word in the page text.
109 void HandleWord(const base::StringPiece16
& word
);
111 // Helper to verify that there is no pending feature extraction. Dies in
112 // debug builds if the state is not as expected. This is a no-op in release
114 void CheckNoPendingExtraction();
116 // Runs |done_callback_| and then clears all internal state.
117 void RunCallback(bool success
);
119 // Clears all internal feature extraction state.
122 // All of the term hashes that we are looking for in the page.
123 const base::hash_set
<std::string
>* page_term_hashes_
;
125 // Murmur3 hashes of all the individual words in page_term_hashes_. If
126 // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
127 // would contain (hashed) "one" and "two". We do this so that we can have a
128 // quick out in the common case that the current word we are processing
129 // doesn't contain any part of one of our terms.
130 const base::hash_set
<uint32
>* page_word_hashes_
;
132 // The maximum number of words in an n-gram.
133 const size_t max_words_per_term_
;
135 // The seed for murmurhash3.
136 const uint32 murmurhash3_seed_
;
138 // This cache is used to see if we need to check the word at all, as
139 // converting to UTF8, lowercasing, and hashing are all relatively expensive
140 // operations. Though this is called an MRU cache, it seems to behave like
141 // an LRU cache (i.e. it evicts the oldest accesses first).
142 typedef base::HashingMRUCache
<base::StringPiece16
, bool> WordCache
;
143 WordCache negative_word_cache_
;
145 // Non-owned pointer to our clock.
146 FeatureExtractorClock
* clock_
;
148 // The output parameters from the most recent call to ExtractFeatures().
149 const base::string16
* page_text_
; // The caller keeps ownership of this.
150 FeatureMap
* features_
; // The caller keeps ownership of this.
151 DoneCallback done_callback_
;
153 // Stores the current state of term extraction from |page_text_|.
154 scoped_ptr
<ExtractionState
> state_
;
156 // Used in scheduling ExtractFeaturesWithTimeout tasks.
157 // These pointers are invalidated if extraction is cancelled.
158 base::WeakPtrFactory
<PhishingTermFeatureExtractor
> weak_factory_
;
160 DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor
);
163 } // namespace safe_browsing
165 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_