1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
10 #include "base/callback.h"
11 #include "base/containers/hash_tables.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/message_loop/message_loop.h"
14 #include "base/strings/string16.h"
15 #include "base/strings/stringprintf.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/features.h"
19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "chrome/renderer/safe_browsing/test_utils.h"
22 #include "crypto/sha2.h"
23 #include "testing/gmock/include/gmock/gmock.h"
24 #include "testing/gtest/include/gtest/gtest.h"
26 using base::ASCIIToUTF16
;
27 using ::testing::Return
;
29 namespace safe_browsing
{
// Test fixture for PhishingTermFeatureExtractor.  SetUp() fills term_hashes_
// and word_hashes_ and creates extractor_; the helper methods start (or
// cancel) an extraction, and the members at the end hold the shared state.
26 class PhishingTermFeatureExtractorTest
: public ::testing::Test
{
// Builds the hashed term and word sets and creates the extractor under test.
29 virtual void SetUp() {
// Plain-text terms (single- and multi-word) that the tests below look for.
30 base::hash_set
<std::string
> terms
;
36 terms
.insert("one one");
38 terms
.insert("multi word test");
39 terms
.insert("capitalization");
40 terms
.insert("space");
41 terms
.insert("separator");
42 terms
.insert("punctuation");
43 // Chinese (translation of "hello")
44 terms
.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
45 // Chinese (translation of "goodbye")
46 terms
.insert("\xe5\x86\x8d\xe8\xa7\x81");
// Store the SHA-256 hash of every term in term_hashes_.
48 for (base::hash_set
<std::string
>::iterator it
= terms
.begin();
49 it
!= terms
.end(); ++it
) {
50 term_hashes_
.insert(crypto::SHA256HashString(*it
));
// Individual words; "multi", for example, is one word of the term
// "multi word test" inserted above.
53 base::hash_set
<std::string
> words
;
56 words
.insert("multi");
59 words
.insert("capitalization");
60 words
.insert("space");
61 words
.insert("separator");
62 words
.insert("punctuation");
63 words
.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
64 words
.insert("\xe5\x86\x8d\xe8\xa7\x81");
// Store the MurmurHash3 hash of every word, computed with the seed below,
// in word_hashes_.
66 static const uint32 kMurmurHash3Seed
= 2777808611U;
67 for (base::hash_set
<std::string
>::iterator it
= words
.begin();
68 it
!= words
.end(); ++it
) {
69 word_hashes_
.insert(MurmurHash3String(*it
, kMurmurHash3Seed
));
// Create the extractor under test; a term may span at most 3 words.
72 extractor_
.reset(new PhishingTermFeatureExtractor(
75 3 /* max_words_per_term */,
80 // Runs the TermFeatureExtractor on |page_text|, waiting for the
81 // completion callback. Returns the success boolean from the callback.
82 bool ExtractFeatures(const base::string16
* page_text
, FeatureMap
* features
) {
// ExtractionDone() is bound as the completion callback.
84 extractor_
->ExtractFeatures(
87 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone
,
88 base::Unretained(this)));
// Starts an extraction on |page_text| with ExtractionDone() as the completion
// callback, binds QuitExtraction() as a second callback, and then runs the
// message loop until it is idle.
93 void PartialExtractFeatures(const base::string16
* page_text
,
94 FeatureMap
* features
) {
95 extractor_
->ExtractFeatures(
98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone
,
99 base::Unretained(this)));
102 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction
,
103 base::Unretained(this)));
104 msg_loop_
.RunUntilIdle();
107 // Completion callback for feature extraction.
108 void ExtractionDone(bool success
) {
// Cancels whatever extraction is still pending on extractor_.
113 void QuitExtraction() {
114 extractor_
->CancelPendingExtraction();
// Shared test state.  clock_ is a mock whose Now() return values are scripted
// by the individual tests through EXPECT_CALL.
118 base::MessageLoop msg_loop_
;
119 MockFeatureExtractorClock clock_
;
120 scoped_ptr
<PhishingTermFeatureExtractor
> extractor_
;
// Hashes computed in SetUp(): SHA-256 of each term, MurmurHash3 of each word.
121 base::hash_set
<std::string
> term_hashes_
;
122 base::hash_set
<uint32
> word_hashes_
;
123 bool success_
; // holds the success value from ExtractFeatures
// Runs the extractor over a series of page texts and compares the extracted
// feature map with the expected one each time: text with no matching term,
// a repeated-word term, a three-word term, the same words out of order,
// mixed capitalization and punctuation, empty text, and Chinese text that
// contains no spaces.
126 TEST_F(PhishingTermFeatureExtractorTest
, ExtractFeatures
) {
127 // This test doesn't exercise the extraction timing.
128 EXPECT_CALL(clock_
, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
// "blah" is not one of the terms, so no features are expected.
130 base::string16 page_text
= ASCIIToUTF16("blah");
131 FeatureMap expected_features
; // initially empty
134 ASSERT_TRUE(ExtractFeatures(&page_text
, &features
));
135 ExpectFeatureMapsAreEqual(features
, expected_features
);
// The page text is exactly the two-word term "one one".
137 page_text
= ASCIIToUTF16("one one");
138 expected_features
.Clear();
139 expected_features
.AddBooleanFeature(features::kPageTerm
+
141 expected_features
.AddBooleanFeature(features::kPageTerm
+
142 std::string("one one"));
145 ASSERT_TRUE(ExtractFeatures(&page_text
, &features
));
146 ExpectFeatureMapsAreEqual(features
, expected_features
);
// A three-word term surrounded by unrelated words.
148 page_text
= ASCIIToUTF16("bla bla multi word test bla");
149 expected_features
.Clear();
150 expected_features
.AddBooleanFeature(features::kPageTerm
+
151 std::string("multi word test"));
154 ASSERT_TRUE(ExtractFeatures(&page_text
, &features
));
155 ExpectFeatureMapsAreEqual(features
, expected_features
);
157 // This text has all of the words for one of the terms, but they are
158 // not in the correct order.
159 page_text
= ASCIIToUTF16("bla bla test word multi bla");
160 expected_features
.Clear();
163 ASSERT_TRUE(ExtractFeatures(&page_text
, &features
));
164 ExpectFeatureMapsAreEqual(features
, expected_features
);
// Terms written with a capital letter, joined by a hyphen, split by a
// newline, or followed by punctuation; the expected features use the
// lower-case term text.
166 page_text
= ASCIIToUTF16("Capitalization plus non-space\n"
167 "separator... punctuation!");
168 expected_features
.Clear();
169 expected_features
.AddBooleanFeature(features::kPageTerm
+
170 std::string("capitalization"));
171 expected_features
.AddBooleanFeature(features::kPageTerm
+
172 std::string("space"));
173 expected_features
.AddBooleanFeature(features::kPageTerm
+
174 std::string("separator"));
175 expected_features
.AddBooleanFeature(features::kPageTerm
+
176 std::string("punctuation"));
179 ASSERT_TRUE(ExtractFeatures(&page_text
, &features
));
180 ExpectFeatureMapsAreEqual(features
, expected_features
);
182 // Test with empty page text.
183 page_text
= base::string16();
184 expected_features
.Clear();
186 ASSERT_TRUE(ExtractFeatures(&page_text
, &features
));
187 ExpectFeatureMapsAreEqual(features
, expected_features
);
189 // Chinese translation of the phrase "hello goodbye". This tests that
190 // we can correctly separate terms in languages that don't use spaces.
192 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
193 expected_features
.Clear();
// Both halves of the phrase ("hello" and "goodbye") are expected as
// separate features.
194 expected_features
.AddBooleanFeature(
195 features::kPageTerm
+ std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
196 expected_features
.AddBooleanFeature(
197 features::kPageTerm
+ std::string("\xe5\x86\x8d\xe8\xa7\x81"));
200 ASSERT_TRUE(ExtractFeatures(&page_text
, &features
));
201 ExpectFeatureMapsAreEqual(features
, expected_features
);
// Scripts the mock clock so that one extraction is split across several
// chunks of work.  The first run stays within the time limits and must
// succeed; the second run uses the same text with a faster clock and must
// fail.
204 TEST_F(PhishingTermFeatureExtractorTest
, Continuation
) {
205 // For this test, we'll cause the feature extraction to run multiple
206 // iterations by incrementing the clock.
208 // This page has a total of 30 words. For the features to be computed
209 // correctly, the extractor has to process the entire string of text.
// Text layout: "one", then the numbers 0 through 27, then "two".
210 base::string16
page_text(ASCIIToUTF16("one "));
211 for (int i
= 0; i
< 28; ++i
) {
212 page_text
.append(ASCIIToUTF16(base::StringPrintf("%d ", i
)));
214 page_text
.append(ASCIIToUTF16("two"));
216 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
217 // Note that this assumes kClockCheckGranularity = 5 and
218 // kMaxTimePerChunkMs = 10.
219 base::TimeTicks now
= base::TimeTicks::Now();
// Each WillOnce() below answers exactly one Now() call, in this order.
220 EXPECT_CALL(clock_
, Now())
221 // Time check at the start of extraction.
222 .WillOnce(Return(now
))
223 // Time check at the start of the first chunk of work.
224 .WillOnce(Return(now
))
225 // Time check after the first 5 words.
226 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(3)))
227 // Time check after the next 5 words.
228 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(6)))
229 // Time check after the next 5 words.
230 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(9)))
231 // Time check after the next 5 words. This is over the chunk
232 // time limit, so a continuation task will be posted.
233 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(12)))
234 // Time check at the start of the second chunk of work.
235 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(22)))
236 // Time check after the next 5 words.
237 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(25)))
238 // Time check after the next 5 words.
239 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(28)))
240 // A final check for the histograms.
241 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(30)));
243 FeatureMap expected_features
;
244 expected_features
.AddBooleanFeature(features::kPageTerm
+
246 expected_features
.AddBooleanFeature(features::kPageTerm
+
250 ASSERT_TRUE(ExtractFeatures(&page_text
, &features
));
251 ExpectFeatureMapsAreEqual(features
, expected_features
);
252 // Make sure none of the mock expectations carry over to the next test.
253 ::testing::Mock::VerifyAndClearExpectations(&clock_
);
255 // Now repeat the test with the same text, but advance the clock faster so
256 // that the extraction time exceeds the maximum total time for the feature
257 // extractor. Extraction should fail. Note that this assumes
258 // kMaxTotalTimeMs = 500.
259 EXPECT_CALL(clock_
, Now())
260 // Time check at the start of extraction.
261 .WillOnce(Return(now
))
262 // Time check at the start of the first chunk of work.
263 .WillOnce(Return(now
))
264 // Time check after the first 5 words. At 300 ms this is already far
// over the per-chunk limit assumed above, so the remaining work moves
// to a second chunk.
265 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(300)))
266 // Time check at the start of the second chunk of work.
267 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(350)))
268 // Time check after the next 5 words. This is over the limit.
269 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(600)))
270 // A final time check for the histograms.
271 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(620)));
// The total-time limit was exceeded, so the callback must report failure.
274 EXPECT_FALSE(ExtractFeatures(&page_text
, &features
));
// Starts an extraction that is stopped part-way through, then runs a normal
// extraction on a different text and checks that it succeeds and yields only
// the feature belonging to the second text.
277 TEST_F(PhishingTermFeatureExtractorTest
, PartialExtractionTest
) {
// First text: "one" followed by the numbers 0 through 27.
278 scoped_ptr
<base::string16
> page_text(
279 new base::string16(ASCIIToUTF16("one ")));
280 for (int i
= 0; i
< 28; ++i
) {
281 page_text
->append(ASCIIToUTF16(base::StringPrintf("%d ", i
)));
284 base::TimeTicks now
= base::TimeTicks::Now();
// Scripted clock: 7 ms elapse per 5 words, so the second check exceeds the
// per-chunk limit.
285 EXPECT_CALL(clock_
, Now())
286 // Time check at the start of extraction.
287 .WillOnce(Return(now
))
288 // Time check at the start of the first chunk of work.
289 .WillOnce(Return(now
))
290 // Time check after the first 5 words.
291 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(7)))
292 // Time check after the next 5 words. This should be greater than
293 // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
294 .WillOnce(Return(now
+ base::TimeDelta::FromMilliseconds(14)));
297 // Extract first 10 words then stop.
298 PartialExtractFeatures(page_text
.get(), &features
);
// Second text: the numbers 30 through 57 followed by the term
// "multi word test"; it does not contain "one".
300 page_text
.reset(new base::string16());
301 for (int i
= 30; i
< 58; ++i
) {
302 page_text
->append(ASCIIToUTF16(base::StringPrintf("%d ", i
)));
304 page_text
->append(ASCIIToUTF16("multi word test "));
307 // This part doesn't exercise the extraction timing.
308 EXPECT_CALL(clock_
, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
310 // Now extract normally and make sure nothing breaks.
311 EXPECT_TRUE(ExtractFeatures(page_text
.get(), &features
));
// Only the term from the second text is expected in the feature map.
313 FeatureMap expected_features
;
314 expected_features
.AddBooleanFeature(features::kPageTerm
+
315 std::string("multi word test"));
316 ExpectFeatureMapsAreEqual(features
, expected_features
);
319 } // namespace safe_browsing