Explicitly add python-numpy dependency to install-build-deps.
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_term_feature_extractor_unittest.cc
blob 4c1560553bb0c433c8589c19f84fc574b483e54d
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
7 #include <string>
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/containers/hash_tables.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/message_loop/message_loop.h"
14 #include "base/strings/string16.h"
15 #include "base/strings/stringprintf.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/features.h"
19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "chrome/renderer/safe_browsing/test_utils.h"
22 #include "crypto/sha2.h"
23 #include "testing/gmock/include/gmock/gmock.h"
24 #include "testing/gtest/include/gtest/gtest.h"
26 using base::ASCIIToUTF16;
27 using ::testing::Return;
// Seed handed both to MurmurHash3String() when the tests hash words and
// shingles, and to the PhishingTermFeatureExtractor constructor, so that the
// expected hashes and the extractor's hashes are computed with the same seed.
30 static const uint32 kMurmurHash3Seed = 2777808611U;
32 namespace safe_browsing {
34 class PhishingTermFeatureExtractorTest : public ::testing::Test {
35 protected:
36 void SetUp() override {
37 base::hash_set<std::string> terms;
38 terms.insert("one");
39 terms.insert("one one");
40 terms.insert("two");
41 terms.insert("multi word test");
42 terms.insert("capitalization");
43 terms.insert("space");
44 terms.insert("separator");
45 terms.insert("punctuation");
46 // Chinese (translation of "hello")
47 terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
48 // Chinese (translation of "goodbye")
49 terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
51 for (base::hash_set<std::string>::iterator it = terms.begin();
52 it != terms.end(); ++it) {
53 term_hashes_.insert(crypto::SHA256HashString(*it));
56 base::hash_set<std::string> words;
57 words.insert("one");
58 words.insert("two");
59 words.insert("multi");
60 words.insert("word");
61 words.insert("test");
62 words.insert("capitalization");
63 words.insert("space");
64 words.insert("separator");
65 words.insert("punctuation");
66 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
67 words.insert("\xe5\x86\x8d\xe8\xa7\x81");
69 for (base::hash_set<std::string>::iterator it = words.begin();
70 it != words.end(); ++it) {
71 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
74 ResetExtractor(3 /* max shingles per page */);
77 void ResetExtractor(size_t max_shingles_per_page) {
78 extractor_.reset(new PhishingTermFeatureExtractor(
79 &term_hashes_,
80 &word_hashes_,
81 3 /* max_words_per_term */,
82 kMurmurHash3Seed,
83 max_shingles_per_page,
84 4 /* shingle_size */,
85 &clock_));
88 // Runs the TermFeatureExtractor on |page_text|, waiting for the
89 // completion callback. Returns the success boolean from the callback.
90 bool ExtractFeatures(const base::string16* page_text,
91 FeatureMap* features,
92 std::set<uint32>* shingle_hashes) {
93 success_ = false;
94 extractor_->ExtractFeatures(
95 page_text,
96 features,
97 shingle_hashes,
98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
99 base::Unretained(this)));
100 msg_loop_.Run();
101 return success_;
104 void PartialExtractFeatures(const base::string16* page_text,
105 FeatureMap* features,
106 std::set<uint32>* shingle_hashes) {
107 extractor_->ExtractFeatures(
108 page_text,
109 features,
110 shingle_hashes,
111 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
112 base::Unretained(this)));
113 msg_loop_.PostTask(
114 FROM_HERE,
115 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
116 base::Unretained(this)));
117 msg_loop_.RunUntilIdle();
120 // Completion callback for feature extraction.
121 void ExtractionDone(bool success) {
122 success_ = success;
123 msg_loop_.Quit();
126 void QuitExtraction() {
127 extractor_->CancelPendingExtraction();
128 msg_loop_.Quit();
131 base::MessageLoop msg_loop_;
132 MockFeatureExtractorClock clock_;
133 scoped_ptr<PhishingTermFeatureExtractor> extractor_;
134 base::hash_set<std::string> term_hashes_;
135 base::hash_set<uint32> word_hashes_;
136 bool success_; // holds the success value from ExtractFeatures
139 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
140 // This test doesn't exercise the extraction timing.
141 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
143 base::string16 page_text = ASCIIToUTF16("blah");
144 FeatureMap expected_features; // initially empty
145 std::set<uint32> expected_shingle_hashes;
147 FeatureMap features;
148 std::set<uint32> shingle_hashes;
149 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
150 ExpectFeatureMapsAreEqual(features, expected_features);
151 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
153 page_text = ASCIIToUTF16("one one");
154 expected_features.Clear();
155 expected_features.AddBooleanFeature(features::kPageTerm +
156 std::string("one"));
157 expected_features.AddBooleanFeature(features::kPageTerm +
158 std::string("one one"));
159 expected_shingle_hashes.clear();
161 features.Clear();
162 shingle_hashes.clear();
163 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
164 ExpectFeatureMapsAreEqual(features, expected_features);
165 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
167 page_text = ASCIIToUTF16("bla bla multi word test bla");
168 expected_features.Clear();
169 expected_features.AddBooleanFeature(features::kPageTerm +
170 std::string("multi word test"));
171 expected_shingle_hashes.clear();
172 expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
173 kMurmurHash3Seed));
174 expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
175 kMurmurHash3Seed));
176 expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
177 kMurmurHash3Seed));
179 features.Clear();
180 shingle_hashes.clear();
181 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
182 ExpectFeatureMapsAreEqual(features, expected_features);
183 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
185 // This text has all of the words for one of the terms, but they are
186 // not in the correct order.
187 page_text = ASCIIToUTF16("bla bla test word multi bla");
188 expected_features.Clear();
189 expected_shingle_hashes.clear();
190 expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
191 kMurmurHash3Seed));
192 expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
193 kMurmurHash3Seed));
194 expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
195 kMurmurHash3Seed));
197 features.Clear();
198 shingle_hashes.clear();
199 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
200 ExpectFeatureMapsAreEqual(features, expected_features);
201 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
203 // Test various separators.
204 page_text = ASCIIToUTF16("Capitalization plus non-space\n"
205 "separator... punctuation!");
206 expected_features.Clear();
207 expected_features.AddBooleanFeature(features::kPageTerm +
208 std::string("capitalization"));
209 expected_features.AddBooleanFeature(features::kPageTerm +
210 std::string("space"));
211 expected_features.AddBooleanFeature(features::kPageTerm +
212 std::string("separator"));
213 expected_features.AddBooleanFeature(features::kPageTerm +
214 std::string("punctuation"));
215 expected_shingle_hashes.clear();
216 expected_shingle_hashes.insert(
217 MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
218 expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
219 kMurmurHash3Seed));
220 expected_shingle_hashes.insert(
221 MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
223 features.Clear();
224 shingle_hashes.clear();
225 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
226 ExpectFeatureMapsAreEqual(features, expected_features);
227 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
229 // Test a page with too many words and we should only 3 minimum hashes.
230 page_text = ASCIIToUTF16("This page has way too many words.");
231 expected_features.Clear();
232 expected_shingle_hashes.clear();
233 expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
234 kMurmurHash3Seed));
235 expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
236 kMurmurHash3Seed));
237 expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
238 kMurmurHash3Seed));
239 expected_shingle_hashes.insert(MurmurHash3String("way too many words ",
240 kMurmurHash3Seed));
241 std::set<uint32>::iterator it = expected_shingle_hashes.end();
242 expected_shingle_hashes.erase(--it);
244 features.Clear();
245 shingle_hashes.clear();
246 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
247 ExpectFeatureMapsAreEqual(features, expected_features);
248 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
250 // Test with empty page text.
251 page_text = base::string16();
252 expected_features.Clear();
253 expected_shingle_hashes.clear();
254 features.Clear();
255 shingle_hashes.clear();
256 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
257 ExpectFeatureMapsAreEqual(features, expected_features);
258 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
260 #if !defined(OS_ANDROID)
261 // The test code is disabled due to http://crbug.com/392234
262 // The client-side detection feature is not enabled on Android yet.
263 // If we decided to enable the feature, we need to fix the bug first.
265 // Chinese translation of the phrase "hello goodbye hello goodbye". This tests
266 // that we can correctly separate terms in languages that don't use spaces.
267 page_text =
268 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
269 "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
270 expected_features.Clear();
271 expected_features.AddBooleanFeature(
272 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
273 expected_features.AddBooleanFeature(
274 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
275 expected_shingle_hashes.clear();
276 expected_shingle_hashes.insert(MurmurHash3String(
277 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
278 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
280 features.Clear();
281 shingle_hashes.clear();
282 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
283 ExpectFeatureMapsAreEqual(features, expected_features);
284 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
285 #endif
288 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
289 // For this test, we'll cause the feature extraction to run multiple
290 // iterations by incrementing the clock.
291 ResetExtractor(200 /* max shingles per page */);
293 // This page has a total of 30 words. For the features to be computed
294 // correctly, the extractor has to process the entire string of text.
295 base::string16 page_text(ASCIIToUTF16("one "));
296 for (int i = 0; i < 28; ++i) {
297 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
299 page_text.append(ASCIIToUTF16("two"));
301 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
302 // Note that this assumes kClockCheckGranularity = 5 and
303 // kMaxTimePerChunkMs = 10.
304 base::TimeTicks now = base::TimeTicks::Now();
305 EXPECT_CALL(clock_, Now())
306 // Time check at the start of extraction.
307 .WillOnce(Return(now))
308 // Time check at the start of the first chunk of work.
309 .WillOnce(Return(now))
310 // Time check after the first 5 words.
311 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
312 // Time check after the next 5 words.
313 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
314 // Time check after the next 5 words.
315 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
316 // Time check after the next 5 words. This is over the chunk
317 // time limit, so a continuation task will be posted.
318 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
319 // Time check at the start of the second chunk of work.
320 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
321 // Time check after the next 5 words.
322 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
323 // Time check after the next 5 words.
324 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
325 // A final check for the histograms.
326 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
328 FeatureMap expected_features;
329 expected_features.AddBooleanFeature(features::kPageTerm +
330 std::string("one"));
331 expected_features.AddBooleanFeature(features::kPageTerm +
332 std::string("two"));
333 std::set<uint32> expected_shingle_hashes;
334 expected_shingle_hashes.insert(
335 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
336 expected_shingle_hashes.insert(
337 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
338 expected_shingle_hashes.insert(
339 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
340 expected_shingle_hashes.insert(
341 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
342 expected_shingle_hashes.insert(
343 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
344 expected_shingle_hashes.insert(
345 MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
346 expected_shingle_hashes.insert(
347 MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
348 expected_shingle_hashes.insert(
349 MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
350 expected_shingle_hashes.insert(
351 MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
352 expected_shingle_hashes.insert(
353 MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
354 expected_shingle_hashes.insert(
355 MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
356 expected_shingle_hashes.insert(
357 MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
358 expected_shingle_hashes.insert(
359 MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
360 expected_shingle_hashes.insert(
361 MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
362 expected_shingle_hashes.insert(
363 MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
364 expected_shingle_hashes.insert(
365 MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
366 expected_shingle_hashes.insert(
367 MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
368 expected_shingle_hashes.insert(
369 MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
370 expected_shingle_hashes.insert(
371 MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
372 expected_shingle_hashes.insert(
373 MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
374 expected_shingle_hashes.insert(
375 MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
376 expected_shingle_hashes.insert(
377 MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
378 expected_shingle_hashes.insert(
379 MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
380 expected_shingle_hashes.insert(
381 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
382 expected_shingle_hashes.insert(
383 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
384 expected_shingle_hashes.insert(
385 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
386 expected_shingle_hashes.insert(
387 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
389 FeatureMap features;
390 std::set<uint32> shingle_hashes;
391 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
392 ExpectFeatureMapsAreEqual(features, expected_features);
393 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
394 // Make sure none of the mock expectations carry over to the next test.
395 ::testing::Mock::VerifyAndClearExpectations(&clock_);
397 // Now repeat the test with the same text, but advance the clock faster so
398 // that the extraction time exceeds the maximum total time for the feature
399 // extractor. Extraction should fail. Note that this assumes
400 // kMaxTotalTimeMs = 500.
401 EXPECT_CALL(clock_, Now())
402 // Time check at the start of extraction.
403 .WillOnce(Return(now))
404 // Time check at the start of the first chunk of work.
405 .WillOnce(Return(now))
406 // Time check after the first 5 words,
407 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
408 // Time check at the start of the second chunk of work.
409 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
410 // Time check after the next 5 words. This is over the limit.
411 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
412 // A final time check for the histograms.
413 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
415 features.Clear();
416 shingle_hashes.clear();
417 EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
420 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
421 scoped_ptr<base::string16> page_text(
422 new base::string16(ASCIIToUTF16("one ")));
423 for (int i = 0; i < 28; ++i) {
424 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
427 base::TimeTicks now = base::TimeTicks::Now();
428 EXPECT_CALL(clock_, Now())
429 // Time check at the start of extraction.
430 .WillOnce(Return(now))
431 // Time check at the start of the first chunk of work.
432 .WillOnce(Return(now))
433 // Time check after the first 5 words.
434 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
435 // Time check after the next 5 words. This should be greater than
436 // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
437 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
439 FeatureMap features;
440 std::set<uint32> shingle_hashes;
441 // Extract first 10 words then stop.
442 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
444 page_text.reset(new base::string16());
445 for (int i = 30; i < 58; ++i) {
446 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
448 page_text->append(ASCIIToUTF16("multi word test "));
449 features.Clear();
450 shingle_hashes.clear();
452 // This part doesn't exercise the extraction timing.
453 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
455 // Now extract normally and make sure nothing breaks.
456 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
458 FeatureMap expected_features;
459 expected_features.AddBooleanFeature(features::kPageTerm +
460 std::string("multi word test"));
461 ExpectFeatureMapsAreEqual(features, expected_features);
464 } // namespace safe_browsing