Pin Chrome's shortcut to the Win10 Start menu on install and OS upgrade.
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_term_feature_extractor_unittest.cc
blob78967586bf32e78a37dbd8f6eae374977beaa17b
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
7 #include <string>
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/containers/hash_tables.h"
12 #include "base/location.h"
13 #include "base/memory/scoped_ptr.h"
14 #include "base/message_loop/message_loop.h"
15 #include "base/single_thread_task_runner.h"
16 #include "base/strings/string16.h"
17 #include "base/strings/stringprintf.h"
18 #include "base/strings/utf_string_conversions.h"
19 #include "base/time/time.h"
20 #include "chrome/renderer/safe_browsing/features.h"
21 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
22 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
23 #include "chrome/renderer/safe_browsing/test_utils.h"
24 #include "crypto/sha2.h"
25 #include "testing/gmock/include/gmock/gmock.h"
26 #include "testing/gtest/include/gtest/gtest.h"
28 using base::ASCIIToUTF16;
29 using ::testing::Return;
// Seed used for every MurmurHash3 computation in this file: it is passed to
// MurmurHash3String() when building the expected word and shingle hashes, and
// to the PhishingTermFeatureExtractor constructor in ResetExtractor().
32 static const uint32 kMurmurHash3Seed = 2777808611U;
34 namespace safe_browsing {
36 class PhishingTermFeatureExtractorTest : public ::testing::Test {
37 protected:
38 void SetUp() override {
39 base::hash_set<std::string> terms;
40 terms.insert("one");
41 terms.insert("one one");
42 terms.insert("two");
43 terms.insert("multi word test");
44 terms.insert("capitalization");
45 terms.insert("space");
46 terms.insert("separator");
47 terms.insert("punctuation");
48 // Chinese (translation of "hello")
49 terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
50 // Chinese (translation of "goodbye")
51 terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
53 for (base::hash_set<std::string>::iterator it = terms.begin();
54 it != terms.end(); ++it) {
55 term_hashes_.insert(crypto::SHA256HashString(*it));
58 base::hash_set<std::string> words;
59 words.insert("one");
60 words.insert("two");
61 words.insert("multi");
62 words.insert("word");
63 words.insert("test");
64 words.insert("capitalization");
65 words.insert("space");
66 words.insert("separator");
67 words.insert("punctuation");
68 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
69 words.insert("\xe5\x86\x8d\xe8\xa7\x81");
71 for (base::hash_set<std::string>::iterator it = words.begin();
72 it != words.end(); ++it) {
73 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
76 ResetExtractor(3 /* max shingles per page */);
79 void ResetExtractor(size_t max_shingles_per_page) {
80 extractor_.reset(new PhishingTermFeatureExtractor(
81 &term_hashes_,
82 &word_hashes_,
83 3 /* max_words_per_term */,
84 kMurmurHash3Seed,
85 max_shingles_per_page,
86 4 /* shingle_size */,
87 &clock_));
90 // Runs the TermFeatureExtractor on |page_text|, waiting for the
91 // completion callback. Returns the success boolean from the callback.
92 bool ExtractFeatures(const base::string16* page_text,
93 FeatureMap* features,
94 std::set<uint32>* shingle_hashes) {
95 success_ = false;
96 extractor_->ExtractFeatures(
97 page_text,
98 features,
99 shingle_hashes,
100 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
101 base::Unretained(this)));
102 msg_loop_.Run();
103 return success_;
106 void PartialExtractFeatures(const base::string16* page_text,
107 FeatureMap* features,
108 std::set<uint32>* shingle_hashes) {
109 extractor_->ExtractFeatures(
110 page_text,
111 features,
112 shingle_hashes,
113 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
114 base::Unretained(this)));
115 msg_loop_.task_runner()->PostTask(
116 FROM_HERE, base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
117 base::Unretained(this)));
118 msg_loop_.RunUntilIdle();
121 // Completion callback for feature extraction.
122 void ExtractionDone(bool success) {
123 success_ = success;
124 msg_loop_.Quit();
127 void QuitExtraction() {
128 extractor_->CancelPendingExtraction();
129 msg_loop_.Quit();
132 base::MessageLoop msg_loop_;
133 MockFeatureExtractorClock clock_;
134 scoped_ptr<PhishingTermFeatureExtractor> extractor_;
135 base::hash_set<std::string> term_hashes_;
136 base::hash_set<uint32> word_hashes_;
137 bool success_; // holds the success value from ExtractFeatures
140 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
141 // This test doesn't exercise the extraction timing.
142 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
144 base::string16 page_text = ASCIIToUTF16("blah");
145 FeatureMap expected_features; // initially empty
146 std::set<uint32> expected_shingle_hashes;
148 FeatureMap features;
149 std::set<uint32> shingle_hashes;
150 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
151 ExpectFeatureMapsAreEqual(features, expected_features);
152 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
154 page_text = ASCIIToUTF16("one one");
155 expected_features.Clear();
156 expected_features.AddBooleanFeature(features::kPageTerm +
157 std::string("one"));
158 expected_features.AddBooleanFeature(features::kPageTerm +
159 std::string("one one"));
160 expected_shingle_hashes.clear();
162 features.Clear();
163 shingle_hashes.clear();
164 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
165 ExpectFeatureMapsAreEqual(features, expected_features);
166 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
168 page_text = ASCIIToUTF16("bla bla multi word test bla");
169 expected_features.Clear();
170 expected_features.AddBooleanFeature(features::kPageTerm +
171 std::string("multi word test"));
172 expected_shingle_hashes.clear();
173 expected_shingle_hashes.insert(MurmurHash3String("bla bla multi word ",
174 kMurmurHash3Seed));
175 expected_shingle_hashes.insert(MurmurHash3String("bla multi word test ",
176 kMurmurHash3Seed));
177 expected_shingle_hashes.insert(MurmurHash3String("multi word test bla ",
178 kMurmurHash3Seed));
180 features.Clear();
181 shingle_hashes.clear();
182 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
183 ExpectFeatureMapsAreEqual(features, expected_features);
184 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
186 // This text has all of the words for one of the terms, but they are
187 // not in the correct order.
188 page_text = ASCIIToUTF16("bla bla test word multi bla");
189 expected_features.Clear();
190 expected_shingle_hashes.clear();
191 expected_shingle_hashes.insert(MurmurHash3String("bla bla test word ",
192 kMurmurHash3Seed));
193 expected_shingle_hashes.insert(MurmurHash3String("bla test word multi ",
194 kMurmurHash3Seed));
195 expected_shingle_hashes.insert(MurmurHash3String("test word multi bla ",
196 kMurmurHash3Seed));
198 features.Clear();
199 shingle_hashes.clear();
200 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
201 ExpectFeatureMapsAreEqual(features, expected_features);
202 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
204 // Test various separators.
205 page_text = ASCIIToUTF16("Capitalization plus non-space\n"
206 "separator... punctuation!");
207 expected_features.Clear();
208 expected_features.AddBooleanFeature(features::kPageTerm +
209 std::string("capitalization"));
210 expected_features.AddBooleanFeature(features::kPageTerm +
211 std::string("space"));
212 expected_features.AddBooleanFeature(features::kPageTerm +
213 std::string("separator"));
214 expected_features.AddBooleanFeature(features::kPageTerm +
215 std::string("punctuation"));
216 expected_shingle_hashes.clear();
217 expected_shingle_hashes.insert(
218 MurmurHash3String("capitalization plus non space ", kMurmurHash3Seed));
219 expected_shingle_hashes.insert(MurmurHash3String("plus non space separator ",
220 kMurmurHash3Seed));
221 expected_shingle_hashes.insert(
222 MurmurHash3String("non space separator punctuation ", kMurmurHash3Seed));
224 features.Clear();
225 shingle_hashes.clear();
226 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
227 ExpectFeatureMapsAreEqual(features, expected_features);
228 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
230 // Test a page with too many words and we should only 3 minimum hashes.
231 page_text = ASCIIToUTF16("This page has way too many words.");
232 expected_features.Clear();
233 expected_shingle_hashes.clear();
234 expected_shingle_hashes.insert(MurmurHash3String("this page has way ",
235 kMurmurHash3Seed));
236 expected_shingle_hashes.insert(MurmurHash3String("page has way too ",
237 kMurmurHash3Seed));
238 expected_shingle_hashes.insert(MurmurHash3String("has way too many ",
239 kMurmurHash3Seed));
240 expected_shingle_hashes.insert(MurmurHash3String("way too many words ",
241 kMurmurHash3Seed));
242 std::set<uint32>::iterator it = expected_shingle_hashes.end();
243 expected_shingle_hashes.erase(--it);
245 features.Clear();
246 shingle_hashes.clear();
247 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
248 ExpectFeatureMapsAreEqual(features, expected_features);
249 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
251 // Test with empty page text.
252 page_text = base::string16();
253 expected_features.Clear();
254 expected_shingle_hashes.clear();
255 features.Clear();
256 shingle_hashes.clear();
257 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
258 ExpectFeatureMapsAreEqual(features, expected_features);
259 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
261 #if !defined(OS_ANDROID)
262 // The test code is disabled due to http://crbug.com/392234
263 // The client-side detection feature is not enabled on Android yet.
264 // If we decided to enable the feature, we need to fix the bug first.
266 // Chinese translation of the phrase "hello goodbye hello goodbye". This tests
267 // that we can correctly separate terms in languages that don't use spaces.
268 page_text =
269 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81"
270 "\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
271 expected_features.Clear();
272 expected_features.AddBooleanFeature(
273 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
274 expected_features.AddBooleanFeature(
275 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
276 expected_shingle_hashes.clear();
277 expected_shingle_hashes.insert(MurmurHash3String(
278 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 "
279 "\xe4\xbd\xa0\xe5\xa5\xbd \xe5\x86\x8d\xe8\xa7\x81 ", kMurmurHash3Seed));
281 features.Clear();
282 shingle_hashes.clear();
283 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
284 ExpectFeatureMapsAreEqual(features, expected_features);
285 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
286 #endif
289 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
290 // For this test, we'll cause the feature extraction to run multiple
291 // iterations by incrementing the clock.
292 ResetExtractor(200 /* max shingles per page */);
294 // This page has a total of 30 words. For the features to be computed
295 // correctly, the extractor has to process the entire string of text.
296 base::string16 page_text(ASCIIToUTF16("one "));
297 for (int i = 0; i < 28; ++i) {
298 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
300 page_text.append(ASCIIToUTF16("two"));
302 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
303 // Note that this assumes kClockCheckGranularity = 5 and
304 // kMaxTimePerChunkMs = 10.
305 base::TimeTicks now = base::TimeTicks::Now();
306 EXPECT_CALL(clock_, Now())
307 // Time check at the start of extraction.
308 .WillOnce(Return(now))
309 // Time check at the start of the first chunk of work.
310 .WillOnce(Return(now))
311 // Time check after the first 5 words.
312 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
313 // Time check after the next 5 words.
314 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
315 // Time check after the next 5 words.
316 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
317 // Time check after the next 5 words. This is over the chunk
318 // time limit, so a continuation task will be posted.
319 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
320 // Time check at the start of the second chunk of work.
321 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
322 // Time check after the next 5 words.
323 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
324 // Time check after the next 5 words.
325 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
326 // A final check for the histograms.
327 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
329 FeatureMap expected_features;
330 expected_features.AddBooleanFeature(features::kPageTerm +
331 std::string("one"));
332 expected_features.AddBooleanFeature(features::kPageTerm +
333 std::string("two"));
334 std::set<uint32> expected_shingle_hashes;
335 expected_shingle_hashes.insert(
336 MurmurHash3String("one 0 1 2 ", kMurmurHash3Seed));
337 expected_shingle_hashes.insert(
338 MurmurHash3String("0 1 2 3 ", kMurmurHash3Seed));
339 expected_shingle_hashes.insert(
340 MurmurHash3String("1 2 3 4 ", kMurmurHash3Seed));
341 expected_shingle_hashes.insert(
342 MurmurHash3String("2 3 4 5 ", kMurmurHash3Seed));
343 expected_shingle_hashes.insert(
344 MurmurHash3String("3 4 5 6 ", kMurmurHash3Seed));
345 expected_shingle_hashes.insert(
346 MurmurHash3String("4 5 6 7 ", kMurmurHash3Seed));
347 expected_shingle_hashes.insert(
348 MurmurHash3String("5 6 7 8 ", kMurmurHash3Seed));
349 expected_shingle_hashes.insert(
350 MurmurHash3String("6 7 8 9 ", kMurmurHash3Seed));
351 expected_shingle_hashes.insert(
352 MurmurHash3String("7 8 9 10 ", kMurmurHash3Seed));
353 expected_shingle_hashes.insert(
354 MurmurHash3String("8 9 10 11 ", kMurmurHash3Seed));
355 expected_shingle_hashes.insert(
356 MurmurHash3String("9 10 11 12 ", kMurmurHash3Seed));
357 expected_shingle_hashes.insert(
358 MurmurHash3String("10 11 12 13 ", kMurmurHash3Seed));
359 expected_shingle_hashes.insert(
360 MurmurHash3String("11 12 13 14 ", kMurmurHash3Seed));
361 expected_shingle_hashes.insert(
362 MurmurHash3String("12 13 14 15 ", kMurmurHash3Seed));
363 expected_shingle_hashes.insert(
364 MurmurHash3String("13 14 15 16 ", kMurmurHash3Seed));
365 expected_shingle_hashes.insert(
366 MurmurHash3String("14 15 16 17 ", kMurmurHash3Seed));
367 expected_shingle_hashes.insert(
368 MurmurHash3String("15 16 17 18 ", kMurmurHash3Seed));
369 expected_shingle_hashes.insert(
370 MurmurHash3String("16 17 18 19 ", kMurmurHash3Seed));
371 expected_shingle_hashes.insert(
372 MurmurHash3String("17 18 19 20 ", kMurmurHash3Seed));
373 expected_shingle_hashes.insert(
374 MurmurHash3String("18 19 20 21 ", kMurmurHash3Seed));
375 expected_shingle_hashes.insert(
376 MurmurHash3String("19 20 21 22 ", kMurmurHash3Seed));
377 expected_shingle_hashes.insert(
378 MurmurHash3String("20 21 22 23 ", kMurmurHash3Seed));
379 expected_shingle_hashes.insert(
380 MurmurHash3String("21 22 23 24 ", kMurmurHash3Seed));
381 expected_shingle_hashes.insert(
382 MurmurHash3String("22 23 24 25 ", kMurmurHash3Seed));
383 expected_shingle_hashes.insert(
384 MurmurHash3String("23 24 25 26 ", kMurmurHash3Seed));
385 expected_shingle_hashes.insert(
386 MurmurHash3String("24 25 26 27 ", kMurmurHash3Seed));
387 expected_shingle_hashes.insert(
388 MurmurHash3String("25 26 27 two ", kMurmurHash3Seed));
390 FeatureMap features;
391 std::set<uint32> shingle_hashes;
392 ASSERT_TRUE(ExtractFeatures(&page_text, &features, &shingle_hashes));
393 ExpectFeatureMapsAreEqual(features, expected_features);
394 EXPECT_THAT(expected_shingle_hashes, testing::ContainerEq(shingle_hashes));
395 // Make sure none of the mock expectations carry over to the next test.
396 ::testing::Mock::VerifyAndClearExpectations(&clock_);
398 // Now repeat the test with the same text, but advance the clock faster so
399 // that the extraction time exceeds the maximum total time for the feature
400 // extractor. Extraction should fail. Note that this assumes
401 // kMaxTotalTimeMs = 500.
402 EXPECT_CALL(clock_, Now())
403 // Time check at the start of extraction.
404 .WillOnce(Return(now))
405 // Time check at the start of the first chunk of work.
406 .WillOnce(Return(now))
407 // Time check after the first 5 words,
408 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
409 // Time check at the start of the second chunk of work.
410 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
411 // Time check after the next 5 words. This is over the limit.
412 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
413 // A final time check for the histograms.
414 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
416 features.Clear();
417 shingle_hashes.clear();
418 EXPECT_FALSE(ExtractFeatures(&page_text, &features, &shingle_hashes));
421 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
422 scoped_ptr<base::string16> page_text(
423 new base::string16(ASCIIToUTF16("one ")));
424 for (int i = 0; i < 28; ++i) {
425 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
428 base::TimeTicks now = base::TimeTicks::Now();
429 EXPECT_CALL(clock_, Now())
430 // Time check at the start of extraction.
431 .WillOnce(Return(now))
432 // Time check at the start of the first chunk of work.
433 .WillOnce(Return(now))
434 // Time check after the first 5 words.
435 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
436 // Time check after the next 5 words. This should be greater than
437 // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
438 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
440 FeatureMap features;
441 std::set<uint32> shingle_hashes;
442 // Extract first 10 words then stop.
443 PartialExtractFeatures(page_text.get(), &features, &shingle_hashes);
445 page_text.reset(new base::string16());
446 for (int i = 30; i < 58; ++i) {
447 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
449 page_text->append(ASCIIToUTF16("multi word test "));
450 features.Clear();
451 shingle_hashes.clear();
453 // This part doesn't exercise the extraction timing.
454 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
456 // Now extract normally and make sure nothing breaks.
457 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features, &shingle_hashes));
459 FeatureMap expected_features;
460 expected_features.AddBooleanFeature(features::kPageTerm +
461 std::string("multi word test"));
462 ExpectFeatureMapsAreEqual(features, expected_features);
465 } // namespace safe_browsing