// Scraped from a gitweb blob view; page header retained for provenance:
//   Added documentation to web_view.js/web_view_experimental.js regarding the webview...
//   [chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_term_feature_extractor_unittest.cc
//   blob b8627de34d3a756c2a9ac2707a3e7bb340d9aea6
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
7 #include <string>
9 #include "base/bind.h"
10 #include "base/callback.h"
11 #include "base/containers/hash_tables.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/message_loop/message_loop.h"
14 #include "base/strings/string16.h"
15 #include "base/strings/stringprintf.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/time/time.h"
18 #include "chrome/renderer/safe_browsing/features.h"
19 #include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
20 #include "chrome/renderer/safe_browsing/murmurhash3_util.h"
21 #include "chrome/renderer/safe_browsing/test_utils.h"
22 #include "crypto/sha2.h"
23 #include "testing/gmock/include/gmock/gmock.h"
24 #include "testing/gtest/include/gtest/gtest.h"
26 using base::ASCIIToUTF16;
27 using ::testing::Return;
29 namespace safe_browsing {
31 class PhishingTermFeatureExtractorTest : public ::testing::Test {
32 protected:
33 virtual void SetUp() {
34 base::hash_set<std::string> terms;
35 terms.insert("one");
36 terms.insert("one one");
37 terms.insert("two");
38 terms.insert("multi word test");
39 terms.insert("capitalization");
40 terms.insert("space");
41 terms.insert("separator");
42 terms.insert("punctuation");
43 // Chinese (translation of "hello")
44 terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
45 // Chinese (translation of "goodbye")
46 terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
48 for (base::hash_set<std::string>::iterator it = terms.begin();
49 it != terms.end(); ++it) {
50 term_hashes_.insert(crypto::SHA256HashString(*it));
53 base::hash_set<std::string> words;
54 words.insert("one");
55 words.insert("two");
56 words.insert("multi");
57 words.insert("word");
58 words.insert("test");
59 words.insert("capitalization");
60 words.insert("space");
61 words.insert("separator");
62 words.insert("punctuation");
63 words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
64 words.insert("\xe5\x86\x8d\xe8\xa7\x81");
66 static const uint32 kMurmurHash3Seed = 2777808611U;
67 for (base::hash_set<std::string>::iterator it = words.begin();
68 it != words.end(); ++it) {
69 word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
72 extractor_.reset(new PhishingTermFeatureExtractor(
73 &term_hashes_,
74 &word_hashes_,
75 3 /* max_words_per_term */,
76 kMurmurHash3Seed,
77 &clock_));
80 // Runs the TermFeatureExtractor on |page_text|, waiting for the
81 // completion callback. Returns the success boolean from the callback.
82 bool ExtractFeatures(const base::string16* page_text, FeatureMap* features) {
83 success_ = false;
84 extractor_->ExtractFeatures(
85 page_text,
86 features,
87 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
88 base::Unretained(this)));
89 msg_loop_.Run();
90 return success_;
93 void PartialExtractFeatures(const base::string16* page_text,
94 FeatureMap* features) {
95 extractor_->ExtractFeatures(
96 page_text,
97 features,
98 base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
99 base::Unretained(this)));
100 msg_loop_.PostTask(
101 FROM_HERE,
102 base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
103 base::Unretained(this)));
104 msg_loop_.RunUntilIdle();
107 // Completion callback for feature extraction.
108 void ExtractionDone(bool success) {
109 success_ = success;
110 msg_loop_.Quit();
113 void QuitExtraction() {
114 extractor_->CancelPendingExtraction();
115 msg_loop_.Quit();
118 base::MessageLoop msg_loop_;
119 MockFeatureExtractorClock clock_;
120 scoped_ptr<PhishingTermFeatureExtractor> extractor_;
121 base::hash_set<std::string> term_hashes_;
122 base::hash_set<uint32> word_hashes_;
123 bool success_; // holds the success value from ExtractFeatures
126 TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
127 // This test doesn't exercise the extraction timing.
128 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
130 base::string16 page_text = ASCIIToUTF16("blah");
131 FeatureMap expected_features; // initially empty
133 FeatureMap features;
134 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
135 ExpectFeatureMapsAreEqual(features, expected_features);
137 page_text = ASCIIToUTF16("one one");
138 expected_features.Clear();
139 expected_features.AddBooleanFeature(features::kPageTerm +
140 std::string("one"));
141 expected_features.AddBooleanFeature(features::kPageTerm +
142 std::string("one one"));
144 features.Clear();
145 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
146 ExpectFeatureMapsAreEqual(features, expected_features);
148 page_text = ASCIIToUTF16("bla bla multi word test bla");
149 expected_features.Clear();
150 expected_features.AddBooleanFeature(features::kPageTerm +
151 std::string("multi word test"));
153 features.Clear();
154 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
155 ExpectFeatureMapsAreEqual(features, expected_features);
157 // This text has all of the words for one of the terms, but they are
158 // not in the correct order.
159 page_text = ASCIIToUTF16("bla bla test word multi bla");
160 expected_features.Clear();
162 features.Clear();
163 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
164 ExpectFeatureMapsAreEqual(features, expected_features);
166 page_text = ASCIIToUTF16("Capitalization plus non-space\n"
167 "separator... punctuation!");
168 expected_features.Clear();
169 expected_features.AddBooleanFeature(features::kPageTerm +
170 std::string("capitalization"));
171 expected_features.AddBooleanFeature(features::kPageTerm +
172 std::string("space"));
173 expected_features.AddBooleanFeature(features::kPageTerm +
174 std::string("separator"));
175 expected_features.AddBooleanFeature(features::kPageTerm +
176 std::string("punctuation"));
178 features.Clear();
179 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
180 ExpectFeatureMapsAreEqual(features, expected_features);
182 // Test with empty page text.
183 page_text = base::string16();
184 expected_features.Clear();
185 features.Clear();
186 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
187 ExpectFeatureMapsAreEqual(features, expected_features);
189 // Chinese translation of the phrase "hello goodbye". This tests that
190 // we can correctly separate terms in languages that don't use spaces.
191 page_text =
192 base::UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
193 expected_features.Clear();
194 expected_features.AddBooleanFeature(
195 features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
196 expected_features.AddBooleanFeature(
197 features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
199 features.Clear();
200 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
201 ExpectFeatureMapsAreEqual(features, expected_features);
204 TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
205 // For this test, we'll cause the feature extraction to run multiple
206 // iterations by incrementing the clock.
208 // This page has a total of 30 words. For the features to be computed
209 // correctly, the extractor has to process the entire string of text.
210 base::string16 page_text(ASCIIToUTF16("one "));
211 for (int i = 0; i < 28; ++i) {
212 page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
214 page_text.append(ASCIIToUTF16("two"));
216 // Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
217 // Note that this assumes kClockCheckGranularity = 5 and
218 // kMaxTimePerChunkMs = 10.
219 base::TimeTicks now = base::TimeTicks::Now();
220 EXPECT_CALL(clock_, Now())
221 // Time check at the start of extraction.
222 .WillOnce(Return(now))
223 // Time check at the start of the first chunk of work.
224 .WillOnce(Return(now))
225 // Time check after the first 5 words.
226 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
227 // Time check after the next 5 words.
228 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
229 // Time check after the next 5 words.
230 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
231 // Time check after the next 5 words. This is over the chunk
232 // time limit, so a continuation task will be posted.
233 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
234 // Time check at the start of the second chunk of work.
235 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
236 // Time check after the next 5 words.
237 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
238 // Time check after the next 5 words.
239 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
240 // A final check for the histograms.
241 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
243 FeatureMap expected_features;
244 expected_features.AddBooleanFeature(features::kPageTerm +
245 std::string("one"));
246 expected_features.AddBooleanFeature(features::kPageTerm +
247 std::string("two"));
249 FeatureMap features;
250 ASSERT_TRUE(ExtractFeatures(&page_text, &features));
251 ExpectFeatureMapsAreEqual(features, expected_features);
252 // Make sure none of the mock expectations carry over to the next test.
253 ::testing::Mock::VerifyAndClearExpectations(&clock_);
255 // Now repeat the test with the same text, but advance the clock faster so
256 // that the extraction time exceeds the maximum total time for the feature
257 // extractor. Extraction should fail. Note that this assumes
258 // kMaxTotalTimeMs = 500.
259 EXPECT_CALL(clock_, Now())
260 // Time check at the start of extraction.
261 .WillOnce(Return(now))
262 // Time check at the start of the first chunk of work.
263 .WillOnce(Return(now))
264 // Time check after the first 5 words,
265 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
266 // Time check at the start of the second chunk of work.
267 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
268 // Time check after the next 5 words. This is over the limit.
269 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
270 // A final time check for the histograms.
271 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
273 features.Clear();
274 EXPECT_FALSE(ExtractFeatures(&page_text, &features));
277 TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
278 scoped_ptr<base::string16> page_text(
279 new base::string16(ASCIIToUTF16("one ")));
280 for (int i = 0; i < 28; ++i) {
281 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
284 base::TimeTicks now = base::TimeTicks::Now();
285 EXPECT_CALL(clock_, Now())
286 // Time check at the start of extraction.
287 .WillOnce(Return(now))
288 // Time check at the start of the first chunk of work.
289 .WillOnce(Return(now))
290 // Time check after the first 5 words.
291 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
292 // Time check after the next 5 words. This should be greater than
293 // kMaxTimePerChunkMs so that we stop and schedule extraction for later.
294 .WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
296 FeatureMap features;
297 // Extract first 10 words then stop.
298 PartialExtractFeatures(page_text.get(), &features);
300 page_text.reset(new base::string16());
301 for (int i = 30; i < 58; ++i) {
302 page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
304 page_text->append(ASCIIToUTF16("multi word test "));
305 features.Clear();
307 // This part doesn't exercise the extraction timing.
308 EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
310 // Now extract normally and make sure nothing breaks.
311 EXPECT_TRUE(ExtractFeatures(page_text.get(), &features));
313 FeatureMap expected_features;
314 expected_features.AddBooleanFeature(features::kPageTerm +
315 std::string("multi word test"));
316 ExpectFeatureMapsAreEqual(features, expected_features);
319 } // namespace safe_browsing