1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
9 #include "chrome/renderer/safe_browsing/features.h"
10 #include "chrome/renderer/safe_browsing/test_utils.h"
11 #include "testing/gmock/include/gmock/gmock.h"
12 #include "testing/gtest/include/gtest/gtest.h"
15 using ::testing::ElementsAre
;
17 namespace safe_browsing
{
// Test fixture for PhishingUrlFeatureExtractor, derived from ::testing::Test.
// NOTE(review): in this view the original line numbers are fused into the
// text and some lines look absent (no access specifier, no end of the helper
// body and no closing brace of the class are visible) -- confirm against the
// pristine file before editing.
19 class PhishingUrlFeatureExtractorTest
: public ::testing::Test
{
// Extractor instance that the tests call ExtractFeatures() on.
21 PhishingUrlFeatureExtractor extractor_
;
// Fixture-level wrapper: hands |full| (and, presumably, |tokens| -- the rest
// of the argument list is not visible here) to
// PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens so a test
// body can call it unqualified.
23 void SplitStringIntoLongAlphanumTokens(const std::string
& full
,
24 std::vector
<std::string
>* tokens
) {
25 PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full
,
// Checks extractor_.ExtractFeatures() against a series of URLs. For each URL
// in the first group the test builds the expected FeatureMap by hand, asserts
// that extraction succeeds, and compares the result with
// ExpectFeatureMapsAreEqual(). The second group lists URLs for which
// extraction is expected to return false.
// NOTE(review): several `AddBooleanFeature(... +` calls below have no visible
// right-hand operand, and neither the declaration of |features| nor the
// closing brace of this test is visible in this view -- lines appear to be
// missing; confirm against the pristine file.
30 TEST_F(PhishingUrlFeatureExtractorTest
, ExtractFeatures
) {
// Case 1: the host is a dotted-quad IP address.
31 std::string url
= "http://123.0.0.1/mydocuments/a.file.html";
32 FeatureMap expected_features
;
33 expected_features
.AddBooleanFeature(features::kUrlHostIsIpAddress
);
34 expected_features
.AddBooleanFeature(features::kUrlPathToken
+
35 std::string("mydocuments"));
// Two further path-token features are expected; their token strings are not
// visible in this view.
36 expected_features
.AddBooleanFeature(features::kUrlPathToken
+
38 expected_features
.AddBooleanFeature(features::kUrlPathToken
+
// ASSERT (not EXPECT): a failed extraction aborts the test here.
42 ASSERT_TRUE(extractor_
.ExtractFeatures(GURL(url
), &features
));
43 ExpectFeatureMapsAreEqual(features
, expected_features
);
// Case 2: multi-label host under a two-part TLD, with a repeated path
// component and a query string. None of the expected features below mentions
// the query text.
45 url
= "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear";
// Reuse the expected map: drop the previous case's entries first.
46 expected_features
.Clear();
47 expected_features
.AddBooleanFeature(features::kUrlTldToken
+
48 std::string("co.uk"));
// Domain and other-host token values are not visible in this view.
49 expected_features
.AddBooleanFeature(features::kUrlDomainToken
+
51 expected_features
.AddBooleanFeature(features::kUrlOtherHostToken
+
53 expected_features
.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne
);
// "sports" occurs twice in the path but is added to the expectation once.
54 expected_features
.AddBooleanFeature(features::kUrlPathToken
+
55 std::string("sports"));
56 expected_features
.AddBooleanFeature(features::kUrlPathToken
+
57 std::string("index"));
58 expected_features
.AddBooleanFeature(features::kUrlPathToken
+
62 ASSERT_TRUE(extractor_
.ExtractFeatures(GURL(url
), &features
));
63 ExpectFeatureMapsAreEqual(features
, expected_features
);
// Case 3: bare domain with an empty path -- only TLD and domain features are
// expected.
65 url
= "http://justadomain.com/";
66 expected_features
.Clear();
67 expected_features
.AddBooleanFeature(features::kUrlTldToken
+
69 expected_features
.AddBooleanFeature(features::kUrlDomainToken
+
70 std::string("justadomain"));
73 ASSERT_TRUE(extractor_
.ExtractFeatures(GURL(url
), &features
));
74 ExpectFeatureMapsAreEqual(features
, expected_features
);
// Case 4: URL with a fragment ("#abc"); the visible expectations contain only
// TLD and domain features.
76 url
= "http://witharef.com/#abc";
77 expected_features
.Clear();
78 expected_features
.AddBooleanFeature(features::kUrlTldToken
+
80 expected_features
.AddBooleanFeature(features::kUrlDomainToken
+
81 std::string("witharef"));
84 ASSERT_TRUE(extractor_
.ExtractFeatures(GURL(url
), &features
));
85 ExpectFeatureMapsAreEqual(features
, expected_features
);
// Case 5: host with leading, repeated and trailing dots; extraction is still
// expected to succeed and yield TLD, domain and other-host features.
87 url
= "http://...www..lotsodots....com./";
88 expected_features
.Clear();
89 expected_features
.AddBooleanFeature(features::kUrlTldToken
+
91 expected_features
.AddBooleanFeature(features::kUrlDomainToken
+
92 std::string("lotsodots"));
93 expected_features
.AddBooleanFeature(features::kUrlOtherHostToken
+
97 ASSERT_TRUE(extractor_
.ExtractFeatures(GURL(url
), &features
));
98 ExpectFeatureMapsAreEqual(features
, expected_features
);
// Failure cases: ExtractFeatures() is expected to return false for each of
// the URLs below. EXPECT_ is used so every case runs even if one fails.
// Host whose last label is "tld".
100 url
= "http://unrecognized.tld/";
101 EXPECT_FALSE(extractor_
.ExtractFeatures(GURL(url
), &features
));
// Host is just "com".
103 url
= "http://com/123";
104 EXPECT_FALSE(extractor_
.ExtractFeatures(GURL(url
), &features
));
// Host is ".co.uk" with nothing before the leading dot.
106 url
= "http://.co.uk/";
107 EXPECT_FALSE(extractor_
.ExtractFeatures(GURL(url
), &features
));
// file: URL with no host component.
109 url
= "file:///nohost.txt";
110 EXPECT_FALSE(extractor_
.ExtractFeatures(GURL(url
), &features
));
// String that is not a usable URL.
112 url
= "not:valid:at:all";
113 EXPECT_FALSE(extractor_
.ExtractFeatures(GURL(url
), &features
));
// Exercises the fixture's SplitStringIntoLongAlphanumTokens() helper on two
// inputs and checks the resulting token list with the ElementsAre matcher.
// NOTE(review): the closing brace of this test and any statements between the
// two cases are not visible in this view -- lines appear to be missing.
116 TEST_F(PhishingUrlFeatureExtractorTest
, SplitStringIntoLongAlphanumTokens
) {
// Mixed separators ('.', '/', '_', '\\', '-', '!', ','). The short pieces
// "is" and "a" are absent from the expected list; the longer pieces are kept
// in their original order and case.
117 std::string full
= "This.is/a_pretty\\unusual-!path,indeed";
118 std::vector
<std::string
> long_tokens
;
119 SplitStringIntoLongAlphanumTokens(full
, &long_tokens
);
120 EXPECT_THAT(long_tokens
,
121 ElementsAre("This", "pretty", "unusual", "path", "indeed"));
// Every alphanumeric run in this input is at most two characters long, and
// the expected output is empty.
// NOTE(review): no reset of |long_tokens| between the two calls is visible
// here; confirm whether the test clears it or the callee does.
124 full
= "...i-am_re/al&ly\\b,r,o|k=e:n///up%20";
125 SplitStringIntoLongAlphanumTokens(full
, &long_tokens
);
126 EXPECT_THAT(long_tokens
, ElementsAre());
129 } // namespace safe_browsing