Version 7 golden file for safe-browsing test.
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / phishing_url_feature_extractor.cc
blobe35363638e3126c57345db537db04c4c7c650241
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
7 #include <algorithm>
8 #include <string>
9 #include <vector>
11 #include "base/logging.h"
12 #include "base/metrics/histogram.h"
13 #include "base/strings/string_split.h"
14 #include "base/strings/string_util.h"
15 #include "base/timer/elapsed_timer.h"
16 #include "chrome/renderer/safe_browsing/features.h"
17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
18 #include "url/gurl.h"
20 namespace safe_browsing {
22 PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {}
24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {}
26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL& url,
27 FeatureMap* features) {
28 base::ElapsedTimer timer;
29 if (url.HostIsIPAddress()) {
30 if (!features->AddBooleanFeature(features::kUrlHostIsIpAddress))
31 return false;
32 } else {
33 // Remove any leading/trailing dots.
34 std::string host;
35 base::TrimString(url.host(), ".", &host);
37 // TODO(bryner): Ensure that the url encoding is consistent with
38 // the features in the model.
40 // Disallow unknown registries so that we don't classify
41 // partial hostnames (e.g. "www.subdomain").
42 size_t registry_length =
43 net::registry_controlled_domains::GetRegistryLength(
44 host,
45 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
46 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
48 if (registry_length == 0 || registry_length == std::string::npos) {
49 DVLOG(1) << "Could not find TLD for host: " << host;
50 return false;
52 DCHECK_LT(registry_length, host.size()) << "Non-zero registry length, but "
53 "host is only a TLD: " << host;
54 size_t tld_start = host.size() - registry_length;
55 if (!features->AddBooleanFeature(features::kUrlTldToken +
56 host.substr(tld_start)))
57 return false;
59 // Pull off the TLD and the preceeding dot.
60 host.erase(tld_start - 1);
61 std::vector<std::string> host_tokens;
62 base::SplitStringDontTrim(host, '.', &host_tokens);
63 // Get rid of any empty components.
64 std::vector<std::string>::iterator new_end =
65 std::remove(host_tokens.begin(), host_tokens.end(), "");
66 host_tokens.erase(new_end, host_tokens.end());
67 if (host_tokens.empty()) {
68 DVLOG(1) << "Could not find domain for host: " << host;
69 return false;
71 if (!features->AddBooleanFeature(features::kUrlDomainToken +
72 host_tokens.back()))
73 return false;
74 host_tokens.pop_back();
76 // Now we're just left with the "other" host tokens.
77 for (std::vector<std::string>::iterator it = host_tokens.begin();
78 it != host_tokens.end(); ++it) {
79 if (!features->AddBooleanFeature(features::kUrlOtherHostToken + *it))
80 return false;
83 if (host_tokens.size() > 1) {
84 if (!features->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne))
85 return false;
86 if (host_tokens.size() > 3) {
87 if (!features->AddBooleanFeature(
88 features::kUrlNumOtherHostTokensGTThree))
89 return false;
94 std::vector<std::string> long_tokens;
95 SplitStringIntoLongAlphanumTokens(url.path(), &long_tokens);
96 for (std::vector<std::string>::iterator it = long_tokens.begin();
97 it != long_tokens.end(); ++it) {
98 if (!features->AddBooleanFeature(features::kUrlPathToken + *it))
99 return false;
102 UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer.Elapsed());
103 return true;
106 // static
107 void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(
108 const std::string& full,
109 std::vector<std::string>* tokens) {
110 // Split on common non-alphanumerics.
111 // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly.
112 static const char kTokenSeparators[] = ".,\\/_-|=%:!&";
113 std::vector<std::string> raw_splits;
114 Tokenize(full, kTokenSeparators, &raw_splits);
116 // Copy over only the splits that are 3 or more chars long.
117 // TODO(bryner): Determine a meaningful min size.
118 for (std::vector<std::string>::iterator it = raw_splits.begin();
119 it != raw_splits.end(); ++it) {
120 if (it->length() >= kMinPathComponentLength)
121 tokens->push_back(*it);
125 } // namespace safe_browsing