1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
11 #include "base/logging.h"
12 #include "base/metrics/histogram.h"
13 #include "base/strings/string_split.h"
14 #include "base/strings/string_util.h"
15 #include "base/timer/elapsed_timer.h"
16 #include "chrome/renderer/safe_browsing/features.h"
17 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
20 namespace safe_browsing
{
22 PhishingUrlFeatureExtractor::PhishingUrlFeatureExtractor() {}
24 PhishingUrlFeatureExtractor::~PhishingUrlFeatureExtractor() {}
// Adds boolean features describing |url| to |features|.
//
// For a host that is an IP address, a single kUrlHostIsIpAddress feature is
// added. Otherwise the host is trimmed of leading/trailing dots and broken
// into a TLD token, a domain token and zero or more "other" host tokens, each
// added as its own feature, plus two features flagging hosts with more than
// one / more than three "other" tokens. In both cases, one kUrlPathToken
// feature is then added per long token found in the URL path, and the elapsed
// time is reported to the "SBClientPhishing.URLFeatureTime" histogram.
//
// NOTE(review): this excerpt is missing several statements: the declaration
// of |host|, the statements guarded by each failed-AddBooleanFeature check,
// the bodies of the two DVLOG branches, and the function's return statements
// and closing braces. Presumably the failure paths return false and the
// function returns true on success -- confirm against the complete file.
26 bool PhishingUrlFeatureExtractor::ExtractFeatures(const GURL
& url
,
27 FeatureMap
* features
) {
// Timer whose Elapsed() value is recorded in the histogram at the end of the
// function.
28 base::ElapsedTimer timer
;
29 if (url
.HostIsIPAddress()) {
30 if (!features
->AddBooleanFeature(features::kUrlHostIsIpAddress
))
// NOTE(review): the statement guarded by the check above and the transition
// into the non-IP branch are not visible here; |host| is used below without a
// visible declaration.
33 // Remove any leading/trailing dots.
35 base::TrimString(url
.host(), ".", &host
);
37 // TODO(bryner): Ensure that the url encoding is consistent with
38 // the features in the model.
40 // Disallow unknown registries so that we don't classify
41 // partial hostnames (e.g. "www.subdomain").
// NOTE(review): the first argument of GetRegistryLength() is not visible in
// this excerpt -- presumably |host|; confirm.
42 size_t registry_length
=
43 net::registry_controlled_domains::GetRegistryLength(
45 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES
,
46 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES
);
// Both 0 and std::string::npos are treated as "no TLD found"; the condition is
// logged at verbose level 1.
48 if (registry_length
== 0 || registry_length
== std::string::npos
) {
49 DVLOG(1) << "Could not find TLD for host: " << host
;
// Debug-only sanity check: the registry must be strictly shorter than the
// host, otherwise the host would consist of nothing but a TLD.
52 DCHECK_LT(registry_length
, host
.size()) << "Non-zero registry length, but "
53 "host is only a TLD: " << host
;
// |registry_length| is measured back from the end of |host|, so this is the
// index of the first character of the TLD.
54 size_t tld_start
= host
.size() - registry_length
;
// The TLD feature name is the kUrlTldToken prefix followed by the TLD text.
55 if (!features
->AddBooleanFeature(features::kUrlTldToken
+
56 host
.substr(tld_start
)))
59 // Pull off the TLD and the preceding dot.
60 host
.erase(tld_start
- 1);
// Split what is left of the host on '.', dropping empty pieces.
61 std::vector
<std::string
> host_tokens
= base::SplitString(
62 host
, ".", base::KEEP_WHITESPACE
, base::SPLIT_WANT_NONEMPTY
);
// No tokens left means the host had nothing in front of the TLD.
63 if (host_tokens
.empty()) {
64 DVLOG(1) << "Could not find domain for host: " << host
;
// NOTE(review): the right-hand operand of "kUrlDomainToken +" is not visible
// in this excerpt. Since the last token is popped immediately afterwards, it
// is presumably the last element of |host_tokens| -- confirm.
67 if (!features
->AddBooleanFeature(features::kUrlDomainToken
+
70 host_tokens
.pop_back();
72 // Now we're just left with the "other" host tokens.
73 for (std::vector
<std::string
>::iterator it
= host_tokens
.begin();
74 it
!= host_tokens
.end(); ++it
) {
75 if (!features
->AddBooleanFeature(features::kUrlOtherHostToken
+ *it
))
// The counts below exclude the TLD and the popped domain token, i.e. they
// count only the "other" host tokens.
79 if (host_tokens
.size() > 1) {
80 if (!features
->AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne
))
82 if (host_tokens
.size() > 3) {
83 if (!features
->AddBooleanFeature(
84 features::kUrlNumOtherHostTokensGTThree
))
// Path features: one kUrlPathToken feature per token that
// SplitStringIntoLongAlphanumTokens() extracts from the URL path.
90 std::vector
<std::string
> long_tokens
;
91 SplitStringIntoLongAlphanumTokens(url
.path(), &long_tokens
);
92 for (const std::string
& token
: long_tokens
) {
93 if (!features
->AddBooleanFeature(features::kUrlPathToken
+ token
))
// Report how long the extraction took.
97 UMA_HISTOGRAM_TIMES("SBClientPhishing.URLFeatureTime", timer
.Elapsed());
102 void PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(
103 const std::string
& full
,
104 std::vector
<std::string
>* tokens
) {
105 // Split on common non-alphanumerics.
106 // TODO(bryner): Split on all(?) non-alphanumerics and handle %XX properly.
107 static const char kTokenSeparators
[] = ".,\\/_-|=%:!&";
108 for (const base::StringPiece
& token
:
109 base::SplitStringPiece(full
, kTokenSeparators
, base::KEEP_WHITESPACE
,
110 base::SPLIT_WANT_NONEMPTY
)) {
111 // Copy over only the splits that are 3 or more chars long.
112 // TODO(bryner): Determine a meaningful min size.
113 if (token
.length() >= kMinPathComponentLength
)
114 tokens
->push_back(token
.as_string());
118 } // namespace safe_browsing