1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Common types and constants for extracting and evaluating features in the
6 // client-side phishing detection model. A feature is simply a string and an
7 // associated floating-point value between 0 and 1. The phishing
8 // classification model contains rules which give an appropriate weight to each
9 // feature or combination of features. These values can then be summed to
10 // compute a final phishiness score.
12 // Some features are boolean features. If these features are set, they always
13 // have a value of 0.0 or 1.0. In practice, the features are only set if the
14 // value is true (1.0).
16 // We also use token features. These features have a unique name that is
17 // constructed from the URL or page contents that we are classifying, for
18 // example, "UrlDomain=chromium". These features are also always set to 1.0
19 // if they are present.
21 // The intermediate storage of the features for a URL is a FeatureMap, which is
22 // just a thin wrapper around a map of feature name to value. The entire set
23 // of features for a URL is extracted before we do any scoring.
25 #ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
26 #define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
#include <string>

#include "base/basictypes.h"
#include "base/containers/hash_tables.h"
32 namespace safe_browsing
{
34 // Container for a map of features to values, which enforces behavior
35 // such as a maximum number of features in the map.
41 // Adds a boolean feature to a FeatureMap with a value of 1.0.
42 // Returns true on success, or false if the feature map exceeds
43 // kMaxFeatureMapSize.
44 bool AddBooleanFeature(const std::string
& name
);
46 // Adds a real-valued feature to a FeatureMap with the given value.
47 // Values must always be in the range [0.0, 1.0]. Returns true on
48 // success, or false if the feature map exceeds kMaxFeatureMapSize
49 // or the value is outside of the allowed range.
50 bool AddRealFeature(const std::string
& name
, double value
);
52 // Provides read-only access to the current set of features.
53 const base::hash_map
<std::string
, double>& features() const {
57 // Clears the set of features in the map.
60 // This is an upper bound on the number of features that will be extracted.
61 // We should never hit this cap; it is intended as a sanity check to prevent
62 // the FeatureMap from growing too large.
63 static const size_t kMaxFeatureMapSize
;
66 base::hash_map
<std::string
, double> features_
;
68 DISALLOW_COPY_AND_ASSIGN(FeatureMap
);
namespace features {
// Constants for the various feature names that we use.  Every constant in
// this namespace is only declared here (extern, no initializer); the string
// values are supplied by out-of-line definitions.
//
// IMPORTANT: when adding new features, you must update kAllowedFeatures in
// chrome/browser/safe_browsing/client_side_detection_service.cc if the feature
// should be sent in sanitized pingbacks.

////////////////////////////////////////////////////
// URL host features
////////////////////////////////////////////////////

// Set if the URL's hostname is an IP address.
extern const char kUrlHostIsIpAddress[];
// Token feature containing the portion of the hostname controlled by a
// registrar, for example "com" or "co.uk".
extern const char kUrlTldToken[];
// Token feature containing the first host component below the registrar.
// For example, in "www.google.com", the domain would be "google".
extern const char kUrlDomainToken[];
// Token feature containing each host component below the domain.
// For example, in "www.host.example.com", both "www" and "host" would be
// "other host tokens".
extern const char kUrlOtherHostToken[];

////////////////////////////////////////////////////
// Aggregate features for URL host tokens
////////////////////////////////////////////////////

// Set if the number of "other" host tokens for a URL is greater than one.
// Longer hostnames, regardless of the specific tokens, can be a signal that
// the URL is phishy.
extern const char kUrlNumOtherHostTokensGTOne[];
// Set if the number of "other" host tokens for a URL is greater than three.
extern const char kUrlNumOtherHostTokensGTThree[];

////////////////////////////////////////////////////
// URL path token features
////////////////////////////////////////////////////

// Token feature containing each alphanumeric string in the path that is at
// least 3 characters long.  For example, "/abc/d/efg" would have 2 path
// token features, "abc" and "efg".  Query parameters are not included.
extern const char kUrlPathToken[];

////////////////////////////////////////////////////
// DOM HTML form features
////////////////////////////////////////////////////

// Set if the page has any <form> elements.
extern const char kPageHasForms[];
// The fraction of form elements whose |action| attribute points to a
// URL on a different domain from the document URL.
extern const char kPageActionOtherDomainFreq[];
// Token feature containing each URL that an |action| attribute
// points to.
extern const char kPageActionURL[];
// Set if the page has any <input type="text"> elements
// (includes inputs with missing or unknown types).
extern const char kPageHasTextInputs[];
// Set if the page has any <input type="password"> elements.
extern const char kPageHasPswdInputs[];
// Set if the page has any <input type="radio"> elements.
extern const char kPageHasRadioInputs[];
// Set if the page has any <input type="checkbox"> elements.
extern const char kPageHasCheckInputs[];

////////////////////////////////////////////////////
// DOM HTML link features
////////////////////////////////////////////////////

// The fraction of links in the page which point to a domain other than the
// domain of the document.  See "URL host features" above for a discussion
// of how the domain is computed.
extern const char kPageExternalLinksFreq[];
// Token feature containing each external domain that is linked to.
extern const char kPageLinkDomain[];
// Fraction of links in the page that use https.
extern const char kPageSecureLinksFreq[];

////////////////////////////////////////////////////
// DOM HTML script features
////////////////////////////////////////////////////

// Set if the number of <script> elements in the page is greater than 1.
extern const char kPageNumScriptTagsGTOne[];
// Set if the number of <script> elements in the page is greater than 6.
extern const char kPageNumScriptTagsGTSix[];

////////////////////////////////////////////////////
// Other DOM HTML features
////////////////////////////////////////////////////

// The fraction of images whose src attribute points to an external domain.
extern const char kPageImgOtherDomainFreq[];

////////////////////////////////////////////////////
// Page term features
////////////////////////////////////////////////////

// Token feature for a term (whitespace-delimited) on a page.  Terms can be
// single words or multi-word n-grams.  Rather than adding this feature for
// every possible token on a page, only the terms that are mentioned in the
// classification model are added.
extern const char kPageTerm[];

}  // namespace features
177 } // namespace safe_browsing
179 #endif // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_