1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/browser_feature_extractor.h"
10 #include "base/bind.h"
11 #include "base/bind_helpers.h"
12 #include "base/format_macros.h"
13 #include "base/location.h"
14 #include "base/single_thread_task_runner.h"
15 #include "base/stl_util.h"
16 #include "base/strings/stringprintf.h"
17 #include "base/thread_task_runner_handle.h"
18 #include "base/time/time.h"
19 #include "chrome/browser/history/history_service_factory.h"
20 #include "chrome/browser/profiles/profile.h"
21 #include "chrome/browser/safe_browsing/browser_features.h"
22 #include "chrome/browser/safe_browsing/client_side_detection_host.h"
23 #include "chrome/browser/safe_browsing/database_manager.h"
24 #include "chrome/common/safe_browsing/csd.pb.h"
25 #include "components/history/core/browser/history_service.h"
26 #include "components/history/core/browser/history_types.h"
27 #include "content/public/browser/browser_thread.h"
28 #include "content/public/browser/navigation_controller.h"
29 #include "content/public/browser/navigation_entry.h"
30 #include "content/public/browser/web_contents.h"
31 #include "ui/base/page_transition_types.h"
34 using content::BrowserThread
;
35 using content::NavigationController
;
36 using content::NavigationEntry
;
37 using content::ResourceType
;
38 using content::WebContents
;
40 namespace safe_browsing
{
// Upper bound on how many matched bad IPs are reported in a single
// ClientMalwareRequest; keeps the size of the request under control.
const int kMaxMalwareIPPerRequest = 5;
46 void FilterBenignIpsOnIOThread(
47 scoped_refptr
<SafeBrowsingDatabaseManager
> database_manager
,
49 DCHECK_CURRENTLY_ON(BrowserThread::IO
);
50 for (IPUrlMap::iterator it
= ips
->begin(); it
!= ips
->end();) {
51 if (!database_manager
.get() ||
52 !database_manager
->MatchMalwareIP(it
->first
)) {
53 // it++ here returns a copy of the old iterator and passes it to erase.
62 IPUrlInfo::IPUrlInfo(const std::string
& url
,
63 const std::string
& method
,
64 const std::string
& referrer
,
65 const ResourceType
& resource_type
)
69 resource_type(resource_type
) {
72 IPUrlInfo::~IPUrlInfo() {}
74 BrowseInfo::BrowseInfo() : http_status_code(0) {}
76 BrowseInfo::~BrowseInfo() {}
78 static void AddFeature(const std::string
& feature_name
,
80 ClientPhishingRequest
* request
) {
82 ClientPhishingRequest::Feature
* feature
=
83 request
->add_non_model_feature_map();
84 feature
->set_name(feature_name
);
85 feature
->set_value(feature_value
);
86 DVLOG(2) << "Browser feature: " << feature
->name() << " " << feature
->value();
89 static void AddMalwareIpUrlInfo(const std::string
& ip
,
90 const std::vector
<IPUrlInfo
>& meta_infos
,
91 ClientMalwareRequest
* request
) {
93 for (std::vector
<IPUrlInfo
>::const_iterator it
= meta_infos
.begin();
94 it
!= meta_infos
.end(); ++it
) {
95 ClientMalwareRequest::UrlInfo
* urlinfo
=
96 request
->add_bad_ip_url_info();
97 // We add the information about url on the bad ip.
99 urlinfo
->set_url(it
->url
);
100 urlinfo
->set_method(it
->method
);
101 urlinfo
->set_referrer(it
->referrer
);
102 urlinfo
->set_resource_type(static_cast<int>(it
->resource_type
));
104 DVLOG(2) << "Added url info for bad ip: " << ip
;
107 static void AddNavigationFeatures(
108 const std::string
& feature_prefix
,
109 const NavigationController
& controller
,
111 const std::vector
<GURL
>& redirect_chain
,
112 ClientPhishingRequest
* request
) {
113 NavigationEntry
* entry
= controller
.GetEntryAtIndex(index
);
114 bool is_secure_referrer
= entry
->GetReferrer().url
.SchemeIsCryptographic();
115 if (!is_secure_referrer
) {
116 AddFeature(base::StringPrintf("%s%s=%s",
117 feature_prefix
.c_str(),
119 entry
->GetReferrer().url
.spec().c_str()),
123 AddFeature(feature_prefix
+ features::kHasSSLReferrer
,
124 is_secure_referrer
? 1.0 : 0.0,
126 AddFeature(feature_prefix
+ features::kPageTransitionType
,
128 ui::PageTransitionStripQualifier(
129 entry
->GetTransitionType())),
131 AddFeature(feature_prefix
+ features::kIsFirstNavigation
,
132 index
== 0 ? 1.0 : 0.0,
134 // Redirect chain should always be at least of size one, as the rendered
135 // url is the last element in the chain.
136 if (redirect_chain
.empty()) {
140 if (redirect_chain
.back() != entry
->GetURL()) {
141 // I originally had this as a DCHECK but I saw a failure once that I
142 // can't reproduce. It looks like it might be related to the
143 // navigation controller only keeping a limited number of navigation
144 // events. For now we'll just attach a feature specifying that this is
145 // a mismatch and try and figure out what to do with it on the server.
146 DLOG(WARNING
) << "Expected:" << entry
->GetURL()
147 << " Actual:" << redirect_chain
.back();
148 AddFeature(feature_prefix
+ features::kRedirectUrlMismatch
,
153 // We skip the last element since it should just be the current url.
154 for (size_t i
= 0; i
< redirect_chain
.size() - 1; i
++) {
155 std::string printable_redirect
= redirect_chain
[i
].spec();
156 if (redirect_chain
[i
].SchemeIsCryptographic()) {
157 printable_redirect
= features::kSecureRedirectValue
;
159 AddFeature(base::StringPrintf("%s%s[%" PRIuS
"]=%s",
160 feature_prefix
.c_str(),
163 printable_redirect
.c_str()),
169 BrowserFeatureExtractor::BrowserFeatureExtractor(
171 ClientSideDetectionHost
* host
)
174 weak_factory_(this) {
178 BrowserFeatureExtractor::~BrowserFeatureExtractor() {
179 weak_factory_
.InvalidateWeakPtrs();
182 void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo
* info
,
183 ClientPhishingRequest
* request
,
184 const DoneCallback
& callback
) {
185 DCHECK_CURRENTLY_ON(BrowserThread::UI
);
188 DCHECK_EQ(0U, request
->url().find("http:"));
189 DCHECK(!callback
.is_null());
190 // Extract features pertaining to this navigation.
191 const NavigationController
& controller
= tab_
->GetController();
193 int first_host_index
= -1;
195 GURL
request_url(request
->url());
196 int index
= controller
.GetCurrentEntryIndex();
197 // The url that we are extracting features for should already be commited.
198 DCHECK_NE(index
, -1);
199 for (; index
>= 0; index
--) {
200 NavigationEntry
* entry
= controller
.GetEntryAtIndex(index
);
201 if (url_index
== -1 && entry
->GetURL() == request_url
) {
202 // It's possible that we've been on the on the possibly phishy url before
203 // in this tab, so make sure that we use the latest navigation for
205 // Note that it's possible that the url_index should always be the
206 // latest entry, but I'm worried about possible races during a navigation
207 // and transient entries (i.e. interstiatials) so for now we will just
210 } else if (index
< url_index
) {
211 if (entry
->GetURL().host() == request_url
.host()) {
212 first_host_index
= index
;
214 // We have found the possibly phishing url, but we are no longer on the
215 // host. No reason to look back any further.
221 // Add features pertaining to how we got to
222 // 1) The candidate url
223 // 2) The first url on the same host as the candidate url (assuming that
224 // it's different from the candidate url).
225 if (url_index
!= -1) {
226 AddNavigationFeatures(
227 std::string(), controller
, url_index
, info
->url_redirects
, request
);
229 if (first_host_index
!= -1) {
230 AddNavigationFeatures(features::kHostPrefix
,
233 info
->host_redirects
,
237 // The API doesn't take a scoped_ptr because the API gets mocked and we
238 // cannot mock an API that takes scoped_ptr as arguments.
239 scoped_ptr
<ClientPhishingRequest
> req(request
);
241 ExtractBrowseInfoFeatures(*info
, request
);
242 base::ThreadTaskRunnerHandle::Get()->PostTask(
244 base::Bind(&BrowserFeatureExtractor::StartExtractFeatures
,
245 weak_factory_
.GetWeakPtr(), base::Passed(&req
), callback
));
248 void BrowserFeatureExtractor::ExtractMalwareFeatures(
250 ClientMalwareRequest
* request
,
251 const MalwareDoneCallback
& callback
) {
252 DCHECK_CURRENTLY_ON(BrowserThread::UI
);
253 DCHECK(!callback
.is_null());
255 // Grab the IPs because they might go away before we're done
256 // checking them against the IP blacklist on the IO thread.
257 scoped_ptr
<IPUrlMap
> ips(new IPUrlMap
);
258 ips
->swap(info
->ips
);
260 IPUrlMap
* ips_ptr
= ips
.get();
262 // The API doesn't take a scoped_ptr because the API gets mocked and we
263 // cannot mock an API that takes scoped_ptr as arguments.
264 scoped_ptr
<ClientMalwareRequest
> req(request
);
266 // IP blacklist lookups have to happen on the IO thread.
267 BrowserThread::PostTaskAndReply(
270 base::Bind(&FilterBenignIpsOnIOThread
,
271 host_
->database_manager(),
273 base::Bind(&BrowserFeatureExtractor::FinishExtractMalwareFeatures
,
274 weak_factory_
.GetWeakPtr(),
275 base::Passed(&ips
), callback
, base::Passed(&req
)));
278 void BrowserFeatureExtractor::ExtractBrowseInfoFeatures(
279 const BrowseInfo
& info
,
280 ClientPhishingRequest
* request
) {
281 if (info
.unsafe_resource
.get()) {
282 // A SafeBrowsing interstitial was shown for the current URL.
283 AddFeature(features::kSafeBrowsingMaliciousUrl
+
284 info
.unsafe_resource
->url
.spec(),
287 AddFeature(features::kSafeBrowsingOriginalUrl
+
288 info
.unsafe_resource
->original_url
.spec(),
291 AddFeature(features::kSafeBrowsingIsSubresource
,
292 info
.unsafe_resource
->is_subresource
? 1.0 : 0.0,
294 AddFeature(features::kSafeBrowsingThreatType
,
295 static_cast<double>(info
.unsafe_resource
->threat_type
),
298 if (info
.http_status_code
!= 0) {
299 AddFeature(features::kHttpStatusCode
, info
.http_status_code
, request
);
303 void BrowserFeatureExtractor::StartExtractFeatures(
304 scoped_ptr
<ClientPhishingRequest
> request
,
305 const DoneCallback
& callback
) {
306 DCHECK_CURRENTLY_ON(BrowserThread::UI
);
307 history::HistoryService
* history
;
308 if (!request
|| !request
->IsInitialized() || !GetHistoryService(&history
)) {
309 callback
.Run(false, request
.Pass());
312 GURL
request_url(request
->url());
313 history
->QueryURL(request_url
,
314 true /* wants_visits */,
315 base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone
,
316 base::Unretained(this),
317 base::Passed(&request
),
319 &cancelable_task_tracker_
);
322 void BrowserFeatureExtractor::QueryUrlHistoryDone(
323 scoped_ptr
<ClientPhishingRequest
> request
,
324 const DoneCallback
& callback
,
326 const history::URLRow
& row
,
327 const history::VisitVector
& visits
) {
328 DCHECK_CURRENTLY_ON(BrowserThread::UI
);
330 DCHECK(!callback
.is_null());
332 // URL is not found in the history. In practice this should not
333 // happen (unless there is a real error) because we just visited
335 callback
.Run(false, request
.Pass());
338 AddFeature(features::kUrlHistoryVisitCount
,
339 static_cast<double>(row
.visit_count()),
342 base::Time threshold
= base::Time::Now() - base::TimeDelta::FromDays(1);
343 int num_visits_24h_ago
= 0;
344 int num_visits_typed
= 0;
345 int num_visits_link
= 0;
346 for (history::VisitVector::const_iterator it
= visits
.begin();
349 if (!ui::PageTransitionIsMainFrame(it
->transition
)) {
352 if (it
->visit_time
< threshold
) {
353 ++num_visits_24h_ago
;
355 ui::PageTransition transition
= ui::PageTransitionStripQualifier(
357 if (transition
== ui::PAGE_TRANSITION_TYPED
) {
359 } else if (transition
== ui::PAGE_TRANSITION_LINK
) {
363 AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo
,
364 static_cast<double>(num_visits_24h_ago
),
366 AddFeature(features::kUrlHistoryTypedCount
,
367 static_cast<double>(num_visits_typed
),
369 AddFeature(features::kUrlHistoryLinkCount
,
370 static_cast<double>(num_visits_link
),
373 // Issue next history lookup for host visits.
374 history::HistoryService
* history
;
375 if (!GetHistoryService(&history
)) {
376 callback
.Run(false, request
.Pass());
379 GURL
request_url(request
->url());
380 history
->GetVisibleVisitCountToHost(
382 base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone
,
383 base::Unretained(this),
384 base::Passed(&request
),
386 &cancelable_task_tracker_
);
389 void BrowserFeatureExtractor::QueryHttpHostVisitsDone(
390 scoped_ptr
<ClientPhishingRequest
> request
,
391 const DoneCallback
& callback
,
394 base::Time first_visit
) {
395 DCHECK_CURRENTLY_ON(BrowserThread::UI
);
397 DCHECK(!callback
.is_null());
399 callback
.Run(false, request
.Pass());
402 SetHostVisitsFeatures(num_visits
, first_visit
, true, request
.get());
404 // Same lookup but for the HTTPS URL.
405 history::HistoryService
* history
;
406 if (!GetHistoryService(&history
)) {
407 callback
.Run(false, request
.Pass());
410 std::string https_url
= request
->url();
411 history
->GetVisibleVisitCountToHost(
412 GURL(https_url
.replace(0, 5, "https:")),
413 base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone
,
414 base::Unretained(this),
415 base::Passed(&request
),
417 &cancelable_task_tracker_
);
420 void BrowserFeatureExtractor::QueryHttpsHostVisitsDone(
421 scoped_ptr
<ClientPhishingRequest
> request
,
422 const DoneCallback
& callback
,
425 base::Time first_visit
) {
426 DCHECK_CURRENTLY_ON(BrowserThread::UI
);
428 DCHECK(!callback
.is_null());
430 callback
.Run(false, request
.Pass());
433 SetHostVisitsFeatures(num_visits
, first_visit
, false, request
.get());
434 callback
.Run(true, request
.Pass());
437 void BrowserFeatureExtractor::SetHostVisitsFeatures(
439 base::Time first_visit
,
441 ClientPhishingRequest
* request
) {
443 AddFeature(is_http_query
?
444 features::kHttpHostVisitCount
: features::kHttpsHostVisitCount
,
445 static_cast<double>(num_visits
),
447 if (num_visits
> 0) {
450 features::kFirstHttpHostVisitMoreThan24hAgo
:
451 features::kFirstHttpsHostVisitMoreThan24hAgo
,
452 (first_visit
< (base::Time::Now() - base::TimeDelta::FromDays(1))) ?
458 bool BrowserFeatureExtractor::GetHistoryService(
459 history::HistoryService
** history
) {
461 if (tab_
&& tab_
->GetBrowserContext()) {
462 Profile
* profile
= Profile::FromBrowserContext(tab_
->GetBrowserContext());
463 *history
= HistoryServiceFactory::GetForProfile(
464 profile
, ServiceAccessType::EXPLICIT_ACCESS
);
469 DVLOG(2) << "Unable to query history. No history service available.";
473 void BrowserFeatureExtractor::FinishExtractMalwareFeatures(
474 scoped_ptr
<IPUrlMap
> bad_ips
,
475 MalwareDoneCallback callback
,
476 scoped_ptr
<ClientMalwareRequest
> request
) {
477 DCHECK_CURRENTLY_ON(BrowserThread::UI
);
478 int matched_bad_ips
= 0;
479 for (IPUrlMap::const_iterator it
= bad_ips
->begin();
480 it
!= bad_ips
->end(); ++it
) {
481 AddMalwareIpUrlInfo(it
->first
, it
->second
, request
.get());
483 // Limit the number of matched bad IPs in one request to control
484 // the request's size
485 if (matched_bad_ips
>= kMaxMalwareIPPerRequest
) {
489 callback
.Run(true, request
.Pass());
492 } // namespace safe_browsing