1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/browser_feature_extractor.h"
10 #include "base/bind.h"
11 #include "base/bind_helpers.h"
12 #include "base/format_macros.h"
13 #include "base/stl_util.h"
14 #include "base/strings/stringprintf.h"
15 #include "base/time/time.h"
16 #include "chrome/browser/common/cancelable_request.h"
17 #include "chrome/browser/history/history_service.h"
18 #include "chrome/browser/history/history_service_factory.h"
19 #include "chrome/browser/history/history_types.h"
20 #include "chrome/browser/profiles/profile.h"
21 #include "chrome/browser/safe_browsing/browser_features.h"
22 #include "chrome/browser/safe_browsing/client_side_detection_host.h"
23 #include "chrome/browser/safe_browsing/database_manager.h"
24 #include "chrome/common/safe_browsing/csd.pb.h"
25 #include "content/public/browser/browser_thread.h"
26 #include "content/public/browser/navigation_controller.h"
27 #include "content/public/browser/navigation_entry.h"
28 #include "content/public/browser/web_contents.h"
29 #include "content/public/common/page_transition_types.h"
32 using content::BrowserThread
;
33 using content::NavigationController
;
34 using content::NavigationEntry
;
35 using content::WebContents
;
37 namespace safe_browsing
{
// Threshold that FinishExtractMalwareFeatures() compares its running count of
// matched bad IPs against, in order to bound the size of a malware request.
41 const int kMaxMalwareIPPerRequest
= 5;
// Iterates over the |ips| map and tests each key with
// SafeBrowsingDatabaseManager::MatchMalwareIP(). Asserts that it is running
// on the IO thread.
// NOTE(review): presumably entries for which there is no database manager or
// no blacklist match are erased from |ips| -- confirm against the full body.
43 void FilterBenignIpsOnIOThread(
44 scoped_refptr
<SafeBrowsingDatabaseManager
> database_manager
,
46 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO
));
47 for (IPUrlMap::iterator it
= ips
->begin(); it
!= ips
->end();) {
48 if (!database_manager
.get() ||
49 !database_manager
->MatchMalwareIP(it
->first
)) {
50 // it++ here returns a copy of the old iterator and passes it to erase.
// Constructs an IPUrlInfo from |url|, |method|, |referrer| and
// |resource_type|. The visible initializer copies |resource_type| into the
// member of the same name.
59 IPUrlInfo::IPUrlInfo(const std::string
& url
,
60 const std::string
& method
,
61 const std::string
& referrer
,
62 const ResourceType::Type
& resource_type
)
66 resource_type(resource_type
) {
// Destructor with an empty body; no explicit cleanup is performed here.
69 IPUrlInfo::~IPUrlInfo() {}
// Default constructor; initializes http_status_code to 0.
71 BrowseInfo::BrowseInfo() : http_status_code(0) {}
// Destructor with an empty body; no explicit cleanup is performed here.
73 BrowseInfo::~BrowseInfo() {}
// Appends a new entry to |request|'s non_model_feature_map, sets its name to
// |feature_name| and its value to |feature_value|, and logs the resulting
// name/value pair at verbose level 2.
75 static void AddFeature(const std::string
& feature_name
,
77 ClientPhishingRequest
* request
) {
79 ClientPhishingRequest::Feature
* feature
=
80 request
->add_non_model_feature_map();
81 feature
->set_name(feature_name
);
82 feature
->set_value(feature_value
);
83 VLOG(2) << "Browser feature: " << feature
->name() << " " << feature
->value();
// For every IPUrlInfo in |meta_infos|, adds one bad_ip_url_info entry to
// |request| and copies the url, method, referrer and resource type (cast to
// int) into it. |ip| is written to the debug log message.
86 static void AddMalwareIpUrlInfo(const std::string
& ip
,
87 const std::vector
<IPUrlInfo
>& meta_infos
,
88 ClientMalwareRequest
* request
) {
90 for (std::vector
<IPUrlInfo
>::const_iterator it
= meta_infos
.begin();
91 it
!= meta_infos
.end(); ++it
) {
92 ClientMalwareRequest::UrlInfo
* urlinfo
=
93 request
->add_bad_ip_url_info();
94 // We add the information about url on the bad ip.
96 urlinfo
->set_url(it
->url
);
97 urlinfo
->set_method(it
->method
);
98 urlinfo
->set_referrer(it
->referrer
);
99 urlinfo
->set_resource_type(static_cast<int>(it
->resource_type
));
101 DVLOG(2) << "Added url info for bad ip: " << ip
;
// Adds features describing the navigation entry at |index| of |controller|,
// with every feature name prefixed by |feature_prefix|:
//   - the referrer URL, only when the referrer's scheme is not secure;
//   - whether the referrer is secure;
//   - the page transition type with qualifiers stripped;
//   - whether the entry is the first navigation (index 0);
//   - a mismatch marker when the last redirect differs from the entry URL;
//   - one feature per redirect except the last, where secure redirects are
//     reported as features::kSecureRedirectValue instead of their spec.
104 static void AddNavigationFeatures(
105 const std::string
& feature_prefix
,
106 const NavigationController
& controller
,
108 const std::vector
<GURL
>& redirect_chain
,
109 ClientPhishingRequest
* request
) {
110 NavigationEntry
* entry
= controller
.GetEntryAtIndex(index
);
111 bool is_secure_referrer
= entry
->GetReferrer().url
.SchemeIsSecure();
112 if (!is_secure_referrer
) {
113 AddFeature(base::StringPrintf("%s%s=%s",
114 feature_prefix
.c_str(),
116 entry
->GetReferrer().url
.spec().c_str()),
120 AddFeature(feature_prefix
+ features::kHasSSLReferrer
,
121 is_secure_referrer
? 1.0 : 0.0,
123 AddFeature(feature_prefix
+ features::kPageTransitionType
,
125 content::PageTransitionStripQualifier(
126 entry
->GetTransitionType())),
128 AddFeature(feature_prefix
+ features::kIsFirstNavigation
,
129 index
== 0 ? 1.0 : 0.0,
131 // Redirect chain should always be at least of size one, as the rendered
132 // url is the last element in the chain.
133 if (redirect_chain
.empty()) {
137 if (redirect_chain
.back() != entry
->GetURL()) {
138 // I originally had this as a DCHECK but I saw a failure once that I
139 // can't reproduce. It looks like it might be related to the
140 // navigation controller only keeping a limited number of navigation
141 // events. For now we'll just attach a feature specifying that this is
142 // a mismatch and try and figure out what to do with it on the server.
143 DLOG(WARNING
) << "Expected:" << entry
->GetURL()
144 << " Actual:" << redirect_chain
.back();
145 AddFeature(feature_prefix
+ features::kRedirectUrlMismatch
,
150 // We skip the last element since it should just be the current url.
151 for (size_t i
= 0; i
< redirect_chain
.size() - 1; i
++) {
152 std::string printable_redirect
= redirect_chain
[i
].spec();
153 if (redirect_chain
[i
].SchemeIsSecure()) {
154 printable_redirect
= features::kSecureRedirectValue
;
156 AddFeature(base::StringPrintf("%s%s[%" PRIuS
"]=%s",
157 feature_prefix
.c_str(),
160 printable_redirect
.c_str()),
// Constructor. Receives a ClientSideDetectionHost pointer as |host| and
// initializes |weak_factory_| with this object.
166 BrowserFeatureExtractor::BrowserFeatureExtractor(
168 ClientSideDetectionHost
* host
)
171 weak_factory_(this) {
// Destructor. Invalidates outstanding weak pointers, deletes the pointers
// stored as keys in |pending_extractions_|, cancels each history request
// recorded in |pending_queries_|, deletes the request object attached to each
// of those queries and finally clears the map.
175 BrowserFeatureExtractor::~BrowserFeatureExtractor() {
176 weak_factory_
.InvalidateWeakPtrs();
177 // Delete all the pending extractions (delete callback and request objects).
178 STLDeleteContainerPairFirstPointers(pending_extractions_
.begin(),
179 pending_extractions_
.end());
181 // Also cancel all the pending history service queries.
182 HistoryService
* history
;
183 bool success
= GetHistoryService(&history
);
184 DCHECK(success
|| pending_queries_
.size() == 0);
185 // Cancel all the pending history lookups and cleanup the memory.
186 for (PendingQueriesMap::iterator it
= pending_queries_
.begin();
187 it
!= pending_queries_
.end(); ++it
) {
189 history
->CancelRequest(it
->first
);
191 ExtractionData
& extraction
= it
->second
;
192 delete extraction
.first
; // delete request
194 pending_queries_
.clear();
// Entry point for phishing feature extraction. Asserts that it runs on the UI
// thread, that the request URL starts with "http:" and that |callback| is
// non-null. Walks the tab's navigation entries backwards from the current
// entry to find the entry matching the request URL and, below it, the
// lowest-index entry that is still on the same host. Navigation features are
// added for those entries, followed by the BrowseInfo features. The callback
// is then stored in |pending_extractions_| and StartExtractFeatures() is
// posted to the current message loop through a weak pointer.
197 void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo
* info
,
198 ClientPhishingRequest
* request
,
199 const DoneCallback
& callback
) {
200 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
203 DCHECK_EQ(0U, request
->url().find("http:"));
204 DCHECK(!callback
.is_null());
205 // Extract features pertaining to this navigation.
206 const NavigationController
& controller
= tab_
->GetController();
208 int first_host_index
= -1;
210 GURL
request_url(request
->url());
211 int index
= controller
.GetCurrentEntryIndex();
212 // The url that we are extracting features for should already be committed.
213 DCHECK_NE(index
, -1);
214 for (; index
>= 0; index
--) {
215 NavigationEntry
* entry
= controller
.GetEntryAtIndex(index
);
216 if (url_index
== -1 && entry
->GetURL() == request_url
) {
217 // It's possible that we've been on the possibly phishy url before
218 // in this tab, so make sure that we use the latest navigation for
220 // Note that it's possible that the url_index should always be the
221 // latest entry, but I'm worried about possible races during a navigation
222 // and transient entries (i.e. interstitials) so for now we will just
225 } else if (index
< url_index
) {
226 if (entry
->GetURL().host() == request_url
.host()) {
227 first_host_index
= index
;
229 // We have found the possibly phishing url, but we are no longer on the
230 // host. No reason to look back any further.
236 // Add features pertaining to how we got to
237 // 1) The candidate url
238 // 2) The first url on the same host as the candidate url (assuming that
239 // it's different from the candidate url).
240 if (url_index
!= -1) {
241 AddNavigationFeatures(
242 std::string(), controller
, url_index
, info
->url_redirects
, request
);
244 if (first_host_index
!= -1) {
245 AddNavigationFeatures(features::kHostPrefix
,
248 info
->host_redirects
,
252 ExtractBrowseInfoFeatures(*info
, request
);
253 pending_extractions_
[request
] = callback
;
254 base::MessageLoop::current()->PostTask(
256 base::Bind(&BrowserFeatureExtractor::StartExtractFeatures
,
257 weak_factory_
.GetWeakPtr(), request
, callback
));
// Entry point for malware feature extraction. Asserts that it runs on the UI
// thread, that the request URL starts with "http:" and that |callback| is
// non-null. Swaps the IP map out of |info| into a heap-allocated map, wraps
// |request| in a scoped_ptr, and uses PostTaskAndReply to run
// FilterBenignIpsOnIOThread with FinishExtractMalwareFeatures (bound to a weak
// pointer) as the reply. The map and the request are passed to the reply.
260 void BrowserFeatureExtractor::ExtractMalwareFeatures(
262 ClientMalwareRequest
* request
,
263 const MalwareDoneCallback
& callback
) {
264 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
265 DCHECK_EQ(0U, request
->url().find("http:"));
266 DCHECK(!callback
.is_null());
268 // Grab the IPs because they might go away before we're done
269 // checking them against the IP blacklist on the IO thread.
270 scoped_ptr
<IPUrlMap
> ips(new IPUrlMap
);
271 ips
->swap(info
->ips
);
273 IPUrlMap
* ips_ptr
= ips
.get();
275 // The API doesn't take a scoped_ptr because the API gets mocked and we
276 // cannot mock an API that takes scoped_ptr as arguments.
277 scoped_ptr
<ClientMalwareRequest
> req(request
);
279 // IP blacklist lookups have to happen on the IO thread.
280 BrowserThread::PostTaskAndReply(
283 base::Bind(&FilterBenignIpsOnIOThread
,
284 host_
->database_manager(),
286 base::Bind(&BrowserFeatureExtractor::FinishExtractMalwareFeatures
,
287 weak_factory_
.GetWeakPtr(),
288 base::Passed(&ips
), callback
, base::Passed(&req
)));
// Adds features derived from |info| to |request|. When |info| carries an
// unsafe resource, features are added for its url, its original url, whether
// it is a subresource and its threat type (as a double). The HTTP status code
// is added as a feature only when it is non-zero.
291 void BrowserFeatureExtractor::ExtractBrowseInfoFeatures(
292 const BrowseInfo
& info
,
293 ClientPhishingRequest
* request
) {
294 if (info
.unsafe_resource
.get()) {
295 // A SafeBrowsing interstitial was shown for the current URL.
296 AddFeature(features::kSafeBrowsingMaliciousUrl
+
297 info
.unsafe_resource
->url
.spec(),
300 AddFeature(features::kSafeBrowsingOriginalUrl
+
301 info
.unsafe_resource
->original_url
.spec(),
304 AddFeature(features::kSafeBrowsingIsSubresource
,
305 info
.unsafe_resource
->is_subresource
? 1.0 : 0.0,
307 AddFeature(features::kSafeBrowsingThreatType
,
308 static_cast<double>(info
.unsafe_resource
->threat_type
),
311 if (info
.http_status_code
!= 0) {
312 AddFeature(features::kHttpStatusCode
, info
.http_status_code
, request
);
// Continuation posted by ExtractFeatures(). Removes |request| from
// |pending_extractions_| (exactly one entry is expected), and runs |callback|
// with false when the request is null, not initialized, or no history service
// can be obtained. It also issues HistoryService::QueryURL() for the request
// URL, asking for visits, with QueryUrlHistoryDone as the completion callback,
// and stores the returned handle together with the request and callback.
316 void BrowserFeatureExtractor::StartExtractFeatures(
317 ClientPhishingRequest
* request
,
318 const DoneCallback
& callback
) {
319 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
320 size_t removed
= pending_extractions_
.erase(request
);
321 DCHECK_EQ(1U, removed
);
322 HistoryService
* history
;
323 if (!request
|| !request
->IsInitialized() || !GetHistoryService(&history
)) {
324 callback
.Run(false, request
);
327 CancelableRequestProvider::Handle handle
= history
->QueryURL(
328 GURL(request
->url()),
329 true /* wants_visits */,
331 base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone
,
332 base::Unretained(this)));
334 StorePendingQuery(handle
, request
, callback
);
// Completion callback for the QueryURL() history lookup. Retrieves the
// pending request and callback for |handle|, adds the total visit count from
// |row| as a feature, then walks |visits| checking
// PageTransitionIsMainFrame() on each and counting those older than 24 hours,
// those with a typed transition and those with a link transition. The three
// counts are added as features, after which the host visit-count lookup is
// issued for the request URL with QueryHttpHostVisitsDone as its callback.
337 void BrowserFeatureExtractor::QueryUrlHistoryDone(
338 CancelableRequestProvider::Handle handle
,
340 const history::URLRow
* row
,
341 history::VisitVector
* visits
) {
342 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
343 ClientPhishingRequest
* request
;
344 DoneCallback callback
;
345 if (!GetPendingQuery(handle
, &request
, &callback
)) {
346 DLOG(FATAL
) << "No pending history query found";
350 DCHECK(!callback
.is_null());
352 // URL is not found in the history. In practice this should not
353 // happen (unless there is a real error) because we just visited
355 callback
.Run(false, request
);
358 AddFeature(features::kUrlHistoryVisitCount
,
359 static_cast<double>(row
->visit_count()),
362 base::Time threshold
= base::Time::Now() - base::TimeDelta::FromDays(1);
363 int num_visits_24h_ago
= 0;
364 int num_visits_typed
= 0;
365 int num_visits_link
= 0;
366 for (history::VisitVector::const_iterator it
= visits
->begin();
367 it
!= visits
->end(); ++it
) {
368 if (!content::PageTransitionIsMainFrame(it
->transition
)) {
371 if (it
->visit_time
< threshold
) {
372 ++num_visits_24h_ago
;
374 content::PageTransition transition
= content::PageTransitionStripQualifier(
376 if (transition
== content::PAGE_TRANSITION_TYPED
) {
378 } else if (transition
== content::PAGE_TRANSITION_LINK
) {
382 AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo
,
383 static_cast<double>(num_visits_24h_ago
),
385 AddFeature(features::kUrlHistoryTypedCount
,
386 static_cast<double>(num_visits_typed
),
388 AddFeature(features::kUrlHistoryLinkCount
,
389 static_cast<double>(num_visits_link
),
392 // Issue next history lookup for host visits.
393 HistoryService
* history
;
394 if (!GetHistoryService(&history
)) {
395 callback
.Run(false, request
);
398 CancelableRequestProvider::Handle next_handle
=
399 history
->GetVisibleVisitCountToHost(
400 GURL(request
->url()),
402 base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone
,
403 base::Unretained(this)));
404 StorePendingQuery(next_handle
, request
, callback
);
// Completion callback for the host visit-count lookup on the HTTP URL.
// Retrieves the pending request and callback for |handle|, records the host
// visit features with the HTTP flag set, then issues the same lookup for a
// copy of the request URL whose first five characters are replaced by
// "https:", with QueryHttpsHostVisitsDone as its callback.
407 void BrowserFeatureExtractor::QueryHttpHostVisitsDone(
408 CancelableRequestProvider::Handle handle
,
411 base::Time first_visit
) {
412 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
413 ClientPhishingRequest
* request
;
414 DoneCallback callback
;
415 if (!GetPendingQuery(handle
, &request
, &callback
)) {
416 DLOG(FATAL
) << "No pending history query found";
420 DCHECK(!callback
.is_null());
422 callback
.Run(false, request
);
425 SetHostVisitsFeatures(num_visits
, first_visit
, true, request
);
427 // Same lookup but for the HTTPS URL.
428 HistoryService
* history
;
429 if (!GetHistoryService(&history
)) {
430 callback
.Run(false, request
);
433 std::string https_url
= request
->url();
434 CancelableRequestProvider::Handle next_handle
=
435 history
->GetVisibleVisitCountToHost(
436 GURL(https_url
.replace(0, 5, "https:")),
438 base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone
,
439 base::Unretained(this)));
440 StorePendingQuery(next_handle
, request
, callback
);
// Completion callback for the host visit-count lookup on the HTTPS URL.
// Retrieves the pending request and callback for |handle|, records the host
// visit features with the HTTP flag cleared, and runs |callback| with true to
// signal that the chain of history lookups has finished.
443 void BrowserFeatureExtractor::QueryHttpsHostVisitsDone(
444 CancelableRequestProvider::Handle handle
,
447 base::Time first_visit
) {
448 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
449 ClientPhishingRequest
* request
;
450 DoneCallback callback
;
451 if (!GetPendingQuery(handle
, &request
, &callback
)) {
452 DLOG(FATAL
) << "No pending history query found";
456 DCHECK(!callback
.is_null());
458 callback
.Run(false, request
);
461 SetHostVisitsFeatures(num_visits
, first_visit
, false, request
);
462 callback
.Run(true, request
); // We're done with all the history lookups.
// Adds the host visit count feature, choosing the HTTP or HTTPS feature name
// according to |is_http_query|. When |num_visits| is positive, a second
// feature (again HTTP or HTTPS flavoured) is derived from whether
// |first_visit| lies more than one day before the current time.
465 void BrowserFeatureExtractor::SetHostVisitsFeatures(
467 base::Time first_visit
,
469 ClientPhishingRequest
* request
) {
471 AddFeature(is_http_query
?
472 features::kHttpHostVisitCount
: features::kHttpsHostVisitCount
,
473 static_cast<double>(num_visits
),
475 if (num_visits
> 0) {
478 features::kFirstHttpHostVisitMoreThan24hAgo
:
479 features::kFirstHttpsHostVisitMoreThan24hAgo
,
480 (first_visit
< (base::Time::Now() - base::TimeDelta::FromDays(1))) ?
// Records the (|request|, |callback|) pair under |handle| in
// |pending_queries_|. Debug-asserts that no entry exists for |handle| yet.
486 void BrowserFeatureExtractor::StorePendingQuery(
487 CancelableRequestProvider::Handle handle
,
488 ClientPhishingRequest
* request
,
489 const DoneCallback
& callback
) {
490 DCHECK_EQ(0U, pending_queries_
.count(handle
));
491 pending_queries_
[handle
] = std::make_pair(request
, callback
);
// Looks up |handle| in |pending_queries_|. When an entry exists, its request
// and callback are copied into |*request| and |*callback| and the entry is
// erased from the map. A missing entry trips a DCHECK in debug builds.
// NOTE(review): presumably returns true only when the entry was found --
// callers test the result with `!GetPendingQuery(...)`; confirm.
494 bool BrowserFeatureExtractor::GetPendingQuery(
495 CancelableRequestProvider::Handle handle
,
496 ClientPhishingRequest
** request
,
497 DoneCallback
* callback
) {
498 PendingQueriesMap::iterator it
= pending_queries_
.find(handle
);
499 DCHECK(it
!= pending_queries_
.end());
500 if (it
!= pending_queries_
.end()) {
501 *request
= it
->second
.first
;
502 *callback
= it
->second
.second
;
503 pending_queries_
.erase(it
);
// When |tab_| and its browser context are both non-null, resolves the Profile
// for that context and stores the HistoryService obtained with
// Profile::EXPLICIT_ACCESS into |*history|. Logs at verbose level 2 when no
// history service is available.
// NOTE(review): presumably the bool result reports whether |*history| was set
// to a usable service -- confirm.
509 bool BrowserFeatureExtractor::GetHistoryService(HistoryService
** history
) {
511 if (tab_
&& tab_
->GetBrowserContext()) {
512 Profile
* profile
= Profile::FromBrowserContext(tab_
->GetBrowserContext());
513 *history
= HistoryServiceFactory::GetForProfile(profile
,
514 Profile::EXPLICIT_ACCESS
);
519 VLOG(2) << "Unable to query history. No history service available.";
// Reply half of ExtractMalwareFeatures(); asserts that it runs on the UI
// thread. For each entry of |bad_ips| the associated URL infos are added to
// |request|, and the running count of matched IPs is checked against
// kMaxMalwareIPPerRequest. Ownership of |request| is then passed to
// |callback| together with a success flag of true.
523 void BrowserFeatureExtractor::FinishExtractMalwareFeatures(
524 scoped_ptr
<IPUrlMap
> bad_ips
,
525 MalwareDoneCallback callback
,
526 scoped_ptr
<ClientMalwareRequest
> request
) {
527 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI
));
528 int matched_bad_ips
= 0;
529 for (IPUrlMap::const_iterator it
= bad_ips
->begin();
530 it
!= bad_ips
->end(); ++it
) {
531 AddMalwareIpUrlInfo(it
->first
, it
->second
, request
.get());
533 // Limit the number of matched bad IPs in one request to control
534 // the request's size
535 if (matched_bad_ips
>= kMaxMalwareIPPerRequest
) {
539 callback
.Run(true, request
.Pass());
542 } // namespace safe_browsing