Disable view source for Developer Tools.
[chromium-blink-merge.git] / chrome / browser / safe_browsing / browser_feature_extractor.cc
blob522c3fc07d69a96dc57f855518a5d59cd75e4bf5
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/browser_feature_extractor.h"
7 #include <map>
8 #include <utility>
10 #include "base/bind.h"
11 #include "base/bind_helpers.h"
12 #include "base/format_macros.h"
13 #include "base/stl_util.h"
14 #include "base/strings/stringprintf.h"
15 #include "base/time/time.h"
16 #include "chrome/browser/common/cancelable_request.h"
17 #include "chrome/browser/history/history_service.h"
18 #include "chrome/browser/history/history_service_factory.h"
19 #include "chrome/browser/history/history_types.h"
20 #include "chrome/browser/profiles/profile.h"
21 #include "chrome/browser/safe_browsing/browser_features.h"
22 #include "chrome/browser/safe_browsing/client_side_detection_host.h"
23 #include "chrome/browser/safe_browsing/database_manager.h"
24 #include "chrome/common/safe_browsing/csd.pb.h"
25 #include "content/public/browser/browser_thread.h"
26 #include "content/public/browser/navigation_controller.h"
27 #include "content/public/browser/navigation_entry.h"
28 #include "content/public/browser/web_contents.h"
29 #include "content/public/common/page_transition_types.h"
30 #include "url/gurl.h"
32 using content::BrowserThread;
33 using content::NavigationController;
34 using content::NavigationEntry;
35 using content::WebContents;
37 namespace safe_browsing {
39 namespace {
41 const int kMaxMalwareIPPerRequest = 5;
43 void FilterBenignIpsOnIOThread(
44 scoped_refptr<SafeBrowsingDatabaseManager> database_manager,
45 IPUrlMap* ips) {
46 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
47 for (IPUrlMap::iterator it = ips->begin(); it != ips->end();) {
48 if (!database_manager.get() ||
49 !database_manager->MatchMalwareIP(it->first)) {
50 // it++ here returns a copy of the old iterator and passes it to erase.
51 ips->erase(it++);
52 } else {
53 ++it;
57 } // namespace
59 IPUrlInfo::IPUrlInfo(const std::string& url,
60 const std::string& method,
61 const std::string& referrer,
62 const ResourceType::Type& resource_type)
63 : url(url),
64 method(method),
65 referrer(referrer),
66 resource_type(resource_type) {
69 IPUrlInfo::~IPUrlInfo() {}
71 BrowseInfo::BrowseInfo() : http_status_code(0) {}
73 BrowseInfo::~BrowseInfo() {}
75 static void AddFeature(const std::string& feature_name,
76 double feature_value,
77 ClientPhishingRequest* request) {
78 DCHECK(request);
79 ClientPhishingRequest::Feature* feature =
80 request->add_non_model_feature_map();
81 feature->set_name(feature_name);
82 feature->set_value(feature_value);
83 VLOG(2) << "Browser feature: " << feature->name() << " " << feature->value();
86 static void AddMalwareIpUrlInfo(const std::string& ip,
87 const std::vector<IPUrlInfo>& meta_infos,
88 ClientMalwareRequest* request) {
89 DCHECK(request);
90 for (std::vector<IPUrlInfo>::const_iterator it = meta_infos.begin();
91 it != meta_infos.end(); ++it) {
92 ClientMalwareRequest::UrlInfo* urlinfo =
93 request->add_bad_ip_url_info();
94 // We add the information about url on the bad ip.
95 urlinfo->set_ip(ip);
96 urlinfo->set_url(it->url);
97 urlinfo->set_method(it->method);
98 urlinfo->set_referrer(it->referrer);
99 urlinfo->set_resource_type(static_cast<int>(it->resource_type));
101 DVLOG(2) << "Added url info for bad ip: " << ip;
104 static void AddNavigationFeatures(
105 const std::string& feature_prefix,
106 const NavigationController& controller,
107 int index,
108 const std::vector<GURL>& redirect_chain,
109 ClientPhishingRequest* request) {
110 NavigationEntry* entry = controller.GetEntryAtIndex(index);
111 bool is_secure_referrer = entry->GetReferrer().url.SchemeIsSecure();
112 if (!is_secure_referrer) {
113 AddFeature(base::StringPrintf("%s%s=%s",
114 feature_prefix.c_str(),
115 features::kReferrer,
116 entry->GetReferrer().url.spec().c_str()),
117 1.0,
118 request);
120 AddFeature(feature_prefix + features::kHasSSLReferrer,
121 is_secure_referrer ? 1.0 : 0.0,
122 request);
123 AddFeature(feature_prefix + features::kPageTransitionType,
124 static_cast<double>(
125 content::PageTransitionStripQualifier(
126 entry->GetTransitionType())),
127 request);
128 AddFeature(feature_prefix + features::kIsFirstNavigation,
129 index == 0 ? 1.0 : 0.0,
130 request);
131 // Redirect chain should always be at least of size one, as the rendered
132 // url is the last element in the chain.
133 if (redirect_chain.empty()) {
134 NOTREACHED();
135 return;
137 if (redirect_chain.back() != entry->GetURL()) {
138 // I originally had this as a DCHECK but I saw a failure once that I
139 // can't reproduce. It looks like it might be related to the
140 // navigation controller only keeping a limited number of navigation
141 // events. For now we'll just attach a feature specifying that this is
142 // a mismatch and try and figure out what to do with it on the server.
143 DLOG(WARNING) << "Expected:" << entry->GetURL()
144 << " Actual:" << redirect_chain.back();
145 AddFeature(feature_prefix + features::kRedirectUrlMismatch,
146 1.0,
147 request);
148 return;
150 // We skip the last element since it should just be the current url.
151 for (size_t i = 0; i < redirect_chain.size() - 1; i++) {
152 std::string printable_redirect = redirect_chain[i].spec();
153 if (redirect_chain[i].SchemeIsSecure()) {
154 printable_redirect = features::kSecureRedirectValue;
156 AddFeature(base::StringPrintf("%s%s[%" PRIuS "]=%s",
157 feature_prefix.c_str(),
158 features::kRedirect,
160 printable_redirect.c_str()),
161 1.0,
162 request);
166 BrowserFeatureExtractor::BrowserFeatureExtractor(
167 WebContents* tab,
168 ClientSideDetectionHost* host)
169 : tab_(tab),
170 host_(host),
171 weak_factory_(this) {
172 DCHECK(tab);
175 BrowserFeatureExtractor::~BrowserFeatureExtractor() {
176 weak_factory_.InvalidateWeakPtrs();
177 // Delete all the pending extractions (delete callback and request objects).
178 STLDeleteContainerPairFirstPointers(pending_extractions_.begin(),
179 pending_extractions_.end());
181 // Also cancel all the pending history service queries.
182 HistoryService* history;
183 bool success = GetHistoryService(&history);
184 DCHECK(success || pending_queries_.size() == 0);
185 // Cancel all the pending history lookups and cleanup the memory.
186 for (PendingQueriesMap::iterator it = pending_queries_.begin();
187 it != pending_queries_.end(); ++it) {
188 if (history) {
189 history->CancelRequest(it->first);
191 ExtractionData& extraction = it->second;
192 delete extraction.first; // delete request
194 pending_queries_.clear();
197 void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo* info,
198 ClientPhishingRequest* request,
199 const DoneCallback& callback) {
200 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
201 DCHECK(request);
202 DCHECK(info);
203 DCHECK_EQ(0U, request->url().find("http:"));
204 DCHECK(!callback.is_null());
205 // Extract features pertaining to this navigation.
206 const NavigationController& controller = tab_->GetController();
207 int url_index = -1;
208 int first_host_index = -1;
210 GURL request_url(request->url());
211 int index = controller.GetCurrentEntryIndex();
212 // The url that we are extracting features for should already be commited.
213 DCHECK_NE(index, -1);
214 for (; index >= 0; index--) {
215 NavigationEntry* entry = controller.GetEntryAtIndex(index);
216 if (url_index == -1 && entry->GetURL() == request_url) {
217 // It's possible that we've been on the on the possibly phishy url before
218 // in this tab, so make sure that we use the latest navigation for
219 // features.
220 // Note that it's possible that the url_index should always be the
221 // latest entry, but I'm worried about possible races during a navigation
222 // and transient entries (i.e. interstiatials) so for now we will just
223 // be cautious.
224 url_index = index;
225 } else if (index < url_index) {
226 if (entry->GetURL().host() == request_url.host()) {
227 first_host_index = index;
228 } else {
229 // We have found the possibly phishing url, but we are no longer on the
230 // host. No reason to look back any further.
231 break;
236 // Add features pertaining to how we got to
237 // 1) The candidate url
238 // 2) The first url on the same host as the candidate url (assuming that
239 // it's different from the candidate url).
240 if (url_index != -1) {
241 AddNavigationFeatures(
242 std::string(), controller, url_index, info->url_redirects, request);
244 if (first_host_index != -1) {
245 AddNavigationFeatures(features::kHostPrefix,
246 controller,
247 first_host_index,
248 info->host_redirects,
249 request);
252 ExtractBrowseInfoFeatures(*info, request);
253 pending_extractions_[request] = callback;
254 base::MessageLoop::current()->PostTask(
255 FROM_HERE,
256 base::Bind(&BrowserFeatureExtractor::StartExtractFeatures,
257 weak_factory_.GetWeakPtr(), request, callback));
260 void BrowserFeatureExtractor::ExtractMalwareFeatures(
261 BrowseInfo* info,
262 ClientMalwareRequest* request,
263 const MalwareDoneCallback& callback) {
264 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
265 DCHECK_EQ(0U, request->url().find("http:"));
266 DCHECK(!callback.is_null());
268 // Grab the IPs because they might go away before we're done
269 // checking them against the IP blacklist on the IO thread.
270 scoped_ptr<IPUrlMap> ips(new IPUrlMap);
271 ips->swap(info->ips);
273 IPUrlMap* ips_ptr = ips.get();
275 // The API doesn't take a scoped_ptr because the API gets mocked and we
276 // cannot mock an API that takes scoped_ptr as arguments.
277 scoped_ptr<ClientMalwareRequest> req(request);
279 // IP blacklist lookups have to happen on the IO thread.
280 BrowserThread::PostTaskAndReply(
281 BrowserThread::IO,
282 FROM_HERE,
283 base::Bind(&FilterBenignIpsOnIOThread,
284 host_->database_manager(),
285 ips_ptr),
286 base::Bind(&BrowserFeatureExtractor::FinishExtractMalwareFeatures,
287 weak_factory_.GetWeakPtr(),
288 base::Passed(&ips), callback, base::Passed(&req)));
291 void BrowserFeatureExtractor::ExtractBrowseInfoFeatures(
292 const BrowseInfo& info,
293 ClientPhishingRequest* request) {
294 if (info.unsafe_resource.get()) {
295 // A SafeBrowsing interstitial was shown for the current URL.
296 AddFeature(features::kSafeBrowsingMaliciousUrl +
297 info.unsafe_resource->url.spec(),
298 1.0,
299 request);
300 AddFeature(features::kSafeBrowsingOriginalUrl +
301 info.unsafe_resource->original_url.spec(),
302 1.0,
303 request);
304 AddFeature(features::kSafeBrowsingIsSubresource,
305 info.unsafe_resource->is_subresource ? 1.0 : 0.0,
306 request);
307 AddFeature(features::kSafeBrowsingThreatType,
308 static_cast<double>(info.unsafe_resource->threat_type),
309 request);
311 if (info.http_status_code != 0) {
312 AddFeature(features::kHttpStatusCode, info.http_status_code, request);
316 void BrowserFeatureExtractor::StartExtractFeatures(
317 ClientPhishingRequest* request,
318 const DoneCallback& callback) {
319 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
320 size_t removed = pending_extractions_.erase(request);
321 DCHECK_EQ(1U, removed);
322 HistoryService* history;
323 if (!request || !request->IsInitialized() || !GetHistoryService(&history)) {
324 callback.Run(false, request);
325 return;
327 CancelableRequestProvider::Handle handle = history->QueryURL(
328 GURL(request->url()),
329 true /* wants_visits */,
330 &request_consumer_,
331 base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone,
332 base::Unretained(this)));
334 StorePendingQuery(handle, request, callback);
337 void BrowserFeatureExtractor::QueryUrlHistoryDone(
338 CancelableRequestProvider::Handle handle,
339 bool success,
340 const history::URLRow* row,
341 history::VisitVector* visits) {
342 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
343 ClientPhishingRequest* request;
344 DoneCallback callback;
345 if (!GetPendingQuery(handle, &request, &callback)) {
346 DLOG(FATAL) << "No pending history query found";
347 return;
349 DCHECK(request);
350 DCHECK(!callback.is_null());
351 if (!success) {
352 // URL is not found in the history. In practice this should not
353 // happen (unless there is a real error) because we just visited
354 // that URL.
355 callback.Run(false, request);
356 return;
358 AddFeature(features::kUrlHistoryVisitCount,
359 static_cast<double>(row->visit_count()),
360 request);
362 base::Time threshold = base::Time::Now() - base::TimeDelta::FromDays(1);
363 int num_visits_24h_ago = 0;
364 int num_visits_typed = 0;
365 int num_visits_link = 0;
366 for (history::VisitVector::const_iterator it = visits->begin();
367 it != visits->end(); ++it) {
368 if (!content::PageTransitionIsMainFrame(it->transition)) {
369 continue;
371 if (it->visit_time < threshold) {
372 ++num_visits_24h_ago;
374 content::PageTransition transition = content::PageTransitionStripQualifier(
375 it->transition);
376 if (transition == content::PAGE_TRANSITION_TYPED) {
377 ++num_visits_typed;
378 } else if (transition == content::PAGE_TRANSITION_LINK) {
379 ++num_visits_link;
382 AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo,
383 static_cast<double>(num_visits_24h_ago),
384 request);
385 AddFeature(features::kUrlHistoryTypedCount,
386 static_cast<double>(num_visits_typed),
387 request);
388 AddFeature(features::kUrlHistoryLinkCount,
389 static_cast<double>(num_visits_link),
390 request);
392 // Issue next history lookup for host visits.
393 HistoryService* history;
394 if (!GetHistoryService(&history)) {
395 callback.Run(false, request);
396 return;
398 CancelableRequestProvider::Handle next_handle =
399 history->GetVisibleVisitCountToHost(
400 GURL(request->url()),
401 &request_consumer_,
402 base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone,
403 base::Unretained(this)));
404 StorePendingQuery(next_handle, request, callback);
407 void BrowserFeatureExtractor::QueryHttpHostVisitsDone(
408 CancelableRequestProvider::Handle handle,
409 bool success,
410 int num_visits,
411 base::Time first_visit) {
412 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
413 ClientPhishingRequest* request;
414 DoneCallback callback;
415 if (!GetPendingQuery(handle, &request, &callback)) {
416 DLOG(FATAL) << "No pending history query found";
417 return;
419 DCHECK(request);
420 DCHECK(!callback.is_null());
421 if (!success) {
422 callback.Run(false, request);
423 return;
425 SetHostVisitsFeatures(num_visits, first_visit, true, request);
427 // Same lookup but for the HTTPS URL.
428 HistoryService* history;
429 if (!GetHistoryService(&history)) {
430 callback.Run(false, request);
431 return;
433 std::string https_url = request->url();
434 CancelableRequestProvider::Handle next_handle =
435 history->GetVisibleVisitCountToHost(
436 GURL(https_url.replace(0, 5, "https:")),
437 &request_consumer_,
438 base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone,
439 base::Unretained(this)));
440 StorePendingQuery(next_handle, request, callback);
443 void BrowserFeatureExtractor::QueryHttpsHostVisitsDone(
444 CancelableRequestProvider::Handle handle,
445 bool success,
446 int num_visits,
447 base::Time first_visit) {
448 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
449 ClientPhishingRequest* request;
450 DoneCallback callback;
451 if (!GetPendingQuery(handle, &request, &callback)) {
452 DLOG(FATAL) << "No pending history query found";
453 return;
455 DCHECK(request);
456 DCHECK(!callback.is_null());
457 if (!success) {
458 callback.Run(false, request);
459 return;
461 SetHostVisitsFeatures(num_visits, first_visit, false, request);
462 callback.Run(true, request); // We're done with all the history lookups.
465 void BrowserFeatureExtractor::SetHostVisitsFeatures(
466 int num_visits,
467 base::Time first_visit,
468 bool is_http_query,
469 ClientPhishingRequest* request) {
470 DCHECK(request);
471 AddFeature(is_http_query ?
472 features::kHttpHostVisitCount : features::kHttpsHostVisitCount,
473 static_cast<double>(num_visits),
474 request);
475 if (num_visits > 0) {
476 AddFeature(
477 is_http_query ?
478 features::kFirstHttpHostVisitMoreThan24hAgo :
479 features::kFirstHttpsHostVisitMoreThan24hAgo,
480 (first_visit < (base::Time::Now() - base::TimeDelta::FromDays(1))) ?
481 1.0 : 0.0,
482 request);
486 void BrowserFeatureExtractor::StorePendingQuery(
487 CancelableRequestProvider::Handle handle,
488 ClientPhishingRequest* request,
489 const DoneCallback& callback) {
490 DCHECK_EQ(0U, pending_queries_.count(handle));
491 pending_queries_[handle] = std::make_pair(request, callback);
494 bool BrowserFeatureExtractor::GetPendingQuery(
495 CancelableRequestProvider::Handle handle,
496 ClientPhishingRequest** request,
497 DoneCallback* callback) {
498 PendingQueriesMap::iterator it = pending_queries_.find(handle);
499 DCHECK(it != pending_queries_.end());
500 if (it != pending_queries_.end()) {
501 *request = it->second.first;
502 *callback = it->second.second;
503 pending_queries_.erase(it);
504 return true;
506 return false;
509 bool BrowserFeatureExtractor::GetHistoryService(HistoryService** history) {
510 *history = NULL;
511 if (tab_ && tab_->GetBrowserContext()) {
512 Profile* profile = Profile::FromBrowserContext(tab_->GetBrowserContext());
513 *history = HistoryServiceFactory::GetForProfile(profile,
514 Profile::EXPLICIT_ACCESS);
515 if (*history) {
516 return true;
519 VLOG(2) << "Unable to query history. No history service available.";
520 return false;
523 void BrowserFeatureExtractor::FinishExtractMalwareFeatures(
524 scoped_ptr<IPUrlMap> bad_ips,
525 MalwareDoneCallback callback,
526 scoped_ptr<ClientMalwareRequest> request) {
527 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
528 int matched_bad_ips = 0;
529 for (IPUrlMap::const_iterator it = bad_ips->begin();
530 it != bad_ips->end(); ++it) {
531 AddMalwareIpUrlInfo(it->first, it->second, request.get());
532 ++matched_bad_ips;
533 // Limit the number of matched bad IPs in one request to control
534 // the request's size
535 if (matched_bad_ips >= kMaxMalwareIPPerRequest) {
536 break;
539 callback.Run(true, request.Pass());
542 } // namespace safe_browsing