Roll src/third_party/WebKit eac3800:0237a66 (svn 202606:202607)
[chromium-blink-merge.git] / chrome / browser / safe_browsing / browser_feature_extractor.cc
blob29b4d4a7d51db34d19dc789eb63594398498baf0
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/browser_feature_extractor.h"
7 #include <map>
8 #include <utility>
10 #include "base/bind.h"
11 #include "base/bind_helpers.h"
12 #include "base/format_macros.h"
13 #include "base/location.h"
14 #include "base/single_thread_task_runner.h"
15 #include "base/stl_util.h"
16 #include "base/strings/stringprintf.h"
17 #include "base/thread_task_runner_handle.h"
18 #include "base/time/time.h"
19 #include "chrome/browser/history/history_service_factory.h"
20 #include "chrome/browser/profiles/profile.h"
21 #include "chrome/browser/safe_browsing/browser_features.h"
22 #include "chrome/browser/safe_browsing/client_side_detection_host.h"
23 #include "chrome/browser/safe_browsing/database_manager.h"
24 #include "chrome/common/safe_browsing/csd.pb.h"
25 #include "components/history/core/browser/history_service.h"
26 #include "components/history/core/browser/history_types.h"
27 #include "content/public/browser/browser_thread.h"
28 #include "content/public/browser/navigation_controller.h"
29 #include "content/public/browser/navigation_entry.h"
30 #include "content/public/browser/web_contents.h"
31 #include "ui/base/page_transition_types.h"
32 #include "url/gurl.h"
34 using content::BrowserThread;
35 using content::NavigationController;
36 using content::NavigationEntry;
37 using content::ResourceType;
38 using content::WebContents;
40 namespace safe_browsing {
42 namespace {
// Cap on how many matched bad IPs are reported in a single malware request,
// so that the request's size stays under control.
const int kMaxMalwareIPPerRequest = 5;
46 void FilterBenignIpsOnIOThread(
47 scoped_refptr<SafeBrowsingDatabaseManager> database_manager,
48 IPUrlMap* ips) {
49 DCHECK_CURRENTLY_ON(BrowserThread::IO);
50 for (IPUrlMap::iterator it = ips->begin(); it != ips->end();) {
51 if (!database_manager.get() ||
52 !database_manager->MatchMalwareIP(it->first)) {
53 // it++ here returns a copy of the old iterator and passes it to erase.
54 ips->erase(it++);
55 } else {
56 ++it;
60 } // namespace
62 IPUrlInfo::IPUrlInfo(const std::string& url,
63 const std::string& method,
64 const std::string& referrer,
65 const ResourceType& resource_type)
66 : url(url),
67 method(method),
68 referrer(referrer),
69 resource_type(resource_type) {
72 IPUrlInfo::~IPUrlInfo() {}
74 BrowseInfo::BrowseInfo() : http_status_code(0) {}
76 BrowseInfo::~BrowseInfo() {}
78 static void AddFeature(const std::string& feature_name,
79 double feature_value,
80 ClientPhishingRequest* request) {
81 DCHECK(request);
82 ClientPhishingRequest::Feature* feature =
83 request->add_non_model_feature_map();
84 feature->set_name(feature_name);
85 feature->set_value(feature_value);
86 DVLOG(2) << "Browser feature: " << feature->name() << " " << feature->value();
89 static void AddMalwareIpUrlInfo(const std::string& ip,
90 const std::vector<IPUrlInfo>& meta_infos,
91 ClientMalwareRequest* request) {
92 DCHECK(request);
93 for (std::vector<IPUrlInfo>::const_iterator it = meta_infos.begin();
94 it != meta_infos.end(); ++it) {
95 ClientMalwareRequest::UrlInfo* urlinfo =
96 request->add_bad_ip_url_info();
97 // We add the information about url on the bad ip.
98 urlinfo->set_ip(ip);
99 urlinfo->set_url(it->url);
100 urlinfo->set_method(it->method);
101 urlinfo->set_referrer(it->referrer);
102 urlinfo->set_resource_type(static_cast<int>(it->resource_type));
104 DVLOG(2) << "Added url info for bad ip: " << ip;
107 static void AddNavigationFeatures(
108 const std::string& feature_prefix,
109 const NavigationController& controller,
110 int index,
111 const std::vector<GURL>& redirect_chain,
112 ClientPhishingRequest* request) {
113 NavigationEntry* entry = controller.GetEntryAtIndex(index);
114 bool is_secure_referrer = entry->GetReferrer().url.SchemeIsCryptographic();
115 if (!is_secure_referrer) {
116 AddFeature(base::StringPrintf("%s%s=%s",
117 feature_prefix.c_str(),
118 features::kReferrer,
119 entry->GetReferrer().url.spec().c_str()),
120 1.0,
121 request);
123 AddFeature(feature_prefix + features::kHasSSLReferrer,
124 is_secure_referrer ? 1.0 : 0.0,
125 request);
126 AddFeature(feature_prefix + features::kPageTransitionType,
127 static_cast<double>(
128 ui::PageTransitionStripQualifier(
129 entry->GetTransitionType())),
130 request);
131 AddFeature(feature_prefix + features::kIsFirstNavigation,
132 index == 0 ? 1.0 : 0.0,
133 request);
134 // Redirect chain should always be at least of size one, as the rendered
135 // url is the last element in the chain.
136 if (redirect_chain.empty()) {
137 NOTREACHED();
138 return;
140 if (redirect_chain.back() != entry->GetURL()) {
141 // I originally had this as a DCHECK but I saw a failure once that I
142 // can't reproduce. It looks like it might be related to the
143 // navigation controller only keeping a limited number of navigation
144 // events. For now we'll just attach a feature specifying that this is
145 // a mismatch and try and figure out what to do with it on the server.
146 DLOG(WARNING) << "Expected:" << entry->GetURL()
147 << " Actual:" << redirect_chain.back();
148 AddFeature(feature_prefix + features::kRedirectUrlMismatch,
149 1.0,
150 request);
151 return;
153 // We skip the last element since it should just be the current url.
154 for (size_t i = 0; i < redirect_chain.size() - 1; i++) {
155 std::string printable_redirect = redirect_chain[i].spec();
156 if (redirect_chain[i].SchemeIsCryptographic()) {
157 printable_redirect = features::kSecureRedirectValue;
159 AddFeature(base::StringPrintf("%s%s[%" PRIuS "]=%s",
160 feature_prefix.c_str(),
161 features::kRedirect,
163 printable_redirect.c_str()),
164 1.0,
165 request);
169 BrowserFeatureExtractor::BrowserFeatureExtractor(
170 WebContents* tab,
171 ClientSideDetectionHost* host)
172 : tab_(tab),
173 host_(host),
174 weak_factory_(this) {
175 DCHECK(tab);
178 BrowserFeatureExtractor::~BrowserFeatureExtractor() {
179 weak_factory_.InvalidateWeakPtrs();
182 void BrowserFeatureExtractor::ExtractFeatures(const BrowseInfo* info,
183 ClientPhishingRequest* request,
184 const DoneCallback& callback) {
185 DCHECK_CURRENTLY_ON(BrowserThread::UI);
186 DCHECK(request);
187 DCHECK(info);
188 DCHECK_EQ(0U, request->url().find("http:"));
189 DCHECK(!callback.is_null());
190 // Extract features pertaining to this navigation.
191 const NavigationController& controller = tab_->GetController();
192 int url_index = -1;
193 int first_host_index = -1;
195 GURL request_url(request->url());
196 int index = controller.GetCurrentEntryIndex();
197 // The url that we are extracting features for should already be commited.
198 DCHECK_NE(index, -1);
199 for (; index >= 0; index--) {
200 NavigationEntry* entry = controller.GetEntryAtIndex(index);
201 if (url_index == -1 && entry->GetURL() == request_url) {
202 // It's possible that we've been on the on the possibly phishy url before
203 // in this tab, so make sure that we use the latest navigation for
204 // features.
205 // Note that it's possible that the url_index should always be the
206 // latest entry, but I'm worried about possible races during a navigation
207 // and transient entries (i.e. interstiatials) so for now we will just
208 // be cautious.
209 url_index = index;
210 } else if (index < url_index) {
211 if (entry->GetURL().host() == request_url.host()) {
212 first_host_index = index;
213 } else {
214 // We have found the possibly phishing url, but we are no longer on the
215 // host. No reason to look back any further.
216 break;
221 // Add features pertaining to how we got to
222 // 1) The candidate url
223 // 2) The first url on the same host as the candidate url (assuming that
224 // it's different from the candidate url).
225 if (url_index != -1) {
226 AddNavigationFeatures(
227 std::string(), controller, url_index, info->url_redirects, request);
229 if (first_host_index != -1) {
230 AddNavigationFeatures(features::kHostPrefix,
231 controller,
232 first_host_index,
233 info->host_redirects,
234 request);
237 // The API doesn't take a scoped_ptr because the API gets mocked and we
238 // cannot mock an API that takes scoped_ptr as arguments.
239 scoped_ptr<ClientPhishingRequest> req(request);
241 ExtractBrowseInfoFeatures(*info, request);
242 base::ThreadTaskRunnerHandle::Get()->PostTask(
243 FROM_HERE,
244 base::Bind(&BrowserFeatureExtractor::StartExtractFeatures,
245 weak_factory_.GetWeakPtr(), base::Passed(&req), callback));
248 void BrowserFeatureExtractor::ExtractMalwareFeatures(
249 BrowseInfo* info,
250 ClientMalwareRequest* request,
251 const MalwareDoneCallback& callback) {
252 DCHECK_CURRENTLY_ON(BrowserThread::UI);
253 DCHECK(!callback.is_null());
255 // Grab the IPs because they might go away before we're done
256 // checking them against the IP blacklist on the IO thread.
257 scoped_ptr<IPUrlMap> ips(new IPUrlMap);
258 ips->swap(info->ips);
260 IPUrlMap* ips_ptr = ips.get();
262 // The API doesn't take a scoped_ptr because the API gets mocked and we
263 // cannot mock an API that takes scoped_ptr as arguments.
264 scoped_ptr<ClientMalwareRequest> req(request);
266 // IP blacklist lookups have to happen on the IO thread.
267 BrowserThread::PostTaskAndReply(
268 BrowserThread::IO,
269 FROM_HERE,
270 base::Bind(&FilterBenignIpsOnIOThread,
271 host_->database_manager(),
272 ips_ptr),
273 base::Bind(&BrowserFeatureExtractor::FinishExtractMalwareFeatures,
274 weak_factory_.GetWeakPtr(),
275 base::Passed(&ips), callback, base::Passed(&req)));
278 void BrowserFeatureExtractor::ExtractBrowseInfoFeatures(
279 const BrowseInfo& info,
280 ClientPhishingRequest* request) {
281 if (info.unsafe_resource.get()) {
282 // A SafeBrowsing interstitial was shown for the current URL.
283 AddFeature(features::kSafeBrowsingMaliciousUrl +
284 info.unsafe_resource->url.spec(),
285 1.0,
286 request);
287 AddFeature(features::kSafeBrowsingOriginalUrl +
288 info.unsafe_resource->original_url.spec(),
289 1.0,
290 request);
291 AddFeature(features::kSafeBrowsingIsSubresource,
292 info.unsafe_resource->is_subresource ? 1.0 : 0.0,
293 request);
294 AddFeature(features::kSafeBrowsingThreatType,
295 static_cast<double>(info.unsafe_resource->threat_type),
296 request);
298 if (info.http_status_code != 0) {
299 AddFeature(features::kHttpStatusCode, info.http_status_code, request);
303 void BrowserFeatureExtractor::StartExtractFeatures(
304 scoped_ptr<ClientPhishingRequest> request,
305 const DoneCallback& callback) {
306 DCHECK_CURRENTLY_ON(BrowserThread::UI);
307 history::HistoryService* history;
308 if (!request || !request->IsInitialized() || !GetHistoryService(&history)) {
309 callback.Run(false, request.Pass());
310 return;
312 GURL request_url(request->url());
313 history->QueryURL(request_url,
314 true /* wants_visits */,
315 base::Bind(&BrowserFeatureExtractor::QueryUrlHistoryDone,
316 base::Unretained(this),
317 base::Passed(&request),
318 callback),
319 &cancelable_task_tracker_);
322 void BrowserFeatureExtractor::QueryUrlHistoryDone(
323 scoped_ptr<ClientPhishingRequest> request,
324 const DoneCallback& callback,
325 bool success,
326 const history::URLRow& row,
327 const history::VisitVector& visits) {
328 DCHECK_CURRENTLY_ON(BrowserThread::UI);
329 DCHECK(request);
330 DCHECK(!callback.is_null());
331 if (!success) {
332 // URL is not found in the history. In practice this should not
333 // happen (unless there is a real error) because we just visited
334 // that URL.
335 callback.Run(false, request.Pass());
336 return;
338 AddFeature(features::kUrlHistoryVisitCount,
339 static_cast<double>(row.visit_count()),
340 request.get());
342 base::Time threshold = base::Time::Now() - base::TimeDelta::FromDays(1);
343 int num_visits_24h_ago = 0;
344 int num_visits_typed = 0;
345 int num_visits_link = 0;
346 for (history::VisitVector::const_iterator it = visits.begin();
347 it != visits.end();
348 ++it) {
349 if (!ui::PageTransitionIsMainFrame(it->transition)) {
350 continue;
352 if (it->visit_time < threshold) {
353 ++num_visits_24h_ago;
355 ui::PageTransition transition = ui::PageTransitionStripQualifier(
356 it->transition);
357 if (transition == ui::PAGE_TRANSITION_TYPED) {
358 ++num_visits_typed;
359 } else if (transition == ui::PAGE_TRANSITION_LINK) {
360 ++num_visits_link;
363 AddFeature(features::kUrlHistoryVisitCountMoreThan24hAgo,
364 static_cast<double>(num_visits_24h_ago),
365 request.get());
366 AddFeature(features::kUrlHistoryTypedCount,
367 static_cast<double>(num_visits_typed),
368 request.get());
369 AddFeature(features::kUrlHistoryLinkCount,
370 static_cast<double>(num_visits_link),
371 request.get());
373 // Issue next history lookup for host visits.
374 history::HistoryService* history;
375 if (!GetHistoryService(&history)) {
376 callback.Run(false, request.Pass());
377 return;
379 GURL request_url(request->url());
380 history->GetVisibleVisitCountToHost(
381 request_url,
382 base::Bind(&BrowserFeatureExtractor::QueryHttpHostVisitsDone,
383 base::Unretained(this),
384 base::Passed(&request),
385 callback),
386 &cancelable_task_tracker_);
389 void BrowserFeatureExtractor::QueryHttpHostVisitsDone(
390 scoped_ptr<ClientPhishingRequest> request,
391 const DoneCallback& callback,
392 bool success,
393 int num_visits,
394 base::Time first_visit) {
395 DCHECK_CURRENTLY_ON(BrowserThread::UI);
396 DCHECK(request);
397 DCHECK(!callback.is_null());
398 if (!success) {
399 callback.Run(false, request.Pass());
400 return;
402 SetHostVisitsFeatures(num_visits, first_visit, true, request.get());
404 // Same lookup but for the HTTPS URL.
405 history::HistoryService* history;
406 if (!GetHistoryService(&history)) {
407 callback.Run(false, request.Pass());
408 return;
410 std::string https_url = request->url();
411 history->GetVisibleVisitCountToHost(
412 GURL(https_url.replace(0, 5, "https:")),
413 base::Bind(&BrowserFeatureExtractor::QueryHttpsHostVisitsDone,
414 base::Unretained(this),
415 base::Passed(&request),
416 callback),
417 &cancelable_task_tracker_);
420 void BrowserFeatureExtractor::QueryHttpsHostVisitsDone(
421 scoped_ptr<ClientPhishingRequest> request,
422 const DoneCallback& callback,
423 bool success,
424 int num_visits,
425 base::Time first_visit) {
426 DCHECK_CURRENTLY_ON(BrowserThread::UI);
427 DCHECK(request);
428 DCHECK(!callback.is_null());
429 if (!success) {
430 callback.Run(false, request.Pass());
431 return;
433 SetHostVisitsFeatures(num_visits, first_visit, false, request.get());
434 callback.Run(true, request.Pass());
437 void BrowserFeatureExtractor::SetHostVisitsFeatures(
438 int num_visits,
439 base::Time first_visit,
440 bool is_http_query,
441 ClientPhishingRequest* request) {
442 DCHECK(request);
443 AddFeature(is_http_query ?
444 features::kHttpHostVisitCount : features::kHttpsHostVisitCount,
445 static_cast<double>(num_visits),
446 request);
447 if (num_visits > 0) {
448 AddFeature(
449 is_http_query ?
450 features::kFirstHttpHostVisitMoreThan24hAgo :
451 features::kFirstHttpsHostVisitMoreThan24hAgo,
452 (first_visit < (base::Time::Now() - base::TimeDelta::FromDays(1))) ?
453 1.0 : 0.0,
454 request);
458 bool BrowserFeatureExtractor::GetHistoryService(
459 history::HistoryService** history) {
460 *history = NULL;
461 if (tab_ && tab_->GetBrowserContext()) {
462 Profile* profile = Profile::FromBrowserContext(tab_->GetBrowserContext());
463 *history = HistoryServiceFactory::GetForProfile(
464 profile, ServiceAccessType::EXPLICIT_ACCESS);
465 if (*history) {
466 return true;
469 DVLOG(2) << "Unable to query history. No history service available.";
470 return false;
473 void BrowserFeatureExtractor::FinishExtractMalwareFeatures(
474 scoped_ptr<IPUrlMap> bad_ips,
475 MalwareDoneCallback callback,
476 scoped_ptr<ClientMalwareRequest> request) {
477 DCHECK_CURRENTLY_ON(BrowserThread::UI);
478 int matched_bad_ips = 0;
479 for (IPUrlMap::const_iterator it = bad_ips->begin();
480 it != bad_ips->end(); ++it) {
481 AddMalwareIpUrlInfo(it->first, it->second, request.get());
482 ++matched_bad_ips;
483 // Limit the number of matched bad IPs in one request to control
484 // the request's size
485 if (matched_bad_ips >= kMaxMalwareIPPerRequest) {
486 break;
489 callback.Run(true, request.Pass());
492 } // namespace safe_browsing