1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
7 #include "base/logging.h"
8 #include "base/metrics/field_trial.h"
9 #include "base/strings/string_util.h"
10 #include "base/strings/stringprintf.h"
11 #include "chrome/browser/browser_process.h"
12 #include "chrome/browser/safe_browsing/chunk.pb.h"
13 #include "components/google/core/browser/google_util.h"
14 #include "crypto/sha2.h"
15 #include "net/base/escape.h"
17 #include "url/url_util.h"
20 #include "chrome/installer/util/browser_distribution.h"
// printf-style query string appended to the report page URL. It carries two
// "%s" slots: first the "tpl" parameter value, then the "url" parameter value.
static const char kReportParams[] = "?tpl=%s&url=%s";
25 SBFullHash
SBFullHashForString(const base::StringPiece
& str
) {
27 crypto::SHA256HashString(str
, &h
.full_hash
, sizeof(h
.full_hash
));
31 // SBCachedFullHashResult ------------------------------------------------------
33 SBCachedFullHashResult::SBCachedFullHashResult() {}
35 SBCachedFullHashResult::SBCachedFullHashResult(
36 const base::Time
& in_expire_after
)
37 : expire_after(in_expire_after
) {}
39 SBCachedFullHashResult::~SBCachedFullHashResult() {}
41 // SBChunkData -----------------------------------------------------------------
43 // TODO(shess): Right now this contains a scoped_ptr<ChunkData> so that the
44 // proto buffer isn't copied all over the place, then these are contained in a
45 // ScopedVector for purposes of passing things around between tasks. This seems
46 // convoluted. Maybe it would make sense to have an overall container class
47 // returning references to a nested per-chunk class?
49 SBChunkData::SBChunkData() {
52 SBChunkData::SBChunkData(safe_browsing::ChunkData
* raw_data
)
53 : chunk_data_(raw_data
) {
54 DCHECK(chunk_data_
.get());
57 SBChunkData::~SBChunkData() {
60 bool SBChunkData::ParseFrom(const unsigned char* data
, size_t length
) {
61 scoped_ptr
<safe_browsing::ChunkData
> chunk(new safe_browsing::ChunkData());
62 if (!chunk
->ParseFromArray(data
, length
))
65 if (chunk
->chunk_type() != safe_browsing::ChunkData::ADD
&&
66 chunk
->chunk_type() != safe_browsing::ChunkData::SUB
) {
71 if (chunk
->prefix_type() == safe_browsing::ChunkData::PREFIX_4B
) {
72 hash_size
= sizeof(SBPrefix
);
73 } else if (chunk
->prefix_type() == safe_browsing::ChunkData::FULL_32B
) {
74 hash_size
= sizeof(SBFullHash
);
79 const size_t hash_count
= chunk
->hashes().size() / hash_size
;
80 if (hash_count
* hash_size
!= chunk
->hashes().size())
83 if (chunk
->chunk_type() == safe_browsing::ChunkData::SUB
&&
84 static_cast<size_t>(chunk
->add_numbers_size()) != hash_count
) {
88 chunk_data_
.swap(chunk
);
92 int SBChunkData::ChunkNumber() const {
93 return chunk_data_
->chunk_number();
96 bool SBChunkData::IsAdd() const {
97 return chunk_data_
->chunk_type() == safe_browsing::ChunkData::ADD
;
100 bool SBChunkData::IsSub() const {
101 return chunk_data_
->chunk_type() == safe_browsing::ChunkData::SUB
;
104 int SBChunkData::AddChunkNumberAt(size_t i
) const {
106 DCHECK((IsPrefix() && i
< PrefixCount()) ||
107 (IsFullHash() && i
< FullHashCount()));
108 return chunk_data_
->add_numbers(i
);
111 bool SBChunkData::IsPrefix() const {
112 return chunk_data_
->prefix_type() == safe_browsing::ChunkData::PREFIX_4B
;
115 size_t SBChunkData::PrefixCount() const {
117 return chunk_data_
->hashes().size() / sizeof(SBPrefix
);
120 SBPrefix
SBChunkData::PrefixAt(size_t i
) const {
122 DCHECK_LT(i
, PrefixCount());
125 memcpy(&prefix
, chunk_data_
->hashes().data() + i
* sizeof(SBPrefix
),
130 bool SBChunkData::IsFullHash() const {
131 return chunk_data_
->prefix_type() == safe_browsing::ChunkData::FULL_32B
;
134 size_t SBChunkData::FullHashCount() const {
135 DCHECK(IsFullHash());
136 return chunk_data_
->hashes().size() / sizeof(SBFullHash
);
139 SBFullHash
SBChunkData::FullHashAt(size_t i
) const {
140 DCHECK(IsFullHash());
141 DCHECK_LT(i
, FullHashCount());
143 SBFullHash full_hash
;
144 memcpy(&full_hash
, chunk_data_
->hashes().data() + i
* sizeof(SBFullHash
),
149 // SBListChunkRanges -----------------------------------------------------------
151 SBListChunkRanges::SBListChunkRanges(const std::string
& n
)
155 // SBChunkDelete ---------------------------------------------------------------
157 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {}
159 SBChunkDelete::~SBChunkDelete() {}
161 // Utility functions -----------------------------------------------------------
164 bool IsKnownList(const std::string
& name
) {
165 for (size_t i
= 0; i
< arraysize(safe_browsing_util::kAllLists
); ++i
) {
166 if (!strcmp(safe_browsing_util::kAllLists
[i
], name
.c_str())) {
173 // String constants for the M40 UwS Finch trial.
// Name of the field trial looked up in GetUnwantedTrialGroup().
const char kUnwantedTrialName[] = "UwSInterstitialStatus";

// Group names of that trial.
const char kOff[] = "Off";
const char kOnButInvisible[] = "OnButInvisible";
const char kOn[] = "On";
181 namespace safe_browsing_util
{
183 // Listnames that browser can process.
184 // TODO(shess): This shouldn't be OS-driven <http://crbug.com/394379>
#if defined(OS_ANDROID)
// NOTE(shess): This difference is also reflected in the store name in
// safe_browsing_database.cc.
const char kMalwareList[] = "goog-mobilemalware-shavar";
const char kPhishingList[] = "goog-mobilephish-shavar";
#else
const char kMalwareList[] = "goog-malware-shavar";
const char kPhishingList[] = "goog-phish-shavar";
#endif
const char kBinUrlList[] = "goog-badbinurl-shavar";
const char kCsdWhiteList[] = "goog-csdwhite-sha256";
const char kDownloadWhiteList[] = "goog-downloadwhite-digest256";
const char kExtensionBlacklist[] = "goog-badcrxids-digestvar";
const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar";
const char kIPBlacklist[] = "goog-badip-digest256";
const char kUnwantedUrlList[] = "goog-unwanted-shavar";
const char kInclusionWhitelist[] = "goog-csdinclusionwhite-sha256";

// Every list name defined above, in declaration order. The array bound must
// be kept in sync with the number of list-name constants.
const char* kAllLists[10] = {
  kMalwareList,
  kPhishingList,
  kBinUrlList,
  kCsdWhiteList,
  kDownloadWhiteList,
  kExtensionBlacklist,
  kSideEffectFreeWhitelist,
  kIPBlacklist,
  kUnwantedUrlList,
  kInclusionWhitelist,
};
216 ListType
GetListId(const base::StringPiece
& name
) {
218 if (name
== safe_browsing_util::kMalwareList
) {
220 } else if (name
== safe_browsing_util::kPhishingList
) {
222 } else if (name
== safe_browsing_util::kBinUrlList
) {
224 } else if (name
== safe_browsing_util::kCsdWhiteList
) {
226 } else if (name
== safe_browsing_util::kDownloadWhiteList
) {
227 id
= DOWNLOADWHITELIST
;
228 } else if (name
== safe_browsing_util::kExtensionBlacklist
) {
229 id
= EXTENSIONBLACKLIST
;
230 } else if (name
== safe_browsing_util::kSideEffectFreeWhitelist
) {
231 id
= SIDEEFFECTFREEWHITELIST
;
232 } else if (name
== safe_browsing_util::kIPBlacklist
) {
234 } else if (name
== safe_browsing_util::kUnwantedUrlList
) {
236 } else if (name
== safe_browsing_util::kInclusionWhitelist
) {
237 id
= INCLUSIONWHITELIST
;
244 bool GetListName(ListType list_id
, std::string
* list
) {
247 *list
= safe_browsing_util::kMalwareList
;
250 *list
= safe_browsing_util::kPhishingList
;
253 *list
= safe_browsing_util::kBinUrlList
;
256 *list
= safe_browsing_util::kCsdWhiteList
;
258 case DOWNLOADWHITELIST
:
259 *list
= safe_browsing_util::kDownloadWhiteList
;
261 case EXTENSIONBLACKLIST
:
262 *list
= safe_browsing_util::kExtensionBlacklist
;
264 case SIDEEFFECTFREEWHITELIST
:
265 *list
= safe_browsing_util::kSideEffectFreeWhitelist
;
268 *list
= safe_browsing_util::kIPBlacklist
;
271 *list
= safe_browsing_util::kUnwantedUrlList
;
273 case INCLUSIONWHITELIST
:
274 *list
= safe_browsing_util::kInclusionWhitelist
;
279 DCHECK(IsKnownList(*list
));
283 std::string
Unescape(const std::string
& url
) {
284 std::string
unescaped_str(url
);
285 std::string old_unescaped_str
;
286 const int kMaxLoopIterations
= 1024;
289 old_unescaped_str
= unescaped_str
;
290 unescaped_str
= net::UnescapeURLComponent(old_unescaped_str
,
291 net::UnescapeRule::CONTROL_CHARS
| net::UnescapeRule::SPACES
|
292 net::UnescapeRule::URL_SPECIAL_CHARS
);
293 } while (unescaped_str
!= old_unescaped_str
&& ++loop_var
<=
296 return unescaped_str
;
// Percent-escapes every byte of |url| that is <= ' ' (0x20), > '~' (0x7E),
// or equal to '#' or '%'. Escapes are written as '%' followed by two
// uppercase hex digits; all other bytes are copied through unchanged.
std::string Escape(const std::string& url) {
  static const char kHexString[] = "0123456789ABCDEF";

  std::string escaped_str;
  // The result is never shorter than the input.
  escaped_str.reserve(url.length());
  for (size_t i = 0; i < url.length(); ++i) {
    // Work on the unsigned value so bytes >= 0x80 compare and shift correctly.
    const unsigned char c = static_cast<unsigned char>(url[i]);
    if (c <= ' ' || c > '~' || c == '#' || c == '%') {
      escaped_str.push_back('%');
      escaped_str.push_back(kHexString[c >> 4]);
      escaped_str.push_back(kHexString[c & 0xf]);
    } else {
      escaped_str.push_back(static_cast<char>(c));
    }
  }
  return escaped_str;
}
// Returns a copy of |str| in which every run of two or more consecutive |c|
// characters is collapsed to a single |c|. Other characters, including
// repeated ones, are copied unchanged.
std::string RemoveConsecutiveChars(const std::string& str, const char c) {
  std::string output;
  output.reserve(str.size());
  // One pass: drop a |c| whenever the character already emitted before it is
  // also |c|. This avoids erasing from the middle of the string repeatedly.
  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
    if (*it == c && !output.empty() && output[output.size() - 1] == c)
      continue;
    output.push_back(*it);
  }
  return output;
}
328 // Canonicalizes url as per Google Safe Browsing Specification.
329 // See section 6.1 in
330 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
331 void CanonicalizeUrl(const GURL
& url
,
332 std::string
* canonicalized_hostname
,
333 std::string
* canonicalized_path
,
334 std::string
* canonicalized_query
) {
335 DCHECK(url
.is_valid());
337 // We only canonicalize "normal" URLs.
338 if (!url
.IsStandard())
341 // Following canonicalization steps are excluded since url parsing takes care
343 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
344 // (Exclude escaped version of these chars).
  // 2. Normalize hostname to 4 dot-separated decimal values.
346 // 3. Lowercase hostname.
347 // 4. Resolve path sequences "/../" and "/./".
349 // That leaves us with the following :-
350 // 1. Remove fragment in URL.
351 GURL url_without_fragment
;
352 GURL::Replacements f_replacements
;
353 f_replacements
.ClearRef();
354 f_replacements
.ClearUsername();
355 f_replacements
.ClearPassword();
356 url_without_fragment
= url
.ReplaceComponents(f_replacements
);
358 // 2. Do URL unescaping until no more hex encoded characters exist.
359 std::string
url_unescaped_str(Unescape(url_without_fragment
.spec()));
361 url::ParseStandardURL(url_unescaped_str
.data(), url_unescaped_str
.length(),
364 // 3. In hostname, remove all leading and trailing dots.
365 const std::string host
=
366 (parsed
.host
.len
> 0)
367 ? url_unescaped_str
.substr(parsed
.host
.begin
, parsed
.host
.len
)
369 std::string host_without_end_dots
;
370 base::TrimString(host
, ".", &host_without_end_dots
);
372 // 4. In hostname, replace consecutive dots with a single dot.
373 std::string
host_without_consecutive_dots(RemoveConsecutiveChars(
374 host_without_end_dots
, '.'));
376 // 5. In path, replace runs of consecutive slashes with a single slash.
378 (parsed
.path
.len
> 0)
379 ? url_unescaped_str
.substr(parsed
.path
.begin
, parsed
.path
.len
)
381 std::string
path_without_consecutive_slash(RemoveConsecutiveChars(path
, '/'));
383 url::Replacements
<char> hp_replacements
;
384 hp_replacements
.SetHost(
385 host_without_consecutive_dots
.data(),
386 url::Component(0, host_without_consecutive_dots
.length()));
387 hp_replacements
.SetPath(
388 path_without_consecutive_slash
.data(),
389 url::Component(0, path_without_consecutive_slash
.length()));
391 std::string url_unescaped_with_can_hostpath
;
392 url::StdStringCanonOutput
output(&url_unescaped_with_can_hostpath
);
393 url::Parsed temp_parsed
;
394 url::ReplaceComponents(url_unescaped_str
.data(),
395 url_unescaped_str
.length(),
403 // 6. Step needed to revert escaping done in url::ReplaceComponents.
404 url_unescaped_with_can_hostpath
= Unescape(url_unescaped_with_can_hostpath
);
406 // 7. After performing all above steps, percent-escape all chars in url which
407 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
408 std::string
escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath
));
409 url::Parsed final_parsed
;
410 url::ParseStandardURL(escaped_canon_url_str
.data(),
411 escaped_canon_url_str
.length(),
414 if (canonicalized_hostname
&& final_parsed
.host
.len
> 0) {
415 *canonicalized_hostname
=
416 escaped_canon_url_str
.substr(final_parsed
.host
.begin
,
417 final_parsed
.host
.len
);
419 if (canonicalized_path
&& final_parsed
.path
.len
> 0) {
420 *canonicalized_path
= escaped_canon_url_str
.substr(final_parsed
.path
.begin
,
421 final_parsed
.path
.len
);
423 if (canonicalized_query
&& final_parsed
.query
.len
> 0) {
424 *canonicalized_query
= escaped_canon_url_str
.substr(
425 final_parsed
.query
.begin
, final_parsed
.query
.len
);
429 void GenerateHostsToCheck(const GURL
& url
, std::vector
<std::string
>* hosts
) {
432 std::string canon_host
;
433 CanonicalizeUrl(url
, &canon_host
, NULL
, NULL
);
435 const std::string host
= canon_host
; // const sidesteps GCC bugs below!
439 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
440 // hostnames formed by starting with the last 5 components and successively
441 // removing the leading component. The last component isn't examined alone,
442 // since it's the TLD or a subcomponent thereof.
444 // Note that we don't need to be clever about stopping at the "real" eTLD --
445 // the data on the server side has been filtered to ensure it will not
446 // blacklist a whole TLD, and it's not significantly slower on our side to
447 // just check too much.
449 // Also note that because we have a simple blacklist, not some sort of complex
450 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
452 const size_t kMaxHostsToCheck
= 4;
453 bool skipped_last_component
= false;
454 for (std::string::const_reverse_iterator
i(host
.rbegin());
455 i
!= host
.rend() && hosts
->size() < kMaxHostsToCheck
; ++i
) {
457 if (skipped_last_component
)
458 hosts
->push_back(std::string(i
.base(), host
.end()));
460 skipped_last_component
= true;
463 hosts
->push_back(host
);
466 void GeneratePathsToCheck(const GURL
& url
, std::vector
<std::string
>* paths
) {
469 std::string canon_path
;
470 std::string canon_query
;
471 CanonicalizeUrl(url
, NULL
, &canon_path
, &canon_query
);
473 const std::string path
= canon_path
; // const sidesteps GCC bugs below!
474 const std::string query
= canon_query
;
478 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
479 // the query parameters, and also up to 4 paths formed by starting at the root
480 // and adding more path components.
482 // As with the hosts above, it doesn't matter what order we check these in.
483 const size_t kMaxPathsToCheck
= 4;
484 for (std::string::const_iterator
i(path
.begin());
485 i
!= path
.end() && paths
->size() < kMaxPathsToCheck
; ++i
) {
487 paths
->push_back(std::string(path
.begin(), i
+ 1));
490 if (!paths
->empty() && paths
->back() != path
)
491 paths
->push_back(path
);
494 paths
->push_back(path
+ "?" + query
);
497 void GeneratePatternsToCheck(const GURL
& url
, std::vector
<std::string
>* urls
) {
498 std::vector
<std::string
> hosts
, paths
;
499 GenerateHostsToCheck(url
, &hosts
);
500 GeneratePathsToCheck(url
, &paths
);
501 for (size_t h
= 0; h
< hosts
.size(); ++h
) {
502 for (size_t p
= 0; p
< paths
.size(); ++p
) {
503 urls
->push_back(hosts
[h
] + paths
[p
]);
508 GURL
GeneratePhishingReportUrl(const std::string
& report_page
,
509 const std::string
& url_to_report
,
510 bool is_client_side_detection
) {
511 const std::string current_esc
= net::EscapeQueryParamValue(url_to_report
,
515 BrowserDistribution
* dist
= BrowserDistribution::GetDistribution();
516 std::string
client_name(dist
->GetSafeBrowsingName());
518 std::string
client_name("googlechrome");
520 if (is_client_side_detection
)
521 client_name
.append("_csd");
523 GURL
report_url(report_page
+ base::StringPrintf(kReportParams
,
525 current_esc
.c_str()));
526 return google_util::AppendGoogleLocaleParam(
527 report_url
, g_browser_process
->GetApplicationLocale());
530 SBFullHash
StringToSBFullHash(const std::string
& hash_in
) {
531 DCHECK_EQ(crypto::kSHA256Length
, hash_in
.size());
533 memcpy(hash_out
.full_hash
, hash_in
.data(), crypto::kSHA256Length
);
537 std::string
SBFullHashToString(const SBFullHash
& hash
) {
538 DCHECK_EQ(crypto::kSHA256Length
, sizeof(hash
.full_hash
));
539 return std::string(hash
.full_hash
, sizeof(hash
.full_hash
));
542 UnwantedStatus
GetUnwantedTrialGroup() {
543 std::string
status(base::FieldTrialList::FindFullName(kUnwantedTrialName
));
546 if (status
== kOnButInvisible
)
547 return UWS_ON_INVISIBLE
;
553 } // namespace safe_browsing_util