1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "base/strings/stringprintf.h"
10 #include "chrome/browser/browser_process.h"
11 #include "chrome/browser/safe_browsing/chunk.pb.h"
12 #include "components/google/core/browser/google_util.h"
13 #include "crypto/sha2.h"
14 #include "net/base/escape.h"
16 #include "url/url_util.h"
19 #include "chrome/installer/util/browser_distribution.h"
// Query-string template appended to the phishing report page URL.  The two %s
// slots are filled, in order, with the "tpl" and "url" parameter values.
static const char kReportParams[] = "?tpl=%s&url=%s";
24 SBFullHash
SBFullHashForString(const base::StringPiece
& str
) {
26 crypto::SHA256HashString(str
, &h
.full_hash
, sizeof(h
.full_hash
));
30 // SBCachedFullHashResult ------------------------------------------------------
32 SBCachedFullHashResult::SBCachedFullHashResult() {}
34 SBCachedFullHashResult::SBCachedFullHashResult(
35 const base::Time
& in_expire_after
)
36 : expire_after(in_expire_after
) {}
38 SBCachedFullHashResult::~SBCachedFullHashResult() {}
40 // SBChunkData -----------------------------------------------------------------
// TODO(shess): Right now this contains a scoped_ptr<ChunkData> so that the
// proto buffer isn't copied all over the place, then these are contained in a
// ScopedVector for purposes of passing things around between tasks. This seems
// convoluted. Maybe it would make sense to have an overall container class
// returning references to a nested per-chunk class?
48 SBChunkData::SBChunkData() {
51 SBChunkData::SBChunkData(safe_browsing::ChunkData
* raw_data
)
52 : chunk_data_(raw_data
) {
53 DCHECK(chunk_data_
.get());
56 SBChunkData::~SBChunkData() {
59 bool SBChunkData::ParseFrom(const unsigned char* data
, size_t length
) {
60 scoped_ptr
<safe_browsing::ChunkData
> chunk(new safe_browsing::ChunkData());
61 if (!chunk
->ParseFromArray(data
, length
))
64 if (chunk
->chunk_type() != safe_browsing::ChunkData::ADD
&&
65 chunk
->chunk_type() != safe_browsing::ChunkData::SUB
) {
70 if (chunk
->prefix_type() == safe_browsing::ChunkData::PREFIX_4B
) {
71 hash_size
= sizeof(SBPrefix
);
72 } else if (chunk
->prefix_type() == safe_browsing::ChunkData::FULL_32B
) {
73 hash_size
= sizeof(SBFullHash
);
78 const size_t hash_count
= chunk
->hashes().size() / hash_size
;
79 if (hash_count
* hash_size
!= chunk
->hashes().size())
82 if (chunk
->chunk_type() == safe_browsing::ChunkData::SUB
&&
83 static_cast<size_t>(chunk
->add_numbers_size()) != hash_count
) {
87 chunk_data_
.swap(chunk
);
91 int SBChunkData::ChunkNumber() const {
92 return chunk_data_
->chunk_number();
95 bool SBChunkData::IsAdd() const {
96 return chunk_data_
->chunk_type() == safe_browsing::ChunkData::ADD
;
99 bool SBChunkData::IsSub() const {
100 return chunk_data_
->chunk_type() == safe_browsing::ChunkData::SUB
;
103 int SBChunkData::AddChunkNumberAt(size_t i
) const {
105 DCHECK((IsPrefix() && i
< PrefixCount()) ||
106 (IsFullHash() && i
< FullHashCount()));
107 return chunk_data_
->add_numbers(i
);
110 bool SBChunkData::IsPrefix() const {
111 return chunk_data_
->prefix_type() == safe_browsing::ChunkData::PREFIX_4B
;
114 size_t SBChunkData::PrefixCount() const {
116 return chunk_data_
->hashes().size() / sizeof(SBPrefix
);
119 SBPrefix
SBChunkData::PrefixAt(size_t i
) const {
121 DCHECK_LT(i
, PrefixCount());
124 memcpy(&prefix
, chunk_data_
->hashes().data() + i
* sizeof(SBPrefix
),
129 bool SBChunkData::IsFullHash() const {
130 return chunk_data_
->prefix_type() == safe_browsing::ChunkData::FULL_32B
;
133 size_t SBChunkData::FullHashCount() const {
134 DCHECK(IsFullHash());
135 return chunk_data_
->hashes().size() / sizeof(SBFullHash
);
138 SBFullHash
SBChunkData::FullHashAt(size_t i
) const {
139 DCHECK(IsFullHash());
140 DCHECK_LT(i
, FullHashCount());
142 SBFullHash full_hash
;
143 memcpy(&full_hash
, chunk_data_
->hashes().data() + i
* sizeof(SBFullHash
),
148 // SBListChunkRanges -----------------------------------------------------------
// Constructs an SBListChunkRanges from the string |n| (presumably the list
// name -- confirm against the class declaration).
// NOTE(review): original lines 151-152, which hold the rest of this
// constructor (member initializer and body), are missing from this listing.
150 SBListChunkRanges::SBListChunkRanges(const std::string
& n
)
154 // SBChunkDelete ---------------------------------------------------------------
156 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {}
158 SBChunkDelete::~SBChunkDelete() {}
160 // Utility functions -----------------------------------------------------------
163 bool IsKnownList(const std::string
& name
) {
164 for (size_t i
= 0; i
< arraysize(safe_browsing_util::kAllLists
); ++i
) {
165 if (!strcmp(safe_browsing_util::kAllLists
[i
], name
.c_str())) {
173 namespace safe_browsing_util
{
// Listnames that browser can process.
// TODO(shess): This shouldn't be OS-driven <http://crbug.com/394379>
#if defined(OS_ANDROID)
// NOTE(shess): This difference is also reflected in the store name in
// safe_browsing_database.cc.
const char kMalwareList[] = "goog-mobilemalware-shavar";
const char kPhishingList[] = "goog-mobilephish-shavar";
#else
const char kMalwareList[] = "goog-malware-shavar";
const char kPhishingList[] = "goog-phish-shavar";
#endif
// The remaining list names are the same on every platform.
const char kBinUrlList[] = "goog-badbinurl-shavar";
const char kCsdWhiteList[] = "goog-csdwhite-sha256";
const char kDownloadWhiteList[] = "goog-downloadwhite-digest256";
const char kExtensionBlacklist[] = "goog-badcrxids-digestvar";
const char kIPBlacklist[] = "goog-badip-digest256";
const char kUnwantedUrlList[] = "goog-unwanted-shavar";
const char kInclusionWhitelist[] = "goog-csdinclusionwhite-sha256";
// Table of list-name strings, declared with 9 entries; IsKnownList() scans it
// to decide whether a name is recognised.
// NOTE(review): the initializer entries and closing brace (original lines
// 195-204) are missing from this listing.  Presumably they are the nine
// list-name constants defined in this namespace -- confirm contents and order
// against the real source.
194 const char* kAllLists
[9] = {
// Maps a list name to its ListType by comparing |name| against each of the
// list-name constants in turn and assigning the matching value to |id|.
// NOTE(review): this listing shows only the assignments for
// DOWNLOADWHITELIST, EXTENSIONBLACKLIST and INCLUSIONWHITELIST.  The
// declaration of |id|, the assignments in the other branches, any fallback
// branch for unknown names and the return statement (original lines 207, 209,
// 211, 213, 215, 221, 223, 226-230) are missing -- confirm against the real
// source.
206 ListType
GetListId(const base::StringPiece
& name
) {
208 if (name
== safe_browsing_util::kMalwareList
) {
210 } else if (name
== safe_browsing_util::kPhishingList
) {
212 } else if (name
== safe_browsing_util::kBinUrlList
) {
214 } else if (name
== safe_browsing_util::kCsdWhiteList
) {
216 } else if (name
== safe_browsing_util::kDownloadWhiteList
) {
217 id
= DOWNLOADWHITELIST
;
218 } else if (name
== safe_browsing_util::kExtensionBlacklist
) {
219 id
= EXTENSIONBLACKLIST
;
220 } else if (name
== safe_browsing_util::kIPBlacklist
) {
222 } else if (name
== safe_browsing_util::kUnwantedUrlList
) {
224 } else if (name
== safe_browsing_util::kInclusionWhitelist
) {
225 id
= INCLUSIONWHITELIST
;
// Inverse of GetListId(): writes the list name that corresponds to |list_id|
// into |*list|.  Each visible branch assigns one of the list-name constants,
// and the final DCHECK verifies that the name written is one IsKnownList()
// recognises.
// NOTE(review): the switch header, most case labels, the break/default
// handling and the return statements (original lines 233-234, 236-237,
// 239-240, 242-243, 245, 248, 251-252, 254-255, 257, 260-263, 265-266) are
// missing from this listing, so the value returned in each case cannot be
// determined here -- confirm against the real source.
232 bool GetListName(ListType list_id
, std::string
* list
) {
235 *list
= safe_browsing_util::kMalwareList
;
238 *list
= safe_browsing_util::kPhishingList
;
241 *list
= safe_browsing_util::kBinUrlList
;
244 *list
= safe_browsing_util::kCsdWhiteList
;
246 case DOWNLOADWHITELIST
:
247 *list
= safe_browsing_util::kDownloadWhiteList
;
249 case EXTENSIONBLACKLIST
:
250 *list
= safe_browsing_util::kExtensionBlacklist
;
253 *list
= safe_browsing_util::kIPBlacklist
;
256 *list
= safe_browsing_util::kUnwantedUrlList
;
258 case INCLUSIONWHITELIST
:
259 *list
= safe_browsing_util::kInclusionWhitelist
;
264 DCHECK(IsKnownList(*list
));
268 std::string
Unescape(const std::string
& url
) {
269 std::string
unescaped_str(url
);
270 std::string old_unescaped_str
;
271 const int kMaxLoopIterations
= 1024;
274 old_unescaped_str
= unescaped_str
;
275 unescaped_str
= net::UnescapeURLComponent(
276 old_unescaped_str
, net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS
|
277 net::UnescapeRule::SPACES
|
278 net::UnescapeRule::URL_SPECIAL_CHARS
);
279 } while (unescaped_str
!= old_unescaped_str
&& ++loop_var
<=
282 return unescaped_str
;
// Percent-escapes |url| byte by byte.  A byte is written as "%XX" (uppercase
// hex) when it is <= ASCII 32 (space and control characters), > '~' (DEL and
// all non-ASCII bytes), '#' or '%'.  Every other byte is copied unchanged.
std::string Escape(const std::string& url) {
  static const char kHexString[] = "0123456789ABCDEF";

  std::string escaped_str;
  // Each input byte yields at least one output byte.
  escaped_str.reserve(url.length());
  for (size_t i = 0; i < url.length(); ++i) {
    // Work on the unsigned value so bytes >= 0x80 compare and shift correctly.
    const unsigned char c = static_cast<unsigned char>(url[i]);
    if (c <= ' ' || c > '~' || c == '#' || c == '%') {
      escaped_str.push_back('%');
      escaped_str.push_back(kHexString[c >> 4]);
      escaped_str.push_back(kHexString[c & 0xf]);
    } else {
      escaped_str.push_back(static_cast<char>(c));
    }
  }
  return escaped_str;
}
// Returns a copy of |str| in which every run of two or more consecutive |c|
// characters is collapsed to a single |c|.  All other characters are kept.
std::string RemoveConsecutiveChars(const std::string& str, const char c) {
  std::string output;
  output.reserve(str.length());
  // Single pass: drop a |c| whenever the character just emitted is also |c|.
  for (size_t i = 0; i < str.length(); ++i) {
    if (str[i] == c && !output.empty() && output[output.length() - 1] == c)
      continue;
    output.push_back(str[i]);
  }
  return output;
}
314 // Canonicalizes url as per Google Safe Browsing Specification.
315 // See section 6.1 in
316 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
// |url| must be valid.  Each output pointer may be NULL, in which case that
// component is not reported; an output is only assigned when the matching
// component of the final parsed URL is non-empty.
// NOTE(review): this listing omits a number of original lines (among them
// 325, 346, 348, 354, 363, 366, 382-388 and 398), so several declarations
// (e.g. |parsed|, |path|), the statement guarded by the IsStandard() check and
// the trailing arguments of the parse/replace calls are not visible here --
// confirm against the real source.
317 void CanonicalizeUrl(const GURL
& url
,
318 std::string
* canonicalized_hostname
,
319 std::string
* canonicalized_path
,
320 std::string
* canonicalized_query
) {
321 DCHECK(url
.is_valid());
323 // We only canonicalize "normal" URLs.
324 if (!url
.IsStandard())
327 // Following canonicalization steps are excluded since url parsing takes care
329 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
330 // (Exclude escaped version of these chars).
331 // 2. Normalize hostname to 4 dot-separated decimal values.
332 // 3. Lowercase hostname.
333 // 4. Resolve path sequences "/../" and "/./".
335 // That leaves us with the following :-
336 // 1. Remove fragment in URL.
// Besides the fragment (ref), the username and password are cleared too.
337 GURL url_without_fragment
;
338 GURL::Replacements f_replacements
;
339 f_replacements
.ClearRef();
340 f_replacements
.ClearUsername();
341 f_replacements
.ClearPassword();
342 url_without_fragment
= url
.ReplaceComponents(f_replacements
);
344 // 2. Do URL unescaping until no more hex encoded characters exist.
345 std::string
url_unescaped_str(Unescape(url_without_fragment
.spec()));
347 url::ParseStandardURL(url_unescaped_str
.data(), url_unescaped_str
.length(),
350 // 3. In hostname, remove all leading and trailing dots.
351 const std::string host
=
352 (parsed
.host
.len
> 0)
353 ? url_unescaped_str
.substr(parsed
.host
.begin
, parsed
.host
.len
)
355 std::string host_without_end_dots
;
356 base::TrimString(host
, ".", &host_without_end_dots
);
358 // 4. In hostname, replace consecutive dots with a single dot.
359 std::string
host_without_consecutive_dots(RemoveConsecutiveChars(
360 host_without_end_dots
, '.'));
362 // 5. In path, replace runs of consecutive slashes with a single slash.
364 (parsed
.path
.len
> 0)
365 ? url_unescaped_str
.substr(parsed
.path
.begin
, parsed
.path
.len
)
367 std::string
path_without_consecutive_slash(RemoveConsecutiveChars(path
, '/'));
// Splice the cleaned-up host and path back into the unescaped URL.
369 url::Replacements
<char> hp_replacements
;
370 hp_replacements
.SetHost(
371 host_without_consecutive_dots
.data(),
372 url::Component(0, host_without_consecutive_dots
.length()));
373 hp_replacements
.SetPath(
374 path_without_consecutive_slash
.data(),
375 url::Component(0, path_without_consecutive_slash
.length()));
377 std::string url_unescaped_with_can_hostpath
;
378 url::StdStringCanonOutput
output(&url_unescaped_with_can_hostpath
);
379 url::Parsed temp_parsed
;
380 url::ReplaceComponents(url_unescaped_str
.data(),
381 url_unescaped_str
.length(),
389 // 6. Step needed to revert escaping done in url::ReplaceComponents.
390 url_unescaped_with_can_hostpath
= Unescape(url_unescaped_with_can_hostpath
);
392 // 7. After performing all above steps, percent-escape all chars in url which
393 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
394 std::string
escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath
));
395 url::Parsed final_parsed
;
396 url::ParseStandardURL(escaped_canon_url_str
.data(),
397 escaped_canon_url_str
.length(),
// Copy out only the components the caller asked for (non-NULL pointer) and
// that are present in the final URL.
400 if (canonicalized_hostname
&& final_parsed
.host
.len
> 0) {
401 *canonicalized_hostname
=
402 escaped_canon_url_str
.substr(final_parsed
.host
.begin
,
403 final_parsed
.host
.len
);
405 if (canonicalized_path
&& final_parsed
.path
.len
> 0) {
406 *canonicalized_path
= escaped_canon_url_str
.substr(final_parsed
.path
.begin
,
407 final_parsed
.path
.len
);
409 if (canonicalized_query
&& final_parsed
.query
.len
> 0) {
410 *canonicalized_query
= escaped_canon_url_str
.substr(
411 final_parsed
.query
.begin
, final_parsed
.query
.len
);
// Fills |*hosts| with the host-name variants of |url| that should be looked
// up.  The host is taken from CanonicalizeUrl().  The loop walks it from the
// end and appends suffixes of it while fewer than kMaxHostsToCheck entries
// have been collected, and the full canonical host is appended last.
// NOTE(review): original lines 416-417, 422-424, 442, 445 and 447-448 are
// missing from this listing.  That includes whatever precedes the
// canonicalization, any early exit, and the condition that selects which
// positions start a suffix (presumably the '.' separators) -- confirm against
// the real source.
415 void GenerateHostsToCheck(const GURL
& url
, std::vector
<std::string
>* hosts
) {
418 std::string canon_host
;
419 CanonicalizeUrl(url
, &canon_host
, NULL
, NULL
);
421 const std::string host
= canon_host
; // const sidesteps GCC bugs below!
425 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
426 // hostnames formed by starting with the last 5 components and successively
427 // removing the leading component. The last component isn't examined alone,
428 // since it's the TLD or a subcomponent thereof.
430 // Note that we don't need to be clever about stopping at the "real" eTLD --
431 // the data on the server side has been filtered to ensure it will not
432 // blacklist a whole TLD, and it's not significantly slower on our side to
433 // just check too much.
435 // Also note that because we have a simple blacklist, not some sort of complex
436 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
438 const size_t kMaxHostsToCheck
= 4;
439 bool skipped_last_component
= false;
440 for (std::string::const_reverse_iterator
i(host
.rbegin());
441 i
!= host
.rend() && hosts
->size() < kMaxHostsToCheck
; ++i
) {
// i.base() points just past the character |i| refers to, so the string built
// below is the part of |host| that follows that character.
443 if (skipped_last_component
)
444 hosts
->push_back(std::string(i
.base(), host
.end()));
446 skipped_last_component
= true;
449 hosts
->push_back(host
);
// Fills |*paths| with the path variants of |url| that should be looked up.
// Path and query come from CanonicalizeUrl().  The loop appends prefixes of
// the path while fewer than kMaxPathsToCheck entries have been collected.
// The full path is then appended if the last prefix differs from it, followed
// by the path with "?" and the query attached.
// NOTE(review): original lines 453-454, 458, 461-463, 472, 474-475, 478-479
// and 481 are missing from this listing.  That includes any early exit, the
// condition that selects which positions end a prefix (presumably the '/'
// separators) and any guard around the final push_back (presumably a
// non-empty query) -- confirm against the real source.
452 void GeneratePathsToCheck(const GURL
& url
, std::vector
<std::string
>* paths
) {
455 std::string canon_path
;
456 std::string canon_query
;
457 CanonicalizeUrl(url
, NULL
, &canon_path
, &canon_query
);
459 const std::string path
= canon_path
; // const sidesteps GCC bugs below!
460 const std::string query
= canon_query
;
464 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
465 // the query parameters, and also up to 4 paths formed by starting at the root
466 // and adding more path components.
468 // As with the hosts above, it doesn't matter what order we check these in.
469 const size_t kMaxPathsToCheck
= 4;
470 for (std::string::const_iterator
i(path
.begin());
471 i
!= path
.end() && paths
->size() < kMaxPathsToCheck
; ++i
) {
// The prefix pushed below runs from the start of |path| through the character
// at |i|, inclusive.
473 paths
->push_back(std::string(path
.begin(), i
+ 1));
476 if (!paths
->empty() && paths
->back() != path
)
477 paths
->push_back(path
);
480 paths
->push_back(path
+ "?" + query
);
483 void GeneratePatternsToCheck(const GURL
& url
, std::vector
<std::string
>* urls
) {
484 std::vector
<std::string
> hosts
, paths
;
485 GenerateHostsToCheck(url
, &hosts
);
486 GeneratePathsToCheck(url
, &paths
);
487 for (size_t h
= 0; h
< hosts
.size(); ++h
) {
488 for (size_t p
= 0; p
< paths
.size(); ++p
) {
489 urls
->push_back(hosts
[h
] + paths
[p
]);
// Builds the URL used to report |url_to_report|.  The reported URL is escaped
// as a query parameter value and substituted into kReportParams.  The
// resulting query string is appended to |report_page|, and the Google locale
// parameter for the application locale is added to the result.  When
// |is_client_side_detection| is true, "_csd" is appended to the client name.
// NOTE(review): original lines 498-500, 503, 505, 508, 510 and 514 are
// missing from this listing.  |client_name| is declared twice below,
// presumably in the two arms of a preprocessor conditional that is not
// visible.  The remaining arguments of EscapeQueryParamValue() and
// StringPrintf() are not shown either -- confirm against the real source.
494 GURL
GeneratePhishingReportUrl(const std::string
& report_page
,
495 const std::string
& url_to_report
,
496 bool is_client_side_detection
) {
497 const std::string current_esc
= net::EscapeQueryParamValue(url_to_report
,
501 BrowserDistribution
* dist
= BrowserDistribution::GetDistribution();
502 std::string
client_name(dist
->GetSafeBrowsingName());
504 std::string
client_name("googlechrome");
506 if (is_client_side_detection
)
507 client_name
.append("_csd");
509 GURL
report_url(report_page
+ base::StringPrintf(kReportParams
,
511 current_esc
.c_str()));
512 return google_util::AppendGoogleLocaleParam(
513 report_url
, g_browser_process
->GetApplicationLocale());
516 SBFullHash
StringToSBFullHash(const std::string
& hash_in
) {
517 DCHECK_EQ(crypto::kSHA256Length
, hash_in
.size());
519 memcpy(hash_out
.full_hash
, hash_in
.data(), crypto::kSHA256Length
);
523 std::string
SBFullHashToString(const SBFullHash
& hash
) {
524 DCHECK_EQ(crypto::kSHA256Length
, sizeof(hash
.full_hash
));
525 return std::string(hash
.full_hash
, sizeof(hash
.full_hash
));
528 } // namespace safe_browsing_util