1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "base/strings/stringprintf.h"
10 #include "chrome/browser/browser_process.h"
11 #include "chrome/browser/safe_browsing/chunk.pb.h"
12 #include "components/google/core/browser/google_util.h"
13 #include "crypto/sha2.h"
14 #include "net/base/escape.h"
16 #include "url/url_util.h"
19 #include "chrome/installer/util/browser_distribution.h"
// printf-style query-string template with two %s slots (the "tpl" and "url"
// parameters). It is expanded with base::StringPrintf in
// GeneratePhishingReportUrl.
22 static const char kReportParams
[] = "?tpl=%s&url=%s";
// Hashes |str| with crypto::SHA256HashString, writing sizeof(h.full_hash)
// bytes of digest into |h.full_hash|.
// NOTE(review): the declaration of |h| and the return statement are not
// visible in this view; presumably |h| is a local SBFullHash that is returned
// by value -- confirm.
24 SBFullHash
SBFullHashForString(const base::StringPiece
& str
) {
26 crypto::SHA256HashString(str
, &h
.full_hash
, sizeof(h
.full_hash
));
30 // SBCachedFullHashResult ------------------------------------------------------
// Default constructor: empty body and no initializer list, so every member is
// default-initialized.
32 SBCachedFullHashResult::SBCachedFullHashResult() {}
// Constructs a result whose |expire_after| member is copied from
// |in_expire_after|. No other member is explicitly initialized here.
34 SBCachedFullHashResult::SBCachedFullHashResult(
35 const base::Time
& in_expire_after
)
36 : expire_after(in_expire_after
) {}
// Destructor with an empty body; members clean themselves up.
38 SBCachedFullHashResult::~SBCachedFullHashResult() {}
40 // SBChunkData -----------------------------------------------------------------
42 // TODO(shess): Right now this contains a scoped_ptr<ChunkData> so that the
43 // proto buffer isn't copied all over the place, then these are contained in a
44 // ScopedVector for purposes of passing things around between tasks. This seems
45 // convoluted. Maybe it would make sense to have an overall container class
46 // returning references to a nested per-chunk class?
// Default constructor. No initializer list is given, so |chunk_data_| is
// default-constructed.
// NOTE(review): the rest of the body (at least the closing brace) is not
// visible in this view.
48 SBChunkData::SBChunkData() {
// Constructs a chunk wrapper around |raw_data|, which is used to initialize
// |chunk_data_|. DCHECKs that the stored pointer is non-null.
// NOTE(review): presumably this takes ownership of |raw_data| (the TODO above
// describes the member as a scoped_ptr<ChunkData>) -- confirm in the header.
51 SBChunkData::SBChunkData(safe_browsing::ChunkData
* raw_data
)
52 : chunk_data_(raw_data
) {
53 DCHECK(chunk_data_
.get());
// Destructor.
// NOTE(review): the body (at least the closing brace) is not visible in this
// view.
56 SBChunkData::~SBChunkData() {
// Parses |length| bytes at |data| as a safe_browsing::ChunkData message and,
// once the checks below have been evaluated, swaps the parsed message into
// |chunk_data_|.
//
// Checks visible in this view:
//   - ParseFromArray() must succeed;
//   - chunk_type() must be ADD or SUB;
//   - prefix_type() selects |hash_size|: sizeof(SBPrefix) for PREFIX_4B,
//     sizeof(SBFullHash) for FULL_32B;
//   - hashes().size() must be an exact multiple of |hash_size|;
//   - for SUB chunks, add_numbers_size() must equal the number of hashes.
//
// NOTE(review): the statements executed when a check fails, the declaration
// of |hash_size|, the handling of other prefix types and the final return are
// not visible in this view. Presumably each failed check returns false and the
// success path returns true -- confirm.
59 bool SBChunkData::ParseFrom(const unsigned char* data
, size_t length
) {
// Parse into a local message so |chunk_data_| is only touched by the swap at
// the end.
60 scoped_ptr
<safe_browsing::ChunkData
> chunk(new safe_browsing::ChunkData());
61 if (!chunk
->ParseFromArray(data
, length
))
// Only ADD and SUB chunk types are accepted.
64 if (chunk
->chunk_type() != safe_browsing::ChunkData::ADD
&&
65 chunk
->chunk_type() != safe_browsing::ChunkData::SUB
) {
// Pick the per-entry byte size from the declared prefix type.
70 if (chunk
->prefix_type() == safe_browsing::ChunkData::PREFIX_4B
) {
71 hash_size
= sizeof(SBPrefix
);
72 } else if (chunk
->prefix_type() == safe_browsing::ChunkData::FULL_32B
) {
73 hash_size
= sizeof(SBFullHash
);
// The hashes blob must contain a whole number of entries.
78 const size_t hash_count
= chunk
->hashes().size() / hash_size
;
79 if (hash_count
* hash_size
!= chunk
->hashes().size())
// A SUB chunk needs exactly one add-chunk number per hash.
82 if (chunk
->chunk_type() == safe_browsing::ChunkData::SUB
&&
83 static_cast<size_t>(chunk
->add_numbers_size()) != hash_count
) {
// Install the parsed message as this object's data.
87 chunk_data_
.swap(chunk
);
// Returns the chunk_number() field of the wrapped ChunkData message.
// Dereferences |chunk_data_| without a null check.
91 int SBChunkData::ChunkNumber() const {
92 return chunk_data_
->chunk_number();
// Returns true when the wrapped message's chunk_type() is ChunkData::ADD.
95 bool SBChunkData::IsAdd() const {
96 return chunk_data_
->chunk_type() == safe_browsing::ChunkData::ADD
;
// Returns true when the wrapped message's chunk_type() is ChunkData::SUB.
99 bool SBChunkData::IsSub() const {
100 return chunk_data_
->chunk_type() == safe_browsing::ChunkData::SUB
;
// Returns add_numbers(i) from the wrapped message, i.e. the add-chunk number
// stored at index |i|.
// DCHECKs that |i| is below PrefixCount() for prefix chunks or below
// FullHashCount() for full-hash chunks.
// NOTE(review): a line between the signature and the DCHECK below is not
// visible in this view.
103 int SBChunkData::AddChunkNumberAt(size_t i
) const {
105 DCHECK((IsPrefix() && i
< PrefixCount()) ||
106 (IsFullHash() && i
< FullHashCount()));
107 return chunk_data_
->add_numbers(i
);
// Returns true when the wrapped message's prefix_type() is
// ChunkData::PREFIX_4B.
110 bool SBChunkData::IsPrefix() const {
111 return chunk_data_
->prefix_type() == safe_browsing::ChunkData::PREFIX_4B
;
// Returns the number of SBPrefix-sized entries in hashes(): the blob's byte
// size divided (integer division) by sizeof(SBPrefix).
// NOTE(review): a line before the return is not visible in this view.
114 size_t SBChunkData::PrefixCount() const {
116 return chunk_data_
->hashes().size() / sizeof(SBPrefix
);
// Reads the prefix at index |i| out of the hashes() byte blob. DCHECKs that
// |i| is below PrefixCount(), then memcpy()s from byte offset
// i * sizeof(SBPrefix) into |prefix|.
// NOTE(review): the declaration of |prefix|, the memcpy length argument and
// the return are not visible in this view; presumably sizeof(SBPrefix) bytes
// are copied and |prefix| is returned -- confirm.
119 SBPrefix
SBChunkData::PrefixAt(size_t i
) const {
121 DCHECK_LT(i
, PrefixCount());
124 memcpy(&prefix
, chunk_data_
->hashes().data() + i
* sizeof(SBPrefix
),
// Returns true when the wrapped message's prefix_type() is
// ChunkData::FULL_32B.
129 bool SBChunkData::IsFullHash() const {
130 return chunk_data_
->prefix_type() == safe_browsing::ChunkData::FULL_32B
;
// Returns the number of SBFullHash-sized entries in hashes(): the blob's byte
// size divided (integer division) by sizeof(SBFullHash).
// DCHECKs that this chunk holds full hashes.
133 size_t SBChunkData::FullHashCount() const {
134 DCHECK(IsFullHash());
135 return chunk_data_
->hashes().size() / sizeof(SBFullHash
);
// Reads the full hash at index |i| out of the hashes() byte blob. DCHECKs that
// the chunk holds full hashes and that |i| is below FullHashCount(), then
// memcpy()s from byte offset i * sizeof(SBFullHash) into |full_hash|.
// NOTE(review): the memcpy length argument and the return are not visible in
// this view; presumably sizeof(SBFullHash) bytes are copied and |full_hash| is
// returned -- confirm.
138 SBFullHash
SBChunkData::FullHashAt(size_t i
) const {
139 DCHECK(IsFullHash());
140 DCHECK_LT(i
, FullHashCount());
142 SBFullHash full_hash
;
143 memcpy(&full_hash
, chunk_data_
->hashes().data() + i
* sizeof(SBFullHash
),
148 // SBListChunkRanges -----------------------------------------------------------
// Constructs a chunk-range record from the string |n|.
// NOTE(review): the initializer list and body are not visible in this view;
// presumably |n| is the list name and is stored in a member -- confirm.
150 SBListChunkRanges::SBListChunkRanges(const std::string
& n
)
154 // SBChunkDelete ---------------------------------------------------------------
// Default constructor: initializes |is_sub_del| to false.
156 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {}
// Destructor with an empty body.
158 SBChunkDelete::~SBChunkDelete() {}
160 // Utility functions -----------------------------------------------------------
// Linearly scans safe_browsing_util::kAllLists, comparing each entry with
// |name| using strcmp (exact, case-sensitive match).
// NOTE(review): the statements executed on a match and after the loop are not
// visible in this view; presumably a match returns true and falling out of the
// loop returns false -- confirm.
163 bool IsKnownList(const std::string
& name
) {
164 for (size_t i
= 0; i
< arraysize(safe_browsing_util::kAllLists
); ++i
) {
165 if (!strcmp(safe_browsing_util::kAllLists
[i
], name
.c_str())) {
173 namespace safe_browsing_util
{
175 // Listnames that browser can process.
176 // TODO(shess): This shouldn't be OS-driven <http://crbug.com/394379>
// Android builds use the "mobile" variants of the malware and phishing lists.
177 #if defined(OS_ANDROID)
178 // NOTE(shess): This difference is also reflected in the store name in
179 // safe_browsing_database.cc.
180 const char kMalwareList
[] = "goog-mobilemalware-shavar";
181 const char kPhishingList
[] = "goog-mobilephish-shavar";
// NOTE(review): second pair of definitions for the same two names --
// presumably the #else branch of the conditional above (the directive is not
// visible in this view).
183 const char kMalwareList
[] = "goog-malware-shavar";
184 const char kPhishingList
[] = "goog-phish-shavar";
// Names of the remaining lists.
// NOTE(review): presumably these follow the closing #endif, which is not
// visible in this view, and are therefore the same on every platform.
186 const char kBinUrlList
[] = "goog-badbinurl-shavar";
187 const char kCsdWhiteList
[] = "goog-csdwhite-sha256";
188 const char kDownloadWhiteList
[] = "goog-downloadwhite-digest256";
189 const char kExtensionBlacklist
[] = "goog-badcrxids-digestvar";
190 const char kSideEffectFreeWhitelist
[] = "goog-sideeffectfree-shavar";
191 const char kIPBlacklist
[] = "goog-badip-digest256";
// Table of 8 list-name pointers; IsKnownList() scans it to validate a name.
// NOTE(review): only the kSideEffectFreeWhitelist entry is visible in this
// view; presumably the other entries are the remaining list-name constants
// defined above -- confirm that the count still matches the declared size.
193 const char* kAllLists
[8] = {
200 kSideEffectFreeWhitelist
,
// Maps a list name to its ListType by comparing |name| against each known
// list-name constant in turn (malware, phishing, bin-url, CSD whitelist,
// download whitelist, extension blacklist, side-effect-free whitelist, IP
// blacklist).
// NOTE(review): only the assignments for DOWNLOADWHITELIST, EXTENSIONBLACKLIST
// and SIDEEFFECTFREEWHITELIST are visible in this view. The declaration of
// |id|, the other assignments, the fallback for unknown names and the return
// are not visible -- check them against the ListType enum.
204 ListType
GetListId(const base::StringPiece
& name
) {
206 if (name
== safe_browsing_util::kMalwareList
) {
208 } else if (name
== safe_browsing_util::kPhishingList
) {
210 } else if (name
== safe_browsing_util::kBinUrlList
) {
212 } else if (name
== safe_browsing_util::kCsdWhiteList
) {
214 } else if (name
== safe_browsing_util::kDownloadWhiteList
) {
215 id
= DOWNLOADWHITELIST
;
216 } else if (name
== safe_browsing_util::kExtensionBlacklist
) {
217 id
= EXTENSIONBLACKLIST
;
218 } else if (name
== safe_browsing_util::kSideEffectFreeWhitelist
) {
219 id
= SIDEEFFECTFREEWHITELIST
;
220 } else if (name
== safe_browsing_util::kIPBlacklist
) {
// Inverse of GetListId(): writes the list name for |list_id| into |*list|.
// |list| is dereferenced unconditionally, so it must be non-null. After the
// assignment, DCHECKs that the written name passes IsKnownList().
// NOTE(review): the switch header, several case labels, the break statements,
// the handling of unknown ids and the return statements are not visible in
// this view; presumably the function returns true on success and false for an
// unrecognized |list_id| -- confirm.
228 bool GetListName(ListType list_id
, std::string
* list
) {
231 *list
= safe_browsing_util::kMalwareList
;
234 *list
= safe_browsing_util::kPhishingList
;
237 *list
= safe_browsing_util::kBinUrlList
;
240 *list
= safe_browsing_util::kCsdWhiteList
;
242 case DOWNLOADWHITELIST
:
243 *list
= safe_browsing_util::kDownloadWhiteList
;
245 case EXTENSIONBLACKLIST
:
246 *list
= safe_browsing_util::kExtensionBlacklist
;
248 case SIDEEFFECTFREEWHITELIST
:
249 *list
= safe_browsing_util::kSideEffectFreeWhitelist
;
252 *list
= safe_browsing_util::kIPBlacklist
;
// Sanity check: whatever was written must be one of the known list names.
257 DCHECK(IsKnownList(*list
));
// Repeatedly applies net::UnescapeURLComponent (with the CONTROL_CHARS, SPACES
// and URL_SPECIAL_CHARS rules) to |url| until a pass leaves the string
// unchanged or the iteration counter exceeds its bound, and returns the
// result. Repetition is needed because one pass may expose further escapes
// (e.g. "%2541" -> "%41" -> "A").
// NOTE(review): the loop head, the declaration of |loop_var| and the
// right-hand side of the "<=" comparison are not visible in this view;
// presumably the bound is kMaxLoopIterations -- confirm.
261 std::string
Unescape(const std::string
& url
) {
262 std::string
unescaped_str(url
);
263 std::string old_unescaped_str
;
// Upper bound on unescape passes, so crafted input cannot loop for long.
264 const int kMaxLoopIterations
= 1024;
267 old_unescaped_str
= unescaped_str
;
268 unescaped_str
= net::UnescapeURLComponent(old_unescaped_str
,
269 net::UnescapeRule::CONTROL_CHARS
| net::UnescapeRule::SPACES
|
270 net::UnescapeRule::URL_SPECIAL_CHARS
);
271 } while (unescaped_str
!= old_unescaped_str
&& ++loop_var
<=
274 return unescaped_str
;
// Percent-escapes |url| for Safe Browsing canonicalization and returns the
// escaped copy.
//
// A byte is escaped when it is <= ASCII 32 (space and control characters),
// >= ASCII 127 (DEL and every non-ASCII byte), '#' or '%'. Each escaped byte
// is written as '%' followed by two UPPERCASE hex digits; every other byte is
// copied through unchanged. An empty |url| yields an empty string.
std::string Escape(const std::string& url) {
  static const char kHexString[] = "0123456789ABCDEF";

  std::string escaped_str;
  // The output is at least as long as the input; reserving up front avoids
  // repeated reallocations for the common "nothing to escape" case.
  escaped_str.reserve(url.length());

  for (size_t i = 0; i < url.length(); ++i) {
    // Work on the unsigned value so bytes >= 0x80 compare as > '~' and index
    // |kHexString| correctly instead of being treated as negative chars.
    const unsigned char c = static_cast<unsigned char>(url[i]);
    if (c <= ' ' || c > '~' || c == '#' || c == '%') {
      escaped_str.push_back('%');
      escaped_str.push_back(kHexString[c >> 4]);
      escaped_str.push_back(kHexString[c & 0xf]);
    } else {
      escaped_str.push_back(static_cast<char>(c));
    }
  }

  return escaped_str;
}
// Returns a copy of |str| in which every run of two or more consecutive |c|
// characters is collapsed to a single |c|. All other characters, including
// isolated occurrences of |c|, are preserved in order.
//
// Done as one left-to-right pass: repeatedly searching for "cc" and erasing
// one character from the middle of the string costs quadratic time on long
// runs, whereas appending to a fresh buffer is linear and gives the same
// result.
std::string RemoveConsecutiveChars(const std::string& str, const char c) {
  std::string output;
  output.reserve(str.length());

  for (std::string::size_type i = 0; i < str.length(); ++i) {
    // Drop |c| when the character just emitted is also |c|; that keeps exactly
    // the first character of each run.
    if (str[i] == c && !output.empty() && output[output.length() - 1] == c)
      continue;
    output.push_back(str[i]);
  }

  return output;
}
306 // Canonicalizes url as per Google Safe Browsing Specification.
307 // See section 6.1 in
308 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
//
// |url| must be valid (DCHECKed). Each of |canonicalized_hostname|,
// |canonicalized_path| and |canonicalized_query| is optional: a NULL pointer
// is skipped, and a non-NULL pointer is only assigned when the corresponding
// component of the final parsed URL has a length greater than zero.
//
// NOTE(review): several statements are not visible in this view (the action
// taken for non-standard URLs, the declarations of |parsed| and |path|, the
// fallback values of the two conditional expressions, and the trailing
// arguments of the url:: calls). The comments below describe only what is
// shown.
309 void CanonicalizeUrl(const GURL
& url
,
310 std::string
* canonicalized_hostname
,
311 std::string
* canonicalized_path
,
312 std::string
* canonicalized_query
) {
313 DCHECK(url
.is_valid());
315 // We only canonicalize "normal" URLs.
316 if (!url
.IsStandard())
319 // Following canonicalization steps are excluded since url parsing takes care
// of them:
321 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
322 // (Exclude escaped version of these chars).
323 // 2. Normalize hostname to 4 dot-separated decimal values.
324 // 3. Lowercase hostname.
325 // 4. Resolve path sequences "/../" and "/./".
327 // That leaves us with the following :-
328 // 1. Remove fragment in URL.
// Besides the fragment (ref), the username and password are cleared too.
329 GURL url_without_fragment
;
330 GURL::Replacements f_replacements
;
331 f_replacements
.ClearRef();
332 f_replacements
.ClearUsername();
333 f_replacements
.ClearPassword();
334 url_without_fragment
= url
.ReplaceComponents(f_replacements
);
336 // 2. Do URL unescaping until no more hex encoded characters exist.
337 std::string
url_unescaped_str(Unescape(url_without_fragment
.spec()));
// Re-parse the unescaped string to locate its host and path components.
339 url::ParseStandardURL(url_unescaped_str
.data(), url_unescaped_str
.length(),
342 // 3. In hostname, remove all leading and trailing dots.
343 const std::string host
=
344 (parsed
.host
.len
> 0)
345 ? url_unescaped_str
.substr(parsed
.host
.begin
, parsed
.host
.len
)
347 std::string host_without_end_dots
;
348 base::TrimString(host
, ".", &host_without_end_dots
);
350 // 4. In hostname, replace consecutive dots with a single dot.
351 std::string
host_without_consecutive_dots(RemoveConsecutiveChars(
352 host_without_end_dots
, '.'));
354 // 5. In path, replace runs of consecutive slashes with a single slash.
356 (parsed
.path
.len
> 0)
357 ? url_unescaped_str
.substr(parsed
.path
.begin
, parsed
.path
.len
)
359 std::string
path_without_consecutive_slash(RemoveConsecutiveChars(path
, '/'));
// Splice the cleaned host and path back into the unescaped URL.
361 url::Replacements
<char> hp_replacements
;
362 hp_replacements
.SetHost(
363 host_without_consecutive_dots
.data(),
364 url::Component(0, host_without_consecutive_dots
.length()));
365 hp_replacements
.SetPath(
366 path_without_consecutive_slash
.data(),
367 url::Component(0, path_without_consecutive_slash
.length()));
369 std::string url_unescaped_with_can_hostpath
;
370 url::StdStringCanonOutput
output(&url_unescaped_with_can_hostpath
);
371 url::Parsed temp_parsed
;
372 url::ReplaceComponents(url_unescaped_str
.data(),
373 url_unescaped_str
.length(),
381 // 6. Step needed to revert escaping done in url::ReplaceComponents.
382 url_unescaped_with_can_hostpath
= Unescape(url_unescaped_with_can_hostpath
);
384 // 7. After performing all above steps, percent-escape all chars in url which
385 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
386 std::string
escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath
));
// Parse the final escaped URL and hand back whichever components were asked
// for.
387 url::Parsed final_parsed
;
388 url::ParseStandardURL(escaped_canon_url_str
.data(),
389 escaped_canon_url_str
.length(),
392 if (canonicalized_hostname
&& final_parsed
.host
.len
> 0) {
393 *canonicalized_hostname
=
394 escaped_canon_url_str
.substr(final_parsed
.host
.begin
,
395 final_parsed
.host
.len
);
397 if (canonicalized_path
&& final_parsed
.path
.len
> 0) {
398 *canonicalized_path
= escaped_canon_url_str
.substr(final_parsed
.path
.begin
,
399 final_parsed
.path
.len
);
401 if (canonicalized_query
&& final_parsed
.query
.len
> 0) {
402 *canonicalized_query
= escaped_canon_url_str
.substr(
403 final_parsed
.query
.begin
, final_parsed
.query
.len
);
// Appends to |hosts| the host-name candidates to look up for |url|: the
// canonicalized host itself plus shorter suffixes of it, as described in the
// comment inside the body.
//
// The host is obtained from CanonicalizeUrl(). The loop walks the host from
// its last character backwards; on the iterations where a suffix is emitted it
// pushes the substring from the current position to the end of the host. The
// loop stops once |hosts| holds kMaxHostsToCheck entries, and the full host is
// then pushed.
//
// NOTE(review): the condition guarding the two branches inside the loop
// (presumably a test for '.'), plus any clearing of |hosts| and any early
// returns (e.g. for an empty host or an IP address), are not visible in this
// view -- confirm.
407 void GenerateHostsToCheck(const GURL
& url
, std::vector
<std::string
>* hosts
) {
410 std::string canon_host
;
411 CanonicalizeUrl(url
, &canon_host
, NULL
, NULL
);
413 const std::string host
= canon_host
; // const sidesteps GCC bugs below!
417 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
418 // hostnames formed by starting with the last 5 components and successively
419 // removing the leading component. The last component isn't examined alone,
420 // since it's the TLD or a subcomponent thereof.
422 // Note that we don't need to be clever about stopping at the "real" eTLD --
423 // the data on the server side has been filtered to ensure it will not
424 // blacklist a whole TLD, and it's not significantly slower on our side to
425 // just check too much.
427 // Also note that because we have a simple blacklist, not some sort of complex
428 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
// these in.
430 const size_t kMaxHostsToCheck
= 4;
// Set once the right-most component has been passed over, so it is never
// emitted on its own.
431 bool skipped_last_component
= false;
432 for (std::string::const_reverse_iterator
i(host
.rbegin());
433 i
!= host
.rend() && hosts
->size() < kMaxHostsToCheck
; ++i
) {
// i.base() is the forward iterator just after the element |i| refers to, so
// the pushed string runs from the character following *i to the end.
435 if (skipped_last_component
)
436 hosts
->push_back(std::string(i
.base(), host
.end()));
438 skipped_last_component
= true;
// The complete host is always checked as well.
441 hosts
->push_back(host
);
// Appends to |paths| the path candidates to look up for |url|: prefixes of the
// canonicalized path, the full path, and the path joined with the query
// string, as described in the comment inside the body.
//
// The path and query come from CanonicalizeUrl(). The loop walks the path left
// to right; on the iterations where a prefix is emitted it pushes the
// substring from the start of the path up to and including the current
// character. The loop stops once |paths| holds kMaxPathsToCheck entries. The
// full path is then pushed if it is not already the last entry, and
// path + "?" + query is pushed as well.
//
// NOTE(review): the condition guarding the push inside the loop (presumably a
// test for '/'), the condition guarding the "?query" push (presumably a
// non-empty query), plus any clearing of |paths| and any early return, are not
// visible in this view -- confirm.
444 void GeneratePathsToCheck(const GURL
& url
, std::vector
<std::string
>* paths
) {
447 std::string canon_path
;
448 std::string canon_query
;
449 CanonicalizeUrl(url
, NULL
, &canon_path
, &canon_query
);
451 const std::string path
= canon_path
; // const sidesteps GCC bugs below!
452 const std::string query
= canon_query
;
456 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
457 // the query parameters, and also up to 4 paths formed by starting at the root
458 // and adding more path components.
460 // As with the hosts above, it doesn't matter what order we check these in.
461 const size_t kMaxPathsToCheck
= 4;
462 for (std::string::const_iterator
i(path
.begin());
463 i
!= path
.end() && paths
->size() < kMaxPathsToCheck
; ++i
) {
// Prefix up to and including the current character.
465 paths
->push_back(std::string(path
.begin(), i
+ 1));
// Add the exact path unless the loop already produced it as its last entry.
468 if (!paths
->empty() && paths
->back() != path
)
469 paths
->push_back(path
);
// Exact path together with its query string.
472 paths
->push_back(path
+ "?" + query
);
// Appends to |urls| every host+path combination for |url|: each host from
// GenerateHostsToCheck() concatenated with each path from
// GeneratePathsToCheck(). Entries are ordered host-major (all paths for the
// first host, then all paths for the second, and so on). |urls| is appended
// to, not cleared, by the visible code.
475 void GeneratePatternsToCheck(const GURL
& url
, std::vector
<std::string
>* urls
) {
476 std::vector
<std::string
> hosts
, paths
;
477 GenerateHostsToCheck(url
, &hosts
);
478 GeneratePathsToCheck(url
, &paths
);
479 for (size_t h
= 0; h
< hosts
.size(); ++h
) {
480 for (size_t p
= 0; p
< paths
.size(); ++p
) {
481 urls
->push_back(hosts
[h
] + paths
[p
]);
// Builds the URL used to report |url_to_report| via the page |report_page|.
//
// Steps visible in this view:
//   - |url_to_report| is escaped with net::EscapeQueryParamValue;
//   - a client name is chosen, and "_csd" is appended to it when
//     |is_client_side_detection| is true;
//   - the query string is produced by formatting kReportParams
//     ("?tpl=%s&url=%s") and is appended to |report_page|;
//   - the result is passed through google_util::AppendGoogleLocaleParam with
//     the application locale from |g_browser_process|.
//
// NOTE(review): the two definitions of |client_name| are presumably selected
// by a platform #if (BrowserDistribution is only included conditionally in the
// original file) whose directives are not visible in this view. The second
// argument of EscapeQueryParamValue and the first StringPrintf argument are
// not visible either; presumably the latter is client_name.c_str() -- confirm.
486 GURL
GeneratePhishingReportUrl(const std::string
& report_page
,
487 const std::string
& url_to_report
,
488 bool is_client_side_detection
) {
489 const std::string current_esc
= net::EscapeQueryParamValue(url_to_report
,
493 BrowserDistribution
* dist
= BrowserDistribution::GetDistribution();
494 std::string
client_name(dist
->GetSafeBrowsingName());
496 std::string
client_name("googlechrome");
498 if (is_client_side_detection
)
499 client_name
.append("_csd");
501 GURL
report_url(report_page
+ base::StringPrintf(kReportParams
,
503 current_esc
.c_str()));
504 return google_util::AppendGoogleLocaleParam(
505 report_url
, g_browser_process
->GetApplicationLocale());
// Converts the raw hash bytes in |hash_in| into an SBFullHash by copying
// crypto::kSHA256Length bytes into |hash_out.full_hash|.
// |hash_in| must be exactly crypto::kSHA256Length bytes long. This is only
// DCHECKed, so a shorter string would be over-read in builds where DCHECKs
// are compiled out.
// NOTE(review): the declaration of |hash_out| and the return statement are
// not visible in this view; presumably it is a local that is returned by
// value -- confirm.
508 SBFullHash
StringToSBFullHash(const std::string
& hash_in
) {
509 DCHECK_EQ(crypto::kSHA256Length
, hash_in
.size());
511 memcpy(hash_out
.full_hash
, hash_in
.data(), crypto::kSHA256Length
);
// Returns the bytes of |hash.full_hash| as a std::string of length
// sizeof(hash.full_hash). The pointer+length constructor is used, so embedded
// NUL bytes are preserved. DCHECKs that the array size equals
// crypto::kSHA256Length.
515 std::string
SBFullHashToString(const SBFullHash
& hash
) {
516 DCHECK_EQ(crypto::kSHA256Length
, sizeof(hash
.full_hash
));
517 return std::string(hash
.full_hash
, sizeof(hash
.full_hash
));
520 } // namespace safe_browsing_util