1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "base/strings/stringprintf.h"
10 #include "chrome/browser/google/google_util.h"
11 #include "crypto/sha2.h"
12 #include "net/base/escape.h"
14 #include "url/url_util.h"
17 #include "chrome/installer/util/browser_distribution.h"
// printf-style query-string template appended to the phishing report page
// URL by GeneratePhishingReportUrl(): the first %s fills the "tpl" parameter
// and the second %s fills the "url" parameter.
static const char kReportParams[] = "?tpl=%s&url=%s";
22 // SBChunk ---------------------------------------------------------------------
30 SBChunk::~SBChunk() {}
32 // SBChunkList -----------------------------------------------------------------
34 SBChunkList::SBChunkList() {}
// Destructor for SBChunkList.
// NOTE(review): only the opening line survives in this view (the embedded
// numbering jumps from 36 to 40), so the body and closing brace are not
// visible here. Nothing is asserted about what it releases — confirm against
// the full file.
36 SBChunkList::~SBChunkList() {
// Walks every SBChunk in chunks_ and, within each chunk, every SBChunkHost in
// its hosts deque, calling Destroy() on that host's entry.
// NOTE(review): the embedded numbering skips 45 and 47-53, so any guard around
// the Destroy() call, any statements that follow it, and the closing braces
// are not visible in this view — confirm against the full file.
40 void SBChunkList::clear() {
41 for (std::vector
<SBChunk
>::iterator citer
= chunks_
.begin();
42 citer
!= chunks_
.end(); ++citer
) {
43 for (std::deque
<SBChunkHost
>::iterator hiter
= citer
->hosts
.begin();
44 hiter
!= citer
->hosts
.end(); ++hiter
) {
46 hiter
->entry
->Destroy();
54 // SBListChunkRanges -----------------------------------------------------------
56 SBListChunkRanges::SBListChunkRanges(const std::string
& n
) : name(n
) {}
58 // SBChunkDelete ---------------------------------------------------------------
60 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {}
62 SBChunkDelete::~SBChunkDelete() {}
64 // SBEntry ---------------------------------------------------------------------
// Allocates a buffer of Size(type, prefix_count) bytes with malloc(), treats
// it as an SBEntry and records |prefix_count| on it.
// NOTE(review): the embedded numbering skips 70-71 and 73-75, so any
// initialisation of the buffer, the recording of |type|, the return statement
// and the closing brace are not visible in this view. The visible lines do not
// check the malloc() result for NULL — confirm whether that is handled.
67 SBEntry
* SBEntry::Create(Type type
, int prefix_count
) {
68 int size
= Size(type
, prefix_count
);
69 SBEntry
*rv
= static_cast<SBEntry
*>(malloc(size
));
72 rv
->set_prefix_count(prefix_count
);
// Destroys this entry.
// NOTE(review): the body is not visible in this view (embedded numbering jumps
// from 76 to 81). Presumably it releases the buffer obtained with malloc() in
// Create() — confirm against the full file.
76 void SBEntry::Destroy() {
// Returns the size in bytes of one prefix record for entries of |type|. The
// visible return values are sizeof(SBPrefix), sizeof(SBFullHash),
// sizeof(SBSubPrefix) and sizeof(SBSubFullHash).
// NOTE(review): the dispatch that selects among these returns (embedded lines
// 82-83, 85, 87, 89 and 91-95) is not visible in this view, so which Type
// value maps to which size, and what happens for other values, must be
// confirmed against the full file.
81 int SBEntry::PrefixSize(Type type
) {
84 return sizeof(SBPrefix
);
86 return sizeof(SBFullHash
);
88 return sizeof(SBSubPrefix
);
90 return sizeof(SBSubFullHash
);
97 int SBEntry::Size() const {
98 return Size(type(), prefix_count());
102 int SBEntry::Size(Type type
, int prefix_count
) {
103 return sizeof(Data
) + prefix_count
* PrefixSize(type
);
106 int SBEntry::ChunkIdAtPrefix(int index
) const {
107 if (type() == SUB_PREFIX
)
108 return sub_prefixes_
[index
].add_chunk
;
109 return (type() == SUB_FULL_HASH
) ?
110 sub_full_hashes_
[index
].add_chunk
: chunk_id();
// Stores |chunk_id| as the add_chunk of the prefix at |index|, writing to
// sub_prefixes_ when type() is SUB_PREFIX; a write to sub_full_hashes_ is also
// visible.
// NOTE(review): the embedded numbering skips 114-115, 118 and 120, so any
// precondition check, the branch that selects the sub_full_hashes_ write, and
// the closing brace are not visible in this view — confirm.
113 void SBEntry::SetChunkIdAtPrefix(int index
, int chunk_id
) {
116 if (type() == SUB_PREFIX
)
117 sub_prefixes_
[index
].add_chunk
= chunk_id
;
119 sub_full_hashes_
[index
].add_chunk
= chunk_id
;
// Returns a reference to the SBPrefix stored at |index|: add_prefixes_[index]
// when IsAdd() is true, otherwise the prefix member of sub_prefixes_[index].
// NOTE(review): embedded lines 123-124 and the closing brace are not visible
// in this view; any precondition check that lived there must be confirmed.
122 const SBPrefix
& SBEntry::PrefixAt(int index
) const {
125 return IsAdd() ? add_prefixes_
[index
] : sub_prefixes_
[index
].prefix
;
// Returns a reference to the SBFullHash stored at |index|:
// add_full_hashes_[index] when IsAdd() is true, otherwise the prefix member of
// sub_full_hashes_[index].
// NOTE(review): embedded lines 129-130 and the closing brace are not visible
// in this view; any precondition check that lived there must be confirmed.
128 const SBFullHash
& SBEntry::FullHashAt(int index
) const {
131 return IsAdd() ? add_full_hashes_
[index
] : sub_full_hashes_
[index
].prefix
;
// Stores |prefix| at |index|. Two writes are visible: one to
// add_prefixes_[index] and one to sub_prefixes_[index].prefix.
// NOTE(review): embedded lines 135-137, 139 and 141 are not visible in this
// view, so the condition that chooses between the two writes is missing here.
// Presumably it mirrors the IsAdd() selection used by PrefixAt() — confirm.
134 void SBEntry::SetPrefixAt(int index
, const SBPrefix
& prefix
) {
138 add_prefixes_
[index
] = prefix
;
140 sub_prefixes_
[index
].prefix
= prefix
;
// Stores |full_hash| at |index|. Two writes are visible: one to
// add_full_hashes_[index] and one to sub_full_hashes_[index].prefix.
// NOTE(review): embedded lines 144-146, 148 and 150-151 are not visible in
// this view, so the condition that chooses between the two writes is missing
// here. Presumably it mirrors the IsAdd() selection used by FullHashAt() —
// confirm.
143 void SBEntry::SetFullHashAt(int index
, const SBFullHash
& full_hash
) {
147 add_full_hashes_
[index
] = full_hash
;
149 sub_full_hashes_
[index
].prefix
= full_hash
;
153 // Utility functions -----------------------------------------------------------
// Scans safe_browsing_util::kAllLists and compares each entry to |name| with
// strcmp(), i.e. an exact, case-sensitive match.
// NOTE(review): embedded lines 159-164 are not visible in this view, so the
// return statements and closing braces are missing here. Presumably it returns
// true on the first match and false otherwise — confirm.
156 bool IsKnownList(const std::string
& name
) {
157 for (size_t i
= 0; i
< arraysize(safe_browsing_util::kAllLists
); ++i
) {
158 if (!strcmp(safe_browsing_util::kAllLists
[i
], name
.c_str())) {
166 namespace safe_browsing_util
{
// Listnames that browser can process.
const char kMalwareList[] = "goog-malware-shavar";
const char kPhishingList[] = "goog-phish-shavar";
const char kBinUrlList[] = "goog-badbinurl-shavar";
// We don't use the bad binary digest list anymore.  Use a fake listname to be
// sure we don't request it accidentally.
const char kBinHashList[] = "goog-badbin-digestvar-disabled";
const char kCsdWhiteList[] = "goog-csdwhite-sha256";
const char kDownloadWhiteList[] = "goog-downloadwhite-digest256";
const char kExtensionBlacklist[] = "goog-badcrxids-digestvar";
const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar";
const char kIPBlacklist[] = "goog-badip-digest256";
// Table of list-name pointers, declared with 10 slots; IsKnownList() scans it.
// NOTE(review): only one initializer (kSideEffectFreeWhitelist) survives in
// this view; embedded lines 182-189 and 191-192 are missing, so the remaining
// entries, their order and the closing brace must be confirmed against the
// full file.
181 const char* kAllLists
[10] = {
190 kSideEffectFreeWhitelist
,
// Maps a list name to a ListType by comparing |name| against each list-name
// constant in turn (malware, phishing, bin-url, bin-hash, CSD whitelist,
// download whitelist, extension blacklist, side-effect-free whitelist, IP
// blacklist).
// NOTE(review): most of the body is missing from this view. The declaration of
// |id|, the assignments for six of the nine branches, the fall-through case
// for unknown names, the return statement and the closing brace are not
// visible. Only the DOWNLOADWHITELIST, EXTENSIONBLACKLIST and
// SIDEEFFECTFREEWHITELIST assignments survive.
194 ListType
GetListId(const std::string
& name
) {
196 if (name
== safe_browsing_util::kMalwareList
) {
198 } else if (name
== safe_browsing_util::kPhishingList
) {
200 } else if (name
== safe_browsing_util::kBinUrlList
) {
202 } else if (name
== safe_browsing_util::kBinHashList
) {
204 } else if (name
== safe_browsing_util::kCsdWhiteList
) {
206 } else if (name
== safe_browsing_util::kDownloadWhiteList
) {
207 id
= DOWNLOADWHITELIST
;
208 } else if (name
== safe_browsing_util::kExtensionBlacklist
) {
209 id
= EXTENSIONBLACKLIST
;
210 } else if (name
== safe_browsing_util::kSideEffectFreeWhitelist
) {
211 id
= SIDEEFFECTFREEWHITELIST
;
212 } else if (name
== safe_browsing_util::kIPBlacklist
) {
// Writes the list name that corresponds to |list_id| into |*list|, then
// DCHECKs that the written name is accepted by IsKnownList().
// NOTE(review): the switch header, six of the case labels, the break/return
// statements, the handling of unknown ids and the closing brace are not
// visible in this view. The meaning of the bool result therefore cannot be
// stated from here — confirm against the full file.
220 bool GetListName(ListType list_id
, std::string
* list
) {
223 *list
= safe_browsing_util::kMalwareList
;
226 *list
= safe_browsing_util::kPhishingList
;
229 *list
= safe_browsing_util::kBinUrlList
;
232 *list
= safe_browsing_util::kBinHashList
;
235 *list
= safe_browsing_util::kCsdWhiteList
;
237 case DOWNLOADWHITELIST
:
238 *list
= safe_browsing_util::kDownloadWhiteList
;
240 case EXTENSIONBLACKLIST
:
241 *list
= safe_browsing_util::kExtensionBlacklist
;
243 case SIDEEFFECTFREEWHITELIST
:
244 *list
= safe_browsing_util::kSideEffectFreeWhitelist
;
247 *list
= safe_browsing_util::kIPBlacklist
;
252 DCHECK(IsKnownList(*list
));
// Repeatedly applies net::UnescapeURLComponent() (with the CONTROL_CHARS,
// SPACES and URL_SPECIAL_CHARS rules) to |url| until a pass no longer changes
// the string, and returns the result. The loop condition also bounds the
// number of passes using a counter; kMaxLoopIterations (1024) is declared for
// that purpose.
// NOTE(review): embedded lines 260-261 and 267-268 are not visible in this
// view, so the declaration of loop_var, the `do {` opener and the right-hand
// side of the `<=` comparison are missing here — confirm against the full
// file.
256 std::string
Unescape(const std::string
& url
) {
257 std::string
unescaped_str(url
);
258 std::string old_unescaped_str
;
259 const int kMaxLoopIterations
= 1024;
262 old_unescaped_str
= unescaped_str
;
263 unescaped_str
= net::UnescapeURLComponent(old_unescaped_str
,
264 net::UnescapeRule::CONTROL_CHARS
| net::UnescapeRule::SPACES
|
265 net::UnescapeRule::URL_SPECIAL_CHARS
);
266 } while (unescaped_str
!= old_unescaped_str
&& ++loop_var
<=
269 return unescaped_str
;
// Percent-escapes |url| for Safe Browsing canonicalization. Every byte that is
// <= ' ' (0x20), > '~' (0x7E), '#' or '%' is emitted as "%XX" using uppercase
// hex digits; all other bytes are copied through unchanged.
std::string Escape(const std::string& url) {
  static const char kHexString[] = "0123456789ABCDEF";
  std::string escaped_str;
  // Every input byte produces at least one output byte.
  escaped_str.reserve(url.length());
  for (size_t i = 0; i < url.length(); ++i) {
    // Compare and index on the unsigned value so that bytes >= 0x80 are
    // treated as "> '~'" and select the right hex digits.
    const unsigned char c = static_cast<unsigned char>(url[i]);
    if (c <= ' ' || c > '~' || c == '#' || c == '%') {
      escaped_str.push_back('%');
      escaped_str.push_back(kHexString[c >> 4]);
      escaped_str.push_back(kHexString[c & 0xf]);
    } else {
      escaped_str.push_back(url[i]);
    }
  }
  return escaped_str;
}
// Returns a copy of |str| in which every run of two or more consecutive |c|
// characters is collapsed to a single |c|. Other characters are untouched.
//
// Implemented as a single left-to-right pass: a |c| is dropped when the last
// character already written is also |c|. This yields the same result as
// repeatedly finding "cc" and erasing one character, without re-shifting the
// tail of the string on every erase.
std::string RemoveConsecutiveChars(const std::string& str, const char c) {
  std::string output;
  output.reserve(str.length());
  for (size_t i = 0; i < str.length(); ++i) {
    const char current = str[i];
    if (current == c && !output.empty() &&
        output[output.length() - 1] == c) {
      continue;  // Still inside a run of |c|; keep only its first character.
    }
    output.push_back(current);
  }
  return output;
}
// Each out-parameter may be NULL (callers in this file pass NULL for the parts
// they do not need); an output is only written when the corresponding parsed
// component of the final URL has len > 0.
// NOTE(review): embedded lines 312, 341, 351, 354, 385, 389 and 393-394 are
// not visible in this view. That covers the statement under the IsStandard()
// check, the "else" arms of the two conditional expressions, the declaration
// of |path| and the closing braces — confirm them against the full file.
301 // Canonicalizes url as per Google Safe Browsing Specification.
302 // See section 6.1 in
303 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
304 void CanonicalizeUrl(const GURL
& url
,
305 std::string
* canonicalized_hostname
,
306 std::string
* canonicalized_path
,
307 std::string
* canonicalized_query
) {
308 DCHECK(url
.is_valid());
310 // We only canonicalize "normal" URLs.
311 if (!url
.IsStandard())
314 // Following canonicalization steps are excluded since url parsing takes care
316 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
317 // (Exclude escaped version of these chars).
318 // 2. Normalize hostname to 4 dot-separated decimal values.
319 // 3. Lowercase hostname.
320 // 4. Resolve path sequences "/../" and "/./".
322 // That leaves us with the following :-
323 // 1. Remove fragment in URL.
324 GURL url_without_fragment
;
325 GURL::Replacements f_replacements
;
326 f_replacements
.ClearRef();
327 f_replacements
.ClearUsername();
328 f_replacements
.ClearPassword();
329 url_without_fragment
= url
.ReplaceComponents(f_replacements
);
331 // 2. Do URL unescaping until no more hex encoded characters exist.
332 std::string
url_unescaped_str(Unescape(url_without_fragment
.spec()));
333 url_parse::Parsed parsed
;
334 url_parse::ParseStandardURL(url_unescaped_str
.data(),
335 url_unescaped_str
.length(), &parsed
);
337 // 3. In hostname, remove all leading and trailing dots.
338 const std::string host
=
339 (parsed
.host
.len
> 0)
340 ? url_unescaped_str
.substr(parsed
.host
.begin
, parsed
.host
.len
)
342 const char kCharsToTrim
[] = ".";
343 std::string host_without_end_dots
;
344 base::TrimString(host
, kCharsToTrim
, &host_without_end_dots
);
346 // 4. In hostname, replace consecutive dots with a single dot.
347 std::string
host_without_consecutive_dots(RemoveConsecutiveChars(
348 host_without_end_dots
, '.'));
350 // 5. In path, replace runs of consecutive slashes with a single slash.
352 (parsed
.path
.len
> 0)
353 ? url_unescaped_str
.substr(parsed
.path
.begin
, parsed
.path
.len
)
355 std::string
path_without_consecutive_slash(RemoveConsecutiveChars(path
, '/'));
357 url_canon::Replacements
<char> hp_replacements
;
358 hp_replacements
.SetHost(host_without_consecutive_dots
.data(),
359 url_parse::Component(0, host_without_consecutive_dots
.length()));
360 hp_replacements
.SetPath(path_without_consecutive_slash
.data(),
361 url_parse::Component(0, path_without_consecutive_slash
.length()));
363 std::string url_unescaped_with_can_hostpath
;
364 url_canon::StdStringCanonOutput
output(&url_unescaped_with_can_hostpath
);
365 url_parse::Parsed temp_parsed
;
366 url_util::ReplaceComponents(url_unescaped_str
.data(),
367 url_unescaped_str
.length(), parsed
,
368 hp_replacements
, NULL
, &output
, &temp_parsed
);
371 // 6. Step needed to revert escaping done in url_util::ReplaceComponents.
372 url_unescaped_with_can_hostpath
= Unescape(url_unescaped_with_can_hostpath
);
374 // 7. After performing all above steps, percent-escape all chars in url which
375 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
376 std::string
escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath
));
377 url_parse::Parsed final_parsed
;
378 url_parse::ParseStandardURL(escaped_canon_url_str
.data(),
379 escaped_canon_url_str
.length(), &final_parsed
);
381 if (canonicalized_hostname
&& final_parsed
.host
.len
> 0) {
382 *canonicalized_hostname
=
383 escaped_canon_url_str
.substr(final_parsed
.host
.begin
,
384 final_parsed
.host
.len
);
386 if (canonicalized_path
&& final_parsed
.path
.len
> 0) {
387 *canonicalized_path
= escaped_canon_url_str
.substr(final_parsed
.path
.begin
,
388 final_parsed
.path
.len
);
390 if (canonicalized_query
&& final_parsed
.query
.len
> 0) {
391 *canonicalized_query
= escaped_canon_url_str
.substr(
392 final_parsed
.query
.begin
, final_parsed
.query
.len
);
// Fills |hosts| with the host-name candidates to look up for |url|, derived
// from the canonical hostname produced by CanonicalizeUrl(). The visible loop
// walks the hostname from its end and pushes the suffix
// std::string(i.base(), host.end()) once skipped_last_component is set. It
// stops when |hosts| holds kMaxHostsToCheck (4) entries, after which the full
// hostname is appended.
// NOTE(review): embedded lines 397-398, 403-405, 423, 426 and 428-429 are not
// visible in this view. Any initial reset of |hosts|, any early return, the
// per-character test inside the loop and the else-branch structure must be
// confirmed against the full file.
396 void GenerateHostsToCheck(const GURL
& url
, std::vector
<std::string
>* hosts
) {
399 std::string canon_host
;
400 CanonicalizeUrl(url
, &canon_host
, NULL
, NULL
);
402 const std::string host
= canon_host
; // const sidesteps GCC bugs below!
406 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
407 // hostnames formed by starting with the last 5 components and successively
408 // removing the leading component. The last component isn't examined alone,
409 // since it's the TLD or a subcomponent thereof.
411 // Note that we don't need to be clever about stopping at the "real" eTLD --
412 // the data on the server side has been filtered to ensure it will not
413 // blacklist a whole TLD, and it's not significantly slower on our side to
414 // just check too much.
416 // Also note that because we have a simple blacklist, not some sort of complex
417 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
419 const size_t kMaxHostsToCheck
= 4;
420 bool skipped_last_component
= false;
421 for (std::string::const_reverse_iterator
i(host
.rbegin());
422 i
!= host
.rend() && hosts
->size() < kMaxHostsToCheck
; ++i
) {
424 if (skipped_last_component
)
425 hosts
->push_back(std::string(i
.base(), host
.end()));
427 skipped_last_component
= true;
430 hosts
->push_back(host
);
// Fills |paths| with the path candidates to look up for |url|, derived from
// the canonical path and query produced by CanonicalizeUrl(). The visible loop
// pushes prefixes std::string(path.begin(), i + 1) until |paths| holds
// kMaxPathsToCheck (4) entries. The full path is then appended if it is not
// already the last entry, and "path?query" is appended as well.
// NOTE(review): embedded lines 434-435, 442-444, 453, 455-456 and 459-460 are
// not visible in this view. Any initial reset of |paths|, any early return,
// the per-character test that decides which prefixes are pushed, and the
// condition guarding the "?query" variant must be confirmed against the full
// file.
433 void GeneratePathsToCheck(const GURL
& url
, std::vector
<std::string
>* paths
) {
436 std::string canon_path
;
437 std::string canon_query
;
438 CanonicalizeUrl(url
, NULL
, &canon_path
, &canon_query
);
440 const std::string path
= canon_path
; // const sidesteps GCC bugs below!
441 const std::string query
= canon_query
;
445 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
446 // the query parameters, and also up to 4 paths formed by starting at the root
447 // and adding more path components.
449 // As with the hosts above, it doesn't matter what order we check these in.
450 const size_t kMaxPathsToCheck
= 4;
451 for (std::string::const_iterator
i(path
.begin());
452 i
!= path
.end() && paths
->size() < kMaxPathsToCheck
; ++i
) {
454 paths
->push_back(std::string(path
.begin(), i
+ 1));
457 if (!paths
->empty() && paths
->back() != path
)
458 paths
->push_back(path
);
461 paths
->push_back(path
+ "?" + query
);
464 void GeneratePatternsToCheck(const GURL
& url
, std::vector
<std::string
>* urls
) {
465 std::vector
<std::string
> hosts
, paths
;
466 GenerateHostsToCheck(url
, &hosts
);
467 GeneratePathsToCheck(url
, &paths
);
468 for (size_t h
= 0; h
< hosts
.size(); ++h
) {
469 for (size_t p
= 0; p
< paths
.size(); ++p
) {
470 urls
->push_back(hosts
[h
] + paths
[p
]);
// Linear search of |full_hashes| for an element whose hash member equals
// |hash|; returns the index of the first match as an int.
// NOTE(review): embedded lines 480-482 are not visible in this view, so the
// value returned when nothing matches, and the closing braces, must be
// confirmed against the full file.
475 int GetHashIndex(const SBFullHash
& hash
,
476 const std::vector
<SBFullHashResult
>& full_hashes
) {
477 for (size_t i
= 0; i
< full_hashes
.size(); ++i
) {
478 if (hash
== full_hashes
[i
].hash
)
479 return static_cast<int>(i
);
// For each lookup pattern generated from |url| by GeneratePatternsToCheck(),
// computes its SHA-256 into key.full_hash and asks GetHashIndex() whether that
// hash appears in |full_hashes|.
// NOTE(review): embedded lines 487-488, 493 and 496-500 are not visible in
// this view. The early-return value for an empty |full_hashes|, the
// declaration of |key|, how |index| is used, the not-found result and the
// closing braces must be confirmed against the full file.
484 int GetUrlHashIndex(const GURL
& url
,
485 const std::vector
<SBFullHashResult
>& full_hashes
) {
486 if (full_hashes
.empty())
489 std::vector
<std::string
> patterns
;
490 GeneratePatternsToCheck(url
, &patterns
);
492 for (size_t i
= 0; i
< patterns
.size(); ++i
) {
494 crypto::SHA256HashString(patterns
[i
], key
.full_hash
, sizeof(SBFullHash
));
495 int index
= GetHashIndex(key
, full_hashes
);
502 bool IsPhishingList(const std::string
& list_name
) {
503 return list_name
.compare(kPhishingList
) == 0;
506 bool IsMalwareList(const std::string
& list_name
) {
507 return list_name
.compare(kMalwareList
) == 0;
510 bool IsBadbinurlList(const std::string
& list_name
) {
511 return list_name
.compare(kBinUrlList
) == 0;
514 bool IsBadbinhashList(const std::string
& list_name
) {
515 return list_name
.compare(kBinHashList
) == 0;
518 bool IsExtensionList(const std::string
& list_name
) {
519 return list_name
.compare(kExtensionBlacklist
) == 0;
// Builds the phishing-report URL: |report_page| followed by the kReportParams
// query string, whose last argument is |url_to_report| escaped with
// net::EscapeQueryParamValue(). The client name gets a "_csd" suffix when
// |is_client_side_detection| is true. The result is passed through
// google_util::AppendGoogleLocaleParam() before being returned.
// NOTE(review): embedded lines 526-528, 531, 533, 536, 538 and 541 are not
// visible in this view. Two definitions of client_name survive (one from
// BrowserDistribution, one hard-coded "googlechrome"), which suggests
// preprocessor conditionals were dropped — confirm. The remaining arguments to
// EscapeQueryParamValue() and StringPrintf() are also missing here.
522 GURL
GeneratePhishingReportUrl(const std::string
& report_page
,
523 const std::string
& url_to_report
,
524 bool is_client_side_detection
) {
525 const std::string current_esc
= net::EscapeQueryParamValue(url_to_report
,
529 BrowserDistribution
* dist
= BrowserDistribution::GetDistribution();
530 std::string
client_name(dist
->GetSafeBrowsingName());
532 std::string
client_name("googlechrome");
534 if (is_client_side_detection
)
535 client_name
.append("_csd");
537 GURL
report_url(report_page
+ base::StringPrintf(kReportParams
,
539 current_esc
.c_str()));
540 return google_util::AppendGoogleLocaleParam(report_url
);
// Copies the first crypto::kSHA256Length bytes of |hash_in| into
// hash_out.full_hash. The DCHECK_EQ requires |hash_in| to be exactly
// crypto::kSHA256Length bytes long; that check is only active where DCHECKs
// are enabled.
// NOTE(review): embedded lines 545 and 547-548 are not visible in this view,
// so the declaration of hash_out, the return statement and the closing brace
// must be confirmed against the full file.
543 SBFullHash
StringToSBFullHash(const std::string
& hash_in
) {
544 DCHECK_EQ(crypto::kSHA256Length
, hash_in
.size());
546 memcpy(hash_out
.full_hash
, hash_in
.data(), crypto::kSHA256Length
);
550 std::string
SBFullHashToString(const SBFullHash
& hash
) {
551 DCHECK_EQ(crypto::kSHA256Length
, sizeof(hash
.full_hash
));
552 return std::string(hash
.full_hash
, sizeof(hash
.full_hash
));
555 } // namespace safe_browsing_util