1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "base/strings/stringprintf.h"
10 #include "chrome/browser/google/google_util.h"
11 #include "crypto/sha2.h"
12 #include "net/base/escape.h"
14 #include "url/url_util.h"
17 #include "chrome/installer/util/browser_distribution.h"
// printf-style query suffix that GeneratePhishingReportUrl() formats with
// base::StringPrintf() and appends to the report page URL. It has two %s
// fields: "tpl" and "url".
20 static const char kReportParams
[] = "?tpl=%s&url=%s";
22 SBFullHash
SBFullHashForString(const base::StringPiece
& str
) {
24 crypto::SHA256HashString(str
, &h
.full_hash
, sizeof(h
.full_hash
));
28 // SBChunk ---------------------------------------------------------------------
// Out-of-line destructor for SBChunk; the body is intentionally empty.
36 SBChunk::~SBChunk() {}
38 // SBChunkList -----------------------------------------------------------------
// Out-of-line default constructor for SBChunkList; the body is empty.
40 SBChunkList::SBChunkList() {}
42 SBChunkList::~SBChunkList() {
46 void SBChunkList::clear() {
47 for (std::vector
<SBChunk
>::iterator citer
= chunks_
.begin();
48 citer
!= chunks_
.end(); ++citer
) {
49 for (std::deque
<SBChunkHost
>::iterator hiter
= citer
->hosts
.begin();
50 hiter
!= citer
->hosts
.end(); ++hiter
) {
52 hiter
->entry
->Destroy();
60 // SBListChunkRanges -----------------------------------------------------------
// Constructs an SBListChunkRanges whose |name| member is a copy of |n|.
62 SBListChunkRanges::SBListChunkRanges(const std::string
& n
) : name(n
) {}
64 // SBChunkDelete ---------------------------------------------------------------
// Default constructor; initializes |is_sub_del| to false.
66 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {}
// Out-of-line destructor for SBChunkDelete; the body is intentionally empty.
68 SBChunkDelete::~SBChunkDelete() {}
70 // SBEntry ---------------------------------------------------------------------
73 SBEntry
* SBEntry::Create(Type type
, int prefix_count
) {
74 int size
= Size(type
, prefix_count
);
75 SBEntry
*rv
= static_cast<SBEntry
*>(malloc(size
));
78 rv
->set_prefix_count(prefix_count
);
82 void SBEntry::Destroy() {
87 int SBEntry::PrefixSize(Type type
) {
90 return sizeof(SBPrefix
);
92 return sizeof(SBFullHash
);
94 return sizeof(SBSubPrefix
);
96 return sizeof(SBSubFullHash
);
103 int SBEntry::Size() const {
104 return Size(type(), prefix_count());
108 int SBEntry::Size(Type type
, int prefix_count
) {
109 return sizeof(Data
) + prefix_count
* PrefixSize(type
);
112 int SBEntry::ChunkIdAtPrefix(int index
) const {
113 if (type() == SUB_PREFIX
)
114 return sub_prefixes_
[index
].add_chunk
;
115 return (type() == SUB_FULL_HASH
) ?
116 sub_full_hashes_
[index
].add_chunk
: chunk_id();
119 void SBEntry::SetChunkIdAtPrefix(int index
, int chunk_id
) {
122 if (type() == SUB_PREFIX
)
123 sub_prefixes_
[index
].add_chunk
= chunk_id
;
125 sub_full_hashes_
[index
].add_chunk
= chunk_id
;
128 const SBPrefix
& SBEntry::PrefixAt(int index
) const {
131 return IsAdd() ? add_prefixes_
[index
] : sub_prefixes_
[index
].prefix
;
134 const SBFullHash
& SBEntry::FullHashAt(int index
) const {
137 return IsAdd() ? add_full_hashes_
[index
] : sub_full_hashes_
[index
].prefix
;
140 void SBEntry::SetPrefixAt(int index
, const SBPrefix
& prefix
) {
144 add_prefixes_
[index
] = prefix
;
146 sub_prefixes_
[index
].prefix
= prefix
;
149 void SBEntry::SetFullHashAt(int index
, const SBFullHash
& full_hash
) {
153 add_full_hashes_
[index
] = full_hash
;
155 sub_full_hashes_
[index
].prefix
= full_hash
;
159 // Utility functions -----------------------------------------------------------
162 bool IsKnownList(const std::string
& name
) {
163 for (size_t i
= 0; i
< arraysize(safe_browsing_util::kAllLists
); ++i
) {
164 if (!strcmp(safe_browsing_util::kAllLists
[i
], name
.c_str())) {
172 namespace safe_browsing_util
{
174 // List names that the browser can process.
// GetListId() and GetListName() translate between these strings and ListType
// values; kAllLists is the array IsKnownList() searches.
175 const char kMalwareList
[] = "goog-malware-shavar";
176 const char kPhishingList
[] = "goog-phish-shavar";
177 const char kBinUrlList
[] = "goog-badbinurl-shavar";
178 const char kCsdWhiteList
[] = "goog-csdwhite-sha256";
179 const char kDownloadWhiteList
[] = "goog-downloadwhite-digest256";
180 const char kExtensionBlacklist
[] = "goog-badcrxids-digestvar";
181 const char kSideEffectFreeWhitelist
[] = "goog-sideeffectfree-shavar";
182 const char kIPBlacklist
[] = "goog-badip-digest256";
184 const char* kAllLists
[8] = {
191 kSideEffectFreeWhitelist
,
195 ListType
GetListId(const std::string
& name
) {
197 if (name
== safe_browsing_util::kMalwareList
) {
199 } else if (name
== safe_browsing_util::kPhishingList
) {
201 } else if (name
== safe_browsing_util::kBinUrlList
) {
203 } else if (name
== safe_browsing_util::kCsdWhiteList
) {
205 } else if (name
== safe_browsing_util::kDownloadWhiteList
) {
206 id
= DOWNLOADWHITELIST
;
207 } else if (name
== safe_browsing_util::kExtensionBlacklist
) {
208 id
= EXTENSIONBLACKLIST
;
209 } else if (name
== safe_browsing_util::kSideEffectFreeWhitelist
) {
210 id
= SIDEEFFECTFREEWHITELIST
;
211 } else if (name
== safe_browsing_util::kIPBlacklist
) {
219 bool GetListName(ListType list_id
, std::string
* list
) {
222 *list
= safe_browsing_util::kMalwareList
;
225 *list
= safe_browsing_util::kPhishingList
;
228 *list
= safe_browsing_util::kBinUrlList
;
231 *list
= safe_browsing_util::kCsdWhiteList
;
233 case DOWNLOADWHITELIST
:
234 *list
= safe_browsing_util::kDownloadWhiteList
;
236 case EXTENSIONBLACKLIST
:
237 *list
= safe_browsing_util::kExtensionBlacklist
;
239 case SIDEEFFECTFREEWHITELIST
:
240 *list
= safe_browsing_util::kSideEffectFreeWhitelist
;
243 *list
= safe_browsing_util::kIPBlacklist
;
248 DCHECK(IsKnownList(*list
));
252 std::string
Unescape(const std::string
& url
) {
253 std::string
unescaped_str(url
);
254 std::string old_unescaped_str
;
255 const int kMaxLoopIterations
= 1024;
258 old_unescaped_str
= unescaped_str
;
259 unescaped_str
= net::UnescapeURLComponent(old_unescaped_str
,
260 net::UnescapeRule::CONTROL_CHARS
| net::UnescapeRule::SPACES
|
261 net::UnescapeRule::URL_SPECIAL_CHARS
);
262 } while (unescaped_str
!= old_unescaped_str
&& ++loop_var
<=
265 return unescaped_str
;
// Returns a copy of |url| in which every byte that is <= ' ' (0x20), > '~'
// (0x7E), '#' or '%' is replaced by "%XX", where XX is the byte value in
// uppercase hexadecimal. All other bytes are copied through unchanged.
std::string Escape(const std::string& url) {
  static const char kHexString[] = "0123456789ABCDEF";

  std::string escaped_str;
  // The output is never shorter than the input, so reserving the input length
  // up front avoids reallocations when little or nothing is escaped.
  escaped_str.reserve(url.length());

  for (size_t i = 0; i < url.length(); ++i) {
    // Work on the unsigned value so bytes >= 0x80 compare as > '~' instead of
    // as negative numbers.
    const unsigned char c = static_cast<unsigned char>(url[i]);
    if (c <= ' ' || c > '~' || c == '#' || c == '%') {
      escaped_str.push_back('%');
      escaped_str.push_back(kHexString[c >> 4]);
      escaped_str.push_back(kHexString[c & 0xf]);
    } else {
      escaped_str.push_back(static_cast<char>(c));
    }
  }

  return escaped_str;
}
// Returns a copy of |str| in which every run of two or more consecutive
// occurrences of |c| is collapsed to a single |c|. Other characters are left
// untouched.
std::string RemoveConsecutiveChars(const std::string& str, const char c) {
  std::string output;
  output.reserve(str.length());

  // Single pass: a |c| is dropped only when the character already emitted
  // just before it is also |c|. This avoids re-scanning and shifting the
  // whole string once per removed character.
  for (size_t i = 0; i < str.length(); ++i) {
    const char current = str[i];
    if (current == c && !output.empty() &&
        output[output.length() - 1] == c) {
      continue;
    }
    output.push_back(current);
  }

  return output;
}
297 // Canonicalizes url as per Google Safe Browsing Specification.
298 // See section 6.1 in
299 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
300 void CanonicalizeUrl(const GURL
& url
,
301 std::string
* canonicalized_hostname
,
302 std::string
* canonicalized_path
,
303 std::string
* canonicalized_query
) {
304 DCHECK(url
.is_valid());
306 // We only canonicalize "normal" URLs.
307 if (!url
.IsStandard())
310 // Following canonicalization steps are excluded since url parsing takes care
312 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
313 // (Exclude escaped version of these chars).
314 // 2. Normalize hostname to 4 dot-separated decimal values.
315 // 3. Lowercase hostname.
316 // 4. Resolve path sequences "/../" and "/./".
318 // That leaves us with the following :-
319 // 1. Remove fragment in URL.
320 GURL url_without_fragment
;
321 GURL::Replacements f_replacements
;
322 f_replacements
.ClearRef();
323 f_replacements
.ClearUsername();
324 f_replacements
.ClearPassword();
325 url_without_fragment
= url
.ReplaceComponents(f_replacements
);
327 // 2. Do URL unescaping until no more hex encoded characters exist.
328 std::string
url_unescaped_str(Unescape(url_without_fragment
.spec()));
330 url::ParseStandardURL(url_unescaped_str
.data(), url_unescaped_str
.length(),
333 // 3. In hostname, remove all leading and trailing dots.
334 const std::string host
=
335 (parsed
.host
.len
> 0)
336 ? url_unescaped_str
.substr(parsed
.host
.begin
, parsed
.host
.len
)
338 const char kCharsToTrim
[] = ".";
339 std::string host_without_end_dots
;
340 base::TrimString(host
, kCharsToTrim
, &host_without_end_dots
);
342 // 4. In hostname, replace consecutive dots with a single dot.
343 std::string
host_without_consecutive_dots(RemoveConsecutiveChars(
344 host_without_end_dots
, '.'));
346 // 5. In path, replace runs of consecutive slashes with a single slash.
348 (parsed
.path
.len
> 0)
349 ? url_unescaped_str
.substr(parsed
.path
.begin
, parsed
.path
.len
)
351 std::string
path_without_consecutive_slash(RemoveConsecutiveChars(path
, '/'));
353 url::Replacements
<char> hp_replacements
;
354 hp_replacements
.SetHost(
355 host_without_consecutive_dots
.data(),
356 url::Component(0, host_without_consecutive_dots
.length()));
357 hp_replacements
.SetPath(
358 path_without_consecutive_slash
.data(),
359 url::Component(0, path_without_consecutive_slash
.length()));
361 std::string url_unescaped_with_can_hostpath
;
362 url::StdStringCanonOutput
output(&url_unescaped_with_can_hostpath
);
363 url::Parsed temp_parsed
;
364 url::ReplaceComponents(url_unescaped_str
.data(),
365 url_unescaped_str
.length(),
373 // 6. Step needed to revert escaping done in url::ReplaceComponents.
374 url_unescaped_with_can_hostpath
= Unescape(url_unescaped_with_can_hostpath
);
376 // 7. After performing all above steps, percent-escape all chars in url which
377 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
378 std::string
escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath
));
379 url::Parsed final_parsed
;
380 url::ParseStandardURL(escaped_canon_url_str
.data(),
381 escaped_canon_url_str
.length(),
384 if (canonicalized_hostname
&& final_parsed
.host
.len
> 0) {
385 *canonicalized_hostname
=
386 escaped_canon_url_str
.substr(final_parsed
.host
.begin
,
387 final_parsed
.host
.len
);
389 if (canonicalized_path
&& final_parsed
.path
.len
> 0) {
390 *canonicalized_path
= escaped_canon_url_str
.substr(final_parsed
.path
.begin
,
391 final_parsed
.path
.len
);
393 if (canonicalized_query
&& final_parsed
.query
.len
> 0) {
394 *canonicalized_query
= escaped_canon_url_str
.substr(
395 final_parsed
.query
.begin
, final_parsed
.query
.len
);
399 void GenerateHostsToCheck(const GURL
& url
, std::vector
<std::string
>* hosts
) {
402 std::string canon_host
;
403 CanonicalizeUrl(url
, &canon_host
, NULL
, NULL
);
405 const std::string host
= canon_host
; // const sidesteps GCC bugs below!
409 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
410 // hostnames formed by starting with the last 5 components and successively
411 // removing the leading component. The last component isn't examined alone,
412 // since it's the TLD or a subcomponent thereof.
414 // Note that we don't need to be clever about stopping at the "real" eTLD --
415 // the data on the server side has been filtered to ensure it will not
416 // blacklist a whole TLD, and it's not significantly slower on our side to
417 // just check too much.
419 // Also note that because we have a simple blacklist, not some sort of complex
420 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
422 const size_t kMaxHostsToCheck
= 4;
423 bool skipped_last_component
= false;
424 for (std::string::const_reverse_iterator
i(host
.rbegin());
425 i
!= host
.rend() && hosts
->size() < kMaxHostsToCheck
; ++i
) {
427 if (skipped_last_component
)
428 hosts
->push_back(std::string(i
.base(), host
.end()));
430 skipped_last_component
= true;
433 hosts
->push_back(host
);
436 void GeneratePathsToCheck(const GURL
& url
, std::vector
<std::string
>* paths
) {
439 std::string canon_path
;
440 std::string canon_query
;
441 CanonicalizeUrl(url
, NULL
, &canon_path
, &canon_query
);
443 const std::string path
= canon_path
; // const sidesteps GCC bugs below!
444 const std::string query
= canon_query
;
448 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
449 // the query parameters, and also up to 4 paths formed by starting at the root
450 // and adding more path components.
452 // As with the hosts above, it doesn't matter what order we check these in.
453 const size_t kMaxPathsToCheck
= 4;
454 for (std::string::const_iterator
i(path
.begin());
455 i
!= path
.end() && paths
->size() < kMaxPathsToCheck
; ++i
) {
457 paths
->push_back(std::string(path
.begin(), i
+ 1));
460 if (!paths
->empty() && paths
->back() != path
)
461 paths
->push_back(path
);
464 paths
->push_back(path
+ "?" + query
);
467 void GeneratePatternsToCheck(const GURL
& url
, std::vector
<std::string
>* urls
) {
468 std::vector
<std::string
> hosts
, paths
;
469 GenerateHostsToCheck(url
, &hosts
);
470 GeneratePathsToCheck(url
, &paths
);
471 for (size_t h
= 0; h
< hosts
.size(); ++h
) {
472 for (size_t p
= 0; p
< paths
.size(); ++p
) {
473 urls
->push_back(hosts
[h
] + paths
[p
]);
478 int GetHashIndex(const SBFullHash
& hash
,
479 const std::vector
<SBFullHashResult
>& full_hashes
) {
480 for (size_t i
= 0; i
< full_hashes
.size(); ++i
) {
481 if (SBFullHashEqual(hash
, full_hashes
[i
].hash
))
482 return static_cast<int>(i
);
487 int GetUrlHashIndex(const GURL
& url
,
488 const std::vector
<SBFullHashResult
>& full_hashes
) {
489 if (full_hashes
.empty())
492 std::vector
<std::string
> patterns
;
493 GeneratePatternsToCheck(url
, &patterns
);
495 for (size_t i
= 0; i
< patterns
.size(); ++i
) {
496 SBFullHash key
= SBFullHashForString(patterns
[i
]);
497 int index
= GetHashIndex(key
, full_hashes
);
504 GURL
GeneratePhishingReportUrl(const std::string
& report_page
,
505 const std::string
& url_to_report
,
506 bool is_client_side_detection
) {
507 const std::string current_esc
= net::EscapeQueryParamValue(url_to_report
,
511 BrowserDistribution
* dist
= BrowserDistribution::GetDistribution();
512 std::string
client_name(dist
->GetSafeBrowsingName());
514 std::string
client_name("googlechrome");
516 if (is_client_side_detection
)
517 client_name
.append("_csd");
519 GURL
report_url(report_page
+ base::StringPrintf(kReportParams
,
521 current_esc
.c_str()));
522 return google_util::AppendGoogleLocaleParam(report_url
);
525 SBFullHash
StringToSBFullHash(const std::string
& hash_in
) {
526 DCHECK_EQ(crypto::kSHA256Length
, hash_in
.size());
528 memcpy(hash_out
.full_hash
, hash_in
.data(), crypto::kSHA256Length
);
532 std::string
SBFullHashToString(const SBFullHash
& hash
) {
533 DCHECK_EQ(crypto::kSHA256Length
, sizeof(hash
.full_hash
));
534 return std::string(hash
.full_hash
, sizeof(hash
.full_hash
));
537 } // namespace safe_browsing_util