Cast: Stop logging kVideoFrameSentToEncoder and rename a couple events.
[chromium-blink-merge.git] / chrome / browser / safe_browsing / safe_browsing_util.cc
blob03346378b39138e5c4968c79978cd0ff27e90e91
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/safe_browsing_util.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "base/strings/stringprintf.h"
10 #include "chrome/browser/google/google_util.h"
11 #include "crypto/sha2.h"
12 #include "net/base/escape.h"
13 #include "url/gurl.h"
14 #include "url/url_util.h"
16 #if defined(OS_WIN)
17 #include "chrome/installer/util/browser_distribution.h"
18 #endif
namespace {

// printf-style query string appended to the phishing report page URL. The
// first %s receives the client name ("tpl") and the second the escaped URL
// being reported ("url").
const char kReportParams[] = "?tpl=%s&url=%s";

}  // namespace
22 SBFullHash SBFullHashForString(const base::StringPiece& str) {
23 SBFullHash h;
24 crypto::SHA256HashString(str, &h.full_hash, sizeof(h.full_hash));
25 return h;
28 // SBChunk ---------------------------------------------------------------------
30 SBChunk::SBChunk()
31 : chunk_number(0),
32 list_id(0),
33 is_add(false) {
36 SBChunk::~SBChunk() {}
38 // SBChunkList -----------------------------------------------------------------
40 SBChunkList::SBChunkList() {}
42 SBChunkList::~SBChunkList() {
43 clear();
46 void SBChunkList::clear() {
47 for (std::vector<SBChunk>::iterator citer = chunks_.begin();
48 citer != chunks_.end(); ++citer) {
49 for (std::deque<SBChunkHost>::iterator hiter = citer->hosts.begin();
50 hiter != citer->hosts.end(); ++hiter) {
51 if (hiter->entry) {
52 hiter->entry->Destroy();
53 hiter->entry = NULL;
57 chunks_.clear();
60 // SBListChunkRanges -----------------------------------------------------------
62 SBListChunkRanges::SBListChunkRanges(const std::string& n) : name(n) {}
64 // SBChunkDelete ---------------------------------------------------------------
66 SBChunkDelete::SBChunkDelete() : is_sub_del(false) {}
68 SBChunkDelete::~SBChunkDelete() {}
70 // SBEntry ---------------------------------------------------------------------
72 // static
73 SBEntry* SBEntry::Create(Type type, int prefix_count) {
74 int size = Size(type, prefix_count);
75 SBEntry *rv = static_cast<SBEntry*>(malloc(size));
76 memset(rv, 0, size);
77 rv->set_type(type);
78 rv->set_prefix_count(prefix_count);
79 return rv;
82 void SBEntry::Destroy() {
83 free(this);
86 // static
87 int SBEntry::PrefixSize(Type type) {
88 switch (type) {
89 case ADD_PREFIX:
90 return sizeof(SBPrefix);
91 case ADD_FULL_HASH:
92 return sizeof(SBFullHash);
93 case SUB_PREFIX:
94 return sizeof(SBSubPrefix);
95 case SUB_FULL_HASH:
96 return sizeof(SBSubFullHash);
97 default:
98 NOTREACHED();
99 return 0;
103 int SBEntry::Size() const {
104 return Size(type(), prefix_count());
107 // static
108 int SBEntry::Size(Type type, int prefix_count) {
109 return sizeof(Data) + prefix_count * PrefixSize(type);
112 int SBEntry::ChunkIdAtPrefix(int index) const {
113 if (type() == SUB_PREFIX)
114 return sub_prefixes_[index].add_chunk;
115 return (type() == SUB_FULL_HASH) ?
116 sub_full_hashes_[index].add_chunk : chunk_id();
119 void SBEntry::SetChunkIdAtPrefix(int index, int chunk_id) {
120 DCHECK(IsSub());
122 if (type() == SUB_PREFIX)
123 sub_prefixes_[index].add_chunk = chunk_id;
124 else
125 sub_full_hashes_[index].add_chunk = chunk_id;
128 const SBPrefix& SBEntry::PrefixAt(int index) const {
129 DCHECK(IsPrefix());
131 return IsAdd() ? add_prefixes_[index] : sub_prefixes_[index].prefix;
134 const SBFullHash& SBEntry::FullHashAt(int index) const {
135 DCHECK(!IsPrefix());
137 return IsAdd() ? add_full_hashes_[index] : sub_full_hashes_[index].prefix;
140 void SBEntry::SetPrefixAt(int index, const SBPrefix& prefix) {
141 DCHECK(IsPrefix());
143 if (IsAdd())
144 add_prefixes_[index] = prefix;
145 else
146 sub_prefixes_[index].prefix = prefix;
149 void SBEntry::SetFullHashAt(int index, const SBFullHash& full_hash) {
150 DCHECK(!IsPrefix());
152 if (IsAdd())
153 add_full_hashes_[index] = full_hash;
154 else
155 sub_full_hashes_[index].prefix = full_hash;
159 // Utility functions -----------------------------------------------------------
161 namespace {
162 bool IsKnownList(const std::string& name) {
163 for (size_t i = 0; i < arraysize(safe_browsing_util::kAllLists); ++i) {
164 if (!strcmp(safe_browsing_util::kAllLists[i], name.c_str())) {
165 return true;
168 return false;
170 } // namespace
172 namespace safe_browsing_util {
// Names of the lists the browser is able to process.
const char kMalwareList[]             = "goog-malware-shavar";
const char kPhishingList[]            = "goog-phish-shavar";
const char kBinUrlList[]              = "goog-badbinurl-shavar";
const char kCsdWhiteList[]            = "goog-csdwhite-sha256";
const char kDownloadWhiteList[]       = "goog-downloadwhite-digest256";
const char kExtensionBlacklist[]      = "goog-badcrxids-digestvar";
const char kSideEffectFreeWhitelist[] = "goog-sideeffectfree-shavar";
const char kIPBlacklist[]             = "goog-badip-digest256";

// Every list name above, gathered in one array so callers can iterate them.
const char* kAllLists[8] = {
  kMalwareList,        kPhishingList,
  kBinUrlList,         kCsdWhiteList,
  kDownloadWhiteList,  kExtensionBlacklist,
  kSideEffectFreeWhitelist,
  kIPBlacklist,
};
195 ListType GetListId(const std::string& name) {
196 ListType id;
197 if (name == safe_browsing_util::kMalwareList) {
198 id = MALWARE;
199 } else if (name == safe_browsing_util::kPhishingList) {
200 id = PHISH;
201 } else if (name == safe_browsing_util::kBinUrlList) {
202 id = BINURL;
203 } else if (name == safe_browsing_util::kCsdWhiteList) {
204 id = CSDWHITELIST;
205 } else if (name == safe_browsing_util::kDownloadWhiteList) {
206 id = DOWNLOADWHITELIST;
207 } else if (name == safe_browsing_util::kExtensionBlacklist) {
208 id = EXTENSIONBLACKLIST;
209 } else if (name == safe_browsing_util::kSideEffectFreeWhitelist) {
210 id = SIDEEFFECTFREEWHITELIST;
211 } else if (name == safe_browsing_util::kIPBlacklist) {
212 id = IPBLACKLIST;
213 } else {
214 id = INVALID;
216 return id;
219 bool GetListName(ListType list_id, std::string* list) {
220 switch (list_id) {
221 case MALWARE:
222 *list = safe_browsing_util::kMalwareList;
223 break;
224 case PHISH:
225 *list = safe_browsing_util::kPhishingList;
226 break;
227 case BINURL:
228 *list = safe_browsing_util::kBinUrlList;
229 break;
230 case CSDWHITELIST:
231 *list = safe_browsing_util::kCsdWhiteList;
232 break;
233 case DOWNLOADWHITELIST:
234 *list = safe_browsing_util::kDownloadWhiteList;
235 break;
236 case EXTENSIONBLACKLIST:
237 *list = safe_browsing_util::kExtensionBlacklist;
238 break;
239 case SIDEEFFECTFREEWHITELIST:
240 *list = safe_browsing_util::kSideEffectFreeWhitelist;
241 break;
242 case IPBLACKLIST:
243 *list = safe_browsing_util::kIPBlacklist;
244 break;
245 default:
246 return false;
248 DCHECK(IsKnownList(*list));
249 return true;
252 std::string Unescape(const std::string& url) {
253 std::string unescaped_str(url);
254 std::string old_unescaped_str;
255 const int kMaxLoopIterations = 1024;
256 int loop_var = 0;
257 do {
258 old_unescaped_str = unescaped_str;
259 unescaped_str = net::UnescapeURLComponent(old_unescaped_str,
260 net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::SPACES |
261 net::UnescapeRule::URL_SPECIAL_CHARS);
262 } while (unescaped_str != old_unescaped_str && ++loop_var <=
263 kMaxLoopIterations);
265 return unescaped_str;
// Percent-escapes every byte of |url| that is <= ' ' (ASCII 32), > '~'
// (i.e. >= 127), '#' or '%', using uppercase hex digits. All other bytes are
// copied unchanged.
std::string Escape(const std::string& url) {
  static const char kHexString[] = "0123456789ABCDEF";

  std::string escaped_str;
  // The result is never shorter than the input, so reserving the input length
  // avoids the early reallocations.
  escaped_str.reserve(url.length());
  for (size_t i = 0; i < url.length(); ++i) {
    // Work on the unsigned value so bytes >= 0x80 compare and shift correctly.
    const unsigned char c = static_cast<unsigned char>(url[i]);
    if (c <= ' ' || c > '~' || c == '#' || c == '%') {
      escaped_str.push_back('%');
      escaped_str.push_back(kHexString[c >> 4]);
      escaped_str.push_back(kHexString[c & 0xf]);
    } else {
      escaped_str.push_back(static_cast<char>(c));
    }
  }

  return escaped_str;
}
// Returns a copy of |str| in which every run of two or more consecutive |c|
// characters is collapsed to a single |c|. Other characters, repeated or not,
// are left untouched. Runs in a single pass over the input.
std::string RemoveConsecutiveChars(const std::string& str, const char c) {
  std::string output;
  output.reserve(str.size());

  bool previous_was_c = false;
  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
    const bool is_c = (*it == c);
    // Drop |c| only when it directly follows another |c|.
    if (!(is_c && previous_was_c))
      output.push_back(*it);
    previous_was_c = is_c;
  }

  return output;
}
297 // Canonicalizes url as per Google Safe Browsing Specification.
298 // See section 6.1 in
299 // http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
300 void CanonicalizeUrl(const GURL& url,
301 std::string* canonicalized_hostname,
302 std::string* canonicalized_path,
303 std::string* canonicalized_query) {
304 DCHECK(url.is_valid());
306 // We only canonicalize "normal" URLs.
307 if (!url.IsStandard())
308 return;
310 // Following canonicalization steps are excluded since url parsing takes care
311 // of those :-
312 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
313 // (Exclude escaped version of these chars).
314 // 2. Normalize hostname to 4 dot-seperated decimal values.
315 // 3. Lowercase hostname.
316 // 4. Resolve path sequences "/../" and "/./".
318 // That leaves us with the following :-
319 // 1. Remove fragment in URL.
320 GURL url_without_fragment;
321 GURL::Replacements f_replacements;
322 f_replacements.ClearRef();
323 f_replacements.ClearUsername();
324 f_replacements.ClearPassword();
325 url_without_fragment = url.ReplaceComponents(f_replacements);
327 // 2. Do URL unescaping until no more hex encoded characters exist.
328 std::string url_unescaped_str(Unescape(url_without_fragment.spec()));
329 url::Parsed parsed;
330 url::ParseStandardURL(url_unescaped_str.data(), url_unescaped_str.length(),
331 &parsed);
333 // 3. In hostname, remove all leading and trailing dots.
334 const std::string host =
335 (parsed.host.len > 0)
336 ? url_unescaped_str.substr(parsed.host.begin, parsed.host.len)
337 : std::string();
338 const char kCharsToTrim[] = ".";
339 std::string host_without_end_dots;
340 base::TrimString(host, kCharsToTrim, &host_without_end_dots);
342 // 4. In hostname, replace consecutive dots with a single dot.
343 std::string host_without_consecutive_dots(RemoveConsecutiveChars(
344 host_without_end_dots, '.'));
346 // 5. In path, replace runs of consecutive slashes with a single slash.
347 std::string path =
348 (parsed.path.len > 0)
349 ? url_unescaped_str.substr(parsed.path.begin, parsed.path.len)
350 : std::string();
351 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));
353 url::Replacements<char> hp_replacements;
354 hp_replacements.SetHost(
355 host_without_consecutive_dots.data(),
356 url::Component(0, host_without_consecutive_dots.length()));
357 hp_replacements.SetPath(
358 path_without_consecutive_slash.data(),
359 url::Component(0, path_without_consecutive_slash.length()));
361 std::string url_unescaped_with_can_hostpath;
362 url::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);
363 url::Parsed temp_parsed;
364 url::ReplaceComponents(url_unescaped_str.data(),
365 url_unescaped_str.length(),
366 parsed,
367 hp_replacements,
368 NULL,
369 &output,
370 &temp_parsed);
371 output.Complete();
373 // 6. Step needed to revert escaping done in url::ReplaceComponents.
374 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);
376 // 7. After performing all above steps, percent-escape all chars in url which
377 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
378 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));
379 url::Parsed final_parsed;
380 url::ParseStandardURL(escaped_canon_url_str.data(),
381 escaped_canon_url_str.length(),
382 &final_parsed);
384 if (canonicalized_hostname && final_parsed.host.len > 0) {
385 *canonicalized_hostname =
386 escaped_canon_url_str.substr(final_parsed.host.begin,
387 final_parsed.host.len);
389 if (canonicalized_path && final_parsed.path.len > 0) {
390 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,
391 final_parsed.path.len);
393 if (canonicalized_query && final_parsed.query.len > 0) {
394 *canonicalized_query = escaped_canon_url_str.substr(
395 final_parsed.query.begin, final_parsed.query.len);
399 void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
400 hosts->clear();
402 std::string canon_host;
403 CanonicalizeUrl(url, &canon_host, NULL, NULL);
405 const std::string host = canon_host; // const sidesteps GCC bugs below!
406 if (host.empty())
407 return;
409 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
410 // hostnames formed by starting with the last 5 components and successively
411 // removing the leading component. The last component isn't examined alone,
412 // since it's the TLD or a subcomponent thereof.
414 // Note that we don't need to be clever about stopping at the "real" eTLD --
415 // the data on the server side has been filtered to ensure it will not
416 // blacklist a whole TLD, and it's not significantly slower on our side to
417 // just check too much.
419 // Also note that because we have a simple blacklist, not some sort of complex
420 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
421 // these in.
422 const size_t kMaxHostsToCheck = 4;
423 bool skipped_last_component = false;
424 for (std::string::const_reverse_iterator i(host.rbegin());
425 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
426 if (*i == '.') {
427 if (skipped_last_component)
428 hosts->push_back(std::string(i.base(), host.end()));
429 else
430 skipped_last_component = true;
433 hosts->push_back(host);
436 void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
437 paths->clear();
439 std::string canon_path;
440 std::string canon_query;
441 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
443 const std::string path = canon_path; // const sidesteps GCC bugs below!
444 const std::string query = canon_query;
445 if (path.empty())
446 return;
448 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
449 // the query parameters, and also up to 4 paths formed by starting at the root
450 // and adding more path components.
452 // As with the hosts above, it doesn't matter what order we check these in.
453 const size_t kMaxPathsToCheck = 4;
454 for (std::string::const_iterator i(path.begin());
455 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
456 if (*i == '/')
457 paths->push_back(std::string(path.begin(), i + 1));
460 if (!paths->empty() && paths->back() != path)
461 paths->push_back(path);
463 if (!query.empty())
464 paths->push_back(path + "?" + query);
467 void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {
468 std::vector<std::string> hosts, paths;
469 GenerateHostsToCheck(url, &hosts);
470 GeneratePathsToCheck(url, &paths);
471 for (size_t h = 0; h < hosts.size(); ++h) {
472 for (size_t p = 0; p < paths.size(); ++p) {
473 urls->push_back(hosts[h] + paths[p]);
478 int GetHashIndex(const SBFullHash& hash,
479 const std::vector<SBFullHashResult>& full_hashes) {
480 for (size_t i = 0; i < full_hashes.size(); ++i) {
481 if (SBFullHashEqual(hash, full_hashes[i].hash))
482 return static_cast<int>(i);
484 return -1;
487 int GetUrlHashIndex(const GURL& url,
488 const std::vector<SBFullHashResult>& full_hashes) {
489 if (full_hashes.empty())
490 return -1;
492 std::vector<std::string> patterns;
493 GeneratePatternsToCheck(url, &patterns);
495 for (size_t i = 0; i < patterns.size(); ++i) {
496 SBFullHash key = SBFullHashForString(patterns[i]);
497 int index = GetHashIndex(key, full_hashes);
498 if (index != -1)
499 return index;
501 return -1;
504 GURL GeneratePhishingReportUrl(const std::string& report_page,
505 const std::string& url_to_report,
506 bool is_client_side_detection) {
507 const std::string current_esc = net::EscapeQueryParamValue(url_to_report,
508 true);
510 #if defined(OS_WIN)
511 BrowserDistribution* dist = BrowserDistribution::GetDistribution();
512 std::string client_name(dist->GetSafeBrowsingName());
513 #else
514 std::string client_name("googlechrome");
515 #endif
516 if (is_client_side_detection)
517 client_name.append("_csd");
519 GURL report_url(report_page + base::StringPrintf(kReportParams,
520 client_name.c_str(),
521 current_esc.c_str()));
522 return google_util::AppendGoogleLocaleParam(report_url);
525 SBFullHash StringToSBFullHash(const std::string& hash_in) {
526 DCHECK_EQ(crypto::kSHA256Length, hash_in.size());
527 SBFullHash hash_out;
528 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);
529 return hash_out;
532 std::string SBFullHashToString(const SBFullHash& hash) {
533 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));
534 return std::string(hash.full_hash, sizeof(hash.full_hash));
537 } // namespace safe_browsing_util