// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"

#include "base/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"

namespace {

// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;

// Version history:
// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
const int32 kFileVersion = 8;

// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;

// Header at the front of the main database file.
struct FileHeaderV7 {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements. Newly written files will
// be sharded to need less than this amount of memory during update. Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;

// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead to
// poor performance. Shard stride is indirectly related to
// |kUpdateStorageBytes|, setting that very small will bump against this.
const uint32 kMinShardStride = 1 << 24;

// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;

// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = ~0;
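
// Illustrative numbers (not from measurements): with |kUpdateStorageBytes| of
// 100 KiB, a store of roughly 3.2 MB halves the stride five times during
// update, giving an output stride of 1 << 27 and 32 shards of about 100 KiB
// each. See the stride calculation in DoUpdate() below.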

// Header at the front of the main database file.
struct FileHeaderV8 {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 shard_stride;
  // TODO(shess): Is this where 64-bit will bite me? Perhaps write a
  // specialized read/write?
};

union FileHeader {
  struct FileHeaderV7 v7;
  struct FileHeaderV8 v8;
};
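
// Overall v8 file layout, as written by WriteHeader() and the shard loop in
// DoUpdate() below:
//   FileHeaderV8
//   add chunk ids (int32 * add_chunk_count)
//   sub chunk ids (int32 * sub_chunk_count)
//   MD5 digest of the header data above
//   one or more groups of { ShardHeader, prefixes, full hashes }
//   MD5 digest of everything above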

// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Header for each shard of data in the main database file.
struct ShardHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Enumerate different format-change events for histogramming
// purposes. DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
  // Corruption detected, broken down by file format.
  FORMAT_EVENT_FILE_CORRUPT,
  FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

  // The type of format found in the file. The expected case (new
  // file format) is intentionally not covered.
  FORMAT_EVENT_FOUND_SQLITE,
  FORMAT_EVENT_FOUND_UNKNOWN,

  // The number of SQLite-format files deleted should be the same as
  // FORMAT_EVENT_FOUND_SQLITE. It can differ if the delete fails,
  // or if a failure prevents the update from succeeding.
  FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

  // Found and deleted (or failed to delete) the ancient "Safe
  // Browsing" file.
  FORMAT_EVENT_DELETED_ORIGINAL,
  FORMAT_EVENT_DELETED_ORIGINAL_FAILED,

  // The checksum did not check out in CheckValidity() or in
  // FinishUpdate(). This most likely indicates that the machine
  // crashed before the file was fully sync'ed to disk.
  FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

  // The header checksum was incorrect in ReadAndVerifyHeader(). Likely
  // indicates that the system crashed while writing an update.
  FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,

  // Memory space for histograms is determined by the max. ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  FORMAT_EVENT_MAX
};

void RecordFormatEvent(FormatEventType event_type) {
  UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}

// Rewind the file. Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
  int rv = fseek(fp, 0, SEEK_SET);
  DCHECK_EQ(rv, 0);
  return rv == 0;
}

// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL. Return true on success.
template <class T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fread(item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<char*>(item),
                                      sizeof(T)));
  }
  return true;
}

// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL. Return true on success.
template <class T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fwrite(&item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<const char*>(&item),
                                      sizeof(T)));
  }

  return true;
}
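
// NOTE: ReadItem() and WriteItem() serialize the raw bytes of T, so the file
// format depends on writer and reader agreeing on struct layout and byte
// order. The non-palindromic kFileMagic and the checksums exist to detect
// mismatches, not to prevent them.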

// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|. Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
                     base::MD5Context* context) {
  if (!count)
    return true;

  for (size_t i = 0; i < count; ++i) {
    typename CT::value_type value;
    if (!ReadItem(&value, fp, context))
      return false;

    // push_back() is more obvious, but coded this way std::set can
    // also be read.
    values->insert(values->end(), value);
  }

  return true;
}

// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL. Returns true if all items successful.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
                FILE* fp, base::MD5Context* context) {
  for (CTI iter = beg; iter != end; ++iter) {
    if (!WriteItem(*iter, fp, context))
      return false;
  }
  return true;
}

// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL. Returns true if all items successful.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
                    base::MD5Context* context) {
  return WriteRange(values.begin(), values.end(), fp, context);
}

// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
                         std::set<int32>* chunks) {
  for (std::set<int32>::iterator iter = chunks->begin();
       iter != chunks->end();) {
    std::set<int32>::iterator prev = iter++;
    if (deleted.count(*prev) > 0)
      chunks->erase(prev);
  }
}

bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
  base::MD5Digest calculated_digest;
  base::MD5IntermediateFinal(&calculated_digest, context);

  base::MD5Digest file_digest;
  if (!ReadItem(&file_digest, fp, context))
    return false;

  return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}
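
// NOTE: the digest read from the file is itself folded into |context| (the
// ReadItem() call above passes |context|), mirroring how WriteHeader() folds
// the header digest into the output checksum. A later checksum over the same
// context therefore also covers the embedded digest bytes.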

// Sanity-check the header against the file's size to make sure our
// vectors aren't gigantic. This doubles as a cheap way to detect
// corruption without having to checksum the entire file.
bool FileHeaderV7SanityCheck(const base::FilePath& filename,
                             const FileHeaderV7& header) {
  DCHECK_EQ(header.version, 7);

  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  int64 expected_size = sizeof(FileHeaderV7);
  expected_size += header.add_chunk_count * sizeof(int32);
  expected_size += header.sub_chunk_count * sizeof(int32);
  expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
  expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
  expected_size += header.add_hash_count * sizeof(SBAddFullHash);
  expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
  expected_size += sizeof(base::MD5Digest);
  if (size != expected_size)
    return false;

  return true;
}

// Helper function to read the file header and chunk TOC. Rewinds |fp| and
// initializes |context|. The header is left in |header|, with the version
// returned. kInvalidVersion is returned for sanity check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
                        FileHeader* header,
                        std::set<int32>* add_chunks,
                        std::set<int32>* sub_chunks,
                        FILE* fp,
                        base::MD5Context* context) {
  DCHECK(header);
  DCHECK(add_chunks);
  DCHECK(sub_chunks);
  DCHECK(fp);
  DCHECK(context);

  int version = kInvalidVersion;

  base::MD5Init(context);
  if (!FileRewind(fp))
    return kInvalidVersion;
  if (!ReadItem(&header->v8, fp, context))
    return kInvalidVersion;
  if (header->v8.magic != kFileMagic)
    return kInvalidVersion;

  size_t add_chunks_count = 0;
  size_t sub_chunks_count = 0;

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->v8.version);

  if (header->v8.version == 7) {
    version = 7;

    // Reset the context and re-read the v7 header.
    base::MD5Init(context);
    if (!FileRewind(fp))
      return kInvalidVersion;
    if (!ReadItem(&header->v7, fp, context))
      return kInvalidVersion;
    if (header->v7.magic != kFileMagic || header->v7.version != 7)
      return kInvalidVersion;
    if (!FileHeaderV7SanityCheck(filename, header->v7))
      return kInvalidVersion;

    add_chunks_count = header->v7.add_chunk_count;
    sub_chunks_count = header->v7.sub_chunk_count;
  } else if (header->v8.version == kFileVersion) {
    version = 8;
    add_chunks_count = header->v8.add_chunk_count;
    sub_chunks_count = header->v8.sub_chunk_count;
  } else {
    return kInvalidVersion;
  }

  if (!ReadToContainer(add_chunks, add_chunks_count, fp, context) ||
      !ReadToContainer(sub_chunks, sub_chunks_count, fp, context)) {
    return kInvalidVersion;
  }

  // v8 includes a checksum to validate the header.
  if (version > 7 && !ReadAndVerifyChecksum(fp, context)) {
    RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    return kInvalidVersion;
  }

  return version;
}

// Helper function to write out the initial header and chunks-contained data.
// Rewinds |fp|, initializes |context|, then writes a file header and
// |add_chunks| and |sub_chunks|.
bool WriteHeader(uint32 out_stride,
                 const std::set<int32>& add_chunks,
                 const std::set<int32>& sub_chunks,
                 FILE* fp,
                 base::MD5Context* context) {
  if (!FileRewind(fp))
    return false;

  base::MD5Init(context);
  FileHeaderV8 header;
  header.magic = kFileMagic;
  header.version = kFileVersion;
  header.add_chunk_count = add_chunks.size();
  header.sub_chunk_count = sub_chunks.size();
  header.shard_stride = out_stride;
  if (!WriteItem(header, fp, context))
    return false;

  if (!WriteContainer(add_chunks, fp, context) ||
      !WriteContainer(sub_chunks, fp, context))
    return false;

  // Write out the header digest.
  base::MD5Digest header_digest;
  base::MD5IntermediateFinal(&header_digest, context);
  if (!WriteItem(header_digest, fp, context))
    return false;

  return true;
}

// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 2) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
    if (less(*beg, *n))
      return false;
  }
  return true;
}

// Merge |beg|..|end| into |container|. Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
  DCHECK(sorted(container->begin(), container->end(), less));
  DCHECK(sorted(beg, end, less));

  // Size the container to fit the results.
  const size_t c_size = container->size();
  container->resize(c_size + (end - beg));

  // |c_end| points to the original endpoint, while |c_out| points to the
  // endpoint that will scan from end to beginning while merging.
  typename CT::iterator c_end = container->begin() + c_size;
  typename CT::iterator c_out = container->end();

  // While both inputs have data, move the greater to |c_out|.
  while (c_end != container->begin() && end != beg) {
    if (less(*(c_end - 1), *(end - 1))) {
      *(--c_out) = *(--end);
    } else {
      *(--c_out) = *(--c_end);
    }
  }

  // Copy any data remaining in the new range.
  if (end != beg) {
    // The original container data has been fully shifted.
    DCHECK(c_end == container->begin());

    // There is exactly the correct amount of space left.
    DCHECK_EQ(c_out - c_end, end - beg);

    std::copy(beg, end, container->begin());
  }

  DCHECK(sorted(container->begin(), container->end(), less));
}
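
// For example, merging the range {2, 4} into a container holding {1, 3, 5}
// resizes the container to five elements and fills from the back: 5, 4, 3,
// then 2, with 1 left in place. Each element is written at most once.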

// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
 public:
  StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
                   SBSubPrefixes::iterator sub_prefixes_iter,
                   std::vector<SBAddFullHash>::iterator add_hashes_iter,
                   std::vector<SBSubFullHash>::iterator sub_hashes_iter)
      : add_prefixes_iter_(add_prefixes_iter),
        sub_prefixes_iter_(sub_prefixes_iter),
        add_hashes_iter_(add_hashes_iter),
        sub_hashes_iter_(sub_hashes_iter) {
  }

  SBAddPrefixes::iterator add_prefixes_iter_;
  SBSubPrefixes::iterator sub_prefixes_iter_;
  std::vector<SBAddFullHash>::iterator add_hashes_iter_;
  std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};

// Helper to find the next shard boundary.
template <class T>
bool prefix_bounder(SBPrefix val, const T& elt) {
  return val < elt.GetAddPrefix();
}
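
// The argument order (value first, element second) matches the comparator
// signature std::upper_bound() expects, so ShardEnd() below can binary-search
// each sorted container for the first element past a shard's maximum prefix.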

// Container for partial database state. Includes add/sub prefixes/hashes, plus
// aggregate operations on same.
class StateInternal {
 public:
  // Append indicated amount of data from |fp|.
  bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
                  size_t add_hash_count, size_t sub_hash_count,
                  FILE* fp, base::MD5Context* context) {
    return
        ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
        ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
        ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
        ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
  }

  void ClearData() {
    add_prefixes_.clear();
    sub_prefixes_.clear();
    add_full_hashes_.clear();
    sub_full_hashes_.clear();
  }

  // Merge data from |beg|..|end| into receiver's state, then process the
  // state. The current state and the range given should correspond to the
  // same sorted shard of data from different sources. |add_del_cache| and
  // |sub_del_cache| indicate the chunk ids which should be deleted during
  // processing (see SBProcessSubs).
  void MergeDataAndProcess(const StateInternalPos& beg,
                           const StateInternalPos& end,
                           const base::hash_set<int32>& add_del_cache,
                           const base::hash_set<int32>& sub_del_cache) {
    container_merge(&add_prefixes_,
                    beg.add_prefixes_iter_,
                    end.add_prefixes_iter_,
                    SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);

    container_merge(&sub_prefixes_,
                    beg.sub_prefixes_iter_,
                    end.sub_prefixes_iter_,
                    SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);

    container_merge(&add_full_hashes_,
                    beg.add_hashes_iter_,
                    end.add_hashes_iter_,
                    SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);

    container_merge(&sub_full_hashes_,
                    beg.sub_hashes_iter_,
                    end.sub_hashes_iter_,
                    SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);

    SBProcessSubs(&add_prefixes_, &sub_prefixes_,
                  &add_full_hashes_, &sub_full_hashes_,
                  add_del_cache, sub_del_cache);
  }

  // Sort the data appropriately for the sharding, merging, and processing
  // operations.
  void SortData() {
    std::sort(add_prefixes_.begin(), add_prefixes_.end(),
              SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
    std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
              SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
    std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
              SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
    std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
              SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
  }

  // Iterator from the beginning of the state's data.
  StateInternalPos StateBegin() {
    return StateInternalPos(add_prefixes_.begin(),
                            sub_prefixes_.begin(),
                            add_full_hashes_.begin(),
                            sub_full_hashes_.begin());
  }

  // An iterator pointing just after the last possible element of the shard
  // indicated by |shard_max|. Used to step through the state by shard.
  // TODO(shess): Verify whether binary search really improves over linear.
  // Merging or writing will immediately touch all of these elements.
  StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    return StateInternalPos(
        std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
                         shard_max, prefix_bounder<SBAddPrefix>),
        std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
                         shard_max, prefix_bounder<SBSubPrefix>),
        std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
                         shard_max, prefix_bounder<SBAddFullHash>),
        std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
                         shard_max, prefix_bounder<SBSubFullHash>));
  }

  // Write a shard header and data for the shard starting at |beg| and ending
  // at the element before |end|.
  bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
                  FILE* fp, base::MD5Context* context) {
    ShardHeader shard_header;
    shard_header.add_prefix_count =
        end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    shard_header.sub_prefix_count =
        end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    shard_header.add_hash_count =
        end.add_hashes_iter_ - beg.add_hashes_iter_;
    shard_header.sub_hash_count =
        end.sub_hashes_iter_ - beg.sub_hashes_iter_;

    return
        WriteItem(shard_header, fp, context) &&
        WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
                   fp, context);
  }

  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_full_hashes_;
  std::vector<SBSubFullHash> sub_full_hashes_;
};

// True if |val| is an even power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  return val && (val & (val - 1)) == 0;
}
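
// Standard bit trick: a power of two has exactly one bit set, so
// val & (val - 1) clears it to zero. E.g. IsPowerOfTwo(1 << 24) is true;
// IsPowerOfTwo(0) and IsPowerOfTwo(100 * 1024) are false.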

// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes(). Those functions are generally used only for smaller
// files. Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
                       StateInternal* db_state) {
  base::ScopedFILE file(base::OpenFile(filename, "rb"));
  if (file.get() == NULL)
    return false;

  std::set<int32> add_chunks;
  std::set<int32> sub_chunks;

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
                          file.get(), &context);
  if (version == kInvalidVersion)
    return false;

  if (version == 7) {
    if (!db_state->AppendData(header.v7.add_prefix_count,
                              header.v7.sub_prefix_count,
                              header.v7.add_hash_count,
                              header.v7.sub_hash_count,
                              file.get(), &context)) {
      return false;
    }

    // v7 data was not stored sorted.
    db_state->SortData();
  } else {
    // Read until the shard start overflows, always at least one pass.
    uint64 in_min = 0;
    uint64 in_stride = header.v8.shard_stride;
    if (!in_stride)
      in_stride = kMaxShardStride;
    if (!IsPowerOfTwo(in_stride))
      return false;

    do {
      ShardHeader shard_header;
      if (!ReadItem(&shard_header, file.get(), &context))
        return false;

      if (!db_state->AppendData(shard_header.add_prefix_count,
                                shard_header.sub_prefix_count,
                                shard_header.add_hash_count,
                                shard_header.sub_hash_count,
                                file.get(), &context)) {
        return false;
      }

      in_min += in_stride;
    } while (in_min <= kMaxSBPrefix);
  }

  if (!ReadAndVerifyChecksum(file.get(), &context))
    return false;

  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  return static_cast<int64>(ftell(file.get())) == size;
}
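
// NOTE: the final ftell() comparison above rejects a file with trailing
// garbage after its checksum, which checksum verification alone would not
// catch.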

}  // namespace

// static
void SafeBrowsingStoreFile::CheckForOriginalAndDelete(
    const base::FilePath& current_filename) {
  const base::FilePath original_filename(
      current_filename.DirName().AppendASCII("Safe Browsing"));
  if (base::PathExists(original_filename)) {
    int64 size = 0;
    if (base::GetFileSize(original_filename, &size)) {
      UMA_HISTOGRAM_COUNTS("SB2.OldDatabaseKilobytes",
                           static_cast<int>(size / 1024));
    }

    if (base::DeleteFile(original_filename, false)) {
      RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL);
    } else {
      RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL_FAILED);
    }

    // Just best-effort on the journal file, don't want to get lost in
    // the weeds.
    const base::FilePath journal_filename(
        current_filename.DirName().AppendASCII("Safe Browsing-journal"));
    base::DeleteFile(journal_filename, false);
  }
}

SafeBrowsingStoreFile::SafeBrowsingStoreFile()
    : chunks_written_(0), empty_(false), corruption_seen_(false) {}

SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
  Close();
}

bool SafeBrowsingStoreFile::Delete() {
  // The database should not be open at this point. But, just in
  // case, close everything before deleting.
  if (!Close()) {
    NOTREACHED();
    return false;
  }

  return DeleteStore(filename_);
}

bool SafeBrowsingStoreFile::CheckValidity() {
  // The file was either empty or never opened. The empty case is
  // presumed not to be invalid. The never-opened case can happen if
  // BeginUpdate() fails for any databases, and should already have
  // caused the corruption callback to fire.
  if (!file_.get())
    return true;

  if (!FileRewind(file_.get()))
    return OnCorruptDatabase();

  int64 size = 0;
  if (!base::GetFileSize(filename_, &size))
    return OnCorruptDatabase();

  base::MD5Context context;
  base::MD5Init(&context);

  // Read everything except the final digest.
  size_t bytes_left = static_cast<size_t>(size);
  CHECK(size == static_cast<int64>(bytes_left));
  if (bytes_left < sizeof(base::MD5Digest))
    return OnCorruptDatabase();
  bytes_left -= sizeof(base::MD5Digest);

  // Fold the contents of the file into the checksum.
  while (bytes_left > 0) {
    char buf[4096];
    const size_t c = std::min(sizeof(buf), bytes_left);
    const size_t ret = fread(buf, 1, c, file_.get());

    // The file's size changed while reading, give up.
    if (ret != c)
      return OnCorruptDatabase();
    base::MD5Update(&context, base::StringPiece(buf, c));
    bytes_left -= c;
  }

  if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    return OnCorruptDatabase();
  }

  return true;
}

void SafeBrowsingStoreFile::Init(
    const base::FilePath& filename,
    const base::Closure& corruption_callback) {
  filename_ = filename;
  corruption_callback_ = corruption_callback;
}

bool SafeBrowsingStoreFile::BeginChunk() {
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
  add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
  add_prefixes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_prefixes->swap(db_state.add_prefixes_);
  return true;
}

bool SafeBrowsingStoreFile::GetAddFullHashes(
    std::vector<SBAddFullHash>* add_full_hashes) {
  add_full_hashes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_full_hashes->swap(db_state.add_full_hashes_);
  return true;
}

bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
                                         base::Time receive_time,
                                         const SBFullHash& full_hash) {
  add_hashes_.push_back(SBAddFullHash(chunk_id, receive_time, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
                                           int32 add_chunk_id,
                                           SBPrefix prefix) {
  sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                                         const SBFullHash& full_hash) {
  sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::OnCorruptDatabase() {
  if (!corruption_seen_)
    RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
  corruption_seen_ = true;

  corruption_callback_.Run();

  // Return false as a convenience to callers.
  return false;
}

bool SafeBrowsingStoreFile::Close() {
  ClearUpdateBuffers();

  // Make sure the files are closed.
  file_.reset();
  new_file_.reset();
  return true;
}

bool SafeBrowsingStoreFile::BeginUpdate() {
  DCHECK(!file_.get() && !new_file_.get());

  // Structures should all be clear unless something bad happened.
  DCHECK(add_chunks_cache_.empty());
  DCHECK(sub_chunks_cache_.empty());
  DCHECK(add_del_cache_.empty());
  DCHECK(sub_del_cache_.empty());
  DCHECK(add_prefixes_.empty());
  DCHECK(sub_prefixes_.empty());
  DCHECK(add_hashes_.empty());
  DCHECK(sub_hashes_.empty());
  DCHECK_EQ(chunks_written_, 0);

  // Since the following code will already hit the profile looking for
  // database files, this is a reasonable time to delete any old
  // files.
  CheckForOriginalAndDelete(filename_);

  corruption_seen_ = false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
  if (new_file.get() == NULL)
    return false;

  base::ScopedFILE file(base::OpenFile(filename_, "rb"));
  empty_ = (file.get() == NULL);
  if (empty_) {
    // If the file exists but cannot be opened, try to delete it (not
    // deleting directly, the bloom filter needs to be deleted, too).
    if (base::PathExists(filename_))
      return OnCorruptDatabase();

    new_file_.swap(new_file);
    return true;
  }

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename_, &header,
                          &add_chunks_cache_, &sub_chunks_cache_,
                          file.get(), &context);
  if (version == kInvalidVersion) {
    FileHeaderV8 retry_header;
    if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL) &&
        (retry_header.magic != kFileMagic ||
         (retry_header.version != 8 && retry_header.version != 7))) {
      // TODO(shess): Think on whether these histograms are generating any
      // actionable data. I kid you not, SQLITE happens many thousands of times
      // per day, UNKNOWN about 3x higher than that.
      if (!strcmp(reinterpret_cast<char*>(&retry_header.magic),
                  "SQLite format 3")) {
        RecordFormatEvent(FORMAT_EVENT_FOUND_SQLITE);
      } else {
        RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
      }
    }

    // Close the file so that it can be deleted.
    file.reset();

    return OnCorruptDatabase();
  }

  file_.swap(file);
  new_file_.swap(new_file);
  return true;
}

bool SafeBrowsingStoreFile::FinishChunk() {
  if (!add_prefixes_.size() && !sub_prefixes_.size() &&
      !add_hashes_.size() && !sub_hashes_.size())
    return true;

  ChunkHeader header;
  header.add_prefix_count = add_prefixes_.size();
  header.sub_prefix_count = sub_prefixes_.size();
  header.add_hash_count = add_hashes_.size();
  header.sub_hash_count = sub_hashes_.size();
  if (!WriteItem(header, new_file_.get(), NULL))
    return false;

  if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    return false;

  ++chunks_written_;

  // Clear everything to save memory.
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::DoUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(file_.get() || empty_);
  DCHECK(new_file_.get());
  CHECK(builder);
  CHECK(add_full_hashes_result);

  // Rewind the temporary storage.
  if (!FileRewind(new_file_.get()))
    return false;

  // Get chunk file's size for validating counts.
  int64 update_size = 0;
  if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    return OnCorruptDatabase();

  // Track update size to answer questions at http://crbug.com/72216 .
  // Log small updates as 1k so that the 0 (underflow) bucket can be
  // used for "empty" in SafeBrowsingDatabase.
  UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
                       std::max(static_cast<int>(update_size / 1024), 1));

  // Chunk updates to integrate.
  StateInternal new_state;

  // Read update chunks.
  for (int i = 0; i < chunks_written_; ++i) {
    ChunkHeader header;

    int64 ofs = ftell(new_file_.get());
    if (ofs == -1)
      return false;

    if (!ReadItem(&header, new_file_.get(), NULL))
      return false;

    // As a safety measure, make sure that the header describes a sane
    // chunk, given the remaining file size.
    int64 expected_size = ofs + sizeof(ChunkHeader);
    expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    if (expected_size > update_size)
      return false;

    if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
                              header.add_hash_count, header.sub_hash_count,
                              new_file_.get(), NULL)) {
      return false;
    }
  }

  // The state was accumulated by chunk, sort by prefix.
  new_state.SortData();

  // These strides control how much data is loaded into memory per pass.
  // Strides must be an even power of two. |in_stride| will be derived from the
  // input file. |out_stride| will be derived from an estimate of the resulting
  // file's size. |process_stride| will be the max of both.
  uint64 in_stride = kMaxShardStride;
  uint64 out_stride = kMaxShardStride;
  uint64 process_stride = 0;

  // The header info is only used later if |!empty_|. The v8 read loop only
  // needs |in_stride|, while v7 needs to refer to header information.
  base::MD5Context in_context;
  int version = kInvalidVersion;
  FileHeader header;

  if (!empty_) {
    DCHECK(file_.get());

    version = ReadAndVerifyHeader(filename_, &header,
                                  &add_chunks_cache_, &sub_chunks_cache_,
                                  file_.get(), &in_context);
    if (version == kInvalidVersion)
      return OnCorruptDatabase();

    if (version == 8 && header.v8.shard_stride)
      in_stride = header.v8.shard_stride;

    // The header checksum should have prevented this case, but the code will
    // be broken if this is not correct.
    if (!IsPowerOfTwo(in_stride))
      return OnCorruptDatabase();
  }

  // We no longer need to track deleted chunks.
  DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
  DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);

  // Calculate |out_stride| to break the file down into reasonable shards.
  {
    int64 original_size = 0;
    if (!empty_ && !base::GetFileSize(filename_, &original_size))
      return OnCorruptDatabase();

    // Approximate the final size as everything. Subs and deletes will reduce
    // the size, but modest over-sharding won't hurt much.
    int64 shard_size = original_size + update_size;

    // Keep splitting until a single stride of data fits the target.
    size_t shifts = 0;
    while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
      out_stride >>= 1;
      shard_size >>= 1;
      ++shifts;
    }
    UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);

    DCHECK(IsPowerOfTwo(out_stride));
  }

  // Outer loop strides by the max of the input stride (to read integral
  // shards) and the output stride (to write integral shards).
  process_stride = std::max(in_stride, out_stride);
  DCHECK(IsPowerOfTwo(process_stride));
  DCHECK_EQ(0u, process_stride % in_stride);
  DCHECK_EQ(0u, process_stride % out_stride);
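
  // Because |in_stride| and |out_stride| are both powers of two, the larger
  // is always a whole multiple of the smaller (which the DCHECK_EQs above
  // verify), so each processing pass covers whole input and output shards.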

  // Start writing the new data to |new_file_|.
  base::MD5Context out_context;
  if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
                   new_file_.get(), &out_context)) {
    return false;
  }

  // Start at the beginning of the SBPrefix space.
  uint64 in_min = 0;
  uint64 out_min = 0;
  uint64 process_min = 0;

  // Start at the beginning of the updates.
  StateInternalPos new_pos = new_state.StateBegin();

  // Re-usable container for shard processing.
  StateInternal db_state;

  // Track aggregate counts for histograms.
  size_t add_prefix_count = 0;
  size_t sub_prefix_count = 0;

  do {
    // Maximum element in the current shard.
    SBPrefix process_max =
        static_cast<SBPrefix>(process_min + process_stride - 1);
    DCHECK_GT(process_max, process_min);

    // Drop the data from previous pass.
    db_state.ClearData();

    // Fill the processing shard with one or more input shards.
    if (!empty_) {
      if (version == 7) {
        // Treat v7 as a single-shard file.
        DCHECK_EQ(in_min, 0u);
        DCHECK_EQ(in_stride, kMaxShardStride);
        DCHECK_EQ(process_stride, kMaxShardStride);
        if (!db_state.AppendData(header.v7.add_prefix_count,
                                 header.v7.sub_prefix_count,
                                 header.v7.add_hash_count,
                                 header.v7.sub_hash_count,
                                 file_.get(), &in_context))
          return OnCorruptDatabase();

        // v7 data is not sorted correctly.
        db_state.SortData();
      } else {
        do {
          ShardHeader shard_header;
          if (!ReadItem(&shard_header, file_.get(), &in_context))
            return OnCorruptDatabase();

          if (!db_state.AppendData(shard_header.add_prefix_count,
                                   shard_header.sub_prefix_count,
                                   shard_header.add_hash_count,
                                   shard_header.sub_hash_count,
                                   file_.get(), &in_context))
            return OnCorruptDatabase();

          in_min += in_stride;
        } while (in_min <= kMaxSBPrefix && in_min < process_max);
      }
    }

    // Shard the update data to match the database data, then merge the update
    // data and process the results.
    {
      StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
      db_state.MergeDataAndProcess(new_pos, new_end,
                                   add_del_cache_, sub_del_cache_);
      new_pos = new_end;
    }

    // Collect the processed data for return to caller.
    for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
      builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    }
    add_full_hashes_result->insert(add_full_hashes_result->end(),
                                   db_state.add_full_hashes_.begin(),
                                   db_state.add_full_hashes_.end());
    add_prefix_count += db_state.add_prefixes_.size();
    sub_prefix_count += db_state.sub_prefixes_.size();

    // Write one or more shards of processed output.
    StateInternalPos out_pos = db_state.StateBegin();
    do {
      SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
      DCHECK_GT(out_max, out_min);

      StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
      if (!db_state.WriteShard(out_pos, out_end, new_file_.get(),
                               &out_context))
        return false;
      out_pos = out_end;

      out_min += out_stride;
    } while (out_min == static_cast<SBPrefix>(out_min) &&
             out_min < process_max);
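
    // NOTE: the equality test above fails once |out_min| reaches 1ULL << 32
    // (the SBPrefix cast truncates to zero), ending the loop after the final
    // 32-bit shard has been written.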

    process_min += process_stride;
  } while (process_min <= kMaxSBPrefix);

  // Verify the overall checksum.
  if (!empty_) {
    if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
      RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
      return OnCorruptDatabase();
    }

    // TODO(shess): Verify EOF?

    // Close the input file so the new file can be renamed over it.
    file_.reset();
  }
  DCHECK(!file_.get());

  // Write the overall checksum.
  base::MD5Digest out_digest;
  base::MD5Final(&out_digest, &out_context);
  if (!WriteItem(out_digest, new_file_.get(), NULL))
    return false;

  // Trim any excess left over from the temporary chunk data.
  if (!base::TruncateFile(new_file_.get()))
    return false;

  // Close the file handle and swizzle the file into place.
  new_file_.reset();
  if (!base::DeleteFile(filename_, false) &&
      base::PathExists(filename_))
    return false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  if (!base::Move(new_filename, filename_))
    return false;

  // Record counts before swapping to caller.
  UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
  UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);

  return true;
}

bool SafeBrowsingStoreFile::FinishUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(builder);
  DCHECK(add_full_hashes_result);

  if (!DoUpdate(builder, add_full_hashes_result)) {
    CancelUpdate();
    return false;
  }

  DCHECK(!new_file_.get());
  DCHECK(!file_.get());

  return Close();
}

bool SafeBrowsingStoreFile::CancelUpdate() {
  return Close();
}

void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  add_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
  return add_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}

void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  sub_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
  return sub_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}

void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  add_del_cache_.insert(chunk_id);
}

void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  sub_del_cache_.insert(chunk_id);
}

// static
bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
  if (!base::DeleteFile(basename, false) &&
      base::PathExists(basename)) {
    NOTREACHED();
    return false;
  }

  const base::FilePath new_filename = TemporaryFileForFilename(basename);
  if (!base::DeleteFile(new_filename, false) &&
      base::PathExists(new_filename)) {
    NOTREACHED();
    return false;
  }

  // With SQLite support gone, one way to get to this code is if the
  // existing file is a SQLite file. Make sure the journal file is
  // also removed.
  const base::FilePath journal_filename(
      basename.value() + FILE_PATH_LITERAL("-journal"));
  if (base::PathExists(journal_filename))
    base::DeleteFile(journal_filename, false);

  return true;
}