base::Time multiplicative operator overloading
[chromium-blink-merge.git] / chrome / browser / safe_browsing / safe_browsing_store_file.cc
blob47c3f9efafd73e8fb954b0a5130230f3fd2f2b0b
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
7 #include "base/files/file_util.h"
8 #include "base/files/scoped_file.h"
9 #include "base/md5.h"
10 #include "base/metrics/histogram.h"
11 #include "base/metrics/sparse_histogram.h"
13 namespace {
// File-local constants describing the on-disk version-8 store format.
15 // NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
16 // that byte-order changes force corruption.
17 const int32 kFileMagic = 0x600D71FE;
19 // Version history:
20 // Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
21 // Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
22 // Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
23 const int32 kFileVersion = 8;
25 // ReadAndVerifyHeader() returns this in case of error.
26 const int32 kInvalidVersion = -1;
28 // Starting with version 8, the storage is sorted and can be sharded to allow
29 // updates to be done with lower memory requirements. Newly written files will
30 // be sharded to need less than this amount of memory during update. Larger
31 // values are preferred to minimize looping overhead during processing.
32 const int64 kUpdateStorageBytes = 100 * 1024;
34 // Prevent excessive sharding by setting a lower limit on the shard stride.
35 // Smaller values should work fine, but very small values will probably lead to
36 // poor performance. Shard stride is indirectly related to
37 // |kUpdateStorageBytes|, setting that very small will bump against this.
38 const uint32 kMinShardStride = 1 << 24;
40 // Strides over the entire SBPrefix space.
// NOTE: this only fits in 64 bits; when stored in the uint32 header field it
// wraps to 0, which readers treat as "single full-space shard".
41 const uint64 kMaxShardStride = 1ULL << 32;
43 // Maximum SBPrefix value.
44 const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;
46 // Header at the front of the main database file.
// These structs are read and written raw via ReadItem()/WriteItem(), so the
// in-memory layout is the on-disk layout; all fields are fixed-width types.
47 struct FileHeader {
48 int32 magic, version;
49 uint32 add_chunk_count, sub_chunk_count;
50 uint32 shard_stride;
51 // TODO(shess): Is this where 64-bit will bite me? Perhaps write a
52 // specialized read/write?
55 // Header for each chunk in the chunk-accumulation file.
56 struct ChunkHeader {
57 uint32 add_prefix_count, sub_prefix_count;
58 uint32 add_hash_count, sub_hash_count;
61 // Header for each shard of data in the main database file.
// Same item counts as ChunkHeader, but one per shard rather than per chunk.
62 struct ShardHeader {
63 uint32 add_prefix_count, sub_prefix_count;
64 uint32 add_hash_count, sub_hash_count;
67 // Enumerate different format-change events for histogramming
68 // purposes. DO NOT CHANGE THE ORDERING OF THESE VALUES.
// Values are recorded to UMA by RecordFormatEvent(); append-only.
69 enum FormatEventType {
70 // Corruption detected, broken down by file format.
71 FORMAT_EVENT_FILE_CORRUPT,
72 FORMAT_EVENT_SQLITE_CORRUPT, // Obsolete
74 // The type of format found in the file. The expected case (new
75 // file format) is intentionally not covered.
76 FORMAT_EVENT_FOUND_SQLITE, // Obsolete
77 FORMAT_EVENT_FOUND_UNKNOWN, // magic does not match.
79 // The number of SQLite-format files deleted should be the same as
80 // FORMAT_EVENT_FOUND_SQLITE. It can differ if the delete fails,
81 // or if a failure prevents the update from succeeding.
82 FORMAT_EVENT_SQLITE_DELETED, // Obsolete
83 FORMAT_EVENT_SQLITE_DELETE_FAILED, // Obsolete
85 // Found and deleted (or failed to delete) the ancient "Safe
86 // Browsing" file.
87 FORMAT_EVENT_DELETED_ORIGINAL, // Obsolete
88 FORMAT_EVENT_DELETED_ORIGINAL_FAILED, // Obsolete
90 // The checksum did not check out in CheckValidity() or in
91 // FinishUpdate(). This most likely indicates that the machine
92 // crashed before the file was fully sync'ed to disk.
93 FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
94 FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,
96 // The header checksum was incorrect in ReadAndVerifyHeader(). Likely
97 // indicates that the system crashed while writing an update.
98 FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,
100 FORMAT_EVENT_FOUND_DEPRECATED, // version too old.
102 // Memory space for histograms is determined by the max. ALWAYS
103 // ADD NEW VALUES BEFORE THIS ONE.
104 FORMAT_EVENT_MAX
// Report a format-change event to the "SB2.FormatEvent" enumeration histogram.
107 void RecordFormatEvent(FormatEventType event_type) {
108 UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
111 // Rewind the file. Using fseek(2) because rewind(3) errors are
112 // weird.
113 bool FileRewind(FILE* fp) {
114 int rv = fseek(fp, 0, SEEK_SET);
115 DCHECK_EQ(rv, 0);
116 return rv == 0;
119 // Read from |fp| into |item|, and fold the input data into the
120 // checksum in |context|, if non-NULL. Return true on success.
121 template <class T>
122 bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
123 const size_t ret = fread(item, sizeof(T), 1, fp);
124 if (ret != 1)
125 return false;
127 if (context) {
128 base::MD5Update(context,
129 base::StringPiece(reinterpret_cast<char*>(item),
130 sizeof(T)));
132 return true;
135 // Write |item| to |fp|, and fold the output data into the checksum in
136 // |context|, if non-NULL. Return true on success.
137 template <class T>
138 bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
139 const size_t ret = fwrite(&item, sizeof(T), 1, fp);
140 if (ret != 1)
141 return false;
143 if (context) {
144 base::MD5Update(context,
145 base::StringPiece(reinterpret_cast<const char*>(&item),
146 sizeof(T)));
149 return true;
152 // Read |count| items into |values| from |fp|, and fold them into the
153 // checksum in |context|. Returns true on success.
154 template <typename CT>
155 bool ReadToContainer(CT* values, size_t count, FILE* fp,
156 base::MD5Context* context) {
157 if (!count)
158 return true;
160 for (size_t i = 0; i < count; ++i) {
161 typename CT::value_type value;
162 if (!ReadItem(&value, fp, context))
163 return false;
165 // push_back() is more obvious, but coded this way std::set can
166 // also be read.
167 values->insert(values->end(), value);
170 return true;
173 // Write values between |beg| and |end| to |fp|, and fold the data into the
174 // checksum in |context|, if non-NULL. Returns true if all items successful.
175 template <typename CTI>
176 bool WriteRange(const CTI& beg, const CTI& end,
177 FILE* fp, base::MD5Context* context) {
178 for (CTI iter = beg; iter != end; ++iter) {
179 if (!WriteItem(*iter, fp, context))
180 return false;
182 return true;
185 // Write all of |values| to |fp|, and fold the data into the checksum
186 // in |context|, if non-NULL. Returns true if all items successful.
// Thin convenience wrapper over WriteRange() for whole containers.
187 template <typename CT>
188 bool WriteContainer(const CT& values, FILE* fp,
189 base::MD5Context* context) {
190 return WriteRange(values.begin(), values.end(), fp, context);
193 // Delete the chunks in |deleted| from |chunks|.
194 void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
195 std::set<int32>* chunks) {
196 for (std::set<int32>::iterator iter = chunks->begin();
197 iter != chunks->end();) {
198 std::set<int32>::iterator prev = iter++;
199 if (deleted.count(*prev) > 0)
200 chunks->erase(prev);
204 bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
205 base::MD5Digest calculated_digest;
206 base::MD5IntermediateFinal(&calculated_digest, context);
208 base::MD5Digest file_digest;
209 if (!ReadItem(&file_digest, fp, context))
210 return false;
212 return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
215 // Helper function to read the file header and chunk TOC. Rewinds |fp| and
216 // initializes |context|. The header is left in |header|, with the version
217 // returned. kInvalidVersion is returned for sanity check or checksum failure.
// Everything read here is folded into |context| so the trailing header digest
// (written by WriteHeader()) can be verified, and so callers can continue
// accumulating the whole-file checksum.
218 int ReadAndVerifyHeader(const base::FilePath& filename,
219 FileHeader* header,
220 std::set<int32>* add_chunks,
221 std::set<int32>* sub_chunks,
222 FILE* fp,
223 base::MD5Context* context) {
224 DCHECK(header);
225 DCHECK(add_chunks);
226 DCHECK(sub_chunks);
227 DCHECK(fp);
228 DCHECK(context);
230 base::MD5Init(context);
231 if (!FileRewind(fp))
232 return kInvalidVersion;
233 if (!ReadItem(header, fp, context))
234 return kInvalidVersion;
// Wrong magic means a foreign or corrupt file; don't trust any field.
235 if (header->magic != kFileMagic)
236 return kInvalidVersion;
238 // Track version read to inform removal of support for older versions.
239 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);
241 if (header->version != kFileVersion)
242 return kInvalidVersion;
// Chunk TOC follows the fixed header: add chunk ids, then sub chunk ids.
244 if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
245 !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
246 return kInvalidVersion;
249 // Verify that the data read thus far is valid.
250 if (!ReadAndVerifyChecksum(fp, context)) {
251 RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
252 return kInvalidVersion;
255 return kFileVersion;
258 // Helper function to write out the initial header and chunks-contained data.
259 // Rewinds |fp|, initializes |context|, then writes a file header and
260 // |add_chunks| and |sub_chunks|.
// An |out_stride| of 0 is stored as-is; readers interpret 0 as a single
// shard covering the whole prefix space (see ReadDbStateHelper()).
261 bool WriteHeader(uint32 out_stride,
262 const std::set<int32>& add_chunks,
263 const std::set<int32>& sub_chunks,
264 FILE* fp,
265 base::MD5Context* context) {
266 if (!FileRewind(fp))
267 return false;
269 base::MD5Init(context);
270 FileHeader header;
271 header.magic = kFileMagic;
272 header.version = kFileVersion;
273 header.add_chunk_count = add_chunks.size();
274 header.sub_chunk_count = sub_chunks.size();
275 header.shard_stride = out_stride;
276 if (!WriteItem(header, fp, context))
277 return false;
279 if (!WriteContainer(add_chunks, fp, context) ||
280 !WriteContainer(sub_chunks, fp, context))
281 return false;
283 // Write out the header digest.
// The digest is also folded into |context| (WriteItem does this), matching
// how ReadAndVerifyHeader() folds it back in on the read side.
284 base::MD5Digest header_digest;
285 base::MD5IntermediateFinal(&header_digest, context);
286 if (!WriteItem(header_digest, fp, context))
287 return false;
289 return true;
// Return true if |beg|..|end| is sorted (non-decreasing) under |less|.
// Empty and single-element ranges are trivially sorted. In debug builds the
// DCHECK fires on the first out-of-order pair.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  // Compare every adjacent pair. The previous condition, (end - beg) > 2,
  // exited one pair early, so an out-of-order final element was never
  // detected; > 1 checks the full range.
  while ((end - beg) > 1) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
    if (less(*beg, *n))
      return false;
  }
  return true;
}
304 // Merge |beg|..|end| into |container|. Both should be sorted by the given
305 // comparator, and the range iterators should not be derived from |container|.
306 // Differs from std::inplace_merge() in that additional memory is not required
307 // for linear performance.
// Used by StateInternal::MergeDataAndProcess() to fold update data into the
// on-disk shard data.
308 template <typename CT, typename CTI, typename COMP>
309 void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
310 DCHECK(sorted(container->begin(), container->end(), less));
311 DCHECK(sorted(beg, end, less));
313 // Size the container to fit the results.
314 const size_t c_size = container->size();
315 container->resize(c_size + (end - beg));
317 // |c_end| points to the original endpoint, while |c_out| points to the
318 // endpoint that will scan from end to beginning while merging.
319 typename CT::iterator c_end = container->begin() + c_size;
320 typename CT::iterator c_out = container->end();
322 // While both inputs have data, move the greater to |c_out|.
// Scanning back-to-front lets the merge run in place: output never
// overtakes the unread portion of the original data.
323 while (c_end != container->begin() && end != beg) {
324 if (less(*(c_end - 1), *(end - 1))) {
325 *(--c_out) = *(--end);
326 } else {
327 *(--c_out) = *(--c_end);
331 // Copy any data remaining in the new range.
332 if (end != beg) {
333 // The original container data has been fully shifted.
334 DCHECK(c_end == container->begin());
336 // There is exactly the correct amount of space left.
337 DCHECK_EQ(c_out - c_end, end - beg);
339 std::copy(beg, end, container->begin());
342 DCHECK(sorted(container->begin(), container->end(), less));
345 // Collection of iterators used while stepping through StateInternal (see
346 // below).
// The four iterator members advance in lock-step over one shard's worth of
// data; they are public and indexed directly by StateInternal's ShardEnd(),
// MergeDataAndProcess() and WriteShard().
347 class StateInternalPos {
348 public:
349 StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
350 SBSubPrefixes::iterator sub_prefixes_iter,
351 std::vector<SBAddFullHash>::iterator add_hashes_iter,
352 std::vector<SBSubFullHash>::iterator sub_hashes_iter)
353 : add_prefixes_iter_(add_prefixes_iter),
354 sub_prefixes_iter_(sub_prefixes_iter),
355 add_hashes_iter_(add_hashes_iter),
356 sub_hashes_iter_(sub_hashes_iter) {
359 SBAddPrefixes::iterator add_prefixes_iter_;
360 SBSubPrefixes::iterator sub_prefixes_iter_;
361 std::vector<SBAddFullHash>::iterator add_hashes_iter_;
362 std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
365 // Helper to find the next shard boundary.
366 template <class T>
367 bool prefix_bounder(SBPrefix val, const T& elt) {
368 return val < elt.GetAddPrefix();
371 // Container for partial database state. Includes add/sub prefixes/hashes, plus
372 // aggregate operations on same.
// Everything is public, including the four data members at the bottom, which
// DoUpdate() and ReadDbStateHelper()'s callers access directly.
373 class StateInternal {
374 public:
375 // Append indicated amount of data from |fp|.
// Bytes read are folded into |context| when it is non-NULL.
376 bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
377 size_t add_hash_count, size_t sub_hash_count,
378 FILE* fp, base::MD5Context* context) {
379 return
380 ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
381 ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
382 ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
383 ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
// Drop all accumulated data (capacity behavior is vector's default).
386 void ClearData() {
387 add_prefixes_.clear();
388 sub_prefixes_.clear();
389 add_full_hashes_.clear();
390 sub_full_hashes_.clear();
393 // Merge data from |beg|..|end| into receiver's state, then process the state.
394 // The current state and the range given should corrospond to the same sorted
395 // shard of data from different sources. |add_del_cache| and |sub_del_cache|
396 // indicate the chunk ids which should be deleted during processing (see
397 // SBProcessSubs).
398 void MergeDataAndProcess(const StateInternalPos& beg,
399 const StateInternalPos& end,
400 const base::hash_set<int32>& add_del_cache,
401 const base::hash_set<int32>& sub_del_cache) {
402 container_merge(&add_prefixes_,
403 beg.add_prefixes_iter_,
404 end.add_prefixes_iter_,
405 SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
407 container_merge(&sub_prefixes_,
408 beg.sub_prefixes_iter_,
409 end.sub_prefixes_iter_,
410 SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
412 container_merge(&add_full_hashes_,
413 beg.add_hashes_iter_,
414 end.add_hashes_iter_,
415 SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
417 container_merge(&sub_full_hashes_,
418 beg.sub_hashes_iter_,
419 end.sub_hashes_iter_,
420 SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
422 SBProcessSubs(&add_prefixes_, &sub_prefixes_,
423 &add_full_hashes_, &sub_full_hashes_,
424 add_del_cache, sub_del_cache);
427 // Sort the data appropriately for the sharding, merging, and processing
428 // operations.
429 void SortData() {
430 std::sort(add_prefixes_.begin(), add_prefixes_.end(),
431 SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
432 std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
433 SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
434 std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
435 SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
436 std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
437 SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
440 // Iterator from the beginning of the state's data.
441 StateInternalPos StateBegin() {
442 return StateInternalPos(add_prefixes_.begin(),
443 sub_prefixes_.begin(),
444 add_full_hashes_.begin(),
445 sub_full_hashes_.begin());
448 // An iterator pointing just after the last possible element of the shard
449 // indicated by |shard_max|. Used to step through the state by shard.
450 // TODO(shess): Verify whether binary search really improves over linear.
451 // Merging or writing will immediately touch all of these elements.
452 StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
453 return StateInternalPos(
454 std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
455 shard_max, prefix_bounder<SBAddPrefix>),
456 std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
457 shard_max, prefix_bounder<SBSubPrefix>),
458 std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
459 shard_max, prefix_bounder<SBAddFullHash>),
460 std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
461 shard_max, prefix_bounder<SBSubFullHash>));
464 // Write a shard header and data for the shard starting at |beg| and ending at
465 // the element before |end|.
// Counts are derived from the iterator distances, so |beg| and |end| must
// come from this object's own containers.
466 bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
467 FILE* fp, base::MD5Context* context) {
468 ShardHeader shard_header;
469 shard_header.add_prefix_count =
470 end.add_prefixes_iter_ - beg.add_prefixes_iter_;
471 shard_header.sub_prefix_count =
472 end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
473 shard_header.add_hash_count =
474 end.add_hashes_iter_ - beg.add_hashes_iter_;
475 shard_header.sub_hash_count =
476 end.sub_hashes_iter_ - beg.sub_hashes_iter_;
478 return
479 WriteItem(shard_header, fp, context) &&
480 WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
481 fp, context) &&
482 WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
483 fp, context) &&
484 WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
485 fp, context) &&
486 WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
487 fp, context);
490 SBAddPrefixes add_prefixes_;
491 SBSubPrefixes sub_prefixes_;
492 std::vector<SBAddFullHash> add_full_hashes_;
493 std::vector<SBSubFullHash> sub_full_hashes_;
// Return true if |val| is a (nonzero) power of two. Works for any integral
// type; zero is explicitly not a power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  if (!val)
    return false;
  // A power of two has exactly one bit set, so clearing the lowest set bit
  // leaves zero.
  return (val & (val - 1)) == 0;
}
502 // Helper to read the entire database state, used by GetAddPrefixes() and
503 // GetAddFullHashes(). Those functions are generally used only for smaller
504 // files. Returns false in case of errors reading the data.
505 bool ReadDbStateHelper(const base::FilePath& filename,
506 StateInternal* db_state) {
507 base::ScopedFILE file(base::OpenFile(filename, "rb"));
508 if (file.get() == NULL)
509 return false;
// Chunk TOC is read but unused here; the header read also primes |context|.
511 std::set<int32> add_chunks;
512 std::set<int32> sub_chunks;
514 base::MD5Context context;
515 FileHeader header;
516 const int version =
517 ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
518 file.get(), &context);
519 if (version == kInvalidVersion)
520 return false;
// A zero stride in the header means one shard covering the whole prefix
// space (the uint32 field wraps when kMaxShardStride is stored).
522 uint64 in_min = 0;
523 uint64 in_stride = header.shard_stride;
524 if (!in_stride)
525 in_stride = kMaxShardStride;
526 if (!IsPowerOfTwo(in_stride))
527 return false;
// Walk every shard in prefix order, appending its contents to |db_state|.
529 do {
530 ShardHeader shard_header;
531 if (!ReadItem(&shard_header, file.get(), &context))
532 return false;
534 if (!db_state->AppendData(shard_header.add_prefix_count,
535 shard_header.sub_prefix_count,
536 shard_header.add_hash_count,
537 shard_header.sub_hash_count,
538 file.get(), &context)) {
539 return false;
542 in_min += in_stride;
543 } while (in_min <= kMaxSBPrefix);
545 if (!ReadAndVerifyChecksum(file.get(), &context))
546 return false;
// The checksum must also sit exactly at end-of-file; trailing data means
// the file is malformed.
548 int64 size = 0;
549 if (!base::GetFileSize(filename, &size))
550 return false;
552 return static_cast<int64>(ftell(file.get())) == size;
555 } // namespace
// Members start in the "no update in progress" state.
557 SafeBrowsingStoreFile::SafeBrowsingStoreFile()
558 : chunks_written_(0), empty_(false), corruption_seen_(false) {}
560 SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
561 // Thread-checking is disabled in the destructor due to crbug.com/338486.
562 DetachFromThread();
564 Close();
567 bool SafeBrowsingStoreFile::Delete() {
568 DCHECK(CalledOnValidThread());
570 // The database should not be open at this point. But, just in
571 // case, close everything before deleting.
572 if (!Close()) {
573 NOTREACHED();
574 return false;
// DeleteStore() removes the store file (and related files) for |filename_|.
577 return DeleteStore(filename_);
// Re-reads the entire open store file and verifies its trailing MD5 digest.
// Any failure is routed through OnCorruptDatabase() (which returns false).
580 bool SafeBrowsingStoreFile::CheckValidity() {
581 DCHECK(CalledOnValidThread());
583 // The file was either empty or never opened. The empty case is
584 // presumed not to be invalid. The never-opened case can happen if
585 // BeginUpdate() fails for any databases, and should already have
586 // caused the corruption callback to fire.
587 if (!file_.get())
588 return true;
590 if (!FileRewind(file_.get()))
591 return OnCorruptDatabase();
593 int64 size = 0;
594 if (!base::GetFileSize(filename_, &size))
595 return OnCorruptDatabase();
597 base::MD5Context context;
598 base::MD5Init(&context);
600 // Read everything except the final digest.
// The CHECK guards against truncation when casting int64 -> size_t on
// 32-bit builds.
601 size_t bytes_left = static_cast<size_t>(size);
602 CHECK(size == static_cast<int64>(bytes_left));
603 if (bytes_left < sizeof(base::MD5Digest))
604 return OnCorruptDatabase();
605 bytes_left -= sizeof(base::MD5Digest);
607 // Fold the contents of the file into the checksum.
608 while (bytes_left > 0) {
609 char buf[4096];
610 const size_t c = std::min(sizeof(buf), bytes_left);
611 const size_t ret = fread(buf, 1, c, file_.get());
613 // The file's size changed while reading, give up.
614 if (ret != c)
615 return OnCorruptDatabase();
616 base::MD5Update(&context, base::StringPiece(buf, c));
617 bytes_left -= c;
620 if (!ReadAndVerifyChecksum(file_.get(), &context)) {
621 RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
622 return OnCorruptDatabase();
625 return true;
// Record the backing filename and the callback to invoke on corruption.
// No file I/O happens here.
628 void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
629 const base::Closure& corruption_callback) {
630 DCHECK(CalledOnValidThread());
631 filename_ = filename;
632 corruption_callback_ = corruption_callback;
// Reset the per-chunk accumulation buffers before a new chunk starts.
635 bool SafeBrowsingStoreFile::BeginChunk() {
636 DCHECK(CalledOnValidThread());
637 return ClearChunkBuffers();
// Buffer one add prefix for the current chunk; flushed by FinishChunk().
640 bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
641 DCHECK(CalledOnValidThread());
642 add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
643 return true;
// Load every add prefix from the store file. A missing file yields an empty
// result and success; a malformed file reports corruption.
646 bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
647 DCHECK(CalledOnValidThread());
649 add_prefixes->clear();
650 if (!base::PathExists(filename_))
651 return true;
653 StateInternal db_state;
654 if (!ReadDbStateHelper(filename_, &db_state))
655 return OnCorruptDatabase();
// swap() hands the loaded data to the caller without copying.
657 add_prefixes->swap(db_state.add_prefixes_);
658 return true;
// Same as GetAddPrefixes(), but returns the add full hashes.
661 bool SafeBrowsingStoreFile::GetAddFullHashes(
662 std::vector<SBAddFullHash>* add_full_hashes) {
663 DCHECK(CalledOnValidThread());
665 add_full_hashes->clear();
666 if (!base::PathExists(filename_))
667 return true;
669 StateInternal db_state;
670 if (!ReadDbStateHelper(filename_, &db_state))
671 return OnCorruptDatabase();
673 add_full_hashes->swap(db_state.add_full_hashes_);
674 return true;
// Buffer one add full hash for the current chunk; flushed by FinishChunk().
677 bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
678 const SBFullHash& full_hash) {
679 DCHECK(CalledOnValidThread());
680 add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
681 return true;
// Buffer one sub prefix (with the add chunk it subtracts from).
684 bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
685 int32 add_chunk_id,
686 SBPrefix prefix) {
687 DCHECK(CalledOnValidThread());
688 sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
689 return true;
// Buffer one sub full hash (with the add chunk it subtracts from).
692 bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
693 const SBFullHash& full_hash) {
694 DCHECK(CalledOnValidThread());
695 sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
696 return true;
// Record corruption (once per instance, for histogram purposes) and notify
// the owner via the corruption callback.
699 bool SafeBrowsingStoreFile::OnCorruptDatabase() {
700 DCHECK(CalledOnValidThread());
702 if (!corruption_seen_)
703 RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
704 corruption_seen_ = true;
706 corruption_callback_.Run();
708 // Return false as a convenience to callers.
709 return false;
// Drop any in-progress update state and release both file handles.
712 bool SafeBrowsingStoreFile::Close() {
713 DCHECK(CalledOnValidThread());
715 ClearUpdateBuffers();
717 // Make sure the files are closed.
718 file_.reset();
719 new_file_.reset();
720 return true;
// Open the existing store (if any) and a temporary file for the new data;
// the temporary file is swizzled into place when the update completes (see
// DoUpdate()). Reads and verifies the existing header, flagging deprecated
// or unknown formats.
723 bool SafeBrowsingStoreFile::BeginUpdate() {
724 DCHECK(CalledOnValidThread());
725 DCHECK(!file_.get() && !new_file_.get());
727 // Structures should all be clear unless something bad happened.
728 DCHECK(add_chunks_cache_.empty());
729 DCHECK(sub_chunks_cache_.empty());
730 DCHECK(add_del_cache_.empty());
731 DCHECK(sub_del_cache_.empty());
732 DCHECK(add_prefixes_.empty());
733 DCHECK(sub_prefixes_.empty());
734 DCHECK(add_hashes_.empty());
735 DCHECK(sub_hashes_.empty());
736 DCHECK_EQ(chunks_written_, 0);
738 corruption_seen_ = false;
740 const base::FilePath new_filename = TemporaryFileForFilename(filename_);
741 base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
742 if (new_file.get() == NULL)
743 return false;
745 base::ScopedFILE file(base::OpenFile(filename_, "rb"));
746 empty_ = (file.get() == NULL);
747 if (empty_) {
748 // If the file exists but cannot be opened, try to delete it (not
749 // deleting directly, the bloom filter needs to be deleted, too).
750 if (base::PathExists(filename_))
751 return OnCorruptDatabase();
753 new_file_.swap(new_file);
754 return true;
757 base::MD5Context context;
758 FileHeader header;
759 const int version =
760 ReadAndVerifyHeader(filename_, &header,
761 &add_chunks_cache_, &sub_chunks_cache_,
762 file.get(), &context);
// On header failure, re-read the raw header (without checksumming) purely
// to classify the failure for histograms: old-but-valid magic counts as
// deprecated, anything else as unknown.
763 if (version == kInvalidVersion) {
764 FileHeader retry_header;
765 if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
766 if (retry_header.magic == kFileMagic &&
767 retry_header.version < kFileVersion) {
768 RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
769 } else {
770 RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
774 // Close the file so that it can be deleted.
775 file.reset();
777 return OnCorruptDatabase();
780 file_.swap(file);
781 new_file_.swap(new_file);
782 return true;
785 bool SafeBrowsingStoreFile::FinishChunk() {
786 DCHECK(CalledOnValidThread());
788 if (!add_prefixes_.size() && !sub_prefixes_.size() &&
789 !add_hashes_.size() && !sub_hashes_.size())
790 return true;
792 ChunkHeader header;
793 header.add_prefix_count = add_prefixes_.size();
794 header.sub_prefix_count = sub_prefixes_.size();
795 header.add_hash_count = add_hashes_.size();
796 header.sub_hash_count = sub_hashes_.size();
797 if (!WriteItem(header, new_file_.get(), NULL))
798 return false;
800 if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
801 !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
802 !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
803 !WriteContainer(sub_hashes_, new_file_.get(), NULL))
804 return false;
806 ++chunks_written_;
808 // Clear everything to save memory.
809 return ClearChunkBuffers();
812 bool SafeBrowsingStoreFile::DoUpdate(
813 safe_browsing::PrefixSetBuilder* builder,
814 std::vector<SBAddFullHash>* add_full_hashes_result) {
815 DCHECK(CalledOnValidThread());
816 DCHECK(file_.get() || empty_);
817 DCHECK(new_file_.get());
818 CHECK(builder);
819 CHECK(add_full_hashes_result);
821 // Rewind the temporary storage.
822 if (!FileRewind(new_file_.get()))
823 return false;
825 // Get chunk file's size for validating counts.
826 int64 update_size = 0;
827 if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
828 return OnCorruptDatabase();
830 // Track update size to answer questions at http://crbug.com/72216 .
831 // Log small updates as 1k so that the 0 (underflow) bucket can be
832 // used for "empty" in SafeBrowsingDatabase.
833 UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
834 std::max(static_cast<int>(update_size / 1024), 1));
836 // Chunk updates to integrate.
837 StateInternal new_state;
839 // Read update chunks.
840 for (int i = 0; i < chunks_written_; ++i) {
841 ChunkHeader header;
843 int64 ofs = ftell(new_file_.get());
844 if (ofs == -1)
845 return false;
847 if (!ReadItem(&header, new_file_.get(), NULL))
848 return false;
850 // As a safety measure, make sure that the header describes a sane
851 // chunk, given the remaining file size.
852 int64 expected_size = ofs + sizeof(ChunkHeader);
853 expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
854 expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
855 expected_size += header.add_hash_count * sizeof(SBAddFullHash);
856 expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
857 if (expected_size > update_size)
858 return false;
860 if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
861 header.add_hash_count, header.sub_hash_count,
862 new_file_.get(), NULL)) {
863 return false;
867 // The state was accumulated by chunk, sort by prefix.
868 new_state.SortData();
870 // These strides control how much data is loaded into memory per pass.
871 // Strides must be an even power of two. |in_stride| will be derived from the
872 // input file. |out_stride| will be derived from an estimate of the resulting
873 // file's size. |process_stride| will be the max of both.
874 uint64 in_stride = kMaxShardStride;
875 uint64 out_stride = kMaxShardStride;
876 uint64 process_stride = 0;
878 // Used to verify the input's checksum if |!empty_|.
879 base::MD5Context in_context;
881 if (!empty_) {
882 DCHECK(file_.get());
884 FileHeader header = {0};
885 int version = ReadAndVerifyHeader(filename_, &header,
886 &add_chunks_cache_, &sub_chunks_cache_,
887 file_.get(), &in_context);
888 if (version == kInvalidVersion)
889 return OnCorruptDatabase();
891 if (header.shard_stride)
892 in_stride = header.shard_stride;
894 // The header checksum should have prevented this case, but the code will be
895 // broken if this is not correct.
896 if (!IsPowerOfTwo(in_stride))
897 return OnCorruptDatabase();
900 // We no longer need to track deleted chunks.
901 DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
902 DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);
904 // Calculate |out_stride| to break the file down into reasonable shards.
906 int64 original_size = 0;
907 if (!empty_ && !base::GetFileSize(filename_, &original_size))
908 return OnCorruptDatabase();
910 // Approximate the final size as everything. Subs and deletes will reduce
911 // the size, but modest over-sharding won't hurt much.
912 int64 shard_size = original_size + update_size;
914 // Keep splitting until a single stride of data fits the target.
915 size_t shifts = 0;
916 while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
917 out_stride >>= 1;
918 shard_size >>= 1;
919 ++shifts;
921 UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);
923 DCHECK(IsPowerOfTwo(out_stride));
926 // Outer loop strides by the max of the input stride (to read integral shards)
927 // and the output stride (to write integral shards).
928 process_stride = std::max(in_stride, out_stride);
929 DCHECK(IsPowerOfTwo(process_stride));
930 DCHECK_EQ(0u, process_stride % in_stride);
931 DCHECK_EQ(0u, process_stride % out_stride);
933 // Start writing the new data to |new_file_|.
934 base::MD5Context out_context;
935 if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
936 new_file_.get(), &out_context)) {
937 return false;
940 // Start at the beginning of the SBPrefix space.
941 uint64 in_min = 0;
942 uint64 out_min = 0;
943 uint64 process_min = 0;
945 // Start at the beginning of the updates.
946 StateInternalPos new_pos = new_state.StateBegin();
948 // Re-usable container for shard processing.
949 StateInternal db_state;
951 // Track aggregate counts for histograms.
952 size_t add_prefix_count = 0;
953 size_t sub_prefix_count = 0;
955 do {
956 // Maximum element in the current shard.
957 SBPrefix process_max =
958 static_cast<SBPrefix>(process_min + process_stride - 1);
959 DCHECK_GT(process_max, process_min);
961 // Drop the data from previous pass.
962 db_state.ClearData();
964 // Fill the processing shard with one or more input shards.
965 if (!empty_) {
966 do {
967 ShardHeader shard_header;
968 if (!ReadItem(&shard_header, file_.get(), &in_context))
969 return OnCorruptDatabase();
971 if (!db_state.AppendData(shard_header.add_prefix_count,
972 shard_header.sub_prefix_count,
973 shard_header.add_hash_count,
974 shard_header.sub_hash_count,
975 file_.get(), &in_context))
976 return OnCorruptDatabase();
978 in_min += in_stride;
979 } while (in_min <= kMaxSBPrefix && in_min < process_max);
982 // Shard the update data to match the database data, then merge the update
983 // data and process the results.
985 StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
986 db_state.MergeDataAndProcess(new_pos, new_end,
987 add_del_cache_, sub_del_cache_);
988 new_pos = new_end;
991 // Collect the processed data for return to caller.
992 for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
993 builder->AddPrefix(db_state.add_prefixes_[i].prefix);
995 add_full_hashes_result->insert(add_full_hashes_result->end(),
996 db_state.add_full_hashes_.begin(),
997 db_state.add_full_hashes_.end());
998 add_prefix_count += db_state.add_prefixes_.size();
999 sub_prefix_count += db_state.sub_prefixes_.size();
1001 // Write one or more shards of processed output.
1002 StateInternalPos out_pos = db_state.StateBegin();
1003 do {
1004 SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
1005 DCHECK_GT(out_max, out_min);
1007 StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
1008 if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
1009 return false;
1010 out_pos = out_end;
1012 out_min += out_stride;
1013 } while (out_min == static_cast<SBPrefix>(out_min) &&
1014 out_min < process_max);
1016 process_min += process_stride;
1017 } while (process_min <= kMaxSBPrefix);
1019 // Verify the overall checksum.
1020 if (!empty_) {
1021 if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
1022 RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
1023 return OnCorruptDatabase();
1026 // TODO(shess): Verify EOF?
1028 // Close the input file so the new file can be renamed over it.
1029 file_.reset();
1031 DCHECK(!file_.get());
1033 // Write the overall checksum.
1034 base::MD5Digest out_digest;
1035 base::MD5Final(&out_digest, &out_context);
1036 if (!WriteItem(out_digest, new_file_.get(), NULL))
1037 return false;
1039 // Trim any excess left over from the temporary chunk data.
1040 if (!base::TruncateFile(new_file_.get()))
1041 return false;
1043 // Close the file handle and swizzle the file into place.
1044 new_file_.reset();
1045 if (!base::DeleteFile(filename_, false) &&
1046 base::PathExists(filename_))
1047 return false;
1049 const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1050 if (!base::Move(new_filename, filename_))
1051 return false;
1053 // Record counts before swapping to caller.
1054 UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
1055 UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);
1057 return true;
1060 bool SafeBrowsingStoreFile::FinishUpdate(
1061 safe_browsing::PrefixSetBuilder* builder,
1062 std::vector<SBAddFullHash>* add_full_hashes_result) {
1063 DCHECK(CalledOnValidThread());
1064 DCHECK(builder);
1065 DCHECK(add_full_hashes_result);
1067 if (!DoUpdate(builder, add_full_hashes_result)) {
1068 CancelUpdate();
1069 return false;
1072 DCHECK(!new_file_.get());
1073 DCHECK(!file_.get());
1075 return Close();
1078 bool SafeBrowsingStoreFile::CancelUpdate() {
1079 DCHECK(CalledOnValidThread());
1080 bool ret = Close();
1082 // Delete stale staging file.
1083 const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1084 base::DeleteFile(new_filename, false);
1086 return ret;
// Records |chunk_id| in the in-memory cache of add chunks; inserting an
// id that is already present is a no-op.
void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  add_chunks_cache_.insert(chunk_id);
}
1094 bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
1095 DCHECK(CalledOnValidThread());
1096 return add_chunks_cache_.count(chunk_id) > 0;
1099 void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
1100 DCHECK(CalledOnValidThread());
1101 out->clear();
1102 out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
// Records |chunk_id| in the in-memory cache of sub chunks; inserting an
// id that is already present is a no-op.
void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  sub_chunks_cache_.insert(chunk_id);
}
1110 bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
1111 DCHECK(CalledOnValidThread());
1112 return sub_chunks_cache_.count(chunk_id) > 0;
1115 void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
1116 DCHECK(CalledOnValidThread());
1117 out->clear();
1118 out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
// Marks add chunk |chunk_id| for deletion by recording it in the add
// deletion cache (consumed during update processing).
void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  add_del_cache_.insert(chunk_id);
}
// Marks sub chunk |chunk_id| for deletion by recording it in the sub
// deletion cache (consumed during update processing).
void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  sub_del_cache_.insert(chunk_id);
}
1131 // static
1132 bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
1133 if (!base::DeleteFile(basename, false) &&
1134 base::PathExists(basename)) {
1135 NOTREACHED();
1136 return false;
1139 const base::FilePath new_filename = TemporaryFileForFilename(basename);
1140 if (!base::DeleteFile(new_filename, false) &&
1141 base::PathExists(new_filename)) {
1142 NOTREACHED();
1143 return false;
1146 // With SQLite support gone, one way to get to this code is if the
1147 // existing file is a SQLite file. Make sure the journal file is
1148 // also removed.
1149 const base::FilePath journal_filename(
1150 basename.value() + FILE_PATH_LITERAL("-journal"));
1151 if (base::PathExists(journal_filename))
1152 base::DeleteFile(journal_filename, false);
1154 return true;