Roll src/third_party/WebKit eac3800:0237a66 (svn 202606:202607)
[chromium-blink-merge.git] / chrome / browser / safe_browsing / safe_browsing_store_file.cc
blob60fdb4da8de9b904f302e88340da7640114f2206
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
7 #include "base/files/file_util.h"
8 #include "base/files/scoped_file.h"
9 #include "base/md5.h"
10 #include "base/metrics/histogram.h"
11 #include "base/metrics/sparse_histogram.h"
13 namespace {
15 // NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
16 // that byte-order changes force corruption.
17 const int32 kFileMagic = 0x600D71FE;
19 // Version history:
20 // Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
21 // Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
22 // Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
23 const int32 kFileVersion = 8;
25 // ReadAndVerifyHeader() returns this in case of error.
26 const int32 kInvalidVersion = -1;
28 // Starting with version 8, the storage is sorted and can be sharded to allow
29 // updates to be done with lower memory requirements. Newly written files will
30 // be sharded to need less than this amount of memory during update. Larger
31 // values are preferred to minimize looping overhead during processing.
32 const int64 kUpdateStorageBytes = 100 * 1024;
34 // Prevent excessive sharding by setting a lower limit on the shard stride.
35 // Smaller values should work fine, but very small values will probably lead to
36 // poor performance. Shard stride is indirectly related to
37 // |kUpdateStorageBytes|, setting that very small will bump against this.
38 const uint32 kMinShardStride = 1 << 24;
40 // Strides over the entire SBPrefix space.
41 const uint64 kMaxShardStride = 1ULL << 32;
43 // Maximum SBPrefix value.
44 const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;
46 // Header at the front of the main database file.
47 struct FileHeader {
48 int32 magic, version;
49 uint32 add_chunk_count, sub_chunk_count;
50 uint32 shard_stride;
51 // TODO(shess): Is this where 64-bit will bite me? Perhaps write a
52 // specialized read/write?
55 // Header for each chunk in the chunk-accumulation file.
56 struct ChunkHeader {
57 uint32 add_prefix_count, sub_prefix_count;
58 uint32 add_hash_count, sub_hash_count;
61 // Header for each shard of data in the main database file.
62 struct ShardHeader {
63 uint32 add_prefix_count, sub_prefix_count;
64 uint32 add_hash_count, sub_hash_count;
67 // Enumerate different format-change events for histogramming
68 // purposes. DO NOT CHANGE THE ORDERING OF THESE VALUES.
69 enum FormatEventType {
70 // Corruption detected, broken down by file format.
71 FORMAT_EVENT_FILE_CORRUPT,
72 FORMAT_EVENT_SQLITE_CORRUPT, // Obsolete
74 // The type of format found in the file. The expected case (new
75 // file format) is intentionally not covered.
76 FORMAT_EVENT_FOUND_SQLITE, // Obsolete
77 FORMAT_EVENT_FOUND_UNKNOWN, // magic does not match.
79 // The number of SQLite-format files deleted should be the same as
80 // FORMAT_EVENT_FOUND_SQLITE. It can differ if the delete fails,
81 // or if a failure prevents the update from succeeding.
82 FORMAT_EVENT_SQLITE_DELETED, // Obsolete
83 FORMAT_EVENT_SQLITE_DELETE_FAILED, // Obsolete
85 // Found and deleted (or failed to delete) the ancient "Safe
86 // Browsing" file.
87 FORMAT_EVENT_DELETED_ORIGINAL, // Obsolete
88 FORMAT_EVENT_DELETED_ORIGINAL_FAILED, // Obsolete
90 // The checksum did not check out in CheckValidity() or in
91 // FinishUpdate(). This most likely indicates that the machine
92 // crashed before the file was fully sync'ed to disk.
93 FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
94 FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,
96 // The header checksum was incorrect in ReadAndVerifyHeader(). Likely
97 // indicates that the system crashed while writing an update.
98 FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,
100 FORMAT_EVENT_FOUND_DEPRECATED, // version too old.
102 // Memory space for histograms is determined by the max. ALWAYS
103 // ADD NEW VALUES BEFORE THIS ONE.
104 FORMAT_EVENT_MAX
107 void RecordFormatEvent(FormatEventType event_type) {
108 UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
111 // Rewind the file. Using fseek(2) because rewind(3) errors are
112 // weird.
113 bool FileRewind(FILE* fp) {
114 int rv = fseek(fp, 0, SEEK_SET);
115 DCHECK_EQ(rv, 0);
116 return rv == 0;
119 // Read from |fp| into |item|, and fold the input data into the
120 // checksum in |context|, if non-NULL. Return true on success.
121 template <class T>
122 bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
123 const size_t ret = fread(item, sizeof(T), 1, fp);
124 if (ret != 1)
125 return false;
127 if (context) {
128 base::MD5Update(context,
129 base::StringPiece(reinterpret_cast<char*>(item),
130 sizeof(T)));
132 return true;
135 // Write |item| to |fp|, and fold the output data into the checksum in
136 // |context|, if non-NULL. Return true on success.
137 template <class T>
138 bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
139 const size_t ret = fwrite(&item, sizeof(T), 1, fp);
140 if (ret != 1)
141 return false;
143 if (context) {
144 base::MD5Update(context,
145 base::StringPiece(reinterpret_cast<const char*>(&item),
146 sizeof(T)));
149 return true;
152 // Read |count| items into |values| from |fp|, and fold them into the
153 // checksum in |context|. Returns true on success.
154 template <typename CT>
155 bool ReadToContainer(CT* values, size_t count, FILE* fp,
156 base::MD5Context* context) {
157 if (!count)
158 return true;
160 for (size_t i = 0; i < count; ++i) {
161 typename CT::value_type value;
162 if (!ReadItem(&value, fp, context))
163 return false;
165 // push_back() is more obvious, but coded this way std::set can
166 // also be read.
167 values->insert(values->end(), value);
170 return true;
173 // Write values between |beg| and |end| to |fp|, and fold the data into the
174 // checksum in |context|, if non-NULL. Returns true if all items successful.
175 template <typename CTI>
176 bool WriteRange(const CTI& beg, const CTI& end,
177 FILE* fp, base::MD5Context* context) {
178 for (CTI iter = beg; iter != end; ++iter) {
179 if (!WriteItem(*iter, fp, context))
180 return false;
182 return true;
185 // Write all of |values| to |fp|, and fold the data into the checksum
186 // in |context|, if non-NULL. Returns true if all items successful.
187 template <typename CT>
188 bool WriteContainer(const CT& values, FILE* fp,
189 base::MD5Context* context) {
190 return WriteRange(values.begin(), values.end(), fp, context);
193 // Delete the chunks in |deleted| from |chunks|.
194 void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
195 std::set<int32>* chunks) {
196 for (std::set<int32>::iterator iter = chunks->begin();
197 iter != chunks->end();) {
198 std::set<int32>::iterator prev = iter++;
199 if (deleted.count(*prev) > 0)
200 chunks->erase(prev);
204 bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
205 base::MD5Digest calculated_digest;
206 base::MD5IntermediateFinal(&calculated_digest, context);
208 base::MD5Digest file_digest;
209 if (!ReadItem(&file_digest, fp, context))
210 return false;
212 return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
215 // Helper function to read the file header and chunk TOC. Rewinds |fp| and
216 // initializes |context|. The header is left in |header|, with the version
217 // returned. kInvalidVersion is returned for sanity check or checksum failure.
218 int ReadAndVerifyHeader(const base::FilePath& filename,
219 FileHeader* header,
220 std::set<int32>* add_chunks,
221 std::set<int32>* sub_chunks,
222 FILE* fp,
223 base::MD5Context* context) {
224 DCHECK(header);
225 DCHECK(add_chunks);
226 DCHECK(sub_chunks);
227 DCHECK(fp);
228 DCHECK(context);
230 base::MD5Init(context);
231 if (!FileRewind(fp))
232 return kInvalidVersion;
233 if (!ReadItem(header, fp, context))
234 return kInvalidVersion;
235 if (header->magic != kFileMagic)
236 return kInvalidVersion;
238 // Track version read to inform removal of support for older versions.
239 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);
241 if (header->version != kFileVersion)
242 return kInvalidVersion;
244 if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
245 !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
246 return kInvalidVersion;
249 // Verify that the data read thus far is valid.
250 if (!ReadAndVerifyChecksum(fp, context)) {
251 RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
252 return kInvalidVersion;
255 return kFileVersion;
258 // Helper function to write out the initial header and chunks-contained data.
259 // Rewinds |fp|, initializes |context|, then writes a file header and
260 // |add_chunks| and |sub_chunks|.
261 bool WriteHeader(uint32 out_stride,
262 const std::set<int32>& add_chunks,
263 const std::set<int32>& sub_chunks,
264 FILE* fp,
265 base::MD5Context* context) {
266 if (!FileRewind(fp))
267 return false;
269 base::MD5Init(context);
270 FileHeader header;
271 header.magic = kFileMagic;
272 header.version = kFileVersion;
273 header.add_chunk_count = add_chunks.size();
274 header.sub_chunk_count = sub_chunks.size();
275 header.shard_stride = out_stride;
276 if (!WriteItem(header, fp, context))
277 return false;
279 if (!WriteContainer(add_chunks, fp, context) ||
280 !WriteContainer(sub_chunks, fp, context))
281 return false;
283 // Write out the header digest.
284 base::MD5Digest header_digest;
285 base::MD5IntermediateFinal(&header_digest, context);
286 if (!WriteItem(header_digest, fp, context))
287 return false;
289 return true;
292 // Return |true| if the range is sorted by the given comparator.
293 template <typename CTI, typename LESS>
294 bool sorted(CTI beg, CTI end, LESS less) {
295 while ((end - beg) > 2) {
296 CTI n = beg++;
297 DCHECK(!less(*beg, *n));
298 if (less(*beg, *n))
299 return false;
301 return true;
304 // Merge |beg|..|end| into |container|. Both should be sorted by the given
305 // comparator, and the range iterators should not be derived from |container|.
306 // Differs from std::inplace_merge() in that additional memory is not required
307 // for linear performance.
308 template <typename CT, typename CTI, typename COMP>
309 void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
310 DCHECK(sorted(container->begin(), container->end(), less));
311 DCHECK(sorted(beg, end, less));
313 // Size the container to fit the results.
314 const size_t c_size = container->size();
315 container->resize(c_size + (end - beg));
317 // |c_end| points to the original endpoint, while |c_out| points to the
318 // endpoint that will scan from end to beginning while merging.
319 typename CT::iterator c_end = container->begin() + c_size;
320 typename CT::iterator c_out = container->end();
322 // While both inputs have data, move the greater to |c_out|.
323 while (c_end != container->begin() && end != beg) {
324 if (less(*(c_end - 1), *(end - 1))) {
325 *(--c_out) = *(--end);
326 } else {
327 *(--c_out) = *(--c_end);
331 // Copy any data remaining in the new range.
332 if (end != beg) {
333 // The original container data has been fully shifted.
334 DCHECK(c_end == container->begin());
336 // There is exactly the correct amount of space left.
337 DCHECK_EQ(c_out - c_end, end - beg);
339 std::copy(beg, end, container->begin());
342 DCHECK(sorted(container->begin(), container->end(), less));
345 // Collection of iterators used while stepping through StateInternal (see
346 // below).
347 class StateInternalPos {
348 public:
349 StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
350 SBSubPrefixes::iterator sub_prefixes_iter,
351 std::vector<SBAddFullHash>::iterator add_hashes_iter,
352 std::vector<SBSubFullHash>::iterator sub_hashes_iter)
353 : add_prefixes_iter_(add_prefixes_iter),
354 sub_prefixes_iter_(sub_prefixes_iter),
355 add_hashes_iter_(add_hashes_iter),
356 sub_hashes_iter_(sub_hashes_iter) {
359 SBAddPrefixes::iterator add_prefixes_iter_;
360 SBSubPrefixes::iterator sub_prefixes_iter_;
361 std::vector<SBAddFullHash>::iterator add_hashes_iter_;
362 std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
365 // Helper to find the next shard boundary.
366 template <class T>
367 bool prefix_bounder(SBPrefix val, const T& elt) {
368 return val < elt.GetAddPrefix();
371 // Container for partial database state. Includes add/sub prefixes/hashes, plus
372 // aggregate operations on same.
373 class StateInternal {
374 public:
375 // Append indicated amount of data from |fp|.
376 bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
377 size_t add_hash_count, size_t sub_hash_count,
378 FILE* fp, base::MD5Context* context) {
379 return
380 ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
381 ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
382 ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
383 ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
386 void ClearData() {
387 add_prefixes_.clear();
388 sub_prefixes_.clear();
389 add_full_hashes_.clear();
390 sub_full_hashes_.clear();
393 // Merge data from |beg|..|end| into receiver's state, then process the state.
394 // The current state and the range given should corrospond to the same sorted
395 // shard of data from different sources. |add_del_cache| and |sub_del_cache|
396 // indicate the chunk ids which should be deleted during processing (see
397 // SBProcessSubs).
398 void MergeDataAndProcess(const StateInternalPos& beg,
399 const StateInternalPos& end,
400 const base::hash_set<int32>& add_del_cache,
401 const base::hash_set<int32>& sub_del_cache) {
402 container_merge(&add_prefixes_,
403 beg.add_prefixes_iter_,
404 end.add_prefixes_iter_,
405 SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
407 container_merge(&sub_prefixes_,
408 beg.sub_prefixes_iter_,
409 end.sub_prefixes_iter_,
410 SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
412 container_merge(&add_full_hashes_,
413 beg.add_hashes_iter_,
414 end.add_hashes_iter_,
415 SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
417 container_merge(&sub_full_hashes_,
418 beg.sub_hashes_iter_,
419 end.sub_hashes_iter_,
420 SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
422 SBProcessSubs(&add_prefixes_, &sub_prefixes_,
423 &add_full_hashes_, &sub_full_hashes_,
424 add_del_cache, sub_del_cache);
427 // Sort the data appropriately for the sharding, merging, and processing
428 // operations.
429 void SortData() {
430 std::sort(add_prefixes_.begin(), add_prefixes_.end(),
431 SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
432 std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
433 SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
434 std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
435 SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
436 std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
437 SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
440 // Iterator from the beginning of the state's data.
441 StateInternalPos StateBegin() {
442 return StateInternalPos(add_prefixes_.begin(),
443 sub_prefixes_.begin(),
444 add_full_hashes_.begin(),
445 sub_full_hashes_.begin());
448 // An iterator pointing just after the last possible element of the shard
449 // indicated by |shard_max|. Used to step through the state by shard.
450 // TODO(shess): Verify whether binary search really improves over linear.
451 // Merging or writing will immediately touch all of these elements.
452 StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
453 return StateInternalPos(
454 std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
455 shard_max, prefix_bounder<SBAddPrefix>),
456 std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
457 shard_max, prefix_bounder<SBSubPrefix>),
458 std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
459 shard_max, prefix_bounder<SBAddFullHash>),
460 std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
461 shard_max, prefix_bounder<SBSubFullHash>));
464 // Write a shard header and data for the shard starting at |beg| and ending at
465 // the element before |end|.
466 bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
467 FILE* fp, base::MD5Context* context) {
468 ShardHeader shard_header;
469 shard_header.add_prefix_count =
470 end.add_prefixes_iter_ - beg.add_prefixes_iter_;
471 shard_header.sub_prefix_count =
472 end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
473 shard_header.add_hash_count =
474 end.add_hashes_iter_ - beg.add_hashes_iter_;
475 shard_header.sub_hash_count =
476 end.sub_hashes_iter_ - beg.sub_hashes_iter_;
478 return
479 WriteItem(shard_header, fp, context) &&
480 WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
481 fp, context) &&
482 WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
483 fp, context) &&
484 WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
485 fp, context) &&
486 WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
487 fp, context);
490 SBAddPrefixes add_prefixes_;
491 SBSubPrefixes sub_prefixes_;
492 std::vector<SBAddFullHash> add_full_hashes_;
493 std::vector<SBSubFullHash> sub_full_hashes_;
496 // True if |val| is an even power of two.
497 template <typename T>
498 bool IsPowerOfTwo(const T& val) {
499 return val && (val & (val - 1)) == 0;
502 // Helper to read the entire database state, used by GetAddPrefixes() and
503 // GetAddFullHashes(). Those functions are generally used only for smaller
504 // files. Returns false in case of errors reading the data.
505 bool ReadDbStateHelper(const base::FilePath& filename,
506 StateInternal* db_state) {
507 base::ScopedFILE file(base::OpenFile(filename, "rb"));
508 if (file.get() == NULL)
509 return false;
511 std::set<int32> add_chunks;
512 std::set<int32> sub_chunks;
514 base::MD5Context context;
515 FileHeader header;
516 const int version =
517 ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
518 file.get(), &context);
519 if (version == kInvalidVersion)
520 return false;
522 uint64 in_min = 0;
523 uint64 in_stride = header.shard_stride;
524 if (!in_stride)
525 in_stride = kMaxShardStride;
526 if (!IsPowerOfTwo(in_stride))
527 return false;
529 do {
530 ShardHeader shard_header;
531 if (!ReadItem(&shard_header, file.get(), &context))
532 return false;
534 if (!db_state->AppendData(shard_header.add_prefix_count,
535 shard_header.sub_prefix_count,
536 shard_header.add_hash_count,
537 shard_header.sub_hash_count,
538 file.get(), &context)) {
539 return false;
542 in_min += in_stride;
543 } while (in_min <= kMaxSBPrefix);
545 if (!ReadAndVerifyChecksum(file.get(), &context))
546 return false;
548 int64 size = 0;
549 if (!base::GetFileSize(filename, &size))
550 return false;
552 return static_cast<int64>(ftell(file.get())) == size;
555 } // namespace
557 SafeBrowsingStoreFile::SafeBrowsingStoreFile(
558 const scoped_refptr<const base::SequencedTaskRunner>& task_runner)
559 : task_runner_(task_runner),
560 chunks_written_(0),
561 empty_(false),
562 corruption_seen_(false) {
565 SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
566 // Thread-checking is disabled in the destructor due to crbug.com/338486.
567 task_runner_ = nullptr;
569 Close();
572 bool SafeBrowsingStoreFile::CalledOnValidThread() {
573 return !task_runner_ || task_runner_->RunsTasksOnCurrentThread();
576 bool SafeBrowsingStoreFile::Delete() {
577 DCHECK(CalledOnValidThread());
579 // The database should not be open at this point. But, just in
580 // case, close everything before deleting.
581 if (!Close()) {
582 NOTREACHED();
583 return false;
586 return DeleteStore(filename_);
589 bool SafeBrowsingStoreFile::CheckValidity() {
590 DCHECK(CalledOnValidThread());
592 // The file was either empty or never opened. The empty case is
593 // presumed not to be invalid. The never-opened case can happen if
594 // BeginUpdate() fails for any databases, and should already have
595 // caused the corruption callback to fire.
596 if (!file_.get())
597 return true;
599 if (!FileRewind(file_.get()))
600 return OnCorruptDatabase();
602 int64 size = 0;
603 if (!base::GetFileSize(filename_, &size))
604 return OnCorruptDatabase();
606 base::MD5Context context;
607 base::MD5Init(&context);
609 // Read everything except the final digest.
610 size_t bytes_left = static_cast<size_t>(size);
611 CHECK(size == static_cast<int64>(bytes_left));
612 if (bytes_left < sizeof(base::MD5Digest))
613 return OnCorruptDatabase();
614 bytes_left -= sizeof(base::MD5Digest);
616 // Fold the contents of the file into the checksum.
617 while (bytes_left > 0) {
618 char buf[4096];
619 const size_t c = std::min(sizeof(buf), bytes_left);
620 const size_t ret = fread(buf, 1, c, file_.get());
622 // The file's size changed while reading, give up.
623 if (ret != c)
624 return OnCorruptDatabase();
625 base::MD5Update(&context, base::StringPiece(buf, c));
626 bytes_left -= c;
629 if (!ReadAndVerifyChecksum(file_.get(), &context)) {
630 RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
631 return OnCorruptDatabase();
634 return true;
637 void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
638 const base::Closure& corruption_callback) {
639 DCHECK(CalledOnValidThread());
640 filename_ = filename;
641 corruption_callback_ = corruption_callback;
644 bool SafeBrowsingStoreFile::BeginChunk() {
645 DCHECK(CalledOnValidThread());
646 return ClearChunkBuffers();
649 bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
650 DCHECK(CalledOnValidThread());
651 add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
652 return true;
655 bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
656 DCHECK(CalledOnValidThread());
658 add_prefixes->clear();
659 if (!base::PathExists(filename_))
660 return true;
662 StateInternal db_state;
663 if (!ReadDbStateHelper(filename_, &db_state))
664 return OnCorruptDatabase();
666 add_prefixes->swap(db_state.add_prefixes_);
667 return true;
670 bool SafeBrowsingStoreFile::GetAddFullHashes(
671 std::vector<SBAddFullHash>* add_full_hashes) {
672 DCHECK(CalledOnValidThread());
674 add_full_hashes->clear();
675 if (!base::PathExists(filename_))
676 return true;
678 StateInternal db_state;
679 if (!ReadDbStateHelper(filename_, &db_state))
680 return OnCorruptDatabase();
682 add_full_hashes->swap(db_state.add_full_hashes_);
683 return true;
686 bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
687 const SBFullHash& full_hash) {
688 DCHECK(CalledOnValidThread());
689 add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
690 return true;
693 bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
694 int32 add_chunk_id,
695 SBPrefix prefix) {
696 DCHECK(CalledOnValidThread());
697 sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
698 return true;
701 bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
702 const SBFullHash& full_hash) {
703 DCHECK(CalledOnValidThread());
704 sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
705 return true;
708 bool SafeBrowsingStoreFile::OnCorruptDatabase() {
709 DCHECK(CalledOnValidThread());
711 if (!corruption_seen_)
712 RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
713 corruption_seen_ = true;
715 corruption_callback_.Run();
717 // Return false as a convenience to callers.
718 return false;
721 bool SafeBrowsingStoreFile::Close() {
722 DCHECK(CalledOnValidThread());
724 ClearUpdateBuffers();
726 // Make sure the files are closed.
727 file_.reset();
728 new_file_.reset();
729 return true;
732 bool SafeBrowsingStoreFile::BeginUpdate() {
733 DCHECK(CalledOnValidThread());
734 DCHECK(!file_.get() && !new_file_.get());
736 // Structures should all be clear unless something bad happened.
737 DCHECK(add_chunks_cache_.empty());
738 DCHECK(sub_chunks_cache_.empty());
739 DCHECK(add_del_cache_.empty());
740 DCHECK(sub_del_cache_.empty());
741 DCHECK(add_prefixes_.empty());
742 DCHECK(sub_prefixes_.empty());
743 DCHECK(add_hashes_.empty());
744 DCHECK(sub_hashes_.empty());
745 DCHECK_EQ(chunks_written_, 0);
747 corruption_seen_ = false;
749 const base::FilePath new_filename = TemporaryFileForFilename(filename_);
750 base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
751 if (new_file.get() == NULL)
752 return false;
754 base::ScopedFILE file(base::OpenFile(filename_, "rb"));
755 empty_ = (file.get() == NULL);
756 if (empty_) {
757 // If the file exists but cannot be opened, try to delete it (not
758 // deleting directly, the bloom filter needs to be deleted, too).
759 if (base::PathExists(filename_))
760 return OnCorruptDatabase();
762 new_file_.swap(new_file);
763 return true;
766 base::MD5Context context;
767 FileHeader header;
768 const int version =
769 ReadAndVerifyHeader(filename_, &header,
770 &add_chunks_cache_, &sub_chunks_cache_,
771 file.get(), &context);
772 if (version == kInvalidVersion) {
773 FileHeader retry_header;
774 if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
775 if (retry_header.magic == kFileMagic &&
776 retry_header.version < kFileVersion) {
777 RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
778 } else {
779 RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
783 // Close the file so that it can be deleted.
784 file.reset();
786 return OnCorruptDatabase();
789 file_.swap(file);
790 new_file_.swap(new_file);
791 return true;
794 bool SafeBrowsingStoreFile::FinishChunk() {
795 DCHECK(CalledOnValidThread());
797 if (!add_prefixes_.size() && !sub_prefixes_.size() &&
798 !add_hashes_.size() && !sub_hashes_.size())
799 return true;
801 ChunkHeader header;
802 header.add_prefix_count = add_prefixes_.size();
803 header.sub_prefix_count = sub_prefixes_.size();
804 header.add_hash_count = add_hashes_.size();
805 header.sub_hash_count = sub_hashes_.size();
806 if (!WriteItem(header, new_file_.get(), NULL))
807 return false;
809 if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
810 !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
811 !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
812 !WriteContainer(sub_hashes_, new_file_.get(), NULL))
813 return false;
815 ++chunks_written_;
817 // Clear everything to save memory.
818 return ClearChunkBuffers();
821 bool SafeBrowsingStoreFile::DoUpdate(
822 safe_browsing::PrefixSetBuilder* builder,
823 std::vector<SBAddFullHash>* add_full_hashes_result) {
824 DCHECK(CalledOnValidThread());
825 DCHECK(file_.get() || empty_);
826 DCHECK(new_file_.get());
827 CHECK(builder);
828 CHECK(add_full_hashes_result);
830 // Rewind the temporary storage.
831 if (!FileRewind(new_file_.get()))
832 return false;
834 // Get chunk file's size for validating counts.
835 int64 update_size = 0;
836 if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
837 return OnCorruptDatabase();
839 // Track update size to answer questions at http://crbug.com/72216 .
840 // Log small updates as 1k so that the 0 (underflow) bucket can be
841 // used for "empty" in SafeBrowsingDatabase.
842 UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
843 std::max(static_cast<int>(update_size / 1024), 1));
845 // Chunk updates to integrate.
846 StateInternal new_state;
848 // Read update chunks.
849 for (int i = 0; i < chunks_written_; ++i) {
850 ChunkHeader header;
852 int64 ofs = ftell(new_file_.get());
853 if (ofs == -1)
854 return false;
856 if (!ReadItem(&header, new_file_.get(), NULL))
857 return false;
859 // As a safety measure, make sure that the header describes a sane
860 // chunk, given the remaining file size.
861 int64 expected_size = ofs + sizeof(ChunkHeader);
862 expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
863 expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
864 expected_size += header.add_hash_count * sizeof(SBAddFullHash);
865 expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
866 if (expected_size > update_size)
867 return false;
869 if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
870 header.add_hash_count, header.sub_hash_count,
871 new_file_.get(), NULL)) {
872 return false;
876 // The state was accumulated by chunk, sort by prefix.
877 new_state.SortData();
879 // These strides control how much data is loaded into memory per pass.
880 // Strides must be an even power of two. |in_stride| will be derived from the
881 // input file. |out_stride| will be derived from an estimate of the resulting
882 // file's size. |process_stride| will be the max of both.
883 uint64 in_stride = kMaxShardStride;
884 uint64 out_stride = kMaxShardStride;
885 uint64 process_stride = 0;
887 // Used to verify the input's checksum if |!empty_|.
888 base::MD5Context in_context;
890 if (!empty_) {
891 DCHECK(file_.get());
893 FileHeader header = {0};
894 int version = ReadAndVerifyHeader(filename_, &header,
895 &add_chunks_cache_, &sub_chunks_cache_,
896 file_.get(), &in_context);
897 if (version == kInvalidVersion)
898 return OnCorruptDatabase();
900 if (header.shard_stride)
901 in_stride = header.shard_stride;
903 // The header checksum should have prevented this case, but the code will be
904 // broken if this is not correct.
905 if (!IsPowerOfTwo(in_stride))
906 return OnCorruptDatabase();
909 // We no longer need to track deleted chunks.
910 DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
911 DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);
913 // Calculate |out_stride| to break the file down into reasonable shards.
915 int64 original_size = 0;
916 if (!empty_ && !base::GetFileSize(filename_, &original_size))
917 return OnCorruptDatabase();
919 // Approximate the final size as everything. Subs and deletes will reduce
920 // the size, but modest over-sharding won't hurt much.
921 int64 shard_size = original_size + update_size;
923 // Keep splitting until a single stride of data fits the target.
924 size_t shifts = 0;
925 while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
926 out_stride >>= 1;
927 shard_size >>= 1;
928 ++shifts;
930 UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);
932 DCHECK(IsPowerOfTwo(out_stride));
935 // Outer loop strides by the max of the input stride (to read integral shards)
936 // and the output stride (to write integral shards).
937 process_stride = std::max(in_stride, out_stride);
938 DCHECK(IsPowerOfTwo(process_stride));
939 DCHECK_EQ(0u, process_stride % in_stride);
940 DCHECK_EQ(0u, process_stride % out_stride);
942 // Start writing the new data to |new_file_|.
943 base::MD5Context out_context;
944 if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
945 new_file_.get(), &out_context)) {
946 return false;
949 // Start at the beginning of the SBPrefix space.
950 uint64 in_min = 0;
951 uint64 out_min = 0;
952 uint64 process_min = 0;
954 // Start at the beginning of the updates.
955 StateInternalPos new_pos = new_state.StateBegin();
957 // Re-usable container for shard processing.
958 StateInternal db_state;
960 // Track aggregate counts for histograms.
961 size_t add_prefix_count = 0;
962 size_t sub_prefix_count = 0;
964 do {
965 // Maximum element in the current shard.
966 SBPrefix process_max =
967 static_cast<SBPrefix>(process_min + process_stride - 1);
968 DCHECK_GT(process_max, process_min);
970 // Drop the data from previous pass.
971 db_state.ClearData();
973 // Fill the processing shard with one or more input shards.
974 if (!empty_) {
975 do {
976 ShardHeader shard_header;
977 if (!ReadItem(&shard_header, file_.get(), &in_context))
978 return OnCorruptDatabase();
980 if (!db_state.AppendData(shard_header.add_prefix_count,
981 shard_header.sub_prefix_count,
982 shard_header.add_hash_count,
983 shard_header.sub_hash_count,
984 file_.get(), &in_context))
985 return OnCorruptDatabase();
987 in_min += in_stride;
988 } while (in_min <= kMaxSBPrefix && in_min < process_max);
991 // Shard the update data to match the database data, then merge the update
992 // data and process the results.
994 StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
995 db_state.MergeDataAndProcess(new_pos, new_end,
996 add_del_cache_, sub_del_cache_);
997 new_pos = new_end;
1000 // Collect the processed data for return to caller.
1001 for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
1002 builder->AddPrefix(db_state.add_prefixes_[i].prefix);
1004 add_full_hashes_result->insert(add_full_hashes_result->end(),
1005 db_state.add_full_hashes_.begin(),
1006 db_state.add_full_hashes_.end());
1007 add_prefix_count += db_state.add_prefixes_.size();
1008 sub_prefix_count += db_state.sub_prefixes_.size();
1010 // Write one or more shards of processed output.
1011 StateInternalPos out_pos = db_state.StateBegin();
1012 do {
1013 SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
1014 DCHECK_GT(out_max, out_min);
1016 StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
1017 if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
1018 return false;
1019 out_pos = out_end;
1021 out_min += out_stride;
1022 } while (out_min == static_cast<SBPrefix>(out_min) &&
1023 out_min < process_max);
1025 process_min += process_stride;
1026 } while (process_min <= kMaxSBPrefix);
1028 // Verify the overall checksum.
1029 if (!empty_) {
1030 if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
1031 RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
1032 return OnCorruptDatabase();
1035 // TODO(shess): Verify EOF?
1037 // Close the input file so the new file can be renamed over it.
1038 file_.reset();
1040 DCHECK(!file_.get());
1042 // Write the overall checksum.
1043 base::MD5Digest out_digest;
1044 base::MD5Final(&out_digest, &out_context);
1045 if (!WriteItem(out_digest, new_file_.get(), NULL))
1046 return false;
1048 // Trim any excess left over from the temporary chunk data.
1049 if (!base::TruncateFile(new_file_.get()))
1050 return false;
1052 // Close the file handle and swizzle the file into place.
1053 new_file_.reset();
1054 if (!base::DeleteFile(filename_, false) &&
1055 base::PathExists(filename_))
1056 return false;
1058 const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1059 if (!base::Move(new_filename, filename_))
1060 return false;
1062 // Record counts before swapping to caller.
1063 UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
1064 UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);
1066 return true;
1069 bool SafeBrowsingStoreFile::FinishUpdate(
1070 safe_browsing::PrefixSetBuilder* builder,
1071 std::vector<SBAddFullHash>* add_full_hashes_result) {
1072 DCHECK(CalledOnValidThread());
1073 DCHECK(builder);
1074 DCHECK(add_full_hashes_result);
1076 if (!DoUpdate(builder, add_full_hashes_result)) {
1077 CancelUpdate();
1078 return false;
1081 DCHECK(!new_file_.get());
1082 DCHECK(!file_.get());
1084 return Close();
1087 bool SafeBrowsingStoreFile::CancelUpdate() {
1088 DCHECK(CalledOnValidThread());
1089 bool ret = Close();
1091 // Delete stale staging file.
1092 const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1093 base::DeleteFile(new_filename, false);
1095 return ret;
1098 void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
1099 DCHECK(CalledOnValidThread());
1100 add_chunks_cache_.insert(chunk_id);
1103 bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
1104 DCHECK(CalledOnValidThread());
1105 return add_chunks_cache_.count(chunk_id) > 0;
1108 void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
1109 DCHECK(CalledOnValidThread());
1110 out->clear();
1111 out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
1114 void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
1115 DCHECK(CalledOnValidThread());
1116 sub_chunks_cache_.insert(chunk_id);
1119 bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
1120 DCHECK(CalledOnValidThread());
1121 return sub_chunks_cache_.count(chunk_id) > 0;
1124 void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
1125 DCHECK(CalledOnValidThread());
1126 out->clear();
1127 out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
1130 void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
1131 DCHECK(CalledOnValidThread());
1132 add_del_cache_.insert(chunk_id);
1135 void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
1136 DCHECK(CalledOnValidThread());
1137 sub_del_cache_.insert(chunk_id);
1140 // static
1141 bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
1142 if (!base::DeleteFile(basename, false) &&
1143 base::PathExists(basename)) {
1144 NOTREACHED();
1145 return false;
1148 const base::FilePath new_filename = TemporaryFileForFilename(basename);
1149 if (!base::DeleteFile(new_filename, false) &&
1150 base::PathExists(new_filename)) {
1151 NOTREACHED();
1152 return false;
1155 // With SQLite support gone, one way to get to this code is if the
1156 // existing file is a SQLite file. Make sure the journal file is
1157 // also removed.
1158 const base::FilePath journal_filename(
1159 basename.value() + FILE_PATH_LITERAL("-journal"));
1160 if (base::PathExists(journal_filename))
1161 base::DeleteFile(journal_filename, false);
1163 return true;