// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"

#include "base/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"
// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;

// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
const int32 kFileVersion = 8;

// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;
// Header at the front of the main database file.
struct FileHeaderV7 {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};
// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements.  Newly written files will
// be sharded to need less than this amount of memory during update.  Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;
// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead to
// poor performance.  Shard stride is indirectly related to
// |kUpdateStorageBytes|; setting that very small will bump against this limit.
const uint32 kMinShardStride = 1 << 24;
// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;

// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = ~0;
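
// Illustration of how these constants interact during an update (hypothetical
// file size, mirroring the stride-halving loop in DoUpdate() below): a store
// estimated at 1MB exceeds |kUpdateStorageBytes| (100KB), so the output
// stride is halved until one stride's worth of data fits the target:
//
//   uint64 out_stride = kMaxShardStride;  // 2^32 covers the prefix space.
//   int64 shard_size = 1024 * 1024;       // Hypothetical estimated size.
//   while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
//     out_stride >>= 1;
//     shard_size >>= 1;
//   }
//   // After 4 halvings: out_stride == 1 << 28, so the SBPrefix space is
//   // written as 2^32 / 2^28 == 16 shards of roughly 64KB each.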
// Header at the front of the main database file.
struct FileHeaderV8 {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 shard_stride;
  // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
  // specialized read/write?
};

union FileHeader {
  struct FileHeaderV7 v7;
  struct FileHeaderV8 v8;
};
// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Header for each shard of data in the main database file.
struct ShardHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};
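
// Taken together, the headers above describe the on-disk layout.  A sketch of
// a version 8 file, as reconstructed from the write path (WriteHeader() and
// StateInternal::WriteShard() below), not a normative spec:
//
//   FileHeaderV8            magic, version, chunk counts, shard_stride
//   int32[add_chunk_count]  add chunk ids
//   int32[sub_chunk_count]  sub chunk ids
//   base::MD5Digest         checksum of everything above (header checksum)
//   repeated, one per shard of the SBPrefix space:
//     ShardHeader           counts for this shard
//     SBAddPrefix[add_prefix_count]
//     SBSubPrefix[sub_prefix_count]
//     SBAddFullHash[add_hash_count]
//     SBSubFullHash[sub_hash_count]
//   base::MD5Digest         checksum of the entire file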
// Enumerate different format-change events for histogramming
// purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
  // Corruption detected, broken down by file format.
  FORMAT_EVENT_FILE_CORRUPT,
  FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

  // The type of format found in the file.  The expected case (new
  // file format) is intentionally not covered.
  FORMAT_EVENT_FOUND_SQLITE,
  FORMAT_EVENT_FOUND_UNKNOWN,

  // The number of SQLite-format files deleted should be the same as
  // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
  // or if a failure prevents the update from succeeding.
  FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

  // Found and deleted (or failed to delete) the ancient "Safe
  // Browsing" file.
  FORMAT_EVENT_DELETED_ORIGINAL,
  FORMAT_EVENT_DELETED_ORIGINAL_FAILED,

  // The checksum did not check out in CheckValidity() or in
  // FinishUpdate().  This most likely indicates that the machine
  // crashed before the file was fully sync'ed to disk.
  FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

  // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
  // indicates that the system crashed while writing an update.
  FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,

  // Memory space for histograms is determined by the max.  ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  FORMAT_EVENT_MAX
};

void RecordFormatEvent(FormatEventType event_type) {
  UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}
// Rewind the file.  Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
  int rv = fseek(fp, 0, SEEK_SET);
  DCHECK_EQ(rv, 0);
  return rv == 0;
}
// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL.  Return true on success.
template <class T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fread(item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<char*>(item),
                                      sizeof(T)));
  }
  return true;
}
// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL.  Return true on success.
template <class T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fwrite(&item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<const char*>(&item),
                                      sizeof(T)));
  }
  return true;
}
// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|.  Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
                     base::MD5Context* context) {
  for (size_t i = 0; i < count; ++i) {
    typename CT::value_type value;
    if (!ReadItem(&value, fp, context))
      return false;

    // push_back() is more obvious, but coded this way std::set can
    // also be read.
    values->insert(values->end(), value);
  }
  return true;
}
// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL.  Returns true if all items successful.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
                FILE* fp, base::MD5Context* context) {
  for (CTI iter = beg; iter != end; ++iter) {
    if (!WriteItem(*iter, fp, context))
      return false;
  }
  return true;
}
// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL.  Returns true if all items successful.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
                    base::MD5Context* context) {
  return WriteRange(values.begin(), values.end(), fp, context);
}
// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
                         std::set<int32>* chunks) {
  for (std::set<int32>::iterator iter = chunks->begin();
       iter != chunks->end();) {
    std::set<int32>::iterator prev = iter++;
    if (deleted.count(*prev) > 0)
      chunks->erase(prev);
  }
}
// Read the digest stored at the current position of |fp| and compare it
// against the checksum of the data folded into |context| so far.
bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
  base::MD5Digest calculated_digest;
  base::MD5IntermediateFinal(&calculated_digest, context);

  base::MD5Digest file_digest;
  if (!ReadItem(&file_digest, fp, context))
    return false;

  return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}
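
// Typical read-side use of the checksum helpers (sketch, not a call site from
// this file verbatim): fold everything read so far into a context, then let
// ReadAndVerifyChecksum() compare against the digest stored in the file.  The
// stored digest is itself folded into |context| via ReadItem(), mirroring
// WriteItem() on the write side, so a later whole-file digest still matches.
//
//   base::MD5Context context;
//   base::MD5Init(&context);
//   // ... ReadItem()/ReadToContainer() calls passing &context ...
//   if (!ReadAndVerifyChecksum(fp, &context))
//     return false;  // Corrupt or truncated data.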
// Sanity-check the header against the file's size to make sure our
// vectors aren't gigantic.  This doubles as a cheap way to detect
// corruption without having to checksum the entire file.
bool FileHeaderV7SanityCheck(const base::FilePath& filename,
                             const FileHeaderV7& header) {
  DCHECK_EQ(header.version, 7);

  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  int64 expected_size = sizeof(FileHeaderV7);
  expected_size += header.add_chunk_count * sizeof(int32);
  expected_size += header.sub_chunk_count * sizeof(int32);
  expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
  expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
  expected_size += header.add_hash_count * sizeof(SBAddFullHash);
  expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
  expected_size += sizeof(base::MD5Digest);
  if (size != expected_size)
    return false;

  return true;
}
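
// Worked example with hypothetical counts: a v7 file claiming 2 add chunks,
// 1 sub chunk, 10 add prefixes, and nothing else must be exactly
//
//   sizeof(FileHeaderV7) + 3 * sizeof(int32) +
//       10 * sizeof(SBAddPrefix) + sizeof(base::MD5Digest)
//
// bytes long; any other size is rejected without reading the payload.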
// Helper function to read the file header and chunk TOC.  Rewinds |fp| and
// initializes |context|.  The header is left in |header|, with the version
// returned.  kInvalidVersion is returned for sanity check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
                        FileHeader* header,
                        std::set<int32>* add_chunks,
                        std::set<int32>* sub_chunks,
                        FILE* fp,
                        base::MD5Context* context) {
  int version = kInvalidVersion;

  base::MD5Init(context);
  if (!FileRewind(fp))
    return kInvalidVersion;
  if (!ReadItem(&header->v8, fp, context))
    return kInvalidVersion;
  if (header->v8.magic != kFileMagic)
    return kInvalidVersion;

  size_t add_chunks_count = 0;
  size_t sub_chunks_count = 0;

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->v8.version);

  if (header->v8.version == 7) {
    version = 7;

    // Reset the context and re-read the v7 header.
    base::MD5Init(context);
    if (!FileRewind(fp))
      return kInvalidVersion;
    if (!ReadItem(&header->v7, fp, context))
      return kInvalidVersion;
    if (header->v7.magic != kFileMagic || header->v7.version != 7)
      return kInvalidVersion;
    if (!FileHeaderV7SanityCheck(filename, header->v7))
      return kInvalidVersion;

    add_chunks_count = header->v7.add_chunk_count;
    sub_chunks_count = header->v7.sub_chunk_count;
  } else if (header->v8.version == kFileVersion) {
    version = 8;
    add_chunks_count = header->v8.add_chunk_count;
    sub_chunks_count = header->v8.sub_chunk_count;
  } else {
    return kInvalidVersion;
  }

  if (!ReadToContainer(add_chunks, add_chunks_count, fp, context) ||
      !ReadToContainer(sub_chunks, sub_chunks_count, fp, context)) {
    return kInvalidVersion;
  }

  // v8 includes a checksum to validate the header.
  if (version > 7 && !ReadAndVerifyChecksum(fp, context)) {
    RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    return kInvalidVersion;
  }

  return version;
}
// Helper function to write out the initial header and chunks-contained data.
// Rewinds |fp|, initializes |context|, then writes a file header and
// |add_chunks| and |sub_chunks|.
bool WriteHeader(uint32 out_stride,
                 const std::set<int32>& add_chunks,
                 const std::set<int32>& sub_chunks,
                 FILE* fp,
                 base::MD5Context* context) {
  if (!FileRewind(fp))
    return false;

  base::MD5Init(context);
  FileHeaderV8 header;
  header.magic = kFileMagic;
  header.version = kFileVersion;
  header.add_chunk_count = add_chunks.size();
  header.sub_chunk_count = sub_chunks.size();
  header.shard_stride = out_stride;
  if (!WriteItem(header, fp, context))
    return false;

  if (!WriteContainer(add_chunks, fp, context) ||
      !WriteContainer(sub_chunks, fp, context))
    return false;

  // Write out the header digest.
  base::MD5Digest header_digest;
  base::MD5IntermediateFinal(&header_digest, context);
  if (!WriteItem(header_digest, fp, context))
    return false;

  return true;
}
// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 2) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
  }
  return true;
}
// Merge |beg|..|end| into |container|.  Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
  DCHECK(sorted(container->begin(), container->end(), less));
  DCHECK(sorted(beg, end, less));

  // Size the container to fit the results.
  const size_t c_size = container->size();
  container->resize(c_size + (end - beg));

  // |c_end| points to the original endpoint, while |c_out| points to the
  // endpoint that will scan from end to beginning while merging.
  typename CT::iterator c_end = container->begin() + c_size;
  typename CT::iterator c_out = container->end();

  // While both inputs have data, move the greater to |c_out|.
  while (c_end != container->begin() && end != beg) {
    if (less(*(c_end - 1), *(end - 1))) {
      *(--c_out) = *(--end);
    } else {
      *(--c_out) = *(--c_end);
    }
  }

  // Copy any data remaining in the new range.
  if (end != beg) {
    // The original container data has been fully shifted.
    DCHECK(c_end == container->begin());

    // There is exactly the correct amount of space left.
    DCHECK_EQ(c_out - c_end, end - beg);

    std::copy(beg, end, container->begin());
  }

  DCHECK(sorted(container->begin(), container->end(), less));
}
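
// Worked example with hypothetical values (not part of the store): merging a
// sorted range {2, 4} into a container already holding {1, 3, 5}.
//
//   std::vector<int> container;           // Holds {1, 3, 5}, sorted.
//   container.push_back(1);
//   container.push_back(3);
//   container.push_back(5);
//   const int updates[] = {2, 4};         // Also sorted.
//   container_merge(&container, updates, updates + 2, std::less<int>());
//   // |container| now holds {1, 2, 3, 4, 5}.
//
// The container is resized to 5 up front; the merge then walks both inputs
// from the back, writing the larger element into the free space at the tail,
// so no scratch buffer is needed.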
// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
 public:
  StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
                   SBSubPrefixes::iterator sub_prefixes_iter,
                   std::vector<SBAddFullHash>::iterator add_hashes_iter,
                   std::vector<SBSubFullHash>::iterator sub_hashes_iter)
      : add_prefixes_iter_(add_prefixes_iter),
        sub_prefixes_iter_(sub_prefixes_iter),
        add_hashes_iter_(add_hashes_iter),
        sub_hashes_iter_(sub_hashes_iter) {
  }

  SBAddPrefixes::iterator add_prefixes_iter_;
  SBSubPrefixes::iterator sub_prefixes_iter_;
  std::vector<SBAddFullHash>::iterator add_hashes_iter_;
  std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};

// Helper to find the next shard boundary.
template <class T>
bool prefix_bounder(SBPrefix val, const T& elt) {
  return val < elt.GetAddPrefix();
}
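
// Note that prefix_bounder() uses the (value, element) argument order that
// std::upper_bound() requires of its comparator.  Sketch of the intended use
// (hypothetical data; ShardEnd() below does this for all four containers):
//
//   SBAddPrefixes prefixes;  // Sorted by prefix.
//   SBPrefix shard_max = 0x00FFFFFF;
//   SBAddPrefixes::iterator shard_end =
//       std::upper_bound(prefixes.begin(), prefixes.end(),
//                        shard_max, prefix_bounder<SBAddPrefix>);
//   // |shard_end| is the first element with GetAddPrefix() > shard_max, so
//   // [begin, shard_end) is exactly the shard covering prefixes <= shard_max.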
// Container for partial database state.  Includes add/sub prefixes/hashes,
// plus aggregate operations on same.
class StateInternal {
 public:
  // Append indicated amount of data from |fp|.
  bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
                  size_t add_hash_count, size_t sub_hash_count,
                  FILE* fp, base::MD5Context* context) {
    return
        ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
        ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
        ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
        ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
  }

  void ClearData() {
    add_prefixes_.clear();
    sub_prefixes_.clear();
    add_full_hashes_.clear();
    sub_full_hashes_.clear();
  }
  // Merge data from |beg|..|end| into receiver's state, then process the
  // state.  The current state and the range given should correspond to the
  // same sorted shard of data from different sources.  |add_del_cache| and
  // |sub_del_cache| indicate the chunk ids which should be deleted during
  // processing (see SBProcessSubs()).
  void MergeDataAndProcess(const StateInternalPos& beg,
                           const StateInternalPos& end,
                           const base::hash_set<int32>& add_del_cache,
                           const base::hash_set<int32>& sub_del_cache) {
    container_merge(&add_prefixes_,
                    beg.add_prefixes_iter_,
                    end.add_prefixes_iter_,
                    SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);

    container_merge(&sub_prefixes_,
                    beg.sub_prefixes_iter_,
                    end.sub_prefixes_iter_,
                    SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);

    container_merge(&add_full_hashes_,
                    beg.add_hashes_iter_,
                    end.add_hashes_iter_,
                    SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);

    container_merge(&sub_full_hashes_,
                    beg.sub_hashes_iter_,
                    end.sub_hashes_iter_,
                    SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);

    SBProcessSubs(&add_prefixes_, &sub_prefixes_,
                  &add_full_hashes_, &sub_full_hashes_,
                  add_del_cache, sub_del_cache);
  }
  // Sort the data appropriately for the sharding, merging, and processing
  // operations.
  void SortData() {
    std::sort(add_prefixes_.begin(), add_prefixes_.end(),
              SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);
    std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
              SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);
    std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
              SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);
    std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
              SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
  }
  // Iterator from the beginning of the state's data.
  StateInternalPos StateBegin() {
    return StateInternalPos(add_prefixes_.begin(),
                            sub_prefixes_.begin(),
                            add_full_hashes_.begin(),
                            sub_full_hashes_.begin());
  }
  // An iterator pointing just after the last possible element of the shard
  // indicated by |shard_max|.  Used to step through the state by shard.
  // TODO(shess): Verify whether binary search really improves over linear.
  // Merging or writing will immediately touch all of these elements.
  StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    return StateInternalPos(
        std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
                         shard_max, prefix_bounder<SBAddPrefix>),
        std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
                         shard_max, prefix_bounder<SBSubPrefix>),
        std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
                         shard_max, prefix_bounder<SBAddFullHash>),
        std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
                         shard_max, prefix_bounder<SBSubFullHash>));
  }
  // Write a shard header and data for the shard starting at |beg| and ending
  // at the element before |end|.
  bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
                  FILE* fp, base::MD5Context* context) {
    ShardHeader shard_header;
    shard_header.add_prefix_count =
        end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    shard_header.sub_prefix_count =
        end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    shard_header.add_hash_count =
        end.add_hashes_iter_ - beg.add_hashes_iter_;
    shard_header.sub_hash_count =
        end.sub_hashes_iter_ - beg.sub_hashes_iter_;

    return
        WriteItem(shard_header, fp, context) &&
        WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
                   fp, context);
  }
  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_full_hashes_;
  std::vector<SBSubFullHash> sub_full_hashes_;
};
// True if |val| is an even power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  return val && (val & (val - 1)) == 0;
}
// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes().  Those functions are generally used only for smaller
// files.  Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
                       StateInternal* db_state) {
  file_util::ScopedFILE file(base::OpenFile(filename, "rb"));
  if (file.get() == NULL)
    return false;

  std::set<int32> add_chunks;
  std::set<int32> sub_chunks;

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
                          file.get(), &context);
  if (version == kInvalidVersion)
    return false;

  if (version == 7) {
    if (!db_state->AppendData(header.v7.add_prefix_count,
                              header.v7.sub_prefix_count,
                              header.v7.add_hash_count,
                              header.v7.sub_hash_count,
                              file.get(), &context)) {
      return false;
    }

    // v7 data was not stored sorted.
    db_state->SortData();
  } else {
    // Read until the shard start overflows, always at least one pass.
    uint64 in_min = 0;
    uint64 in_stride = header.v8.shard_stride;
    if (!in_stride)
      in_stride = kMaxShardStride;
    if (!IsPowerOfTwo(in_stride))
      return false;

    do {
      ShardHeader shard_header;
      if (!ReadItem(&shard_header, file.get(), &context))
        return false;

      if (!db_state->AppendData(shard_header.add_prefix_count,
                                shard_header.sub_prefix_count,
                                shard_header.add_hash_count,
                                shard_header.sub_hash_count,
                                file.get(), &context)) {
        return false;
      }

      in_min += in_stride;
    } while (in_min <= kMaxSBPrefix);
  }

  if (!ReadAndVerifyChecksum(file.get(), &context))
    return false;

  // Make sure the file was read to the end.
  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  return static_cast<int64>(ftell(file.get())) == size;
}
void SafeBrowsingStoreFile::CheckForOriginalAndDelete(
    const base::FilePath& current_filename) {
  const base::FilePath original_filename(
      current_filename.DirName().AppendASCII("Safe Browsing"));
  if (base::PathExists(original_filename)) {
    int64 size = 0;
    if (base::GetFileSize(original_filename, &size)) {
      UMA_HISTOGRAM_COUNTS("SB2.OldDatabaseKilobytes",
                           static_cast<int>(size / 1024));
    }

    if (base::DeleteFile(original_filename, false)) {
      RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL);
    } else {
      RecordFormatEvent(FORMAT_EVENT_DELETED_ORIGINAL_FAILED);
    }

    // Just best-effort on the journal file, don't want to get lost in
    // the weeds.
    const base::FilePath journal_filename(
        current_filename.DirName().AppendASCII("Safe Browsing-journal"));
    base::DeleteFile(journal_filename, false);
  }
}
SafeBrowsingStoreFile::SafeBrowsingStoreFile()
    : chunks_written_(0), empty_(false), corruption_seen_(false) {}

SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
  Close();
}

bool SafeBrowsingStoreFile::Delete() {
  // The database should not be open at this point.  But, just in
  // case, close everything before deleting.
  if (!Close()) {
    NOTREACHED();
    return false;
  }

  return DeleteStore(filename_);
}
bool SafeBrowsingStoreFile::CheckValidity() {
  // The file was either empty or never opened.  The empty case is
  // presumed not to be invalid.  The never-opened case can happen if
  // BeginUpdate() fails for any databases, and should already have
  // caused the corruption callback to fire.
  if (!file_.get())
    return true;

  if (!FileRewind(file_.get()))
    return OnCorruptDatabase();

  int64 size = 0;
  if (!base::GetFileSize(filename_, &size))
    return OnCorruptDatabase();

  base::MD5Context context;
  base::MD5Init(&context);

  // Read everything except the final digest.
  size_t bytes_left = static_cast<size_t>(size);
  CHECK(size == static_cast<int64>(bytes_left));
  if (bytes_left < sizeof(base::MD5Digest))
    return OnCorruptDatabase();
  bytes_left -= sizeof(base::MD5Digest);

  // Fold the contents of the file into the checksum.
  while (bytes_left > 0) {
    char buf[4096];
    const size_t c = std::min(sizeof(buf), bytes_left);
    const size_t ret = fread(buf, 1, c, file_.get());

    // The file's size changed while reading, give up.
    if (ret != c)
      return OnCorruptDatabase();
    base::MD5Update(&context, base::StringPiece(buf, c));
    bytes_left -= c;
  }

  if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    return OnCorruptDatabase();
  }

  return true;
}
void SafeBrowsingStoreFile::Init(
    const base::FilePath& filename,
    const base::Closure& corruption_callback) {
  filename_ = filename;
  corruption_callback_ = corruption_callback;
}
bool SafeBrowsingStoreFile::BeginChunk() {
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
  add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
  return true;
}
bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
  add_prefixes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_prefixes->swap(db_state.add_prefixes_);
  return true;
}
bool SafeBrowsingStoreFile::GetAddFullHashes(
    std::vector<SBAddFullHash>* add_full_hashes) {
  add_full_hashes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_full_hashes->swap(db_state.add_full_hashes_);
  return true;
}
bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
                                         const SBFullHash& full_hash) {
  add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
  return true;
}
bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
                                           int32 add_chunk_id,
                                           SBPrefix prefix) {
  sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
  return true;
}
bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                                         const SBFullHash& full_hash) {
  sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
  return true;
}
bool SafeBrowsingStoreFile::OnCorruptDatabase() {
  if (!corruption_seen_)
    RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
  corruption_seen_ = true;

  corruption_callback_.Run();

  // Return false as a convenience to callers.
  return false;
}
bool SafeBrowsingStoreFile::Close() {
  ClearUpdateBuffers();

  // Make sure the files are closed.
  file_.reset();
  new_file_.reset();
  return true;
}
bool SafeBrowsingStoreFile::BeginUpdate() {
  DCHECK(!file_.get() && !new_file_.get());

  // Structures should all be clear unless something bad happened.
  DCHECK(add_chunks_cache_.empty());
  DCHECK(sub_chunks_cache_.empty());
  DCHECK(add_del_cache_.empty());
  DCHECK(sub_del_cache_.empty());
  DCHECK(add_prefixes_.empty());
  DCHECK(sub_prefixes_.empty());
  DCHECK(add_hashes_.empty());
  DCHECK(sub_hashes_.empty());
  DCHECK_EQ(chunks_written_, 0);

  // Since the following code will already hit the profile looking for
  // database files, this is a reasonable time to delete any old databases.
  CheckForOriginalAndDelete(filename_);

  corruption_seen_ = false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
  if (new_file.get() == NULL)
    return false;

  base::ScopedFILE file(base::OpenFile(filename_, "rb"));
  empty_ = (file.get() == NULL);
  if (empty_) {
    // If the file exists but cannot be opened, try to delete it (not
    // deleting directly, the bloom filter needs to be deleted, too).
    if (base::PathExists(filename_))
      return OnCorruptDatabase();

    new_file_.swap(new_file);
    return true;
  }

  FileHeader header;
  base::MD5Context context;
  const int version =
      ReadAndVerifyHeader(filename_, &header,
                          &add_chunks_cache_, &sub_chunks_cache_,
                          file.get(), &context);
  if (version == kInvalidVersion) {
    FileHeaderV8 retry_header;
    if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL) &&
        (retry_header.magic != kFileMagic ||
         (retry_header.version != 8 && retry_header.version != 7))) {
      // TODO(shess): Think on whether these histograms are generating any
      // actionable data.  I kid you not, SQLITE happens many thousands of
      // times per day, UNKNOWN about 3x higher than that.
      if (!strcmp(reinterpret_cast<char*>(&retry_header.magic),
                  "SQLite format 3")) {
        RecordFormatEvent(FORMAT_EVENT_FOUND_SQLITE);
      } else {
        RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
      }
    }

    // Close the file so that it can be deleted.
    file.reset();

    return OnCorruptDatabase();
  }

  file_.swap(file);
  new_file_.swap(new_file);
  return true;
}
bool SafeBrowsingStoreFile::FinishChunk() {
  if (!add_prefixes_.size() && !sub_prefixes_.size() &&
      !add_hashes_.size() && !sub_hashes_.size())
    return true;

  ChunkHeader header;
  header.add_prefix_count = add_prefixes_.size();
  header.sub_prefix_count = sub_prefixes_.size();
  header.add_hash_count = add_hashes_.size();
  header.sub_hash_count = sub_hashes_.size();
  if (!WriteItem(header, new_file_.get(), NULL))
    return false;

  if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    return false;

  ++chunks_written_;

  // Clear everything to save memory.
  return ClearChunkBuffers();
}
bool SafeBrowsingStoreFile::DoUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(file_.get() || empty_);
  DCHECK(new_file_.get());
  CHECK(add_full_hashes_result);

  // Rewind the temporary storage.
  if (!FileRewind(new_file_.get()))
    return false;

  // Get chunk file's size for validating counts.
  int64 update_size = 0;
  if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    return OnCorruptDatabase();

  // Track update size to answer questions at http://crbug.com/72216 .
  // Log small updates as 1k so that the 0 (underflow) bucket can be
  // used for "empty" in SafeBrowsingDatabase.
  UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
                       std::max(static_cast<int>(update_size / 1024), 1));

  // Chunk updates to integrate.
  StateInternal new_state;

  // Read update chunks.
  for (int i = 0; i < chunks_written_; ++i) {
    ChunkHeader header;

    int64 ofs = ftell(new_file_.get());
    if (ofs == -1)
      return false;

    if (!ReadItem(&header, new_file_.get(), NULL))
      return false;

    // As a safety measure, make sure that the header describes a sane
    // chunk, given the remaining file size.
    int64 expected_size = ofs + sizeof(ChunkHeader);
    expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    if (expected_size > update_size)
      return false;

    if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
                              header.add_hash_count, header.sub_hash_count,
                              new_file_.get(), NULL)) {
      return false;
    }
  }

  // The state was accumulated by chunk, sort by prefix.
  new_state.SortData();
  // These strides control how much data is loaded into memory per pass.
  // Strides must be an even power of two.  |in_stride| will be derived from
  // the input file.  |out_stride| will be derived from an estimate of the
  // resulting file's size.  |process_stride| will be the max of both.
  uint64 in_stride = kMaxShardStride;
  uint64 out_stride = kMaxShardStride;
  uint64 process_stride = 0;

  // The header info is only used later if |!empty_|.  The v8 read loop only
  // needs |in_stride|, while v7 needs to refer to header information.
  base::MD5Context in_context;
  int version = kInvalidVersion;
  FileHeader header;

  if (!empty_) {
    version = ReadAndVerifyHeader(filename_, &header,
                                  &add_chunks_cache_, &sub_chunks_cache_,
                                  file_.get(), &in_context);
    if (version == kInvalidVersion)
      return OnCorruptDatabase();

    if (version == 8 && header.v8.shard_stride)
      in_stride = header.v8.shard_stride;

    // The header checksum should have prevented this case, but the code will
    // be broken if this is not correct.
    if (!IsPowerOfTwo(in_stride))
      return OnCorruptDatabase();
  }
  // We no longer need to track deleted chunks.
  DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
  DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);

  // Calculate |out_stride| to break the file down into reasonable shards.
  {
    int64 original_size = 0;
    if (!empty_ && !base::GetFileSize(filename_, &original_size))
      return OnCorruptDatabase();

    // Approximate the final size as everything.  Subs and deletes will reduce
    // the size, but modest over-sharding won't hurt much.
    int64 shard_size = original_size + update_size;

    // Keep splitting until a single stride of data fits the target.
    int shifts = 0;
    while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
      out_stride >>= 1;
      shard_size >>= 1;
      ++shifts;
    }
    UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);

    DCHECK(IsPowerOfTwo(out_stride));
  }
  // Outer loop strides by the max of the input stride (to read integral
  // shards) and the output stride (to write integral shards).
  process_stride = std::max(in_stride, out_stride);
  DCHECK(IsPowerOfTwo(process_stride));
  DCHECK_EQ(0u, process_stride % in_stride);
  DCHECK_EQ(0u, process_stride % out_stride);
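
  // Shape of the merge loop below (a sketch of the code that follows): the
  // outer do-while walks the SBPrefix space in |process_stride| steps; each
  // pass pulls in enough input shards to cover the processing shard, merges
  // in the matching slice of |new_state|, runs SBProcessSubs(), and then
  // writes the result back out in |out_stride|-sized shards.
  //
  //   do {  // one processing shard per pass
  //     read input shards until |in_min| passes |process_max|;
  //     MergeDataAndProcess() the update slice for this shard;
  //     do {  // one output shard per pass
  //       WriteShard() for [out_min, out_min + out_stride);
  //     } while (out_min < process_max);
  //   } while (process_min <= kMaxSBPrefix);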
  // Start writing the new data to |new_file_|.
  base::MD5Context out_context;
  if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
                   new_file_.get(), &out_context)) {
    return false;
  }

  // Start at the beginning of the SBPrefix space.
  uint64 in_min = 0;
  uint64 out_min = 0;
  uint64 process_min = 0;

  // Start at the beginning of the updates.
  StateInternalPos new_pos = new_state.StateBegin();

  // Re-usable container for shard processing.
  StateInternal db_state;

  // Track aggregate counts for histograms.
  size_t add_prefix_count = 0;
  size_t sub_prefix_count = 0;

  do {
    // Maximum element in the current shard.
    SBPrefix process_max =
        static_cast<SBPrefix>(process_min + process_stride - 1);
    DCHECK_GT(process_max, process_min);

    // Drop the data from previous pass.
    db_state.ClearData();

    // Fill the processing shard with one or more input shards.
    if (!empty_) {
      if (version == 7) {
        // Treat v7 as a single-shard file.
        DCHECK_EQ(in_min, 0u);
        DCHECK_EQ(in_stride, kMaxShardStride);
        DCHECK_EQ(process_stride, kMaxShardStride);
        if (!db_state.AppendData(header.v7.add_prefix_count,
                                 header.v7.sub_prefix_count,
                                 header.v7.add_hash_count,
                                 header.v7.sub_hash_count,
                                 file_.get(), &in_context))
          return OnCorruptDatabase();

        // v7 data is not sorted correctly.
        db_state.SortData();
      } else {
        do {
          ShardHeader shard_header;
          if (!ReadItem(&shard_header, file_.get(), &in_context))
            return OnCorruptDatabase();

          if (!db_state.AppendData(shard_header.add_prefix_count,
                                   shard_header.sub_prefix_count,
                                   shard_header.add_hash_count,
                                   shard_header.sub_hash_count,
                                   file_.get(), &in_context))
            return OnCorruptDatabase();

          in_min += in_stride;
        } while (in_min <= kMaxSBPrefix && in_min < process_max);
      }
    }
    // Shard the update data to match the database data, then merge the update
    // data and process the results.
    {
      StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
      db_state.MergeDataAndProcess(new_pos, new_end,
                                   add_del_cache_, sub_del_cache_);
      new_pos = new_end;
    }

    // Collect the processed data for return to caller.
    for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
      builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    }
    add_full_hashes_result->insert(add_full_hashes_result->end(),
                                   db_state.add_full_hashes_.begin(),
                                   db_state.add_full_hashes_.end());
    add_prefix_count += db_state.add_prefixes_.size();
    sub_prefix_count += db_state.sub_prefixes_.size();

    // Write one or more shards of processed output.
    StateInternalPos out_pos = db_state.StateBegin();
    do {
      SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
      DCHECK_GT(out_max, out_min);

      StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
      if (!db_state.WriteShard(out_pos, out_end, new_file_.get(),
                               &out_context))
        return false;
      out_pos = out_end;

      out_min += out_stride;
    } while (out_min == static_cast<SBPrefix>(out_min) &&
             out_min < process_max);

    process_min += process_stride;
  } while (process_min <= kMaxSBPrefix);
  // Verify the overall checksum.
  if (!empty_) {
    if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
      RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
      return OnCorruptDatabase();
    }

    // TODO(shess): Verify EOF?

    // Close the input file so the new file can be renamed over it.
    file_.reset();
  }
  DCHECK(!file_.get());

  // Write the overall checksum.
  base::MD5Digest out_digest;
  base::MD5Final(&out_digest, &out_context);
  if (!WriteItem(out_digest, new_file_.get(), NULL))
    return false;

  // Trim any excess left over from the temporary chunk data.
  if (!base::TruncateFile(new_file_.get()))
    return false;

  // Close the file handle and swizzle the file into place.
  new_file_.reset();
  if (!base::DeleteFile(filename_, false) &&
      base::PathExists(filename_))
    return false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  if (!base::Move(new_filename, filename_))
    return false;

  // Record counts before swapping to caller.
  UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
  UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);

  return true;
}
bool SafeBrowsingStoreFile::FinishUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(add_full_hashes_result);

  if (!DoUpdate(builder, add_full_hashes_result)) {
    CancelUpdate();
    return false;
  }

  DCHECK(!new_file_.get());
  DCHECK(!file_.get());

  return Close();
}
bool SafeBrowsingStoreFile::CancelUpdate() {
  return Close();
}
void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  add_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
  return add_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}

void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  sub_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
  return sub_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
  out->clear();
  out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}

void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  add_del_cache_.insert(chunk_id);
}

void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  sub_del_cache_.insert(chunk_id);
}
bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
  if (!base::DeleteFile(basename, false) &&
      base::PathExists(basename)) {
    NOTREACHED();
    return false;
  }

  const base::FilePath new_filename = TemporaryFileForFilename(basename);
  if (!base::DeleteFile(new_filename, false) &&
      base::PathExists(new_filename)) {
    NOTREACHED();
    return false;
  }

  // With SQLite support gone, one way to get to this code is if the
  // existing file is a SQLite file.  Make sure the journal file is
  // also removed.
  const base::FilePath journal_filename(
      basename.value() + FILE_PATH_LITERAL("-journal"));
  if (base::PathExists(journal_filename))
    base::DeleteFile(journal_filename, false);

  return true;
}