// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"

#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"
#include "base/strings/string_piece.h"

// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;

// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
const int32 kFileVersion = 8;

// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;

// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements.  Newly written files will
// be sharded to need less than this amount of memory during update.  Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;

// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead to
// poor performance.  Shard stride is indirectly related to
// |kUpdateStorageBytes|, setting that very small will bump against this.
const uint32 kMinShardStride = 1 << 24;

// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;

// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;

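// Rough sizing note (illustrative only, not used by the code): the SBPrefix
// space spans 2^32 values, so with the minimum stride of 2^24 an update is
// split into at most 2^32 / 2^24 == 256 shards.  DoUpdate() starts from
// kMaxShardStride (a single shard covering everything) and halves the output
// stride until the estimated per-shard data fits within kUpdateStorageBytes
// or the stride reaches kMinShardStride.
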
// Header at the front of the main database file.
struct FileHeader {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
  // specialized read/write?
  uint32 shard_stride;
};

// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Header for each shard of data in the main database file.
struct ShardHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

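// Rough on-disk layout implied by the readers and writers below (descriptive
// only):
//   FileHeader
//   add chunk ids, then sub chunk ids (the chunk TOC)
//   MD5Digest over everything above (the "header digest")
//   one or more { ShardHeader, add/sub prefixes, add/sub full hashes }
//   MD5Digest over the entire preceding file contents
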
// Enumerate different format-change events for histogramming
// purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
  // Corruption detected, broken down by file format.
  FORMAT_EVENT_FILE_CORRUPT,
  FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

  // The type of format found in the file.  The expected case (new
  // file format) is intentionally not covered.
  FORMAT_EVENT_FOUND_SQLITE,  // Obsolete
  FORMAT_EVENT_FOUND_UNKNOWN,  // magic does not match.

  // The number of SQLite-format files deleted should be the same as
  // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
  // or if a failure prevents the update from succeeding.
  FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

  // Found and deleted (or failed to delete) the ancient "Safe
  // Browsing" file.
  FORMAT_EVENT_DELETED_ORIGINAL,  // Obsolete
  FORMAT_EVENT_DELETED_ORIGINAL_FAILED,  // Obsolete

  // The checksum did not check out in CheckValidity() or in
  // FinishUpdate().  This most likely indicates that the machine
  // crashed before the file was fully sync'ed to disk.
  FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

  // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
  // indicates that the system crashed while writing an update.
  FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,

  FORMAT_EVENT_FOUND_DEPRECATED,  // version too old.

  // Memory space for histograms is determined by the max.  ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  FORMAT_EVENT_MAX
};

void RecordFormatEvent(FormatEventType event_type) {
  UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}

// Rewind the file.  Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
  int rv = fseek(fp, 0, SEEK_SET);
  return rv == 0;
}

// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL.  Return true on success.
template <typename T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fread(item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<char*>(item),
                                      sizeof(T)));
  }
  return true;
}

// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL.  Return true on success.
template <typename T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fwrite(&item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<const char*>(&item),
                                      sizeof(T)));
  }
  return true;
}

// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|.  Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
                     base::MD5Context* context) {
  for (size_t i = 0; i < count; ++i) {
    typename CT::value_type value;
    if (!ReadItem(&value, fp, context))
      return false;

    // push_back() is more obvious, but coded this way std::set can
    // also be read.
    values->insert(values->end(), value);
  }

  return true;
}

// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL.  Returns true if all items successful.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
                FILE* fp, base::MD5Context* context) {
  for (CTI iter = beg; iter != end; ++iter) {
    if (!WriteItem(*iter, fp, context))
      return false;
  }
  return true;
}

// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL.  Returns true if all items successful.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
                    base::MD5Context* context) {
  return WriteRange(values.begin(), values.end(), fp, context);
}

// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
                         std::set<int32>* chunks) {
  for (std::set<int32>::iterator iter = chunks->begin();
       iter != chunks->end();) {
    std::set<int32>::iterator prev = iter++;
    if (deleted.count(*prev) > 0)
      chunks->erase(prev);
  }
}

// Read a digest from |fp| and compare it against the running checksum in
// |context|.  Returns true if they match.
bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
  base::MD5Digest calculated_digest;
  base::MD5IntermediateFinal(&calculated_digest, context);

  base::MD5Digest file_digest;
  if (!ReadItem(&file_digest, fp, context))
    return false;

  return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}

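// Note on the checksum scheme: a single MD5 context is threaded through all
// of the reads (or writes), and base::MD5IntermediateFinal() snapshots the
// digest of the bytes seen so far without finalizing the context.  That lets
// the header be verified on its own (see ReadAndVerifyHeader()) while the
// same running checksum later covers the entire file.
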
// Helper function to read the file header and chunk TOC.  Rewinds |fp| and
// initializes |context|.  The header is left in |header|, with the version
// returned.  kInvalidVersion is returned for sanity check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
                        FileHeader* header,
                        std::set<int32>* add_chunks,
                        std::set<int32>* sub_chunks,
                        FILE* fp,
                        base::MD5Context* context) {
  base::MD5Init(context);
  if (!FileRewind(fp))
    return kInvalidVersion;
  if (!ReadItem(header, fp, context))
    return kInvalidVersion;
  if (header->magic != kFileMagic)
    return kInvalidVersion;

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);

  if (header->version != kFileVersion)
    return kInvalidVersion;

  if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
      !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
    return kInvalidVersion;
  }

  // Verify that the data read thus far is valid.
  if (!ReadAndVerifyChecksum(fp, context)) {
    RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    return kInvalidVersion;
  }

  return kFileVersion;
}

// Helper function to write out the initial header and chunks-contained data.
// Rewinds |fp|, initializes |context|, then writes a file header and
// |add_chunks| and |sub_chunks|.
bool WriteHeader(uint32 out_stride,
                 const std::set<int32>& add_chunks,
                 const std::set<int32>& sub_chunks,
                 FILE* fp,
                 base::MD5Context* context) {
  if (!FileRewind(fp))
    return false;

  base::MD5Init(context);
  FileHeader header;
  header.magic = kFileMagic;
  header.version = kFileVersion;
  header.add_chunk_count = add_chunks.size();
  header.sub_chunk_count = sub_chunks.size();
  header.shard_stride = out_stride;
  if (!WriteItem(header, fp, context))
    return false;

  if (!WriteContainer(add_chunks, fp, context) ||
      !WriteContainer(sub_chunks, fp, context))
    return false;

  // Write out the header digest.
  base::MD5Digest header_digest;
  base::MD5IntermediateFinal(&header_digest, context);
  if (!WriteItem(header_digest, fp, context))
    return false;

  return true;
}

// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 2) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
  }
  return true;
}

// Merge |beg|..|end| into |container|.  Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
  DCHECK(sorted(container->begin(), container->end(), less));
  DCHECK(sorted(beg, end, less));

  // Size the container to fit the results.
  const size_t c_size = container->size();
  container->resize(c_size + (end - beg));

  // |c_end| points to the original endpoint, while |c_out| points to the
  // endpoint that will scan from end to beginning while merging.
  typename CT::iterator c_end = container->begin() + c_size;
  typename CT::iterator c_out = container->end();

  // While both inputs have data, move the greater to |c_out|.
  while (c_end != container->begin() && end != beg) {
    if (less(*(c_end - 1), *(end - 1))) {
      *(--c_out) = *(--end);
    } else {
      *(--c_out) = *(--c_end);
    }
  }

  // Copy any data remaining in the new range.
  if (end != beg) {
    // The original container data has been fully shifted.
    DCHECK(c_end == container->begin());

    // There is exactly the correct amount of space left.
    DCHECK_EQ(c_out - c_end, end - beg);

    std::copy(beg, end, container->begin());
  }

  DCHECK(sorted(container->begin(), container->end(), less));
}

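// Illustrative use (hypothetical values, not part of the store): merging a
// sorted update range into already-sorted data, as MergeDataAndProcess()
// does below.
//
//   std::vector<int> data = {1, 3, 5};
//   const std::vector<int> update = {2, 4, 6};
//   container_merge(&data, update.begin(), update.end(), std::less<int>());
//   // data is now {1, 2, 3, 4, 5, 6}, with the merge done back-to-front in
//   // the space created by the resize(), so no scratch buffer is needed.
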
// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
 public:
  StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
                   SBSubPrefixes::iterator sub_prefixes_iter,
                   std::vector<SBAddFullHash>::iterator add_hashes_iter,
                   std::vector<SBSubFullHash>::iterator sub_hashes_iter)
      : add_prefixes_iter_(add_prefixes_iter),
        sub_prefixes_iter_(sub_prefixes_iter),
        add_hashes_iter_(add_hashes_iter),
        sub_hashes_iter_(sub_hashes_iter) {
  }

  SBAddPrefixes::iterator add_prefixes_iter_;
  SBSubPrefixes::iterator sub_prefixes_iter_;
  std::vector<SBAddFullHash>::iterator add_hashes_iter_;
  std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};

// Helper to find the next shard boundary.
template <typename T>
bool prefix_bounder(SBPrefix val, const T& elt) {
  return val < elt.GetAddPrefix();
}

// Container for partial database state.  Includes add/sub prefixes/hashes,
// plus aggregate operations on same.
class StateInternal {
 public:
  // Append indicated amount of data from |fp|.
  bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
                  size_t add_hash_count, size_t sub_hash_count,
                  FILE* fp, base::MD5Context* context) {
    return
        ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
        ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
        ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
        ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
  }

  void ClearData() {
    add_prefixes_.clear();
    sub_prefixes_.clear();
    add_full_hashes_.clear();
    sub_full_hashes_.clear();
  }

  // Merge data from |beg|..|end| into receiver's state, then process the
  // state.  The current state and the range given should correspond to the
  // same sorted shard of data from different sources.  |add_del_cache| and
  // |sub_del_cache| indicate the chunk ids which should be deleted during
  // processing (see SBProcessSubs()).
  void MergeDataAndProcess(const StateInternalPos& beg,
                           const StateInternalPos& end,
                           const base::hash_set<int32>& add_del_cache,
                           const base::hash_set<int32>& sub_del_cache) {
    container_merge(&add_prefixes_,
                    beg.add_prefixes_iter_,
                    end.add_prefixes_iter_,
                    SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);

    container_merge(&sub_prefixes_,
                    beg.sub_prefixes_iter_,
                    end.sub_prefixes_iter_,
                    SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);

    container_merge(&add_full_hashes_,
                    beg.add_hashes_iter_,
                    end.add_hashes_iter_,
                    SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);

    container_merge(&sub_full_hashes_,
                    beg.sub_hashes_iter_,
                    end.sub_hashes_iter_,
                    SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);

    SBProcessSubs(&add_prefixes_, &sub_prefixes_,
                  &add_full_hashes_, &sub_full_hashes_,
                  add_del_cache, sub_del_cache);
  }

  // Sort the data appropriately for the sharding, merging, and processing
  // operations.
  void SortData() {
    std::sort(add_prefixes_.begin(), add_prefixes_.end(),
              SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);
    std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
              SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);
    std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
              SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);
    std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
              SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
  }

  // Iterator from the beginning of the state's data.
  StateInternalPos StateBegin() {
    return StateInternalPos(add_prefixes_.begin(),
                            sub_prefixes_.begin(),
                            add_full_hashes_.begin(),
                            sub_full_hashes_.begin());
  }

  // An iterator pointing just after the last possible element of the shard
  // indicated by |shard_max|.  Used to step through the state by shard.
  // TODO(shess): Verify whether binary search really improves over linear.
  // Merging or writing will immediately touch all of these elements.
  StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    return StateInternalPos(
        std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
                         shard_max, prefix_bounder<SBAddPrefix>),
        std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
                         shard_max, prefix_bounder<SBSubPrefix>),
        std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
                         shard_max, prefix_bounder<SBAddFullHash>),
        std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
                         shard_max, prefix_bounder<SBSubFullHash>));
  }

  // Write a shard header and data for the shard starting at |beg| and ending
  // at the element before |end|.
  bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
                  FILE* fp, base::MD5Context* context) {
    ShardHeader shard_header;
    shard_header.add_prefix_count =
        end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    shard_header.sub_prefix_count =
        end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    shard_header.add_hash_count =
        end.add_hashes_iter_ - beg.add_hashes_iter_;
    shard_header.sub_hash_count =
        end.sub_hashes_iter_ - beg.sub_hashes_iter_;

    return
        WriteItem(shard_header, fp, context) &&
        WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
                   fp, context);
  }

  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_full_hashes_;
  std::vector<SBSubFullHash> sub_full_hashes_;
};

// True if |val| is an even power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  return val && (val & (val - 1)) == 0;
}

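// For example, 8 is 1000b and 7 is 0111b, so 8 & 7 == 0 and IsPowerOfTwo(8)
// is true; 6 is 0110b and 5 is 0101b, so 6 & 5 == 0100b != 0 and
// IsPowerOfTwo(6) is false.  Zero is explicitly excluded by the |val &&|.
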
// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes().  Those functions are generally used only for smaller
// files.  Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
                       StateInternal* db_state) {
  base::ScopedFILE file(base::OpenFile(filename, "rb"));
  if (file.get() == NULL)
    return false;

  std::set<int32> add_chunks;
  std::set<int32> sub_chunks;

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
                          file.get(), &context);
  if (version == kInvalidVersion)
    return false;

  uint64 in_stride = header.shard_stride;
  if (!in_stride)
    in_stride = kMaxShardStride;
  if (!IsPowerOfTwo(in_stride))
    return false;

  // Read shards of data until the entire SBPrefix space has been covered.
  uint64 in_min = 0;
  do {
    ShardHeader shard_header;
    if (!ReadItem(&shard_header, file.get(), &context))
      return false;

    if (!db_state->AppendData(shard_header.add_prefix_count,
                              shard_header.sub_prefix_count,
                              shard_header.add_hash_count,
                              shard_header.sub_hash_count,
                              file.get(), &context)) {
      return false;
    }

    in_min += in_stride;
  } while (in_min <= kMaxSBPrefix);

  if (!ReadAndVerifyChecksum(file.get(), &context))
    return false;

  // The entire file should have been consumed.
  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;
  return static_cast<int64>(ftell(file.get())) == size;
}

SafeBrowsingStoreFile::SafeBrowsingStoreFile(
    const scoped_refptr<const base::SequencedTaskRunner>& task_runner)
    : task_runner_(task_runner),
      chunks_written_(0),
      empty_(false),
      corruption_seen_(false) {
}

SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
  // Thread-checking is disabled in the destructor due to crbug.com/338486.
  task_runner_ = nullptr;
}

bool SafeBrowsingStoreFile::CalledOnValidThread() {
  return !task_runner_ || task_runner_->RunsTasksOnCurrentThread();
}

bool SafeBrowsingStoreFile::Delete() {
  DCHECK(CalledOnValidThread());

  // The database should not be open at this point.  But, just in
  // case, close everything before deleting.
  if (!Close())
    return false;

  return DeleteStore(filename_);
}

bool SafeBrowsingStoreFile::CheckValidity() {
  DCHECK(CalledOnValidThread());

  // The file was either empty or never opened.  The empty case is
  // presumed not to be invalid.  The never-opened case can happen if
  // BeginUpdate() fails for any databases, and should already have
  // caused the corruption callback to fire.
  if (!file_.get())
    return true;

  if (!FileRewind(file_.get()))
    return OnCorruptDatabase();

  int64 size = 0;
  if (!base::GetFileSize(filename_, &size))
    return OnCorruptDatabase();

  base::MD5Context context;
  base::MD5Init(&context);

  // Read everything except the final digest.
  size_t bytes_left = static_cast<size_t>(size);
  CHECK(size == static_cast<int64>(bytes_left));
  if (bytes_left < sizeof(base::MD5Digest))
    return OnCorruptDatabase();
  bytes_left -= sizeof(base::MD5Digest);

  // Fold the contents of the file into the checksum.
  while (bytes_left > 0) {
    char buf[4096];
    const size_t c = std::min(sizeof(buf), bytes_left);
    const size_t ret = fread(buf, 1, c, file_.get());

    // The file's size changed while reading, give up.
    if (ret != c)
      return OnCorruptDatabase();
    base::MD5Update(&context, base::StringPiece(buf, c));
    bytes_left -= c;
  }

  if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    return OnCorruptDatabase();
  }

  return true;
}

void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
                                 const base::Closure& corruption_callback) {
  DCHECK(CalledOnValidThread());
  filename_ = filename;
  corruption_callback_ = corruption_callback;
}

bool SafeBrowsingStoreFile::BeginChunk() {
  DCHECK(CalledOnValidThread());
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
  DCHECK(CalledOnValidThread());
  add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
  DCHECK(CalledOnValidThread());

  add_prefixes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_prefixes->swap(db_state.add_prefixes_);
  return true;
}

bool SafeBrowsingStoreFile::GetAddFullHashes(
    std::vector<SBAddFullHash>* add_full_hashes) {
  DCHECK(CalledOnValidThread());

  add_full_hashes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_full_hashes->swap(db_state.add_full_hashes_);
  return true;
}

bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
                                         const SBFullHash& full_hash) {
  DCHECK(CalledOnValidThread());
  add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
                                           int32 add_chunk_id,
                                           SBPrefix prefix) {
  DCHECK(CalledOnValidThread());
  sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                                         const SBFullHash& full_hash) {
  DCHECK(CalledOnValidThread());
  sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::OnCorruptDatabase() {
  DCHECK(CalledOnValidThread());

  if (!corruption_seen_)
    RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
  corruption_seen_ = true;

  corruption_callback_.Run();

  // Return false as a convenience to callers.
  return false;
}

bool SafeBrowsingStoreFile::Close() {
  DCHECK(CalledOnValidThread());

  ClearUpdateBuffers();

  // Make sure the files are closed.
  file_.reset();
  new_file_.reset();
  return true;
}

bool SafeBrowsingStoreFile::BeginUpdate() {
  DCHECK(CalledOnValidThread());
  DCHECK(!file_.get() && !new_file_.get());

  // Structures should all be clear unless something bad happened.
  DCHECK(add_chunks_cache_.empty());
  DCHECK(sub_chunks_cache_.empty());
  DCHECK(add_del_cache_.empty());
  DCHECK(sub_del_cache_.empty());
  DCHECK(add_prefixes_.empty());
  DCHECK(sub_prefixes_.empty());
  DCHECK(add_hashes_.empty());
  DCHECK(sub_hashes_.empty());
  DCHECK_EQ(chunks_written_, 0);

  corruption_seen_ = false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
  if (new_file.get() == NULL)
    return false;

  base::ScopedFILE file(base::OpenFile(filename_, "rb"));
  empty_ = (file.get() == NULL);
  if (empty_) {
    // If the file exists but cannot be opened, try to delete it (not
    // deleting directly, the bloom filter needs to be deleted, too).
    if (base::PathExists(filename_))
      return OnCorruptDatabase();

    new_file_.swap(new_file);
    return true;
  }

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename_, &header,
                          &add_chunks_cache_, &sub_chunks_cache_,
                          file.get(), &context);
  if (version == kInvalidVersion) {
    FileHeader retry_header;
    if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
      if (retry_header.magic == kFileMagic &&
          retry_header.version < kFileVersion) {
        RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
      } else {
        RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
      }
    }

    // Close the file so that it can be deleted.
    file.reset();

    return OnCorruptDatabase();
  }

  file_.swap(file);
  new_file_.swap(new_file);
  return true;
}

bool SafeBrowsingStoreFile::FinishChunk() {
  DCHECK(CalledOnValidThread());

  if (!add_prefixes_.size() && !sub_prefixes_.size() &&
      !add_hashes_.size() && !sub_hashes_.size())
    return true;

  ChunkHeader header;
  header.add_prefix_count = add_prefixes_.size();
  header.sub_prefix_count = sub_prefixes_.size();
  header.add_hash_count = add_hashes_.size();
  header.sub_hash_count = sub_hashes_.size();
  if (!WriteItem(header, new_file_.get(), NULL))
    return false;

  if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    return false;

  ++chunks_written_;

  // Clear everything to save memory.
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::DoUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(CalledOnValidThread());
  DCHECK(file_.get() || empty_);
  DCHECK(new_file_.get());
  CHECK(add_full_hashes_result);

  // Rewind the temporary storage.
  if (!FileRewind(new_file_.get()))
    return false;

  // Get chunk file's size for validating counts.
  int64 update_size = 0;
  if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    return OnCorruptDatabase();

  // Track update size to answer questions at http://crbug.com/72216 .
  // Log small updates as 1k so that the 0 (underflow) bucket can be
  // used for "empty" in SafeBrowsingDatabase.
  UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
                       std::max(static_cast<int>(update_size / 1024), 1));

  // Chunk updates to integrate.
  StateInternal new_state;

  // Read update chunks.
  for (int i = 0; i < chunks_written_; ++i) {
    ChunkHeader header;

    int64 ofs = ftell(new_file_.get());
    if (ofs == -1)
      return false;

    if (!ReadItem(&header, new_file_.get(), NULL))
      return false;

    // As a safety measure, make sure that the header describes a sane
    // chunk, given the remaining file size.
    int64 expected_size = ofs + sizeof(ChunkHeader);
    expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    if (expected_size > update_size)
      return false;

    if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
                              header.add_hash_count, header.sub_hash_count,
                              new_file_.get(), NULL)) {
      return false;
    }
  }

  // The state was accumulated by chunk, sort by prefix.
  new_state.SortData();

  // These strides control how much data is loaded into memory per pass.
  // Strides must be an even power of two.  |in_stride| will be derived from the
  // input file.  |out_stride| will be derived from an estimate of the resulting
  // file's size.  |process_stride| will be the max of both.
  uint64 in_stride = kMaxShardStride;
  uint64 out_stride = kMaxShardStride;
  uint64 process_stride = 0;

  // Used to verify the input's checksum if |!empty_|.
  base::MD5Context in_context;

  if (!empty_) {
    FileHeader header = {0};
    int version = ReadAndVerifyHeader(filename_, &header,
                                      &add_chunks_cache_, &sub_chunks_cache_,
                                      file_.get(), &in_context);
    if (version == kInvalidVersion)
      return OnCorruptDatabase();

    if (header.shard_stride)
      in_stride = header.shard_stride;

    // The header checksum should have prevented this case, but the code will be
    // broken if this is not correct.
    if (!IsPowerOfTwo(in_stride))
      return OnCorruptDatabase();
  }

  // We no longer need to track deleted chunks.
  DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
  DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);

  // Calculate |out_stride| to break the file down into reasonable shards.
  {
    int64 original_size = 0;
    if (!empty_ && !base::GetFileSize(filename_, &original_size))
      return OnCorruptDatabase();

    // Approximate the final size as everything.  Subs and deletes will reduce
    // the size, but modest over-sharding won't hurt much.
    int64 shard_size = original_size + update_size;

    // Keep splitting until a single stride of data fits the target.
    int shifts = 0;
    while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
      out_stride >>= 1;
      shard_size >>= 1;
      ++shifts;
    }
    UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);

    DCHECK(IsPowerOfTwo(out_stride));
  }

  // Outer loop strides by the max of the input stride (to read integral shards)
  // and the output stride (to write integral shards).
  process_stride = std::max(in_stride, out_stride);
  DCHECK(IsPowerOfTwo(process_stride));
  DCHECK_EQ(0u, process_stride % in_stride);
  DCHECK_EQ(0u, process_stride % out_stride);

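  // Illustrative numbers (assumed, not taken from real data): if the existing
  // file was written with in_stride == 1 << 26 while the new estimate only
  // needs out_stride == 1 << 30, then process_stride == 1 << 30, and each
  // outer pass reads 16 input shards (2^30 / 2^26) but writes one output
  // shard.  If the relationship is reversed, a pass reads a single input
  // shard and writes several smaller output shards.
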
  // Start writing the new data to |new_file_|.
  base::MD5Context out_context;
  if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
                   new_file_.get(), &out_context)) {
    return false;
  }

  // Start at the beginning of the SBPrefix space.
  uint64 in_min = 0;
  uint64 out_min = 0;
  uint64 process_min = 0;

  // Start at the beginning of the updates.
  StateInternalPos new_pos = new_state.StateBegin();

  // Re-usable container for shard processing.
  StateInternal db_state;

  // Track aggregate counts for histograms.
  size_t add_prefix_count = 0;
  size_t sub_prefix_count = 0;

  do {
    // Maximum element in the current shard.
    SBPrefix process_max =
        static_cast<SBPrefix>(process_min + process_stride - 1);
    DCHECK_GT(process_max, process_min);

    // Drop the data from previous pass.
    db_state.ClearData();

    // Fill the processing shard with one or more input shards.
    if (!empty_) {
      do {
        ShardHeader shard_header;
        if (!ReadItem(&shard_header, file_.get(), &in_context))
          return OnCorruptDatabase();

        if (!db_state.AppendData(shard_header.add_prefix_count,
                                 shard_header.sub_prefix_count,
                                 shard_header.add_hash_count,
                                 shard_header.sub_hash_count,
                                 file_.get(), &in_context))
          return OnCorruptDatabase();

        in_min += in_stride;
      } while (in_min <= kMaxSBPrefix && in_min < process_max);
    }

    // Shard the update data to match the database data, then merge the update
    // data and process the results.
    StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
    db_state.MergeDataAndProcess(new_pos, new_end,
                                 add_del_cache_, sub_del_cache_);
    new_pos = new_end;

    // Collect the processed data for return to caller.
    for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
      builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    }
    add_full_hashes_result->insert(add_full_hashes_result->end(),
                                   db_state.add_full_hashes_.begin(),
                                   db_state.add_full_hashes_.end());
    add_prefix_count += db_state.add_prefixes_.size();
    sub_prefix_count += db_state.sub_prefixes_.size();

    // Write one or more shards of processed output.
    StateInternalPos out_pos = db_state.StateBegin();
    do {
      SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
      DCHECK_GT(out_max, out_min);

      StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
      if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
        return false;
      out_pos = out_end;

      out_min += out_stride;
    } while (out_min == static_cast<SBPrefix>(out_min) &&
             out_min < process_max);

    process_min += process_stride;
  } while (process_min <= kMaxSBPrefix);

  // Verify the overall checksum.
  if (!empty_) {
    if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
      RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
      return OnCorruptDatabase();
    }

    // TODO(shess): Verify EOF?

    // Close the input file so the new file can be renamed over it.
    file_.reset();
  }
  DCHECK(!file_.get());

  // Write the overall checksum.
  base::MD5Digest out_digest;
  base::MD5Final(&out_digest, &out_context);
  if (!WriteItem(out_digest, new_file_.get(), NULL))
    return false;

  // Trim any excess left over from the temporary chunk data.
  if (!base::TruncateFile(new_file_.get()))
    return false;

  // Close the file handle and swizzle the file into place.
  new_file_.reset();
  if (!base::DeleteFile(filename_, false) &&
      base::PathExists(filename_))
    return false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  if (!base::Move(new_filename, filename_))
    return false;

  // Record counts before swapping to caller.
  UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
  UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);

  return true;
}

bool SafeBrowsingStoreFile::FinishUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(CalledOnValidThread());
  DCHECK(add_full_hashes_result);

  if (!DoUpdate(builder, add_full_hashes_result)) {
    CancelUpdate();
    return false;
  }

  DCHECK(!new_file_.get());
  DCHECK(!file_.get());
  return Close();
}

bool SafeBrowsingStoreFile::CancelUpdate() {
  DCHECK(CalledOnValidThread());
  bool ret = Close();

  // Delete stale staging file.
  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::DeleteFile(new_filename, false);

  return ret;
}

void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  add_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  return add_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
  DCHECK(CalledOnValidThread());
  out->clear();
  out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}

void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  sub_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  return sub_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
  DCHECK(CalledOnValidThread());
  out->clear();
  out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}

void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  add_del_cache_.insert(chunk_id);
}

void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  sub_del_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
  if (!base::DeleteFile(basename, false) &&
      base::PathExists(basename)) {
    return false;
  }

  const base::FilePath new_filename = TemporaryFileForFilename(basename);
  if (!base::DeleteFile(new_filename, false) &&
      base::PathExists(new_filename)) {
    return false;
  }

  // With SQLite support gone, one way to get to this code is if the
  // existing file is a SQLite file.  Make sure the journal file is
  // also deleted.
  const base::FilePath journal_filename(
      basename.value() + FILE_PATH_LITERAL("-journal"));
  if (base::PathExists(journal_filename))
    base::DeleteFile(journal_filename, false);

  return true;
}