chrome/browser/safe_browsing/safe_browsing_store_file.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/browser/safe_browsing/safe_browsing_store_file.h"
   6
   7 #include "base/files/file_util.h"
   8 #include "base/files/scoped_file.h"
   9 #include "base/md5.h"
  10 #include "base/metrics/histogram.h"
  11 #include "base/metrics/sparse_histogram.h"
  12
  13 namespace {
  14
  15 // NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
  16 // that byte-order changes force corruption.
  17 const int32 kFileMagic = 0x600D71FE;
  18
  19 // Version history:
  20 // Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
  21 // Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
  22 // Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
  23 const int32 kFileVersion = 8;
  24
  25 // ReadAndVerifyHeader() returns this in case of error.
  26 const int32 kInvalidVersion = -1;
  27
  28 // Starting with version 8, the storage is sorted and can be sharded to allow
  29 // updates to be done with lower memory requirements.  Newly written files will
  30 // be sharded to need less than this amount of memory during update.  Larger
  31 // values are preferred to minimize looping overhead during processing.
  32 const int64 kUpdateStorageBytes = 100 * 1024;
  33
  34 // Prevent excessive sharding by setting a lower limit on the shard stride.
  35 // Smaller values should work fine, but very small values will probably lead to
  36 // poor performance.  Shard stride is indirectly related to
  37 // |kUpdateStorageBytes|, setting that very small will bump against this.
  38 const uint32 kMinShardStride = 1 << 24;
  39
  40 // Strides over the entire SBPrefix space.
  41 const uint64 kMaxShardStride = 1ULL << 32;
  42
  43 // Maximum SBPrefix value.
  44 const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;
  45
  46 // Header at the front of the main database file.
  47 struct FileHeader {
  48   int32 magic, version;
  49   uint32 add_chunk_count, sub_chunk_count;
  50   uint32 shard_stride;
  51   // TODO(shess): Is this where 64-bit will bite me?  Perhaps write a
  52   // specialized read/write?
  53 };
  54
  55 // Header for each chunk in the chunk-accumulation file.
  56 struct ChunkHeader {
  57   uint32 add_prefix_count, sub_prefix_count;
  58   uint32 add_hash_count, sub_hash_count;
  59 };
  60
  61 // Header for each shard of data in the main database file.
  62 struct ShardHeader {
  63   uint32 add_prefix_count, sub_prefix_count;
  64   uint32 add_hash_count, sub_hash_count;
  65 };
  66
  67 // Enumerate different format-change events for histogramming
  68 // purposes.  DO NOT CHANGE THE ORDERING OF THESE VALUES.
  69 enum FormatEventType {
  70   // Corruption detected, broken down by file format.
  71   FORMAT_EVENT_FILE_CORRUPT,
  72   FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete
  73
  74   // The type of format found in the file.  The expected case (new
  75   // file format) is intentionally not covered.
  76   FORMAT_EVENT_FOUND_SQLITE,  // Obsolete
  77   FORMAT_EVENT_FOUND_UNKNOWN,  // magic does not match.
  78
  79   // The number of SQLite-format files deleted should be the same as
  80   // FORMAT_EVENT_FOUND_SQLITE.  It can differ if the delete fails,
  81   // or if a failure prevents the update from succeeding.
  82   FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  83   FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete
  84
  85   // Found and deleted (or failed to delete) the ancient "Safe
  86   // Browsing" file.
  87   FORMAT_EVENT_DELETED_ORIGINAL,  // Obsolete
  88   FORMAT_EVENT_DELETED_ORIGINAL_FAILED,  // Obsolete
  89
  90   // The checksum did not check out in CheckValidity() or in
  91   // FinishUpdate().  This most likely indicates that the machine
  92   // crashed before the file was fully sync'ed to disk.
  93   FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  94   FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,
  95
  96   // The header checksum was incorrect in ReadAndVerifyHeader().  Likely
  97   // indicates that the system crashed while writing an update.
  98   FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,
  99
 100   FORMAT_EVENT_FOUND_DEPRECATED,  // version too old.
 101
 102   // Memory space for histograms is determined by the max.  ALWAYS
 103   // ADD NEW VALUES BEFORE THIS ONE.
 104   FORMAT_EVENT_MAX
 105 };
 106
 107 void RecordFormatEvent(FormatEventType event_type) {
 108   UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
 109 }
 110
 111 // Rewind the file.  Using fseek(2) because rewind(3) errors are
 112 // weird.
 113 bool FileRewind(FILE* fp) {
 114   int rv = fseek(fp, 0, SEEK_SET);
 115   DCHECK_EQ(rv, 0);
 116   return rv == 0;
 117 }
 118
 119 // Read from |fp| into |item|, and fold the input data into the
 120 // checksum in |context|, if non-NULL.  Return true on success.
 121 template <class T>
 122 bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
 123   const size_t ret = fread(item, sizeof(T), 1, fp);
 124   if (ret != 1)
 125     return false;
 126
 127   if (context) {
 128     base::MD5Update(context,
 129                     base::StringPiece(reinterpret_cast<char*>(item),
 130                                       sizeof(T)));
 131   }
 132   return true;
 133 }
 134
 135 // Write |item| to |fp|, and fold the output data into the checksum in
 136 // |context|, if non-NULL.  Return true on success.
 137 template <class T>
 138 bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
 139   const size_t ret = fwrite(&item, sizeof(T), 1, fp);
 140   if (ret != 1)
 141     return false;
 142
 143   if (context) {
 144     base::MD5Update(context,
 145                     base::StringPiece(reinterpret_cast<const char*>(&item),
 146                                       sizeof(T)));
 147   }
 148
 149   return true;
 150 }
 151
 152 // Read |count| items into |values| from |fp|, and fold them into the
 153 // checksum in |context|.  Returns true on success.
 154 template <typename CT>
 155 bool ReadToContainer(CT* values, size_t count, FILE* fp,
 156                      base::MD5Context* context) {
 157   if (!count)
 158     return true;
 159
 160   for (size_t i = 0; i < count; ++i) {
 161     typename CT::value_type value;
 162     if (!ReadItem(&value, fp, context))
 163       return false;
 164
 165     // push_back() is more obvious, but coded this way std::set can
 166     // also be read.
 167     values->insert(values->end(), value);
 168   }
 169
 170   return true;
 171 }
 172
 173 // Write values between |beg| and |end| to |fp|, and fold the data into the
 174 // checksum in |context|, if non-NULL.  Returns true if all items successful.
 175 template <typename CTI>
 176 bool WriteRange(const CTI& beg, const CTI& end,
 177                 FILE* fp, base::MD5Context* context) {
 178   for (CTI iter = beg; iter != end; ++iter) {
 179     if (!WriteItem(*iter, fp, context))
 180       return false;
 181   }
 182   return true;
 183 }
 184
 185 // Write all of |values| to |fp|, and fold the data into the checksum
 186 // in |context|, if non-NULL.  Returns true if all items successful.
 187 template <typename CT>
 188 bool WriteContainer(const CT& values, FILE* fp,
 189                     base::MD5Context* context) {
 190   return WriteRange(values.begin(), values.end(), fp, context);
 191 }
 192
 193 // Delete the chunks in |deleted| from |chunks|.
 194 void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
 195                          std::set<int32>* chunks) {
 196   for (std::set<int32>::iterator iter = chunks->begin();
 197        iter != chunks->end();) {
 198     std::set<int32>::iterator prev = iter++;
 199     if (deleted.count(*prev) > 0)
 200       chunks->erase(prev);
 201   }
 202 }
 203
 204 bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
 205   base::MD5Digest calculated_digest;
 206   base::MD5IntermediateFinal(&calculated_digest, context);
 207
 208   base::MD5Digest file_digest;
 209   if (!ReadItem(&file_digest, fp, context))
 210     return false;
 211
 212   return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
 213 }
 214
 215 // Helper function to read the file header and chunk TOC.  Rewinds |fp| and
 216 // initializes |context|.  The header is left in |header|, with the version
 217 // returned.  kInvalidVersion is returned for sanity check or checksum failure.
 218 int ReadAndVerifyHeader(const base::FilePath& filename,
 219                         FileHeader* header,
 220                         std::set<int32>* add_chunks,
 221                         std::set<int32>* sub_chunks,
 222                         FILE* fp,
 223                         base::MD5Context* context) {
 224   DCHECK(header);
 225   DCHECK(add_chunks);
 226   DCHECK(sub_chunks);
 227   DCHECK(fp);
 228   DCHECK(context);
 229
 230   base::MD5Init(context);
 231   if (!FileRewind(fp))
 232     return kInvalidVersion;
 233   if (!ReadItem(header, fp, context))
 234     return kInvalidVersion;
 235   if (header->magic != kFileMagic)
 236     return kInvalidVersion;
 237
 238   // Track version read to inform removal of support for older versions.
 239   UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);
 240
 241   if (header->version != kFileVersion)
 242     return kInvalidVersion;
 243
 244   if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
 245       !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
 246     return kInvalidVersion;
 247   }
 248
 249   // Verify that the data read thus far is valid.
 250   if (!ReadAndVerifyChecksum(fp, context)) {
 251     RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
 252     return kInvalidVersion;
 253   }
 254
 255   return kFileVersion;
 256 }
 257
 258 // Helper function to write out the initial header and chunks-contained data.
 259 // Rewinds |fp|, initializes |context|, then writes a file header and
 260 // |add_chunks| and |sub_chunks|.
 261 bool WriteHeader(uint32 out_stride,
 262                  const std::set<int32>& add_chunks,
 263                  const std::set<int32>& sub_chunks,
 264                  FILE* fp,
 265                  base::MD5Context* context) {
 266   if (!FileRewind(fp))
 267     return false;
 268
 269   base::MD5Init(context);
 270   FileHeader header;
 271   header.magic = kFileMagic;
 272   header.version = kFileVersion;
 273   header.add_chunk_count = add_chunks.size();
 274   header.sub_chunk_count = sub_chunks.size();
 275   header.shard_stride = out_stride;
 276   if (!WriteItem(header, fp, context))
 277     return false;
 278
 279   if (!WriteContainer(add_chunks, fp, context) ||
 280       !WriteContainer(sub_chunks, fp, context))
 281     return false;
 282
 283   // Write out the header digest.
 284   base::MD5Digest header_digest;
 285   base::MD5IntermediateFinal(&header_digest, context);
 286   if (!WriteItem(header_digest, fp, context))
 287     return false;
 288
 289   return true;
 290 }
 291
 292 // Return |true| if the range is sorted by the given comparator.
 293 template <typename CTI, typename LESS>
 294 bool sorted(CTI beg, CTI end, LESS less) {
 295   while ((end - beg) > 2) {
 296     CTI n = beg++;
 297     DCHECK(!less(*beg, *n));
 298     if (less(*beg, *n))
 299       return false;
 300   }
 301   return true;
 302 }
 303
 304 // Merge |beg|..|end| into |container|.  Both should be sorted by the given
 305 // comparator, and the range iterators should not be derived from |container|.
 306 // Differs from std::inplace_merge() in that additional memory is not required
 307 // for linear performance.
 308 template <typename CT, typename CTI, typename COMP>
 309 void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
 310   DCHECK(sorted(container->begin(), container->end(), less));
 311   DCHECK(sorted(beg, end, less));
 312
 313   // Size the container to fit the results.
 314   const size_t c_size = container->size();
 315   container->resize(c_size + (end - beg));
 316
 317   // |c_end| points to the original endpoint, while |c_out| points to the
 318   // endpoint that will scan from end to beginning while merging.
 319   typename CT::iterator c_end = container->begin() + c_size;
 320   typename CT::iterator c_out = container->end();
 321
 322   // While both inputs have data, move the greater to |c_out|.
 323   while (c_end != container->begin() && end != beg) {
 324     if (less(*(c_end - 1), *(end - 1))) {
 325       *(--c_out) = *(--end);
 326     } else {
 327       *(--c_out) = *(--c_end);
 328     }
 329   }
 330
 331   // Copy any data remaining in the new range.
 332   if (end != beg) {
 333     // The original container data has been fully shifted.
 334     DCHECK(c_end == container->begin());
 335
 336     // There is exactly the correct amount of space left.
 337     DCHECK_EQ(c_out - c_end, end - beg);
 338
 339     std::copy(beg, end, container->begin());
 340   }
 341
 342   DCHECK(sorted(container->begin(), container->end(), less));
 343 }
 344
 345 // Collection of iterators used while stepping through StateInternal (see
 346 // below).
 347 class StateInternalPos {
 348  public:
 349   StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
 350                    SBSubPrefixes::iterator sub_prefixes_iter,
 351                    std::vector<SBAddFullHash>::iterator add_hashes_iter,
 352                    std::vector<SBSubFullHash>::iterator sub_hashes_iter)
 353       : add_prefixes_iter_(add_prefixes_iter),
 354         sub_prefixes_iter_(sub_prefixes_iter),
 355         add_hashes_iter_(add_hashes_iter),
 356         sub_hashes_iter_(sub_hashes_iter) {
 357   }
 358
 359   SBAddPrefixes::iterator add_prefixes_iter_;
 360   SBSubPrefixes::iterator sub_prefixes_iter_;
 361   std::vector<SBAddFullHash>::iterator add_hashes_iter_;
 362   std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
 363 };
 364
 365 // Helper to find the next shard boundary.
 366 template <class T>
 367 bool prefix_bounder(SBPrefix val, const T& elt) {
 368   return val < elt.GetAddPrefix();
 369 }
 370
 371 // Container for partial database state.  Includes add/sub prefixes/hashes, plus
 372 // aggregate operations on same.
 373 class StateInternal {
 374  public:
 375   // Append indicated amount of data from |fp|.
 376   bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
 377                   size_t add_hash_count, size_t sub_hash_count,
 378                   FILE* fp, base::MD5Context* context) {
 379     return
 380         ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
 381         ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
 382         ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
 383         ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
 384   }
 385
 386   void ClearData() {
 387     add_prefixes_.clear();
 388     sub_prefixes_.clear();
 389     add_full_hashes_.clear();
 390     sub_full_hashes_.clear();
 391   }
 392
 393   // Merge data from |beg|..|end| into receiver's state, then process the state.
 394   // The current state and the range given should corrospond to the same sorted
 395   // shard of data from different sources.  |add_del_cache| and |sub_del_cache|
 396   // indicate the chunk ids which should be deleted during processing (see
 397   // SBProcessSubs).
 398   void MergeDataAndProcess(const StateInternalPos& beg,
 399                            const StateInternalPos& end,
 400                            const base::hash_set<int32>& add_del_cache,
 401                            const base::hash_set<int32>& sub_del_cache) {
 402     container_merge(&add_prefixes_,
 403                     beg.add_prefixes_iter_,
 404                     end.add_prefixes_iter_,
 405                     SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
 406
 407     container_merge(&sub_prefixes_,
 408                     beg.sub_prefixes_iter_,
 409                     end.sub_prefixes_iter_,
 410                     SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
 411
 412     container_merge(&add_full_hashes_,
 413                     beg.add_hashes_iter_,
 414                     end.add_hashes_iter_,
 415                     SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
 416
 417     container_merge(&sub_full_hashes_,
 418                     beg.sub_hashes_iter_,
 419                     end.sub_hashes_iter_,
 420                     SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
 421
 422     SBProcessSubs(&add_prefixes_, &sub_prefixes_,
 423                   &add_full_hashes_, &sub_full_hashes_,
 424                   add_del_cache, sub_del_cache);
 425   }
 426
 427   // Sort the data appropriately for the sharding, merging, and processing
 428   // operations.
 429   void SortData() {
 430     std::sort(add_prefixes_.begin(), add_prefixes_.end(),
 431               SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
 432     std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
 433               SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
 434     std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
 435               SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
 436     std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
 437               SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
 438   }
 439
 440   // Iterator from the beginning of the state's data.
 441   StateInternalPos StateBegin() {
 442     return StateInternalPos(add_prefixes_.begin(),
 443                             sub_prefixes_.begin(),
 444                             add_full_hashes_.begin(),
 445                             sub_full_hashes_.begin());
 446   }
 447
 448   // An iterator pointing just after the last possible element of the shard
 449   // indicated by |shard_max|.  Used to step through the state by shard.
 450   // TODO(shess): Verify whether binary search really improves over linear.
 451   // Merging or writing will immediately touch all of these elements.
 452   StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
 453     return StateInternalPos(
 454         std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
 455                          shard_max, prefix_bounder<SBAddPrefix>),
 456         std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
 457                          shard_max, prefix_bounder<SBSubPrefix>),
 458         std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
 459                          shard_max, prefix_bounder<SBAddFullHash>),
 460         std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
 461                          shard_max, prefix_bounder<SBSubFullHash>));
 462   }
 463
 464   // Write a shard header and data for the shard starting at |beg| and ending at
 465   // the element before |end|.
 466   bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
 467                   FILE* fp, base::MD5Context* context) {
 468     ShardHeader shard_header;
 469     shard_header.add_prefix_count =
 470         end.add_prefixes_iter_ - beg.add_prefixes_iter_;
 471     shard_header.sub_prefix_count =
 472         end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
 473     shard_header.add_hash_count =
 474         end.add_hashes_iter_ - beg.add_hashes_iter_;
 475     shard_header.sub_hash_count =
 476         end.sub_hashes_iter_ - beg.sub_hashes_iter_;
 477
 478     return
 479         WriteItem(shard_header, fp, context) &&
 480         WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
 481                    fp, context) &&
 482         WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
 483                    fp, context) &&
 484         WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
 485                    fp, context) &&
 486         WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
 487                    fp, context);
 488   }
 489
 490   SBAddPrefixes add_prefixes_;
 491   SBSubPrefixes sub_prefixes_;
 492   std::vector<SBAddFullHash> add_full_hashes_;
 493   std::vector<SBSubFullHash> sub_full_hashes_;
 494 };
 495
 496 // True if |val| is an even power of two.
 497 template <typename T>
 498 bool IsPowerOfTwo(const T& val) {
 499   return val && (val & (val - 1)) == 0;
 500 }
 501
 502 // Helper to read the entire database state, used by GetAddPrefixes() and
 503 // GetAddFullHashes().  Those functions are generally used only for smaller
 504 // files.  Returns false in case of errors reading the data.
 505 bool ReadDbStateHelper(const base::FilePath& filename,
 506                        StateInternal* db_state) {
 507   base::ScopedFILE file(base::OpenFile(filename, "rb"));
 508   if (file.get() == NULL)
 509     return false;
 510
 511   std::set<int32> add_chunks;
 512   std::set<int32> sub_chunks;
 513
 514   base::MD5Context context;
 515   FileHeader header;
 516   const int version =
 517       ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
 518                           file.get(), &context);
 519   if (version == kInvalidVersion)
 520     return false;
 521
 522   uint64 in_min = 0;
 523   uint64 in_stride = header.shard_stride;
 524   if (!in_stride)
 525     in_stride = kMaxShardStride;
 526   if (!IsPowerOfTwo(in_stride))
 527     return false;
 528
 529   do {
 530     ShardHeader shard_header;
 531     if (!ReadItem(&shard_header, file.get(), &context))
 532       return false;
 533
 534     if (!db_state->AppendData(shard_header.add_prefix_count,
 535                               shard_header.sub_prefix_count,
 536                               shard_header.add_hash_count,
 537                               shard_header.sub_hash_count,
 538                               file.get(), &context)) {
 539       return false;
 540     }
 541
 542     in_min += in_stride;
 543   } while (in_min <= kMaxSBPrefix);
 544
 545   if (!ReadAndVerifyChecksum(file.get(), &context))
 546     return false;
 547
 548   int64 size = 0;
 549   if (!base::GetFileSize(filename, &size))
 550     return false;
 551
 552   return static_cast<int64>(ftell(file.get())) == size;
 553 }
 554
 555 }  // namespace
 556
 557 SafeBrowsingStoreFile::SafeBrowsingStoreFile(
 558     const scoped_refptr<const base::SequencedTaskRunner>& task_runner)
 559     : task_runner_(task_runner),
 560       chunks_written_(0),
 561       empty_(false),
 562       corruption_seen_(false) {
 563 }
 564
 565 SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
 566   // Thread-checking is disabled in the destructor due to crbug.com/338486.
 567   task_runner_ = nullptr;
 568
 569   Close();
 570 }
 571
 572 bool SafeBrowsingStoreFile::CalledOnValidThread() {
 573   return !task_runner_ || task_runner_->RunsTasksOnCurrentThread();
 574 }
 575
 576 bool SafeBrowsingStoreFile::Delete() {
 577   DCHECK(CalledOnValidThread());
 578
 579   // The database should not be open at this point.  But, just in
 580   // case, close everything before deleting.
 581   if (!Close()) {
 582     NOTREACHED();
 583     return false;
 584   }
 585
 586   return DeleteStore(filename_);
 587 }
 588
 589 bool SafeBrowsingStoreFile::CheckValidity() {
 590   DCHECK(CalledOnValidThread());
 591
 592   // The file was either empty or never opened.  The empty case is
 593   // presumed not to be invalid.  The never-opened case can happen if
 594   // BeginUpdate() fails for any databases, and should already have
 595   // caused the corruption callback to fire.
 596   if (!file_.get())
 597     return true;
 598
 599   if (!FileRewind(file_.get()))
 600     return OnCorruptDatabase();
 601
 602   int64 size = 0;
 603   if (!base::GetFileSize(filename_, &size))
 604     return OnCorruptDatabase();
 605
 606   base::MD5Context context;
 607   base::MD5Init(&context);
 608
 609   // Read everything except the final digest.
 610   size_t bytes_left = static_cast<size_t>(size);
 611   CHECK(size == static_cast<int64>(bytes_left));
 612   if (bytes_left < sizeof(base::MD5Digest))
 613     return OnCorruptDatabase();
 614   bytes_left -= sizeof(base::MD5Digest);
 615
 616   // Fold the contents of the file into the checksum.
 617   while (bytes_left > 0) {
 618     char buf[4096];
 619     const size_t c = std::min(sizeof(buf), bytes_left);
 620     const size_t ret = fread(buf, 1, c, file_.get());
 621
 622     // The file's size changed while reading, give up.
 623     if (ret != c)
 624       return OnCorruptDatabase();
 625     base::MD5Update(&context, base::StringPiece(buf, c));
 626     bytes_left -= c;
 627   }
 628
 629   if (!ReadAndVerifyChecksum(file_.get(), &context)) {
 630     RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
 631     return OnCorruptDatabase();
 632   }
 633
 634   return true;
 635 }
 636
 637 void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
 638                                  const base::Closure& corruption_callback) {
 639   DCHECK(CalledOnValidThread());
 640   filename_ = filename;
 641   corruption_callback_ = corruption_callback;
 642 }
 643
 644 bool SafeBrowsingStoreFile::BeginChunk() {
 645   DCHECK(CalledOnValidThread());
 646   return ClearChunkBuffers();
 647 }
 648
 649 bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
 650   DCHECK(CalledOnValidThread());
 651   add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
 652   return true;
 653 }
 654
 655 bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
 656   DCHECK(CalledOnValidThread());
 657
 658   add_prefixes->clear();
 659   if (!base::PathExists(filename_))
 660     return true;
 661
 662   StateInternal db_state;
 663   if (!ReadDbStateHelper(filename_, &db_state))
 664     return OnCorruptDatabase();
 665
 666   add_prefixes->swap(db_state.add_prefixes_);
 667   return true;
 668 }
 669
 670 bool SafeBrowsingStoreFile::GetAddFullHashes(
 671     std::vector<SBAddFullHash>* add_full_hashes) {
 672   DCHECK(CalledOnValidThread());
 673
 674   add_full_hashes->clear();
 675   if (!base::PathExists(filename_))
 676     return true;
 677
 678   StateInternal db_state;
 679   if (!ReadDbStateHelper(filename_, &db_state))
 680     return OnCorruptDatabase();
 681
 682   add_full_hashes->swap(db_state.add_full_hashes_);
 683   return true;
 684 }
 685
 686 bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
 687                                          const SBFullHash& full_hash) {
 688   DCHECK(CalledOnValidThread());
 689   add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
 690   return true;
 691 }
 692
 693 bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
 694                                            int32 add_chunk_id,
 695                                            SBPrefix prefix) {
 696   DCHECK(CalledOnValidThread());
 697   sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
 698   return true;
 699 }
 700
 701 bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
 702                                          const SBFullHash& full_hash) {
 703   DCHECK(CalledOnValidThread());
 704   sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
 705   return true;
 706 }
 707
 708 bool SafeBrowsingStoreFile::OnCorruptDatabase() {
 709   DCHECK(CalledOnValidThread());
 710
 711   if (!corruption_seen_)
 712     RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
 713   corruption_seen_ = true;
 714
 715   corruption_callback_.Run();
 716
 717   // Return false as a convenience to callers.
 718   return false;
 719 }
 720
 721 bool SafeBrowsingStoreFile::Close() {
 722   DCHECK(CalledOnValidThread());
 723
 724   ClearUpdateBuffers();
 725
 726   // Make sure the files are closed.
 727   file_.reset();
 728   new_file_.reset();
 729   return true;
 730 }
 731
 732 bool SafeBrowsingStoreFile::BeginUpdate() {
 733   DCHECK(CalledOnValidThread());
 734   DCHECK(!file_.get() && !new_file_.get());
 735
 736   // Structures should all be clear unless something bad happened.
 737   DCHECK(add_chunks_cache_.empty());
 738   DCHECK(sub_chunks_cache_.empty());
 739   DCHECK(add_del_cache_.empty());
 740   DCHECK(sub_del_cache_.empty());
 741   DCHECK(add_prefixes_.empty());
 742   DCHECK(sub_prefixes_.empty());
 743   DCHECK(add_hashes_.empty());
 744   DCHECK(sub_hashes_.empty());
 745   DCHECK_EQ(chunks_written_, 0);
 746
 747   corruption_seen_ = false;
 748
 749   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
 750   base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
 751   if (new_file.get() == NULL)
 752     return false;
 753
 754   base::ScopedFILE file(base::OpenFile(filename_, "rb"));
 755   empty_ = (file.get() == NULL);
 756   if (empty_) {
 757     // If the file exists but cannot be opened, try to delete it (not
 758     // deleting directly, the bloom filter needs to be deleted, too).
 759     if (base::PathExists(filename_))
 760       return OnCorruptDatabase();
 761
 762     new_file_.swap(new_file);
 763     return true;
 764   }
 765
 766   base::MD5Context context;
 767   FileHeader header;
 768   const int version =
 769       ReadAndVerifyHeader(filename_, &header,
 770                           &add_chunks_cache_, &sub_chunks_cache_,
 771                           file.get(), &context);
 772   if (version == kInvalidVersion) {
 773     FileHeader retry_header;
 774     if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
 775       if (retry_header.magic == kFileMagic &&
 776           retry_header.version < kFileVersion) {
 777         RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
 778       } else {
 779         RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
 780       }
 781     }
 782
 783     // Close the file so that it can be deleted.
 784     file.reset();
 785
 786     return OnCorruptDatabase();
 787   }
 788
 789   file_.swap(file);
 790   new_file_.swap(new_file);
 791   return true;
 792 }
 793
 794 bool SafeBrowsingStoreFile::FinishChunk() {
 795   DCHECK(CalledOnValidThread());
 796
 797   if (!add_prefixes_.size() && !sub_prefixes_.size() &&
 798       !add_hashes_.size() && !sub_hashes_.size())
 799     return true;
 800
 801   ChunkHeader header;
 802   header.add_prefix_count = add_prefixes_.size();
 803   header.sub_prefix_count = sub_prefixes_.size();
 804   header.add_hash_count = add_hashes_.size();
 805   header.sub_hash_count = sub_hashes_.size();
 806   if (!WriteItem(header, new_file_.get(), NULL))
 807     return false;
 808
 809   if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
 810       !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
 811       !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
 812       !WriteContainer(sub_hashes_, new_file_.get(), NULL))
 813     return false;
 814
 815   ++chunks_written_;
 816
 817   // Clear everything to save memory.
 818   return ClearChunkBuffers();
 819 }
 820
 821 bool SafeBrowsingStoreFile::DoUpdate(
 822     safe_browsing::PrefixSetBuilder* builder,
 823     std::vector<SBAddFullHash>* add_full_hashes_result) {
 824   DCHECK(CalledOnValidThread());
 825   DCHECK(file_.get() || empty_);
 826   DCHECK(new_file_.get());
 827   CHECK(builder);
 828   CHECK(add_full_hashes_result);
 829
 830   // Rewind the temporary storage.
 831   if (!FileRewind(new_file_.get()))
 832     return false;
 833
 834   // Get chunk file's size for validating counts.
 835   int64 update_size = 0;
 836   if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
 837     return OnCorruptDatabase();
 838
 839   // Track update size to answer questions at http://crbug.com/72216 .
 840   // Log small updates as 1k so that the 0 (underflow) bucket can be
 841   // used for "empty" in SafeBrowsingDatabase.
 842   UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
 843                        std::max(static_cast<int>(update_size / 1024), 1));
 844
 845   // Chunk updates to integrate.
 846   StateInternal new_state;
 847
 848   // Read update chunks.
 849   for (int i = 0; i < chunks_written_; ++i) {
 850     ChunkHeader header;
 851
 852     int64 ofs = ftell(new_file_.get());
 853     if (ofs == -1)
 854       return false;
 855
 856     if (!ReadItem(&header, new_file_.get(), NULL))
 857       return false;
 858
 859     // As a safety measure, make sure that the header describes a sane
 860     // chunk, given the remaining file size.
 861     int64 expected_size = ofs + sizeof(ChunkHeader);
 862     expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
 863     expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
 864     expected_size += header.add_hash_count * sizeof(SBAddFullHash);
 865     expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
 866     if (expected_size > update_size)
 867       return false;
 868
 869     if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
 870                               header.add_hash_count, header.sub_hash_count,
 871                               new_file_.get(), NULL)) {
 872       return false;
 873     }
 874   }
 875
 876   // The state was accumulated by chunk, sort by prefix.
 877   new_state.SortData();
 878
 879   // These strides control how much data is loaded into memory per pass.
 880   // Strides must be an even power of two.  |in_stride| will be derived from the
 881   // input file.  |out_stride| will be derived from an estimate of the resulting
 882   // file's size.  |process_stride| will be the max of both.
 883   uint64 in_stride = kMaxShardStride;
 884   uint64 out_stride = kMaxShardStride;
 885   uint64 process_stride = 0;
 886
 887   // Used to verify the input's checksum if |!empty_|.
 888   base::MD5Context in_context;
 889
 890   if (!empty_) {
 891     DCHECK(file_.get());
 892
 893     FileHeader header = {0};
 894     int version = ReadAndVerifyHeader(filename_, &header,
 895                                       &add_chunks_cache_, &sub_chunks_cache_,
 896                                       file_.get(), &in_context);
 897     if (version == kInvalidVersion)
 898       return OnCorruptDatabase();
 899
 900     if (header.shard_stride)
 901       in_stride = header.shard_stride;
 902
 903     // The header checksum should have prevented this case, but the code will be
 904     // broken if this is not correct.
 905     if (!IsPowerOfTwo(in_stride))
 906       return OnCorruptDatabase();
 907   }
 908
 909   // We no longer need to track deleted chunks.
 910   DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
 911   DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);
 912
 913   // Calculate |out_stride| to break the file down into reasonable shards.
 914   {
 915     int64 original_size = 0;
 916     if (!empty_ && !base::GetFileSize(filename_, &original_size))
 917       return OnCorruptDatabase();
 918
 919     // Approximate the final size as everything.  Subs and deletes will reduce
 920     // the size, but modest over-sharding won't hurt much.
 921     int64 shard_size = original_size + update_size;
 922
 923     // Keep splitting until a single stride of data fits the target.
 924     size_t shifts = 0;
 925     while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
 926       out_stride >>= 1;
 927       shard_size >>= 1;
 928       ++shifts;
 929     }
 930     UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);
 931
 932     DCHECK(IsPowerOfTwo(out_stride));
 933   }
 934
 935   // Outer loop strides by the max of the input stride (to read integral shards)
 936   // and the output stride (to write integral shards).
 937   process_stride = std::max(in_stride, out_stride);
 938   DCHECK(IsPowerOfTwo(process_stride));
 939   DCHECK_EQ(0u, process_stride % in_stride);
 940   DCHECK_EQ(0u, process_stride % out_stride);
 941
 942   // Start writing the new data to |new_file_|.
 943   base::MD5Context out_context;
 944   if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
 945                    new_file_.get(), &out_context)) {
 946     return false;
 947   }
 948
 949   // Start at the beginning of the SBPrefix space.
 950   uint64 in_min = 0;
 951   uint64 out_min = 0;
 952   uint64 process_min = 0;
 953
 954   // Start at the beginning of the updates.
 955   StateInternalPos new_pos = new_state.StateBegin();
 956
 957   // Re-usable container for shard processing.
 958   StateInternal db_state;
 959
 960   // Track aggregate counts for histograms.
 961   size_t add_prefix_count = 0;
 962   size_t sub_prefix_count = 0;
 963
 964   do {
 965     // Maximum element in the current shard.
 966     SBPrefix process_max =
 967         static_cast<SBPrefix>(process_min + process_stride - 1);
 968     DCHECK_GT(process_max, process_min);
 969
 970     // Drop the data from previous pass.
 971     db_state.ClearData();
 972
 973     // Fill the processing shard with one or more input shards.
 974     if (!empty_) {
 975       do {
 976         ShardHeader shard_header;
 977         if (!ReadItem(&shard_header, file_.get(), &in_context))
 978           return OnCorruptDatabase();
 979
 980         if (!db_state.AppendData(shard_header.add_prefix_count,
 981                                  shard_header.sub_prefix_count,
 982                                  shard_header.add_hash_count,
 983                                  shard_header.sub_hash_count,
 984                                  file_.get(), &in_context))
 985           return OnCorruptDatabase();
 986
 987         in_min += in_stride;
 988       } while (in_min <= kMaxSBPrefix && in_min < process_max);
 989     }
 990
 991     // Shard the update data to match the database data, then merge the update
 992     // data and process the results.
 993     {
 994       StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
 995       db_state.MergeDataAndProcess(new_pos, new_end,
 996                                    add_del_cache_, sub_del_cache_);
 997       new_pos = new_end;
 998     }
 999
1000     // Collect the processed data for return to caller.
1001     for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
1002       builder->AddPrefix(db_state.add_prefixes_[i].prefix);
1003     }
1004     add_full_hashes_result->insert(add_full_hashes_result->end(),
1005                                    db_state.add_full_hashes_.begin(),
1006                                    db_state.add_full_hashes_.end());
1007     add_prefix_count += db_state.add_prefixes_.size();
1008     sub_prefix_count += db_state.sub_prefixes_.size();
1009
1010     // Write one or more shards of processed output.
1011     StateInternalPos out_pos = db_state.StateBegin();
1012     do {
1013       SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
1014       DCHECK_GT(out_max, out_min);
1015
1016       StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
1017       if (!db_state.WriteShard(out_pos, out_end, new_file_.get(), &out_context))
1018         return false;
1019       out_pos = out_end;
1020
1021       out_min += out_stride;
1022     } while (out_min == static_cast<SBPrefix>(out_min) &&
1023              out_min < process_max);
1024
1025     process_min += process_stride;
1026   } while (process_min <= kMaxSBPrefix);
1027
1028   // Verify the overall checksum.
1029   if (!empty_) {
1030     if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
1031       RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
1032       return OnCorruptDatabase();
1033     }
1034
1035     // TODO(shess): Verify EOF?
1036
1037     // Close the input file so the new file can be renamed over it.
1038     file_.reset();
1039   }
1040   DCHECK(!file_.get());
1041
1042   // Write the overall checksum.
1043   base::MD5Digest out_digest;
1044   base::MD5Final(&out_digest, &out_context);
1045   if (!WriteItem(out_digest, new_file_.get(), NULL))
1046     return false;
1047
1048   // Trim any excess left over from the temporary chunk data.
1049   if (!base::TruncateFile(new_file_.get()))
1050     return false;
1051
1052   // Close the file handle and swizzle the file into place.
1053   new_file_.reset();
1054   if (!base::DeleteFile(filename_, false) &&
1055       base::PathExists(filename_))
1056     return false;
1057
1058   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1059   if (!base::Move(new_filename, filename_))
1060     return false;
1061
1062   // Record counts before swapping to caller.
1063   UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
1064   UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);
1065
1066   return true;
1067 }
1068
1069 bool SafeBrowsingStoreFile::FinishUpdate(
1070     safe_browsing::PrefixSetBuilder* builder,
1071     std::vector<SBAddFullHash>* add_full_hashes_result) {
1072   DCHECK(CalledOnValidThread());
1073   DCHECK(builder);
1074   DCHECK(add_full_hashes_result);
1075
1076   if (!DoUpdate(builder, add_full_hashes_result)) {
1077     CancelUpdate();
1078     return false;
1079   }
1080
1081   DCHECK(!new_file_.get());
1082   DCHECK(!file_.get());
1083
1084   return Close();
1085 }
1086
1087 bool SafeBrowsingStoreFile::CancelUpdate() {
1088   DCHECK(CalledOnValidThread());
1089   bool ret = Close();
1090
1091   // Delete stale staging file.
1092   const base::FilePath new_filename = TemporaryFileForFilename(filename_);
1093   base::DeleteFile(new_filename, false);
1094
1095   return ret;
1096 }
1097
1098 void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
1099   DCHECK(CalledOnValidThread());
1100   add_chunks_cache_.insert(chunk_id);
1101 }
1102
1103 bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
1104   DCHECK(CalledOnValidThread());
1105   return add_chunks_cache_.count(chunk_id) > 0;
1106 }
1107
1108 void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
1109   DCHECK(CalledOnValidThread());
1110   out->clear();
1111   out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
1112 }
1113
1114 void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
1115   DCHECK(CalledOnValidThread());
1116   sub_chunks_cache_.insert(chunk_id);
1117 }
1118
1119 bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
1120   DCHECK(CalledOnValidThread());
1121   return sub_chunks_cache_.count(chunk_id) > 0;
1122 }
1123
1124 void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
1125   DCHECK(CalledOnValidThread());
1126   out->clear();
1127   out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
1128 }
1129
1130 void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
1131   DCHECK(CalledOnValidThread());
1132   add_del_cache_.insert(chunk_id);
1133 }
1134
1135 void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
1136   DCHECK(CalledOnValidThread());
1137   sub_del_cache_.insert(chunk_id);
1138 }
1139
1140 // static
1141 bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
1142   if (!base::DeleteFile(basename, false) &&
1143       base::PathExists(basename)) {
1144     NOTREACHED();
1145     return false;
1146   }
1147
1148   const base::FilePath new_filename = TemporaryFileForFilename(basename);
1149   if (!base::DeleteFile(new_filename, false) &&
1150       base::PathExists(new_filename)) {
1151     NOTREACHED();
1152     return false;
1153   }
1154
1155   // With SQLite support gone, one way to get to this code is if the
1156   // existing file is a SQLite file.  Make sure the journal file is
1157   // also removed.
1158   const base::FilePath journal_filename(
1159       basename.value() + FILE_PATH_LITERAL("-journal"));
1160   if (base::PathExists(journal_filename))
1161     base::DeleteFile(journal_filename, false);
1162
1163   return true;
1164 }