// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/safe_browsing_store_file.h"

#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"

// NOTE(shess): kFileMagic should not be a byte-wise palindrome, so
// that byte-order changes force corruption.
const int32 kFileMagic = 0x600D71FE;

// Version 6: aad08754/r2814 by erikkay@google.com on 2008-10-02 (sqlite)
// Version 7: 6afe28a5/r37435 by shess@chromium.org on 2010-01-28
// Version 8: d3dd0715/r259791 by shess@chromium.org on 2014-03-27
const int32 kFileVersion = 8;

// ReadAndVerifyHeader() returns this in case of error.
const int32 kInvalidVersion = -1;

// Starting with version 8, the storage is sorted and can be sharded to allow
// updates to be done with lower memory requirements. Newly written files will
// be sharded to need less than this amount of memory during update. Larger
// values are preferred to minimize looping overhead during processing.
const int64 kUpdateStorageBytes = 100 * 1024;

// Prevent excessive sharding by setting a lower limit on the shard stride.
// Smaller values should work fine, but very small values will probably lead to
// poor performance. Shard stride is indirectly related to
// |kUpdateStorageBytes|; setting that very small will bump against this.
const uint32 kMinShardStride = 1 << 24;

// Strides over the entire SBPrefix space.
const uint64 kMaxShardStride = 1ULL << 32;

// Maximum SBPrefix value.
const SBPrefix kMaxSBPrefix = 0xFFFFFFFF;
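
// For scale: the SBPrefix space spans 2^32 values, so a store written at the
// smallest allowed stride of kMinShardStride (1 << 24) contains
// 2^32 / 2^24 = 256 shards, while kMaxShardStride covers the entire space in
// a single shard.
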
// Header at the front of the main database file.
struct FileHeader {
  int32 magic, version;
  uint32 add_chunk_count, sub_chunk_count;
  uint32 shard_stride;
  // TODO(shess): Is this where 64-bit will bite me? Perhaps write a
  // specialized read/write?
};

// Header for each chunk in the chunk-accumulation file.
struct ChunkHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Header for each shard of data in the main database file.
struct ShardHeader {
  uint32 add_prefix_count, sub_prefix_count;
  uint32 add_hash_count, sub_hash_count;
};

// Enumerate different format-change events for histogramming
// purposes. DO NOT CHANGE THE ORDERING OF THESE VALUES.
enum FormatEventType {
  // Corruption detected, broken down by file format.
  FORMAT_EVENT_FILE_CORRUPT,
  FORMAT_EVENT_SQLITE_CORRUPT,  // Obsolete

  // The type of format found in the file. The expected case (new
  // file format) is intentionally not covered.
  FORMAT_EVENT_FOUND_SQLITE,  // Obsolete
  FORMAT_EVENT_FOUND_UNKNOWN,  // magic does not match.

  // The number of SQLite-format files deleted should be the same as
  // FORMAT_EVENT_FOUND_SQLITE. It can differ if the delete fails,
  // or if a failure prevents the update from succeeding.
  FORMAT_EVENT_SQLITE_DELETED,  // Obsolete
  FORMAT_EVENT_SQLITE_DELETE_FAILED,  // Obsolete

  // Found and deleted (or failed to delete) the ancient "Safe
  // Browsing" file.
  FORMAT_EVENT_DELETED_ORIGINAL,  // Obsolete
  FORMAT_EVENT_DELETED_ORIGINAL_FAILED,  // Obsolete

  // The checksum did not check out in CheckValidity() or in
  // FinishUpdate(). This most likely indicates that the machine
  // crashed before the file was fully sync'ed to disk.
  FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE,
  FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE,

  // The header checksum was incorrect in ReadAndVerifyHeader(). Likely
  // indicates that the system crashed while writing an update.
  FORMAT_EVENT_HEADER_CHECKSUM_FAILURE,

  FORMAT_EVENT_FOUND_DEPRECATED,  // version too old.

  // Memory space for histograms is determined by the max. ALWAYS
  // ADD NEW VALUES BEFORE THIS ONE.
  FORMAT_EVENT_MAX
};

void RecordFormatEvent(FormatEventType event_type) {
  UMA_HISTOGRAM_ENUMERATION("SB2.FormatEvent", event_type, FORMAT_EVENT_MAX);
}

// Rewind the file. Using fseek(2) because rewind(3) errors are
// weird.
bool FileRewind(FILE* fp) {
  int rv = fseek(fp, 0, SEEK_SET);
  return rv == 0;
}

// Read from |fp| into |item|, and fold the input data into the
// checksum in |context|, if non-NULL. Return true on success.
template <typename T>
bool ReadItem(T* item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fread(item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<char*>(item),
                                      sizeof(T)));
  }
  return true;
}

// Write |item| to |fp|, and fold the output data into the checksum in
// |context|, if non-NULL. Return true on success.
template <typename T>
bool WriteItem(const T& item, FILE* fp, base::MD5Context* context) {
  const size_t ret = fwrite(&item, sizeof(T), 1, fp);
  if (ret != 1)
    return false;

  if (context) {
    base::MD5Update(context,
                    base::StringPiece(reinterpret_cast<const char*>(&item),
                                      sizeof(T)));
  }
  return true;
}

// Read |count| items into |values| from |fp|, and fold them into the
// checksum in |context|. Returns true on success.
template <typename CT>
bool ReadToContainer(CT* values, size_t count, FILE* fp,
                     base::MD5Context* context) {
  for (size_t i = 0; i < count; ++i) {
    typename CT::value_type value;
    if (!ReadItem(&value, fp, context))
      return false;

    // push_back() is more obvious, but coded this way std::set can
    // also be read.
    values->insert(values->end(), value);
  }
  return true;
}

// Write values between |beg| and |end| to |fp|, and fold the data into the
// checksum in |context|, if non-NULL. Returns true if all items successful.
template <typename CTI>
bool WriteRange(const CTI& beg, const CTI& end,
                FILE* fp, base::MD5Context* context) {
  for (CTI iter = beg; iter != end; ++iter) {
    if (!WriteItem(*iter, fp, context))
      return false;
  }
  return true;
}

// Write all of |values| to |fp|, and fold the data into the checksum
// in |context|, if non-NULL. Returns true if all items successful.
template <typename CT>
bool WriteContainer(const CT& values, FILE* fp,
                    base::MD5Context* context) {
  return WriteRange(values.begin(), values.end(), fp, context);
}

// Delete the chunks in |deleted| from |chunks|.
void DeleteChunksFromSet(const base::hash_set<int32>& deleted,
                         std::set<int32>* chunks) {
  for (std::set<int32>::iterator iter = chunks->begin();
       iter != chunks->end();) {
    std::set<int32>::iterator prev = iter++;
    if (deleted.count(*prev) > 0)
      chunks->erase(prev);
  }
}

bool ReadAndVerifyChecksum(FILE* fp, base::MD5Context* context) {
  base::MD5Digest calculated_digest;
  base::MD5IntermediateFinal(&calculated_digest, context);

  base::MD5Digest file_digest;
  if (!ReadItem(&file_digest, fp, context))
    return false;

  return memcmp(&file_digest, &calculated_digest, sizeof(file_digest)) == 0;
}
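
// Note that ReadItem() above is passed |context|, so the stored digest bytes
// are themselves folded into the running checksum. A later call to
// ReadAndVerifyChecksum() on the same context therefore covers both the
// earlier digest and the data read after it, which is how the header digest
// and the whole-file digest share one MD5 stream.
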
// Helper function to read the file header and chunk TOC. Rewinds |fp| and
// initializes |context|. The header is left in |header|, with the version
// returned. kInvalidVersion is returned for sanity check or checksum failure.
int ReadAndVerifyHeader(const base::FilePath& filename,
                        FileHeader* header,
                        std::set<int32>* add_chunks,
                        std::set<int32>* sub_chunks,
                        FILE* fp,
                        base::MD5Context* context) {
  base::MD5Init(context);
  if (!FileRewind(fp))
    return kInvalidVersion;
  if (!ReadItem(header, fp, context))
    return kInvalidVersion;
  if (header->magic != kFileMagic)
    return kInvalidVersion;

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.StoreVersionRead", header->version);

  if (header->version != kFileVersion)
    return kInvalidVersion;

  if (!ReadToContainer(add_chunks, header->add_chunk_count, fp, context) ||
      !ReadToContainer(sub_chunks, header->sub_chunk_count, fp, context)) {
    return kInvalidVersion;
  }

  // Verify that the data read thus far is valid.
  if (!ReadAndVerifyChecksum(fp, context)) {
    RecordFormatEvent(FORMAT_EVENT_HEADER_CHECKSUM_FAILURE);
    return kInvalidVersion;
  }

  return kFileVersion;
}

// Helper function to write out the initial header and chunks-contained data.
// Rewinds |fp|, initializes |context|, then writes a file header and
// |add_chunks| and |sub_chunks|.
bool WriteHeader(uint32 out_stride,
                 const std::set<int32>& add_chunks,
                 const std::set<int32>& sub_chunks,
                 FILE* fp,
                 base::MD5Context* context) {
  if (!FileRewind(fp))
    return false;

  base::MD5Init(context);
  FileHeader header;
  header.magic = kFileMagic;
  header.version = kFileVersion;
  header.add_chunk_count = add_chunks.size();
  header.sub_chunk_count = sub_chunks.size();
  header.shard_stride = out_stride;
  if (!WriteItem(header, fp, context))
    return false;

  if (!WriteContainer(add_chunks, fp, context) ||
      !WriteContainer(sub_chunks, fp, context))
    return false;

  // Write out the header digest.
  base::MD5Digest header_digest;
  base::MD5IntermediateFinal(&header_digest, context);
  if (!WriteItem(header_digest, fp, context))
    return false;

  return true;
}

// Return |true| if the range is sorted by the given comparator.
template <typename CTI, typename LESS>
bool sorted(CTI beg, CTI end, LESS less) {
  while ((end - beg) > 2) {
    CTI n = beg++;
    DCHECK(!less(*beg, *n));
  }
  return true;
}

// Merge |beg|..|end| into |container|. Both should be sorted by the given
// comparator, and the range iterators should not be derived from |container|.
// Differs from std::inplace_merge() in that additional memory is not required
// for linear performance.
template <typename CT, typename CTI, typename COMP>
void container_merge(CT* container, CTI beg, CTI end, const COMP& less) {
  DCHECK(sorted(container->begin(), container->end(), less));
  DCHECK(sorted(beg, end, less));

  // Size the container to fit the results.
  const size_t c_size = container->size();
  container->resize(c_size + (end - beg));

  // |c_end| points to the original endpoint, while |c_out| points to the
  // endpoint that will scan from end to beginning while merging.
  typename CT::iterator c_end = container->begin() + c_size;
  typename CT::iterator c_out = container->end();

  // While both inputs have data, move the greater to |c_out|.
  while (c_end != container->begin() && end != beg) {
    if (less(*(c_end - 1), *(end - 1))) {
      *(--c_out) = *(--end);
    } else {
      *(--c_out) = *(--c_end);
    }
  }

  // Copy any data remaining in the new range.
  if (end != beg) {
    // The original container data has been fully shifted.
    DCHECK(c_end == container->begin());

    // There is exactly the correct amount of space left.
    DCHECK_EQ(c_out - c_end, end - beg);

    std::copy(beg, end, container->begin());
  }

  DCHECK(sorted(container->begin(), container->end(), less));
}
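
// Worked example: merging the range {2, 4} into a container holding {1, 3, 5}
// first resizes the container to five elements, then fills from the back:
// 5, 4, 3, 2 are placed right-to-left, and the remaining {1} is already in
// place, yielding {1, 2, 3, 4, 5} without a temporary buffer.
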
// Collection of iterators used while stepping through StateInternal (see
// below).
class StateInternalPos {
 public:
  StateInternalPos(SBAddPrefixes::iterator add_prefixes_iter,
                   SBSubPrefixes::iterator sub_prefixes_iter,
                   std::vector<SBAddFullHash>::iterator add_hashes_iter,
                   std::vector<SBSubFullHash>::iterator sub_hashes_iter)
      : add_prefixes_iter_(add_prefixes_iter),
        sub_prefixes_iter_(sub_prefixes_iter),
        add_hashes_iter_(add_hashes_iter),
        sub_hashes_iter_(sub_hashes_iter) {
  }

  SBAddPrefixes::iterator add_prefixes_iter_;
  SBSubPrefixes::iterator sub_prefixes_iter_;
  std::vector<SBAddFullHash>::iterator add_hashes_iter_;
  std::vector<SBSubFullHash>::iterator sub_hashes_iter_;
};

// Helper to find the next shard boundary.
template <typename T>
bool prefix_bounder(SBPrefix val, const T& elt) {
  return val < elt.GetAddPrefix();
}

// Container for partial database state. Includes add/sub prefixes/hashes, plus
// aggregate operations on same.
class StateInternal {
 public:
  // Append indicated amount of data from |fp|.
  bool AppendData(size_t add_prefix_count, size_t sub_prefix_count,
                  size_t add_hash_count, size_t sub_hash_count,
                  FILE* fp, base::MD5Context* context) {
    return
        ReadToContainer(&add_prefixes_, add_prefix_count, fp, context) &&
        ReadToContainer(&sub_prefixes_, sub_prefix_count, fp, context) &&
        ReadToContainer(&add_full_hashes_, add_hash_count, fp, context) &&
        ReadToContainer(&sub_full_hashes_, sub_hash_count, fp, context);
  }

  void ClearData() {
    add_prefixes_.clear();
    sub_prefixes_.clear();
    add_full_hashes_.clear();
    sub_full_hashes_.clear();
  }

  // Merge data from |beg|..|end| into receiver's state, then process the
  // state. The current state and the range given should correspond to the
  // same sorted shard of data from different sources. |add_del_cache| and
  // |sub_del_cache| indicate the chunk ids which should be deleted during
  // processing (see SBProcessSubs()).
  void MergeDataAndProcess(const StateInternalPos& beg,
                           const StateInternalPos& end,
                           const base::hash_set<int32>& add_del_cache,
                           const base::hash_set<int32>& sub_del_cache) {
    container_merge(&add_prefixes_,
                    beg.add_prefixes_iter_,
                    end.add_prefixes_iter_,
                    SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);

    container_merge(&sub_prefixes_,
                    beg.sub_prefixes_iter_,
                    end.sub_prefixes_iter_,
                    SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);

    container_merge(&add_full_hashes_,
                    beg.add_hashes_iter_,
                    end.add_hashes_iter_,
                    SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);

    container_merge(&sub_full_hashes_,
                    beg.sub_hashes_iter_,
                    end.sub_hashes_iter_,
                    SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);

    SBProcessSubs(&add_prefixes_, &sub_prefixes_,
                  &add_full_hashes_, &sub_full_hashes_,
                  add_del_cache, sub_del_cache);
  }

  // Sort the data appropriately for the sharding, merging, and processing
  // operations.
  void SortData() {
    std::sort(add_prefixes_.begin(), add_prefixes_.end(),
              SBAddPrefixLess<SBAddPrefix, SBAddPrefix>);
    std::sort(sub_prefixes_.begin(), sub_prefixes_.end(),
              SBAddPrefixLess<SBSubPrefix, SBSubPrefix>);
    std::sort(add_full_hashes_.begin(), add_full_hashes_.end(),
              SBAddPrefixHashLess<SBAddFullHash, SBAddFullHash>);
    std::sort(sub_full_hashes_.begin(), sub_full_hashes_.end(),
              SBAddPrefixHashLess<SBSubFullHash, SBSubFullHash>);
  }

  // Iterator from the beginning of the state's data.
  StateInternalPos StateBegin() {
    return StateInternalPos(add_prefixes_.begin(),
                            sub_prefixes_.begin(),
                            add_full_hashes_.begin(),
                            sub_full_hashes_.begin());
  }

  // An iterator pointing just after the last possible element of the shard
  // indicated by |shard_max|. Used to step through the state by shard.
  // TODO(shess): Verify whether binary search really improves over linear.
  // Merging or writing will immediately touch all of these elements.
  StateInternalPos ShardEnd(const StateInternalPos& beg, SBPrefix shard_max) {
    return StateInternalPos(
        std::upper_bound(beg.add_prefixes_iter_, add_prefixes_.end(),
                         shard_max, prefix_bounder<SBAddPrefix>),
        std::upper_bound(beg.sub_prefixes_iter_, sub_prefixes_.end(),
                         shard_max, prefix_bounder<SBSubPrefix>),
        std::upper_bound(beg.add_hashes_iter_, add_full_hashes_.end(),
                         shard_max, prefix_bounder<SBAddFullHash>),
        std::upper_bound(beg.sub_hashes_iter_, sub_full_hashes_.end(),
                         shard_max, prefix_bounder<SBSubFullHash>));
  }
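
  // std::upper_bound() with prefix_bounder() returns the first element whose
  // GetAddPrefix() is strictly greater than |shard_max|, so elements equal to
  // the shard maximum stay in the current shard and each element is visited
  // by exactly one shard.
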
  // Write a shard header and data for the shard starting at |beg| and ending
  // at the element before |end|.
  bool WriteShard(const StateInternalPos& beg, const StateInternalPos& end,
                  FILE* fp, base::MD5Context* context) {
    ShardHeader shard_header;
    shard_header.add_prefix_count =
        end.add_prefixes_iter_ - beg.add_prefixes_iter_;
    shard_header.sub_prefix_count =
        end.sub_prefixes_iter_ - beg.sub_prefixes_iter_;
    shard_header.add_hash_count =
        end.add_hashes_iter_ - beg.add_hashes_iter_;
    shard_header.sub_hash_count =
        end.sub_hashes_iter_ - beg.sub_hashes_iter_;

    return
        WriteItem(shard_header, fp, context) &&
        WriteRange(beg.add_prefixes_iter_, end.add_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_prefixes_iter_, end.sub_prefixes_iter_,
                   fp, context) &&
        WriteRange(beg.add_hashes_iter_, end.add_hashes_iter_,
                   fp, context) &&
        WriteRange(beg.sub_hashes_iter_, end.sub_hashes_iter_,
                   fp, context);
  }

  SBAddPrefixes add_prefixes_;
  SBSubPrefixes sub_prefixes_;
  std::vector<SBAddFullHash> add_full_hashes_;
  std::vector<SBSubFullHash> sub_full_hashes_;
};

// True if |val| is a power of two.
template <typename T>
bool IsPowerOfTwo(const T& val) {
  return val && (val & (val - 1)) == 0;
}
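
// For example, IsPowerOfTwo(1 << 24) and IsPowerOfTwo(1) are true, while
// IsPowerOfTwo(0) and IsPowerOfTwo(3 << 20) are false.
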
// Helper to read the entire database state, used by GetAddPrefixes() and
// GetAddFullHashes(). Those functions are generally used only for smaller
// files. Returns false in case of errors reading the data.
bool ReadDbStateHelper(const base::FilePath& filename,
                       StateInternal* db_state) {
  base::ScopedFILE file(base::OpenFile(filename, "rb"));
  if (file.get() == NULL)
    return false;

  std::set<int32> add_chunks;
  std::set<int32> sub_chunks;

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename, &header, &add_chunks, &sub_chunks,
                          file.get(), &context);
  if (version == kInvalidVersion)
    return false;

  uint64 in_stride = header.shard_stride;
  if (!in_stride)
    in_stride = kMaxShardStride;
  if (!IsPowerOfTwo(in_stride))
    return false;

  // Read the data shard by shard until the whole prefix space is covered.
  uint64 in_min = 0;
  do {
    ShardHeader shard_header;
    if (!ReadItem(&shard_header, file.get(), &context))
      return false;

    if (!db_state->AppendData(shard_header.add_prefix_count,
                              shard_header.sub_prefix_count,
                              shard_header.add_hash_count,
                              shard_header.sub_hash_count,
                              file.get(), &context)) {
      return false;
    }

    in_min += in_stride;
  } while (in_min <= kMaxSBPrefix);

  if (!ReadAndVerifyChecksum(file.get(), &context))
    return false;

  // The entire file should have been consumed.
  int64 size = 0;
  if (!base::GetFileSize(filename, &size))
    return false;

  return static_cast<int64>(ftell(file.get())) == size;
}
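
// Reading back mirrors the writing order used by WriteHeader() and DoUpdate():
// a FileHeader, the add- and sub-chunk id sets, an MD5 digest of everything so
// far, then one ShardHeader plus its prefix and hash arrays per shard, and a
// final MD5 digest covering the whole file.
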
SafeBrowsingStoreFile::SafeBrowsingStoreFile()
    : chunks_written_(0), empty_(false), corruption_seen_(false) {}

SafeBrowsingStoreFile::~SafeBrowsingStoreFile() {
  // Thread-checking is disabled in the destructor due to crbug.com/338486.
}

bool SafeBrowsingStoreFile::Delete() {
  DCHECK(CalledOnValidThread());

  // The database should not be open at this point. But, just in
  // case, close everything before deleting.
  if (!Close())
    return false;

  return DeleteStore(filename_);
}

bool SafeBrowsingStoreFile::CheckValidity() {
  DCHECK(CalledOnValidThread());

  // The file was either empty or never opened. The empty case is
  // presumed not to be invalid. The never-opened case can happen if
  // BeginUpdate() fails for any databases, and should already have
  // caused the corruption callback to fire.
  if (!file_.get())
    return true;

  if (!FileRewind(file_.get()))
    return OnCorruptDatabase();

  int64 size = 0;
  if (!base::GetFileSize(filename_, &size))
    return OnCorruptDatabase();

  base::MD5Context context;
  base::MD5Init(&context);

  // Read everything except the final digest.
  size_t bytes_left = static_cast<size_t>(size);
  CHECK(size == static_cast<int64>(bytes_left));
  if (bytes_left < sizeof(base::MD5Digest))
    return OnCorruptDatabase();
  bytes_left -= sizeof(base::MD5Digest);

  // Fold the contents of the file into the checksum.
  while (bytes_left > 0) {
    // Read in reasonably-sized pieces.
    char buf[4096];
    const size_t c = std::min(sizeof(buf), bytes_left);
    const size_t ret = fread(buf, 1, c, file_.get());

    // The file's size changed while reading, give up.
    if (ret != c)
      return OnCorruptDatabase();
    base::MD5Update(&context, base::StringPiece(buf, c));
    bytes_left -= c;
  }

  if (!ReadAndVerifyChecksum(file_.get(), &context)) {
    RecordFormatEvent(FORMAT_EVENT_VALIDITY_CHECKSUM_FAILURE);
    return OnCorruptDatabase();
  }

  return true;
}

void SafeBrowsingStoreFile::Init(const base::FilePath& filename,
                                 const base::Closure& corruption_callback) {
  DCHECK(CalledOnValidThread());
  filename_ = filename;
  corruption_callback_ = corruption_callback;
}

bool SafeBrowsingStoreFile::BeginChunk() {
  DCHECK(CalledOnValidThread());
  return ClearChunkBuffers();
}

bool SafeBrowsingStoreFile::WriteAddPrefix(int32 chunk_id, SBPrefix prefix) {
  DCHECK(CalledOnValidThread());
  add_prefixes_.push_back(SBAddPrefix(chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::GetAddPrefixes(SBAddPrefixes* add_prefixes) {
  DCHECK(CalledOnValidThread());

  add_prefixes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_prefixes->swap(db_state.add_prefixes_);
  return true;
}

bool SafeBrowsingStoreFile::GetAddFullHashes(
    std::vector<SBAddFullHash>* add_full_hashes) {
  DCHECK(CalledOnValidThread());

  add_full_hashes->clear();
  if (!base::PathExists(filename_))
    return true;

  StateInternal db_state;
  if (!ReadDbStateHelper(filename_, &db_state))
    return OnCorruptDatabase();

  add_full_hashes->swap(db_state.add_full_hashes_);
  return true;
}

bool SafeBrowsingStoreFile::WriteAddHash(int32 chunk_id,
                                         const SBFullHash& full_hash) {
  DCHECK(CalledOnValidThread());
  add_hashes_.push_back(SBAddFullHash(chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubPrefix(int32 chunk_id,
                                           int32 add_chunk_id,
                                           SBPrefix prefix) {
  DCHECK(CalledOnValidThread());
  sub_prefixes_.push_back(SBSubPrefix(chunk_id, add_chunk_id, prefix));
  return true;
}

bool SafeBrowsingStoreFile::WriteSubHash(int32 chunk_id, int32 add_chunk_id,
                                         const SBFullHash& full_hash) {
  DCHECK(CalledOnValidThread());
  sub_hashes_.push_back(SBSubFullHash(chunk_id, add_chunk_id, full_hash));
  return true;
}

bool SafeBrowsingStoreFile::OnCorruptDatabase() {
  DCHECK(CalledOnValidThread());

  if (!corruption_seen_)
    RecordFormatEvent(FORMAT_EVENT_FILE_CORRUPT);
  corruption_seen_ = true;

  corruption_callback_.Run();

  // Return false as a convenience to callers.
  return false;
}

bool SafeBrowsingStoreFile::Close() {
  DCHECK(CalledOnValidThread());

  ClearUpdateBuffers();

  // Make sure the files are closed.
  file_.reset();
  new_file_.reset();
  return true;
}

bool SafeBrowsingStoreFile::BeginUpdate() {
  DCHECK(CalledOnValidThread());
  DCHECK(!file_.get() && !new_file_.get());

  // Structures should all be clear unless something bad happened.
  DCHECK(add_chunks_cache_.empty());
  DCHECK(sub_chunks_cache_.empty());
  DCHECK(add_del_cache_.empty());
  DCHECK(sub_del_cache_.empty());
  DCHECK(add_prefixes_.empty());
  DCHECK(sub_prefixes_.empty());
  DCHECK(add_hashes_.empty());
  DCHECK(sub_hashes_.empty());
  DCHECK_EQ(chunks_written_, 0);

  corruption_seen_ = false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::ScopedFILE new_file(base::OpenFile(new_filename, "wb+"));
  if (new_file.get() == NULL)
    return false;

  base::ScopedFILE file(base::OpenFile(filename_, "rb"));
  empty_ = (file.get() == NULL);
  if (empty_) {
    // If the file exists but cannot be opened, try to delete it (not
    // deleting directly, the bloom filter needs to be deleted, too).
    if (base::PathExists(filename_))
      return OnCorruptDatabase();

    new_file_.swap(new_file);
    return true;
  }

  base::MD5Context context;
  FileHeader header;
  const int version =
      ReadAndVerifyHeader(filename_, &header,
                          &add_chunks_cache_, &sub_chunks_cache_,
                          file.get(), &context);
  if (version == kInvalidVersion) {
    FileHeader retry_header;
    if (FileRewind(file.get()) && ReadItem(&retry_header, file.get(), NULL)) {
      if (retry_header.magic == kFileMagic &&
          retry_header.version < kFileVersion) {
        RecordFormatEvent(FORMAT_EVENT_FOUND_DEPRECATED);
      } else {
        RecordFormatEvent(FORMAT_EVENT_FOUND_UNKNOWN);
      }
    }

    // Close the file so that it can be deleted.
    file.reset();

    return OnCorruptDatabase();
  }

  file_.swap(file);
  new_file_.swap(new_file);
  return true;
}

bool SafeBrowsingStoreFile::FinishChunk() {
  DCHECK(CalledOnValidThread());

  if (!add_prefixes_.size() && !sub_prefixes_.size() &&
      !add_hashes_.size() && !sub_hashes_.size())
    return true;

  ChunkHeader header;
  header.add_prefix_count = add_prefixes_.size();
  header.sub_prefix_count = sub_prefixes_.size();
  header.add_hash_count = add_hashes_.size();
  header.sub_hash_count = sub_hashes_.size();
  if (!WriteItem(header, new_file_.get(), NULL))
    return false;

  if (!WriteContainer(add_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_prefixes_, new_file_.get(), NULL) ||
      !WriteContainer(add_hashes_, new_file_.get(), NULL) ||
      !WriteContainer(sub_hashes_, new_file_.get(), NULL))
    return false;

  ++chunks_written_;

  // Clear everything to save memory.
  return ClearChunkBuffers();
}
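
// Each successful FinishChunk() call appends one ChunkHeader and its four
// arrays to |new_file_| and bumps |chunks_written_|. DoUpdate() later rewinds
// the temporary file and replays exactly that many chunks before sorting and
// merging them with the existing store.
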
bool SafeBrowsingStoreFile::DoUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(CalledOnValidThread());
  DCHECK(file_.get() || empty_);
  DCHECK(new_file_.get());
  CHECK(builder);
  CHECK(add_full_hashes_result);

  // Rewind the temporary storage.
  if (!FileRewind(new_file_.get()))
    return false;

  // Get chunk file's size for validating counts.
  int64 update_size = 0;
  if (!base::GetFileSize(TemporaryFileForFilename(filename_), &update_size))
    return OnCorruptDatabase();

  // Track update size to answer questions at http://crbug.com/72216 .
  // Log small updates as 1k so that the 0 (underflow) bucket can be
  // used for "empty" in SafeBrowsingDatabase.
  UMA_HISTOGRAM_COUNTS("SB2.DatabaseUpdateKilobytes",
                       std::max(static_cast<int>(update_size / 1024), 1));

  // Chunk updates to integrate.
  StateInternal new_state;

  // Read update chunks.
  for (int i = 0; i < chunks_written_; ++i) {
    ChunkHeader header;

    int64 ofs = ftell(new_file_.get());
    if (ofs == -1)
      return false;

    if (!ReadItem(&header, new_file_.get(), NULL))
      return false;

    // As a safety measure, make sure that the header describes a sane
    // chunk, given the remaining file size.
    int64 expected_size = ofs + sizeof(ChunkHeader);
    expected_size += header.add_prefix_count * sizeof(SBAddPrefix);
    expected_size += header.sub_prefix_count * sizeof(SBSubPrefix);
    expected_size += header.add_hash_count * sizeof(SBAddFullHash);
    expected_size += header.sub_hash_count * sizeof(SBSubFullHash);
    if (expected_size > update_size)
      return false;

    if (!new_state.AppendData(header.add_prefix_count, header.sub_prefix_count,
                              header.add_hash_count, header.sub_hash_count,
                              new_file_.get(), NULL)) {
      return false;
    }
  }

  // The state was accumulated by chunk, sort by prefix.
  new_state.SortData();

  // These strides control how much data is loaded into memory per pass.
  // Strides must be an even power of two. |in_stride| will be derived from the
  // input file. |out_stride| will be derived from an estimate of the resulting
  // file's size. |process_stride| will be the max of both.
  uint64 in_stride = kMaxShardStride;
  uint64 out_stride = kMaxShardStride;
  uint64 process_stride = 0;

  // Used to verify the input's checksum if |!empty_|.
  base::MD5Context in_context;

  if (!empty_) {
    FileHeader header = {0};
    int version = ReadAndVerifyHeader(filename_, &header,
                                      &add_chunks_cache_, &sub_chunks_cache_,
                                      file_.get(), &in_context);
    if (version == kInvalidVersion)
      return OnCorruptDatabase();

    if (header.shard_stride)
      in_stride = header.shard_stride;

    // The header checksum should have prevented this case, but the code will
    // be broken if this is not correct.
    if (!IsPowerOfTwo(in_stride))
      return OnCorruptDatabase();
  }

  // We no longer need to track deleted chunks.
  DeleteChunksFromSet(add_del_cache_, &add_chunks_cache_);
  DeleteChunksFromSet(sub_del_cache_, &sub_chunks_cache_);

  // Calculate |out_stride| to break the file down into reasonable shards.
  {
    int64 original_size = 0;
    if (!empty_ && !base::GetFileSize(filename_, &original_size))
      return OnCorruptDatabase();

    // Approximate the final size as everything. Subs and deletes will reduce
    // the size, but modest over-sharding won't hurt much.
    int64 shard_size = original_size + update_size;

    // Keep splitting until a single stride of data fits the target.
    size_t shifts = 0;
    while (out_stride > kMinShardStride && shard_size > kUpdateStorageBytes) {
      out_stride >>= 1;
      shard_size >>= 1;
      ++shifts;
    }
    UMA_HISTOGRAM_COUNTS("SB2.OutShardShifts", shifts);

    DCHECK(IsPowerOfTwo(out_stride));
  }

  // Outer loop strides by the max of the input stride (to read integral
  // shards) and the output stride (to write integral shards).
  process_stride = std::max(in_stride, out_stride);
  DCHECK(IsPowerOfTwo(process_stride));
  DCHECK_EQ(0u, process_stride % in_stride);
  DCHECK_EQ(0u, process_stride % out_stride);
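
  // For example, updating a store written with in_stride == 1 << 30 (4 input
  // shards) while targeting out_stride == 1 << 28 (16 output shards) yields
  // process_stride == 1 << 30: each pass of the loop below reads one input
  // shard and rewrites it as four output shards.
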
  // Start writing the new data to |new_file_|.
  base::MD5Context out_context;
  if (!WriteHeader(out_stride, add_chunks_cache_, sub_chunks_cache_,
                   new_file_.get(), &out_context)) {
    return false;
  }

  // Start at the beginning of the SBPrefix space.
  uint64 in_min = 0;
  uint64 out_min = 0;
  uint64 process_min = 0;

  // Start at the beginning of the updates.
  StateInternalPos new_pos = new_state.StateBegin();

  // Re-usable container for shard processing.
  StateInternal db_state;

  // Track aggregate counts for histograms.
  size_t add_prefix_count = 0;
  size_t sub_prefix_count = 0;

  do {
    // Maximum element in the current shard.
    SBPrefix process_max =
        static_cast<SBPrefix>(process_min + process_stride - 1);
    DCHECK_GT(process_max, process_min);

    // Drop the data from previous pass.
    db_state.ClearData();

    // Fill the processing shard with one or more input shards.
    if (!empty_) {
      do {
        ShardHeader shard_header;
        if (!ReadItem(&shard_header, file_.get(), &in_context))
          return OnCorruptDatabase();

        if (!db_state.AppendData(shard_header.add_prefix_count,
                                 shard_header.sub_prefix_count,
                                 shard_header.add_hash_count,
                                 shard_header.sub_hash_count,
                                 file_.get(), &in_context))
          return OnCorruptDatabase();

        in_min += in_stride;
      } while (in_min <= kMaxSBPrefix && in_min < process_max);
    }

    // Shard the update data to match the database data, then merge the update
    // data and process the results.
    StateInternalPos new_end = new_state.ShardEnd(new_pos, process_max);
    db_state.MergeDataAndProcess(new_pos, new_end,
                                 add_del_cache_, sub_del_cache_);
    new_pos = new_end;

    // Collect the processed data for return to caller.
    for (size_t i = 0; i < db_state.add_prefixes_.size(); ++i) {
      builder->AddPrefix(db_state.add_prefixes_[i].prefix);
    }
    add_full_hashes_result->insert(add_full_hashes_result->end(),
                                   db_state.add_full_hashes_.begin(),
                                   db_state.add_full_hashes_.end());
    add_prefix_count += db_state.add_prefixes_.size();
    sub_prefix_count += db_state.sub_prefixes_.size();

    // Write one or more shards of processed output.
    StateInternalPos out_pos = db_state.StateBegin();
    do {
      SBPrefix out_max = static_cast<SBPrefix>(out_min + out_stride - 1);
      DCHECK_GT(out_max, out_min);

      StateInternalPos out_end = db_state.ShardEnd(out_pos, out_max);
      if (!db_state.WriteShard(out_pos, out_end, new_file_.get(),
                               &out_context))
        return false;
      out_pos = out_end;

      out_min += out_stride;
    } while (out_min == static_cast<SBPrefix>(out_min) &&
             out_min < process_max);

    process_min += process_stride;
  } while (process_min <= kMaxSBPrefix);

  // Verify the overall checksum.
  if (!empty_) {
    if (!ReadAndVerifyChecksum(file_.get(), &in_context)) {
      RecordFormatEvent(FORMAT_EVENT_UPDATE_CHECKSUM_FAILURE);
      return OnCorruptDatabase();
    }

    // TODO(shess): Verify EOF?

    // Close the input file so the new file can be renamed over it.
    file_.reset();
  }
  DCHECK(!file_.get());

  // Write the overall checksum.
  base::MD5Digest out_digest;
  base::MD5Final(&out_digest, &out_context);
  if (!WriteItem(out_digest, new_file_.get(), NULL))
    return false;

  // Trim any excess left over from the temporary chunk data.
  if (!base::TruncateFile(new_file_.get()))
    return false;

  // Close the file handle and swizzle the file into place.
  new_file_.reset();
  if (!base::DeleteFile(filename_, false) &&
      base::PathExists(filename_))
    return false;

  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  if (!base::Move(new_filename, filename_))
    return false;

  // Record counts before swapping to caller.
  UMA_HISTOGRAM_COUNTS("SB2.AddPrefixes", add_prefix_count);
  UMA_HISTOGRAM_COUNTS("SB2.SubPrefixes", sub_prefix_count);

  return true;
}
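
// Note on crash behavior: the replacement data is fully written, checksummed,
// and truncated in the temporary file before the old store is deleted and the
// temporary file is moved into place, so an interrupted update leaves either
// the old store or the new data under the temporary name, never a partially
// overwritten store.
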
bool SafeBrowsingStoreFile::FinishUpdate(
    safe_browsing::PrefixSetBuilder* builder,
    std::vector<SBAddFullHash>* add_full_hashes_result) {
  DCHECK(CalledOnValidThread());
  DCHECK(builder);
  DCHECK(add_full_hashes_result);

  if (!DoUpdate(builder, add_full_hashes_result)) {
    CancelUpdate();
    return false;
  }

  DCHECK(!new_file_.get());
  DCHECK(!file_.get());

  return Close();
}

bool SafeBrowsingStoreFile::CancelUpdate() {
  DCHECK(CalledOnValidThread());
  bool ret = Close();

  // Delete stale staging file.
  const base::FilePath new_filename = TemporaryFileForFilename(filename_);
  base::DeleteFile(new_filename, false);

  return ret;
}

void SafeBrowsingStoreFile::SetAddChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  add_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckAddChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  return add_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetAddChunks(std::vector<int32>* out) {
  DCHECK(CalledOnValidThread());
  out->clear();
  out->insert(out->end(), add_chunks_cache_.begin(), add_chunks_cache_.end());
}

void SafeBrowsingStoreFile::SetSubChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  sub_chunks_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::CheckSubChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  return sub_chunks_cache_.count(chunk_id) > 0;
}

void SafeBrowsingStoreFile::GetSubChunks(std::vector<int32>* out) {
  DCHECK(CalledOnValidThread());
  out->clear();
  out->insert(out->end(), sub_chunks_cache_.begin(), sub_chunks_cache_.end());
}

void SafeBrowsingStoreFile::DeleteAddChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  add_del_cache_.insert(chunk_id);
}

void SafeBrowsingStoreFile::DeleteSubChunk(int32 chunk_id) {
  DCHECK(CalledOnValidThread());
  sub_del_cache_.insert(chunk_id);
}

bool SafeBrowsingStoreFile::DeleteStore(const base::FilePath& basename) {
  if (!base::DeleteFile(basename, false) &&
      base::PathExists(basename)) {
    return false;
  }

  const base::FilePath new_filename = TemporaryFileForFilename(basename);
  if (!base::DeleteFile(new_filename, false) &&
      base::PathExists(new_filename)) {
    return false;
  }

  // With SQLite support gone, one way to get to this code is if the
  // existing file is a SQLite file. Make sure the journal file is
  // also removed.
  const base::FilePath journal_filename(
      basename.value() + FILE_PATH_LITERAL("-journal"));
  if (base::PathExists(journal_filename))
    base::DeleteFile(journal_filename, false);

  return true;
}