chrome/browser/safe_browsing/prefix_set.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/browser/safe_browsing/prefix_set.h"
   6
   7 #include <algorithm>
   8 #include <math.h>
   9
  10 #include "base/file_util.h"
  11 #include "base/files/scoped_file.h"
  12 #include "base/logging.h"
  13 #include "base/md5.h"
  14 #include "base/metrics/histogram.h"
  15 #include "base/metrics/sparse_histogram.h"
  16
  17 namespace {
  18
  19 // |kMagic| should be reasonably unique, and not match itself across
  20 // endianness changes.  I generated this value with:
  21 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
  22 static uint32 kMagic = 0x864088dd;
  23
  24 // Version history:
  25 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
  26 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
  27 // Version 3: ????????/r?????? by shess@chromium.org on 2014-04-??
  28
  29 // Version 2 layout is identical to version 1.  The sort order of |index_|
  30 // changed from |int32| to |uint32| to match the change of |SBPrefix|.
  31 // Version 3 adds storage for full hashes.
  32 static uint32 kVersion = 0x3;
  33
  34 typedef struct {
  35   uint32 magic;
  36   uint32 version;
  37   uint32 index_size;
  38   uint32 deltas_size;
  39 } FileHeader_v2;
  40
  41 typedef struct {
  42   uint32 magic;
  43   uint32 version;
  44   uint32 index_size;
  45   uint32 deltas_size;
  46   uint32 full_hashes_size;
  47 } FileHeader;
  48
  49 // Common std::vector<> implementations add capacity by multiplying from the
  50 // current size (usually either by 2 or 1.5) to satisfy push_back() running in
  51 // amortized constant time.  This is not necessary for insert() at end(), but
  52 // AFAICT it seems true for some implementations.  SBPrefix values should
  53 // uniformly cover the 32-bit space, so the final size can be estimated given a
  54 // subset of the input.
  55 //
  56 // |kEstimateThreshold| is when estimates start converging.  Results are strong
  57 // starting around 1<<27.  1<<30 is chosen to prevent the degenerate case of
  58 // resizing capacity from >50% to 100%.
  59 //
  60 // TODO(shess): I'm sure there is math in the world to describe good settings
  61 // for estimating the size of a uniformly-distributed set of integers from a
  62 // sorted subset.  I do not have such math in me, so I assumed that my current
  63 // organic database of prefixes was scale-free, and wrote a script to see how
  64 // often given slop values would always suffice for given strides.  At 1<<30,
  65 // .5% slop was sufficient to cover all cases (though the code below uses 1%).
  66 //
  67 // TODO(shess): A smaller threshold uses less transient space in reallocation.
  68 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc.  The cost
  69 // is that a smaller threshold needs more slop (locked down for the long term).
  70 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.
  71 const SBPrefix kEstimateThreshold = 1 << 30;
  72 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {
  73   // estimated_count / current_count == estimated_max / current_prefix
  74   // For large input sets, estimated_max of 2^32 is close enough.
  75   const size_t estimated_prefix_count = static_cast<size_t>(
  76       (static_cast<uint64>(current_count) << 32) / current_prefix);
  77
  78   // The estimate has an error bar, if the final total is below the estimate, no
  79   // harm, but if it is above a capacity resize will happen at nearly 100%.  Add
  80   // some slop to make sure all cases are covered.
  81   return estimated_prefix_count + estimated_prefix_count / 100;
  82 }
  83
  84 }  // namespace
  85
  86 namespace safe_browsing {
  87
  88 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
  89 // static
  90 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {
  91   return a.first < b.first;
  92 }
  93
  94 PrefixSet::PrefixSet() {
  95 }
  96
  97 PrefixSet::PrefixSet(IndexVector* index,
  98                      std::vector<uint16>* deltas,
  99                      std::vector<SBFullHash>* full_hashes) {
 100   DCHECK(index && deltas && full_hashes);
 101   index_.swap(*index);
 102   deltas_.swap(*deltas);
 103   full_hashes_.swap(*full_hashes);
 104 }
 105
 106 PrefixSet::~PrefixSet() {}
 107
 108 bool PrefixSet::PrefixExists(SBPrefix prefix) const {
 109   if (index_.empty())
 110     return false;
 111
 112   // Find the first position after |prefix| in |index_|.
 113   IndexVector::const_iterator iter =
 114       std::upper_bound(index_.begin(), index_.end(),
 115                        IndexPair(prefix, 0), PrefixLess);
 116
 117   // |prefix| comes before anything that's in the set.
 118   if (iter == index_.begin())
 119     return false;
 120
 121   // Capture the upper bound of our target entry's deltas.
 122   const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
 123
 124   // Back up to the entry our target is in.
 125   --iter;
 126
 127   // All prefixes in |index_| are in the set.
 128   SBPrefix current = iter->first;
 129   if (current == prefix)
 130     return true;
 131
 132   // Scan forward accumulating deltas while a match is possible.
 133   for (size_t di = iter->second; di < bound && current < prefix; ++di) {
 134     current += deltas_[di];
 135   }
 136
 137   return current == prefix;
 138 }
 139
 140 bool PrefixSet::Exists(const SBFullHash& hash) const {
 141   if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),
 142                          hash, SBFullHashLess)) {
 143     return true;
 144   }
 145   return PrefixExists(hash.prefix);
 146 }
 147
 148 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
 149   prefixes->reserve(index_.size() + deltas_.size());
 150
 151   for (size_t ii = 0; ii < index_.size(); ++ii) {
 152     // The deltas for this |index_| entry run to the next index entry,
 153     // or the end of the deltas.
 154     const size_t deltas_end =
 155         (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
 156
 157     SBPrefix current = index_[ii].first;
 158     prefixes->push_back(current);
 159     for (size_t di = index_[ii].second; di < deltas_end; ++di) {
 160       current += deltas_[di];
 161       prefixes->push_back(current);
 162     }
 163   }
 164 }
 165
 166 // static
 167 scoped_ptr<PrefixSet> PrefixSet::LoadFile(const base::FilePath& filter_name) {
 168   int64 size_64;
 169   if (!base::GetFileSize(filter_name, &size_64))
 170     return scoped_ptr<PrefixSet>();
 171   using base::MD5Digest;
 172   // TODO(shess): Revert to sizeof(FileHeader) for sanity check once v2 is
 173   // deprecated.
 174   if (size_64 < static_cast<int64>(sizeof(FileHeader_v2) + sizeof(MD5Digest)))
 175     return scoped_ptr<PrefixSet>();
 176
 177   base::ScopedFILE file(base::OpenFile(filter_name, "rb"));
 178   if (!file.get())
 179     return scoped_ptr<PrefixSet>();
 180
 181   FileHeader header;
 182   size_t read = fread(&header, sizeof(header), 1, file.get());
 183   if (read != 1)
 184     return scoped_ptr<PrefixSet>();
 185
 186   // The file looks valid, start building the digest.
 187   base::MD5Context context;
 188   base::MD5Init(&context);
 189   base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
 190                                               sizeof(header)));
 191
 192   if (header.magic != kMagic)
 193     return scoped_ptr<PrefixSet>();
 194
 195   // Track version read to inform removal of support for older versions.
 196   UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);
 197
 198   // TODO(shess): Version 1 and 2 use the same file structure, with version 1
 199   // data using a signed sort.  For M-35, the data is re-sorted before return.
 200   // After M-36, just drop v1 support. <http://crbug.com/346405>
 201   // TODO(shess): <http://crbug.com/368044> for removing v2 support.
 202   size_t header_size = sizeof(header);
 203   if (header.version == 2 || header.version == 1) {
 204     // Rewind the file and restart building the digest with the old header
 205     // structure.
 206     FileHeader_v2 v2_header;
 207     if (0 != fseek(file.get(), 0, SEEK_SET))
 208       return scoped_ptr<PrefixSet>();
 209
 210     size_t read = fread(&v2_header, sizeof(v2_header), 1, file.get());
 211     if (read != 1)
 212       return scoped_ptr<PrefixSet>();
 213
 214     base::MD5Init(&context);
 215     base::MD5Update(&context,
 216                     base::StringPiece(reinterpret_cast<char*>(&v2_header),
 217                                       sizeof(v2_header)));
 218
 219     // The current header is a superset of the old header, fill it in with the
 220     // information read.
 221     header.magic = v2_header.magic;
 222     header.version = v2_header.version;
 223     header.index_size = v2_header.index_size;
 224     header.deltas_size = v2_header.deltas_size;
 225     header.full_hashes_size = 0;
 226     header_size = sizeof(v2_header);
 227   } else if (header.version != kVersion) {
 228     return scoped_ptr<PrefixSet>();
 229   }
 230
 231   IndexVector index;
 232   const size_t index_bytes = sizeof(index[0]) * header.index_size;
 233
 234   std::vector<uint16> deltas;
 235   const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
 236
 237   std::vector<SBFullHash> full_hashes;
 238   const size_t full_hashes_bytes =
 239       sizeof(full_hashes[0]) * header.full_hashes_size;
 240
 241   // Check for bogus sizes before allocating any space.
 242   const size_t expected_bytes = header_size +
 243       index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);
 244   if (static_cast<int64>(expected_bytes) != size_64)
 245     return scoped_ptr<PrefixSet>();
 246
 247   // Read the index vector.  Herb Sutter indicates that vectors are
 248   // guaranteed to be contiuguous, so reading to where element 0 lives
 249   // is valid.
 250   if (header.index_size) {
 251     index.resize(header.index_size);
 252     read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
 253     if (read != index.size())
 254       return scoped_ptr<PrefixSet>();
 255     base::MD5Update(&context,
 256                     base::StringPiece(reinterpret_cast<char*>(&(index[0])),
 257                                       index_bytes));
 258   }
 259
 260   // Read vector of deltas.
 261   if (header.deltas_size) {
 262     deltas.resize(header.deltas_size);
 263     read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
 264     if (read != deltas.size())
 265       return scoped_ptr<PrefixSet>();
 266     base::MD5Update(&context,
 267                     base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
 268                                       deltas_bytes));
 269   }
 270
 271   // Read vector of full hashes.
 272   if (header.full_hashes_size) {
 273     full_hashes.resize(header.full_hashes_size);
 274     read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),
 275                  file.get());
 276     if (read != full_hashes.size())
 277       return scoped_ptr<PrefixSet>();
 278     base::MD5Update(&context,
 279                     base::StringPiece(
 280                         reinterpret_cast<char*>(&(full_hashes[0])),
 281                         full_hashes_bytes));
 282   }
 283
 284   base::MD5Digest calculated_digest;
 285   base::MD5Final(&calculated_digest, &context);
 286
 287   base::MD5Digest file_digest;
 288   read = fread(&file_digest, sizeof(file_digest), 1, file.get());
 289   if (read != 1)
 290     return scoped_ptr<PrefixSet>();
 291
 292   if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
 293     return scoped_ptr<PrefixSet>();
 294
 295   // For version 1, fetch the prefixes and re-sort.
 296   if (header.version == 1) {
 297     std::vector<SBPrefix> prefixes;
 298     PrefixSet(&index, &deltas, &full_hashes).GetPrefixes(&prefixes);
 299     std::sort(prefixes.begin(), prefixes.end());
 300
 301     // v1 cannot have full hashes, so no need to propagate a copy here.
 302     return PrefixSetBuilder(prefixes).GetPrefixSetNoHashes().Pass();
 303   }
 304
 305   // Steals vector contents using swap().
 306   return scoped_ptr<PrefixSet>(new PrefixSet(&index, &deltas, &full_hashes));
 307 }
 308
 309 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
 310   FileHeader header;
 311   header.magic = kMagic;
 312   header.version = kVersion;
 313   header.index_size = static_cast<uint32>(index_.size());
 314   header.deltas_size = static_cast<uint32>(deltas_.size());
 315   header.full_hashes_size = static_cast<uint32>(full_hashes_.size());
 316
 317   // Sanity check that the 32-bit values never mess things up.
 318   if (static_cast<size_t>(header.index_size) != index_.size() ||
 319       static_cast<size_t>(header.deltas_size) != deltas_.size() ||
 320       static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {
 321     NOTREACHED();
 322     return false;
 323   }
 324
 325   base::ScopedFILE file(base::OpenFile(filter_name, "wb"));
 326   if (!file.get())
 327     return false;
 328
 329   base::MD5Context context;
 330   base::MD5Init(&context);
 331
 332   // TODO(shess): The I/O code in safe_browsing_store_file.cc would
 333   // sure be useful about now.
 334   size_t written = fwrite(&header, sizeof(header), 1, file.get());
 335   if (written != 1)
 336     return false;
 337   base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
 338                                               sizeof(header)));
 339
 340   // As for reads, the standard guarantees the ability to access the
 341   // contents of the vector by a pointer to an element.
 342   if (index_.size()) {
 343     const size_t index_bytes = sizeof(index_[0]) * index_.size();
 344     written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
 345                      file.get());
 346     if (written != index_.size())
 347       return false;
 348     base::MD5Update(&context,
 349                     base::StringPiece(
 350                         reinterpret_cast<const char*>(&(index_[0])),
 351                         index_bytes));
 352   }
 353
 354   if (deltas_.size()) {
 355     const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
 356     written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
 357                      file.get());
 358     if (written != deltas_.size())
 359       return false;
 360     base::MD5Update(&context,
 361                     base::StringPiece(
 362                         reinterpret_cast<const char*>(&(deltas_[0])),
 363                         deltas_bytes));
 364   }
 365
 366   if (full_hashes_.size()) {
 367     const size_t elt_size = sizeof(full_hashes_[0]);
 368     const size_t elts = full_hashes_.size();
 369     const size_t full_hashes_bytes = elt_size * elts;
 370     written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());
 371     if (written != elts)
 372       return false;
 373     base::MD5Update(&context,
 374                     base::StringPiece(
 375                         reinterpret_cast<const char*>(&(full_hashes_[0])),
 376                         full_hashes_bytes));
 377   }
 378
 379   base::MD5Digest digest;
 380   base::MD5Final(&digest, &context);
 381   written = fwrite(&digest, sizeof(digest), 1, file.get());
 382   if (written != 1)
 383     return false;
 384
 385   // TODO(shess): Can this code check that the close was successful?
 386   file.reset();
 387
 388   return true;
 389 }
 390
 391 void PrefixSet::AddRun(SBPrefix index_prefix,
 392                        const uint16* run_begin, const uint16* run_end) {
 393   // Preempt organic capacity decisions for |delta_| once strong estimates can
 394   // be made.
 395   if (index_prefix > kEstimateThreshold &&
 396       deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {
 397     deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));
 398   }
 399
 400   index_.push_back(std::make_pair(index_prefix, deltas_.size()));
 401   deltas_.insert(deltas_.end(), run_begin, run_end);
 402 }
 403
 404 PrefixSetBuilder::PrefixSetBuilder()
 405     : prefix_set_(new PrefixSet()) {
 406 }
 407
 408 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)
 409     : prefix_set_(new PrefixSet()) {
 410   for (size_t i = 0; i < prefixes.size(); ++i) {
 411     AddPrefix(prefixes[i]);
 412   }
 413 }
 414
 415 PrefixSetBuilder::~PrefixSetBuilder() {
 416 }
 417
 418 scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSet(
 419     const std::vector<SBFullHash>& hashes) {
 420   DCHECK(prefix_set_.get());
 421
 422   // Flush runs until buffered data is gone.
 423   while (!buffer_.empty()) {
 424     EmitRun();
 425   }
 426
 427   // Precisely size |index_| for read-only.  It's 50k-60k, so minor savings, but
 428   // they're almost free.
 429   PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);
 430
 431   prefix_set_->full_hashes_ = hashes;
 432   std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),
 433             SBFullHashLess);
 434
 435   return prefix_set_.Pass();
 436 }
 437
 438 scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {
 439   return GetPrefixSet(std::vector<SBFullHash>()).Pass();
 440 }
 441
 442 void PrefixSetBuilder::EmitRun() {
 443   DCHECK(prefix_set_.get());
 444
 445   SBPrefix prev_prefix = buffer_[0];
 446   uint16 run[PrefixSet::kMaxRun];
 447   size_t run_pos = 0;
 448
 449   size_t i;
 450   for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {
 451     // Calculate the delta.  |unsigned| is mandatory, because the
 452     // sorted_prefixes could be more than INT_MAX apart.
 453     DCHECK_GT(buffer_[i], prev_prefix);
 454     const unsigned delta = buffer_[i] - prev_prefix;
 455     const uint16 delta16 = static_cast<uint16>(delta);
 456
 457     // Break the run if the delta doesn't fit.
 458     if (delta != static_cast<unsigned>(delta16))
 459       break;
 460
 461     // Continue the run of deltas.
 462     run[run_pos++] = delta16;
 463     DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);
 464
 465     prev_prefix = buffer_[i];
 466   }
 467   prefix_set_->AddRun(buffer_[0], run, run + run_pos);
 468   buffer_.erase(buffer_.begin(), buffer_.begin() + i);
 469 }
 470
 471 void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {
 472   DCHECK(prefix_set_.get());
 473
 474   if (buffer_.empty()) {
 475     DCHECK(prefix_set_->index_.empty());
 476     DCHECK(prefix_set_->deltas_.empty());
 477   } else {
 478     // Drop duplicates.
 479     if (buffer_.back() == prefix)
 480       return;
 481
 482     DCHECK_LT(buffer_.back(), prefix);
 483   }
 484   buffer_.push_back(prefix);
 485
 486   // Flush buffer when a run can be constructed.  +1 for the index item, and +1
 487   // to leave at least one item in the buffer for dropping duplicates.
 488   if (buffer_.size() > PrefixSet::kMaxRun + 2)
 489     EmitRun();
 490 }
 491
 492 }  // namespace safe_browsing