// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/prefix_set.h"

#include <string.h>

#include <algorithm>

#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/logging.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"
namespace {

// |kMagic| should be reasonably unique, and not match itself across
// endianness changes.  I generated this value with:
// md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
static uint32 kMagic = 0x864088dd;

// Version history:
// Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
// Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
// Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05

// Version 2 layout is identical to version 1.  The sort order of |index_|
// changed from |int32| to |uint32| to match the change of |SBPrefix|.
// Version 3 adds storage for full hashes.
static uint32 kVersion = 3;
static uint32 kDeprecatedVersion = 2;  // And lower.

typedef struct {
  uint32 magic;
  uint32 version;
  uint32 index_size;
  uint32 deltas_size;
  uint32 full_hashes_size;
} FileHeader;
// Common std::vector<> implementations add capacity by multiplying from the
// current size (usually by 2 or 1.5) so that push_back() runs in amortized
// constant time.  This is not necessary for insert() at end(), but AFAICT it
// is what some implementations do anyhow.  SBPrefix values should uniformly
// cover the 32-bit space, so the final size can be estimated given a sorted
// subset of the input.
//
// |kEstimateThreshold| is the point where the estimates start converging.
// Results are strong starting around 1<<27.  1<<30 is chosen to prevent the
// degenerate case of resizing capacity from >50% to 100%.
//
// TODO(shess): I'm sure there is math in the world to describe good settings
// for estimating the size of a uniformly-distributed set of integers from a
// sorted subset.  I do not have such math in me, so I assumed that my current
// organic database of prefixes was scale-free, and wrote a script to see how
// often given slop values would always suffice for given strides.  At 1<<30,
// .5% slop was sufficient to cover all cases (though the code below uses 1%).
//
// TODO(shess): A smaller threshold uses less transient space in reallocation.
// 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc.  The
// cost is that a smaller threshold needs more slop (locked down for the long
// term).  1<<29 worked well with 1%, 1<<27 worked well with 2%.
const SBPrefix kEstimateThreshold = 1 << 30;
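// Worked example (illustrative figures, not from the original source): if
// 300,000 deltas have accumulated by the time prefixes reach 0x60000000
// (37.5% of the 32-bit space), EstimateFinalCount() below projects
// 300000 * 2^32 / 0x60000000 = 800,000 deltas total, and returns 808,000
// after adding the 1% slop.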
size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {
  // estimated_count / current_count == estimated_max / current_prefix
  // For large input sets, an estimated_max of 2^32 is close enough.
  const size_t estimated_prefix_count = static_cast<size_t>(
      (static_cast<uint64>(current_count) << 32) / current_prefix);

  // The estimate has an error bar; if the final total comes in below the
  // estimate there is no harm, but if it comes in above, a capacity resize
  // will happen at nearly 100%.  Add some slop to make sure all cases are
  // covered.
  return estimated_prefix_count + estimated_prefix_count / 100;
}

}  // namespace
namespace safe_browsing {

// For |std::upper_bound()| to find a prefix within a vector of pairs.
bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {
  return a.first < b.first;
}
PrefixSet::PrefixSet() {
}
PrefixSet::PrefixSet(IndexVector* index,
                     std::vector<uint16>* deltas,
                     std::vector<SBFullHash>* full_hashes) {
  DCHECK(index && deltas && full_hashes);
  index_.swap(*index);
  deltas_.swap(*deltas);
  full_hashes_.swap(*full_hashes);
}
PrefixSet::~PrefixSet() {}
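// Illustrative example of the encoding (values invented for clarity): the
// sorted prefixes {1000, 1005, 1017, 0x20000000} could be stored as
// index_ = {(1000, 0), (0x20000000, 2)} with deltas_ = {5, 12}.  Each index
// entry anchors a run of 16-bit deltas; the gap to 0x20000000 does not fit
// in a uint16, so it starts a new index entry.  PrefixExists(1017) locates
// the entry (1000, 0) via upper_bound, then accumulates 5 and 12 to reach
// 1017.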
bool PrefixSet::PrefixExists(SBPrefix prefix) const {
  // Find the first position after |prefix| in |index_|.
  IndexVector::const_iterator iter =
      std::upper_bound(index_.begin(), index_.end(),
                       IndexPair(prefix, 0), PrefixLess);

  // |prefix| comes before anything that's in the set.
  if (iter == index_.begin())
    return false;

  // Capture the upper bound of our target entry's deltas.
  const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);

  // Back up to the entry our target is in.
  --iter;

  // All prefixes in |index_| are in the set.
  SBPrefix current = iter->first;
  if (current == prefix)
    return true;

  // Scan forward accumulating deltas while a match is possible.
  for (size_t di = iter->second; di < bound && current < prefix; ++di) {
    current += deltas_[di];
  }

  return current == prefix;
}
bool PrefixSet::Exists(const SBFullHash& hash) const {
  if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),
                         hash, SBFullHashLess)) {
    return true;
  }
  return PrefixExists(hash.prefix);
}
void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
  prefixes->reserve(index_.size() + deltas_.size());

  for (size_t ii = 0; ii < index_.size(); ++ii) {
    // The deltas for this |index_| entry run to the next index entry,
    // or the end of the deltas.
    const size_t deltas_end =
        (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();

    SBPrefix current = index_[ii].first;
    prefixes->push_back(current);
    for (size_t di = index_[ii].second; di < deltas_end; ++di) {
      current += deltas_[di];
      prefixes->push_back(current);
    }
  }
}
scoped_ptr<const PrefixSet> PrefixSet::LoadFile(
    const base::FilePath& filter_name) {
  int64 size_64;
  if (!base::GetFileSize(filter_name, &size_64))
    return scoped_ptr<const PrefixSet>();
  using base::MD5Digest;
  if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
    return scoped_ptr<const PrefixSet>();

  base::ScopedFILE file(base::OpenFile(filter_name, "rb"));
  if (!file.get())
    return scoped_ptr<const PrefixSet>();

  FileHeader header;
  size_t read = fread(&header, sizeof(header), 1, file.get());
  if (read != 1)
    return scoped_ptr<const PrefixSet>();

  // The file looks valid, start building the digest.
  base::MD5Context context;
  base::MD5Init(&context);
  base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
                                              sizeof(header)));

  if (header.magic != kMagic)
    return scoped_ptr<const PrefixSet>();

  // Track the version read, to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);

  if (header.version <= kDeprecatedVersion) {
    return scoped_ptr<const PrefixSet>();
  } else if (header.version != kVersion) {
    return scoped_ptr<const PrefixSet>();
  }

  IndexVector index;
  const size_t index_bytes = sizeof(index[0]) * header.index_size;

  std::vector<uint16> deltas;
  const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;

  std::vector<SBFullHash> full_hashes;
  const size_t full_hashes_bytes =
      sizeof(full_hashes[0]) * header.full_hashes_size;

  // Check for bogus sizes before allocating any space.
  const size_t expected_bytes = sizeof(header) +
      index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);
  if (static_cast<int64>(expected_bytes) != size_64)
    return scoped_ptr<const PrefixSet>();

  // Read the index vector.  Herb Sutter indicates that vectors are
  // guaranteed to be contiguous, so reading into the storage of element 0
  // is valid.
  if (header.index_size) {
    index.resize(header.index_size);
    read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
    if (read != index.size())
      return scoped_ptr<const PrefixSet>();
    base::MD5Update(&context,
                    base::StringPiece(reinterpret_cast<char*>(&(index[0])),
                                      index_bytes));
  }

  // Read vector of deltas.
  if (header.deltas_size) {
    deltas.resize(header.deltas_size);
    read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
    if (read != deltas.size())
      return scoped_ptr<const PrefixSet>();
    base::MD5Update(&context,
                    base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
                                      deltas_bytes));
  }

  // Read vector of full hashes.
  if (header.full_hashes_size) {
    full_hashes.resize(header.full_hashes_size);
    read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),
                 file.get());
    if (read != full_hashes.size())
      return scoped_ptr<const PrefixSet>();
    base::MD5Update(&context,
                    base::StringPiece(
                        reinterpret_cast<char*>(&(full_hashes[0])),
                        full_hashes_bytes));
  }

  base::MD5Digest calculated_digest;
  base::MD5Final(&calculated_digest, &context);

  base::MD5Digest file_digest;
  read = fread(&file_digest, sizeof(file_digest), 1, file.get());
  if (read != 1)
    return scoped_ptr<const PrefixSet>();

  if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
    return scoped_ptr<const PrefixSet>();

  // Steals vector contents using swap().
  return make_scoped_ptr(new PrefixSet(&index, &deltas, &full_hashes));
}
bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
  FileHeader header;
  header.magic = kMagic;
  header.version = kVersion;
  header.index_size = static_cast<uint32>(index_.size());
  header.deltas_size = static_cast<uint32>(deltas_.size());
  header.full_hashes_size = static_cast<uint32>(full_hashes_.size());

  // Sanity check that the 32-bit values never mess things up.
  if (static_cast<size_t>(header.index_size) != index_.size() ||
      static_cast<size_t>(header.deltas_size) != deltas_.size() ||
      static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {
    NOTREACHED();
    return false;
  }

  base::ScopedFILE file(base::OpenFile(filter_name, "wb"));
  if (!file.get())
    return false;

  base::MD5Context context;
  base::MD5Init(&context);

  // TODO(shess): The I/O code in safe_browsing_store_file.cc would
  // sure be useful about now.
  size_t written = fwrite(&header, sizeof(header), 1, file.get());
  if (written != 1)
    return false;
  base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
                                              sizeof(header)));

  // As for reads, the standard guarantees the ability to access the
  // contents of the vector by a pointer to an element.
  if (index_.size()) {
    const size_t index_bytes = sizeof(index_[0]) * index_.size();
    written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
                     file.get());
    if (written != index_.size())
      return false;
    base::MD5Update(&context,
                    base::StringPiece(
                        reinterpret_cast<const char*>(&(index_[0])),
                        index_bytes));
  }

  if (deltas_.size()) {
    const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
    written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
                     file.get());
    if (written != deltas_.size())
      return false;
    base::MD5Update(&context,
                    base::StringPiece(
                        reinterpret_cast<const char*>(&(deltas_[0])),
                        deltas_bytes));
  }

  if (full_hashes_.size()) {
    const size_t elt_size = sizeof(full_hashes_[0]);
    const size_t elts = full_hashes_.size();
    const size_t full_hashes_bytes = elt_size * elts;
    written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());
    if (written != elts)
      return false;
    base::MD5Update(&context,
                    base::StringPiece(
                        reinterpret_cast<const char*>(&(full_hashes_[0])),
                        full_hashes_bytes));
  }

  base::MD5Digest digest;
  base::MD5Final(&digest, &context);
  written = fwrite(&digest, sizeof(digest), 1, file.get());
  if (written != 1)
    return false;

  // TODO(shess): Can this code check that the close was successful?
  return true;
}
void PrefixSet::AddRun(SBPrefix index_prefix,
                       const uint16* run_begin, const uint16* run_end) {
  // Preempt organic capacity decisions for |deltas_| once strong estimates
  // can be made.
  if (index_prefix > kEstimateThreshold &&
      deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {
    deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));
  }

  index_.push_back(std::make_pair(index_prefix, deltas_.size()));
  deltas_.insert(deltas_.end(), run_begin, run_end);
}
PrefixSetBuilder::PrefixSetBuilder()
    : prefix_set_(new PrefixSet()) {
}

PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)
    : prefix_set_(new PrefixSet()) {
  for (size_t i = 0; i < prefixes.size(); ++i) {
    AddPrefix(prefixes[i]);
  }
}

PrefixSetBuilder::~PrefixSetBuilder() {
}
scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSet(
    const std::vector<SBFullHash>& hashes) {
  DCHECK(prefix_set_.get());

  // Flush runs until the buffered data is gone.
  while (!buffer_.empty()) {
    EmitRun();
  }

  // Precisely size |index_| for read-only.  It's 50k-60k, so minor savings,
  // but they're almost free.
  PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);

  prefix_set_->full_hashes_ = hashes;
  std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),
            SBFullHashLess);

  return prefix_set_.Pass();
}

scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {
  return GetPrefixSet(std::vector<SBFullHash>()).Pass();
}
void PrefixSetBuilder::EmitRun() {
  DCHECK(prefix_set_.get());

  SBPrefix prev_prefix = buffer_[0];
  uint16 run[PrefixSet::kMaxRun];
  size_t run_pos = 0;

  size_t i;
  for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {
    // Calculate the delta.  |unsigned| is mandatory, because the
    // sorted prefixes could be more than INT_MAX apart.
    DCHECK_GT(buffer_[i], prev_prefix);
    const unsigned delta = buffer_[i] - prev_prefix;
    const uint16 delta16 = static_cast<uint16>(delta);

    // Break the run if the delta doesn't fit.
    if (delta != static_cast<unsigned>(delta16))
      break;

    // Continue the run of deltas.
    run[run_pos++] = delta16;
    DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);

    prev_prefix = buffer_[i];
  }
  prefix_set_->AddRun(buffer_[0], run, run + run_pos);
  buffer_.erase(buffer_.begin(), buffer_.begin() + i);
}
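// Illustrative trace (values invented): with buffer_ = {10, 20, 90000}, the
// first delta (10) fits in a uint16, but the next (89980) does not, so the
// loop above breaks with i == 2.  AddRun() records an index entry for 10
// carrying the single delta {10}, and 90000 stays in the buffer to anchor
// the next run.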
void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {
  DCHECK(prefix_set_.get());

  if (buffer_.empty()) {
    DCHECK(prefix_set_->index_.empty());
    DCHECK(prefix_set_->deltas_.empty());
  } else {
    // Duplicates are dropped; otherwise prefixes must arrive sorted.
    if (buffer_.back() == prefix)
      return;

    DCHECK_LT(buffer_.back(), prefix);
  }

  buffer_.push_back(prefix);

  // Flush the buffer when a run can be constructed.  +1 for the index item,
  // and +1 to leave at least one item in the buffer for dropping duplicates.
  if (buffer_.size() > PrefixSet::kMaxRun + 2)
    EmitRun();
}

}  // namespace safe_browsing