chrome/browser/safe_browsing/prefix_set.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "chrome/browser/safe_browsing/prefix_set.h"
   6
   7 #include <algorithm>
   8 #include <math.h>
   9
  10 #include "base/file_util.h"
  11 #include "base/logging.h"
  12 #include "base/md5.h"
  13 #include "base/metrics/histogram.h"
  14
  15 namespace {
  16
  17 // |kMagic| should be reasonably unique, and not match itself across
  18 // endianness changes.  I generated this value with:
  19 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
  20 static uint32 kMagic = 0x864088dd;
  21
  22 // Current version the code writes out.
  23 static uint32 kVersion = 0x1;
  24
  25 typedef struct {
  26   uint32 magic;
  27   uint32 version;
  28   uint32 index_size;
  29   uint32 deltas_size;
  30 } FileHeader;
  31
  32 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
  33 bool PrefixLess(const std::pair<SBPrefix,size_t>& a,
  34                 const std::pair<SBPrefix,size_t>& b) {
  35   return a.first < b.first;
  36 }
  37
  38 }  // namespace
  39
  40 namespace safe_browsing {
  41
  42 PrefixSet::PrefixSet(const std::vector<SBPrefix>& sorted_prefixes) {
  43   if (sorted_prefixes.size()) {
  44     // Estimate the resulting vector sizes.  There will be strictly
  45     // more than |min_runs| entries in |index_|, but there generally
  46     // aren't many forced breaks.
  47     const size_t min_runs = sorted_prefixes.size() / kMaxRun;
  48     index_.reserve(min_runs);
  49     deltas_.reserve(sorted_prefixes.size() - min_runs);
  50
  51     // Lead with the first prefix.
  52     SBPrefix prev_prefix = sorted_prefixes[0];
  53     size_t run_length = 0;
  54     index_.push_back(std::make_pair(prev_prefix, deltas_.size()));
  55
  56     for (size_t i = 1; i < sorted_prefixes.size(); ++i) {
  57       // Skip duplicates.
  58       if (sorted_prefixes[i] == prev_prefix)
  59         continue;
  60
  61       // Calculate the delta.  |unsigned| is mandatory, because the
  62       // sorted_prefixes could be more than INT_MAX apart.
  63       DCHECK_GT(sorted_prefixes[i], prev_prefix);
  64       const unsigned delta = sorted_prefixes[i] - prev_prefix;
  65       const uint16 delta16 = static_cast<uint16>(delta);
  66
  67       // New index ref if the delta doesn't fit, or if too many
  68       // consecutive deltas have been encoded.
  69       if (delta != static_cast<unsigned>(delta16) || run_length >= kMaxRun) {
  70         index_.push_back(std::make_pair(sorted_prefixes[i], deltas_.size()));
  71         run_length = 0;
  72       } else {
  73         // Continue the run of deltas.
  74         deltas_.push_back(delta16);
  75         DCHECK_EQ(static_cast<unsigned>(deltas_.back()), delta);
  76         ++run_length;
  77       }
  78
  79       prev_prefix = sorted_prefixes[i];
  80     }
  81
  82     // Send up some memory-usage stats.  Bits because fractional bytes
  83     // are weird.
  84     const size_t bits_used = index_.size() * sizeof(index_[0]) * CHAR_BIT +
  85         deltas_.size() * sizeof(deltas_[0]) * CHAR_BIT;
  86     const size_t unique_prefixes = index_.size() + deltas_.size();
  87     static const size_t kMaxBitsPerPrefix = sizeof(SBPrefix) * CHAR_BIT;
  88     UMA_HISTOGRAM_ENUMERATION("SB2.PrefixSetBitsPerPrefix",
  89                               bits_used / unique_prefixes,
  90                               kMaxBitsPerPrefix);
  91   }
  92 }
  93
  94 PrefixSet::PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index,
  95                      std::vector<uint16> *deltas) {
  96   DCHECK(index && deltas);
  97   index_.swap(*index);
  98   deltas_.swap(*deltas);
  99 }
 100
 101 PrefixSet::~PrefixSet() {}
 102
 103 bool PrefixSet::Exists(SBPrefix prefix) const {
 104   if (index_.empty())
 105     return false;
 106
 107   // Find the first position after |prefix| in |index_|.
 108   std::vector<std::pair<SBPrefix,size_t> >::const_iterator
 109       iter = std::upper_bound(index_.begin(), index_.end(),
 110                               std::pair<SBPrefix,size_t>(prefix, 0),
 111                               PrefixLess);
 112
 113   // |prefix| comes before anything that's in the set.
 114   if (iter == index_.begin())
 115     return false;
 116
 117   // Capture the upper bound of our target entry's deltas.
 118   const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
 119
 120   // Back up to the entry our target is in.
 121   --iter;
 122
 123   // All prefixes in |index_| are in the set.
 124   SBPrefix current = iter->first;
 125   if (current == prefix)
 126     return true;
 127
 128   // Scan forward accumulating deltas while a match is possible.
 129   for (size_t di = iter->second; di < bound && current < prefix; ++di) {
 130     current += deltas_[di];
 131   }
 132
 133   return current == prefix;
 134 }
 135
 136 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
 137   prefixes->reserve(index_.size() + deltas_.size());
 138
 139   for (size_t ii = 0; ii < index_.size(); ++ii) {
 140     // The deltas for this |index_| entry run to the next index entry,
 141     // or the end of the deltas.
 142     const size_t deltas_end =
 143         (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
 144
 145     SBPrefix current = index_[ii].first;
 146     prefixes->push_back(current);
 147     for (size_t di = index_[ii].second; di < deltas_end; ++di) {
 148       current += deltas_[di];
 149       prefixes->push_back(current);
 150     }
 151   }
 152 }
 153
 154 // static
 155 PrefixSet* PrefixSet::LoadFile(const base::FilePath& filter_name) {
 156   int64 size_64;
 157   if (!base::GetFileSize(filter_name, &size_64))
 158     return NULL;
 159   using base::MD5Digest;
 160   if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
 161     return NULL;
 162
 163   file_util::ScopedFILE file(base::OpenFile(filter_name, "rb"));
 164   if (!file.get())
 165     return NULL;
 166
 167   FileHeader header;
 168   size_t read = fread(&header, sizeof(header), 1, file.get());
 169   if (read != 1)
 170     return NULL;
 171
 172   if (header.magic != kMagic || header.version != kVersion)
 173     return NULL;
 174
 175   std::vector<std::pair<SBPrefix,size_t> > index;
 176   const size_t index_bytes = sizeof(index[0]) * header.index_size;
 177
 178   std::vector<uint16> deltas;
 179   const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
 180
 181   // Check for bogus sizes before allocating any space.
 182   const size_t expected_bytes =
 183       sizeof(header) + index_bytes + deltas_bytes + sizeof(MD5Digest);
 184   if (static_cast<int64>(expected_bytes) != size_64)
 185     return NULL;
 186
 187   // The file looks valid, start building the digest.
 188   base::MD5Context context;
 189   base::MD5Init(&context);
 190   base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
 191                                               sizeof(header)));
 192
 193   // Read the index vector.  Herb Sutter indicates that vectors are
 194   // guaranteed to be contiuguous, so reading to where element 0 lives
 195   // is valid.
 196   if (header.index_size) {
 197     index.resize(header.index_size);
 198     read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
 199     if (read != index.size())
 200       return NULL;
 201     base::MD5Update(&context,
 202                     base::StringPiece(reinterpret_cast<char*>(&(index[0])),
 203                                       index_bytes));
 204   }
 205
 206   // Read vector of deltas.
 207   if (header.deltas_size) {
 208     deltas.resize(header.deltas_size);
 209     read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
 210     if (read != deltas.size())
 211       return NULL;
 212     base::MD5Update(&context,
 213                     base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
 214                                       deltas_bytes));
 215   }
 216
 217   base::MD5Digest calculated_digest;
 218   base::MD5Final(&calculated_digest, &context);
 219
 220   base::MD5Digest file_digest;
 221   read = fread(&file_digest, sizeof(file_digest), 1, file.get());
 222   if (read != 1)
 223     return NULL;
 224
 225   if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
 226     return NULL;
 227
 228   // Steals contents of |index| and |deltas| via swap().
 229   return new PrefixSet(&index, &deltas);
 230 }
 231
 232 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
 233   FileHeader header;
 234   header.magic = kMagic;
 235   header.version = kVersion;
 236   header.index_size = static_cast<uint32>(index_.size());
 237   header.deltas_size = static_cast<uint32>(deltas_.size());
 238
 239   // Sanity check that the 32-bit values never mess things up.
 240   if (static_cast<size_t>(header.index_size) != index_.size() ||
 241       static_cast<size_t>(header.deltas_size) != deltas_.size()) {
 242     NOTREACHED();
 243     return false;
 244   }
 245
 246   file_util::ScopedFILE file(base::OpenFile(filter_name, "wb"));
 247   if (!file.get())
 248     return false;
 249
 250   base::MD5Context context;
 251   base::MD5Init(&context);
 252
 253   // TODO(shess): The I/O code in safe_browsing_store_file.cc would
 254   // sure be useful about now.
 255   size_t written = fwrite(&header, sizeof(header), 1, file.get());
 256   if (written != 1)
 257     return false;
 258   base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
 259                                               sizeof(header)));
 260
 261   // As for reads, the standard guarantees the ability to access the
 262   // contents of the vector by a pointer to an element.
 263   if (index_.size()) {
 264     const size_t index_bytes = sizeof(index_[0]) * index_.size();
 265     written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
 266                      file.get());
 267     if (written != index_.size())
 268       return false;
 269     base::MD5Update(&context,
 270                     base::StringPiece(
 271                         reinterpret_cast<const char*>(&(index_[0])),
 272                         index_bytes));
 273   }
 274
 275   if (deltas_.size()) {
 276     const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
 277     written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
 278                      file.get());
 279     if (written != deltas_.size())
 280       return false;
 281     base::MD5Update(&context,
 282                     base::StringPiece(
 283                         reinterpret_cast<const char*>(&(deltas_[0])),
 284                         deltas_bytes));
 285   }
 286
 287   base::MD5Digest digest;
 288   base::MD5Final(&digest, &context);
 289   written = fwrite(&digest, sizeof(digest), 1, file.get());
 290   if (written != 1)
 291     return false;
 292
 293   // TODO(shess): Can this code check that the close was successful?
 294   file.reset();
 295
 296   return true;
 297 }
 298
 299 }  // namespace safe_browsing