Roll src/third_party/WebKit f298044:aa8346d (svn 202628:202629)
[chromium-blink-merge.git] / chrome / browser / safe_browsing / prefix_set.cc
blob7cd69e99177d3a8d1512614690c541cf319869be
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/prefix_set.h"
7 #include <algorithm>
9 #include "base/files/file_util.h"
10 #include "base/files/scoped_file.h"
11 #include "base/logging.h"
12 #include "base/md5.h"
13 #include "base/metrics/histogram.h"
14 #include "base/metrics/sparse_histogram.h"
16 namespace {
18 // |kMagic| should be reasonably unique, and not match itself across
19 // endianness changes. I generated this value with:
20 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
21 static uint32 kMagic = 0x864088dd;
23 // Version history:
24 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
25 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
26 // Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05
28 // Version 2 layout is identical to version 1. The sort order of |index_|
29 // changed from |int32| to |uint32| to match the change of |SBPrefix|.
30 // Version 3 adds storage for full hashes.
31 static uint32 kVersion = 3;
32 static uint32 kDeprecatedVersion = 2; // And lower.
34 typedef struct {
35 uint32 magic;
36 uint32 version;
37 uint32 index_size;
38 uint32 deltas_size;
39 uint32 full_hashes_size;
40 } FileHeader;
42 // Common std::vector<> implementations add capacity by multiplying from the
43 // current size (usually either by 2 or 1.5) to satisfy push_back() running in
44 // amortized constant time. This is not necessary for insert() at end(), but
45 // AFAICT it seems true for some implementations. SBPrefix values should
46 // uniformly cover the 32-bit space, so the final size can be estimated given a
47 // subset of the input.
49 // |kEstimateThreshold| is when estimates start converging. Results are strong
50 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of
51 // resizing capacity from >50% to 100%.
53 // TODO(shess): I'm sure there is math in the world to describe good settings
54 // for estimating the size of a uniformly-distributed set of integers from a
55 // sorted subset. I do not have such math in me, so I assumed that my current
56 // organic database of prefixes was scale-free, and wrote a script to see how
57 // often given slop values would always suffice for given strides. At 1<<30,
58 // .5% slop was sufficient to cover all cases (though the code below uses 1%).
60 // TODO(shess): A smaller threshold uses less transient space in reallocation.
61 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost
62 // is that a smaller threshold needs more slop (locked down for the long term).
63 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.
64 const SBPrefix kEstimateThreshold = 1 << 30;
65 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {
66 // estimated_count / current_count == estimated_max / current_prefix
67 // For large input sets, estimated_max of 2^32 is close enough.
68 const size_t estimated_prefix_count = static_cast<size_t>(
69 (static_cast<uint64>(current_count) << 32) / current_prefix);
71 // The estimate has an error bar, if the final total is below the estimate, no
72 // harm, but if it is above a capacity resize will happen at nearly 100%. Add
73 // some slop to make sure all cases are covered.
74 return estimated_prefix_count + estimated_prefix_count / 100;
77 } // namespace
79 namespace safe_browsing {
81 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
82 // static
83 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {
84 return a.first < b.first;
87 PrefixSet::PrefixSet() {
90 PrefixSet::PrefixSet(IndexVector* index,
91 std::vector<uint16>* deltas,
92 std::vector<SBFullHash>* full_hashes) {
93 DCHECK(index && deltas && full_hashes);
94 index_.swap(*index);
95 deltas_.swap(*deltas);
96 full_hashes_.swap(*full_hashes);
99 PrefixSet::~PrefixSet() {}
101 bool PrefixSet::PrefixExists(SBPrefix prefix) const {
102 if (index_.empty())
103 return false;
105 // Find the first position after |prefix| in |index_|.
106 IndexVector::const_iterator iter =
107 std::upper_bound(index_.begin(), index_.end(),
108 IndexPair(prefix, 0), PrefixLess);
110 // |prefix| comes before anything that's in the set.
111 if (iter == index_.begin())
112 return false;
114 // Capture the upper bound of our target entry's deltas.
115 const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
117 // Back up to the entry our target is in.
118 --iter;
120 // All prefixes in |index_| are in the set.
121 SBPrefix current = iter->first;
122 if (current == prefix)
123 return true;
125 // Scan forward accumulating deltas while a match is possible.
126 for (size_t di = iter->second; di < bound && current < prefix; ++di) {
127 current += deltas_[di];
130 return current == prefix;
133 bool PrefixSet::Exists(const SBFullHash& hash) const {
134 if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),
135 hash, SBFullHashLess)) {
136 return true;
138 return PrefixExists(hash.prefix);
141 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
142 prefixes->reserve(index_.size() + deltas_.size());
144 for (size_t ii = 0; ii < index_.size(); ++ii) {
145 // The deltas for this |index_| entry run to the next index entry,
146 // or the end of the deltas.
147 const size_t deltas_end =
148 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
150 SBPrefix current = index_[ii].first;
151 prefixes->push_back(current);
152 for (size_t di = index_[ii].second; di < deltas_end; ++di) {
153 current += deltas_[di];
154 prefixes->push_back(current);
159 // static
160 scoped_ptr<const PrefixSet> PrefixSet::LoadFile(
161 const base::FilePath& filter_name) {
162 int64 size_64;
163 if (!base::GetFileSize(filter_name, &size_64))
164 return nullptr;
165 using base::MD5Digest;
166 if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
167 return nullptr;
169 base::ScopedFILE file(base::OpenFile(filter_name, "rb"));
170 if (!file.get())
171 return nullptr;
173 FileHeader header;
174 size_t read = fread(&header, sizeof(header), 1, file.get());
175 if (read != 1)
176 return nullptr;
178 // The file looks valid, start building the digest.
179 base::MD5Context context;
180 base::MD5Init(&context);
181 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
182 sizeof(header)));
184 if (header.magic != kMagic)
185 return nullptr;
187 // Track version read to inform removal of support for older versions.
188 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);
190 if (header.version <= kDeprecatedVersion) {
191 return nullptr;
192 } else if (header.version != kVersion) {
193 return nullptr;
196 IndexVector index;
197 const size_t index_bytes = sizeof(index[0]) * header.index_size;
199 std::vector<uint16> deltas;
200 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
202 std::vector<SBFullHash> full_hashes;
203 const size_t full_hashes_bytes =
204 sizeof(full_hashes[0]) * header.full_hashes_size;
206 // Check for bogus sizes before allocating any space.
207 const size_t expected_bytes = sizeof(header) +
208 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);
209 if (static_cast<int64>(expected_bytes) != size_64)
210 return nullptr;
212 // Read the index vector. Herb Sutter indicates that vectors are
213 // guaranteed to be contiuguous, so reading to where element 0 lives
214 // is valid.
215 if (header.index_size) {
216 index.resize(header.index_size);
217 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
218 if (read != index.size())
219 return nullptr;
220 base::MD5Update(&context,
221 base::StringPiece(reinterpret_cast<char*>(&(index[0])),
222 index_bytes));
225 // Read vector of deltas.
226 if (header.deltas_size) {
227 deltas.resize(header.deltas_size);
228 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
229 if (read != deltas.size())
230 return nullptr;
231 base::MD5Update(&context,
232 base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
233 deltas_bytes));
236 // Read vector of full hashes.
237 if (header.full_hashes_size) {
238 full_hashes.resize(header.full_hashes_size);
239 read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),
240 file.get());
241 if (read != full_hashes.size())
242 return nullptr;
243 base::MD5Update(&context,
244 base::StringPiece(
245 reinterpret_cast<char*>(&(full_hashes[0])),
246 full_hashes_bytes));
249 base::MD5Digest calculated_digest;
250 base::MD5Final(&calculated_digest, &context);
252 base::MD5Digest file_digest;
253 read = fread(&file_digest, sizeof(file_digest), 1, file.get());
254 if (read != 1)
255 return nullptr;
257 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
258 return nullptr;
260 // Steals vector contents using swap().
261 return make_scoped_ptr(
262 new PrefixSet(&index, &deltas, &full_hashes));
265 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
266 FileHeader header;
267 header.magic = kMagic;
268 header.version = kVersion;
269 header.index_size = static_cast<uint32>(index_.size());
270 header.deltas_size = static_cast<uint32>(deltas_.size());
271 header.full_hashes_size = static_cast<uint32>(full_hashes_.size());
273 // Sanity check that the 32-bit values never mess things up.
274 if (static_cast<size_t>(header.index_size) != index_.size() ||
275 static_cast<size_t>(header.deltas_size) != deltas_.size() ||
276 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {
277 NOTREACHED();
278 return false;
281 base::ScopedFILE file(base::OpenFile(filter_name, "wb"));
282 if (!file.get())
283 return false;
285 base::MD5Context context;
286 base::MD5Init(&context);
288 // TODO(shess): The I/O code in safe_browsing_store_file.cc would
289 // sure be useful about now.
290 size_t written = fwrite(&header, sizeof(header), 1, file.get());
291 if (written != 1)
292 return false;
293 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
294 sizeof(header)));
296 // As for reads, the standard guarantees the ability to access the
297 // contents of the vector by a pointer to an element.
298 if (index_.size()) {
299 const size_t index_bytes = sizeof(index_[0]) * index_.size();
300 written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
301 file.get());
302 if (written != index_.size())
303 return false;
304 base::MD5Update(&context,
305 base::StringPiece(
306 reinterpret_cast<const char*>(&(index_[0])),
307 index_bytes));
310 if (deltas_.size()) {
311 const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
312 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
313 file.get());
314 if (written != deltas_.size())
315 return false;
316 base::MD5Update(&context,
317 base::StringPiece(
318 reinterpret_cast<const char*>(&(deltas_[0])),
319 deltas_bytes));
322 if (full_hashes_.size()) {
323 const size_t elt_size = sizeof(full_hashes_[0]);
324 const size_t elts = full_hashes_.size();
325 const size_t full_hashes_bytes = elt_size * elts;
326 written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());
327 if (written != elts)
328 return false;
329 base::MD5Update(&context,
330 base::StringPiece(
331 reinterpret_cast<const char*>(&(full_hashes_[0])),
332 full_hashes_bytes));
335 base::MD5Digest digest;
336 base::MD5Final(&digest, &context);
337 written = fwrite(&digest, sizeof(digest), 1, file.get());
338 if (written != 1)
339 return false;
341 // TODO(shess): Can this code check that the close was successful?
342 file.reset();
344 return true;
347 void PrefixSet::AddRun(SBPrefix index_prefix,
348 const uint16* run_begin, const uint16* run_end) {
349 // Preempt organic capacity decisions for |delta_| once strong estimates can
350 // be made.
351 if (index_prefix > kEstimateThreshold &&
352 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {
353 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));
356 index_.push_back(std::make_pair(index_prefix, deltas_.size()));
357 deltas_.insert(deltas_.end(), run_begin, run_end);
360 PrefixSetBuilder::PrefixSetBuilder()
361 : prefix_set_(new PrefixSet()) {
364 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)
365 : prefix_set_(new PrefixSet()) {
366 for (size_t i = 0; i < prefixes.size(); ++i) {
367 AddPrefix(prefixes[i]);
371 PrefixSetBuilder::~PrefixSetBuilder() {
374 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSet(
375 const std::vector<SBFullHash>& hashes) {
376 DCHECK(prefix_set_.get());
378 // Flush runs until buffered data is gone.
379 while (!buffer_.empty()) {
380 EmitRun();
383 // Precisely size |index_| for read-only. It's 50k-60k, so minor savings, but
384 // they're almost free.
385 PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);
387 prefix_set_->full_hashes_ = hashes;
388 std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),
389 SBFullHashLess);
391 return prefix_set_.Pass();
394 scoped_ptr<const PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {
395 return GetPrefixSet(std::vector<SBFullHash>()).Pass();
398 void PrefixSetBuilder::EmitRun() {
399 DCHECK(prefix_set_.get());
401 SBPrefix prev_prefix = buffer_[0];
402 uint16 run[PrefixSet::kMaxRun];
403 size_t run_pos = 0;
405 size_t i;
406 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {
407 // Calculate the delta. |unsigned| is mandatory, because the
408 // sorted_prefixes could be more than INT_MAX apart.
409 DCHECK_GT(buffer_[i], prev_prefix);
410 const unsigned delta = buffer_[i] - prev_prefix;
411 const uint16 delta16 = static_cast<uint16>(delta);
413 // Break the run if the delta doesn't fit.
414 if (delta != static_cast<unsigned>(delta16))
415 break;
417 // Continue the run of deltas.
418 run[run_pos++] = delta16;
419 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);
421 prev_prefix = buffer_[i];
423 prefix_set_->AddRun(buffer_[0], run, run + run_pos);
424 buffer_.erase(buffer_.begin(), buffer_.begin() + i);
427 void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {
428 DCHECK(prefix_set_.get());
430 if (buffer_.empty()) {
431 DCHECK(prefix_set_->index_.empty());
432 DCHECK(prefix_set_->deltas_.empty());
433 } else {
434 // Drop duplicates.
435 if (buffer_.back() == prefix)
436 return;
438 DCHECK_LT(buffer_.back(), prefix);
440 buffer_.push_back(prefix);
442 // Flush buffer when a run can be constructed. +1 for the index item, and +1
443 // to leave at least one item in the buffer for dropping duplicates.
444 if (buffer_.size() > PrefixSet::kMaxRun + 2)
445 EmitRun();
448 } // namespace safe_browsing