Cast: Stop logging kVideoFrameSentToEncoder and rename a couple events.
[chromium-blink-merge.git] / chrome / browser / safe_browsing / prefix_set.cc
blob21335e156373d309a4e2a3c4be72ce7c3ee36cc2
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/prefix_set.h"
7 #include <algorithm>
8 #include <math.h>
10 #include "base/file_util.h"
11 #include "base/files/scoped_file.h"
12 #include "base/logging.h"
13 #include "base/md5.h"
14 #include "base/metrics/histogram.h"
15 #include "base/metrics/sparse_histogram.h"
17 namespace {
19 // |kMagic| should be reasonably unique, and not match itself across
20 // endianness changes. I generated this value with:
21 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
22 static uint32 kMagic = 0x864088dd;
24 // Version history:
25 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
26 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
27 // Version 3: ????????/r?????? by shess@chromium.org on 2014-04-??
29 // Version 2 layout is identical to version 1. The sort order of |index_|
30 // changed from |int32| to |uint32| to match the change of |SBPrefix|.
31 // Version 3 adds storage for full hashes.
32 static uint32 kVersion = 0x3;
34 typedef struct {
35 uint32 magic;
36 uint32 version;
37 uint32 index_size;
38 uint32 deltas_size;
39 } FileHeader_v2;
41 typedef struct {
42 uint32 magic;
43 uint32 version;
44 uint32 index_size;
45 uint32 deltas_size;
46 uint32 full_hashes_size;
47 } FileHeader;
49 // Common std::vector<> implementations add capacity by multiplying from the
50 // current size (usually either by 2 or 1.5) to satisfy push_back() running in
51 // amortized constant time. This is not necessary for insert() at end(), but
52 // AFAICT it seems true for some implementations. SBPrefix values should
53 // uniformly cover the 32-bit space, so the final size can be estimated given a
54 // subset of the input.
56 // |kEstimateThreshold| is when estimates start converging. Results are strong
57 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of
58 // resizing capacity from >50% to 100%.
60 // TODO(shess): I'm sure there is math in the world to describe good settings
61 // for estimating the size of a uniformly-distributed set of integers from a
62 // sorted subset. I do not have such math in me, so I assumed that my current
63 // organic database of prefixes was scale-free, and wrote a script to see how
64 // often given slop values would always suffice for given strides. At 1<<30,
65 // .5% slop was sufficient to cover all cases (though the code below uses 1%).
67 // TODO(shess): A smaller threshold uses less transient space in reallocation.
68 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost
69 // is that a smaller threshold needs more slop (locked down for the long term).
70 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.
71 const SBPrefix kEstimateThreshold = 1 << 30;
72 size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {
73 // estimated_count / current_count == estimated_max / current_prefix
74 // For large input sets, estimated_max of 2^32 is close enough.
75 const size_t estimated_prefix_count = static_cast<size_t>(
76 (static_cast<uint64>(current_count) << 32) / current_prefix);
78 // The estimate has an error bar, if the final total is below the estimate, no
79 // harm, but if it is above a capacity resize will happen at nearly 100%. Add
80 // some slop to make sure all cases are covered.
81 return estimated_prefix_count + estimated_prefix_count / 100;
84 } // namespace
86 namespace safe_browsing {
88 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
89 // static
90 bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {
91 return a.first < b.first;
94 PrefixSet::PrefixSet() {
97 PrefixSet::PrefixSet(IndexVector* index,
98 std::vector<uint16>* deltas,
99 std::vector<SBFullHash>* full_hashes) {
100 DCHECK(index && deltas && full_hashes);
101 index_.swap(*index);
102 deltas_.swap(*deltas);
103 full_hashes_.swap(*full_hashes);
106 PrefixSet::~PrefixSet() {}
108 bool PrefixSet::PrefixExists(SBPrefix prefix) const {
109 if (index_.empty())
110 return false;
112 // Find the first position after |prefix| in |index_|.
113 IndexVector::const_iterator iter =
114 std::upper_bound(index_.begin(), index_.end(),
115 IndexPair(prefix, 0), PrefixLess);
117 // |prefix| comes before anything that's in the set.
118 if (iter == index_.begin())
119 return false;
121 // Capture the upper bound of our target entry's deltas.
122 const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);
124 // Back up to the entry our target is in.
125 --iter;
127 // All prefixes in |index_| are in the set.
128 SBPrefix current = iter->first;
129 if (current == prefix)
130 return true;
132 // Scan forward accumulating deltas while a match is possible.
133 for (size_t di = iter->second; di < bound && current < prefix; ++di) {
134 current += deltas_[di];
137 return current == prefix;
140 bool PrefixSet::Exists(const SBFullHash& hash) const {
141 if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),
142 hash, SBFullHashLess)) {
143 return true;
145 return PrefixExists(hash.prefix);
148 void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
149 prefixes->reserve(index_.size() + deltas_.size());
151 for (size_t ii = 0; ii < index_.size(); ++ii) {
152 // The deltas for this |index_| entry run to the next index entry,
153 // or the end of the deltas.
154 const size_t deltas_end =
155 (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();
157 SBPrefix current = index_[ii].first;
158 prefixes->push_back(current);
159 for (size_t di = index_[ii].second; di < deltas_end; ++di) {
160 current += deltas_[di];
161 prefixes->push_back(current);
166 // static
167 scoped_ptr<PrefixSet> PrefixSet::LoadFile(const base::FilePath& filter_name) {
168 int64 size_64;
169 if (!base::GetFileSize(filter_name, &size_64))
170 return scoped_ptr<PrefixSet>();
171 using base::MD5Digest;
172 // TODO(shess): Revert to sizeof(FileHeader) for sanity check once v2 is
173 // deprecated.
174 if (size_64 < static_cast<int64>(sizeof(FileHeader_v2) + sizeof(MD5Digest)))
175 return scoped_ptr<PrefixSet>();
177 base::ScopedFILE file(base::OpenFile(filter_name, "rb"));
178 if (!file.get())
179 return scoped_ptr<PrefixSet>();
181 FileHeader header;
182 size_t read = fread(&header, sizeof(header), 1, file.get());
183 if (read != 1)
184 return scoped_ptr<PrefixSet>();
186 // The file looks valid, start building the digest.
187 base::MD5Context context;
188 base::MD5Init(&context);
189 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
190 sizeof(header)));
192 if (header.magic != kMagic)
193 return scoped_ptr<PrefixSet>();
195 // Track version read to inform removal of support for older versions.
196 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);
198 // TODO(shess): Version 1 and 2 use the same file structure, with version 1
199 // data using a signed sort. For M-35, the data is re-sorted before return.
200 // After M-36, just drop v1 support. <http://crbug.com/346405>
201 // TODO(shess): <http://crbug.com/368044> for removing v2 support.
202 size_t header_size = sizeof(header);
203 if (header.version == 2 || header.version == 1) {
204 // Rewind the file and restart building the digest with the old header
205 // structure.
206 FileHeader_v2 v2_header;
207 if (0 != fseek(file.get(), 0, SEEK_SET))
208 return scoped_ptr<PrefixSet>();
210 size_t read = fread(&v2_header, sizeof(v2_header), 1, file.get());
211 if (read != 1)
212 return scoped_ptr<PrefixSet>();
214 base::MD5Init(&context);
215 base::MD5Update(&context,
216 base::StringPiece(reinterpret_cast<char*>(&v2_header),
217 sizeof(v2_header)));
219 // The current header is a superset of the old header, fill it in with the
220 // information read.
221 header.magic = v2_header.magic;
222 header.version = v2_header.version;
223 header.index_size = v2_header.index_size;
224 header.deltas_size = v2_header.deltas_size;
225 header.full_hashes_size = 0;
226 header_size = sizeof(v2_header);
227 } else if (header.version != kVersion) {
228 return scoped_ptr<PrefixSet>();
231 IndexVector index;
232 const size_t index_bytes = sizeof(index[0]) * header.index_size;
234 std::vector<uint16> deltas;
235 const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;
237 std::vector<SBFullHash> full_hashes;
238 const size_t full_hashes_bytes =
239 sizeof(full_hashes[0]) * header.full_hashes_size;
241 // Check for bogus sizes before allocating any space.
242 const size_t expected_bytes = header_size +
243 index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);
244 if (static_cast<int64>(expected_bytes) != size_64)
245 return scoped_ptr<PrefixSet>();
247 // Read the index vector. Herb Sutter indicates that vectors are
248 // guaranteed to be contiuguous, so reading to where element 0 lives
249 // is valid.
250 if (header.index_size) {
251 index.resize(header.index_size);
252 read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
253 if (read != index.size())
254 return scoped_ptr<PrefixSet>();
255 base::MD5Update(&context,
256 base::StringPiece(reinterpret_cast<char*>(&(index[0])),
257 index_bytes));
260 // Read vector of deltas.
261 if (header.deltas_size) {
262 deltas.resize(header.deltas_size);
263 read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
264 if (read != deltas.size())
265 return scoped_ptr<PrefixSet>();
266 base::MD5Update(&context,
267 base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
268 deltas_bytes));
271 // Read vector of full hashes.
272 if (header.full_hashes_size) {
273 full_hashes.resize(header.full_hashes_size);
274 read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),
275 file.get());
276 if (read != full_hashes.size())
277 return scoped_ptr<PrefixSet>();
278 base::MD5Update(&context,
279 base::StringPiece(
280 reinterpret_cast<char*>(&(full_hashes[0])),
281 full_hashes_bytes));
284 base::MD5Digest calculated_digest;
285 base::MD5Final(&calculated_digest, &context);
287 base::MD5Digest file_digest;
288 read = fread(&file_digest, sizeof(file_digest), 1, file.get());
289 if (read != 1)
290 return scoped_ptr<PrefixSet>();
292 if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
293 return scoped_ptr<PrefixSet>();
295 // For version 1, fetch the prefixes and re-sort.
296 if (header.version == 1) {
297 std::vector<SBPrefix> prefixes;
298 PrefixSet(&index, &deltas, &full_hashes).GetPrefixes(&prefixes);
299 std::sort(prefixes.begin(), prefixes.end());
301 // v1 cannot have full hashes, so no need to propagate a copy here.
302 return PrefixSetBuilder(prefixes).GetPrefixSetNoHashes().Pass();
305 // Steals vector contents using swap().
306 return scoped_ptr<PrefixSet>(new PrefixSet(&index, &deltas, &full_hashes));
309 bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
310 FileHeader header;
311 header.magic = kMagic;
312 header.version = kVersion;
313 header.index_size = static_cast<uint32>(index_.size());
314 header.deltas_size = static_cast<uint32>(deltas_.size());
315 header.full_hashes_size = static_cast<uint32>(full_hashes_.size());
317 // Sanity check that the 32-bit values never mess things up.
318 if (static_cast<size_t>(header.index_size) != index_.size() ||
319 static_cast<size_t>(header.deltas_size) != deltas_.size() ||
320 static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {
321 NOTREACHED();
322 return false;
325 base::ScopedFILE file(base::OpenFile(filter_name, "wb"));
326 if (!file.get())
327 return false;
329 base::MD5Context context;
330 base::MD5Init(&context);
332 // TODO(shess): The I/O code in safe_browsing_store_file.cc would
333 // sure be useful about now.
334 size_t written = fwrite(&header, sizeof(header), 1, file.get());
335 if (written != 1)
336 return false;
337 base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
338 sizeof(header)));
340 // As for reads, the standard guarantees the ability to access the
341 // contents of the vector by a pointer to an element.
342 if (index_.size()) {
343 const size_t index_bytes = sizeof(index_[0]) * index_.size();
344 written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
345 file.get());
346 if (written != index_.size())
347 return false;
348 base::MD5Update(&context,
349 base::StringPiece(
350 reinterpret_cast<const char*>(&(index_[0])),
351 index_bytes));
354 if (deltas_.size()) {
355 const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
356 written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
357 file.get());
358 if (written != deltas_.size())
359 return false;
360 base::MD5Update(&context,
361 base::StringPiece(
362 reinterpret_cast<const char*>(&(deltas_[0])),
363 deltas_bytes));
366 if (full_hashes_.size()) {
367 const size_t elt_size = sizeof(full_hashes_[0]);
368 const size_t elts = full_hashes_.size();
369 const size_t full_hashes_bytes = elt_size * elts;
370 written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());
371 if (written != elts)
372 return false;
373 base::MD5Update(&context,
374 base::StringPiece(
375 reinterpret_cast<const char*>(&(full_hashes_[0])),
376 full_hashes_bytes));
379 base::MD5Digest digest;
380 base::MD5Final(&digest, &context);
381 written = fwrite(&digest, sizeof(digest), 1, file.get());
382 if (written != 1)
383 return false;
385 // TODO(shess): Can this code check that the close was successful?
386 file.reset();
388 return true;
391 void PrefixSet::AddRun(SBPrefix index_prefix,
392 const uint16* run_begin, const uint16* run_end) {
393 // Preempt organic capacity decisions for |delta_| once strong estimates can
394 // be made.
395 if (index_prefix > kEstimateThreshold &&
396 deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {
397 deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));
400 index_.push_back(std::make_pair(index_prefix, deltas_.size()));
401 deltas_.insert(deltas_.end(), run_begin, run_end);
404 PrefixSetBuilder::PrefixSetBuilder()
405 : prefix_set_(new PrefixSet()) {
408 PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)
409 : prefix_set_(new PrefixSet()) {
410 for (size_t i = 0; i < prefixes.size(); ++i) {
411 AddPrefix(prefixes[i]);
415 PrefixSetBuilder::~PrefixSetBuilder() {
418 scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSet(
419 const std::vector<SBFullHash>& hashes) {
420 DCHECK(prefix_set_.get());
422 // Flush runs until buffered data is gone.
423 while (!buffer_.empty()) {
424 EmitRun();
427 // Precisely size |index_| for read-only. It's 50k-60k, so minor savings, but
428 // they're almost free.
429 PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);
431 prefix_set_->full_hashes_ = hashes;
432 std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),
433 SBFullHashLess);
435 return prefix_set_.Pass();
438 scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {
439 return GetPrefixSet(std::vector<SBFullHash>()).Pass();
442 void PrefixSetBuilder::EmitRun() {
443 DCHECK(prefix_set_.get());
445 SBPrefix prev_prefix = buffer_[0];
446 uint16 run[PrefixSet::kMaxRun];
447 size_t run_pos = 0;
449 size_t i;
450 for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {
451 // Calculate the delta. |unsigned| is mandatory, because the
452 // sorted_prefixes could be more than INT_MAX apart.
453 DCHECK_GT(buffer_[i], prev_prefix);
454 const unsigned delta = buffer_[i] - prev_prefix;
455 const uint16 delta16 = static_cast<uint16>(delta);
457 // Break the run if the delta doesn't fit.
458 if (delta != static_cast<unsigned>(delta16))
459 break;
461 // Continue the run of deltas.
462 run[run_pos++] = delta16;
463 DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);
465 prev_prefix = buffer_[i];
467 prefix_set_->AddRun(buffer_[0], run, run + run_pos);
468 buffer_.erase(buffer_.begin(), buffer_.begin() + i);
471 void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {
472 DCHECK(prefix_set_.get());
474 if (buffer_.empty()) {
475 DCHECK(prefix_set_->index_.empty());
476 DCHECK(prefix_set_->deltas_.empty());
477 } else {
478 // Drop duplicates.
479 if (buffer_.back() == prefix)
480 return;
482 DCHECK_LT(buffer_.back(), prefix);
484 buffer_.push_back(prefix);
486 // Flush buffer when a run can be constructed. +1 for the index item, and +1
487 // to leave at least one item in the buffer for dropping duplicates.
488 if (buffer_.size() > PrefixSet::kMaxRun + 2)
489 EmitRun();
492 } // namespace safe_browsing