// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/safe_browsing/prefix_set.h"

#include <algorithm>

#include "base/files/file_util.h"
#include "base/files/scoped_file.h"
#include "base/logging.h"
#include "base/md5.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sparse_histogram.h"

namespace {

// |kMagic| should be reasonably unique, and not match itself across
// endianness changes. I generated this value with:
// md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
static uint32 kMagic = 0x864088dd;

// Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
// Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
// Version 3: dd07faf5/r268145 by shess@chromium.org on 2014-05-05
//
// Version 2 layout is identical to version 1. The sort order of |index_|
// changed from |int32| to |uint32| to match the change of |SBPrefix|.
// Version 3 adds storage for full hashes.
static uint32 kVersion = 3;
static uint32 kDeprecatedVersion = 2;  // And lower.

typedef struct {
  uint32 magic;
  uint32 version;
  uint32 index_size;
  uint32 deltas_size;
  uint32 full_hashes_size;
} FileHeader;
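
// On-disk layout, as written by WriteFile() and validated by LoadFile()
// below: a FileHeader, then |index_size| index pairs, |deltas_size| uint16
// deltas, |full_hashes_size| SBFullHash values, and finally an MD5 digest of
// everything that precedes it.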

// Common std::vector<> implementations add capacity by multiplying from the
// current size (usually either by 2 or 1.5) to satisfy push_back() running in
// amortized constant time. This is not necessary for insert() at end(), but
// AFAICT it seems true for some implementations. SBPrefix values should
// uniformly cover the 32-bit space, so the final size can be estimated given a
// subset of the input.
//
// |kEstimateThreshold| is when estimates start converging. Results are strong
// starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of
// resizing capacity from >50% to 100%.
//
// TODO(shess): I'm sure there is math in the world to describe good settings
// for estimating the size of a uniformly-distributed set of integers from a
// sorted subset. I do not have such math in me, so I assumed that my current
// organic database of prefixes was scale-free, and wrote a script to see how
// often given slop values would always suffice for given strides. At 1<<30,
// .5% slop was sufficient to cover all cases (though the code below uses 1%).
//
// TODO(shess): A smaller threshold uses less transient space in reallocation.
// 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost
// is that a smaller threshold needs more slop (locked down for the long term).
// 1<<29 worked well with 1%, 1<<27 worked well with 2%.
const SBPrefix kEstimateThreshold = 1 << 30;
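// Worked example: EstimateFinalCount(1 << 31, 500) projects
// (500 << 32) / (1 << 31) == 1000 final entries, and the 1% slop below makes
// it return 1010.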
size_t EstimateFinalCount(SBPrefix current_prefix, size_t current_count) {
  // estimated_count / current_count == estimated_max / current_prefix
  // For large input sets, estimated_max of 2^32 is close enough.
  const size_t estimated_prefix_count = static_cast<size_t>(
      (static_cast<uint64>(current_count) << 32) / current_prefix);

  // The estimate has an error bar; if the final total is below the estimate,
  // no harm, but if it is above, a capacity resize will happen at nearly 100%.
  // Add some slop to make sure all cases are covered.
  return estimated_prefix_count + estimated_prefix_count / 100;
}

}  // namespace

namespace safe_browsing {

// For |std::upper_bound()| to find a prefix w/in a vector of pairs.
// static
bool PrefixSet::PrefixLess(const IndexPair& a, const IndexPair& b) {
  return a.first < b.first;
}

PrefixSet::PrefixSet() {
}

PrefixSet::PrefixSet(IndexVector* index,
                     std::vector<uint16>* deltas,
                     std::vector<SBFullHash>* full_hashes) {
  DCHECK(index && deltas && full_hashes);
  index_.swap(*index);
  deltas_.swap(*deltas);
  full_hashes_.swap(*full_hashes);
}

PrefixSet::~PrefixSet() {}
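
// Lookup notes: |index_| holds (prefix, offset-into-|deltas_|) pairs sorted
// by prefix, and |deltas_| holds 16-bit gaps to the prefixes that follow each
// index entry. A prefix is in the set if it is an index entry itself, or if
// it is reached while accumulating that entry's run of deltas.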
bool PrefixSet::PrefixExists(SBPrefix prefix) const {
  if (index_.empty())
    return false;

  // Find the first position after |prefix| in |index_|.
  IndexVector::const_iterator iter =
      std::upper_bound(index_.begin(), index_.end(),
                       IndexPair(prefix, 0), PrefixLess);

  // |prefix| comes before anything that's in the set.
  if (iter == index_.begin())
    return false;

  // Capture the upper bound of our target entry's deltas.
  const size_t bound = (iter == index_.end() ? deltas_.size() : iter->second);

  // Back up to the entry our target is in.
  --iter;

  // All prefixes in |index_| are in the set.
  SBPrefix current = iter->first;
  if (current == prefix)
    return true;

  // Scan forward accumulating deltas while a match is possible.
  for (size_t di = iter->second; di < bound && current < prefix; ++di) {
    current += deltas_[di];
  }

  return current == prefix;
}

bool PrefixSet::Exists(const SBFullHash& hash) const {
  if (std::binary_search(full_hashes_.begin(), full_hashes_.end(),
                         hash, SBFullHashLess)) {
    return true;
  }
  return PrefixExists(hash.prefix);
}

void PrefixSet::GetPrefixes(std::vector<SBPrefix>* prefixes) const {
  prefixes->reserve(index_.size() + deltas_.size());

  for (size_t ii = 0; ii < index_.size(); ++ii) {
    // The deltas for this |index_| entry run to the next index entry,
    // or the end of the deltas.
    const size_t deltas_end =
        (ii + 1 < index_.size()) ? index_[ii + 1].second : deltas_.size();

    SBPrefix current = index_[ii].first;
    prefixes->push_back(current);
    for (size_t di = index_[ii].second; di < deltas_end; ++di) {
      current += deltas_[di];
      prefixes->push_back(current);
    }
  }
}

// static
scoped_ptr<PrefixSet> PrefixSet::LoadFile(const base::FilePath& filter_name) {
  int64 size_64;
  if (!base::GetFileSize(filter_name, &size_64))
    return scoped_ptr<PrefixSet>();
  using base::MD5Digest;
  if (size_64 < static_cast<int64>(sizeof(FileHeader) + sizeof(MD5Digest)))
    return scoped_ptr<PrefixSet>();

  base::ScopedFILE file(base::OpenFile(filter_name, "rb"));
  if (!file.get())
    return scoped_ptr<PrefixSet>();

  FileHeader header;
  size_t read = fread(&header, sizeof(header), 1, file.get());
  if (read != 1)
    return scoped_ptr<PrefixSet>();

  // The file looks valid, start building the digest.
  base::MD5Context context;
  base::MD5Init(&context);
  base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
                                              sizeof(header)));

  if (header.magic != kMagic)
    return scoped_ptr<PrefixSet>();

  // Track version read to inform removal of support for older versions.
  UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header.version);

  if (header.version <= kDeprecatedVersion) {
    return scoped_ptr<PrefixSet>();
  } else if (header.version != kVersion) {
    return scoped_ptr<PrefixSet>();
  }

  IndexVector index;
  const size_t index_bytes = sizeof(index[0]) * header.index_size;

  std::vector<uint16> deltas;
  const size_t deltas_bytes = sizeof(deltas[0]) * header.deltas_size;

  std::vector<SBFullHash> full_hashes;
  const size_t full_hashes_bytes =
      sizeof(full_hashes[0]) * header.full_hashes_size;

  // Check for bogus sizes before allocating any space.
  const size_t expected_bytes = sizeof(header) +
      index_bytes + deltas_bytes + full_hashes_bytes + sizeof(MD5Digest);
  if (static_cast<int64>(expected_bytes) != size_64)
    return scoped_ptr<PrefixSet>();

  // Read the index vector. Herb Sutter indicates that vectors are
  // guaranteed to be contiguous, so reading to where element 0 lives is safe.
  if (header.index_size) {
    index.resize(header.index_size);
    read = fread(&(index[0]), sizeof(index[0]), index.size(), file.get());
    if (read != index.size())
      return scoped_ptr<PrefixSet>();
    base::MD5Update(&context,
                    base::StringPiece(reinterpret_cast<char*>(&(index[0])),
                                      index_bytes));
  }

  // Read vector of deltas.
  if (header.deltas_size) {
    deltas.resize(header.deltas_size);
    read = fread(&(deltas[0]), sizeof(deltas[0]), deltas.size(), file.get());
    if (read != deltas.size())
      return scoped_ptr<PrefixSet>();
    base::MD5Update(&context,
                    base::StringPiece(reinterpret_cast<char*>(&(deltas[0])),
                                      deltas_bytes));
  }

  // Read vector of full hashes.
  if (header.full_hashes_size) {
    full_hashes.resize(header.full_hashes_size);
    read = fread(&(full_hashes[0]), sizeof(full_hashes[0]), full_hashes.size(),
                 file.get());
    if (read != full_hashes.size())
      return scoped_ptr<PrefixSet>();
    base::MD5Update(&context,
                    base::StringPiece(
                        reinterpret_cast<char*>(&(full_hashes[0])),
                        full_hashes_bytes));
  }

  base::MD5Digest calculated_digest;
  base::MD5Final(&calculated_digest, &context);

  base::MD5Digest file_digest;
  read = fread(&file_digest, sizeof(file_digest), 1, file.get());
  if (read != 1)
    return scoped_ptr<PrefixSet>();

  if (0 != memcmp(&file_digest, &calculated_digest, sizeof(file_digest)))
    return scoped_ptr<PrefixSet>();

  // Steals vector contents using swap().
  return scoped_ptr<PrefixSet>(new PrefixSet(&index, &deltas, &full_hashes));
}

bool PrefixSet::WriteFile(const base::FilePath& filter_name) const {
  FileHeader header;
  header.magic = kMagic;
  header.version = kVersion;
  header.index_size = static_cast<uint32>(index_.size());
  header.deltas_size = static_cast<uint32>(deltas_.size());
  header.full_hashes_size = static_cast<uint32>(full_hashes_.size());

  // Sanity check that the 32-bit values never mess things up.
  if (static_cast<size_t>(header.index_size) != index_.size() ||
      static_cast<size_t>(header.deltas_size) != deltas_.size() ||
      static_cast<size_t>(header.full_hashes_size) != full_hashes_.size()) {
    NOTREACHED();
    return false;
  }

  base::ScopedFILE file(base::OpenFile(filter_name, "wb"));
  if (!file.get())
    return false;

  base::MD5Context context;
  base::MD5Init(&context);

  // TODO(shess): The I/O code in safe_browsing_store_file.cc would
  // sure be useful about now.
  size_t written = fwrite(&header, sizeof(header), 1, file.get());
  if (written != 1)
    return false;
  base::MD5Update(&context, base::StringPiece(reinterpret_cast<char*>(&header),
                                              sizeof(header)));

  // As for reads, the standard guarantees the ability to access the
  // contents of the vector by a pointer to an element.
  if (index_.size()) {
    const size_t index_bytes = sizeof(index_[0]) * index_.size();
    written = fwrite(&(index_[0]), sizeof(index_[0]), index_.size(),
                     file.get());
    if (written != index_.size())
      return false;
    base::MD5Update(&context,
                    base::StringPiece(
                        reinterpret_cast<const char*>(&(index_[0])),
                        index_bytes));
  }

  if (deltas_.size()) {
    const size_t deltas_bytes = sizeof(deltas_[0]) * deltas_.size();
    written = fwrite(&(deltas_[0]), sizeof(deltas_[0]), deltas_.size(),
                     file.get());
    if (written != deltas_.size())
      return false;
    base::MD5Update(&context,
                    base::StringPiece(
                        reinterpret_cast<const char*>(&(deltas_[0])),
                        deltas_bytes));
  }

  if (full_hashes_.size()) {
    const size_t elt_size = sizeof(full_hashes_[0]);
    const size_t elts = full_hashes_.size();
    const size_t full_hashes_bytes = elt_size * elts;
    written = fwrite(&(full_hashes_[0]), elt_size, elts, file.get());
    if (written != elts)
      return false;
    base::MD5Update(&context,
                    base::StringPiece(
                        reinterpret_cast<const char*>(&(full_hashes_[0])),
                        full_hashes_bytes));
  }

  base::MD5Digest digest;
  base::MD5Final(&digest, &context);
  written = fwrite(&digest, sizeof(digest), 1, file.get());
  if (written != 1)
    return false;

  // TODO(shess): Can this code check that the close was successful?
  return true;
}

void PrefixSet::AddRun(SBPrefix index_prefix,
                       const uint16* run_begin, const uint16* run_end) {
  // Preempt organic capacity decisions for |deltas_| once strong estimates can
  // be made.
  if (index_prefix > kEstimateThreshold &&
      deltas_.capacity() < deltas_.size() + (run_end - run_begin)) {
    deltas_.reserve(EstimateFinalCount(index_prefix, deltas_.size()));
  }

  index_.push_back(std::make_pair(index_prefix, deltas_.size()));
  deltas_.insert(deltas_.end(), run_begin, run_end);
}
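
// Usage sketch based on the interfaces in this file (|sorted_prefixes|,
// |full_hashes|, |path|, and |candidate| are placeholders; callers elsewhere
// in safe_browsing supply their own data):
//
//   PrefixSetBuilder builder(sorted_prefixes);
//   scoped_ptr<PrefixSet> prefix_set = builder.GetPrefixSet(full_hashes);
//   prefix_set->WriteFile(path);
//   ...
//   scoped_ptr<PrefixSet> loaded = PrefixSet::LoadFile(path);
//   if (loaded.get() && loaded->PrefixExists(candidate))
//     ...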

PrefixSetBuilder::PrefixSetBuilder()
    : prefix_set_(new PrefixSet()) {
}

PrefixSetBuilder::PrefixSetBuilder(const std::vector<SBPrefix>& prefixes)
    : prefix_set_(new PrefixSet()) {
  for (size_t i = 0; i < prefixes.size(); ++i) {
    AddPrefix(prefixes[i]);
  }
}

PrefixSetBuilder::~PrefixSetBuilder() {
}

scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSet(
    const std::vector<SBFullHash>& hashes) {
  DCHECK(prefix_set_.get());

  // Flush runs until buffered data is gone.
  while (!buffer_.empty()) {
    EmitRun();
  }

  // Precisely size |index_| for read-only. It's 50k-60k, so minor savings, but
  // they're almost free.
  PrefixSet::IndexVector(prefix_set_->index_).swap(prefix_set_->index_);

  prefix_set_->full_hashes_ = hashes;
  std::sort(prefix_set_->full_hashes_.begin(), prefix_set_->full_hashes_.end(),
            SBFullHashLess);

  return prefix_set_.Pass();
}

scoped_ptr<PrefixSet> PrefixSetBuilder::GetPrefixSetNoHashes() {
  return GetPrefixSet(std::vector<SBFullHash>()).Pass();
}
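
// Example of a single run: buffered prefixes {100, 105, 305, 70000} yield the
// index entry (100, deltas_.size()) and deltas {5, 200}; the gap to 70000
// (69695) does not fit in a uint16, so 70000 starts the next run.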

void PrefixSetBuilder::EmitRun() {
  DCHECK(prefix_set_.get());

  SBPrefix prev_prefix = buffer_[0];
  uint16 run[PrefixSet::kMaxRun];
  size_t run_pos = 0;

  size_t i;
  for (i = 1; i < buffer_.size() && run_pos < PrefixSet::kMaxRun; ++i) {
    // Calculate the delta. |unsigned| is mandatory, because the
    // sorted_prefixes could be more than INT_MAX apart.
    DCHECK_GT(buffer_[i], prev_prefix);
    const unsigned delta = buffer_[i] - prev_prefix;
    const uint16 delta16 = static_cast<uint16>(delta);

    // Break the run if the delta doesn't fit.
    if (delta != static_cast<unsigned>(delta16))
      break;

    // Continue the run of deltas.
    run[run_pos++] = delta16;
    DCHECK_EQ(static_cast<unsigned>(run[run_pos - 1]), delta);

    prev_prefix = buffer_[i];
  }
  prefix_set_->AddRun(buffer_[0], run, run + run_pos);
  buffer_.erase(buffer_.begin(), buffer_.begin() + i);
}

void PrefixSetBuilder::AddPrefix(SBPrefix prefix) {
  DCHECK(prefix_set_.get());

  if (buffer_.empty()) {
    DCHECK(prefix_set_->index_.empty());
    DCHECK(prefix_set_->deltas_.empty());
  } else {
    // Drop duplicates.
    if (buffer_.back() == prefix)
      return;

    DCHECK_LT(buffer_.back(), prefix);
  }
  buffer_.push_back(prefix);

  // Flush buffer when a run can be constructed. +1 for the index item, and +1
  // to leave at least one item in the buffer for dropping duplicates.
  if (buffer_.size() > PrefixSet::kMaxRun + 2)
    EmitRun();
}

}  // namespace safe_browsing