1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/safe_browsing/prefix_set.h"
10 #include "base/file_util.h"
11 #include "base/files/scoped_file.h"
12 #include "base/logging.h"
14 #include "base/metrics/histogram.h"
15 #include "base/metrics/sparse_histogram.h"
19 // |kMagic| should be reasonably unique, and not match itself across
20 // endianness changes. I generated this value with:
21 // md5 -qs chrome/browser/safe_browsing/prefix_set.cc | colrm 9
22 static uint32 kMagic
= 0x864088dd;
25 // Version 1: b6cb7cfe/r74487 by shess@chromium.org on 2011-02-10
26 // Version 2: 2b59b0a6/r253924 by shess@chromium.org on 2014-02-27
27 // Version 3: ????????/r?????? by shess@chromium.org on 2014-04-??
29 // Version 2 layout is identical to version 1. The sort order of |index_|
30 // changed from |int32| to |uint32| to match the change of |SBPrefix|.
31 // Version 3 adds storage for full hashes.
32 static uint32 kVersion
= 0x3;
46 uint32 full_hashes_size
;
49 // Common std::vector<> implementations add capacity by multiplying from the
50 // current size (usually either by 2 or 1.5) to satisfy push_back() running in
51 // amortized constant time. This is not necessary for insert() at end(), but
52 // AFAICT it seems true for some implementations. SBPrefix values should
53 // uniformly cover the 32-bit space, so the final size can be estimated given a
54 // subset of the input.
56 // |kEstimateThreshold| is when estimates start converging. Results are strong
57 // starting around 1<<27. 1<<30 is chosen to prevent the degenerate case of
58 // resizing capacity from >50% to 100%.
60 // TODO(shess): I'm sure there is math in the world to describe good settings
61 // for estimating the size of a uniformly-distributed set of integers from a
62 // sorted subset. I do not have such math in me, so I assumed that my current
63 // organic database of prefixes was scale-free, and wrote a script to see how
64 // often given slop values would always suffice for given strides. At 1<<30,
65 // .5% slop was sufficient to cover all cases (though the code below uses 1%).
67 // TODO(shess): A smaller threshold uses less transient space in reallocation.
68 // 1<<30 uses between 125% and 150%, 1<<29 between 112% and 125%, etc. The cost
69 // is that a smaller threshold needs more slop (locked down for the long term).
70 // 1<<29 worked well with 1%, 1<<27 worked well with 2%.
71 const SBPrefix kEstimateThreshold
= 1 << 30;
72 size_t EstimateFinalCount(SBPrefix current_prefix
, size_t current_count
) {
73 // estimated_count / current_count == estimated_max / current_prefix
74 // For large input sets, estimated_max of 2^32 is close enough.
75 const size_t estimated_prefix_count
= static_cast<size_t>(
76 (static_cast<uint64
>(current_count
) << 32) / current_prefix
);
78 // The estimate has an error bar, if the final total is below the estimate, no
79 // harm, but if it is above a capacity resize will happen at nearly 100%. Add
80 // some slop to make sure all cases are covered.
81 return estimated_prefix_count
+ estimated_prefix_count
/ 100;
86 namespace safe_browsing
{
88 // For |std::upper_bound()| to find a prefix w/in a vector of pairs.
90 bool PrefixSet::PrefixLess(const IndexPair
& a
, const IndexPair
& b
) {
91 return a
.first
< b
.first
;
94 PrefixSet::PrefixSet() {
97 PrefixSet::PrefixSet(IndexVector
* index
,
98 std::vector
<uint16
>* deltas
,
99 std::vector
<SBFullHash
>* full_hashes
) {
100 DCHECK(index
&& deltas
&& full_hashes
);
102 deltas_
.swap(*deltas
);
103 full_hashes_
.swap(*full_hashes
);
106 PrefixSet::~PrefixSet() {}
108 bool PrefixSet::PrefixExists(SBPrefix prefix
) const {
112 // Find the first position after |prefix| in |index_|.
113 IndexVector::const_iterator iter
=
114 std::upper_bound(index_
.begin(), index_
.end(),
115 IndexPair(prefix
, 0), PrefixLess
);
117 // |prefix| comes before anything that's in the set.
118 if (iter
== index_
.begin())
121 // Capture the upper bound of our target entry's deltas.
122 const size_t bound
= (iter
== index_
.end() ? deltas_
.size() : iter
->second
);
124 // Back up to the entry our target is in.
127 // All prefixes in |index_| are in the set.
128 SBPrefix current
= iter
->first
;
129 if (current
== prefix
)
132 // Scan forward accumulating deltas while a match is possible.
133 for (size_t di
= iter
->second
; di
< bound
&& current
< prefix
; ++di
) {
134 current
+= deltas_
[di
];
137 return current
== prefix
;
140 bool PrefixSet::Exists(const SBFullHash
& hash
) const {
141 if (std::binary_search(full_hashes_
.begin(), full_hashes_
.end(),
142 hash
, SBFullHashLess
)) {
145 return PrefixExists(hash
.prefix
);
148 void PrefixSet::GetPrefixes(std::vector
<SBPrefix
>* prefixes
) const {
149 prefixes
->reserve(index_
.size() + deltas_
.size());
151 for (size_t ii
= 0; ii
< index_
.size(); ++ii
) {
152 // The deltas for this |index_| entry run to the next index entry,
153 // or the end of the deltas.
154 const size_t deltas_end
=
155 (ii
+ 1 < index_
.size()) ? index_
[ii
+ 1].second
: deltas_
.size();
157 SBPrefix current
= index_
[ii
].first
;
158 prefixes
->push_back(current
);
159 for (size_t di
= index_
[ii
].second
; di
< deltas_end
; ++di
) {
160 current
+= deltas_
[di
];
161 prefixes
->push_back(current
);
167 scoped_ptr
<PrefixSet
> PrefixSet::LoadFile(const base::FilePath
& filter_name
) {
169 if (!base::GetFileSize(filter_name
, &size_64
))
170 return scoped_ptr
<PrefixSet
>();
171 using base::MD5Digest
;
172 // TODO(shess): Revert to sizeof(FileHeader) for sanity check once v2 is
174 if (size_64
< static_cast<int64
>(sizeof(FileHeader_v2
) + sizeof(MD5Digest
)))
175 return scoped_ptr
<PrefixSet
>();
177 base::ScopedFILE
file(base::OpenFile(filter_name
, "rb"));
179 return scoped_ptr
<PrefixSet
>();
182 size_t read
= fread(&header
, sizeof(header
), 1, file
.get());
184 return scoped_ptr
<PrefixSet
>();
186 // The file looks valid, start building the digest.
187 base::MD5Context context
;
188 base::MD5Init(&context
);
189 base::MD5Update(&context
, base::StringPiece(reinterpret_cast<char*>(&header
),
192 if (header
.magic
!= kMagic
)
193 return scoped_ptr
<PrefixSet
>();
195 // Track version read to inform removal of support for older versions.
196 UMA_HISTOGRAM_SPARSE_SLOWLY("SB2.PrefixSetVersionRead", header
.version
);
198 // TODO(shess): Version 1 and 2 use the same file structure, with version 1
199 // data using a signed sort. For M-35, the data is re-sorted before return.
200 // After M-36, just drop v1 support. <http://crbug.com/346405>
201 // TODO(shess): <http://crbug.com/368044> for removing v2 support.
202 size_t header_size
= sizeof(header
);
203 if (header
.version
== 2 || header
.version
== 1) {
204 // Rewind the file and restart building the digest with the old header
206 FileHeader_v2 v2_header
;
207 if (0 != fseek(file
.get(), 0, SEEK_SET
))
208 return scoped_ptr
<PrefixSet
>();
210 size_t read
= fread(&v2_header
, sizeof(v2_header
), 1, file
.get());
212 return scoped_ptr
<PrefixSet
>();
214 base::MD5Init(&context
);
215 base::MD5Update(&context
,
216 base::StringPiece(reinterpret_cast<char*>(&v2_header
),
219 // The current header is a superset of the old header, fill it in with the
221 header
.magic
= v2_header
.magic
;
222 header
.version
= v2_header
.version
;
223 header
.index_size
= v2_header
.index_size
;
224 header
.deltas_size
= v2_header
.deltas_size
;
225 header
.full_hashes_size
= 0;
226 header_size
= sizeof(v2_header
);
227 } else if (header
.version
!= kVersion
) {
228 return scoped_ptr
<PrefixSet
>();
232 const size_t index_bytes
= sizeof(index
[0]) * header
.index_size
;
234 std::vector
<uint16
> deltas
;
235 const size_t deltas_bytes
= sizeof(deltas
[0]) * header
.deltas_size
;
237 std::vector
<SBFullHash
> full_hashes
;
238 const size_t full_hashes_bytes
=
239 sizeof(full_hashes
[0]) * header
.full_hashes_size
;
241 // Check for bogus sizes before allocating any space.
242 const size_t expected_bytes
= header_size
+
243 index_bytes
+ deltas_bytes
+ full_hashes_bytes
+ sizeof(MD5Digest
);
244 if (static_cast<int64
>(expected_bytes
) != size_64
)
245 return scoped_ptr
<PrefixSet
>();
247 // Read the index vector. Herb Sutter indicates that vectors are
248 // guaranteed to be contiuguous, so reading to where element 0 lives
250 if (header
.index_size
) {
251 index
.resize(header
.index_size
);
252 read
= fread(&(index
[0]), sizeof(index
[0]), index
.size(), file
.get());
253 if (read
!= index
.size())
254 return scoped_ptr
<PrefixSet
>();
255 base::MD5Update(&context
,
256 base::StringPiece(reinterpret_cast<char*>(&(index
[0])),
260 // Read vector of deltas.
261 if (header
.deltas_size
) {
262 deltas
.resize(header
.deltas_size
);
263 read
= fread(&(deltas
[0]), sizeof(deltas
[0]), deltas
.size(), file
.get());
264 if (read
!= deltas
.size())
265 return scoped_ptr
<PrefixSet
>();
266 base::MD5Update(&context
,
267 base::StringPiece(reinterpret_cast<char*>(&(deltas
[0])),
271 // Read vector of full hashes.
272 if (header
.full_hashes_size
) {
273 full_hashes
.resize(header
.full_hashes_size
);
274 read
= fread(&(full_hashes
[0]), sizeof(full_hashes
[0]), full_hashes
.size(),
276 if (read
!= full_hashes
.size())
277 return scoped_ptr
<PrefixSet
>();
278 base::MD5Update(&context
,
280 reinterpret_cast<char*>(&(full_hashes
[0])),
284 base::MD5Digest calculated_digest
;
285 base::MD5Final(&calculated_digest
, &context
);
287 base::MD5Digest file_digest
;
288 read
= fread(&file_digest
, sizeof(file_digest
), 1, file
.get());
290 return scoped_ptr
<PrefixSet
>();
292 if (0 != memcmp(&file_digest
, &calculated_digest
, sizeof(file_digest
)))
293 return scoped_ptr
<PrefixSet
>();
295 // For version 1, fetch the prefixes and re-sort.
296 if (header
.version
== 1) {
297 std::vector
<SBPrefix
> prefixes
;
298 PrefixSet(&index
, &deltas
, &full_hashes
).GetPrefixes(&prefixes
);
299 std::sort(prefixes
.begin(), prefixes
.end());
301 // v1 cannot have full hashes, so no need to propagate a copy here.
302 return PrefixSetBuilder(prefixes
).GetPrefixSetNoHashes().Pass();
305 // Steals vector contents using swap().
306 return scoped_ptr
<PrefixSet
>(new PrefixSet(&index
, &deltas
, &full_hashes
));
309 bool PrefixSet::WriteFile(const base::FilePath
& filter_name
) const {
311 header
.magic
= kMagic
;
312 header
.version
= kVersion
;
313 header
.index_size
= static_cast<uint32
>(index_
.size());
314 header
.deltas_size
= static_cast<uint32
>(deltas_
.size());
315 header
.full_hashes_size
= static_cast<uint32
>(full_hashes_
.size());
317 // Sanity check that the 32-bit values never mess things up.
318 if (static_cast<size_t>(header
.index_size
) != index_
.size() ||
319 static_cast<size_t>(header
.deltas_size
) != deltas_
.size() ||
320 static_cast<size_t>(header
.full_hashes_size
) != full_hashes_
.size()) {
325 base::ScopedFILE
file(base::OpenFile(filter_name
, "wb"));
329 base::MD5Context context
;
330 base::MD5Init(&context
);
332 // TODO(shess): The I/O code in safe_browsing_store_file.cc would
333 // sure be useful about now.
334 size_t written
= fwrite(&header
, sizeof(header
), 1, file
.get());
337 base::MD5Update(&context
, base::StringPiece(reinterpret_cast<char*>(&header
),
340 // As for reads, the standard guarantees the ability to access the
341 // contents of the vector by a pointer to an element.
343 const size_t index_bytes
= sizeof(index_
[0]) * index_
.size();
344 written
= fwrite(&(index_
[0]), sizeof(index_
[0]), index_
.size(),
346 if (written
!= index_
.size())
348 base::MD5Update(&context
,
350 reinterpret_cast<const char*>(&(index_
[0])),
354 if (deltas_
.size()) {
355 const size_t deltas_bytes
= sizeof(deltas_
[0]) * deltas_
.size();
356 written
= fwrite(&(deltas_
[0]), sizeof(deltas_
[0]), deltas_
.size(),
358 if (written
!= deltas_
.size())
360 base::MD5Update(&context
,
362 reinterpret_cast<const char*>(&(deltas_
[0])),
366 if (full_hashes_
.size()) {
367 const size_t elt_size
= sizeof(full_hashes_
[0]);
368 const size_t elts
= full_hashes_
.size();
369 const size_t full_hashes_bytes
= elt_size
* elts
;
370 written
= fwrite(&(full_hashes_
[0]), elt_size
, elts
, file
.get());
373 base::MD5Update(&context
,
375 reinterpret_cast<const char*>(&(full_hashes_
[0])),
379 base::MD5Digest digest
;
380 base::MD5Final(&digest
, &context
);
381 written
= fwrite(&digest
, sizeof(digest
), 1, file
.get());
385 // TODO(shess): Can this code check that the close was successful?
391 void PrefixSet::AddRun(SBPrefix index_prefix
,
392 const uint16
* run_begin
, const uint16
* run_end
) {
393 // Preempt organic capacity decisions for |delta_| once strong estimates can
395 if (index_prefix
> kEstimateThreshold
&&
396 deltas_
.capacity() < deltas_
.size() + (run_end
- run_begin
)) {
397 deltas_
.reserve(EstimateFinalCount(index_prefix
, deltas_
.size()));
400 index_
.push_back(std::make_pair(index_prefix
, deltas_
.size()));
401 deltas_
.insert(deltas_
.end(), run_begin
, run_end
);
404 PrefixSetBuilder::PrefixSetBuilder()
405 : prefix_set_(new PrefixSet()) {
408 PrefixSetBuilder::PrefixSetBuilder(const std::vector
<SBPrefix
>& prefixes
)
409 : prefix_set_(new PrefixSet()) {
410 for (size_t i
= 0; i
< prefixes
.size(); ++i
) {
411 AddPrefix(prefixes
[i
]);
415 PrefixSetBuilder::~PrefixSetBuilder() {
418 scoped_ptr
<PrefixSet
> PrefixSetBuilder::GetPrefixSet(
419 const std::vector
<SBFullHash
>& hashes
) {
420 DCHECK(prefix_set_
.get());
422 // Flush runs until buffered data is gone.
423 while (!buffer_
.empty()) {
427 // Precisely size |index_| for read-only. It's 50k-60k, so minor savings, but
428 // they're almost free.
429 PrefixSet::IndexVector(prefix_set_
->index_
).swap(prefix_set_
->index_
);
431 prefix_set_
->full_hashes_
= hashes
;
432 std::sort(prefix_set_
->full_hashes_
.begin(), prefix_set_
->full_hashes_
.end(),
435 return prefix_set_
.Pass();
438 scoped_ptr
<PrefixSet
> PrefixSetBuilder::GetPrefixSetNoHashes() {
439 return GetPrefixSet(std::vector
<SBFullHash
>()).Pass();
442 void PrefixSetBuilder::EmitRun() {
443 DCHECK(prefix_set_
.get());
445 SBPrefix prev_prefix
= buffer_
[0];
446 uint16 run
[PrefixSet::kMaxRun
];
450 for (i
= 1; i
< buffer_
.size() && run_pos
< PrefixSet::kMaxRun
; ++i
) {
451 // Calculate the delta. |unsigned| is mandatory, because the
452 // sorted_prefixes could be more than INT_MAX apart.
453 DCHECK_GT(buffer_
[i
], prev_prefix
);
454 const unsigned delta
= buffer_
[i
] - prev_prefix
;
455 const uint16 delta16
= static_cast<uint16
>(delta
);
457 // Break the run if the delta doesn't fit.
458 if (delta
!= static_cast<unsigned>(delta16
))
461 // Continue the run of deltas.
462 run
[run_pos
++] = delta16
;
463 DCHECK_EQ(static_cast<unsigned>(run
[run_pos
- 1]), delta
);
465 prev_prefix
= buffer_
[i
];
467 prefix_set_
->AddRun(buffer_
[0], run
, run
+ run_pos
);
468 buffer_
.erase(buffer_
.begin(), buffer_
.begin() + i
);
471 void PrefixSetBuilder::AddPrefix(SBPrefix prefix
) {
472 DCHECK(prefix_set_
.get());
474 if (buffer_
.empty()) {
475 DCHECK(prefix_set_
->index_
.empty());
476 DCHECK(prefix_set_
->deltas_
.empty());
479 if (buffer_
.back() == prefix
)
482 DCHECK_LT(buffer_
.back(), prefix
);
484 buffer_
.push_back(prefix
);
486 // Flush buffer when a run can be constructed. +1 for the index item, and +1
487 // to leave at least one item in the buffer for dropping duplicates.
488 if (buffer_
.size() > PrefixSet::kMaxRun
+ 2)
492 } // namespace safe_browsing