Roll leveldb 3f7758:803d69 (v1.17 -> v1.18)
[chromium-blink-merge.git] / courgette / difference_estimator.cc
blob3b2cddc94dec0099107bfd6e643d2fc997cc1345
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // We want to measure the similarity of two sequences of bytes as a surrogate
6 // for measuring how well a second sequence will compress differentially to the
7 // first sequence.
9 #include "courgette/difference_estimator.h"
11 #include "base/containers/hash_tables.h"
13 namespace courgette {
15 // Our difference measure is the number of k-tuples that occur in Subject that
16 // don't occur in Base.
17 const int kTupleSize = 4;
19 namespace {
21 COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8);
23 size_t HashTuple(const uint8* source) {
24 size_t hash1 = *reinterpret_cast<const uint32*>(source);
25 size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4);
26 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23);
27 return hash;
30 bool RegionsEqual(const Region& a, const Region& b) {
31 if (a.length() != b.length())
32 return false;
33 return memcmp(a.start(), b.start(), a.length()) == 0;
36 } // anonymous namepace
38 class DifferenceEstimator::Base {
39 public:
40 explicit Base(const Region& region) : region_(region) { }
42 void Init() {
43 const uint8* start = region_.start();
44 const uint8* end = region_.end() - (kTupleSize - 1);
45 for (const uint8* p = start; p < end; ++p) {
46 size_t hash = HashTuple(p);
47 hashes_.insert(hash);
51 const Region& region() const { return region_; }
53 private:
54 Region region_;
55 base::hash_set<size_t> hashes_;
57 friend class DifferenceEstimator;
58 DISALLOW_COPY_AND_ASSIGN(Base);
61 class DifferenceEstimator::Subject {
62 public:
63 explicit Subject(const Region& region) : region_(region) {}
65 const Region& region() const { return region_; }
67 private:
68 Region region_;
70 DISALLOW_COPY_AND_ASSIGN(Subject);
73 DifferenceEstimator::DifferenceEstimator() {}
75 DifferenceEstimator::~DifferenceEstimator() {
76 for (size_t i = 0; i < owned_bases_.size(); ++i)
77 delete owned_bases_[i];
78 for (size_t i = 0; i < owned_subjects_.size(); ++i)
79 delete owned_subjects_[i];
82 DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) {
83 Base* base = new Base(region);
84 base->Init();
85 owned_bases_.push_back(base);
86 return base;
89 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject(
90 const Region& region) {
91 Subject* subject = new Subject(region);
92 owned_subjects_.push_back(subject);
93 return subject;
96 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) {
97 size_t mismatches = 0;
98 const uint8* start = subject->region().start();
99 const uint8* end = subject->region().end() - (kTupleSize - 1);
101 const uint8* p = start;
102 while (p < end) {
103 size_t hash = HashTuple(p);
104 if (base->hashes_.find(hash) == base->hashes_.end()) {
105 ++mismatches;
107 p += 1;
110 if (mismatches == 0) {
111 if (RegionsEqual(base->region(), subject->region()))
112 return 0;
114 ++mismatches; // Guarantee not zero.
115 return mismatches;
118 } // namespace