Include all dupe types (event when value is zero) in scan stats.
[chromium-blink-merge.git] / courgette / difference_estimator.cc
blob82c23eef33c99a7109669bc620dd7f01ad8dccf2
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // We want to measure the similarity of two sequences of bytes as a surrogate
6 // for measuring how well a second sequence will compress differentially to the
7 // first sequence.
9 #include "courgette/difference_estimator.h"
11 #include "base/containers/hash_tables.h"
13 namespace courgette {
15 // Our difference measure is the number of k-tuples that occur in Subject that
16 // don't occur in Base.
17 const int kTupleSize = 4;
19 namespace {
21 static_assert(kTupleSize >= 4 && kTupleSize <= 8,
22 "kTupleSize should be between 4 and 8");
24 size_t HashTuple(const uint8* source) {
25 size_t hash1 = *reinterpret_cast<const uint32*>(source);
26 size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4);
27 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23);
28 return hash;
31 bool RegionsEqual(const Region& a, const Region& b) {
32 if (a.length() != b.length())
33 return false;
34 return memcmp(a.start(), b.start(), a.length()) == 0;
37 } // anonymous namepace
39 class DifferenceEstimator::Base {
40 public:
41 explicit Base(const Region& region) : region_(region) { }
43 void Init() {
44 if (region_.length() < kTupleSize)
45 return;
46 const uint8* start = region_.start();
47 const uint8* end = region_.end() - (kTupleSize - 1);
48 for (const uint8* p = start; p < end; ++p) {
49 size_t hash = HashTuple(p);
50 hashes_.insert(hash);
54 const Region& region() const { return region_; }
56 private:
57 Region region_;
58 base::hash_set<size_t> hashes_;
60 friend class DifferenceEstimator;
61 DISALLOW_COPY_AND_ASSIGN(Base);
64 class DifferenceEstimator::Subject {
65 public:
66 explicit Subject(const Region& region) : region_(region) {}
68 const Region& region() const { return region_; }
70 private:
71 Region region_;
73 DISALLOW_COPY_AND_ASSIGN(Subject);
76 DifferenceEstimator::DifferenceEstimator() {}
78 DifferenceEstimator::~DifferenceEstimator() {
79 for (size_t i = 0; i < owned_bases_.size(); ++i)
80 delete owned_bases_[i];
81 for (size_t i = 0; i < owned_subjects_.size(); ++i)
82 delete owned_subjects_[i];
85 DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) {
86 Base* base = new Base(region);
87 base->Init();
88 owned_bases_.push_back(base);
89 return base;
92 DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject(
93 const Region& region) {
94 Subject* subject = new Subject(region);
95 owned_subjects_.push_back(subject);
96 return subject;
99 size_t DifferenceEstimator::Measure(Base* base, Subject* subject) {
100 size_t mismatches = 0;
101 if (subject->region().length() >= kTupleSize) {
102 const uint8* start = subject->region().start();
103 const uint8* end = subject->region().end() - (kTupleSize - 1);
105 const uint8* p = start;
106 while (p < end) {
107 size_t hash = HashTuple(p);
108 if (base->hashes_.find(hash) == base->hashes_.end()) {
109 ++mismatches;
111 p += 1;
115 if (mismatches == 0) {
116 if (RegionsEqual(base->region(), subject->region()))
117 return 0;
119 ++mismatches; // Guarantee not zero.
120 return mismatches;
123 } // namespace