1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // We want to measure the similarity of two sequences of bytes as a surrogate
6 // for measuring how well a second sequence will compress differentially to the
9 #include "courgette/difference_estimator.h"
11 #include "base/containers/hash_tables.h"
15 // Our difference measure is the number of k-tuples that occur in Subject that
16 // don't occur in Base.
17 const int kTupleSize
= 4;
21 static_assert(kTupleSize
>= 4 && kTupleSize
<= 8,
22 "kTupleSize should be between 4 and 8");
24 size_t HashTuple(const uint8
* source
) {
25 size_t hash1
= *reinterpret_cast<const uint32
*>(source
);
26 size_t hash2
= *reinterpret_cast<const uint32
*>(source
+ kTupleSize
- 4);
27 size_t hash
= ((hash1
* 17 + hash2
* 37) + (hash1
>> 17)) ^ (hash2
>> 23);
31 bool RegionsEqual(const Region
& a
, const Region
& b
) {
32 if (a
.length() != b
.length())
34 return memcmp(a
.start(), b
.start(), a
.length()) == 0;
37 } // anonymous namepace
39 class DifferenceEstimator::Base
{
41 explicit Base(const Region
& region
) : region_(region
) { }
44 const uint8
* start
= region_
.start();
45 const uint8
* end
= region_
.end() - (kTupleSize
- 1);
46 for (const uint8
* p
= start
; p
< end
; ++p
) {
47 size_t hash
= HashTuple(p
);
52 const Region
& region() const { return region_
; }
56 base::hash_set
<size_t> hashes_
;
58 friend class DifferenceEstimator
;
59 DISALLOW_COPY_AND_ASSIGN(Base
);
62 class DifferenceEstimator::Subject
{
64 explicit Subject(const Region
& region
) : region_(region
) {}
66 const Region
& region() const { return region_
; }
71 DISALLOW_COPY_AND_ASSIGN(Subject
);
74 DifferenceEstimator::DifferenceEstimator() {}
76 DifferenceEstimator::~DifferenceEstimator() {
77 for (size_t i
= 0; i
< owned_bases_
.size(); ++i
)
78 delete owned_bases_
[i
];
79 for (size_t i
= 0; i
< owned_subjects_
.size(); ++i
)
80 delete owned_subjects_
[i
];
83 DifferenceEstimator::Base
* DifferenceEstimator::MakeBase(const Region
& region
) {
84 Base
* base
= new Base(region
);
86 owned_bases_
.push_back(base
);
90 DifferenceEstimator::Subject
* DifferenceEstimator::MakeSubject(
91 const Region
& region
) {
92 Subject
* subject
= new Subject(region
);
93 owned_subjects_
.push_back(subject
);
97 size_t DifferenceEstimator::Measure(Base
* base
, Subject
* subject
) {
98 size_t mismatches
= 0;
99 const uint8
* start
= subject
->region().start();
100 const uint8
* end
= subject
->region().end() - (kTupleSize
- 1);
102 const uint8
* p
= start
;
104 size_t hash
= HashTuple(p
);
105 if (base
->hashes_
.find(hash
) == base
->hashes_
.end()) {
111 if (mismatches
== 0) {
112 if (RegionsEqual(base
->region(), subject
->region()))
115 ++mismatches
; // Guarantee not zero.