1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // We want to measure the similarity of two sequences of bytes as a surrogate
6 // for measuring how well a second sequence will compress differentially to the
9 #include "courgette/difference_estimator.h"
11 #include "base/hash_tables.h"
15 // Our difference measure is the number of k-tuples that occur in Subject that
16 // don't occur in Base.
17 const int kTupleSize
= 4;
21 COMPILE_ASSERT(kTupleSize
>= 4 && kTupleSize
<= 8, kTupleSize_between_4_and_8
);
23 size_t HashTuple(const uint8
* source
) {
24 size_t hash1
= *reinterpret_cast<const uint32
*>(source
);
25 size_t hash2
= *reinterpret_cast<const uint32
*>(source
+ kTupleSize
- 4);
26 size_t hash
= ((hash1
* 17 + hash2
* 37) + (hash1
>> 17)) ^ (hash2
>> 23);
30 bool RegionsEqual(const Region
& a
, const Region
& b
) {
31 if (a
.length() != b
.length())
33 return memcmp(a
.start(), b
.start(), a
.length()) == 0;
36 } // anonymous namepace
38 class DifferenceEstimator::Base
{
40 explicit Base(const Region
& region
) : region_(region
) { }
43 const uint8
* start
= region_
.start();
44 const uint8
* end
= region_
.end() - (kTupleSize
- 1);
45 for (const uint8
* p
= start
; p
< end
; ++p
) {
46 size_t hash
= HashTuple(p
);
51 const Region
& region() const { return region_
; }
55 base::hash_set
<size_t> hashes_
;
57 friend class DifferenceEstimator
;
58 DISALLOW_COPY_AND_ASSIGN(Base
);
61 class DifferenceEstimator::Subject
{
63 explicit Subject(const Region
& region
) : region_(region
) {}
65 const Region
& region() const { return region_
; }
70 DISALLOW_COPY_AND_ASSIGN(Subject
);
73 DifferenceEstimator::DifferenceEstimator() {}
75 DifferenceEstimator::~DifferenceEstimator() {
76 for (size_t i
= 0; i
< owned_bases_
.size(); ++i
)
77 delete owned_bases_
[i
];
78 for (size_t i
= 0; i
< owned_subjects_
.size(); ++i
)
79 delete owned_subjects_
[i
];
82 DifferenceEstimator::Base
* DifferenceEstimator::MakeBase(const Region
& region
) {
83 Base
* base
= new Base(region
);
85 owned_bases_
.push_back(base
);
89 DifferenceEstimator::Subject
* DifferenceEstimator::MakeSubject(
90 const Region
& region
) {
91 Subject
* subject
= new Subject(region
);
92 owned_subjects_
.push_back(subject
);
96 size_t DifferenceEstimator::Measure(Base
* base
, Subject
* subject
) {
97 size_t mismatches
= 0;
98 const uint8
* start
= subject
->region().start();
99 const uint8
* end
= subject
->region().end() - (kTupleSize
- 1);
101 const uint8
* p
= start
;
103 size_t hash
= HashTuple(p
);
104 if (base
->hashes_
.find(hash
) == base
->hashes_
.end()) {
110 if (mismatches
== 0) {
111 if (RegionsEqual(base
->region(), subject
->region()))
114 ++mismatches
; // Guarantee not zero.