1 //===-- Clustering.cpp ------------------------------------------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 #include "Clustering.h"
12 #include <unordered_set>
17 // The clustering problem has the following characteristics:
18 // (A) - Low dimension (dimensions are typically proc resource units,
20 // (B) - Number of points : ~thousands (points are measurements of an MCInst)
21 // (C) - Number of clusters: ~tens.
22 // (D) - The number of clusters is not known /a priory/.
23 // (E) - The amount of noise is relatively small.
24 // The problem is rather small. In terms of algorithms, (D) disqualifies
25 // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable.
27 // We've used DBSCAN here because it's simple to implement. This is a pretty
28 // straightforward and inefficient implementation of the pseudocode in [2].
30 // [1] https://en.wikipedia.org/wiki/DBSCAN
31 // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm
33 // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not
36 InstructionBenchmarkClustering::rangeQuery(const size_t Q
) const {
37 std::vector
<size_t> Neighbors
;
38 const auto &QMeasurements
= Points_
[Q
].Measurements
;
39 for (size_t P
= 0, NumPoints
= Points_
.size(); P
< NumPoints
; ++P
) {
42 const auto &PMeasurements
= Points_
[P
].Measurements
;
43 if (PMeasurements
.empty()) // Error point.
45 if (isNeighbour(PMeasurements
, QMeasurements
)) {
46 Neighbors
.push_back(P
);
52 bool InstructionBenchmarkClustering::isNeighbour(
53 const std::vector
<BenchmarkMeasure
> &P
,
54 const std::vector
<BenchmarkMeasure
> &Q
) const {
55 double DistanceSquared
= 0.0;
56 for (size_t I
= 0, E
= P
.size(); I
< E
; ++I
) {
57 const auto Diff
= P
[I
].PerInstructionValue
- Q
[I
].PerInstructionValue
;
58 DistanceSquared
+= Diff
* Diff
;
60 return DistanceSquared
<= EpsilonSquared_
;
63 InstructionBenchmarkClustering::InstructionBenchmarkClustering(
64 const std::vector
<InstructionBenchmark
> &Points
,
65 const double EpsilonSquared
)
66 : Points_(Points
), EpsilonSquared_(EpsilonSquared
),
67 NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {}
69 llvm::Error
InstructionBenchmarkClustering::validateAndSetup() {
70 ClusterIdForPoint_
.resize(Points_
.size());
71 // Mark erroneous measurements out.
72 // All points must have the same number of dimensions, in the same order.
73 const std::vector
<BenchmarkMeasure
> *LastMeasurement
= nullptr;
74 for (size_t P
= 0, NumPoints
= Points_
.size(); P
< NumPoints
; ++P
) {
75 const auto &Point
= Points_
[P
];
76 if (!Point
.Error
.empty()) {
77 ClusterIdForPoint_
[P
] = ClusterId::error();
78 ErrorCluster_
.PointIndices
.push_back(P
);
81 const auto *CurMeasurement
= &Point
.Measurements
;
82 if (LastMeasurement
) {
83 if (LastMeasurement
->size() != CurMeasurement
->size()) {
84 return llvm::make_error
<llvm::StringError
>(
85 "inconsistent measurement dimensions",
86 llvm::inconvertibleErrorCode());
88 for (size_t I
= 0, E
= LastMeasurement
->size(); I
< E
; ++I
) {
89 if (LastMeasurement
->at(I
).Key
!= CurMeasurement
->at(I
).Key
) {
90 return llvm::make_error
<llvm::StringError
>(
91 "inconsistent measurement dimensions keys",
92 llvm::inconvertibleErrorCode());
96 LastMeasurement
= CurMeasurement
;
98 if (LastMeasurement
) {
99 NumDimensions_
= LastMeasurement
->size();
101 return llvm::Error::success();
104 void InstructionBenchmarkClustering::dbScan(const size_t MinPts
) {
105 for (size_t P
= 0, NumPoints
= Points_
.size(); P
< NumPoints
; ++P
) {
106 if (!ClusterIdForPoint_
[P
].isUndef())
107 continue; // Previously processed in inner loop.
108 const auto Neighbors
= rangeQuery(P
);
109 if (Neighbors
.size() + 1 < MinPts
) { // Density check.
110 // The region around P is not dense enough to create a new cluster, mark
112 ClusterIdForPoint_
[P
] = ClusterId::noise();
116 // Create a new cluster, add P.
117 Clusters_
.emplace_back(ClusterId::makeValid(Clusters_
.size()));
118 Cluster
&CurrentCluster
= Clusters_
.back();
119 ClusterIdForPoint_
[P
] = CurrentCluster
.Id
; /* Label initial point */
120 CurrentCluster
.PointIndices
.push_back(P
);
122 // Process P's neighbors.
123 std::unordered_set
<size_t> ToProcess(Neighbors
.begin(), Neighbors
.end());
124 while (!ToProcess
.empty()) {
125 // Retrieve a point from the set.
126 const size_t Q
= *ToProcess
.begin();
129 if (ClusterIdForPoint_
[Q
].isNoise()) {
130 // Change noise point to border point.
131 ClusterIdForPoint_
[Q
] = CurrentCluster
.Id
;
132 CurrentCluster
.PointIndices
.push_back(Q
);
135 if (!ClusterIdForPoint_
[Q
].isUndef()) {
136 continue; // Previously processed.
138 // Add Q to the current custer.
139 ClusterIdForPoint_
[Q
] = CurrentCluster
.Id
;
140 CurrentCluster
.PointIndices
.push_back(Q
);
141 // And extend to the neighbors of Q if the region is dense enough.
142 const auto Neighbors
= rangeQuery(Q
);
143 if (Neighbors
.size() + 1 >= MinPts
) {
144 ToProcess
.insert(Neighbors
.begin(), Neighbors
.end());
149 // Add noisy points to noise cluster.
150 for (size_t P
= 0, NumPoints
= Points_
.size(); P
< NumPoints
; ++P
) {
151 if (ClusterIdForPoint_
[P
].isNoise()) {
152 NoiseCluster_
.PointIndices
.push_back(P
);
157 llvm::Expected
<InstructionBenchmarkClustering
>
158 InstructionBenchmarkClustering::create(
159 const std::vector
<InstructionBenchmark
> &Points
, const size_t MinPts
,
160 const double Epsilon
) {
161 InstructionBenchmarkClustering
Clustering(Points
, Epsilon
* Epsilon
);
162 if (auto Error
= Clustering
.validateAndSetup()) {
163 return std::move(Error
);
165 if (Clustering
.ErrorCluster_
.PointIndices
.size() == Points
.size()) {
166 return Clustering
; // Nothing to cluster.
169 Clustering
.dbScan(MinPts
);
173 } // namespace exegesis