1 //===-- Clustering.cpp ------------------------------------------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 #include "Clustering.h"
12 #include <unordered_set>
16 // The clustering problem has the following characteristics:
17 // (A) - Low dimension (dimensions are typically proc resource units,
19 // (B) - Number of points : ~thousands (points are measurements of an MCInst)
20 // (C) - Number of clusters: ~tens.
21 // (D) - The number of clusters is not known /a priory/.
22 // (E) - The amount of noise is relatively small.
23 // The problem is rather small. In terms of algorithms, (D) disqualifies
24 // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable.
26 // We've used DBSCAN here because it's simple to implement. This is a pretty
27 // straightforward and inefficient implementation of the pseudocode in [2].
29 // [1] https://en.wikipedia.org/wiki/DBSCAN
30 // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm
32 // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not
35 InstructionBenchmarkClustering::rangeQuery(const size_t Q
) const {
36 std::vector
<size_t> Neighbors
;
37 const auto &QMeasurements
= Points_
[Q
].Measurements
;
38 for (size_t P
= 0, NumPoints
= Points_
.size(); P
< NumPoints
; ++P
) {
41 const auto &PMeasurements
= Points_
[P
].Measurements
;
42 if (PMeasurements
.empty()) // Error point.
44 if (isNeighbour(PMeasurements
, QMeasurements
)) {
45 Neighbors
.push_back(P
);
51 bool InstructionBenchmarkClustering::isNeighbour(
52 const std::vector
<BenchmarkMeasure
> &P
,
53 const std::vector
<BenchmarkMeasure
> &Q
) const {
54 double DistanceSquared
= 0.0;
55 for (size_t I
= 0, E
= P
.size(); I
< E
; ++I
) {
56 const auto Diff
= P
[I
].PerInstructionValue
- Q
[I
].PerInstructionValue
;
57 DistanceSquared
+= Diff
* Diff
;
59 return DistanceSquared
<= EpsilonSquared_
;
62 InstructionBenchmarkClustering::InstructionBenchmarkClustering(
63 const std::vector
<InstructionBenchmark
> &Points
,
64 const double EpsilonSquared
)
65 : Points_(Points
), EpsilonSquared_(EpsilonSquared
),
66 NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {}
68 llvm::Error
InstructionBenchmarkClustering::validateAndSetup() {
69 ClusterIdForPoint_
.resize(Points_
.size());
70 // Mark erroneous measurements out.
71 // All points must have the same number of dimensions, in the same order.
72 const std::vector
<BenchmarkMeasure
> *LastMeasurement
= nullptr;
73 for (size_t P
= 0, NumPoints
= Points_
.size(); P
< NumPoints
; ++P
) {
74 const auto &Point
= Points_
[P
];
75 if (!Point
.Error
.empty()) {
76 ClusterIdForPoint_
[P
] = ClusterId::error();
77 ErrorCluster_
.PointIndices
.push_back(P
);
80 const auto *CurMeasurement
= &Point
.Measurements
;
81 if (LastMeasurement
) {
82 if (LastMeasurement
->size() != CurMeasurement
->size()) {
83 return llvm::make_error
<llvm::StringError
>(
84 "inconsistent measurement dimensions",
85 llvm::inconvertibleErrorCode());
87 for (size_t I
= 0, E
= LastMeasurement
->size(); I
< E
; ++I
) {
88 if (LastMeasurement
->at(I
).Key
!= CurMeasurement
->at(I
).Key
) {
89 return llvm::make_error
<llvm::StringError
>(
90 "inconsistent measurement dimensions keys",
91 llvm::inconvertibleErrorCode());
95 LastMeasurement
= CurMeasurement
;
97 if (LastMeasurement
) {
98 NumDimensions_
= LastMeasurement
->size();
100 return llvm::Error::success();
103 void InstructionBenchmarkClustering::dbScan(const size_t MinPts
) {
104 for (size_t P
= 0, NumPoints
= Points_
.size(); P
< NumPoints
; ++P
) {
105 if (!ClusterIdForPoint_
[P
].isUndef())
106 continue; // Previously processed in inner loop.
107 const auto Neighbors
= rangeQuery(P
);
108 if (Neighbors
.size() + 1 < MinPts
) { // Density check.
109 // The region around P is not dense enough to create a new cluster, mark
111 ClusterIdForPoint_
[P
] = ClusterId::noise();
115 // Create a new cluster, add P.
116 Clusters_
.emplace_back(ClusterId::makeValid(Clusters_
.size()));
117 Cluster
&CurrentCluster
= Clusters_
.back();
118 ClusterIdForPoint_
[P
] = CurrentCluster
.Id
; /* Label initial point */
119 CurrentCluster
.PointIndices
.push_back(P
);
121 // Process P's neighbors.
122 std::unordered_set
<size_t> ToProcess(Neighbors
.begin(), Neighbors
.end());
123 while (!ToProcess
.empty()) {
124 // Retrieve a point from the set.
125 const size_t Q
= *ToProcess
.begin();
128 if (ClusterIdForPoint_
[Q
].isNoise()) {
129 // Change noise point to border point.
130 ClusterIdForPoint_
[Q
] = CurrentCluster
.Id
;
131 CurrentCluster
.PointIndices
.push_back(Q
);
134 if (!ClusterIdForPoint_
[Q
].isUndef()) {
135 continue; // Previously processed.
137 // Add Q to the current custer.
138 ClusterIdForPoint_
[Q
] = CurrentCluster
.Id
;
139 CurrentCluster
.PointIndices
.push_back(Q
);
140 // And extend to the neighbors of Q if the region is dense enough.
141 const auto Neighbors
= rangeQuery(Q
);
142 if (Neighbors
.size() + 1 >= MinPts
) {
143 ToProcess
.insert(Neighbors
.begin(), Neighbors
.end());
148 // Add noisy points to noise cluster.
149 for (size_t P
= 0, NumPoints
= Points_
.size(); P
< NumPoints
; ++P
) {
150 if (ClusterIdForPoint_
[P
].isNoise()) {
151 NoiseCluster_
.PointIndices
.push_back(P
);
156 llvm::Expected
<InstructionBenchmarkClustering
>
157 InstructionBenchmarkClustering::create(
158 const std::vector
<InstructionBenchmark
> &Points
, const size_t MinPts
,
159 const double Epsilon
) {
160 InstructionBenchmarkClustering
Clustering(Points
, Epsilon
* Epsilon
);
161 if (auto Error
= Clustering
.validateAndSetup()) {
162 return std::move(Error
);
164 if (Clustering
.ErrorCluster_
.PointIndices
.size() == Points
.size()) {
165 return Clustering
; // Nothing to cluster.
168 Clustering
.dbScan(MinPts
);
172 } // namespace exegesis