Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / bolt / lib / Passes / HFSortPlus.cpp
blob0a481b5418dd25977cd84bb4fd60e35eabfa1d8f
1 //===- bolt/Passes/HFSortPlus.cpp - Order functions by hotness ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // hfsort+ - layout of hot functions with i-TLB cache optimization.
11 // Given an ordering of hot functions (and hence, their assignment to the
12 // i-TLB pages), we can divide all functions calls Into two categories:
13 // - 'short' ones that have a caller-callee distance less than a page;
14 // - 'long' ones where the distance exceeds a page.
15 // The short calls are likely to result in a i-TLB cache hit. For the long ones,
16 // the hit/miss result depends on the 'hotness' of the page (i.e., how often
17 // the page is accessed). Assuming that functions are sent to the i-TLB cache
18 // in a random order, the probability that a page is present in the cache is
19 // proportional to the number of samples corresponding to the functions on the
20 // page. The following algorithm detects short and long calls, and optimizes
21 // the expected number of cache misses for the long ones.
23 //===----------------------------------------------------------------------===//
25 #include "bolt/Passes/HFSort.h"
26 #include "llvm/Support/CommandLine.h"
27 #include <cmath>
28 #include <set>
29 #include <vector>
31 #define DEBUG_TYPE "hfsort"
33 using namespace llvm;
34 using namespace bolt;
36 namespace opts {
38 extern cl::OptionCategory BoltOptCategory;
40 cl::opt<unsigned> ITLBPageSize("itlb-page-size",
41 cl::desc("The size of i-tlb cache page"),
42 cl::init(4096), cl::ReallyHidden,
43 cl::cat(BoltOptCategory));
45 cl::opt<unsigned> ITLBEntries("itlb-entries",
46 cl::desc("The number of entries in i-tlb cache"),
47 cl::init(16), cl::ReallyHidden,
48 cl::cat(BoltOptCategory));
50 static cl::opt<unsigned> ITLBDensity("itlb-density",
51 cl::desc("The density of i-tlb cache"),
52 cl::init(4096), cl::ReallyHidden,
53 cl::cat(BoltOptCategory));
55 static cl::opt<double> MergeProbability(
56 "merge-probability",
57 cl::desc("The minimum probability of a call for merging two clusters"),
58 cl::init(0.9), cl::ReallyHidden, cl::cat(BoltOptCategory));
60 static cl::opt<double> ArcThreshold(
61 "arc-threshold",
62 cl::desc("The threshold for ignoring arcs with a small relative weight"),
63 cl::init(0.00000001), cl::ReallyHidden, cl::cat(BoltOptCategory));
65 } // namespace opts
67 namespace llvm {
68 namespace bolt {
70 using NodeId = CallGraph::NodeId;
71 using Arc = CallGraph::Arc;
73 namespace {
75 class Edge;
76 using ArcList = std::vector<const Arc *>;
78 // A chain (ordered sequence) of nodes (functions) in the call graph
79 class Chain {
80 public:
81 Chain(const Chain &) = delete;
82 Chain(Chain &&) = default;
83 Chain &operator=(const Chain &) = delete;
84 Chain &operator=(Chain &&) = default;
86 explicit Chain(size_t Id_, NodeId Node, size_t Samples_, size_t Size_)
87 : Id(Id_), Samples(Samples_), Size(Size_), Nodes(1, Node) {}
89 double density() const { return static_cast<double>(Samples) / Size; }
91 Edge *getEdge(Chain *Other) const {
92 for (std::pair<Chain *, Edge *> It : Edges)
93 if (It.first == Other)
94 return It.second;
95 return nullptr;
98 void removeEdge(Chain *Other) {
99 auto It = Edges.begin();
100 while (It != Edges.end()) {
101 if (It->first == Other) {
102 Edges.erase(It);
103 return;
105 It++;
109 void addEdge(Chain *Other, Edge *Edge) { Edges.emplace_back(Other, Edge); }
111 void merge(Chain *Other) {
112 Nodes.insert(Nodes.end(), Other->Nodes.begin(), Other->Nodes.end());
113 Samples += Other->Samples;
114 Size += Other->Size;
117 void mergeEdges(Chain *Other);
119 void clear() {
120 Nodes.clear();
121 Edges.clear();
124 public:
125 size_t Id;
126 uint64_t Samples;
127 uint64_t Size;
128 // Cached score for the chain
129 double Score{0};
130 // Cached short-calls for the chain
131 double ShortCalls{0};
132 // Nodes in the chain
133 std::vector<NodeId> Nodes;
134 // Adjacent chains and corresponding edges (lists of arcs)
135 std::vector<std::pair<Chain *, Edge *>> Edges;
138 // An edge in the call graph representing Arcs between two Chains.
139 // When functions are merged Into chains, the edges are combined too so that
140 // there is always at most one edge between a pair of chains
141 class Edge {
142 public:
143 Edge(const Edge &) = delete;
144 Edge(Edge &&) = default;
145 Edge &operator=(const Edge &) = delete;
146 Edge &operator=(Edge &&) = default;
148 explicit Edge(Chain *SrcChain_, Chain *DstChain_, const Arc *A)
149 : SrcChain(SrcChain_), DstChain(DstChain_), Arcs(1, A) {}
151 void changeEndpoint(Chain *From, Chain *To) {
152 if (From == SrcChain)
153 SrcChain = To;
154 if (From == DstChain)
155 DstChain = To;
158 void moveArcs(Edge *Other) {
159 Arcs.insert(Arcs.end(), Other->Arcs.begin(), Other->Arcs.end());
160 Other->Arcs.clear();
163 void setMergeGain(Chain *PredChain, double ForwardGain, double BackwardGain) {
164 // When forward and backward gains are the same, prioritize merging that
165 // preserves the original order of the functions in the binary
166 if (std::abs(ForwardGain - BackwardGain) < 1e-8) {
167 if (SrcChain->Id < DstChain->Id) {
168 IsGainForward = true;
169 CachedGain = PredChain == SrcChain ? ForwardGain : BackwardGain;
170 } else {
171 IsGainForward = false;
172 CachedGain = PredChain == SrcChain ? BackwardGain : ForwardGain;
174 } else if (ForwardGain > BackwardGain) {
175 IsGainForward = PredChain == SrcChain;
176 CachedGain = ForwardGain;
177 } else {
178 IsGainForward = PredChain != SrcChain;
179 CachedGain = BackwardGain;
183 double gain() const { return CachedGain; }
185 Chain *predChain() const { return IsGainForward ? SrcChain : DstChain; }
187 Chain *succChain() const { return IsGainForward ? DstChain : SrcChain; }
189 private:
190 Chain *SrcChain{nullptr};
191 Chain *DstChain{nullptr};
193 public:
194 // Original arcs in the binary with corresponding execution counts
195 ArcList Arcs;
196 // Cached gain of merging the pair of chains
197 double CachedGain{-1.0};
198 // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
199 // we store a flag indicating which of the options results in a higher gain
200 bool IsGainForward;
203 void Chain::mergeEdges(Chain *Other) {
204 // Update edges adjacent to chain other
205 for (auto EdgeIt : Other->Edges) {
206 Chain *const DstChain = EdgeIt.first;
207 Edge *const DstEdge = EdgeIt.second;
208 Chain *const TargetChain = DstChain == Other ? this : DstChain;
210 // Find the corresponding edge in the current chain
211 Edge *CurEdge = getEdge(TargetChain);
212 if (CurEdge == nullptr) {
213 DstEdge->changeEndpoint(Other, this);
214 this->addEdge(TargetChain, DstEdge);
215 if (DstChain != this && DstChain != Other)
216 DstChain->addEdge(this, DstEdge);
217 } else {
218 CurEdge->moveArcs(DstEdge);
220 // Cleanup leftover edge
221 if (DstChain != Other)
222 DstChain->removeEdge(Other);
226 class HFSortPlus {
227 public:
228 explicit HFSortPlus(const CallGraph &Cg) : Cg(Cg) { initialize(); }
230 /// Run the algorithm and return ordered set of function clusters.
231 std::vector<Cluster> run() {
232 // Pass 1
233 runPassOne();
235 // Pass 2
236 runPassTwo();
238 outs() << "BOLT-INFO: hfsort+ reduced the number of chains from "
239 << Cg.numNodes() << " to " << HotChains.size() << "\n";
241 // Sorting chains by density in decreasing order
242 auto DensityComparator = [](const Chain *L, const Chain *R) {
243 if (L->density() != R->density())
244 return L->density() > R->density();
245 // Making sure the comparison is deterministic
246 return L->Id < R->Id;
248 llvm::stable_sort(HotChains, DensityComparator);
250 // Return the set of clusters that are left, which are the ones that
251 // didn't get merged (so their first func is its original func)
252 std::vector<Cluster> Clusters;
253 Clusters.reserve(HotChains.size());
254 for (Chain *Chain : HotChains)
255 Clusters.emplace_back(Cluster(Chain->Nodes, Cg));
256 return Clusters;
259 private:
260 /// Initialize the set of active chains, function id to chain mapping,
261 /// total number of samples and function addresses.
262 void initialize() {
263 OutWeight.resize(Cg.numNodes(), 0);
264 InWeight.resize(Cg.numNodes(), 0);
265 AllChains.reserve(Cg.numNodes());
266 HotChains.reserve(Cg.numNodes());
267 NodeChain.resize(Cg.numNodes(), nullptr);
268 Addr.resize(Cg.numNodes(), 0);
270 // Initialize chains
271 for (NodeId F = 0; F < Cg.numNodes(); ++F) {
272 AllChains.emplace_back(F, F, Cg.samples(F), Cg.size(F));
273 HotChains.push_back(&AllChains.back());
274 NodeChain[F] = &AllChains.back();
275 TotalSamples += Cg.samples(F);
276 for (NodeId Succ : Cg.successors(F)) {
277 if (F == Succ)
278 continue;
279 const Arc &Arc = *Cg.findArc(F, Succ);
280 OutWeight[F] += Arc.weight();
281 InWeight[Succ] += Arc.weight();
285 AllEdges.reserve(Cg.numArcs());
286 for (NodeId F = 0; F < Cg.numNodes(); ++F) {
287 for (NodeId Succ : Cg.successors(F)) {
288 if (F == Succ)
289 continue;
290 const Arc &Arc = *Cg.findArc(F, Succ);
291 if (Arc.weight() == 0.0 ||
292 Arc.weight() / TotalSamples < opts::ArcThreshold) {
293 continue;
296 Edge *CurEdge = NodeChain[F]->getEdge(NodeChain[Succ]);
297 if (CurEdge != nullptr) {
298 // This edge is already present in the graph
299 assert(NodeChain[Succ]->getEdge(NodeChain[F]) != nullptr);
300 CurEdge->Arcs.push_back(&Arc);
301 } else {
302 // This is a new edge
303 AllEdges.emplace_back(NodeChain[F], NodeChain[Succ], &Arc);
304 NodeChain[F]->addEdge(NodeChain[Succ], &AllEdges.back());
305 NodeChain[Succ]->addEdge(NodeChain[F], &AllEdges.back());
310 for (Chain *&Chain : HotChains) {
311 Chain->ShortCalls = shortCalls(Chain);
312 Chain->Score = score(Chain);
316 /// The probability that a page with a given density is not in the cache.
318 /// Assume that the hot functions are called in a random order; then the
319 /// probability of an i-TLB page being accessed after a function call is
320 /// p = pageSamples / TotalSamples. The probability that the page is not
321 /// accessed is (1 - p), and the probability that it is not in the cache
322 /// (i.e. not accessed during the last kCacheEntries function calls)
323 /// is (1 - p)^kCacheEntries
324 double missProbability(double ChainDensity) const {
325 double PageSamples = ChainDensity * opts::ITLBDensity;
327 if (PageSamples >= TotalSamples)
328 return 0;
330 double P = PageSamples / TotalSamples;
331 return pow(1.0 - P, double(opts::ITLBEntries));
334 /// The expected number of calls on different i-TLB pages for an arc of the
335 /// call graph with a specified weight
336 double expectedCalls(uint64_t SrcAddr, uint64_t DstAddr,
337 double Weight) const {
338 uint64_t Dist = SrcAddr >= DstAddr ? SrcAddr - DstAddr : DstAddr - SrcAddr;
339 if (Dist >= opts::ITLBPageSize)
340 return 0;
342 double D = double(Dist) / double(opts::ITLBPageSize);
343 // Increasing the importance of shorter calls
344 return (1.0 - D * D) * Weight;
347 /// The expected number of calls within a given chain with both endpoints on
348 /// the same cache page
349 double shortCalls(Chain *Chain) const {
350 Edge *Edge = Chain->getEdge(Chain);
351 if (Edge == nullptr)
352 return 0;
354 double Calls = 0;
355 for (const Arc *Arc : Edge->Arcs) {
356 uint64_t SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
357 uint64_t DstAddr = Addr[Arc->dst()];
358 Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
360 return Calls;
363 /// The number of calls between the two chains with both endpoints on
364 /// the same i-TLB page, assuming that a given pair of chains gets merged
365 double shortCalls(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const {
366 double Calls = 0;
367 for (const Arc *Arc : Edge->Arcs) {
368 Chain *SrcChain = NodeChain[Arc->src()];
369 uint64_t SrcAddr;
370 uint64_t DstAddr;
371 if (SrcChain == ChainPred) {
372 SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
373 DstAddr = Addr[Arc->dst()] + ChainPred->Size;
374 } else {
375 SrcAddr =
376 Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()) + ChainPred->Size;
377 DstAddr = Addr[Arc->dst()];
379 Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
382 Calls += ChainPred->ShortCalls;
383 Calls += ChainSucc->ShortCalls;
385 return Calls;
388 double score(Chain *Chain) const {
389 double LongCalls = Chain->Samples - Chain->ShortCalls;
390 return LongCalls * missProbability(Chain->density());
393 /// The gain of merging two chains.
395 /// We assume that the final chains are sorted by their density, and hence
396 /// every chain is likely to be adjacent with chains of the same density.
397 /// Thus, the 'hotness' of every chain can be estimated by density*pageSize,
398 /// which is used to compute the probability of cache misses for long calls
399 /// of a given chain.
400 /// The result is also scaled by the size of the resulting chain in order to
401 /// increase the chance of merging short chains, which is helpful for
402 /// the i-cache performance.
403 double mergeGain(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const {
404 // Cache misses on the chains before merging
405 double CurScore = ChainPred->Score + ChainSucc->Score;
407 // Cache misses on the merged chain
408 double LongCalls = ChainPred->Samples + ChainSucc->Samples -
409 shortCalls(ChainPred, ChainSucc, Edge);
410 const double MergedSamples = ChainPred->Samples + ChainSucc->Samples;
411 const double MergedSize = ChainPred->Size + ChainSucc->Size;
412 double NewScore = LongCalls * missProbability(MergedSamples / MergedSize);
414 double Gain = CurScore - NewScore;
415 // Scale the result to increase the importance of merging short chains
416 Gain /= std::min(ChainPred->Size, ChainSucc->Size);
418 return Gain;
421 /// Run the first optimization pass of the algorithm:
422 /// Merge chains that call each other with a high probability.
423 void runPassOne() {
424 // Find candidate pairs of chains for merging
425 std::vector<const Arc *> ArcsToMerge;
426 for (Chain *ChainPred : HotChains) {
427 NodeId F = ChainPred->Nodes.back();
428 for (NodeId Succ : Cg.successors(F)) {
429 if (F == Succ)
430 continue;
432 const Arc &Arc = *Cg.findArc(F, Succ);
433 if (Arc.weight() == 0.0 ||
434 Arc.weight() / TotalSamples < opts::ArcThreshold)
435 continue;
437 const double CallsFromPred = OutWeight[F];
438 const double CallsToSucc = InWeight[Succ];
439 const double CallsPredSucc = Arc.weight();
441 // Probability that the first chain is calling the second one
442 const double ProbOut =
443 CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0;
444 assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect out-probability");
446 // Probability that the second chain is called From the first one
447 const double ProbIn = CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0;
448 assert(0.0 <= ProbIn && ProbIn <= 1.0 && "incorrect in-probability");
450 if (std::min(ProbOut, ProbIn) >= opts::MergeProbability)
451 ArcsToMerge.push_back(&Arc);
455 // Sort the pairs by the weight in reverse order
456 llvm::sort(ArcsToMerge, [](const Arc *L, const Arc *R) {
457 return L->weight() > R->weight();
460 // Merge the pairs of chains
461 for (const Arc *Arc : ArcsToMerge) {
462 Chain *ChainPred = NodeChain[Arc->src()];
463 Chain *ChainSucc = NodeChain[Arc->dst()];
464 if (ChainPred == ChainSucc)
465 continue;
466 if (ChainPred->Nodes.back() == Arc->src() &&
467 ChainSucc->Nodes.front() == Arc->dst())
468 mergeChains(ChainPred, ChainSucc);
472 /// Run the second optimization pass of the hfsort+ algorithm:
473 /// Merge pairs of chains while there is an improvement in the
474 /// expected cache miss ratio.
475 void runPassTwo() {
476 // Creating a priority queue containing all edges ordered by the merge gain
477 auto GainComparator = [](Edge *L, Edge *R) {
478 if (std::abs(L->gain() - R->gain()) > 1e-8)
479 return L->gain() > R->gain();
481 // Making sure the comparison is deterministic
482 if (L->predChain()->Id != R->predChain()->Id)
483 return L->predChain()->Id < R->predChain()->Id;
485 return L->succChain()->Id < R->succChain()->Id;
487 std::set<Edge *, decltype(GainComparator)> Queue(GainComparator);
489 // Inserting the edges Into the queue
490 for (Chain *ChainPred : HotChains) {
491 for (auto EdgeIt : ChainPred->Edges) {
492 Chain *ChainSucc = EdgeIt.first;
493 Edge *ChainEdge = EdgeIt.second;
494 // Ignore loop edges
495 if (ChainPred == ChainSucc)
496 continue;
497 // Ignore already processed edges
498 if (ChainEdge->gain() != -1.0)
499 continue;
501 // Compute the gain of merging the two chains
502 auto ForwardGain = mergeGain(ChainPred, ChainSucc, ChainEdge);
503 auto BackwardGain = mergeGain(ChainSucc, ChainPred, ChainEdge);
504 ChainEdge->setMergeGain(ChainPred, ForwardGain, BackwardGain);
505 if (ChainEdge->gain() > 0.0)
506 Queue.insert(ChainEdge);
510 // Merge the chains while the gain of merging is positive
511 while (!Queue.empty()) {
512 // Extract the best (top) edge for merging
513 Edge *It = *Queue.begin();
514 Queue.erase(Queue.begin());
515 Edge *BestEdge = It;
516 Chain *BestChainPred = BestEdge->predChain();
517 Chain *BestChainSucc = BestEdge->succChain();
518 if (BestChainPred == BestChainSucc || BestEdge->gain() <= 0.0)
519 continue;
521 // Remove outdated edges
522 for (std::pair<Chain *, Edge *> EdgeIt : BestChainPred->Edges)
523 Queue.erase(EdgeIt.second);
524 for (std::pair<Chain *, Edge *> EdgeIt : BestChainSucc->Edges)
525 Queue.erase(EdgeIt.second);
527 // Merge the best pair of chains
528 mergeChains(BestChainPred, BestChainSucc);
530 // Insert newly created edges Into the queue
531 for (auto EdgeIt : BestChainPred->Edges) {
532 Chain *ChainSucc = EdgeIt.first;
533 Edge *ChainEdge = EdgeIt.second;
534 // Ignore loop edges
535 if (BestChainPred == ChainSucc)
536 continue;
538 // Compute the gain of merging the two chains
539 auto ForwardGain = mergeGain(BestChainPred, ChainSucc, ChainEdge);
540 auto BackwardGain = mergeGain(ChainSucc, BestChainPred, ChainEdge);
541 ChainEdge->setMergeGain(BestChainPred, ForwardGain, BackwardGain);
542 if (ChainEdge->gain() > 0.0)
543 Queue.insert(ChainEdge);
548 /// Merge chain From into chain Into and update the list of active chains.
549 void mergeChains(Chain *Into, Chain *From) {
550 assert(Into != From && "cannot merge a chain with itself");
551 Into->merge(From);
553 // Update the chains and addresses for functions merged from From
554 size_t CurAddr = 0;
555 for (NodeId F : Into->Nodes) {
556 NodeChain[F] = Into;
557 Addr[F] = CurAddr;
558 CurAddr += Cg.size(F);
561 // Merge edges
562 Into->mergeEdges(From);
563 From->clear();
565 // Update cached scores for the new chain
566 Into->ShortCalls = shortCalls(Into);
567 Into->Score = score(Into);
569 // Remove chain From From the list of active chains
570 llvm::erase(HotChains, From);
573 private:
574 // The call graph
575 const CallGraph &Cg;
577 // All chains of functions
578 std::vector<Chain> AllChains;
580 // Active chains. The vector gets updated at runtime when chains are merged
581 std::vector<Chain *> HotChains;
583 // All edges between chains
584 std::vector<Edge> AllEdges;
586 // Node_id => chain
587 std::vector<Chain *> NodeChain;
589 // Current address of the function From the beginning of its chain
590 std::vector<uint64_t> Addr;
592 // Total weight of outgoing arcs for each function
593 std::vector<double> OutWeight;
595 // Total weight of incoming arcs for each function
596 std::vector<double> InWeight;
597 // The total number of samples in the graph
598 double TotalSamples{0};
601 } // end anonymous namespace
603 std::vector<Cluster> hfsortPlus(CallGraph &Cg) {
604 // It is required that the sum of incoming arc weights is not greater
605 // than the number of samples for every function.
606 // Ensuring the call graph obeys the property before running the algorithm.
607 Cg.adjustArcWeights();
608 return HFSortPlus(Cg).run();
611 } // namespace bolt
612 } // namespace llvm