1 //===- bolt/Passes/HFSortPlus.cpp - Order functions by hotness ------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // hfsort+ - layout of hot functions with i-TLB cache optimization.
11 // Given an ordering of hot functions (and hence, their assignment to the
12 // i-TLB pages), we can divide all functions calls Into two categories:
13 // - 'short' ones that have a caller-callee distance less than a page;
14 // - 'long' ones where the distance exceeds a page.
15 // The short calls are likely to result in a i-TLB cache hit. For the long ones,
16 // the hit/miss result depends on the 'hotness' of the page (i.e., how often
17 // the page is accessed). Assuming that functions are sent to the i-TLB cache
18 // in a random order, the probability that a page is present in the cache is
19 // proportional to the number of samples corresponding to the functions on the
20 // page. The following algorithm detects short and long calls, and optimizes
21 // the expected number of cache misses for the long ones.
23 //===----------------------------------------------------------------------===//
25 #include "bolt/Passes/HFSort.h"
26 #include "llvm/Support/CommandLine.h"
31 #define DEBUG_TYPE "hfsort"
38 extern cl::OptionCategory BoltOptCategory
;
40 cl::opt
<unsigned> ITLBPageSize("itlb-page-size",
41 cl::desc("The size of i-tlb cache page"),
42 cl::init(4096), cl::ReallyHidden
,
43 cl::cat(BoltOptCategory
));
45 cl::opt
<unsigned> ITLBEntries("itlb-entries",
46 cl::desc("The number of entries in i-tlb cache"),
47 cl::init(16), cl::ReallyHidden
,
48 cl::cat(BoltOptCategory
));
50 static cl::opt
<unsigned> ITLBDensity("itlb-density",
51 cl::desc("The density of i-tlb cache"),
52 cl::init(4096), cl::ReallyHidden
,
53 cl::cat(BoltOptCategory
));
55 static cl::opt
<double> MergeProbability(
57 cl::desc("The minimum probability of a call for merging two clusters"),
58 cl::init(0.9), cl::ReallyHidden
, cl::cat(BoltOptCategory
));
60 static cl::opt
<double> ArcThreshold(
62 cl::desc("The threshold for ignoring arcs with a small relative weight"),
63 cl::init(0.00000001), cl::ReallyHidden
, cl::cat(BoltOptCategory
));
70 using NodeId
= CallGraph::NodeId
;
71 using Arc
= CallGraph::Arc
;
76 using ArcList
= std::vector
<const Arc
*>;
78 // A chain (ordered sequence) of nodes (functions) in the call graph
81 Chain(const Chain
&) = delete;
82 Chain(Chain
&&) = default;
83 Chain
&operator=(const Chain
&) = delete;
84 Chain
&operator=(Chain
&&) = default;
86 explicit Chain(size_t Id_
, NodeId Node
, size_t Samples_
, size_t Size_
)
87 : Id(Id_
), Samples(Samples_
), Size(Size_
), Nodes(1, Node
) {}
89 double density() const { return static_cast<double>(Samples
) / Size
; }
91 Edge
*getEdge(Chain
*Other
) const {
92 for (std::pair
<Chain
*, Edge
*> It
: Edges
)
93 if (It
.first
== Other
)
98 void removeEdge(Chain
*Other
) {
99 auto It
= Edges
.begin();
100 while (It
!= Edges
.end()) {
101 if (It
->first
== Other
) {
109 void addEdge(Chain
*Other
, Edge
*Edge
) { Edges
.emplace_back(Other
, Edge
); }
111 void merge(Chain
*Other
) {
112 Nodes
.insert(Nodes
.end(), Other
->Nodes
.begin(), Other
->Nodes
.end());
113 Samples
+= Other
->Samples
;
117 void mergeEdges(Chain
*Other
);
128 // Cached score for the chain
130 // Cached short-calls for the chain
131 double ShortCalls
{0};
132 // Nodes in the chain
133 std::vector
<NodeId
> Nodes
;
134 // Adjacent chains and corresponding edges (lists of arcs)
135 std::vector
<std::pair
<Chain
*, Edge
*>> Edges
;
138 // An edge in the call graph representing Arcs between two Chains.
139 // When functions are merged Into chains, the edges are combined too so that
140 // there is always at most one edge between a pair of chains
143 Edge(const Edge
&) = delete;
144 Edge(Edge
&&) = default;
145 Edge
&operator=(const Edge
&) = delete;
146 Edge
&operator=(Edge
&&) = default;
148 explicit Edge(Chain
*SrcChain_
, Chain
*DstChain_
, const Arc
*A
)
149 : SrcChain(SrcChain_
), DstChain(DstChain_
), Arcs(1, A
) {}
151 void changeEndpoint(Chain
*From
, Chain
*To
) {
152 if (From
== SrcChain
)
154 if (From
== DstChain
)
158 void moveArcs(Edge
*Other
) {
159 Arcs
.insert(Arcs
.end(), Other
->Arcs
.begin(), Other
->Arcs
.end());
163 void setMergeGain(Chain
*PredChain
, double ForwardGain
, double BackwardGain
) {
164 // When forward and backward gains are the same, prioritize merging that
165 // preserves the original order of the functions in the binary
166 if (std::abs(ForwardGain
- BackwardGain
) < 1e-8) {
167 if (SrcChain
->Id
< DstChain
->Id
) {
168 IsGainForward
= true;
169 CachedGain
= PredChain
== SrcChain
? ForwardGain
: BackwardGain
;
171 IsGainForward
= false;
172 CachedGain
= PredChain
== SrcChain
? BackwardGain
: ForwardGain
;
174 } else if (ForwardGain
> BackwardGain
) {
175 IsGainForward
= PredChain
== SrcChain
;
176 CachedGain
= ForwardGain
;
178 IsGainForward
= PredChain
!= SrcChain
;
179 CachedGain
= BackwardGain
;
183 double gain() const { return CachedGain
; }
185 Chain
*predChain() const { return IsGainForward
? SrcChain
: DstChain
; }
187 Chain
*succChain() const { return IsGainForward
? DstChain
: SrcChain
; }
190 Chain
*SrcChain
{nullptr};
191 Chain
*DstChain
{nullptr};
194 // Original arcs in the binary with corresponding execution counts
196 // Cached gain of merging the pair of chains
197 double CachedGain
{-1.0};
198 // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
199 // we store a flag indicating which of the options results in a higher gain
203 void Chain::mergeEdges(Chain
*Other
) {
204 // Update edges adjacent to chain other
205 for (auto EdgeIt
: Other
->Edges
) {
206 Chain
*const DstChain
= EdgeIt
.first
;
207 Edge
*const DstEdge
= EdgeIt
.second
;
208 Chain
*const TargetChain
= DstChain
== Other
? this : DstChain
;
210 // Find the corresponding edge in the current chain
211 Edge
*CurEdge
= getEdge(TargetChain
);
212 if (CurEdge
== nullptr) {
213 DstEdge
->changeEndpoint(Other
, this);
214 this->addEdge(TargetChain
, DstEdge
);
215 if (DstChain
!= this && DstChain
!= Other
)
216 DstChain
->addEdge(this, DstEdge
);
218 CurEdge
->moveArcs(DstEdge
);
220 // Cleanup leftover edge
221 if (DstChain
!= Other
)
222 DstChain
->removeEdge(Other
);
228 explicit HFSortPlus(const CallGraph
&Cg
) : Cg(Cg
) { initialize(); }
230 /// Run the algorithm and return ordered set of function clusters.
231 std::vector
<Cluster
> run() {
238 outs() << "BOLT-INFO: hfsort+ reduced the number of chains from "
239 << Cg
.numNodes() << " to " << HotChains
.size() << "\n";
241 // Sorting chains by density in decreasing order
242 auto DensityComparator
= [](const Chain
*L
, const Chain
*R
) {
243 if (L
->density() != R
->density())
244 return L
->density() > R
->density();
245 // Making sure the comparison is deterministic
246 return L
->Id
< R
->Id
;
248 llvm::stable_sort(HotChains
, DensityComparator
);
250 // Return the set of clusters that are left, which are the ones that
251 // didn't get merged (so their first func is its original func)
252 std::vector
<Cluster
> Clusters
;
253 Clusters
.reserve(HotChains
.size());
254 for (Chain
*Chain
: HotChains
)
255 Clusters
.emplace_back(Cluster(Chain
->Nodes
, Cg
));
260 /// Initialize the set of active chains, function id to chain mapping,
261 /// total number of samples and function addresses.
263 OutWeight
.resize(Cg
.numNodes(), 0);
264 InWeight
.resize(Cg
.numNodes(), 0);
265 AllChains
.reserve(Cg
.numNodes());
266 HotChains
.reserve(Cg
.numNodes());
267 NodeChain
.resize(Cg
.numNodes(), nullptr);
268 Addr
.resize(Cg
.numNodes(), 0);
271 for (NodeId F
= 0; F
< Cg
.numNodes(); ++F
) {
272 AllChains
.emplace_back(F
, F
, Cg
.samples(F
), Cg
.size(F
));
273 HotChains
.push_back(&AllChains
.back());
274 NodeChain
[F
] = &AllChains
.back();
275 TotalSamples
+= Cg
.samples(F
);
276 for (NodeId Succ
: Cg
.successors(F
)) {
279 const Arc
&Arc
= *Cg
.findArc(F
, Succ
);
280 OutWeight
[F
] += Arc
.weight();
281 InWeight
[Succ
] += Arc
.weight();
285 AllEdges
.reserve(Cg
.numArcs());
286 for (NodeId F
= 0; F
< Cg
.numNodes(); ++F
) {
287 for (NodeId Succ
: Cg
.successors(F
)) {
290 const Arc
&Arc
= *Cg
.findArc(F
, Succ
);
291 if (Arc
.weight() == 0.0 ||
292 Arc
.weight() / TotalSamples
< opts::ArcThreshold
) {
296 Edge
*CurEdge
= NodeChain
[F
]->getEdge(NodeChain
[Succ
]);
297 if (CurEdge
!= nullptr) {
298 // This edge is already present in the graph
299 assert(NodeChain
[Succ
]->getEdge(NodeChain
[F
]) != nullptr);
300 CurEdge
->Arcs
.push_back(&Arc
);
302 // This is a new edge
303 AllEdges
.emplace_back(NodeChain
[F
], NodeChain
[Succ
], &Arc
);
304 NodeChain
[F
]->addEdge(NodeChain
[Succ
], &AllEdges
.back());
305 NodeChain
[Succ
]->addEdge(NodeChain
[F
], &AllEdges
.back());
310 for (Chain
*&Chain
: HotChains
) {
311 Chain
->ShortCalls
= shortCalls(Chain
);
312 Chain
->Score
= score(Chain
);
316 /// The probability that a page with a given density is not in the cache.
318 /// Assume that the hot functions are called in a random order; then the
319 /// probability of an i-TLB page being accessed after a function call is
320 /// p = pageSamples / TotalSamples. The probability that the page is not
321 /// accessed is (1 - p), and the probability that it is not in the cache
322 /// (i.e. not accessed during the last kCacheEntries function calls)
323 /// is (1 - p)^kCacheEntries
324 double missProbability(double ChainDensity
) const {
325 double PageSamples
= ChainDensity
* opts::ITLBDensity
;
327 if (PageSamples
>= TotalSamples
)
330 double P
= PageSamples
/ TotalSamples
;
331 return pow(1.0 - P
, double(opts::ITLBEntries
));
334 /// The expected number of calls on different i-TLB pages for an arc of the
335 /// call graph with a specified weight
336 double expectedCalls(uint64_t SrcAddr
, uint64_t DstAddr
,
337 double Weight
) const {
338 uint64_t Dist
= SrcAddr
>= DstAddr
? SrcAddr
- DstAddr
: DstAddr
- SrcAddr
;
339 if (Dist
>= opts::ITLBPageSize
)
342 double D
= double(Dist
) / double(opts::ITLBPageSize
);
343 // Increasing the importance of shorter calls
344 return (1.0 - D
* D
) * Weight
;
347 /// The expected number of calls within a given chain with both endpoints on
348 /// the same cache page
349 double shortCalls(Chain
*Chain
) const {
350 Edge
*Edge
= Chain
->getEdge(Chain
);
355 for (const Arc
*Arc
: Edge
->Arcs
) {
356 uint64_t SrcAddr
= Addr
[Arc
->src()] + uint64_t(Arc
->avgCallOffset());
357 uint64_t DstAddr
= Addr
[Arc
->dst()];
358 Calls
+= expectedCalls(SrcAddr
, DstAddr
, Arc
->weight());
363 /// The number of calls between the two chains with both endpoints on
364 /// the same i-TLB page, assuming that a given pair of chains gets merged
365 double shortCalls(Chain
*ChainPred
, Chain
*ChainSucc
, Edge
*Edge
) const {
367 for (const Arc
*Arc
: Edge
->Arcs
) {
368 Chain
*SrcChain
= NodeChain
[Arc
->src()];
371 if (SrcChain
== ChainPred
) {
372 SrcAddr
= Addr
[Arc
->src()] + uint64_t(Arc
->avgCallOffset());
373 DstAddr
= Addr
[Arc
->dst()] + ChainPred
->Size
;
376 Addr
[Arc
->src()] + uint64_t(Arc
->avgCallOffset()) + ChainPred
->Size
;
377 DstAddr
= Addr
[Arc
->dst()];
379 Calls
+= expectedCalls(SrcAddr
, DstAddr
, Arc
->weight());
382 Calls
+= ChainPred
->ShortCalls
;
383 Calls
+= ChainSucc
->ShortCalls
;
388 double score(Chain
*Chain
) const {
389 double LongCalls
= Chain
->Samples
- Chain
->ShortCalls
;
390 return LongCalls
* missProbability(Chain
->density());
393 /// The gain of merging two chains.
395 /// We assume that the final chains are sorted by their density, and hence
396 /// every chain is likely to be adjacent with chains of the same density.
397 /// Thus, the 'hotness' of every chain can be estimated by density*pageSize,
398 /// which is used to compute the probability of cache misses for long calls
399 /// of a given chain.
400 /// The result is also scaled by the size of the resulting chain in order to
401 /// increase the chance of merging short chains, which is helpful for
402 /// the i-cache performance.
403 double mergeGain(Chain
*ChainPred
, Chain
*ChainSucc
, Edge
*Edge
) const {
404 // Cache misses on the chains before merging
405 double CurScore
= ChainPred
->Score
+ ChainSucc
->Score
;
407 // Cache misses on the merged chain
408 double LongCalls
= ChainPred
->Samples
+ ChainSucc
->Samples
-
409 shortCalls(ChainPred
, ChainSucc
, Edge
);
410 const double MergedSamples
= ChainPred
->Samples
+ ChainSucc
->Samples
;
411 const double MergedSize
= ChainPred
->Size
+ ChainSucc
->Size
;
412 double NewScore
= LongCalls
* missProbability(MergedSamples
/ MergedSize
);
414 double Gain
= CurScore
- NewScore
;
415 // Scale the result to increase the importance of merging short chains
416 Gain
/= std::min(ChainPred
->Size
, ChainSucc
->Size
);
421 /// Run the first optimization pass of the algorithm:
422 /// Merge chains that call each other with a high probability.
424 // Find candidate pairs of chains for merging
425 std::vector
<const Arc
*> ArcsToMerge
;
426 for (Chain
*ChainPred
: HotChains
) {
427 NodeId F
= ChainPred
->Nodes
.back();
428 for (NodeId Succ
: Cg
.successors(F
)) {
432 const Arc
&Arc
= *Cg
.findArc(F
, Succ
);
433 if (Arc
.weight() == 0.0 ||
434 Arc
.weight() / TotalSamples
< opts::ArcThreshold
)
437 const double CallsFromPred
= OutWeight
[F
];
438 const double CallsToSucc
= InWeight
[Succ
];
439 const double CallsPredSucc
= Arc
.weight();
441 // Probability that the first chain is calling the second one
442 const double ProbOut
=
443 CallsFromPred
> 0 ? CallsPredSucc
/ CallsFromPred
: 0;
444 assert(0.0 <= ProbOut
&& ProbOut
<= 1.0 && "incorrect out-probability");
446 // Probability that the second chain is called From the first one
447 const double ProbIn
= CallsToSucc
> 0 ? CallsPredSucc
/ CallsToSucc
: 0;
448 assert(0.0 <= ProbIn
&& ProbIn
<= 1.0 && "incorrect in-probability");
450 if (std::min(ProbOut
, ProbIn
) >= opts::MergeProbability
)
451 ArcsToMerge
.push_back(&Arc
);
455 // Sort the pairs by the weight in reverse order
456 llvm::sort(ArcsToMerge
, [](const Arc
*L
, const Arc
*R
) {
457 return L
->weight() > R
->weight();
460 // Merge the pairs of chains
461 for (const Arc
*Arc
: ArcsToMerge
) {
462 Chain
*ChainPred
= NodeChain
[Arc
->src()];
463 Chain
*ChainSucc
= NodeChain
[Arc
->dst()];
464 if (ChainPred
== ChainSucc
)
466 if (ChainPred
->Nodes
.back() == Arc
->src() &&
467 ChainSucc
->Nodes
.front() == Arc
->dst())
468 mergeChains(ChainPred
, ChainSucc
);
472 /// Run the second optimization pass of the hfsort+ algorithm:
473 /// Merge pairs of chains while there is an improvement in the
474 /// expected cache miss ratio.
476 // Creating a priority queue containing all edges ordered by the merge gain
477 auto GainComparator
= [](Edge
*L
, Edge
*R
) {
478 if (std::abs(L
->gain() - R
->gain()) > 1e-8)
479 return L
->gain() > R
->gain();
481 // Making sure the comparison is deterministic
482 if (L
->predChain()->Id
!= R
->predChain()->Id
)
483 return L
->predChain()->Id
< R
->predChain()->Id
;
485 return L
->succChain()->Id
< R
->succChain()->Id
;
487 std::set
<Edge
*, decltype(GainComparator
)> Queue(GainComparator
);
489 // Inserting the edges Into the queue
490 for (Chain
*ChainPred
: HotChains
) {
491 for (auto EdgeIt
: ChainPred
->Edges
) {
492 Chain
*ChainSucc
= EdgeIt
.first
;
493 Edge
*ChainEdge
= EdgeIt
.second
;
495 if (ChainPred
== ChainSucc
)
497 // Ignore already processed edges
498 if (ChainEdge
->gain() != -1.0)
501 // Compute the gain of merging the two chains
502 auto ForwardGain
= mergeGain(ChainPred
, ChainSucc
, ChainEdge
);
503 auto BackwardGain
= mergeGain(ChainSucc
, ChainPred
, ChainEdge
);
504 ChainEdge
->setMergeGain(ChainPred
, ForwardGain
, BackwardGain
);
505 if (ChainEdge
->gain() > 0.0)
506 Queue
.insert(ChainEdge
);
510 // Merge the chains while the gain of merging is positive
511 while (!Queue
.empty()) {
512 // Extract the best (top) edge for merging
513 Edge
*It
= *Queue
.begin();
514 Queue
.erase(Queue
.begin());
516 Chain
*BestChainPred
= BestEdge
->predChain();
517 Chain
*BestChainSucc
= BestEdge
->succChain();
518 if (BestChainPred
== BestChainSucc
|| BestEdge
->gain() <= 0.0)
521 // Remove outdated edges
522 for (std::pair
<Chain
*, Edge
*> EdgeIt
: BestChainPred
->Edges
)
523 Queue
.erase(EdgeIt
.second
);
524 for (std::pair
<Chain
*, Edge
*> EdgeIt
: BestChainSucc
->Edges
)
525 Queue
.erase(EdgeIt
.second
);
527 // Merge the best pair of chains
528 mergeChains(BestChainPred
, BestChainSucc
);
530 // Insert newly created edges Into the queue
531 for (auto EdgeIt
: BestChainPred
->Edges
) {
532 Chain
*ChainSucc
= EdgeIt
.first
;
533 Edge
*ChainEdge
= EdgeIt
.second
;
535 if (BestChainPred
== ChainSucc
)
538 // Compute the gain of merging the two chains
539 auto ForwardGain
= mergeGain(BestChainPred
, ChainSucc
, ChainEdge
);
540 auto BackwardGain
= mergeGain(ChainSucc
, BestChainPred
, ChainEdge
);
541 ChainEdge
->setMergeGain(BestChainPred
, ForwardGain
, BackwardGain
);
542 if (ChainEdge
->gain() > 0.0)
543 Queue
.insert(ChainEdge
);
548 /// Merge chain From into chain Into and update the list of active chains.
549 void mergeChains(Chain
*Into
, Chain
*From
) {
550 assert(Into
!= From
&& "cannot merge a chain with itself");
553 // Update the chains and addresses for functions merged from From
555 for (NodeId F
: Into
->Nodes
) {
558 CurAddr
+= Cg
.size(F
);
562 Into
->mergeEdges(From
);
565 // Update cached scores for the new chain
566 Into
->ShortCalls
= shortCalls(Into
);
567 Into
->Score
= score(Into
);
569 // Remove chain From From the list of active chains
570 llvm::erase(HotChains
, From
);
577 // All chains of functions
578 std::vector
<Chain
> AllChains
;
580 // Active chains. The vector gets updated at runtime when chains are merged
581 std::vector
<Chain
*> HotChains
;
583 // All edges between chains
584 std::vector
<Edge
> AllEdges
;
587 std::vector
<Chain
*> NodeChain
;
589 // Current address of the function From the beginning of its chain
590 std::vector
<uint64_t> Addr
;
592 // Total weight of outgoing arcs for each function
593 std::vector
<double> OutWeight
;
595 // Total weight of incoming arcs for each function
596 std::vector
<double> InWeight
;
597 // The total number of samples in the graph
598 double TotalSamples
{0};
601 } // end anonymous namespace
603 std::vector
<Cluster
> hfsortPlus(CallGraph
&Cg
) {
604 // It is required that the sum of incoming arc weights is not greater
605 // than the number of samples for every function.
606 // Ensuring the call graph obeys the property before running the algorithm.
607 Cg
.adjustArcWeights();
608 return HFSortPlus(Cg
).run();