1 //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements classes used by several basic block reordering
12 //===----------------------------------------------------------------------===//
14 #include "bolt/Passes/ReorderAlgorithm.h"
15 #include "bolt/Core/BinaryBasicBlock.h"
16 #include "bolt/Core/BinaryFunction.h"
17 #include "llvm/Support/CommandLine.h"
18 #include "llvm/Transforms/Utils/CodeLayout.h"
24 #define DEBUG_TYPE "bolt"
31 extern cl::OptionCategory BoltOptCategory
;
32 extern cl::opt
<bool> NoThreads
;
34 static cl::opt
<unsigned> ColdThreshold(
36 cl::desc("tenths of percents of main entry frequency to use as a "
37 "threshold when evaluating whether a basic block is cold "
38 "(0 means it is only considered cold if the block has zero "
39 "samples). Default: 0 "),
40 cl::init(0), cl::ZeroOrMore
, cl::Hidden
, cl::cat(BoltOptCategory
));
42 static cl::opt
<bool> PrintClusters("print-clusters", cl::desc("print clusters"),
43 cl::Hidden
, cl::cat(BoltOptCategory
));
45 cl::opt
<uint32_t> RandomSeed("bolt-seed", cl::desc("seed for randomization"),
46 cl::init(42), cl::Hidden
,
47 cl::cat(BoltOptCategory
));
53 template <class T
> inline void hashCombine(size_t &Seed
, const T
&Val
) {
55 Seed
^= Hasher(Val
) + 0x9e3779b9 + (Seed
<< 6) + (Seed
>> 2);
58 template <typename A
, typename B
> struct HashPair
{
59 size_t operator()(const std::pair
<A
, B
> &Val
) const {
61 size_t Seed
= Hasher(Val
.first
);
62 hashCombine(Seed
, Val
.second
);
69 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext
&BC
) {
70 // Create a separate MCCodeEmitter to allow lock-free execution
71 BinaryContext::IndependentCodeEmitter Emitter
;
73 Emitter
= BC
.createIndependentMCCodeEmitter();
75 AvgFreq
.resize(Clusters
.size(), 0.0);
76 for (uint32_t I
= 0, E
= Clusters
.size(); I
< E
; ++I
) {
78 uint64_t ClusterSize
= 0;
79 for (const BinaryBasicBlock
*BB
: Clusters
[I
]) {
80 if (BB
->getNumNonPseudos() > 0) {
81 Freq
+= BB
->getExecutionCount();
82 // Estimate the size of a block in bytes at run time
83 // NOTE: This might be inaccurate
84 ClusterSize
+= BB
->estimateSize(Emitter
.MCE
.get());
87 AvgFreq
[I
] = ClusterSize
== 0 ? 0 : Freq
/ ClusterSize
;
91 void ClusterAlgorithm::printClusters() const {
92 for (uint32_t I
= 0, E
= Clusters
.size(); I
< E
; ++I
) {
93 errs() << "Cluster number " << I
;
94 if (AvgFreq
.size() == Clusters
.size())
95 errs() << " (frequency: " << AvgFreq
[I
] << ")";
98 for (const BinaryBasicBlock
*BB
: Clusters
[I
]) {
99 errs() << Sep
<< BB
->getName();
106 void ClusterAlgorithm::reset() {
108 ClusterEdges
.clear();
112 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream
&OS
) const {
113 OS
<< Src
->getName() << " -> " << Dst
->getName() << ", count: " << Count
;
116 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy
&E
) const {
117 HashPair
<const BinaryBasicBlock
*, const BinaryBasicBlock
*> Hasher
;
118 return Hasher(std::make_pair(E
.Src
, E
.Dst
));
121 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy
&A
,
122 const EdgeTy
&B
) const {
123 return A
.Src
== B
.Src
&& A
.Dst
== B
.Dst
;
126 void GreedyClusterAlgorithm::clusterBasicBlocks(BinaryFunction
&BF
,
130 // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
131 // maximize weight during a path traversing all BBs. In this way, we will
132 // convert the hottest branches into fall-throughs.
134 // This is the queue of edges from which we will pop edges and use them to
135 // cluster basic blocks in a greedy fashion.
136 std::vector
<EdgeTy
> Queue
;
138 // Initialize inter-cluster weights.
140 ClusterEdges
.resize(BF
.getLayout().block_size());
142 // Initialize clusters and edge queue.
143 for (BinaryBasicBlock
*BB
: BF
.getLayout().blocks()) {
144 // Create a cluster for this BB.
145 uint32_t I
= Clusters
.size();
146 Clusters
.emplace_back();
147 std::vector
<BinaryBasicBlock
*> &Cluster
= Clusters
.back();
148 Cluster
.push_back(BB
);
149 BBToClusterMap
[BB
] = I
;
150 // Populate priority queue with edges.
151 auto BI
= BB
->branch_info_begin();
152 for (const BinaryBasicBlock
*I
: BB
->successors()) {
153 assert(BI
->Count
!= BinaryBasicBlock::COUNT_NO_PROFILE
&&
154 "attempted reordering blocks of function with no profile data");
155 Queue
.emplace_back(EdgeTy(BB
, I
, BI
->Count
));
159 // Sort and adjust the edge queue.
160 initQueue(Queue
, BF
);
162 // Grow clusters in a greedy fashion.
163 while (!Queue
.empty()) {
164 EdgeTy E
= Queue
.back();
167 const BinaryBasicBlock
*SrcBB
= E
.Src
;
168 const BinaryBasicBlock
*DstBB
= E
.Dst
;
170 LLVM_DEBUG(dbgs() << "Popped edge "; E
.print(dbgs()); dbgs() << "\n");
172 // Case 1: BBSrc and BBDst are the same. Ignore this edge
173 if (SrcBB
== DstBB
|| DstBB
== *BF
.getLayout().block_begin()) {
174 LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
178 int I
= BBToClusterMap
[SrcBB
];
179 int J
= BBToClusterMap
[DstBB
];
181 // Case 2: If they are already allocated at the same cluster, just increase
182 // the weight of this cluster
185 ClusterEdges
[I
][I
] += E
.Count
;
186 LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
190 std::vector
<BinaryBasicBlock
*> &ClusterA
= Clusters
[I
];
191 std::vector
<BinaryBasicBlock
*> &ClusterB
= Clusters
[J
];
192 if (areClustersCompatible(ClusterA
, ClusterB
, E
)) {
193 // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
194 // allowing us to merge two clusters.
195 for (const BinaryBasicBlock
*BB
: ClusterB
)
196 BBToClusterMap
[BB
] = I
;
197 ClusterA
.insert(ClusterA
.end(), ClusterB
.begin(), ClusterB
.end());
200 // Increase the intra-cluster edge count of cluster A with the count of
201 // this edge as well as with the total count of previously visited edges
202 // from cluster B cluster A.
203 ClusterEdges
[I
][I
] += E
.Count
;
204 ClusterEdges
[I
][I
] += ClusterEdges
[J
][I
];
205 // Iterate through all inter-cluster edges and transfer edges targeting
206 // cluster B to cluster A.
207 for (uint32_t K
= 0, E
= ClusterEdges
.size(); K
!= E
; ++K
)
208 ClusterEdges
[K
][I
] += ClusterEdges
[K
][J
];
210 // Adjust the weights of the remaining edges and re-sort the queue.
211 adjustQueue(Queue
, BF
);
212 LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
214 // Case 4: Both SrcBB and DstBB are allocated in positions we cannot
215 // merge them. Add the count of this edge to the inter-cluster edge count
216 // between clusters A and B to help us decide ordering between these
219 ClusterEdges
[I
][J
] += E
.Count
;
221 dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
226 void GreedyClusterAlgorithm::reset() {
227 ClusterAlgorithm::reset();
228 BBToClusterMap
.clear();
231 void PHGreedyClusterAlgorithm::initQueue(std::vector
<EdgeTy
> &Queue
,
232 const BinaryFunction
&BF
) {
233 // Define a comparison function to establish SWO between edges.
234 auto Comp
= [&BF
](const EdgeTy
&A
, const EdgeTy
&B
) {
235 // With equal weights, prioritize branches with lower index
236 // source/destination. This helps to keep original block order for blocks
237 // when optimal order cannot be deducted from a profile.
238 if (A
.Count
== B
.Count
) {
239 const signed SrcOrder
= BF
.getOriginalLayoutRelativeOrder(A
.Src
, B
.Src
);
240 return (SrcOrder
!= 0)
242 : BF
.getOriginalLayoutRelativeOrder(A
.Dst
, B
.Dst
) > 0;
244 return A
.Count
< B
.Count
;
247 // Sort edges in increasing profile count order.
248 llvm::sort(Queue
, Comp
);
251 void PHGreedyClusterAlgorithm::adjustQueue(std::vector
<EdgeTy
> &Queue
,
252 const BinaryFunction
&BF
) {
256 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy
&Front
,
257 const ClusterTy
&Back
,
258 const EdgeTy
&E
) const {
259 return Front
.back() == E
.Src
&& Back
.front() == E
.Dst
;
262 int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
263 const EdgeTy
&E
, const BinaryFunction
&BF
) const {
264 const BinaryBasicBlock
*SrcBB
= E
.Src
;
265 const BinaryBasicBlock
*DstBB
= E
.Dst
;
267 // Initial weight value.
268 int64_t W
= (int64_t)E
.Count
;
270 // Adjust the weight by taking into account other edges with the same source.
271 auto BI
= SrcBB
->branch_info_begin();
272 for (const BinaryBasicBlock
*SuccBB
: SrcBB
->successors()) {
273 assert(BI
->Count
!= BinaryBasicBlock::COUNT_NO_PROFILE
&&
274 "attempted reordering blocks of function with no profile data");
275 assert(BI
->Count
<= std::numeric_limits
<int64_t>::max() &&
276 "overflow detected");
277 // Ignore edges with same source and destination, edges that target the
278 // entry block as well as the edge E itself.
279 if (SuccBB
!= SrcBB
&& SuccBB
!= *BF
.getLayout().block_begin() &&
281 W
-= (int64_t)BI
->Count
;
285 // Adjust the weight by taking into account other edges with the same
287 for (const BinaryBasicBlock
*PredBB
: DstBB
->predecessors()) {
288 // Ignore edges with same source and destination as well as the edge E
290 if (PredBB
== DstBB
|| PredBB
== SrcBB
)
292 auto BI
= PredBB
->branch_info_begin();
293 for (const BinaryBasicBlock
*SuccBB
: PredBB
->successors()) {
298 assert(BI
!= PredBB
->branch_info_end() && "invalid control flow graph");
299 assert(BI
->Count
!= BinaryBasicBlock::COUNT_NO_PROFILE
&&
300 "attempted reordering blocks of function with no profile data");
301 assert(BI
->Count
<= std::numeric_limits
<int64_t>::max() &&
302 "overflow detected");
303 W
-= (int64_t)BI
->Count
;
309 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector
<EdgeTy
> &Queue
,
310 const BinaryFunction
&BF
) {
311 // Initialize edge weights.
312 for (const EdgeTy
&E
: Queue
)
313 Weight
.emplace(std::make_pair(E
, calculateWeight(E
, BF
)));
315 // Sort edges in increasing weight order.
316 adjustQueue(Queue
, BF
);
319 void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector
<EdgeTy
> &Queue
,
320 const BinaryFunction
&BF
) {
321 // Define a comparison function to establish SWO between edges.
322 auto Comp
= [&](const EdgeTy
&A
, const EdgeTy
&B
) {
323 // With equal weights, prioritize branches with lower index
324 // source/destination. This helps to keep original block order for blocks
325 // when optimal order cannot be deduced from a profile.
326 if (Weight
[A
] == Weight
[B
]) {
327 const signed SrcOrder
= BF
.getOriginalLayoutRelativeOrder(A
.Src
, B
.Src
);
328 return (SrcOrder
!= 0)
330 : BF
.getOriginalLayoutRelativeOrder(A
.Dst
, B
.Dst
) > 0;
332 return Weight
[A
] < Weight
[B
];
335 // Iterate through all remaining edges to find edges that have their
336 // source and destination in the same cluster.
337 std::vector
<EdgeTy
> NewQueue
;
338 for (const EdgeTy
&E
: Queue
) {
339 const BinaryBasicBlock
*SrcBB
= E
.Src
;
340 const BinaryBasicBlock
*DstBB
= E
.Dst
;
342 // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
344 if (SrcBB
== DstBB
|| DstBB
== *BF
.getLayout().block_begin()) {
345 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E
.print(dbgs());
346 dbgs() << " (same src, dst)\n");
350 int I
= BBToClusterMap
[SrcBB
];
351 int J
= BBToClusterMap
[DstBB
];
352 std::vector
<BinaryBasicBlock
*> &ClusterA
= Clusters
[I
];
353 std::vector
<BinaryBasicBlock
*> &ClusterB
= Clusters
[J
];
355 // Case 2: They are already allocated at the same cluster or incompatible
356 // clusters. Adjust the weights of edges with the same source or
357 // destination, so that this edge has no effect on them any more, and ignore
358 // this edge. Also increase the intra- (or inter-) cluster edge count.
359 if (I
== J
|| !areClustersCompatible(ClusterA
, ClusterB
, E
)) {
360 if (!ClusterEdges
.empty())
361 ClusterEdges
[I
][J
] += E
.Count
;
362 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E
.print(dbgs());
363 dbgs() << " (src, dst belong to same cluster or incompatible "
365 for (const BinaryBasicBlock
*SuccBB
: SrcBB
->successors()) {
368 auto WI
= Weight
.find(EdgeTy(SrcBB
, SuccBB
, 0));
369 assert(WI
!= Weight
.end() && "CFG edge not found in Weight map");
370 WI
->second
+= (int64_t)E
.Count
;
372 for (const BinaryBasicBlock
*PredBB
: DstBB
->predecessors()) {
375 auto WI
= Weight
.find(EdgeTy(PredBB
, DstBB
, 0));
376 assert(WI
!= Weight
.end() && "CFG edge not found in Weight map");
377 WI
->second
+= (int64_t)E
.Count
;
382 // Case 3: None of the previous cases is true, so just keep this edge in
384 NewQueue
.emplace_back(E
);
387 // Sort remaining edges in increasing weight order.
388 Queue
.swap(NewQueue
);
389 llvm::sort(Queue
, Comp
);
392 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
393 const ClusterTy
&Front
, const ClusterTy
&Back
, const EdgeTy
&E
) const {
394 return Front
.back() == E
.Src
&& Back
.front() == E
.Dst
;
397 void MinBranchGreedyClusterAlgorithm::reset() {
398 GreedyClusterAlgorithm::reset();
402 void TSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction
&BF
,
403 BasicBlockOrder
&Order
) const {
404 std::vector
<std::vector
<uint64_t>> Weight
;
405 std::vector
<BinaryBasicBlock
*> IndexToBB
;
407 const size_t N
= BF
.getLayout().block_size();
408 assert(N
<= std::numeric_limits
<uint64_t>::digits
&&
409 "cannot use TSP solution for sizes larger than bits in uint64_t");
411 // Populating weight map and index map
412 for (BinaryBasicBlock
*BB
: BF
.getLayout().blocks()) {
413 BB
->setLayoutIndex(IndexToBB
.size());
414 IndexToBB
.push_back(BB
);
417 for (const BinaryBasicBlock
*BB
: BF
.getLayout().blocks()) {
418 auto BI
= BB
->branch_info_begin();
419 Weight
[BB
->getLayoutIndex()].resize(N
);
420 for (BinaryBasicBlock
*SuccBB
: BB
->successors()) {
421 if (BI
->Count
!= BinaryBasicBlock::COUNT_NO_PROFILE
)
422 Weight
[BB
->getLayoutIndex()][SuccBB
->getLayoutIndex()] = BI
->Count
;
427 std::vector
<std::vector
<int64_t>> DP
;
429 for (std::vector
<int64_t> &Elmt
: DP
)
432 // Start with the entry basic block being allocated with cost zero
434 // Walk through TSP solutions using a bitmask to represent state (current set
435 // of BBs in the layout)
436 uint64_t BestSet
= 1;
437 uint64_t BestLast
= 0;
438 int64_t BestWeight
= 0;
439 for (uint64_t Set
= 1; Set
< (1ULL << N
); ++Set
) {
440 // Traverse each possibility of Last BB visited in this layout
441 for (uint64_t Last
= 0; Last
< N
; ++Last
) {
442 // Case 1: There is no possible layout with this BB as Last
443 if (DP
[Set
][Last
] == -1)
446 // Case 2: There is a layout with this Set and this Last, and we try
447 // to expand this set with New
448 for (uint64_t New
= 1; New
< N
; ++New
) {
449 // Case 2a: BB "New" is already in this Set
450 if ((Set
& (1ULL << New
)) != 0)
453 // Case 2b: BB "New" is not in this set and we add it to this Set and
454 // record total weight of this layout with "New" as the last BB.
455 uint64_t NewSet
= (Set
| (1ULL << New
));
456 if (DP
[NewSet
][New
] == -1)
457 DP
[NewSet
][New
] = DP
[Set
][Last
] + (int64_t)Weight
[Last
][New
];
458 DP
[NewSet
][New
] = std::max(DP
[NewSet
][New
],
459 DP
[Set
][Last
] + (int64_t)Weight
[Last
][New
]);
461 if (DP
[NewSet
][New
] > BestWeight
) {
462 BestWeight
= DP
[NewSet
][New
];
470 // Define final function layout based on layout that maximizes weight
471 uint64_t Last
= BestLast
;
472 uint64_t Set
= BestSet
;
475 Visited
[Last
] = true;
476 Order
.push_back(IndexToBB
[Last
]);
477 Set
= Set
& ~(1ULL << Last
);
481 for (uint64_t I
= 0; I
< N
; ++I
) {
482 if (DP
[Set
][I
] == -1)
484 int64_t AdjWeight
= Weight
[I
][Last
] > 0 ? Weight
[I
][Last
] : 0;
485 if (DP
[Set
][I
] + AdjWeight
> Best
) {
487 Best
= DP
[Set
][I
] + AdjWeight
;
491 Visited
[Last
] = true;
492 Order
.push_back(IndexToBB
[Last
]);
493 Set
= Set
& ~(1ULL << Last
);
495 std::reverse(Order
.begin(), Order
.end());
497 // Finalize layout with BBs that weren't assigned to the layout using the
499 for (BinaryBasicBlock
*BB
: BF
.getLayout().blocks())
500 if (Visited
[BB
->getLayoutIndex()] == false)
504 void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction
&BF
,
505 BasicBlockOrder
&Order
) const {
506 if (BF
.getLayout().block_empty())
509 // Do not change layout of functions w/o profile information
510 if (!BF
.hasValidProfile() || BF
.getLayout().block_size() <= 2) {
511 for (BinaryBasicBlock
*BB
: BF
.getLayout().blocks())
516 // Create a separate MCCodeEmitter to allow lock-free execution
517 BinaryContext::IndependentCodeEmitter Emitter
;
518 if (!opts::NoThreads
)
519 Emitter
= BF
.getBinaryContext().createIndependentMCCodeEmitter();
521 // Initialize CFG nodes and their data
522 std::vector
<uint64_t> BlockSizes
;
523 std::vector
<uint64_t> BlockCounts
;
524 BasicBlockOrder OrigOrder
;
525 BF
.getLayout().updateLayoutIndices();
526 for (BinaryBasicBlock
*BB
: BF
.getLayout().blocks()) {
527 uint64_t Size
= std::max
<uint64_t>(BB
->estimateSize(Emitter
.MCE
.get()), 1);
528 BlockSizes
.push_back(Size
);
529 BlockCounts
.push_back(BB
->getKnownExecutionCount());
530 OrigOrder
.push_back(BB
);
533 // Initialize CFG edges
534 std::vector
<codelayout::EdgeCount
> JumpCounts
;
535 for (BinaryBasicBlock
*BB
: BF
.getLayout().blocks()) {
536 auto BI
= BB
->branch_info_begin();
537 for (BinaryBasicBlock
*SuccBB
: BB
->successors()) {
538 assert(BI
->Count
!= BinaryBasicBlock::COUNT_NO_PROFILE
&&
539 "missing profile for a jump");
540 JumpCounts
.push_back(
541 {BB
->getLayoutIndex(), SuccBB
->getLayoutIndex(), BI
->Count
});
546 // Run the layout algorithm
548 codelayout::computeExtTspLayout(BlockSizes
, BlockCounts
, JumpCounts
);
549 Order
.reserve(BF
.getLayout().block_size());
550 for (uint64_t R
: Result
)
551 Order
.push_back(OrigOrder
[R
]);
554 void OptimizeReorderAlgorithm::reorderBasicBlocks(
555 BinaryFunction
&BF
, BasicBlockOrder
&Order
) const {
556 if (BF
.getLayout().block_empty())
559 // Cluster basic blocks.
560 CAlgo
->clusterBasicBlocks(BF
);
562 if (opts::PrintClusters
)
563 CAlgo
->printClusters();
565 // Arrange basic blocks according to clusters.
566 for (ClusterAlgorithm::ClusterTy
&Cluster
: CAlgo
->Clusters
)
567 Order
.insert(Order
.end(), Cluster
.begin(), Cluster
.end());
570 void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
571 BinaryFunction
&BF
, BasicBlockOrder
&Order
) const {
572 if (BF
.getLayout().block_empty())
575 // Cluster basic blocks.
576 CAlgo
->clusterBasicBlocks(BF
, /* ComputeEdges = */ true);
577 std::vector
<ClusterAlgorithm::ClusterTy
> &Clusters
= CAlgo
->Clusters
;
578 std::vector
<std::unordered_map
<uint32_t, uint64_t>> &ClusterEdges
=
581 // Compute clusters' average frequencies.
582 CAlgo
->computeClusterAverageFrequency(BF
.getBinaryContext());
583 std::vector
<double> &AvgFreq
= CAlgo
->AvgFreq
;
585 if (opts::PrintClusters
)
586 CAlgo
->printClusters();
588 // Cluster layout order
589 std::vector
<uint32_t> ClusterOrder
;
591 // Do a topological sort for clusters, prioritizing frequently-executed BBs
592 // during the traversal.
593 std::stack
<uint32_t> Stack
;
594 std::vector
<uint32_t> Status
;
595 std::vector
<uint32_t> Parent
;
596 Status
.resize(Clusters
.size(), 0);
597 Parent
.resize(Clusters
.size(), 0);
598 constexpr uint32_t STACKED
= 1;
599 constexpr uint32_t VISITED
= 2;
602 while (!Stack
.empty()) {
603 uint32_t I
= Stack
.top();
604 if (!(Status
[I
] & VISITED
)) {
605 Status
[I
] |= VISITED
;
606 // Order successors by weight
607 auto ClusterComp
= [&ClusterEdges
, I
](uint32_t A
, uint32_t B
) {
608 return ClusterEdges
[I
][A
] > ClusterEdges
[I
][B
];
610 std::priority_queue
<uint32_t, std::vector
<uint32_t>,
611 decltype(ClusterComp
)>
612 SuccQueue(ClusterComp
);
613 for (std::pair
<const uint32_t, uint64_t> &Target
: ClusterEdges
[I
]) {
614 if (Target
.second
> 0 && !(Status
[Target
.first
] & STACKED
) &&
615 !Clusters
[Target
.first
].empty()) {
616 Parent
[Target
.first
] = I
;
617 Status
[Target
.first
] = STACKED
;
618 SuccQueue
.push(Target
.first
);
621 while (!SuccQueue
.empty()) {
622 Stack
.push(SuccQueue
.top());
627 // Already visited this node
629 ClusterOrder
.push_back(I
);
631 std::reverse(ClusterOrder
.begin(), ClusterOrder
.end());
632 // Put unreachable clusters at the end
633 for (uint32_t I
= 0, E
= Clusters
.size(); I
< E
; ++I
)
634 if (!(Status
[I
] & VISITED
) && !Clusters
[I
].empty())
635 ClusterOrder
.push_back(I
);
637 // Sort nodes with equal precedence
638 auto Beg
= ClusterOrder
.begin();
639 // Don't reorder the first cluster, which contains the function entry point
641 std::stable_sort(Beg
, ClusterOrder
.end(),
642 [&AvgFreq
, &Parent
](uint32_t A
, uint32_t B
) {
643 uint32_t P
= Parent
[A
];
644 while (Parent
[P
] != 0) {
650 while (Parent
[P
] != 0) {
655 return AvgFreq
[A
] > AvgFreq
[B
];
658 if (opts::PrintClusters
) {
659 errs() << "New cluster order: ";
660 const char *Sep
= "";
661 for (uint32_t O
: ClusterOrder
) {
668 // Arrange basic blocks according to cluster order.
669 for (uint32_t ClusterIndex
: ClusterOrder
) {
670 ClusterAlgorithm::ClusterTy
&Cluster
= Clusters
[ClusterIndex
];
671 Order
.insert(Order
.end(), Cluster
.begin(), Cluster
.end());
675 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
676 BinaryFunction
&BF
, BasicBlockOrder
&Order
) const {
677 if (BF
.getLayout().block_empty())
680 const uint64_t ColdThreshold
=
681 opts::ColdThreshold
*
682 (*BF
.getLayout().block_begin())->getExecutionCount() / 1000;
684 // Cluster basic blocks.
685 CAlgo
->clusterBasicBlocks(BF
);
686 std::vector
<ClusterAlgorithm::ClusterTy
> &Clusters
= CAlgo
->Clusters
;
688 // Compute clusters' average frequencies.
689 CAlgo
->computeClusterAverageFrequency(BF
.getBinaryContext());
690 std::vector
<double> &AvgFreq
= CAlgo
->AvgFreq
;
692 if (opts::PrintClusters
)
693 CAlgo
->printClusters();
695 // Cluster layout order
696 std::vector
<uint32_t> ClusterOrder
;
698 // Order clusters based on average instruction execution frequency
699 for (uint32_t I
= 0, E
= Clusters
.size(); I
< E
; ++I
)
700 if (!Clusters
[I
].empty())
701 ClusterOrder
.push_back(I
);
702 // Don't reorder the first cluster, which contains the function entry point
704 std::next(ClusterOrder
.begin()), ClusterOrder
.end(),
705 [&AvgFreq
](uint32_t A
, uint32_t B
) { return AvgFreq
[A
] > AvgFreq
[B
]; });
707 if (opts::PrintClusters
) {
708 errs() << "New cluster order: ";
709 const char *Sep
= "";
710 for (uint32_t O
: ClusterOrder
) {
717 // Arrange basic blocks according to cluster order.
718 for (uint32_t ClusterIndex
: ClusterOrder
) {
719 ClusterAlgorithm::ClusterTy
&Cluster
= Clusters
[ClusterIndex
];
720 Order
.insert(Order
.end(), Cluster
.begin(), Cluster
.end());
721 // Force zero execution count on clusters that do not meet the cut off
722 // specified by --cold-threshold.
723 if (AvgFreq
[ClusterIndex
] < static_cast<double>(ColdThreshold
))
724 for (BinaryBasicBlock
*BBPtr
: Cluster
)
725 BBPtr
->setExecutionCount(0);
729 void ReverseReorderAlgorithm::reorderBasicBlocks(BinaryFunction
&BF
,
730 BasicBlockOrder
&Order
) const {
731 if (BF
.getLayout().block_empty())
734 BinaryBasicBlock
*FirstBB
= *BF
.getLayout().block_begin();
735 Order
.push_back(FirstBB
);
736 for (auto RLI
= BF
.getLayout().block_rbegin(); *RLI
!= FirstBB
; ++RLI
)
737 Order
.push_back(*RLI
);
740 void RandomClusterReorderAlgorithm::reorderBasicBlocks(
741 BinaryFunction
&BF
, BasicBlockOrder
&Order
) const {
742 if (BF
.getLayout().block_empty())
745 // Cluster basic blocks.
746 CAlgo
->clusterBasicBlocks(BF
);
747 std::vector
<ClusterAlgorithm::ClusterTy
> &Clusters
= CAlgo
->Clusters
;
749 if (opts::PrintClusters
)
750 CAlgo
->printClusters();
752 // Cluster layout order
753 std::vector
<uint32_t> ClusterOrder
;
755 // Order clusters based on average instruction execution frequency
756 for (uint32_t I
= 0, E
= Clusters
.size(); I
< E
; ++I
)
757 if (!Clusters
[I
].empty())
758 ClusterOrder
.push_back(I
);
760 std::shuffle(std::next(ClusterOrder
.begin()), ClusterOrder
.end(),
761 std::default_random_engine(opts::RandomSeed
.getValue()));
763 if (opts::PrintClusters
) {
764 errs() << "New cluster order: ";
765 const char *Sep
= "";
766 for (uint32_t O
: ClusterOrder
) {
773 // Arrange basic blocks according to cluster order.
774 for (uint32_t ClusterIndex
: ClusterOrder
) {
775 ClusterAlgorithm::ClusterTy
&Cluster
= Clusters
[ClusterIndex
];
776 Order
.insert(Order
.end(), Cluster
.begin(), Cluster
.end());