//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"

#include <algorithm>
#include <cmath>
#include <iterator>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;
namespace {
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace
namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold)."),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "given amount of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
                          "split each function into a hot, warm, and cold "
                          "fragment using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));
static cl::opt<double> CallScale(
    "call-scale",
    cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
    cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    CallPower("call-power",
              cl::desc("Call score power (when --split-strategy=cdsplit)"),
              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    JumpPower("jump-power",
              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
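
// These three knobs parameterize the CDSplit scoring functions defined below:
// a call edge executed Count times over distance Length contributes
// CallScale * Count / (Length + 1)^CallPower to the score, and a jump edge
// contributes Count / (Length + 1)^JumpPower (see computeCallScore and
// computeJumpScore).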
} // namespace opts

namespace {
bool hasFullProfile(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  });
}

bool allBlocksCold(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  });
}
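
// Strategy for the default "profile2" mode: keep blocks with a non-zero
// execution count in the main (hot) fragment and outline never-executed
// blocks into the cold fragment.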
struct SplitProfile2 final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
      if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
    }
  }
};
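
// Strategy for the "cdsplit" mode: a cache-directed, three-way (hot / warm /
// cold) split. It keeps the existing hot-cold decision and searches for the
// best hot-warm boundary by scoring the change in call and jump distances.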
struct SplitCacheDirected final : public SplitStrategy {
  BinaryContext &BC;
  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
    initializeAuxiliaryVariables();
    buildCallGraph();
  }

  // When some functions are hot-warm split and others are hot-warm-cold split,
  // we do not want to change the fragment numbers of the blocks in the hot-warm
  // split functions.
  bool compactFragments() override { return false; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    BasicBlockOrder BlockOrder(Start, End);
    BinaryFunction &BF = *BlockOrder.front()->getFunction();
    // No need to re-split small functions.
    if (BlockOrder.size() <= 2)
      return;

    size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
    assert(BestSplitIndex < BlockOrder.size());

    // Assign fragments based on the computed best split index.
    // All basic blocks with index up to the best split index become hot.
    // All remaining blocks are warm / cold depending on whether their count is
    // greater than zero or not.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (Index <= BestSplitIndex)
        BB->setFragmentNum(FragmentNum::main());
      else
        BB->setFragmentNum(BB->getKnownExecutionCount() > 0
                               ? FragmentNum::warm()
                               : FragmentNum::cold());
    }
  }

private:
  struct CallInfo {
    size_t Length;
    size_t Count;
  };

  struct SplitScore {
    size_t SplitIndex = size_t(-1);
    size_t HotSizeReduction = 0;
    double LocalScore = 0;
    double CoverCallScore = 0;

    double sum() const { return LocalScore + CoverCallScore; }
  };
  // Auxiliary variables used by the algorithm.
  size_t TotalNumBlocks{0};
  size_t OrigHotSectionSize{0};
  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;

  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;

  bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
    // Only a subset of the functions in the binary will be considered
    // for initializing auxiliary variables and building the call graph.
    return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
  }
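
  // Walk all considered functions once and record, for every basic block, its
  // unique global index, its emitted size, and its offset in the hot section
  // under the initial hot-cold split. These values back the distance estimates
  // used by the scoring functions below.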
  void initializeAuxiliaryVariables() {
    for (BinaryFunction *BF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*BF))
        continue;

      // Calculate the size of each BB after hot-cold splitting.
      // This populates BinaryBasicBlock::OutputAddressRange which
      // can be used to compute the size of each BB.
      BC.calculateEmittedSize(*BF, /*FixBranches=*/true);

      for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
        // Unique global index.
        GlobalIndices[BB] = TotalNumBlocks;
        TotalNumBlocks++;

        // Block size after hot-cold splitting.
        BBSizes[BB] = BB->getOutputSize();

        // Hot block offset after hot-cold splitting.
        BBOffsets[BB] = OrigHotSectionSize;
        if (!BB->isSplit())
          OrigHotSectionSize += BBSizes[BB];
      }
    }
  }
  void buildCallGraph() {
    Callers.resize(TotalNumBlocks);
    Callees.resize(TotalNumBlocks);
    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*SrcFunction))
        continue;

      for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
        // Skip blocks that are not executed.
        if (SrcBB.getKnownExecutionCount() == 0)
          continue;

        // Find call instructions and extract the target symbol from each one.
        for (const MCInst &Inst : SrcBB) {
          if (!BC.MIB->isCall(Inst))
            continue;

          const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
          // Ignore calls without target information.
          if (!DstSym)
            continue;

          const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
          // Ignore calls that do not have a valid target, but do not ignore
          // recursive calls, because the caller block could be moved to warm.
          if (!DstFunction || DstFunction->getLayout().block_empty())
            continue;

          const BinaryBasicBlock *DstBB = &(DstFunction->front());

          // Record the call only if DstBB is also in the functions considered
          // for the call graph.
          if (GlobalIndices.contains(DstBB)) {
            Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
            Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
          }
        }
      }
    }
  }
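
  // After buildCallGraph(), Callers[GlobalIndices[Entry]] holds the executed
  // blocks that call the function whose entry block is Entry, and
  // Callees[GlobalIndices[SrcBB]] holds the entry blocks of the functions
  // called from SrcBB.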
  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
  /// splitting happens at \p SplitIndex. Also return estimated end addresses
  /// of the hot fragment before and after splitting.
  /// The estimations take into account the potential addition of branch
  /// instructions due to split fall through branches as well as the need to
  /// use longer branch instructions for split (un)conditional branches.
  std::pair<size_t, size_t>
  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex) {
    assert(SplitIndex < BlockOrder.size() && "Invalid split index");

    // Update the function layout assuming hot-warm splitting at SplitIndex.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
                                             : FragmentNum::warm());
    }
    BinaryFunction *BF = BlockOrder[0]->getFunction();
    BF->getLayout().update(BlockOrder);
    // Populate BB.OutputAddressRange under the updated layout.
    BC.calculateEmittedSize(*BF);

    // Populate BB.OutputAddressRange with estimated new start and end
    // addresses, and compute both the old and the new end address of the hot
    // section.
    size_t OldHotEndAddr{0};
    size_t NewHotEndAddr{0};
    size_t CurrentAddr = BBOffsets[BlockOrder[0]];
    for (BinaryBasicBlock *BB : BlockOrder) {
      // We only care about new addresses of blocks in hot/warm.
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      const size_t NewSize = BB->getOutputSize();
      BB->setOutputStartAddress(CurrentAddr);
      CurrentAddr += NewSize;
      BB->setOutputEndAddress(CurrentAddr);
      if (BB->getLayoutIndex() == SplitIndex) {
        NewHotEndAddr = CurrentAddr;
        // Approximate the start address of the warm fragment of the current
        // function using the original hot section size.
        CurrentAddr = OrigHotSectionSize;
      }
      OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
    }
    return std::make_pair(OldHotEndAddr, NewHotEndAddr);
  }
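
  // The returned pair (OldHotEndAddr, NewHotEndAddr) is what
  // computeSplitScore() turns into
  // HotSizeReduction = OldHotEndAddr - NewHotEndAddr.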
  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
  /// when the function order is [... X ... BF ... Y ...].
  /// If the hot fragment size of BF is reduced, then such calls are guaranteed
  /// to get shorter by the reduced hot fragment size.
  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
    // Record the length and the count of the calls that can be shortened.
    std::vector<CallInfo> CoverCalls;
    if (opts::CallScale == 0)
      return CoverCalls;

    const BinaryFunction *ThisBF = &BF;
    const BinaryBasicBlock *ThisBB = &(ThisBF->front());
    const size_t ThisGI = GlobalIndices[ThisBB];

    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*DstBF))
        continue;

      const BinaryBasicBlock *DstBB = &(DstBF->front());
      if (DstBB->getKnownExecutionCount() == 0)
        continue;

      const size_t DstGI = GlobalIndices[DstBB];
      for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
        const BinaryFunction *SrcBF = SrcBB->getFunction();
        if (SrcBF == ThisBF)
          continue;

        const size_t CallCount = SrcBB->getKnownExecutionCount();

        const size_t SrcGI = GlobalIndices[SrcBB];

        const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
                                 (DstGI <= ThisGI && ThisGI < SrcGI);
        if (!IsCoverCall)
          continue;

        const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
        const size_t DstBBStartAddr = BBOffsets[DstBB];
        const size_t CallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        const CallInfo CI{CallLength, CallCount};
        CoverCalls.emplace_back(CI);
      }
    }
    return CoverCalls;
  }
  /// Compute the edge score of a call edge.
  double computeCallScore(uint64_t CallCount, size_t CallLength) {
    // Increase call lengths by 1 to avoid raising 0 to a negative power.
    return opts::CallScale * static_cast<double>(CallCount) /
           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
  }
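
  // That is, score(call) = CallScale * Count / (Length + 1)^CallPower: hotter
  // and shorter calls score higher.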
  /// Compute the edge score of a jump (branch) edge.
  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
    // Increase jump lengths by 1 to avoid raising 0 to a negative power.
    return static_cast<double>(JumpCount) /
           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
  }
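
  // That is, score(jump) = Count / (Length + 1)^JumpPower; unlike calls, jump
  // edges carry no extra scale coefficient.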
  /// Compute the sum of scores over jumps within \p BlockOrder given
  /// \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeJumpScore(const BasicBlockOrder &BlockOrder,
                        const size_t SplitIndex, SplitScore &Score) {
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      if (SrcBB->getKnownExecutionCount() == 0)
        continue;

      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
        const size_t JumpCount = Branch.Count;
        if (JumpCount == 0)
          continue;

        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
        const size_t NewJumpLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
      }
    }
  }
  /// Compute the sum of scores over calls originated in the current function
  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex, SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    // Global index of the last block in the current function.
    // This is later used to determine whether a call originated in the current
    // function is to a function that comes after the current function.
    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

    // The length of calls originated in the input function can increase /
    // decrease depending on the splitting decision.
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      const size_t CallCount = SrcBB->getKnownExecutionCount();
      // If SrcBB does not call any functions, skip it.
      if (CallCount == 0 || Callees[GlobalIndices[SrcBB]].empty())
        continue;

      // Obtain an estimate on the end address of the src basic block
      // after splitting at SplitIndex.
      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
        // Obtain an estimate on the start address of the dst basic block
        // after splitting at SplitIndex. If DstBB is in a function before
        // the current function, then its start address remains unchanged.
        size_t DstBBStartAddr = BBOffsets[DstBB];
        // If DstBB is in a function after the current function, then its
        // start address should be adjusted based on the reduction in hot size.
        if (GlobalIndices[DstBB] > LastGlobalIndex) {
          assert(DstBBStartAddr >= Score.HotSizeReduction);
          DstBBStartAddr -= Score.HotSizeReduction;
        }
        const size_t NewCallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
      }
    }
  }
  /// Compute the sum of splitting scores for cover calls of the input function.
  /// Increment Score.CoverCallScore in place by the sum.
  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    for (const CallInfo CI : CoverCalls) {
      assert(CI.Length >= Score.HotSizeReduction &&
             "Length of cover calls must exceed reduced size of hot fragment.");
      // Compute the new length of the call, which is shorter than the original
      // one by the size of the split fragment minus the total size increase.
      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
    }
  }
  /// Compute the split score of splitting a function at a given index.
  /// The split score consists of a local score and a cover score. This
  /// function returns a \p Score of SplitScore type containing the local and
  /// cover scores of the current splitting index. For easier bookkeeping and
  /// comparison, it also stores the split index and the resulting reduction
  /// in hot fragment size.
  SplitScore computeSplitScore(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder,
                               const size_t SplitIndex,
                               const std::vector<CallInfo> &CoverCalls) {
    // Populate BinaryBasicBlock::OutputAddressRange with estimated
    // new start and end addresses after hot-warm splitting at SplitIndex.
    size_t OldHotEnd;
    size_t NewHotEnd;
    std::tie(OldHotEnd, NewHotEnd) =
        estimatePostSplitBBAddress(BlockOrder, SplitIndex);

    SplitScore Score;
    Score.SplitIndex = SplitIndex;

    // It's not worth splitting if OldHotEnd < NewHotEnd.
    if (OldHotEnd < NewHotEnd)
      return Score;

    // Hot fragment size reduction due to splitting.
    Score.HotSizeReduction = OldHotEnd - NewHotEnd;

    // The first part of LocalScore is the sum over call edges originated in
    // the input function. These edges can get shorter or longer depending on
    // SplitIndex. Score.LocalScore is incremented in place.
    computeLocalCallScore(BlockOrder, SplitIndex, Score);

    // The second part of LocalScore is the sum over jump edges with both the
    // src and dst basic block in the current function. Score.LocalScore is
    // incremented in place.
    computeJumpScore(BlockOrder, SplitIndex, Score);

    // Compute CoverCallScore and store it in Score in place.
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
    return Score;
  }
  /// Find the most likely successor of a basic block when it has one or two
  /// successors. Return nullptr otherwise.
  const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
    if (BB->succ_size() == 1)
      return BB->getSuccessor();
    if (BB->succ_size() == 2) {
      uint64_t TakenCount = BB->getTakenBranchInfo().Count;
      assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
      assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      if (TakenCount > NonTakenCount)
        return BB->getConditionalSuccessor(true);
      else if (TakenCount < NonTakenCount)
        return BB->getConditionalSuccessor(false);
    }
    return nullptr;
  }
  /// Find the best index for splitting. The returned value is the index of the
  /// last hot basic block. Hence, "no splitting" is equivalent to returning the
  /// value which is one less than the size of the function.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder) {
    assert(BlockOrder.size() > 2);
    // Find all function calls that can be shortened if we move blocks of the
    // current function to warm/cold.
    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

    // Find the existing hot-cold splitting index.
    size_t HotColdIndex = 0;
    while (HotColdIndex + 1 < BlockOrder.size()) {
      if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
        break;
      HotColdIndex++;
    }
    assert(HotColdIndex + 1 == BlockOrder.size() ||
           (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
            BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
                FragmentNum::cold()));

    // Try all possible split indices up to HotColdIndex (blocks that have
    // Index <= SplitIndex are in hot) and find the one maximizing the
    // split score.
    SplitScore BestScore;
    for (size_t Index = 0; Index <= HotColdIndex; Index++) {
      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
      assert(LastHotBB->getFragmentNum() != FragmentNum::cold());

      // Do not break the jump to the most likely successor.
      if (Index + 1 < BlockOrder.size() &&
          BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
        continue;

      const SplitScore Score =
          computeSplitScore(BF, BlockOrder, Index, CoverCalls);
      if (Score.sum() > BestScore.sum())
        BestScore = Score;
    }

    // If we don't find a good splitting point, fall back to the original one.
    if (BestScore.SplitIndex == size_t(-1))
      return HotColdIndex;

    return BestScore.SplitIndex;
  }
};
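
// Strategy for the "random2" mode: pick a single split point uniformly at
// random and move every block after it into the cold fragment, ignoring any
// profile data.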
struct SplitRandom2 final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // We want to split at least one block.
    const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
    std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
    const DiffT SplitPoint = Dist(Gen);
    for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
      BB->setFragmentNum(FragmentNum::cold());

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumBlocks - SplitPoint, End - Start));
  }
};
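
// Strategy for the "randomN" mode: draw a random number of split points at
// random positions and assign consecutive runs of blocks to consecutive
// fragments, again ignoring any profile data.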
struct SplitRandomN final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // With n blocks, there are n-1 places to split them.
    const DiffT MaximumSplits = NumBlocks - 1;
    // We want to generate at least two fragments if possible, but if there is
    // only one block, no splits are possible.
    const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(MaximumSplits);
    // Start the lottery at 1, because there is no meaningful split point
    // before the first block.
    std::iota(Lottery.begin(), Lottery.end(), 1u);
    std::shuffle(Lottery.begin(), Lottery.end(), Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add a one-past-the-end entry to the lottery.
    Lottery.push_back(NumBlocks);

    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0 based and cold fragments are 1 based, we can
      // use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};
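
// Strategy for the "all" mode: give every basic block its own fragment. This
// exercises the handling of many (and possibly empty) fragments.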
struct SplitAll final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override {
    // Keeping empty fragments allows us to test that empty fragments do not
    // break the splitting machinery.
    return false;
  }

  void fragment(const BlockIt Start, const BlockIt End) override {
    unsigned Fragment = 0;
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
      BB->setFragmentNum(FragmentNum(Fragment++));
  }
};
} // namespace
namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply the execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}
Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return Error::success();

  if (BC.IsLinuxKernel && BC.BOLTReserved.empty()) {
    BC.errs() << "BOLT-ERROR: split functions require reserved space in the "
                 "Linux kernel binary\n";
    exit(1);
  }

  // If the split strategy is not CDSplit, then a second run of the pass is not
  // needed after function reordering.
  if (BC.HasFinalizedFunctionOrder &&
      opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
    return Error::success();

  std::unique_ptr<SplitStrategy> Strategy;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::CDSplit:
    // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
    // before function reordering and hot-warm-cold splitting
    // (SplitCacheDirected) after function reordering.
    if (BC.HasFinalizedFunctionOrder)
      Strategy = std::make_unique<SplitCacheDirected>(BC);
    else
      Strategy = std::make_unique<SplitProfile2>();
    opts::AggressiveSplitting = true;
    BC.HasWarmSection = true;
    break;
  case SplitFunctionsStrategy::Profile2:
    Strategy = std::make_unique<SplitProfile2>();
    break;
  case SplitFunctionsStrategy::Random2:
    Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    Strategy = std::make_unique<SplitRandomN>();
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    Strategy = std::make_unique<SplitAll>();
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
              << " hot bytes from " << SplitBytesCold << " cold bytes "
              << format("(%.2lf%% of split functions is hot).\n",
                        100.0 * SplitBytesHot /
                            (SplitBytesHot + SplitBytesCold));
  return Error::success();
}
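
// Split a single function using strategy S: decide which blocks may be
// outlined, let the strategy assign fragment numbers, normalize the
// assignment, insert EH trampolines where landing pads end up in a different
// fragment, and, on x86, undo the split if it does not shrink the hot
// fragment by more than -split-threshold bytes.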
void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }
  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;

    // Do not split extra entry points in aarch64. They can be referred to by
    // ADRs, and when that happens, these blocks cannot be placed far away
    // due to the limited range of the ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }

    // Outlining blocks with dynamic branches is not supported yet.
    if (BC.IsLinuxKernel) {
      if (llvm::any_of(
              *BB, [&](MCInst &Inst) { return BC.MIB->isDynamicBranch(Inst); }))
        BB->setCanOutline(false);
    }
  }

  BF.getLayout().updateLayoutIndices();
  S.fragment(NewLayout.begin(), NewLayout.end());

  // Make sure all non-outlineable blocks are in the main fragment.
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      BB->setFragmentNum(FragmentNum::main());
  }
  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function,
    // even if they were natural to cluster formation and were seen in-between
    // hot basic blocks.
    llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
                                     const BinaryBasicBlock *const B) {
      return A->getFragmentNum() < B->getFragmentNum();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically functions with exception handling have landing pads at the
    // end. We cannot move the beginning of landing pads, but we can move
    // 0-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->getFragmentNum() < B->getFragmentNum();
                     });
  }
  // Make sure that fragments are increasing.
  FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
  for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
    if (BB->getFragmentNum() > CurrentFragment)
      BB->setFragmentNum(CurrentFragment);
    CurrentFragment = BB->getFragmentNum();
  }
  if (S.compactFragments()) {
    FragmentNum CurrentFragment = FragmentNum::main();
    FragmentNum NewFragment = FragmentNum::main();
    for (BinaryBasicBlock *const BB : NewLayout) {
      if (BB->getFragmentNum() > CurrentFragment) {
        CurrentFragment = BB->getFragmentNum();
        NewFragment = FragmentNum(NewFragment.get() + 1);
      }
      BB->setFragmentNum(NewFragment);
    }
  }

  const bool LayoutUpdated = BF.getLayout().update(NewLayout);
  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (BF.hasEHRanges() && BF.isSplit()) {
    // If all landing pads for this fragment are grouped in one (potentially
    // different) fragment, we can set LPStart to the start of that fragment
    // and avoid trampoline code.
    bool NeedsTrampolines = false;
    for (FunctionFragment &FF : BF.getLayout().fragments()) {
      // Vector of fragments that contain landing pads for this fragment.
      SmallVector<FragmentNum, 4> LandingPadFragments;
      for (const BinaryBasicBlock *BB : FF)
        for (const BinaryBasicBlock *LPB : BB->landing_pads())
          LandingPadFragments.push_back(LPB->getFragmentNum());

      // Eliminate duplicate entries from the vector.
      llvm::sort(LandingPadFragments);
      auto Last = llvm::unique(LandingPadFragments);
      LandingPadFragments.erase(Last, LandingPadFragments.end());

      if (LandingPadFragments.size() == 0) {
        // If the fragment has no landing pads, we can safely set itself as its
        // landing pad fragment.
        BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum());
      } else if (LandingPadFragments.size() == 1) {
        BF.setLPFragment(FF.getFragmentNum(), LandingPadFragments.front());
      } else {
        if (!BC.HasFixedLoadAddress) {
          NeedsTrampolines = true;
          break;
        }
        BF.setLPFragment(FF.getFragmentNum(), std::nullopt);
      }
    }

    // Trampolines guarantee that all landing pads for any given fragment will
    // be contained in the same fragment.
    if (NeedsTrampolines) {
      for (FunctionFragment &FF : BF.getLayout().fragments())
        BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum());
      Trampolines = createEHTrampolines(BF);
    }
  }
  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && LayoutUpdated) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      if (opts::Verbosity >= 2) {
        BC.outs() << "BOLT-INFO: Reversing splitting of function "
                  << formatv("{0}:\n  {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
                             ColdSize, OriginalHotSize);
      }

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination, resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }

  // Restore the LP fragment for the main fragment if the splitting was undone.
  if (BF.hasEHRanges() && !BF.isSplit())
    BF.setLPFragment(FragmentNum::main(), FragmentNum::main());
  // Fix branches if the splitting decision of the pass after function
  // reordering is different from that of the pass before function reordering.
  if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
    BF.fixBranches();
}
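
// For every invoke whose landing pad ends up in a different fragment, create a
// trampoline block in the throwing fragment that branches to the real landing
// pad, and retarget the invoke's EH info at the trampoline.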
SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks since we are adding new blocks to
  // the function, which would invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of their corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update the exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}
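
// Undo createEHTrampolines() when a split is reverted: re-insert each
// trampoline block immediately before its real landing pad in the pre-split
// layout so the extra jump can be folded away.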
SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm