//===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements multiple passes for binary optimization and analysis.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "llvm/Support/CommandLine.h"

#include <atomic>
#include <mutex>
#include <numeric>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;
static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
  assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
         C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");

  static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];

  OptNames[C] = bolt::DynoStats::Description(C);

  std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');

  return OptNames[C].c_str();
}
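
// For example, dynoStatsOptName() turns the DynoStats description
// "taken branches" into the option value "taken-branches" accepted by
// -print-sorted-by below.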
namespace opts {

extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
static cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default='.stub,.mover')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated, cl::ZeroOrMore,
  cl::cat(BoltCategory));

bool isHotTextMover(const BinaryFunction &Function) {
  for (std::string &SectionName : opts::HotTextMoveSections) {
    if (Function.getOriginSectionName() &&
        *Function.getOriginSectionName() == SectionName)
      return true;
  }

  return false;
}
static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));
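
// Individual peepholes combine on the command line, e.g.
// -peepholes=double-jumps,useless-branches, while -peepholes=all enables
// every one of them.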
static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(0), cl::cat(BoltOptCategory));

static cl::list<bolt::DynoStats::Category>
    PrintSortedBy("print-sorted-by", cl::CommaSeparated,
                  cl::desc("print functions sorted by order of dyno stats"),
                  cl::value_desc("key1,key2,key3,..."),
                  cl::values(
#define D(name, description, ...)                                             \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),  \
             description),
                      REAL_DYNO_STATS
#undef D
                          clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                                     "sorted by all names")),
                  cl::ZeroOrMore, cl::cat(BoltOptCategory));
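
// For example, -print-sorted-by=taken-branches ranks functions by that single
// dyno stat, while -print-sorted-by=all orders them by the combined
// comparison of all stats; -print-sorted-by-order above picks the direction.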
static cl::opt<bool>
PrintUnknown("print-unknown",
  cl::desc("print names of functions with unknown control flow"),
  cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
PrintUnknownCFG("print-unknown-cfg",
  cl::desc("dump CFG of functions with unknown control flow"),
  cl::cat(BoltCategory), cl::ReallyHidden);
// Please MSVC19 with a forward declaration: otherwise it reports an error
// about an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
ReportStaleFuncs("report-stale",
  cl::desc("print the list of functions with stale profile"),
  cl::Hidden, cl::cat(BoltOptCategory));
enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection, "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic, "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));
static cl::opt<unsigned>
StaleThreshold("stale-threshold",
    cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
    cl::init(100),
    cl::Hidden,
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

} // namespace opts

namespace llvm {
namespace bolt {
bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
  return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
         !BF.isIgnored();
}

bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
  return BF.isSimple() && !BF.isIgnored();
}
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      if (Predecessor->hasJumpTable())
        break;

      if (Predecessor == Successor)
        break;

      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}
void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { runOnFunction(BF); },
      [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
      "NormalizeCFG");
  if (NumBlocksRemoved)
    outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
           << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
  if (NumDuplicateEdgesMerged)
    outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
           << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
           << '\n';
}
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  if (!Function.getLayout().block_empty()) {
    unsigned Count;
    uint64_t Bytes;
    Function.markUnreachableBlocks();
    LLVM_DEBUG(for (BinaryBasicBlock &BB : Function) {
      if (!BB.isValid())
        dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
               << " in function " << Function << "\n";
    });
    std::tie(Count, Bytes) = Function.eraseInvalidBBs();
    DeletedBlocks += Count;
    DeletedBytes += Bytes;
    if (Count) {
      Modified.insert(&Function);
      if (opts::Verbosity > 0)
        outs() << "BOLT-INFO: removed " << Count
               << " dead basic block(s) accounting for " << Bytes
               << " bytes in function " << Function << '\n';
    }
  }
}
void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function))
      runOnFunction(Function);
  }

  if (DeletedBlocks)
    outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
           << DeletedBytes << " bytes of code\n";
}
bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
  return (BinaryFunctionPass::shouldPrint(BF) &&
          opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
}

bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}
void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return;

  std::atomic_uint64_t ModifiedFuncCount(0);
  std::mutex FunctionEditDistanceMutex;
  DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;

  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
    if (opts::PrintFuncStat > 0)
      llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));

    const bool LayoutChanged =
        modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
    if (LayoutChanged) {
      ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
      if (opts::PrintFuncStat > 0) {
        const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
        std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
        FunctionEditDistance[&BF] = Distance;
      }
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "ReorderBasicBlocks");
  const size_t NumAllProfiledFunctions =
      BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;

  outs() << "BOLT-INFO: basic block reordering modified layout of "
         << format("%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
                   ModifiedFuncCount.load(std::memory_order_relaxed),
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       NumAllProfiledFunctions,
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       BC.getBinaryFunctions().size());

  if (opts::PrintFuncStat > 0) {
    raw_ostream &OS = outs();
    // Copy all the values into a map keyed by function score so we can print
    // them in sorted order.
    std::map<uint64_t, BinaryFunction &> ScoreMap;
    auto &BFs = BC.getBinaryFunctions();
    for (auto It = BFs.begin(); It != BFs.end(); ++It)
      ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
          It->second.getFunctionScore(), It->second));

    OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
    OS << "           There are " << BFs.size() << " functions in total. \n";
    OS << "           Number of functions being modified: "
       << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
    OS << "           User asks for detailed information on top "
       << opts::PrintFuncStat << " functions. (Ranked by function score)"
       << "\n\n";
    uint64_t I = 0;
    for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
             ScoreMap.rbegin();
         Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
      BinaryFunction &Function = Rit->second;

      OS << "           Information for function of top: " << (I + 1) << ": \n";
      OS << "             Function Score is: " << Function.getFunctionScore()
         << "\n";
      OS << "             There are " << Function.size()
         << " number of blocks in this function.\n";
      OS << "             There are " << Function.getInstructionCount()
         << " number of instructions in this function.\n";
      OS << "             The edit distance for this function is: "
         << FunctionEditDistance.lookup(&Function) << "\n\n";
    }
  }
}
bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  if (BF.size() == 0 || Type == LT_NONE)
    return false;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
  if (Type != LT_REVERSE && !BF.hasValidProfile())
    return false;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough.
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    default:
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  return BF.getLayout().update(NewLayout);
}
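
// Note on the selection above: when a profile is present, any layout other
// than "reverse" or "cluster-shuffle" uses the exact TSP solver for functions
// with at most -tsp-threshold (default 10) basic blocks; larger functions
// fall back to the greedy clustering heuristics.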
void FixupBranches::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (!BC.shouldEmit(Function) || !Function.isSimple())
      continue;

    Function.fixBranches();
  }
}
void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      if (BC.HasRelocations) {
        errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
               << ". Exiting.\n";
        exit(1);
      }
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
}
void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
  if (BC.HasRelocations)
    return;

  if (!opts::UpdateDebugSections)
    return;

  // If the function wouldn't fit, mark it as non-simple. Otherwise, we may
  // emit incorrect debug info.
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    uint64_t HotSize, ColdSize;
    std::tie(HotSize, ColdSize) =
        BC.calculateEmittedSize(BF, /*FixBranches=*/false);
    if (HotSize > BF.getMaxSize())
      BF.setSimple(false);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "CheckLargeFunctions");
}
bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Unlike other passes, allow functions in non-CFG state.
  return BF.isSimple() && !BF.isIgnored();
}
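
// LowerAnnotations converts the in-memory annotations BOLT attaches to
// MCInsts into their final form before emission: GnuArgsSize annotations on
// invoke instructions become DW_CFA_GNU_args_size CFI instructions inserted
// ahead of the invoke, offset and label annotations still needed later are
// saved and re-attached, and all remaining annotations are stripped.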
void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
  std::vector<std::pair<MCInst *, MCSymbol *>> PreservedLabelAnnotations;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &BF = It.second;

    for (FunctionFragment &FF : BF.getLayout().fragments()) {
      int64_t CurrentGnuArgsSize = 0;

      for (BinaryBasicBlock *const BB : FF) {
        // First convert GnuArgsSize annotations into CFIs. This may change
        // instr pointers, so do it before recording ptrs for preserved
        // annotations.
        if (BF.usesGnuArgsSize()) {
          for (auto II = BB->begin(); II != BB->end(); ++II) {
            if (!BC.MIB->isInvoke(*II))
              continue;

            const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
            assert(NewGnuArgsSize >= 0 &&
                   "expected non-negative GNU_args_size");
            if (NewGnuArgsSize != CurrentGnuArgsSize) {
              auto InsertII = BF.addCFIInstruction(
                  BB, II,
                  MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
              CurrentGnuArgsSize = NewGnuArgsSize;
              II = std::next(InsertII);
            }
          }
        }

        // Now record preserved annotations separately and then strip
        // annotations.
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II))
            PreservedOffsetAnnotations.emplace_back(&(*II),
                                                    *BC.MIB->getOffset(*II));
          if (auto Label = BC.MIB->getLabel(*II))
            PreservedLabelAnnotations.emplace_back(&*II, *Label);
          BC.MIB->stripAnnotations(*II);
        }
      }
    }
  }

  for (BinaryFunction *BF : BC.getInjectedBinaryFunctions())
    for (BinaryBasicBlock &BB : *BF)
      for (MCInst &Instruction : BB) {
        if (auto Label = BC.MIB->getLabel(Instruction))
          PreservedLabelAnnotations.emplace_back(&Instruction, *Label);
        BC.MIB->stripAnnotations(Instruction);
      }

  // Release all memory taken by annotations.
  BC.MIB->freeAnnotations();

  // Reinsert preserved annotations we need during code emission.
  for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
    BC.MIB->setOffset(*Item.first, Item.second);
  for (auto [Instr, Label] : PreservedLabelAnnotations)
    BC.MIB->setLabel(*Instr, Label);
}
// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
void CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.second;
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
}
// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
// B0: ...
//     jmp  B1   (or jcc B1)
//
// B1: jmp  B2
//
// ->
//
// B0: ...
//     jmp  B2   (or jcc B2)
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}
bool SimplifyConditionalTailCalls::shouldRewriteBranch(
    const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
    const BinaryBasicBlock *BB, const bool DirectionFlag) {
  if (BeenOptimized.count(PredBB))
    return false;

  const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);

  if (IsForward)
    ++NumOrigForwardBranches;
  else
    ++NumOrigBackwardBranches;

  if (opts::SctcMode == opts::SctcAlways)
    return true;

  if (opts::SctcMode == opts::SctcPreserveDirection)
    return IsForward == DirectionFlag;

  const ErrorOr<std::pair<double, double>> Frequency =
      PredBB->getBranchStats(BB);

  // It's ok to rewrite the conditional branch if the new target will be
  // a backward branch.

  // If no data is available for these branches, then it should be ok to
  // do the optimization since it will reduce code size.
  if (Frequency.getError())
    return true;

  // TODO: should this use misprediction frequency instead?
  const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
                      (!IsForward && Frequency.get().first <= 0.5);

  return Result == DirectionFlag;
}
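
// A sketch of the SCTC rewrite performed below (schematic x86 assembly).
// When a block consists of a single unconditional tail call and a
// predecessor reaches it through a conditional branch, the branch itself
// becomes a conditional tail call:
//
//   before:                          after:
//     jcc  .LBB                        jcc  foo    # conditional tail call
//     ...                              ...
//   .LBB: jmp foo   # tail call      (.LBB removed if no other predecessors)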
uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions,
      // e.g. jrcxz.
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid. We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add an unconditional
        // branch to the old target. This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(PredBB, CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc.
      MIB->setConditionalTailCall(*CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry.
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(BB);
      // Update BB execution count.
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block. Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(PredBB, false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    if (UncondBranch) {
      if (HasFallthrough)
        PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
      else
        MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(BF, true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n";);

  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}
void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
  if (BC.isAArch64())
    return;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;

    if (!shouldOptimize(Function))
      continue;

    if (fixTailCalls(Function)) {
      Modified.insert(&Function);
      Function.setHasCanonicalCFG(false);
    }
  }

  if (NumTailCallsPatched)
    outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
           << " tail calls (" << NumOrigForwardBranches << " forward)"
           << " tail calls (" << NumOrigBackwardBranches << " backward)"
           << " from a total of " << NumCandidateTailCalls << " while removing "
           << NumDoubleJumps << " double jumps"
           << " and removing " << DeletedBlocks << " basic blocks"
           << " totalling " << DeletedBytes
           << " bytes of code. CTCs total execution count is " << CTCExecCount
           << " and the number of times CTCs are taken is " << CTCTakenCount
           << "\n";
}
uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
  uint64_t Count = 0;
  const BinaryContext &BC = Function.getBinaryContext();
  for (BinaryBasicBlock &BB : Function) {
    for (MCInst &Inst : BB) {
      MCInst OriginalInst;
      if (opts::Verbosity > 2)
        OriginalInst = Inst;

      if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
        continue;

      if (opts::Verbosity > 2) {
        outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
        BC.printInstruction(outs(), OriginalInst, 0, &Function);
        outs() << "BOLT-INFO: to:";
        BC.printInstruction(outs(), Inst, 0, &Function);
      }

      ++Count;
    }
  }

  return Count;
}
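
// On x86, shortenInstruction() typically rewrites an instruction encoding a
// small constant in a wide immediate field into its sign-extended 8-bit form,
// e.g. "add rax, 1" drops from an imm32 to an imm8 encoding (the exact byte
// savings here are illustrative and depend on the instruction).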
void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
  std::atomic<uint64_t> NumShortened{0};
  if (!BC.isX86())
    return;

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
      [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
      nullptr, "ShortenInstructions");

  if (NumShortened)
    outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
}
void Peepholes::addTailcallTraps(BinaryFunction &Function) {
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    MCInst *Inst = BB.getLastNonPseudoInstr();
    if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
      MCInst Trap;
      if (MIB->createTrap(Trap)) {
        BB.addInstruction(Trap);
        ++TailCallTraps;
      }
    }
  }
}
void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
  for (BinaryBasicBlock &BB : Function) {
    if (BB.succ_size() != 2)
      continue;

    BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
    BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
    if (CondBB != UncondBB)
      continue;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // analyzeBranch() can fail due to unusual branch instructions,
    // e.g. jrcxz, or jump tables (indirect jump).
    if (!Result || !CondBranch)
      continue;

    BB.removeDuplicateConditionalSuccessor(CondBranch);
    ++NumUselessCondBranches;
  }
}
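
// A useless conditional branch is one whose taken and fallthrough edges lead
// to the same block (schematic x86 assembly):
//
//     je   .Lnext     # both CFG edges point at .Lnext
//   .Lnext:
//     ...
//
// Removing the duplicate successor edge lets the redundant jcc be dropped.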
void Peepholes::runOnFunctions(BinaryContext &BC) {
  const char Opts =
      std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
                      [](const char A, const PeepholeOpts B) { return A | B; });
  if (Opts == PEEP_NONE)
    return;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function)) {
      if (Opts & PEEP_DOUBLE_JUMPS)
        NumDoubleJumps += fixDoubleJumps(Function, false);
      if (Opts & PEEP_TAILCALL_TRAPS)
        addTailcallTraps(Function);
      if (Opts & PEEP_USELESS_BRANCHES)
        removeUselessCondBranches(Function);
      assert(Function.validateCFG());
    }
  }
  outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
         << " double jumps patched.\n"
         << "BOLT-INFO: Peephole: " << TailCallTraps
         << " tail call traps inserted.\n"
         << "BOLT-INFO: Peephole: " << NumUselessCondBranches
         << " useless conditional branches removed.\n";
}
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address.
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;

        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      // Do not simplify loads from locations that can be patched by
      // relocations at link or run time.
      if (BC.getRelocationAt(TargetAddress) ||
          BC.getDynamicRelocationAt(TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}
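
// A sketch of the rewrite above (schematic x86 assembly), assuming the loaded
// address falls into a read-only section with no relocation against it:
//
//   mov rax, [rip + ro_constant]   ->   mov rax, 42
//
// where 42 stands for the value read directly from the section contents.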
void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function) && simplifyRODataLoads(Function))
      Modified.insert(&Function);
  }

  outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
         << NumLoadsFound << " loads from a statically computed address.\n"
         << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
         << "\n"
         << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
}
void AssignSections::runOnFunctions(BinaryContext &BC) {
  for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
    Function->setCodeSectionName(BC.getInjectedCodeSectionName());
    Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
  }

  // In non-relocation mode functions have pre-assigned section names.
  if (!BC.HasRelocations)
    return;

  const bool UseColdSection =
      BC.NumProfiledFuncs > 0 ||
      opts::ReorderFunctions == ReorderFunctions::RT_USER;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (opts::isHotTextMover(Function)) {
      Function.setCodeSectionName(BC.getHotTextMoverSectionName());
      Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
      continue;
    }

    if (!UseColdSection || Function.hasValidIndex())
      Function.setCodeSectionName(BC.getMainCodeSectionName());
    else
      Function.setCodeSectionName(BC.getColdCodeSectionName());

    if (Function.isSplit())
      Function.setColdCodeSectionName(BC.getColdCodeSectionName());
  }
}
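
// The pass below measures how self-consistent the input profile is: for every
// block with enough samples it computes the relative flow imbalance
//
//   bias(BB) = |sum(outgoing counts) - sum(incoming counts)| / sum(incoming)
//
// and reports the mean and standard deviation over all considered blocks. A
// perfectly consistent profile scores 0%.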
void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute the mean.
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute the score for low frequency blocks, entry or exit
      // blocks.
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute the standard deviation.
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report the score to the user.
  outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                   (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
           << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
}
void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumRegularFunctions = 0;
  uint64_t NumStaleProfileFunctions = 0;
  uint64_t NumAllStaleFunctions = 0;
  uint64_t NumInferredFunctions = 0;
  uint64_t NumNonSimpleProfiledFunctions = 0;
  uint64_t NumUnknownControlFlowFunctions = 0;
  uint64_t TotalSampleCount = 0;
  uint64_t StaleSampleCount = 0;
  uint64_t InferredSampleCount = 0;
  std::vector<const BinaryFunction *> ProfiledFunctions;
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;

    // Ignore PLT functions for stats.
    if (Function.isPLTFunction())
      continue;

    ++NumRegularFunctions;

    if (!Function.isSimple()) {
      if (Function.hasProfile())
        ++NumNonSimpleProfiledFunctions;
      continue;
    }

    if (Function.hasUnknownControlFlow()) {
      if (opts::PrintUnknownCFG)
        Function.dump();
      else if (opts::PrintUnknown)
        errs() << "function with unknown control flow: " << Function << '\n';

      ++NumUnknownControlFlowFunctions;
    }

    if (!Function.hasProfile())
      continue;

    uint64_t SampleCount = Function.getRawBranchCount();
    TotalSampleCount += SampleCount;

    if (Function.hasValidProfile()) {
      ProfiledFunctions.push_back(&Function);
      if (Function.hasInferredProfile()) {
        ++NumInferredFunctions;
        InferredSampleCount += SampleCount;
        ++NumAllStaleFunctions;
      }
    } else {
      if (opts::ReportStaleFuncs) {
        outs() << StaleFuncsHeader;
        StaleFuncsHeader = "";
        outs() << "  " << Function << '\n';
      }
      ++NumStaleProfileFunctions;
      StaleSampleCount += SampleCount;
      ++NumAllStaleFunctions;
    }
  }
  BC.NumProfiledFuncs = ProfiledFunctions.size();
  BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
  const size_t NumAllProfiledFunctions =
      ProfiledFunctions.size() + NumStaleProfileFunctions;
  outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
         << NumRegularFunctions << " functions in the binary ("
         << format("%.1f", NumAllProfiledFunctions /
                               (float)NumRegularFunctions * 100.0f)
         << "%) have non-empty execution profile\n";
  if (NumNonSimpleProfiledFunctions) {
    outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
           << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
           << " with profile could not be optimized\n";
  }
  if (NumStaleProfileFunctions) {
    const float PctStale =
        NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f;
    auto printErrorOrWarning = [&]() {
      if (PctStale > opts::StaleThreshold)
        errs() << "BOLT-ERROR: ";
      else
        errs() << "BOLT-WARNING: ";
    };
    printErrorOrWarning();
    errs() << NumStaleProfileFunctions
           << format(" (%.1f%% of all profiled)", PctStale) << " function"
           << (NumStaleProfileFunctions == 1 ? "" : "s")
           << " have invalid (possibly stale) profile."
              " Use -report-stale to see the list.\n";
    if (TotalSampleCount > 0) {
      printErrorOrWarning();
      errs() << StaleSampleCount << " out of " << TotalSampleCount
             << " samples in the binary ("
             << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount))
             << "%) belong to functions with invalid"
                " (possibly stale) profile.\n";
    }
    if (PctStale > opts::StaleThreshold) {
      errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
             << opts::StaleThreshold << "%. Exiting.\n";
      exit(1);
    }
  }
  if (NumInferredFunctions) {
    outs() << format("BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
                     "%.2f%% of stale) functions responsible for %.2f%% samples"
                     " (%zu out of %zu)\n",
                     NumInferredFunctions,
                     100.0 * NumInferredFunctions / NumAllProfiledFunctions,
                     100.0 * NumInferredFunctions / NumAllStaleFunctions,
                     100.0 * InferredSampleCount / TotalSampleCount,
                     InferredSampleCount, TotalSampleCount);
    outs() << format(
        "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
        " (%zu out of %zu stale) responsible for %.2f%% samples"
        " (%zu out of %zu stale)\n",
        100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
        BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
        100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
        BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
  }
  if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
    outs() << "BOLT-INFO: profile for " << NumUnusedObjects
           << " objects was ignored\n";
  }

  if (ProfiledFunctions.size() > 10) {
    if (opts::Verbosity >= 1) {
      outs() << "BOLT-INFO: top called functions are:\n";
      llvm::sort(ProfiledFunctions,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return B->getExecutionCount() < A->getExecutionCount();
                 });
      auto SFI = ProfiledFunctions.begin();
      auto SFIend = ProfiledFunctions.end();
      for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
           ++SFI, ++I)
        outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
    }
  }
  if (!opts::PrintSortedBy.empty()) {
    std::vector<BinaryFunction *> Functions;
    std::map<const BinaryFunction *, DynoStats> Stats;

    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (shouldOptimize(BF) && BF.hasValidProfile()) {
        Functions.push_back(&BF);
        Stats.emplace(&BF, getDynoStats(BF));
      }
    }

    const bool SortAll =
        llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);

    const bool Ascending =
        opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;

    if (SortAll) {
      llvm::stable_sort(Functions,
                        [Ascending, &Stats](const BinaryFunction *A,
                                            const BinaryFunction *B) {
                          return Ascending ? Stats.at(A) < Stats.at(B)
                                           : Stats.at(B) < Stats.at(A);
                        });
    } else {
      llvm::stable_sort(
          Functions, [Ascending, &Stats](const BinaryFunction *A,
                                         const BinaryFunction *B) {
            const DynoStats &StatsA = Stats.at(A);
            const DynoStats &StatsB = Stats.at(B);
            return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
                             : StatsB.lessThan(StatsA, opts::PrintSortedBy);
          });
    }

    outs() << "BOLT-INFO: top functions sorted by ";
    if (SortAll) {
      outs() << "dyno stats";
    } else {
      outs() << "(";
      bool PrintComma = false;
      for (const DynoStats::Category Category : opts::PrintSortedBy) {
        if (PrintComma)
          outs() << ", ";
        outs() << DynoStats::Description(Category);
        PrintComma = true;
      }
      outs() << ")";
    }

    outs() << " are:\n";
    auto SFI = Functions.begin();
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
      const DynoStats Stats = getDynoStats(**SFI);
      outs() << "  " << **SFI;
      if (!SortAll) {
        outs() << " (";
        bool PrintComma = false;
        for (const DynoStats::Category Category : opts::PrintSortedBy) {
          if (PrintComma)
            outs() << ", ";
          outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
          PrintComma = true;
        }
        outs() << ")";
      }
      outs() << "\n";
    }
  }
  if (!BC.TrappedFunctions.empty()) {
    errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
           << (BC.TrappedFunctions.size() > 1 ? "s" : "")
           << " will trap on entry. Use -trap-avx512=0 to disable"
              " traps.";
    if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
      errs() << '\n';
      for (const BinaryFunction *Function : BC.TrappedFunctions)
        errs() << "  " << *Function << '\n';
    } else {
      errs() << " Use -v=1 to see the list.\n";
    }
  }

  // Print information on missed macro-fusion opportunities seen on input.
  if (BC.Stats.MissedMacroFusionPairs) {
    outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)"
                     " opportunities for macro-fusion optimization",
                     BC.Stats.MissedMacroFusionPairs,
                     BC.Stats.MissedMacroFusionExecCount);
    switch (opts::AlignMacroOpFusion) {
    case MFT_NONE:
      outs() << ". Use -align-macro-fusion to fix.\n";
      break;
    case MFT_HOT:
      outs() << ". Will fix instances on a hot path.\n";
      break;
    case MFT_ALL:
      outs() << " that are going to be fixed\n";
      break;
    }
  }
  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(&BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      llvm::sort(SuboptimalFuncs,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });

      outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
             << " functions have "
                "cold code in the middle of hot code. Top functions are:\n";
      for (unsigned I = 0;
           I < std::min(static_cast<size_t>(opts::ReportBadLayout),
                        SuboptimalFuncs.size());
           ++I)
        SuboptimalFuncs[I]->print(outs());
    }
  }

  if (NumUnknownControlFlowFunctions) {
    outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
           << " functions have instructions with unknown control flow";
    if (!opts::PrintUnknown)
      outs() << ". Use -print-unknown to see the list.";
    outs() << '\n';
  }
}
void InstructionLowering::runOnFunctions(BinaryContext &BC) {
  for (auto &BFI : BC.getBinaryFunctions())
    for (BinaryBasicBlock &BB : BFI.second)
      for (MCInst &Instruction : BB)
        BC.MIB->lowerTailCall(Instruction);
}
void StripRepRet::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumPrefixesRemoved = 0;
  uint64_t NumBytesSaved = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      auto LastInstRIter = BB.getLastNonPseudo();
      if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
          !BC.MIB->deleteREPPrefix(*LastInstRIter))
        continue;

      NumPrefixesRemoved += BB.getKnownExecutionCount();
      ++NumBytesSaved;
    }
  }

  if (NumBytesSaved)
    outs() << "BOLT-INFO: removed " << NumBytesSaved
           << " 'repz' prefixes"
              " with estimated execution count of "
           << NumPrefixesRemoved << " times.\n";
}
void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumInlined = 0;
  uint64_t NumInlinedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8")
          continue;

        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(IsMemcpy8);
        II = BB.replaceInstruction(II, NewCode);
        std::advance(II, NewCode.size() - 1);
        if (IsTailCall) {
          MCInst Return;
          BC.MIB->createReturn(Return);
          II = BB.insertInstruction(std::next(II), std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
    if (NumInlinedDyno)
      outs() << ". The calls were executed " << NumInlinedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}
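
// On x86, createInlineMemcpy() is expected to expand the call into a short
// "rep movsb"-style copy using the argument registers that already hold dest,
// src, and size (this is a sketch; see the target's MCPlusBuilder for the
// exact sequence).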
bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
  if (!BinaryFunctionPass::shouldOptimize(Function))
    return false;

  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
    if (Function.hasNameRegex(FunctionName))
      return true;
  }

  return false;
}

std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
    const BinaryFunction &Function) const {
  StringRef SitesString;
  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName;
    std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
    if (Function.hasNameRegex(FunctionName))
      break;
    SitesString = "";
  }

  std::set<size_t> Sites;
  SmallVector<StringRef, 4> SitesVec;
  SitesString.split(SitesVec, ':');
  for (StringRef SiteString : SitesVec) {
    if (SiteString.empty())
      continue;
    size_t Result;
    if (!SiteString.getAsInteger(10, Result))
      Sites.emplace(Result);
  }

  return Sites;
}
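
// The specs parsed above take the form "<function>[:<site>[:<site>...]]".
// For example, a (hypothetical) spec "main:1:3" limits the transformation to
// the first and third memcpy() call sites in main; a bare function name makes
// every call site in that function eligible.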
void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        if (BC.MIB->isTailCall(Inst))
          continue;

        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
                                OneByteMemcpyBB->getLabel(), BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    outs() << "BOLT-INFO: specialized " << NumSpecialized
           << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      outs() << ". The calls were executed " << NumSpecializedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}
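
// The control flow produced for a specialized call site is roughly
// (schematic x86 assembly; rdx holds the third, i.e. size, argument):
//
//     cmp  rdx, 1
//     je   .Lone_byte       # size == 1: copy a single byte inline
//     call memcpy           # otherwise take the original call
//     jmp  .Lnext
//   .Lone_byte:
//     ... one-byte copy ...
//   .Lnext: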
void RemoveNops::runOnFunction(BinaryFunction &BF) {
  const BinaryContext &BC = BF.getBinaryContext();
  for (BinaryBasicBlock &BB : BF) {
    for (int64_t I = BB.size() - 1; I >= 0; --I) {
      MCInst &Inst = BB.getInstructionAtIndex(I);
      if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
        BB.eraseInstructionAtIndex(I);
    }
  }
}
void RemoveNops::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    runOnFunction(BF);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return BF.shouldPreserveNops();
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "RemoveNops");
}

} // namespace bolt
} // namespace llvm