1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements multiple passes for binary optimization and analysis.
11 //===----------------------------------------------------------------------===//
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/FunctionLayout.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/ReorderAlgorithm.h"
17 #include "bolt/Passes/ReorderFunctions.h"
18 #include "llvm/Support/CommandLine.h"
24 #define DEBUG_TYPE "bolt-opts"
29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C
) {
30 assert(C
> bolt::DynoStats::FIRST_DYNO_STAT
&&
31 C
< DynoStats::LAST_DYNO_STAT
&& "Unexpected dyno stat category.");
33 static std::string OptNames
[bolt::DynoStats::LAST_DYNO_STAT
+ 1];
35 OptNames
[C
] = bolt::DynoStats::Description(C
);
37 std::replace(OptNames
[C
].begin(), OptNames
[C
].end(), ' ', '-');
39 return OptNames
[C
].c_str();
44 extern cl::OptionCategory BoltCategory
;
45 extern cl::OptionCategory BoltOptCategory
;
47 extern cl::opt
<unsigned> Verbosity
;
48 extern cl::opt
<bool> EnableBAT
;
49 extern cl::opt
<unsigned> ExecutionCountThreshold
;
50 extern cl::opt
<bool> UpdateDebugSections
;
51 extern cl::opt
<bolt::ReorderFunctions::ReorderType
> ReorderFunctions
;
53 enum DynoStatsSortOrder
: char {
58 static cl::opt
<DynoStatsSortOrder
> DynoStatsSortOrderOpt(
59 "print-sorted-by-order",
60 cl::desc("use ascending or descending order when printing functions "
61 "ordered by dyno stats"),
62 cl::init(DynoStatsSortOrder::Descending
), cl::cat(BoltOptCategory
));
65 HotTextMoveSections("hot-text-move-sections",
66 cl::desc("list of sections containing functions used for hugifying hot text. "
67 "BOLT makes sure these functions are not placed on the same page as "
68 "the hot text. (default=\'.stub,.mover\')."),
69 cl::value_desc("sec1,sec2,sec3,..."),
72 cl::cat(BoltCategory
));
74 bool isHotTextMover(const BinaryFunction
&Function
) {
75 for (std::string
&SectionName
: opts::HotTextMoveSections
) {
76 if (Function
.getOriginSectionName() &&
77 *Function
.getOriginSectionName() == SectionName
)
84 static cl::opt
<bool> MinBranchClusters(
85 "min-branch-clusters",
86 cl::desc("use a modified clustering algorithm geared towards minimizing "
88 cl::Hidden
, cl::cat(BoltOptCategory
));
90 static cl::list
<Peepholes::PeepholeOpts
> Peepholes(
91 "peepholes", cl::CommaSeparated
, cl::desc("enable peephole optimizations"),
92 cl::value_desc("opt1,opt2,opt3,..."),
93 cl::values(clEnumValN(Peepholes::PEEP_NONE
, "none", "disable peepholes"),
94 clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS
, "double-jumps",
95 "remove double jumps when able"),
96 clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS
, "tailcall-traps",
97 "insert tail call traps"),
98 clEnumValN(Peepholes::PEEP_USELESS_BRANCHES
, "useless-branches",
99 "remove useless conditional branches"),
100 clEnumValN(Peepholes::PEEP_ALL
, "all",
101 "enable all peephole optimizations")),
102 cl::ZeroOrMore
, cl::cat(BoltOptCategory
));
104 static cl::opt
<unsigned>
105 PrintFuncStat("print-function-statistics",
106 cl::desc("print statistics about basic block ordering"),
107 cl::init(0), cl::cat(BoltOptCategory
));
109 static cl::opt
<bool> PrintLargeFunctions(
110 "print-large-functions",
111 cl::desc("print functions that could not be overwritten due to excessive "
113 cl::init(false), cl::cat(BoltOptCategory
));
115 static cl::list
<bolt::DynoStats::Category
>
116 PrintSortedBy("print-sorted-by", cl::CommaSeparated
,
117 cl::desc("print functions sorted by order of dyno stats"),
118 cl::value_desc("key1,key2,key3,..."),
120 #define D(name, description, ...) \
121 clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name), \
125 clEnumValN(bolt::DynoStats::LAST_DYNO_STAT
, "all",
126 "sorted by all names")),
127 cl::ZeroOrMore
, cl::cat(BoltOptCategory
));
130 PrintUnknown("print-unknown",
131 cl::desc("print names of functions with unknown control flow"),
132 cl::cat(BoltCategory
), cl::Hidden
);
135 PrintUnknownCFG("print-unknown-cfg",
136 cl::desc("dump CFG of functions with unknown control flow"),
137 cl::cat(BoltCategory
), cl::ReallyHidden
);
139 // Please MSVC19 with a forward declaration: otherwise it reports an error about
140 // an undeclared variable inside a callback.
141 extern cl::opt
<bolt::ReorderBasicBlocks::LayoutType
> ReorderBlocks
;
142 cl::opt
<bolt::ReorderBasicBlocks::LayoutType
> ReorderBlocks(
143 "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
144 cl::init(bolt::ReorderBasicBlocks::LT_NONE
),
146 clEnumValN(bolt::ReorderBasicBlocks::LT_NONE
, "none",
147 "do not reorder basic blocks"),
148 clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE
, "reverse",
149 "layout blocks in reverse order"),
150 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE
, "normal",
151 "perform optimal layout based on profile"),
152 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH
,
154 "perform optimal layout prioritizing branch "
156 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE
, "cache",
157 "perform optimal layout prioritizing I-cache "
159 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS
, "cache+",
160 "perform layout optimizing I-cache behavior"),
161 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP
, "ext-tsp",
162 "perform layout optimizing I-cache behavior"),
163 clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE
,
164 "cluster-shuffle", "perform random layout of clusters")),
165 cl::ZeroOrMore
, cl::cat(BoltOptCategory
),
166 cl::callback([](const bolt::ReorderBasicBlocks::LayoutType
&option
) {
167 if (option
== bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS
) {
168 errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
169 << " use '-reorder-blocks=ext-tsp' instead\n";
170 ReorderBlocks
= bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP
;
174 static cl::opt
<unsigned> ReportBadLayout(
176 cl::desc("print top <uint> functions with suboptimal code layout on input"),
177 cl::init(0), cl::Hidden
, cl::cat(BoltOptCategory
));
180 ReportStaleFuncs("report-stale",
181 cl::desc("print the list of functions with stale profile"),
182 cl::Hidden
, cl::cat(BoltOptCategory
));
184 enum SctcModes
: char {
186 SctcPreserveDirection
,
190 static cl::opt
<SctcModes
>
191 SctcMode("sctc-mode",
192 cl::desc("mode for simplify conditional tail calls"),
193 cl::init(SctcAlways
),
194 cl::values(clEnumValN(SctcAlways
, "always", "always perform sctc"),
195 clEnumValN(SctcPreserveDirection
,
197 "only perform sctc when branch direction is "
199 clEnumValN(SctcHeuristic
,
201 "use branch prediction data to control sctc")),
203 cl::cat(BoltOptCategory
));
205 static cl::opt
<unsigned>
206 StaleThreshold("stale-threshold",
208 "maximum percentage of stale functions to tolerate (default: 100)"),
211 cl::cat(BoltOptCategory
));
213 static cl::opt
<unsigned> TSPThreshold(
216 "maximum number of hot basic blocks in a function for which to use "
217 "a precise TSP solution while re-ordering basic blocks"),
218 cl::init(10), cl::Hidden
, cl::cat(BoltOptCategory
));
220 static cl::opt
<unsigned> TopCalledLimit(
222 cl::desc("maximum number of functions to print in top called "
223 "functions section"),
224 cl::init(100), cl::Hidden
, cl::cat(BoltCategory
));
// Base eligibility predicate shared by the optimization passes: the function
// must be "simple" (safe for BOLT to rewrite) and must have been lifted to
// CFG state. NOTE(review): the tail of this conjunction (and the closing
// brace) was lost in extraction — confirm the remaining operands upstream.
231 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction
&BF
) const {
232 return BF
.isSimple() && BF
.getState() == BinaryFunction::State::CFG
&&
236 bool BinaryFunctionPass::shouldPrint(const BinaryFunction
&BF
) const {
237 return BF
.isSimple() && !BF
.isIgnored();
240 void NormalizeCFG::runOnFunction(BinaryFunction
&BF
) {
241 uint64_t NumRemoved
= 0;
242 uint64_t NumDuplicateEdges
= 0;
243 uint64_t NeedsFixBranches
= 0;
244 for (BinaryBasicBlock
&BB
: BF
) {
248 if (BB
.isEntryPoint() || BB
.isLandingPad())
251 // Handle a dangling empty block.
252 if (BB
.succ_size() == 0) {
253 // If an empty dangling basic block has a predecessor, it could be a
254 // result of codegen for __builtin_unreachable. In such case, do not
256 if (BB
.pred_size() == 0) {
263 // The block should have just one successor.
264 BinaryBasicBlock
*Successor
= BB
.getSuccessor();
265 assert(Successor
&& "invalid CFG encountered");
267 // Redirect all predecessors to the successor block.
268 while (!BB
.pred_empty()) {
269 BinaryBasicBlock
*Predecessor
= *BB
.pred_begin();
270 if (Predecessor
->hasJumpTable())
273 if (Predecessor
== Successor
)
276 BinaryBasicBlock::BinaryBranchInfo
&BI
= Predecessor
->getBranchInfo(BB
);
277 Predecessor
->replaceSuccessor(&BB
, Successor
, BI
.Count
,
278 BI
.MispredictedCount
);
279 // We need to fix branches even if we failed to replace all successors
280 // and remove the block.
281 NeedsFixBranches
= true;
284 if (BB
.pred_empty()) {
285 BB
.removeAllSuccessors();
292 BF
.eraseInvalidBBs();
294 // Check for duplicate successors. Do it after the empty block elimination as
295 // we can get more duplicate successors.
296 for (BinaryBasicBlock
&BB
: BF
)
297 if (!BB
.hasJumpTable() && BB
.succ_size() == 2 &&
298 BB
.getConditionalSuccessor(false) == BB
.getConditionalSuccessor(true))
301 // fixBranches() will get rid of duplicate edges and update jump instructions.
302 if (NumDuplicateEdges
|| NeedsFixBranches
)
305 NumDuplicateEdgesMerged
+= NumDuplicateEdges
;
306 NumBlocksRemoved
+= NumRemoved
;
309 Error
NormalizeCFG::runOnFunctions(BinaryContext
&BC
) {
310 ParallelUtilities::runOnEachFunction(
311 BC
, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR
,
312 [&](BinaryFunction
&BF
) { runOnFunction(BF
); },
313 [&](const BinaryFunction
&BF
) { return !shouldOptimize(BF
); },
315 if (NumBlocksRemoved
)
316 BC
.outs() << "BOLT-INFO: removed " << NumBlocksRemoved
<< " empty block"
317 << (NumBlocksRemoved
== 1 ? "" : "s") << '\n';
318 if (NumDuplicateEdgesMerged
)
319 BC
.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
320 << " duplicate CFG edge"
321 << (NumDuplicateEdgesMerged
== 1 ? "" : "s") << '\n';
322 return Error::success();
325 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction
&Function
) {
326 BinaryContext
&BC
= Function
.getBinaryContext();
329 Function
.markUnreachableBlocks();
331 for (BinaryBasicBlock
&BB
: Function
) {
333 dbgs() << "BOLT-INFO: UCE found unreachable block " << BB
.getName()
334 << " in function " << Function
<< "\n";
339 BinaryContext::IndependentCodeEmitter Emitter
=
340 BC
.createIndependentMCCodeEmitter();
341 std::tie(Count
, Bytes
) = Function
.eraseInvalidBBs(Emitter
.MCE
.get());
342 DeletedBlocks
+= Count
;
343 DeletedBytes
+= Bytes
;
345 auto L
= BC
.scopeLock();
346 Modified
.insert(&Function
);
347 if (opts::Verbosity
> 0)
348 BC
.outs() << "BOLT-INFO: removed " << Count
349 << " dead basic block(s) accounting for " << Bytes
350 << " bytes in function " << Function
<< '\n';
354 Error
EliminateUnreachableBlocks::runOnFunctions(BinaryContext
&BC
) {
355 ParallelUtilities::WorkFuncTy WorkFun
= [&](BinaryFunction
&BF
) {
359 ParallelUtilities::PredicateTy SkipPredicate
= [&](const BinaryFunction
&BF
) {
360 return !shouldOptimize(BF
) || BF
.getLayout().block_empty();
363 ParallelUtilities::runOnEachFunction(
364 BC
, ParallelUtilities::SchedulingPolicy::SP_CONSTANT
, WorkFun
,
365 SkipPredicate
, "elimininate-unreachable");
368 BC
.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks
<< " blocks and "
369 << DeletedBytes
<< " bytes of code\n";
370 return Error::success();
373 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction
&BF
) const {
374 return (BinaryFunctionPass::shouldPrint(BF
) &&
375 opts::ReorderBlocks
!= ReorderBasicBlocks::LT_NONE
);
// Reordering is profile-driven: functions whose known execution count falls
// below the user-provided -execution-count-threshold are skipped before
// deferring to the base-pass predicate. NOTE(review): the body of the `if`
// (presumably `return false;`) was lost in extraction — confirm upstream.
378 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction
&BF
) const {
379 // Apply execution count threshold
380 if (BF
.getKnownExecutionCount() < opts::ExecutionCountThreshold
)
383 return BinaryFunctionPass::shouldOptimize(BF
);
386 Error
ReorderBasicBlocks::runOnFunctions(BinaryContext
&BC
) {
387 if (opts::ReorderBlocks
== ReorderBasicBlocks::LT_NONE
)
388 return Error::success();
390 std::atomic_uint64_t
ModifiedFuncCount(0);
391 std::mutex FunctionEditDistanceMutex
;
392 DenseMap
<const BinaryFunction
*, uint64_t> FunctionEditDistance
;
394 ParallelUtilities::WorkFuncTy WorkFun
= [&](BinaryFunction
&BF
) {
395 SmallVector
<const BinaryBasicBlock
*, 0> OldBlockOrder
;
396 if (opts::PrintFuncStat
> 0)
397 llvm::copy(BF
.getLayout().blocks(), std::back_inserter(OldBlockOrder
));
399 const bool LayoutChanged
=
400 modifyFunctionLayout(BF
, opts::ReorderBlocks
, opts::MinBranchClusters
);
402 ModifiedFuncCount
.fetch_add(1, std::memory_order_relaxed
);
403 if (opts::PrintFuncStat
> 0) {
404 const uint64_t Distance
= BF
.getLayout().getEditDistance(OldBlockOrder
);
405 std::lock_guard
<std::mutex
> Lock(FunctionEditDistanceMutex
);
406 FunctionEditDistance
[&BF
] = Distance
;
411 ParallelUtilities::PredicateTy SkipFunc
= [&](const BinaryFunction
&BF
) {
412 return !shouldOptimize(BF
);
415 ParallelUtilities::runOnEachFunction(
416 BC
, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR
, WorkFun
, SkipFunc
,
417 "ReorderBasicBlocks");
418 const size_t NumAllProfiledFunctions
=
419 BC
.NumProfiledFuncs
+ BC
.NumStaleProfileFuncs
;
421 BC
.outs() << "BOLT-INFO: basic block reordering modified layout of "
423 "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
424 ModifiedFuncCount
.load(std::memory_order_relaxed
),
425 100.0 * ModifiedFuncCount
.load(std::memory_order_relaxed
) /
426 NumAllProfiledFunctions
,
427 100.0 * ModifiedFuncCount
.load(std::memory_order_relaxed
) /
428 BC
.getBinaryFunctions().size());
430 if (opts::PrintFuncStat
> 0) {
431 raw_ostream
&OS
= BC
.outs();
432 // Copy all the values into vector in order to sort them
433 std::map
<uint64_t, BinaryFunction
&> ScoreMap
;
434 auto &BFs
= BC
.getBinaryFunctions();
435 for (auto It
= BFs
.begin(); It
!= BFs
.end(); ++It
)
436 ScoreMap
.insert(std::pair
<uint64_t, BinaryFunction
&>(
437 It
->second
.getFunctionScore(), It
->second
));
439 OS
<< "\nBOLT-INFO: Printing Function Statistics:\n\n";
440 OS
<< " There are " << BFs
.size() << " functions in total. \n";
441 OS
<< " Number of functions being modified: "
442 << ModifiedFuncCount
.load(std::memory_order_relaxed
) << "\n";
443 OS
<< " User asks for detailed information on top "
444 << opts::PrintFuncStat
<< " functions. (Ranked by function score)"
447 for (std::map
<uint64_t, BinaryFunction
&>::reverse_iterator Rit
=
449 Rit
!= ScoreMap
.rend() && I
< opts::PrintFuncStat
; ++Rit
, ++I
) {
450 BinaryFunction
&Function
= Rit
->second
;
452 OS
<< " Information for function of top: " << (I
+ 1) << ": \n";
453 OS
<< " Function Score is: " << Function
.getFunctionScore()
455 OS
<< " There are " << Function
.size()
456 << " number of blocks in this function.\n";
457 OS
<< " There are " << Function
.getInstructionCount()
458 << " number of instructions in this function.\n";
459 OS
<< " The edit distance for this function is: "
460 << FunctionEditDistance
.lookup(&Function
) << "\n\n";
463 return Error::success();
466 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction
&BF
,
468 bool MinBranchClusters
) const {
469 if (BF
.size() == 0 || Type
== LT_NONE
)
472 BinaryFunction::BasicBlockOrderType NewLayout
;
473 std::unique_ptr
<ReorderAlgorithm
> Algo
;
475 // Cannot do optimal layout without profile.
476 if (Type
!= LT_REVERSE
&& !BF
.hasValidProfile())
479 if (Type
== LT_REVERSE
) {
480 Algo
.reset(new ReverseReorderAlgorithm());
481 } else if (BF
.size() <= opts::TSPThreshold
&& Type
!= LT_OPTIMIZE_SHUFFLE
) {
482 // Work on optimal solution if problem is small enough
483 LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF
<< "\n");
484 Algo
.reset(new TSPReorderAlgorithm());
486 LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF
<< "\n");
488 std::unique_ptr
<ClusterAlgorithm
> CAlgo
;
489 if (MinBranchClusters
)
490 CAlgo
.reset(new MinBranchGreedyClusterAlgorithm());
492 CAlgo
.reset(new PHGreedyClusterAlgorithm());
496 Algo
.reset(new OptimizeReorderAlgorithm(std::move(CAlgo
)));
499 case LT_OPTIMIZE_BRANCH
:
500 Algo
.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo
)));
503 case LT_OPTIMIZE_CACHE
:
504 Algo
.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo
)));
507 case LT_OPTIMIZE_EXT_TSP
:
508 Algo
.reset(new ExtTSPReorderAlgorithm());
511 case LT_OPTIMIZE_SHUFFLE
:
512 Algo
.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo
)));
516 llvm_unreachable("unexpected layout type");
520 Algo
->reorderBasicBlocks(BF
, NewLayout
);
522 return BF
.getLayout().update(NewLayout
);
// Re-synchronize branch instructions with the (possibly updated) CFG of
// every simple function that will be emitted; functions BOLT will not emit,
// or that are not simple, are left untouched. NOTE(review): the loop's
// skip statement and closing braces were lost in extraction — confirm
// upstream.
525 Error
FixupBranches::runOnFunctions(BinaryContext
&BC
) {
526 for (auto &It
: BC
.getBinaryFunctions()) {
527 BinaryFunction
&Function
= It
.second
;
528 if (!BC
.shouldEmit(Function
) || !Function
.isSimple())
531 Function
.fixBranches();
533 return Error::success();
536 Error
FinalizeFunctions::runOnFunctions(BinaryContext
&BC
) {
537 std::atomic
<bool> HasFatal
{false};
538 ParallelUtilities::WorkFuncTy WorkFun
= [&](BinaryFunction
&BF
) {
539 if (!BF
.finalizeCFIState()) {
540 if (BC
.HasRelocations
) {
541 BC
.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
552 // Update exception handling information.
556 ParallelUtilities::PredicateTy SkipPredicate
= [&](const BinaryFunction
&BF
) {
557 return !BC
.shouldEmit(BF
);
560 ParallelUtilities::runOnEachFunction(
561 BC
, ParallelUtilities::SchedulingPolicy::SP_CONSTANT
, WorkFun
,
562 SkipPredicate
, "FinalizeFunctions");
564 return createFatalBOLTError("finalize CFI state failure");
565 return Error::success();
568 Error
CheckLargeFunctions::runOnFunctions(BinaryContext
&BC
) {
569 if (BC
.HasRelocations
)
570 return Error::success();
572 // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
573 // incorrect meta data.
574 ParallelUtilities::WorkFuncTy WorkFun
= [&](BinaryFunction
&BF
) {
575 uint64_t HotSize
, ColdSize
;
576 std::tie(HotSize
, ColdSize
) =
577 BC
.calculateEmittedSize(BF
, /*FixBranches=*/false);
578 if (HotSize
> BF
.getMaxSize()) {
579 if (opts::PrintLargeFunctions
)
580 BC
.outs() << "BOLT-INFO: " << BF
<< " size exceeds allocated space by "
581 << (HotSize
- BF
.getMaxSize()) << " bytes\n";
586 ParallelUtilities::PredicateTy SkipFunc
= [&](const BinaryFunction
&BF
) {
587 return !shouldOptimize(BF
);
590 ParallelUtilities::runOnEachFunction(
591 BC
, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR
, WorkFun
,
592 SkipFunc
, "CheckLargeFunctions");
594 return Error::success();
597 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction
&BF
) const {
598 // Unlike other passes, allow functions in non-CFG state.
599 return BF
.isSimple() && !BF
.isIgnored();
602 Error
LowerAnnotations::runOnFunctions(BinaryContext
&BC
) {
603 // Convert GnuArgsSize annotations into CFIs.
604 for (BinaryFunction
*BF
: BC
.getAllBinaryFunctions()) {
605 for (FunctionFragment
&FF
: BF
->getLayout().fragments()) {
606 // Reset at the start of the new fragment.
607 int64_t CurrentGnuArgsSize
= 0;
609 for (BinaryBasicBlock
*const BB
: FF
) {
610 for (auto II
= BB
->begin(); II
!= BB
->end(); ++II
) {
611 if (!BF
->usesGnuArgsSize() || !BC
.MIB
->isInvoke(*II
))
614 const int64_t NewGnuArgsSize
= BC
.MIB
->getGnuArgsSize(*II
);
615 assert(NewGnuArgsSize
>= 0 && "Expected non-negative GNU_args_size.");
616 if (NewGnuArgsSize
== CurrentGnuArgsSize
)
619 auto InsertII
= BF
->addCFIInstruction(
621 MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize
));
622 CurrentGnuArgsSize
= NewGnuArgsSize
;
623 II
= std::next(InsertII
);
628 return Error::success();
631 // Check for dirty state in MCSymbol objects that might be a consequence
632 // of running calculateEmittedSize() in parallel, during split functions
633 // pass. If an inconsistent state is found (symbol already registered or
634 // already defined), clean it.
635 Error
CleanMCState::runOnFunctions(BinaryContext
&BC
) {
636 MCContext
&Ctx
= *BC
.Ctx
;
// Walk every symbol known to the MC context and scrub any state a parallel
// emission pass may have left behind.
637 for (const auto &SymMapEntry
: Ctx
.getSymbols()) {
638 const MCSymbol
*S
= SymMapEntry
.getValue().Symbol
;
// The context's symbol table hands back const pointers; const_cast below is
// required to reset definition/registration state in place.
641 if (S
->isDefined()) {
642 LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S
->getName()
643 << "\" is already defined\n");
644 const_cast<MCSymbol
*>(S
)->setUndefined();
646 if (S
->isRegistered()) {
647 LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S
->getName()
648 << "\" is already registered\n");
649 const_cast<MCSymbol
*>(S
)->setIsRegistered(false);
// Variable symbols are only reported (debug builds); they are not reset.
651 LLVM_DEBUG(if (S
->isVariable()) {
652 dbgs() << "BOLT-DEBUG: Symbol \"" << S
->getName() << "\" is variable\n";
655 return Error::success();
658 // This peephole fixes jump instructions that jump to another basic
659 // block with a single jump instruction, e.g.
662 // jmp B1 (or jcc B1)
669 // jmp B2 (or jcc B2)
671 static uint64_t fixDoubleJumps(BinaryFunction
&Function
, bool MarkInvalid
) {
672 uint64_t NumDoubleJumps
= 0;
674 MCContext
*Ctx
= Function
.getBinaryContext().Ctx
.get();
675 MCPlusBuilder
*MIB
= Function
.getBinaryContext().MIB
.get();
676 for (BinaryBasicBlock
&BB
: Function
) {
677 auto checkAndPatch
= [&](BinaryBasicBlock
*Pred
, BinaryBasicBlock
*Succ
,
678 const MCSymbol
*SuccSym
,
679 std::optional
<uint32_t> Offset
) {
680 // Ignore infinite loop jumps or fallthrough tail jumps.
681 if (Pred
== Succ
|| Succ
== &BB
)
685 const MCSymbol
*TBB
= nullptr;
686 const MCSymbol
*FBB
= nullptr;
687 MCInst
*CondBranch
= nullptr;
688 MCInst
*UncondBranch
= nullptr;
689 bool Res
= Pred
->analyzeBranch(TBB
, FBB
, CondBranch
, UncondBranch
);
691 LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
695 Pred
->replaceSuccessor(&BB
, Succ
);
697 // We must patch up any existing branch instructions to match up
698 // with the new successor.
699 assert((CondBranch
|| (!CondBranch
&& Pred
->succ_size() == 1)) &&
700 "Predecessor block has inconsistent number of successors");
701 if (CondBranch
&& MIB
->getTargetSymbol(*CondBranch
) == BB
.getLabel()) {
702 MIB
->replaceBranchTarget(*CondBranch
, Succ
->getLabel(), Ctx
);
703 } else if (UncondBranch
&&
704 MIB
->getTargetSymbol(*UncondBranch
) == BB
.getLabel()) {
705 MIB
->replaceBranchTarget(*UncondBranch
, Succ
->getLabel(), Ctx
);
706 } else if (!UncondBranch
) {
707 assert(Function
.getLayout().getBasicBlockAfter(Pred
, false) != Succ
&&
708 "Don't add an explicit jump to a fallthrough block.");
709 Pred
->addBranchInstruction(Succ
);
712 // Succ will be null in the tail call case. In this case we
713 // need to explicitly add a tail call instruction.
714 MCInst
*Branch
= Pred
->getLastNonPseudoInstr();
715 if (Branch
&& MIB
->isUnconditionalBranch(*Branch
)) {
716 assert(MIB
->getTargetSymbol(*Branch
) == BB
.getLabel());
717 Pred
->removeSuccessor(&BB
);
718 Pred
->eraseInstruction(Pred
->findInstruction(Branch
));
719 Pred
->addTailCallInstruction(SuccSym
);
721 MCInst
*TailCall
= Pred
->getLastNonPseudoInstr();
723 MIB
->setOffset(*TailCall
, *Offset
);
731 LLVM_DEBUG(dbgs() << "Removed double jump in " << Function
<< " from "
732 << Pred
->getName() << " -> " << BB
.getName() << " to "
733 << Pred
->getName() << " -> " << SuccSym
->getName()
734 << (!Succ
? " (tail)\n" : "\n"));
739 if (BB
.getNumNonPseudos() != 1 || BB
.isLandingPad())
742 MCInst
*Inst
= BB
.getFirstNonPseudoInstr();
743 const bool IsTailCall
= MIB
->isTailCall(*Inst
);
745 if (!MIB
->isUnconditionalBranch(*Inst
) && !IsTailCall
)
748 // If we operate after SCTC make sure it's not a conditional tail call.
749 if (IsTailCall
&& MIB
->isConditionalBranch(*Inst
))
752 const MCSymbol
*SuccSym
= MIB
->getTargetSymbol(*Inst
);
753 BinaryBasicBlock
*Succ
= BB
.getSuccessor();
755 if (((!Succ
|| &BB
== Succ
) && !IsTailCall
) || (IsTailCall
&& !SuccSym
))
758 std::vector
<BinaryBasicBlock
*> Preds
= {BB
.pred_begin(), BB
.pred_end()};
760 for (BinaryBasicBlock
*Pred
: Preds
) {
761 if (Pred
->isLandingPad())
764 if (Pred
->getSuccessor() == &BB
||
765 (Pred
->getConditionalSuccessor(true) == &BB
&& !IsTailCall
) ||
766 Pred
->getConditionalSuccessor(false) == &BB
)
767 if (checkAndPatch(Pred
, Succ
, SuccSym
, MIB
->getOffset(*Inst
)) &&
769 BB
.markValid(BB
.pred_size() != 0 || BB
.isLandingPad() ||
774 return NumDoubleJumps
;
777 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
778 const BinaryBasicBlock
*PredBB
, const MCInst
&CondBranch
,
779 const BinaryBasicBlock
*BB
, const bool DirectionFlag
) {
780 if (BeenOptimized
.count(PredBB
))
783 const bool IsForward
= BinaryFunction::isForwardBranch(PredBB
, BB
);
786 ++NumOrigForwardBranches
;
788 ++NumOrigBackwardBranches
;
790 if (opts::SctcMode
== opts::SctcAlways
)
793 if (opts::SctcMode
== opts::SctcPreserveDirection
)
794 return IsForward
== DirectionFlag
;
796 const ErrorOr
<std::pair
<double, double>> Frequency
=
797 PredBB
->getBranchStats(BB
);
799 // It's ok to rewrite the conditional branch if the new target will be
800 // a backward branch.
802 // If no data available for these branches, then it should be ok to
803 // do the optimization since it will reduce code size.
804 if (Frequency
.getError())
807 // TODO: should this use misprediction frequency instead?
808 const bool Result
= (IsForward
&& Frequency
.get().first
>= 0.5) ||
809 (!IsForward
&& Frequency
.get().first
<= 0.5);
811 return Result
== DirectionFlag
;
814 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction
&BF
) {
815 // Need updated indices to correctly detect branch' direction.
816 BF
.getLayout().updateLayoutIndices();
817 BF
.markUnreachableBlocks();
819 MCPlusBuilder
*MIB
= BF
.getBinaryContext().MIB
.get();
820 MCContext
*Ctx
= BF
.getBinaryContext().Ctx
.get();
821 uint64_t NumLocalCTCCandidates
= 0;
822 uint64_t NumLocalCTCs
= 0;
823 uint64_t LocalCTCTakenCount
= 0;
824 uint64_t LocalCTCExecCount
= 0;
825 std::vector
<std::pair
<BinaryBasicBlock
*, const BinaryBasicBlock
*>>
828 // Will block be deleted by UCE?
829 auto isValid
= [](const BinaryBasicBlock
*BB
) {
830 return (BB
->pred_size() != 0 || BB
->isLandingPad() || BB
->isEntryPoint());
833 for (BinaryBasicBlock
*BB
: BF
.getLayout().blocks()) {
834 // Locate BB with a single direct tail-call instruction.
835 if (BB
->getNumNonPseudos() != 1)
838 MCInst
*Instr
= BB
->getFirstNonPseudoInstr();
839 if (!MIB
->isTailCall(*Instr
) || MIB
->isConditionalBranch(*Instr
))
842 const MCSymbol
*CalleeSymbol
= MIB
->getTargetSymbol(*Instr
);
846 // Detect direction of the possible conditional tail call.
847 const bool IsForwardCTC
= BF
.isForwardCall(CalleeSymbol
);
849 // Iterate through all predecessors.
850 for (BinaryBasicBlock
*PredBB
: BB
->predecessors()) {
851 BinaryBasicBlock
*CondSucc
= PredBB
->getConditionalSuccessor(true);
855 ++NumLocalCTCCandidates
;
857 const MCSymbol
*TBB
= nullptr;
858 const MCSymbol
*FBB
= nullptr;
859 MCInst
*CondBranch
= nullptr;
860 MCInst
*UncondBranch
= nullptr;
861 bool Result
= PredBB
->analyzeBranch(TBB
, FBB
, CondBranch
, UncondBranch
);
863 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
865 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
870 assert(Result
&& "internal error analyzing conditional branch");
871 assert(CondBranch
&& "conditional branch expected");
873 // Skip dynamic branches for now.
874 if (BF
.getBinaryContext().MIB
->isDynamicBranch(*CondBranch
))
877 // It's possible that PredBB is also a successor to BB that may have
878 // been processed by a previous iteration of the SCTC loop, in which
879 // case it may have been marked invalid. We should skip rewriting in
881 if (!PredBB
->isValid()) {
882 assert(PredBB
->isSuccessor(BB
) &&
883 "PredBB should be valid if it is not a successor to BB");
887 // We don't want to reverse direction of the branch in new order
888 // without further profile analysis.
889 const bool DirectionFlag
= CondSucc
== BB
? IsForwardCTC
: !IsForwardCTC
;
890 if (!shouldRewriteBranch(PredBB
, *CondBranch
, BB
, DirectionFlag
))
893 // Record this block so that we don't try to optimize it twice.
894 BeenOptimized
.insert(PredBB
);
897 if (CondSucc
!= BB
) {
898 // Patch the new target address into the conditional branch.
899 MIB
->reverseBranchCondition(*CondBranch
, CalleeSymbol
, Ctx
);
900 // Since we reversed the condition on the branch we need to change
901 // the target for the unconditional branch or add a unconditional
902 // branch to the old target. This has to be done manually since
903 // fixupBranches is not called after SCTC.
904 NeedsUncondBranch
.emplace_back(PredBB
, CondSucc
);
905 Count
= PredBB
->getFallthroughBranchInfo().Count
;
907 // Change destination of the conditional branch.
908 MIB
->replaceBranchTarget(*CondBranch
, CalleeSymbol
, Ctx
);
909 Count
= PredBB
->getTakenBranchInfo().Count
;
911 const uint64_t CTCTakenFreq
=
912 Count
== BinaryBasicBlock::COUNT_NO_PROFILE
? 0 : Count
;
914 // Annotate it, so "isCall" returns true for this jcc
915 MIB
->setConditionalTailCall(*CondBranch
);
916 // Add info about the conditional tail call frequency, otherwise this
917 // info will be lost when we delete the associated BranchInfo entry
918 auto &CTCAnnotation
=
919 MIB
->getOrCreateAnnotationAs
<uint64_t>(*CondBranch
, "CTCTakenCount");
920 CTCAnnotation
= CTCTakenFreq
;
921 // Preserve Offset annotation, used in BAT.
922 // Instr is a direct tail call instruction that was created when CTCs are
923 // first expanded, and has the original CTC offset set.
924 if (std::optional
<uint32_t> Offset
= MIB
->getOffset(*Instr
))
925 MIB
->setOffset(*CondBranch
, *Offset
);
927 // Remove the unused successor which may be eliminated later
928 // if there are no other users.
929 PredBB
->removeSuccessor(BB
);
930 // Update BB execution count
931 if (CTCTakenFreq
&& CTCTakenFreq
<= BB
->getKnownExecutionCount())
932 BB
->setExecutionCount(BB
->getExecutionCount() - CTCTakenFreq
);
933 else if (CTCTakenFreq
> BB
->getKnownExecutionCount())
934 BB
->setExecutionCount(0);
937 LocalCTCTakenCount
+= CTCTakenFreq
;
938 LocalCTCExecCount
+= PredBB
->getKnownExecutionCount();
941 // Remove the block from CFG if all predecessors were removed.
942 BB
->markValid(isValid(BB
));
945 // Add unconditional branches at the end of BBs to new successors
946 // as long as the successor is not a fallthrough.
947 for (auto &Entry
: NeedsUncondBranch
) {
948 BinaryBasicBlock
*PredBB
= Entry
.first
;
949 const BinaryBasicBlock
*CondSucc
= Entry
.second
;
951 const MCSymbol
*TBB
= nullptr;
952 const MCSymbol
*FBB
= nullptr;
953 MCInst
*CondBranch
= nullptr;
954 MCInst
*UncondBranch
= nullptr;
955 PredBB
->analyzeBranch(TBB
, FBB
, CondBranch
, UncondBranch
);
957 // Find the next valid block. Invalid blocks will be deleted
958 // so they shouldn't be considered fallthrough targets.
959 const BinaryBasicBlock
*NextBlock
=
960 BF
.getLayout().getBasicBlockAfter(PredBB
, false);
961 while (NextBlock
&& !isValid(NextBlock
))
962 NextBlock
= BF
.getLayout().getBasicBlockAfter(NextBlock
, false);
964 // Get the unconditional successor to this block.
965 const BinaryBasicBlock
*PredSucc
= PredBB
->getSuccessor();
966 assert(PredSucc
&& "The other branch should be a tail call");
968 const bool HasFallthrough
= (NextBlock
&& PredSucc
== NextBlock
);
972 PredBB
->eraseInstruction(PredBB
->findInstruction(UncondBranch
));
974 MIB
->replaceBranchTarget(*UncondBranch
, CondSucc
->getLabel(), Ctx
);
975 } else if (!HasFallthrough
) {
977 MIB
->createUncondBranch(Branch
, CondSucc
->getLabel(), Ctx
);
978 PredBB
->addInstruction(Branch
);
982 if (NumLocalCTCs
> 0) {
983 NumDoubleJumps
+= fixDoubleJumps(BF
, true);
984 // Clean-up unreachable tail-call blocks.
985 const std::pair
<unsigned, uint64_t> Stats
= BF
.eraseInvalidBBs();
986 DeletedBlocks
+= Stats
.first
;
987 DeletedBytes
+= Stats
.second
;
989 assert(BF
.validateCFG());
992 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
993 << " conditional tail calls from a total of "
994 << NumLocalCTCCandidates
<< " candidates in function " << BF
995 << ". CTCs execution count for this function is "
996 << LocalCTCExecCount
<< " and CTC taken count is "
997 << LocalCTCTakenCount
<< "\n";);
999 NumTailCallsPatched
+= NumLocalCTCs
;
1000 NumCandidateTailCalls
+= NumLocalCTCCandidates
;
1001 CTCExecCount
+= LocalCTCExecCount
;
1002 CTCTakenCount
+= LocalCTCTakenCount
;
1004 return NumLocalCTCs
> 0;
1007 Error
SimplifyConditionalTailCalls::runOnFunctions(BinaryContext
&BC
) {
1009 return Error::success();
1011 for (auto &It
: BC
.getBinaryFunctions()) {
1012 BinaryFunction
&Function
= It
.second
;
1014 if (!shouldOptimize(Function
))
1017 if (fixTailCalls(Function
)) {
1018 Modified
.insert(&Function
);
1019 Function
.setHasCanonicalCFG(false);
1023 if (NumTailCallsPatched
)
1024 BC
.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1025 << " tail calls (" << NumOrigForwardBranches
<< " forward)"
1026 << " tail calls (" << NumOrigBackwardBranches
<< " backward)"
1027 << " from a total of " << NumCandidateTailCalls
1028 << " while removing " << NumDoubleJumps
<< " double jumps"
1029 << " and removing " << DeletedBlocks
<< " basic blocks"
1030 << " totalling " << DeletedBytes
1031 << " bytes of code. CTCs total execution count is "
1032 << CTCExecCount
<< " and the number of times CTCs are taken is "
1033 << CTCTakenCount
<< "\n";
1034 return Error::success();
1037 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction
&Function
) {
1039 const BinaryContext
&BC
= Function
.getBinaryContext();
1040 for (BinaryBasicBlock
&BB
: Function
) {
1041 for (MCInst
&Inst
: BB
) {
1042 // Skip shortening instructions with Size annotation.
1043 if (BC
.MIB
->getSize(Inst
))
1046 MCInst OriginalInst
;
1047 if (opts::Verbosity
> 2)
1048 OriginalInst
= Inst
;
1050 if (!BC
.MIB
->shortenInstruction(Inst
, *BC
.STI
))
1053 if (opts::Verbosity
> 2) {
1055 BC
.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
1056 BC
.printInstruction(BC
.outs(), OriginalInst
, 0, &Function
);
1057 BC
.outs() << "BOLT-INFO: to:";
1058 BC
.printInstruction(BC
.outs(), Inst
, 0, &Function
);
1068 Error
ShortenInstructions::runOnFunctions(BinaryContext
&BC
) {
1069 std::atomic
<uint64_t> NumShortened
{0};
1071 return Error::success();
1073 ParallelUtilities::runOnEachFunction(
1074 BC
, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR
,
1075 [&](BinaryFunction
&BF
) { NumShortened
+= shortenInstructions(BF
); },
1076 nullptr, "ShortenInstructions");
1079 BC
.outs() << "BOLT-INFO: " << NumShortened
1080 << " instructions were shortened\n";
1081 return Error::success();
1084 void Peepholes::addTailcallTraps(BinaryFunction
&Function
) {
1085 MCPlusBuilder
*MIB
= Function
.getBinaryContext().MIB
.get();
1086 for (BinaryBasicBlock
&BB
: Function
) {
1087 MCInst
*Inst
= BB
.getLastNonPseudoInstr();
1088 if (Inst
&& MIB
->isTailCall(*Inst
) && MIB
->isIndirectBranch(*Inst
)) {
1090 MIB
->createTrap(Trap
);
1091 BB
.addInstruction(Trap
);
1097 void Peepholes::removeUselessCondBranches(BinaryFunction
&Function
) {
1098 for (BinaryBasicBlock
&BB
: Function
) {
1099 if (BB
.succ_size() != 2)
1102 BinaryBasicBlock
*CondBB
= BB
.getConditionalSuccessor(true);
1103 BinaryBasicBlock
*UncondBB
= BB
.getConditionalSuccessor(false);
1104 if (CondBB
!= UncondBB
)
1107 const MCSymbol
*TBB
= nullptr;
1108 const MCSymbol
*FBB
= nullptr;
1109 MCInst
*CondBranch
= nullptr;
1110 MCInst
*UncondBranch
= nullptr;
1111 bool Result
= BB
.analyzeBranch(TBB
, FBB
, CondBranch
, UncondBranch
);
1113 // analyzeBranch() can fail due to unusual branch instructions,
1114 // e.g. jrcxz, or jump tables (indirect jump).
1115 if (!Result
|| !CondBranch
)
1118 BB
.removeDuplicateConditionalSuccessor(CondBranch
);
1119 ++NumUselessCondBranches
;
1123 Error
Peepholes::runOnFunctions(BinaryContext
&BC
) {
1125 std::accumulate(opts::Peepholes
.begin(), opts::Peepholes
.end(), 0,
1126 [](const char A
, const PeepholeOpts B
) { return A
| B
; });
1127 if (Opts
== PEEP_NONE
)
1128 return Error::success();
1130 for (auto &It
: BC
.getBinaryFunctions()) {
1131 BinaryFunction
&Function
= It
.second
;
1132 if (shouldOptimize(Function
)) {
1133 if (Opts
& PEEP_DOUBLE_JUMPS
)
1134 NumDoubleJumps
+= fixDoubleJumps(Function
, false);
1135 if (Opts
& PEEP_TAILCALL_TRAPS
)
1136 addTailcallTraps(Function
);
1137 if (Opts
& PEEP_USELESS_BRANCHES
)
1138 removeUselessCondBranches(Function
);
1139 assert(Function
.validateCFG());
1142 BC
.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1143 << " double jumps patched.\n"
1144 << "BOLT-INFO: Peephole: " << TailCallTraps
1145 << " tail call traps inserted.\n"
1146 << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1147 << " useless conditional branches removed.\n";
1148 return Error::success();
1151 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction
&BF
) {
1152 BinaryContext
&BC
= BF
.getBinaryContext();
1153 MCPlusBuilder
*MIB
= BC
.MIB
.get();
1155 uint64_t NumLocalLoadsSimplified
= 0;
1156 uint64_t NumDynamicLocalLoadsSimplified
= 0;
1157 uint64_t NumLocalLoadsFound
= 0;
1158 uint64_t NumDynamicLocalLoadsFound
= 0;
1160 for (BinaryBasicBlock
*BB
: BF
.getLayout().blocks()) {
1161 for (MCInst
&Inst
: *BB
) {
1162 unsigned Opcode
= Inst
.getOpcode();
1163 const MCInstrDesc
&Desc
= BC
.MII
->get(Opcode
);
1165 // Skip instructions that do not load from memory.
1166 if (!Desc
.mayLoad())
1169 // Try to statically evaluate the target memory address;
1170 uint64_t TargetAddress
;
1172 if (MIB
->hasPCRelOperand(Inst
)) {
1173 // Try to find the symbol that corresponds to the PC-relative operand.
1174 MCOperand
*DispOpI
= MIB
->getMemOperandDisp(Inst
);
1175 assert(DispOpI
!= Inst
.end() && "expected PC-relative displacement");
1176 assert(DispOpI
->isExpr() &&
1177 "found PC-relative with non-symbolic displacement");
1179 // Get displacement symbol.
1180 const MCSymbol
*DisplSymbol
;
1181 uint64_t DisplOffset
;
1183 std::tie(DisplSymbol
, DisplOffset
) =
1184 MIB
->getTargetSymbolInfo(DispOpI
->getExpr());
1189 // Look up the symbol address in the global symbols map of the binary
1191 BinaryData
*BD
= BC
.getBinaryDataByName(DisplSymbol
->getName());
1194 TargetAddress
= BD
->getAddress() + DisplOffset
;
1195 } else if (!MIB
->evaluateMemOperandTarget(Inst
, TargetAddress
)) {
1199 // Get the contents of the section containing the target address of the
1200 // memory operand. We are only interested in read-only sections.
1201 ErrorOr
<BinarySection
&> DataSection
=
1202 BC
.getSectionForAddress(TargetAddress
);
1203 if (!DataSection
|| DataSection
->isWritable())
1206 if (BC
.getRelocationAt(TargetAddress
) ||
1207 BC
.getDynamicRelocationAt(TargetAddress
))
1210 uint32_t Offset
= TargetAddress
- DataSection
->getAddress();
1211 StringRef ConstantData
= DataSection
->getContents();
1213 ++NumLocalLoadsFound
;
1214 if (BB
->hasProfile())
1215 NumDynamicLocalLoadsFound
+= BB
->getExecutionCount();
1217 if (MIB
->replaceMemOperandWithImm(Inst
, ConstantData
, Offset
)) {
1218 ++NumLocalLoadsSimplified
;
1219 if (BB
->hasProfile())
1220 NumDynamicLocalLoadsSimplified
+= BB
->getExecutionCount();
1225 NumLoadsFound
+= NumLocalLoadsFound
;
1226 NumDynamicLoadsFound
+= NumDynamicLocalLoadsFound
;
1227 NumLoadsSimplified
+= NumLocalLoadsSimplified
;
1228 NumDynamicLoadsSimplified
+= NumDynamicLocalLoadsSimplified
;
1230 return NumLocalLoadsSimplified
> 0;
1233 Error
SimplifyRODataLoads::runOnFunctions(BinaryContext
&BC
) {
1234 for (auto &It
: BC
.getBinaryFunctions()) {
1235 BinaryFunction
&Function
= It
.second
;
1236 if (shouldOptimize(Function
) && simplifyRODataLoads(Function
))
1237 Modified
.insert(&Function
);
1240 BC
.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified
<< " out of "
1241 << NumLoadsFound
<< " loads from a statically computed address.\n"
1242 << "BOLT-INFO: dynamic loads simplified: "
1243 << NumDynamicLoadsSimplified
<< "\n"
1244 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
1246 return Error::success();
1249 Error
AssignSections::runOnFunctions(BinaryContext
&BC
) {
1250 for (BinaryFunction
*Function
: BC
.getInjectedBinaryFunctions()) {
1251 Function
->setCodeSectionName(BC
.getInjectedCodeSectionName());
1252 Function
->setColdCodeSectionName(BC
.getInjectedColdCodeSectionName());
1255 // In non-relocation mode functions have pre-assigned section names.
1256 if (!BC
.HasRelocations
)
1257 return Error::success();
1259 const bool UseColdSection
=
1260 BC
.NumProfiledFuncs
> 0 ||
1261 opts::ReorderFunctions
== ReorderFunctions::RT_USER
;
1262 for (auto &BFI
: BC
.getBinaryFunctions()) {
1263 BinaryFunction
&Function
= BFI
.second
;
1264 if (opts::isHotTextMover(Function
)) {
1265 Function
.setCodeSectionName(BC
.getHotTextMoverSectionName());
1266 Function
.setColdCodeSectionName(BC
.getHotTextMoverSectionName());
1270 if (!UseColdSection
|| Function
.hasValidIndex())
1271 Function
.setCodeSectionName(BC
.getMainCodeSectionName());
1273 Function
.setCodeSectionName(BC
.getColdCodeSectionName());
1275 if (Function
.isSplit())
1276 Function
.setColdCodeSectionName(BC
.getColdCodeSectionName());
1278 return Error::success();
1281 Error
PrintProfileStats::runOnFunctions(BinaryContext
&BC
) {
1282 double FlowImbalanceMean
= 0.0;
1283 size_t NumBlocksConsidered
= 0;
1284 double WorstBias
= 0.0;
1285 const BinaryFunction
*WorstBiasFunc
= nullptr;
1287 // For each function CFG, we fill an IncomingMap with the sum of the frequency
1288 // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
1289 // of the frequency of outgoing edges.
1290 using FlowMapTy
= std::unordered_map
<const BinaryBasicBlock
*, uint64_t>;
1291 std::unordered_map
<const BinaryFunction
*, FlowMapTy
> TotalIncomingMaps
;
1292 std::unordered_map
<const BinaryFunction
*, FlowMapTy
> TotalOutgoingMaps
;
1295 for (const auto &BFI
: BC
.getBinaryFunctions()) {
1296 const BinaryFunction
&Function
= BFI
.second
;
1297 if (Function
.empty() || !Function
.isSimple())
1299 FlowMapTy
&IncomingMap
= TotalIncomingMaps
[&Function
];
1300 FlowMapTy
&OutgoingMap
= TotalOutgoingMaps
[&Function
];
1301 for (const BinaryBasicBlock
&BB
: Function
) {
1302 uint64_t TotalOutgoing
= 0ULL;
1303 auto SuccBIIter
= BB
.branch_info_begin();
1304 for (BinaryBasicBlock
*Succ
: BB
.successors()) {
1305 uint64_t Count
= SuccBIIter
->Count
;
1306 if (Count
== BinaryBasicBlock::COUNT_NO_PROFILE
|| Count
== 0) {
1310 TotalOutgoing
+= Count
;
1311 IncomingMap
[Succ
] += Count
;
1314 OutgoingMap
[&BB
] = TotalOutgoing
;
1317 size_t NumBlocks
= 0;
1319 for (const BinaryBasicBlock
&BB
: Function
) {
1320 // Do not compute score for low frequency blocks, entry or exit blocks
1321 if (IncomingMap
[&BB
] < 100 || OutgoingMap
[&BB
] == 0 || BB
.isEntryPoint())
1324 const double Difference
= (double)OutgoingMap
[&BB
] - IncomingMap
[&BB
];
1325 Mean
+= fabs(Difference
/ IncomingMap
[&BB
]);
1328 FlowImbalanceMean
+= Mean
;
1329 NumBlocksConsidered
+= NumBlocks
;
1332 double FuncMean
= Mean
/ NumBlocks
;
1333 if (FuncMean
> WorstBias
) {
1334 WorstBias
= FuncMean
;
1335 WorstBiasFunc
= &Function
;
1338 if (NumBlocksConsidered
> 0)
1339 FlowImbalanceMean
/= NumBlocksConsidered
;
1341 // Compute standard deviation
1342 NumBlocksConsidered
= 0;
1343 double FlowImbalanceVar
= 0.0;
1344 for (const auto &BFI
: BC
.getBinaryFunctions()) {
1345 const BinaryFunction
&Function
= BFI
.second
;
1346 if (Function
.empty() || !Function
.isSimple())
1348 FlowMapTy
&IncomingMap
= TotalIncomingMaps
[&Function
];
1349 FlowMapTy
&OutgoingMap
= TotalOutgoingMaps
[&Function
];
1350 for (const BinaryBasicBlock
&BB
: Function
) {
1351 if (IncomingMap
[&BB
] < 100 || OutgoingMap
[&BB
] == 0)
1353 ++NumBlocksConsidered
;
1354 const double Difference
= (double)OutgoingMap
[&BB
] - IncomingMap
[&BB
];
1356 pow(fabs(Difference
/ IncomingMap
[&BB
]) - FlowImbalanceMean
, 2);
1359 if (NumBlocksConsidered
) {
1360 FlowImbalanceVar
/= NumBlocksConsidered
;
1361 FlowImbalanceVar
= sqrt(FlowImbalanceVar
);
1365 BC
.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1366 (100.0 * FlowImbalanceMean
), (100.0 * FlowImbalanceVar
));
1367 if (WorstBiasFunc
&& opts::Verbosity
>= 1) {
1368 BC
.outs() << "Worst average bias observed in "
1369 << WorstBiasFunc
->getPrintName() << "\n";
1370 LLVM_DEBUG(WorstBiasFunc
->dump());
1372 return Error::success();
1375 Error
PrintProgramStats::runOnFunctions(BinaryContext
&BC
) {
1376 uint64_t NumRegularFunctions
= 0;
1377 uint64_t NumStaleProfileFunctions
= 0;
1378 uint64_t NumAllStaleFunctions
= 0;
1379 uint64_t NumInferredFunctions
= 0;
1380 uint64_t NumNonSimpleProfiledFunctions
= 0;
1381 uint64_t NumUnknownControlFlowFunctions
= 0;
1382 uint64_t TotalSampleCount
= 0;
1383 uint64_t StaleSampleCount
= 0;
1384 uint64_t InferredSampleCount
= 0;
1385 std::vector
<const BinaryFunction
*> ProfiledFunctions
;
1386 const char *StaleFuncsHeader
= "BOLT-INFO: Functions with stale profile:\n";
1387 for (auto &BFI
: BC
.getBinaryFunctions()) {
1388 const BinaryFunction
&Function
= BFI
.second
;
1390 // Ignore PLT functions for stats.
1391 if (Function
.isPLTFunction())
1394 // Adjustment for BAT mode: the profile for BOLT split fragments is combined
1395 // so only count the hot fragment.
1396 const uint64_t Address
= Function
.getAddress();
1397 bool IsHotParentOfBOLTSplitFunction
= !Function
.getFragments().empty() &&
1398 BAT
&& BAT
->isBATFunction(Address
) &&
1399 !BAT
->fetchParentAddress(Address
);
1401 ++NumRegularFunctions
;
1403 // In BOLTed binaries split functions are non-simple (due to non-relocation
1404 // mode), but the original function is known to be simple and we have a
1405 // valid profile for it.
1406 if (!Function
.isSimple() && !IsHotParentOfBOLTSplitFunction
) {
1407 if (Function
.hasProfile())
1408 ++NumNonSimpleProfiledFunctions
;
1412 if (Function
.hasUnknownControlFlow()) {
1413 if (opts::PrintUnknownCFG
)
1415 else if (opts::PrintUnknown
)
1416 BC
.errs() << "function with unknown control flow: " << Function
<< '\n';
1418 ++NumUnknownControlFlowFunctions
;
1421 if (!Function
.hasProfile())
1424 uint64_t SampleCount
= Function
.getRawBranchCount();
1425 TotalSampleCount
+= SampleCount
;
1427 if (Function
.hasValidProfile()) {
1428 ProfiledFunctions
.push_back(&Function
);
1429 if (Function
.hasInferredProfile()) {
1430 ++NumInferredFunctions
;
1431 InferredSampleCount
+= SampleCount
;
1432 ++NumAllStaleFunctions
;
1435 if (opts::ReportStaleFuncs
) {
1436 BC
.outs() << StaleFuncsHeader
;
1437 StaleFuncsHeader
= "";
1438 BC
.outs() << " " << Function
<< '\n';
1440 ++NumStaleProfileFunctions
;
1441 StaleSampleCount
+= SampleCount
;
1442 ++NumAllStaleFunctions
;
1445 BC
.NumProfiledFuncs
= ProfiledFunctions
.size();
1446 BC
.NumStaleProfileFuncs
= NumStaleProfileFunctions
;
1448 const size_t NumAllProfiledFunctions
=
1449 ProfiledFunctions
.size() + NumStaleProfileFunctions
;
1450 BC
.outs() << "BOLT-INFO: " << NumAllProfiledFunctions
<< " out of "
1451 << NumRegularFunctions
<< " functions in the binary ("
1452 << format("%.1f", NumAllProfiledFunctions
/
1453 (float)NumRegularFunctions
* 100.0f
)
1454 << "%) have non-empty execution profile\n";
1455 if (NumNonSimpleProfiledFunctions
) {
1456 BC
.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions
<< " function"
1457 << (NumNonSimpleProfiledFunctions
== 1 ? "" : "s")
1458 << " with profile could not be optimized\n";
1460 if (NumAllStaleFunctions
) {
1461 const float PctStale
=
1462 NumAllStaleFunctions
/ (float)NumAllProfiledFunctions
* 100.0f
;
1463 const float PctStaleFuncsWithEqualBlockCount
=
1464 (float)BC
.Stats
.NumStaleFuncsWithEqualBlockCount
/
1465 NumAllStaleFunctions
* 100.0f
;
1466 const float PctStaleBlocksWithEqualIcount
=
1467 (float)BC
.Stats
.NumStaleBlocksWithEqualIcount
/
1468 BC
.Stats
.NumStaleBlocks
* 100.0f
;
1469 auto printErrorOrWarning
= [&]() {
1470 if (PctStale
> opts::StaleThreshold
)
1471 BC
.errs() << "BOLT-ERROR: ";
1473 BC
.errs() << "BOLT-WARNING: ";
1475 printErrorOrWarning();
1476 BC
.errs() << NumAllStaleFunctions
1477 << format(" (%.1f%% of all profiled)", PctStale
) << " function"
1478 << (NumAllStaleFunctions
== 1 ? "" : "s")
1479 << " have invalid (possibly stale) profile."
1480 " Use -report-stale to see the list.\n";
1481 if (TotalSampleCount
> 0) {
1482 printErrorOrWarning();
1483 BC
.errs() << (StaleSampleCount
+ InferredSampleCount
) << " out of "
1484 << TotalSampleCount
<< " samples in the binary ("
1486 ((100.0f
* (StaleSampleCount
+ InferredSampleCount
)) /
1488 << "%) belong to functions with invalid"
1489 " (possibly stale) profile.\n";
1491 BC
.outs() << "BOLT-INFO: " << BC
.Stats
.NumStaleFuncsWithEqualBlockCount
1492 << " stale function"
1493 << (BC
.Stats
.NumStaleFuncsWithEqualBlockCount
== 1 ? "" : "s")
1494 << format(" (%.1f%% of all stale)",
1495 PctStaleFuncsWithEqualBlockCount
)
1496 << " have matching block count.\n";
1497 BC
.outs() << "BOLT-INFO: " << BC
.Stats
.NumStaleBlocksWithEqualIcount
1499 << (BC
.Stats
.NumStaleBlocksWithEqualIcount
== 1 ? "" : "s")
1500 << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount
)
1501 << " have matching icount.\n";
1502 if (PctStale
> opts::StaleThreshold
) {
1503 return createFatalBOLTError(
1504 Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
1505 Twine(opts::StaleThreshold
.getValue()) + Twine("%. Exiting.\n"));
1508 if (NumInferredFunctions
) {
1509 BC
.outs() << format(
1510 "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
1511 "%.2f%% of stale) functions responsible for %.2f%% samples"
1512 " (%zu out of %zu)\n",
1513 NumInferredFunctions
,
1514 100.0 * NumInferredFunctions
/ NumAllProfiledFunctions
,
1515 100.0 * NumInferredFunctions
/ NumAllStaleFunctions
,
1516 100.0 * InferredSampleCount
/ TotalSampleCount
, InferredSampleCount
,
1518 BC
.outs() << format(
1519 "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
1520 " (%zu out of %zu stale) responsible for %.2f%% samples"
1521 " (%zu out of %zu stale)\n",
1522 100.0 * BC
.Stats
.NumMatchedBlocks
/ BC
.Stats
.NumStaleBlocks
,
1523 BC
.Stats
.NumMatchedBlocks
, BC
.Stats
.NumStaleBlocks
,
1524 100.0 * BC
.Stats
.MatchedSampleCount
/ BC
.Stats
.StaleSampleCount
,
1525 BC
.Stats
.MatchedSampleCount
, BC
.Stats
.StaleSampleCount
);
1528 if (const uint64_t NumUnusedObjects
= BC
.getNumUnusedProfiledObjects()) {
1529 BC
.outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1530 << " objects was ignored\n";
1533 if (ProfiledFunctions
.size() > 10) {
1534 if (opts::Verbosity
>= 1) {
1535 BC
.outs() << "BOLT-INFO: top called functions are:\n";
1536 llvm::sort(ProfiledFunctions
,
1537 [](const BinaryFunction
*A
, const BinaryFunction
*B
) {
1538 return B
->getExecutionCount() < A
->getExecutionCount();
1540 auto SFI
= ProfiledFunctions
.begin();
1541 auto SFIend
= ProfiledFunctions
.end();
1542 for (unsigned I
= 0u; I
< opts::TopCalledLimit
&& SFI
!= SFIend
;
1544 BC
.outs() << " " << **SFI
<< " : " << (*SFI
)->getExecutionCount()
1549 if (!opts::PrintSortedBy
.empty()) {
1550 std::vector
<BinaryFunction
*> Functions
;
1551 std::map
<const BinaryFunction
*, DynoStats
> Stats
;
1553 for (auto &BFI
: BC
.getBinaryFunctions()) {
1554 BinaryFunction
&BF
= BFI
.second
;
1555 if (shouldOptimize(BF
) && BF
.hasValidProfile()) {
1556 Functions
.push_back(&BF
);
1557 Stats
.emplace(&BF
, getDynoStats(BF
));
1561 const bool SortAll
=
1562 llvm::is_contained(opts::PrintSortedBy
, DynoStats::LAST_DYNO_STAT
);
1564 const bool Ascending
=
1565 opts::DynoStatsSortOrderOpt
== opts::DynoStatsSortOrder::Ascending
;
1567 std::function
<bool(const DynoStats
&, const DynoStats
&)>
1568 DynoStatsComparator
=
1569 SortAll
? [](const DynoStats
&StatsA
,
1570 const DynoStats
&StatsB
) { return StatsA
< StatsB
; }
1571 : [](const DynoStats
&StatsA
, const DynoStats
&StatsB
) {
1572 return StatsA
.lessThan(StatsB
, opts::PrintSortedBy
);
1575 llvm::stable_sort(Functions
,
1576 [Ascending
, &Stats
, DynoStatsComparator
](
1577 const BinaryFunction
*A
, const BinaryFunction
*B
) {
1578 auto StatsItr
= Stats
.find(A
);
1579 assert(StatsItr
!= Stats
.end());
1580 const DynoStats
&StatsA
= StatsItr
->second
;
1582 StatsItr
= Stats
.find(B
);
1583 assert(StatsItr
!= Stats
.end());
1584 const DynoStats
&StatsB
= StatsItr
->second
;
1586 return Ascending
? DynoStatsComparator(StatsA
, StatsB
)
1587 : DynoStatsComparator(StatsB
, StatsA
);
1590 BC
.outs() << "BOLT-INFO: top functions sorted by ";
1592 BC
.outs() << "dyno stats";
1595 bool PrintComma
= false;
1596 for (const DynoStats::Category Category
: opts::PrintSortedBy
) {
1599 BC
.outs() << DynoStats::Description(Category
);
1605 BC
.outs() << " are:\n";
1606 auto SFI
= Functions
.begin();
1607 for (unsigned I
= 0; I
< 100 && SFI
!= Functions
.end(); ++SFI
, ++I
) {
1608 const DynoStats Stats
= getDynoStats(**SFI
);
1609 BC
.outs() << " " << **SFI
;
1612 bool PrintComma
= false;
1613 for (const DynoStats::Category Category
: opts::PrintSortedBy
) {
1616 BC
.outs() << dynoStatsOptName(Category
) << "=" << Stats
[Category
];
1625 if (!BC
.TrappedFunctions
.empty()) {
1626 BC
.errs() << "BOLT-WARNING: " << BC
.TrappedFunctions
.size() << " function"
1627 << (BC
.TrappedFunctions
.size() > 1 ? "s" : "")
1628 << " will trap on entry. Use -trap-avx512=0 to disable"
1630 if (opts::Verbosity
>= 1 || BC
.TrappedFunctions
.size() <= 5) {
1632 for (const BinaryFunction
*Function
: BC
.TrappedFunctions
)
1633 BC
.errs() << " " << *Function
<< '\n';
1635 BC
.errs() << " Use -v=1 to see the list.\n";
1639 // Collect and print information about suboptimal code layout on input.
1640 if (opts::ReportBadLayout
) {
1641 std::vector
<BinaryFunction
*> SuboptimalFuncs
;
1642 for (auto &BFI
: BC
.getBinaryFunctions()) {
1643 BinaryFunction
&BF
= BFI
.second
;
1644 if (!BF
.hasValidProfile())
1647 const uint64_t HotThreshold
=
1648 std::max
<uint64_t>(BF
.getKnownExecutionCount(), 1);
1649 bool HotSeen
= false;
1650 for (const BinaryBasicBlock
*BB
: BF
.getLayout().rblocks()) {
1651 if (!HotSeen
&& BB
->getKnownExecutionCount() > HotThreshold
) {
1655 if (HotSeen
&& BB
->getKnownExecutionCount() == 0) {
1656 SuboptimalFuncs
.push_back(&BF
);
1662 if (!SuboptimalFuncs
.empty()) {
1663 llvm::sort(SuboptimalFuncs
,
1664 [](const BinaryFunction
*A
, const BinaryFunction
*B
) {
1665 return A
->getKnownExecutionCount() / A
->getSize() >
1666 B
->getKnownExecutionCount() / B
->getSize();
1669 BC
.outs() << "BOLT-INFO: " << SuboptimalFuncs
.size()
1670 << " functions have "
1671 "cold code in the middle of hot code. Top functions are:\n";
1672 for (unsigned I
= 0;
1673 I
< std::min(static_cast<size_t>(opts::ReportBadLayout
),
1674 SuboptimalFuncs
.size());
1676 SuboptimalFuncs
[I
]->print(BC
.outs());
1680 if (NumUnknownControlFlowFunctions
) {
1681 BC
.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1682 << " functions have instructions with unknown control flow";
1683 if (!opts::PrintUnknown
)
1684 BC
.outs() << ". Use -print-unknown to see the list.";
1687 return Error::success();
1690 Error
InstructionLowering::runOnFunctions(BinaryContext
&BC
) {
1691 for (auto &BFI
: BC
.getBinaryFunctions())
1692 for (BinaryBasicBlock
&BB
: BFI
.second
)
1693 for (MCInst
&Instruction
: BB
)
1694 BC
.MIB
->lowerTailCall(Instruction
);
1695 return Error::success();
1698 Error
StripRepRet::runOnFunctions(BinaryContext
&BC
) {
1700 return Error::success();
1702 uint64_t NumPrefixesRemoved
= 0;
1703 uint64_t NumBytesSaved
= 0;
1704 for (auto &BFI
: BC
.getBinaryFunctions()) {
1705 for (BinaryBasicBlock
&BB
: BFI
.second
) {
1706 auto LastInstRIter
= BB
.getLastNonPseudo();
1707 if (LastInstRIter
== BB
.rend() || !BC
.MIB
->isReturn(*LastInstRIter
) ||
1708 !BC
.MIB
->deleteREPPrefix(*LastInstRIter
))
1711 NumPrefixesRemoved
+= BB
.getKnownExecutionCount();
1717 BC
.outs() << "BOLT-INFO: removed " << NumBytesSaved
1718 << " 'repz' prefixes"
1719 " with estimated execution count of "
1720 << NumPrefixesRemoved
<< " times.\n";
1721 return Error::success();
1724 Error
InlineMemcpy::runOnFunctions(BinaryContext
&BC
) {
1726 return Error::success();
1728 uint64_t NumInlined
= 0;
1729 uint64_t NumInlinedDyno
= 0;
1730 for (auto &BFI
: BC
.getBinaryFunctions()) {
1731 for (BinaryBasicBlock
&BB
: BFI
.second
) {
1732 for (auto II
= BB
.begin(); II
!= BB
.end(); ++II
) {
1735 if (!BC
.MIB
->isCall(Inst
) || MCPlus::getNumPrimeOperands(Inst
) != 1 ||
1736 !Inst
.getOperand(0).isExpr())
1739 const MCSymbol
*CalleeSymbol
= BC
.MIB
->getTargetSymbol(Inst
);
1740 if (CalleeSymbol
->getName() != "memcpy" &&
1741 CalleeSymbol
->getName() != "memcpy@PLT" &&
1742 CalleeSymbol
->getName() != "_memcpy8")
1745 const bool IsMemcpy8
= (CalleeSymbol
->getName() == "_memcpy8");
1746 const bool IsTailCall
= BC
.MIB
->isTailCall(Inst
);
1748 const InstructionListType NewCode
=
1749 BC
.MIB
->createInlineMemcpy(IsMemcpy8
);
1750 II
= BB
.replaceInstruction(II
, NewCode
);
1751 std::advance(II
, NewCode
.size() - 1);
1754 BC
.MIB
->createReturn(Return
);
1755 II
= BB
.insertInstruction(std::next(II
), std::move(Return
));
1759 NumInlinedDyno
+= BB
.getKnownExecutionCount();
1765 BC
.outs() << "BOLT-INFO: inlined " << NumInlined
<< " memcpy() calls";
1767 BC
.outs() << ". The calls were executed " << NumInlinedDyno
1768 << " times based on profile.";
1771 return Error::success();
1774 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction
&Function
) const {
1775 if (!BinaryFunctionPass::shouldOptimize(Function
))
1778 for (const std::string
&FunctionSpec
: Spec
) {
1779 StringRef FunctionName
= StringRef(FunctionSpec
).split(':').first
;
1780 if (Function
.hasNameRegex(FunctionName
))
1787 std::set
<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1788 const BinaryFunction
&Function
) const {
1789 StringRef SitesString
;
1790 for (const std::string
&FunctionSpec
: Spec
) {
1791 StringRef FunctionName
;
1792 std::tie(FunctionName
, SitesString
) = StringRef(FunctionSpec
).split(':');
1793 if (Function
.hasNameRegex(FunctionName
))
1798 std::set
<size_t> Sites
;
1799 SmallVector
<StringRef
, 4> SitesVec
;
1800 SitesString
.split(SitesVec
, ':');
1801 for (StringRef SiteString
: SitesVec
) {
1802 if (SiteString
.empty())
1805 if (!SiteString
.getAsInteger(10, Result
))
1806 Sites
.emplace(Result
);
1812 Error
SpecializeMemcpy1::runOnFunctions(BinaryContext
&BC
) {
1814 return Error::success();
1816 uint64_t NumSpecialized
= 0;
1817 uint64_t NumSpecializedDyno
= 0;
1818 for (auto &BFI
: BC
.getBinaryFunctions()) {
1819 BinaryFunction
&Function
= BFI
.second
;
1820 if (!shouldOptimize(Function
))
1823 std::set
<size_t> CallsToOptimize
= getCallSitesToOptimize(Function
);
1824 auto shouldOptimize
= [&](size_t N
) {
1825 return CallsToOptimize
.empty() || CallsToOptimize
.count(N
);
1828 std::vector
<BinaryBasicBlock
*> Blocks(Function
.pbegin(), Function
.pend());
1829 size_t CallSiteID
= 0;
1830 for (BinaryBasicBlock
*CurBB
: Blocks
) {
1831 for (auto II
= CurBB
->begin(); II
!= CurBB
->end(); ++II
) {
1834 if (!BC
.MIB
->isCall(Inst
) || MCPlus::getNumPrimeOperands(Inst
) != 1 ||
1835 !Inst
.getOperand(0).isExpr())
1838 const MCSymbol
*CalleeSymbol
= BC
.MIB
->getTargetSymbol(Inst
);
1839 if (CalleeSymbol
->getName() != "memcpy" &&
1840 CalleeSymbol
->getName() != "memcpy@PLT")
1843 if (BC
.MIB
->isTailCall(Inst
))
1848 if (!shouldOptimize(CallSiteID
))
1851 // Create a copy of a call to memcpy(dest, src, size).
1852 MCInst MemcpyInstr
= Inst
;
1854 BinaryBasicBlock
*OneByteMemcpyBB
= CurBB
->splitAt(II
);
1856 BinaryBasicBlock
*NextBB
= nullptr;
1857 if (OneByteMemcpyBB
->getNumNonPseudos() > 1) {
1858 NextBB
= OneByteMemcpyBB
->splitAt(OneByteMemcpyBB
->begin());
1859 NextBB
->eraseInstruction(NextBB
->begin());
1861 NextBB
= OneByteMemcpyBB
->getSuccessor();
1862 OneByteMemcpyBB
->eraseInstruction(OneByteMemcpyBB
->begin());
1863 assert(NextBB
&& "unexpected call to memcpy() with no return");
1866 BinaryBasicBlock
*MemcpyBB
= Function
.addBasicBlock();
1867 MemcpyBB
->setOffset(CurBB
->getInputOffset());
1868 InstructionListType CmpJCC
=
1869 BC
.MIB
->createCmpJE(BC
.MIB
->getIntArgRegister(2), 1,
1870 OneByteMemcpyBB
->getLabel(), BC
.Ctx
.get());
1871 CurBB
->addInstructions(CmpJCC
);
1872 CurBB
->addSuccessor(MemcpyBB
);
1874 MemcpyBB
->addInstruction(std::move(MemcpyInstr
));
1875 MemcpyBB
->addSuccessor(NextBB
);
1876 MemcpyBB
->setCFIState(NextBB
->getCFIState());
1877 MemcpyBB
->setExecutionCount(0);
1879 // To prevent the actual call from being moved to cold, we set its
1880 // execution count to 1.
1881 if (CurBB
->getKnownExecutionCount() > 0)
1882 MemcpyBB
->setExecutionCount(1);
1884 InstructionListType OneByteMemcpy
= BC
.MIB
->createOneByteMemcpy();
1885 OneByteMemcpyBB
->addInstructions(OneByteMemcpy
);
1888 NumSpecializedDyno
+= CurBB
->getKnownExecutionCount();
1892 // Note: we don't expect the next instruction to be a call to memcpy.
1893 II
= CurBB
->begin();
1898 if (NumSpecialized
) {
1899 BC
.outs() << "BOLT-INFO: specialized " << NumSpecialized
1900 << " memcpy() call sites for size 1";
1901 if (NumSpecializedDyno
)
1902 BC
.outs() << ". The calls were executed " << NumSpecializedDyno
1903 << " times based on profile.";
1906 return Error::success();
1909 void RemoveNops::runOnFunction(BinaryFunction
&BF
) {
1910 const BinaryContext
&BC
= BF
.getBinaryContext();
1911 for (BinaryBasicBlock
&BB
: BF
) {
1912 for (int64_t I
= BB
.size() - 1; I
>= 0; --I
) {
1913 MCInst
&Inst
= BB
.getInstructionAtIndex(I
);
1914 if (BC
.MIB
->isNoop(Inst
) && BC
.MIB
->hasAnnotation(Inst
, "NOP"))
1915 BB
.eraseInstructionAtIndex(I
);
1920 Error
RemoveNops::runOnFunctions(BinaryContext
&BC
) {
1921 ParallelUtilities::WorkFuncTy WorkFun
= [&](BinaryFunction
&BF
) {
1925 ParallelUtilities::PredicateTy SkipFunc
= [&](const BinaryFunction
&BF
) {
1926 return BF
.shouldPreserveNops();
1929 ParallelUtilities::runOnEachFunction(
1930 BC
, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR
, WorkFun
,
1931 SkipFunc
, "RemoveNops");
1932 return Error::success();