//===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements multiple passes for binary optimization and analysis.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <mutex>
#include <numeric>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

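// Return the name of the command-line option key for a dyno stats category:
// the category's description with spaces replaced by dashes. Results are
// cached in a static table so the returned pointer stays valid.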
static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
  assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
         C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");

  static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];

  OptNames[C] = bolt::DynoStats::Description(C);

  std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');

  return OptNames[C].c_str();
}

namespace opts {

extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));

bool isHotTextMover(const BinaryFunction &Function) {
  for (std::string &SectionName : opts::HotTextMoveSections) {
    if (Function.getOriginSectionName() &&
        *Function.getOriginSectionName() == SectionName)
      return true;
  }

  return false;
}

static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(0), cl::cat(BoltOptCategory));

static cl::list<bolt::DynoStats::Category>
    PrintSortedBy("print-sorted-by", cl::CommaSeparated,
                  cl::desc("print functions sorted by order of dyno stats"),
                  cl::value_desc("key1,key2,key3,..."),
                  cl::values(
#define D(name, description, ...)                                             \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),  \
             description),
                      REAL_DYNO_STATS
#undef D
                      clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                                 "sorted by all names")),
                  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
    PrintUnknown("print-unknown",
                 cl::desc("print names of functions with unknown control flow"),
                 cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
    PrintUnknownCFG("print-unknown-cfg",
                    cl::desc("dump CFG of functions with unknown control flow"),
                    cl::cat(BoltCategory), cl::ReallyHidden);

// Please MSVC19 with a forward declaration: otherwise it reports an error about
// an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
StaleThreshold("stale-threshold",
    cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
    cl::init(100),
    cl::Hidden,
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

} // namespace opts

namespace llvm {
namespace bolt {

bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
  return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
         !BF.isIgnored();
}

bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
  return BF.isSimple() && !BF.isIgnored();
}

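// Remove empty basic blocks from the function's CFG: dangling empty blocks
// are marked for removal, while removable empty blocks have their
// predecessors redirected to the single successor. Any duplicate conditional
// edges left behind are merged by a final call to fixBranches().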
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      if (Predecessor->hasJumpTable())
        break;

      if (Predecessor == Successor)
        break;

      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}

void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { runOnFunction(BF); },
      [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
      "NormalizeCFG");
  if (NumBlocksRemoved)
    outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
           << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
  if (NumDuplicateEdgesMerged)
    outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
           << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
           << '\n';
}

void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  if (!Function.getLayout().block_empty()) {
    unsigned Count;
    uint64_t Bytes;
    Function.markUnreachableBlocks();
    LLVM_DEBUG({
      for (BinaryBasicBlock &BB : Function) {
        if (!BB.isValid()) {
          dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
                 << " in function " << Function << "\n";
          Function.dump();
        }
      }
    });
    std::tie(Count, Bytes) = Function.eraseInvalidBBs();
    DeletedBlocks += Count;
    DeletedBytes += Bytes;
    if (Count) {
      Modified.insert(&Function);
      if (opts::Verbosity > 0)
        outs() << "BOLT-INFO: removed " << Count
               << " dead basic block(s) accounting for " << Bytes
               << " bytes in function " << Function << '\n';
    }
  }
}

void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function))
      runOnFunction(Function);
  }

  if (DeletedBlocks)
    outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
           << DeletedBytes << " bytes of code\n";
}

bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
  return (BinaryFunctionPass::shouldPrint(BF) &&
          opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
}

bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return;

  std::atomic_uint64_t ModifiedFuncCount(0);
  std::mutex FunctionEditDistanceMutex;
  DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;

  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
    if (opts::PrintFuncStat > 0)
      llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));

    const bool LayoutChanged =
        modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
    if (LayoutChanged) {
      ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
      if (opts::PrintFuncStat > 0) {
        const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
        std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
        FunctionEditDistance[&BF] = Distance;
      }
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "ReorderBasicBlocks");
  const size_t NumAllProfiledFunctions =
      BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;

  outs() << "BOLT-INFO: basic block reordering modified layout of "
         << format("%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
                   ModifiedFuncCount.load(std::memory_order_relaxed),
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       NumAllProfiledFunctions,
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       BC.getBinaryFunctions().size());

  if (opts::PrintFuncStat > 0) {
    raw_ostream &OS = outs();
    // Copy all the values into vector in order to sort them
    std::map<uint64_t, BinaryFunction &> ScoreMap;
    auto &BFs = BC.getBinaryFunctions();
    for (auto It = BFs.begin(); It != BFs.end(); ++It)
      ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
          It->second.getFunctionScore(), It->second));

    OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
    OS << " There are " << BFs.size() << " functions in total. \n";
    OS << " Number of functions being modified: "
       << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
    OS << " User asks for detailed information on top "
       << opts::PrintFuncStat << " functions. (Ranked by function score)"
       << "\n\n";
    uint64_t I = 0;
    for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
             ScoreMap.rbegin();
         Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
      BinaryFunction &Function = Rit->second;

      OS << " Information for function of top: " << (I + 1) << ": \n";
      OS << " Function Score is: " << Function.getFunctionScore()
         << "\n";
      OS << " There are " << Function.size()
         << " number of blocks in this function.\n";
      OS << " There are " << Function.getInstructionCount()
         << " number of instructions in this function.\n";
      OS << " The edit distance for this function is: "
         << FunctionEditDistance.lookup(&Function) << "\n\n";
    }
  }
}

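// Apply the reordering algorithm selected by Type to the function's basic
// blocks and return true if the resulting layout differs from the old one.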
bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
                                              LayoutType Type,
                                              bool MinBranchClusters) const {
  if (BF.size() == 0 || Type == LT_NONE)
    return false;

  BinaryFunction::BasicBlockOrderType NewLayout;
  std::unique_ptr<ReorderAlgorithm> Algo;

  // Cannot do optimal layout without profile.
  if (Type != LT_REVERSE && !BF.hasValidProfile())
    return false;

  if (Type == LT_REVERSE) {
    Algo.reset(new ReverseReorderAlgorithm());
  } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
    // Work on optimal solution if problem is small enough
    LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
    Algo.reset(new TSPReorderAlgorithm());
  } else {
    LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");

    std::unique_ptr<ClusterAlgorithm> CAlgo;
    if (MinBranchClusters)
      CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
    else
      CAlgo.reset(new PHGreedyClusterAlgorithm());

    switch (Type) {
    case LT_OPTIMIZE:
      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_BRANCH:
      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_CACHE:
      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
      break;

    case LT_OPTIMIZE_EXT_TSP:
      Algo.reset(new ExtTSPReorderAlgorithm());
      break;

    case LT_OPTIMIZE_SHUFFLE:
      Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
      break;

    default:
      llvm_unreachable("unexpected layout type");
    }
  }

  Algo->reorderBasicBlocks(BF, NewLayout);

  return BF.getLayout().update(NewLayout);
}

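// Re-synchronize branch instructions with the CFG of every simple function
// that will be emitted.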
void FixupBranches::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (!BC.shouldEmit(Function) || !Function.isSimple())
      continue;

    Function.fixBranches();
  }
}

void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    if (!BF.finalizeCFIState()) {
      if (BC.HasRelocations) {
        errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
               << ". Exiting.\n";
        exit(1);
      }
      BF.setSimple(false);
      return;
    }

    BF.setFinalized();

    // Update exception handling information.
    BF.updateEHRanges();
  };

  ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
    return !BC.shouldEmit(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
      SkipPredicate, "FinalizeFunctions");
}

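// In non-relocation mode with debug info updates enabled, mark functions
// whose emitted code would not fit into the original function slot as
// non-simple, so that incorrect debug info is not emitted for them.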
void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
  if (BC.HasRelocations)
    return;

  if (!opts::UpdateDebugSections)
    return;

  // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
  // incorrect debug info.
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    uint64_t HotSize, ColdSize;
    std::tie(HotSize, ColdSize) =
        BC.calculateEmittedSize(BF, /*FixBranches=*/false);
    if (HotSize > BF.getMaxSize())
      BF.setSimple(false);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "CheckLargeFunctions");
}

bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Unlike other passes, allow functions in non-CFG state.
  return BF.isSimple() && !BF.isIgnored();
}

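// Convert GnuArgsSize annotations into CFI instructions and strip all MCPlus
// annotations, preserving only the instruction offsets and labels that are
// still needed during code emission.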
void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
  std::vector<std::pair<MCInst *, MCSymbol *>> PreservedLabelAnnotations;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &BF = It.second;

    for (FunctionFragment &FF : BF.getLayout().fragments()) {
      int64_t CurrentGnuArgsSize = 0;

      for (BinaryBasicBlock *const BB : FF) {
        // First convert GnuArgsSize annotations into CFIs. This may change
        // instr pointers, so do it before recording ptrs for preserved
        // annotations
        if (BF.usesGnuArgsSize()) {
          for (auto II = BB->begin(); II != BB->end(); ++II) {
            if (!BC.MIB->isInvoke(*II))
              continue;
            const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
            assert(NewGnuArgsSize >= 0 &&
                   "expected non-negative GNU_args_size");
            if (NewGnuArgsSize != CurrentGnuArgsSize) {
              auto InsertII = BF.addCFIInstruction(
                  BB, II,
                  MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
              CurrentGnuArgsSize = NewGnuArgsSize;
              II = std::next(InsertII);
            }
          }
        }

        // Now record preserved annotations separately and then strip
        // annotations.
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II))
            PreservedOffsetAnnotations.emplace_back(&(*II),
                                                    *BC.MIB->getOffset(*II));
          if (auto Label = BC.MIB->getLabel(*II))
            PreservedLabelAnnotations.emplace_back(&*II, *Label);
          BC.MIB->stripAnnotations(*II);
        }
      }
    }
  }
  for (BinaryFunction *BF : BC.getInjectedBinaryFunctions())
    for (BinaryBasicBlock &BB : *BF)
      for (MCInst &Instruction : BB) {
        if (auto Label = BC.MIB->getLabel(Instruction))
          PreservedLabelAnnotations.emplace_back(&Instruction, *Label);
        BC.MIB->stripAnnotations(Instruction);
      }

  // Release all memory taken by annotations
  BC.MIB->freeAnnotations();

  // Reinsert preserved annotations we need during code emission.
  for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
    BC.MIB->setOffset(*Item.first, Item.second);
  for (auto [Instr, Label] : PreservedLabelAnnotations)
    BC.MIB->setLabel(*Instr, Label);
}

// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
void CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.second;
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
}

// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
// B0: ...
//     jmp B1   (or jcc B1)
//
// B1: jmp B2
//
// ->
//
// B0: ...
//     jmp B2   (or jcc B2)
//
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}

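// Decide whether the conditional branch in PredBB should be rewritten into a
// conditional tail call, depending on the SCTC mode and, in heuristic mode,
// the observed taken frequency of the branch.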
bool SimplifyConditionalTailCalls::shouldRewriteBranch(
    const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
    const BinaryBasicBlock *BB, const bool DirectionFlag) {
  if (BeenOptimized.count(PredBB))
    return false;

  const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);

  if (IsForward)
    ++NumOrigForwardBranches;
  else
    ++NumOrigBackwardBranches;

  if (opts::SctcMode == opts::SctcAlways)
    return true;

  if (opts::SctcMode == opts::SctcPreserveDirection)
    return IsForward == DirectionFlag;

  const ErrorOr<std::pair<double, double>> Frequency =
      PredBB->getBranchStats(BB);

  // It's ok to rewrite the conditional branch if the new target will be
  // a backward branch.
  //
  // If no data available for these branches, then it should be ok to
  // do the optimization since it will reduce code size.
  if (Frequency.getError())
    return true;

  // TODO: should this use misprediction frequency instead?
  const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
                      (!IsForward && Frequency.get().first <= 0.5);

  return Result == DirectionFlag;
}

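// Rewrite conditional branches that target blocks containing only a direct
// tail call into conditional tail calls, queueing any unconditional-branch
// fixups this requires. Returns a non-zero value if any calls were created.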
uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid. We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add a unconditional
        // branch to the old target. This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(PredBB, CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc
      MIB->setConditionalTailCall(*CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(BB);
      // Update BB execution count
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block. Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(PredBB, false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    if (UncondBranch) {
      if (HasFallthrough)
        PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
      else
        MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(BF, true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n";);

  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}

void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;

    if (!shouldOptimize(Function))
      continue;

    if (fixTailCalls(Function)) {
      Modified.insert(&Function);
      Function.setHasCanonicalCFG(false);
    }
  }

  if (NumTailCallsPatched)
    outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
           << " tail calls (" << NumOrigForwardBranches << " forward)"
           << " tail calls (" << NumOrigBackwardBranches << " backward)"
           << " from a total of " << NumCandidateTailCalls << " while removing "
           << NumDoubleJumps << " double jumps"
           << " and removing " << DeletedBlocks << " basic blocks"
           << " totalling " << DeletedBytes
           << " bytes of code. CTCs total execution count is " << CTCExecCount
           << " and the number of times CTCs are taken is " << CTCTakenCount
           << "\n";
}

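// Replace instructions with equivalent, shorter encodings where the target
// supports it and return the number of instructions shortened.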
uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
  uint64_t Count = 0;
  const BinaryContext &BC = Function.getBinaryContext();
  for (BinaryBasicBlock &BB : Function) {
    for (MCInst &Inst : BB) {
      MCInst OriginalInst;
      if (opts::Verbosity > 2)
        OriginalInst = Inst;

      if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
        continue;

      if (opts::Verbosity > 2) {
        BC.scopeLock();
        outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
        BC.printInstruction(outs(), OriginalInst, 0, &Function);
        outs() << "BOLT-INFO: to:";
        BC.printInstruction(outs(), Inst, 0, &Function);
      }

      ++Count;
    }
  }

  return Count;
}

void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
  std::atomic<uint64_t> NumShortened{0};
  if (!BC.isX86())
    return;

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
      [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
      nullptr, "ShortenInstructions");

  if (NumShortened)
    outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
}

void Peepholes::addTailcallTraps(BinaryFunction &Function) {
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    MCInst *Inst = BB.getLastNonPseudoInstr();
    if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
      MCInst Trap;
      if (MIB->createTrap(Trap)) {
        BB.addInstruction(Trap);
        ++TailCallTraps;
      }
    }
  }
}

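// Remove conditional branches whose taken and fall-through successors are
// the same basic block.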
void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
  for (BinaryBasicBlock &BB : Function) {
    if (BB.succ_size() != 2)
      continue;

    BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
    BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
    if (CondBB != UncondBB)
      continue;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // analyzeBranch() can fail due to unusual branch instructions,
    // e.g. jrcxz, or jump tables (indirect jump).
    if (!Result || !CondBranch)
      continue;

    BB.removeDuplicateConditionalSuccessor(CondBranch);
    ++NumUselessCondBranches;
  }
}

void Peepholes::runOnFunctions(BinaryContext &BC) {
  const char Opts =
      std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
                      [](const char A, const PeepholeOpts B) { return A | B; });
  if (Opts == PEEP_NONE)
    return;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function)) {
      if (Opts & PEEP_DOUBLE_JUMPS)
        NumDoubleJumps += fixDoubleJumps(Function, false);
      if (Opts & PEEP_TAILCALL_TRAPS)
        addTailcallTraps(Function);
      if (Opts & PEEP_USELESS_BRANCHES)
        removeUselessCondBranches(Function);
      assert(Function.validateCFG());
    }
  }
  outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
         << " double jumps patched.\n"
         << "BOLT-INFO: Peephole: " << TailCallTraps
         << " tail call traps inserted.\n"
         << "BOLT-INFO: Peephole: " << NumUselessCondBranches
         << " useless conditional branches removed.\n";
}

bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address.
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      if (BC.getRelocationAt(TargetAddress) ||
          BC.getDynamicRelocationAt(TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}

void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (shouldOptimize(Function) && simplifyRODataLoads(Function))
      Modified.insert(&Function);
  }

  outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
         << NumLoadsFound << " loads from a statically computed address.\n"
         << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
         << "\n"
         << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
}

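// Assign output code sections: injected functions get dedicated sections,
// hot-text movers go to their own section, and, in relocation mode, functions
// without a valid layout index are placed in the cold section.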
void AssignSections::runOnFunctions(BinaryContext &BC) {
  for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
    Function->setCodeSectionName(BC.getInjectedCodeSectionName());
    Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
  }

  // In non-relocation mode functions have pre-assigned section names.
  if (!BC.HasRelocations)
    return;

  const bool UseColdSection =
      BC.NumProfiledFuncs > 0 ||
      opts::ReorderFunctions == ReorderFunctions::RT_USER;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (opts::isHotTextMover(Function)) {
      Function.setCodeSectionName(BC.getHotTextMoverSectionName());
      Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
      continue;
    }

    if (!UseColdSection || Function.hasValidIndex())
      Function.setCodeSectionName(BC.getMainCodeSectionName());
    else
      Function.setCodeSectionName(BC.getColdCodeSectionName());

    if (Function.isSplit())
      Function.setColdCodeSectionName(BC.getColdCodeSectionName());
  }
}

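// Compute a profile "bias" score for the binary: for each sufficiently hot
// non-entry block, compare total incoming against total outgoing edge
// frequencies, then report the mean and standard deviation of the relative
// difference across all considered blocks.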
void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report to user
  outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                   (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
           << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
}

void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumRegularFunctions = 0;
  uint64_t NumStaleProfileFunctions = 0;
  uint64_t NumAllStaleFunctions = 0;
  uint64_t NumInferredFunctions = 0;
  uint64_t NumNonSimpleProfiledFunctions = 0;
  uint64_t NumUnknownControlFlowFunctions = 0;
  uint64_t TotalSampleCount = 0;
  uint64_t StaleSampleCount = 0;
  uint64_t InferredSampleCount = 0;
  std::vector<const BinaryFunction *> ProfiledFunctions;
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;

    // Ignore PLT functions for stats.
    if (Function.isPLTFunction())
      continue;

    ++NumRegularFunctions;

    if (!Function.isSimple()) {
      if (Function.hasProfile())
        ++NumNonSimpleProfiledFunctions;
      continue;
    }

    if (Function.hasUnknownControlFlow()) {
      if (opts::PrintUnknownCFG)
        Function.dump();
      else if (opts::PrintUnknown)
        errs() << "function with unknown control flow: " << Function << '\n';

      ++NumUnknownControlFlowFunctions;
    }

    if (!Function.hasProfile())
      continue;

    uint64_t SampleCount = Function.getRawBranchCount();
    TotalSampleCount += SampleCount;

    if (Function.hasValidProfile()) {
      ProfiledFunctions.push_back(&Function);
      if (Function.hasInferredProfile()) {
        ++NumInferredFunctions;
        InferredSampleCount += SampleCount;
        ++NumAllStaleFunctions;
      }
    } else {
      if (opts::ReportStaleFuncs) {
        outs() << StaleFuncsHeader;
        StaleFuncsHeader = "";
        outs() << " " << Function << '\n';
      }
      ++NumStaleProfileFunctions;
      StaleSampleCount += SampleCount;
      ++NumAllStaleFunctions;
    }
  }
  BC.NumProfiledFuncs = ProfiledFunctions.size();
  BC.NumStaleProfileFuncs = NumStaleProfileFunctions;

  const size_t NumAllProfiledFunctions =
      ProfiledFunctions.size() + NumStaleProfileFunctions;
  outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
         << NumRegularFunctions << " functions in the binary ("
         << format("%.1f", NumAllProfiledFunctions /
                               (float)NumRegularFunctions * 100.0f)
         << "%) have non-empty execution profile\n";
  if (NumNonSimpleProfiledFunctions) {
    outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
           << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
           << " with profile could not be optimized\n";
  }
  if (NumStaleProfileFunctions) {
    const float PctStale =
        NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f;
    auto printErrorOrWarning = [&]() {
      if (PctStale > opts::StaleThreshold)
        errs() << "BOLT-ERROR: ";
      else
        errs() << "BOLT-WARNING: ";
    };
    printErrorOrWarning();
    errs() << NumStaleProfileFunctions
           << format(" (%.1f%% of all profiled)", PctStale) << " function"
           << (NumStaleProfileFunctions == 1 ? "" : "s")
           << " have invalid (possibly stale) profile."
              " Use -report-stale to see the list.\n";
    if (TotalSampleCount > 0) {
      printErrorOrWarning();
      errs() << StaleSampleCount << " out of " << TotalSampleCount
             << " samples in the binary ("
             << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount))
             << "%) belong to functions with invalid"
                " (possibly stale) profile.\n";
    }
    if (PctStale > opts::StaleThreshold) {
      errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
             << opts::StaleThreshold << "%. Exiting.\n";
      exit(1);
    }
  }
  if (NumInferredFunctions) {
    outs() << format("BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
                     "%.2f%% of stale) functions responsible for %.2f%% samples"
                     " (%zu out of %zu)\n",
                     NumInferredFunctions,
                     100.0 * NumInferredFunctions / NumAllProfiledFunctions,
                     100.0 * NumInferredFunctions / NumAllStaleFunctions,
                     100.0 * InferredSampleCount / TotalSampleCount,
                     InferredSampleCount, TotalSampleCount);
    outs() << format(
        "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
        " (%zu out of %zu stale) responsible for %.2f%% samples"
        " (%zu out of %zu stale)\n",
        100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
        BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
        100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
        BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
  }

  if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
    outs() << "BOLT-INFO: profile for " << NumUnusedObjects
           << " objects was ignored\n";
  }

  if (ProfiledFunctions.size() > 10) {
    if (opts::Verbosity >= 1) {
      outs() << "BOLT-INFO: top called functions are:\n";
      llvm::sort(ProfiledFunctions,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return B->getExecutionCount() < A->getExecutionCount();
                 });
      auto SFI = ProfiledFunctions.begin();
      auto SFIend = ProfiledFunctions.end();
      for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
           ++SFI, ++I)
        outs() << " " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
    }
  }

  if (!opts::PrintSortedBy.empty()) {
    std::vector<BinaryFunction *> Functions;
    std::map<const BinaryFunction *, DynoStats> Stats;

    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (shouldOptimize(BF) && BF.hasValidProfile()) {
        Functions.push_back(&BF);
        Stats.emplace(&BF, getDynoStats(BF));
      }
    }

    const bool SortAll =
        llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);

    const bool Ascending =
        opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;

    if (SortAll) {
      llvm::stable_sort(Functions,
                        [Ascending, &Stats](const BinaryFunction *A,
                                            const BinaryFunction *B) {
                          return Ascending ? Stats.at(A) < Stats.at(B)
                                           : Stats.at(B) < Stats.at(A);
                        });
    } else {
      llvm::stable_sort(
          Functions, [Ascending, &Stats](const BinaryFunction *A,
                                         const BinaryFunction *B) {
            const DynoStats &StatsA = Stats.at(A);
            const DynoStats &StatsB = Stats.at(B);
            return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
                             : StatsB.lessThan(StatsA, opts::PrintSortedBy);
          });
    }

    outs() << "BOLT-INFO: top functions sorted by ";
    if (SortAll) {
      outs() << "dyno stats";
    } else {
      outs() << "(";
      bool PrintComma = false;
      for (const DynoStats::Category Category : opts::PrintSortedBy) {
        if (PrintComma)
          outs() << ", ";
        outs() << DynoStats::Description(Category);
        PrintComma = true;
      }
      outs() << ")";
    }

    outs() << " are:\n";
    auto SFI = Functions.begin();
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
      const DynoStats Stats = getDynoStats(**SFI);
      outs() << " " << **SFI;
      if (!SortAll) {
        outs() << " (";
        bool PrintComma = false;
        for (const DynoStats::Category Category : opts::PrintSortedBy) {
          if (PrintComma)
            outs() << ", ";
          outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
          PrintComma = true;
        }
        outs() << ")";
      }
      outs() << "\n";
    }
  }

  if (!BC.TrappedFunctions.empty()) {
    errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
           << (BC.TrappedFunctions.size() > 1 ? "s" : "")
           << " will trap on entry. Use -trap-avx512=0 to disable"
              " traps.";
    if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
      errs() << '\n';
      for (const BinaryFunction *Function : BC.TrappedFunctions)
        errs() << " " << *Function << '\n';
    } else {
      errs() << " Use -v=1 to see the list.\n";
    }
  }

  // Print information on missed macro-fusion opportunities seen on input.
  if (BC.Stats.MissedMacroFusionPairs) {
    outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)"
                     " opportunities for macro-fusion optimization",
                     BC.Stats.MissedMacroFusionPairs,
                     BC.Stats.MissedMacroFusionExecCount);
    switch (opts::AlignMacroOpFusion) {
    case MFT_NONE:
      outs() << ". Use -align-macro-fusion to fix.\n";
      break;
    case MFT_HOT:
      outs() << ". Will fix instances on a hot path.\n";
      break;
    case MFT_ALL:
      outs() << " that are going to be fixed\n";
      break;
    }
  }

  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(&BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      llvm::sort(SuboptimalFuncs,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });

      outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
             << " functions have "
                "cold code in the middle of hot code. Top functions are:\n";
      for (unsigned I = 0;
           I < std::min(static_cast<size_t>(opts::ReportBadLayout),
                        SuboptimalFuncs.size());
           ++I)
        SuboptimalFuncs[I]->print(outs());
    }
  }

  if (NumUnknownControlFlowFunctions) {
    outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
           << " functions have instructions with unknown control flow";
    if (!opts::PrintUnknown)
      outs() << ". Use -print-unknown to see the list.";
    outs() << '\n';
  }
}

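// Lower tail-call instructions in every function prior to emission.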
void InstructionLowering::runOnFunctions(BinaryContext &BC) {
  for (auto &BFI : BC.getBinaryFunctions())
    for (BinaryBasicBlock &BB : BFI.second)
      for (MCInst &Instruction : BB)
        BC.MIB->lowerTailCall(Instruction);
}

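// Remove 'repz' prefixes from return instructions on x86, reporting the
// bytes saved and the estimated dynamic count of the affected returns.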
void StripRepRet::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumPrefixesRemoved = 0;
  uint64_t NumBytesSaved = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      auto LastInstRIter = BB.getLastNonPseudo();
      if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
          !BC.MIB->deleteREPPrefix(*LastInstRIter))
        continue;

      NumPrefixesRemoved += BB.getKnownExecutionCount();
      ++NumBytesSaved;
    }
  }

  if (NumBytesSaved)
    outs() << "BOLT-INFO: removed " << NumBytesSaved
           << " 'repz' prefixes"
              " with estimated execution count of "
           << NumPrefixesRemoved << " times.\n";
}

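// Replace direct calls to memcpy() (and to the _memcpy8 variant) with an
// inline instruction sequence, appending a return when the original call
// was a tail call.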
void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumInlined = 0;
  uint64_t NumInlinedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8")
          continue;

        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(IsMemcpy8);
        II = BB.replaceInstruction(II, NewCode);
        std::advance(II, NewCode.size() - 1);
        if (IsTailCall) {
          MCInst Return;
          BC.MIB->createReturn(Return);
          II = BB.insertInstruction(std::next(II), std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
    if (NumInlinedDyno)
      outs() << ". The calls were executed " << NumInlinedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}

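// Only specialize functions that match an entry in the user-provided Spec
// list, in addition to the base profitability checks.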
bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
  if (!BinaryFunctionPass::shouldOptimize(Function))
    return false;

  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
    if (Function.hasNameRegex(FunctionName))
      return true;
  }

  return false;
}

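// Parse the Spec entry matching this function ("name:site1:site2:...") and
// return the set of call-site IDs to specialize; an empty set means all
// memcpy() call sites in the function.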
std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
    const BinaryFunction &Function) const {
  StringRef SitesString;
  for (const std::string &FunctionSpec : Spec) {
    StringRef FunctionName;
    std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
    if (Function.hasNameRegex(FunctionName))
      break;
    SitesString = "";
  }

  std::set<size_t> Sites;
  SmallVector<StringRef, 4> SitesVec;
  SitesString.split(SitesVec, ':');
  for (StringRef SiteString : SitesVec) {
    if (SiteString.empty())
      continue;
    size_t Result;
    if (!SiteString.getAsInteger(10, Result))
      Sites.emplace(Result);
  }

  return Sites;
}

void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return;

  uint64_t NumSpecialized = 0;
  uint64_t NumSpecializedDyno = 0;
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        if (BC.MIB->isTailCall(Inst))
          continue;

        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
                                OneByteMemcpyBB->getLabel(), BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    outs() << "BOLT-INFO: specialized " << NumSpecialized
           << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      outs() << ". The calls were executed " << NumSpecializedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}

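// Delete NOP instructions that carry a "NOP" annotation, iterating each
// block backwards so that instruction indices remain valid during removal.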
void RemoveNops::runOnFunction(BinaryFunction &BF) {
  const BinaryContext &BC = BF.getBinaryContext();
  for (BinaryBasicBlock &BB : BF) {
    for (int64_t I = BB.size() - 1; I >= 0; --I) {
      MCInst &Inst = BB.getInstructionAtIndex(I);
      if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
        BB.eraseInstructionAtIndex(I);
    }
  }
}

void RemoveNops::runOnFunctions(BinaryContext &BC) {
  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    runOnFunction(BF);
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return BF.shouldPreserveNops();
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
      SkipFunc, "RemoveNops");
}

} // namespace bolt
} // namespace llvm