Bump version to 19.1.0 (final)
[llvm-project.git] / bolt / lib / Passes / BinaryPasses.cpp
blobfa95ad7324ac1cda944c386df94deabe72dbb007
1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
11 //===----------------------------------------------------------------------===//
#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <memory>
#include <mutex>
#include <numeric>
#include <vector>
24 #define DEBUG_TYPE "bolt-opts"
26 using namespace llvm;
27 using namespace bolt;
29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30 assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
31 C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");
33 static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
35 OptNames[C] = bolt::DynoStats::Description(C);
37 std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
39 return OptNames[C].c_str();
namespace opts {

// Option categories and options declared elsewhere in BOLT and shared with
// the passes in this file.
extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

// Sort direction used by -print-sorted-by output.
enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));
74 bool isHotTextMover(const BinaryFunction &Function) {
75 for (std::string &SectionName : opts::HotTextMoveSections) {
76 if (Function.getOriginSectionName() &&
77 *Function.getOriginSectionName() == SectionName)
78 return true;
81 return false;
static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

// Which peephole sub-optimizations to run; see Peepholes pass.
static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(0), cl::cat(BoltOptCategory));

static cl::opt<bool> PrintLargeFunctions(
    "print-large-functions",
    cl::desc("print functions that could not be overwritten due to excessive "
             "size"),
    cl::init(false), cl::cat(BoltOptCategory));

// One enum value per dyno-stat category, generated via the D() macro from
// REAL_DYNO_STATS, plus "all".
static cl::list<bolt::DynoStats::Category>
    PrintSortedBy("print-sorted-by", cl::CommaSeparated,
                  cl::desc("print functions sorted by order of dyno stats"),
                  cl::value_desc("key1,key2,key3,..."),
                  cl::values(
#define D(name, description, ...)                                              \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
             description),
                      REAL_DYNO_STATS
#undef D
                      clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                                 "sorted by all names")),
                  cl::ZeroOrMore, cl::cat(BoltOptCategory));

static cl::opt<bool>
    PrintUnknown("print-unknown",
                 cl::desc("print names of functions with unknown control flow"),
                 cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
    PrintUnknownCFG("print-unknown-cfg",
                    cl::desc("dump CFG of functions with unknown control flow"),
                    cl::cat(BoltCategory), cl::ReallyHidden);

// Please MSVC19 with a forward declaration: otherwise it reports an error about
// an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    // "cache+" is a deprecated alias for "ext-tsp": warn and remap here so the
    // rest of the code only ever sees LT_OPTIMIZE_EXT_TSP.
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));

static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

// Policy for SimplifyConditionalTailCalls; see shouldRewriteBranch().
enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
             clEnumValN(SctcPreserveDirection,
                        "preserve",
                        "only perform sctc when branch direction is "
                        "preserved"),
             clEnumValN(SctcHeuristic,
                        "heuristic",
                        "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned>
StaleThreshold("stale-threshold",
  cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
  cl::init(100),
  cl::Hidden,
  cl::cat(BoltOptCategory));

static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

} // namespace opts
228 namespace llvm {
229 namespace bolt {
231 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
232 return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
233 !BF.isIgnored();
236 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
237 return BF.isSimple() && !BF.isIgnored();
// Normalize the CFG of a single function: remove empty basic blocks that
// carry no semantics (redirecting their predecessors to their unique
// successor) and count duplicate edges so fixBranches() can merge them.
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  for (BinaryBasicBlock &BB : BF) {
    // Only empty, non-entry, non-landing-pad blocks are candidates.
    if (!BB.empty())
      continue;

    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block. Stop early if a
    // predecessor uses a jump table (we cannot retarget it here) or already
    // branches to the successor.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      if (Predecessor->hasJumpTable())
        break;

      if (Predecessor == Successor)
        break;

      // Carry the edge profile over to the redirected edge.
      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    // The block is dead only if every predecessor was redirected above.
    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}
309 Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
310 ParallelUtilities::runOnEachFunction(
311 BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
312 [&](BinaryFunction &BF) { runOnFunction(BF); },
313 [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
314 "NormalizeCFG");
315 if (NumBlocksRemoved)
316 BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
317 << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
318 if (NumDuplicateEdgesMerged)
319 BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
320 << " duplicate CFG edge"
321 << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
322 return Error::success();
// Remove basic blocks unreachable from any entry point of \p Function and
// account for the number of blocks/bytes deleted. Thread-safe: the shared
// Modified set and the output stream are guarded by the context lock.
void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
  BinaryContext &BC = Function.getBinaryContext();
  unsigned Count;
  uint64_t Bytes;
  Function.markUnreachableBlocks();
  LLVM_DEBUG({
    for (BinaryBasicBlock &BB : Function) {
      if (!BB.isValid()) {
        dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
               << " in function " << Function << "\n";
        Function.dump();
      }
    }
  });
  // A private code emitter is needed to measure erased code size, since this
  // function runs on multiple threads (see the pass's runOnFunctions).
  BinaryContext::IndependentCodeEmitter Emitter =
      BC.createIndependentMCCodeEmitter();
  std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
  DeletedBlocks += Count;
  DeletedBytes += Bytes;
  if (Count) {
    auto L = BC.scopeLock();
    Modified.insert(&Function);
    if (opts::Verbosity > 0)
      BC.outs() << "BOLT-INFO: removed " << Count
                << " dead basic block(s) accounting for " << Bytes
                << " bytes in function " << Function << '\n';
  }
}
354 Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
355 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
356 runOnFunction(BF);
359 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
360 return !shouldOptimize(BF) || BF.getLayout().block_empty();
363 ParallelUtilities::runOnEachFunction(
364 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
365 SkipPredicate, "elimininate-unreachable");
367 if (DeletedBlocks)
368 BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
369 << DeletedBytes << " bytes of code\n";
370 return Error::success();
373 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
374 return (BinaryFunctionPass::shouldPrint(BF) &&
375 opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
378 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
379 // Apply execution count threshold
380 if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
381 return false;
383 return BinaryFunctionPass::shouldOptimize(BF);
// Drive basic-block reordering over all eligible functions in parallel, then
// print summary statistics and (with -print-function-statistics=N) per-function
// details for the top-N functions by score.
Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return Error::success();

  std::atomic_uint64_t ModifiedFuncCount(0);
  std::mutex FunctionEditDistanceMutex;
  DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;

  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
    // Snapshot the pre-reorder layout only when stats were requested, to
    // compute the edit distance afterwards.
    SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
    if (opts::PrintFuncStat > 0)
      llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));

    const bool LayoutChanged =
        modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
    if (LayoutChanged) {
      ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
      if (opts::PrintFuncStat > 0) {
        const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
        // DenseMap is not thread-safe; guard concurrent insertions.
        std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
        FunctionEditDistance[&BF] = Distance;
      }
    }
  };

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "ReorderBasicBlocks");
  const size_t NumAllProfiledFunctions =
      BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;

  // NOTE(review): if no functions are profiled, NumAllProfiledFunctions is 0
  // and the first percentage below divides by zero (prints inf/nan) — confirm
  // whether callers guarantee a non-empty profile here.
  BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
            << format(
                   "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
                   ModifiedFuncCount.load(std::memory_order_relaxed),
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       NumAllProfiledFunctions,
                   100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
                       BC.getBinaryFunctions().size());

  if (opts::PrintFuncStat > 0) {
    raw_ostream &OS = BC.outs();
    // Copy all the values into vector in order to sort them
    // NOTE(review): std::map keyed by score silently drops functions whose
    // score equals an already-inserted one — verify this is acceptable for
    // the statistics report.
    std::map<uint64_t, BinaryFunction &> ScoreMap;
    auto &BFs = BC.getBinaryFunctions();
    for (auto It = BFs.begin(); It != BFs.end(); ++It)
      ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
          It->second.getFunctionScore(), It->second));

    OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
    OS << "           There are " << BFs.size() << " functions in total. \n";
    OS << "           Number of functions being modified: "
       << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
    OS << "           User asks for detailed information on top "
       << opts::PrintFuncStat << " functions. (Ranked by function score)"
       << "\n\n";
    // Walk the map in descending score order, printing the top-N entries.
    uint64_t I = 0;
    for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
             ScoreMap.rbegin();
         Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
      BinaryFunction &Function = Rit->second;

      OS << "           Information for function of top: " << (I + 1) << ": \n";
      OS << "             Function Score is: " << Function.getFunctionScore()
         << "\n";
      OS << "             There are " << Function.size()
         << " number of blocks in this function.\n";
      OS << "             There are " << Function.getInstructionCount()
         << " number of instructions in this function.\n";
      OS << "             The edit distance for this function is: "
         << FunctionEditDistance.lookup(&Function) << "\n\n";
    }
  }
  return Error::success();
}
466 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
467 LayoutType Type,
468 bool MinBranchClusters) const {
469 if (BF.size() == 0 || Type == LT_NONE)
470 return false;
472 BinaryFunction::BasicBlockOrderType NewLayout;
473 std::unique_ptr<ReorderAlgorithm> Algo;
475 // Cannot do optimal layout without profile.
476 if (Type != LT_REVERSE && !BF.hasValidProfile())
477 return false;
479 if (Type == LT_REVERSE) {
480 Algo.reset(new ReverseReorderAlgorithm());
481 } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
482 // Work on optimal solution if problem is small enough
483 LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
484 Algo.reset(new TSPReorderAlgorithm());
485 } else {
486 LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
488 std::unique_ptr<ClusterAlgorithm> CAlgo;
489 if (MinBranchClusters)
490 CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
491 else
492 CAlgo.reset(new PHGreedyClusterAlgorithm());
494 switch (Type) {
495 case LT_OPTIMIZE:
496 Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
497 break;
499 case LT_OPTIMIZE_BRANCH:
500 Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
501 break;
503 case LT_OPTIMIZE_CACHE:
504 Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
505 break;
507 case LT_OPTIMIZE_EXT_TSP:
508 Algo.reset(new ExtTSPReorderAlgorithm());
509 break;
511 case LT_OPTIMIZE_SHUFFLE:
512 Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
513 break;
515 default:
516 llvm_unreachable("unexpected layout type");
520 Algo->reorderBasicBlocks(BF, NewLayout);
522 return BF.getLayout().update(NewLayout);
525 Error FixupBranches::runOnFunctions(BinaryContext &BC) {
526 for (auto &It : BC.getBinaryFunctions()) {
527 BinaryFunction &Function = It.second;
528 if (!BC.shouldEmit(Function) || !Function.isSimple())
529 continue;
531 Function.fixBranches();
533 return Error::success();
536 Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
537 std::atomic<bool> HasFatal{false};
538 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
539 if (!BF.finalizeCFIState()) {
540 if (BC.HasRelocations) {
541 BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
542 << ". Exiting.\n";
543 HasFatal = true;
544 return;
546 BF.setSimple(false);
547 return;
550 BF.setFinalized();
552 // Update exception handling information.
553 BF.updateEHRanges();
556 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
557 return !BC.shouldEmit(BF);
560 ParallelUtilities::runOnEachFunction(
561 BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
562 SkipPredicate, "FinalizeFunctions");
563 if (HasFatal)
564 return createFatalBOLTError("finalize CFI state failure");
565 return Error::success();
568 Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
569 if (BC.HasRelocations)
570 return Error::success();
572 // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
573 // incorrect meta data.
574 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
575 uint64_t HotSize, ColdSize;
576 std::tie(HotSize, ColdSize) =
577 BC.calculateEmittedSize(BF, /*FixBranches=*/false);
578 if (HotSize > BF.getMaxSize()) {
579 if (opts::PrintLargeFunctions)
580 BC.outs() << "BOLT-INFO: " << BF << " size exceeds allocated space by "
581 << (HotSize - BF.getMaxSize()) << " bytes\n";
582 BF.setSimple(false);
586 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
587 return !shouldOptimize(BF);
590 ParallelUtilities::runOnEachFunction(
591 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
592 SkipFunc, "CheckLargeFunctions");
594 return Error::success();
597 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
598 // Unlike other passes, allow functions in non-CFG state.
599 return BF.isSimple() && !BF.isIgnored();
// Lower GnuArgsSize annotations on invoke instructions into explicit
// DW_CFA_GNU_args_size CFI instructions, emitting one only when the value
// changes within a fragment.
Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  // Convert GnuArgsSize annotations into CFIs.
  for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
    for (FunctionFragment &FF : BF->getLayout().fragments()) {
      // Reset at the start of the new fragment.
      int64_t CurrentGnuArgsSize = 0;

      for (BinaryBasicBlock *const BB : FF) {
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
            continue;

          const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
          assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
          // Skip redundant CFIs: only emit when the value changes.
          if (NewGnuArgsSize == CurrentGnuArgsSize)
            continue;

          auto InsertII = BF->addCFIInstruction(
              BB, II,
              MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
          CurrentGnuArgsSize = NewGnuArgsSize;
          // Re-seat the iterator past the freshly inserted CFI so the loop's
          // ++II resumes after it (insertion may have invalidated II).
          II = std::next(InsertII);
        }
      }
    }
  }
  return Error::success();
}
// Check for dirty state in MCSymbol objects that might be a consequence
// of running calculateEmittedSize() in parallel, during split functions
// pass. If an inconsistent state is found (symbol already registered or
// already defined), clean it.
Error CleanMCState::runOnFunctions(BinaryContext &BC) {
  MCContext &Ctx = *BC.Ctx;
  for (const auto &SymMapEntry : Ctx.getSymbols()) {
    const MCSymbol *S = SymMapEntry.getValue().Symbol;
    if (!S)
      continue;
    if (S->isDefined()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already defined\n");
      // The symbol table hands out const pointers; we deliberately mutate the
      // symbol to reset the stale "defined" state.
      const_cast<MCSymbol *>(S)->setUndefined();
    }
    if (S->isRegistered()) {
      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
                        << "\" is already registered\n");
      const_cast<MCSymbol *>(S)->setIsRegistered(false);
    }
    // Variable symbols are only reported, not cleaned.
    LLVM_DEBUG(if (S->isVariable()) {
      dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
    });
  }
  return Error::success();
}
// This peephole fixes jump instructions that jump to another basic
// block with a single jump instruction, e.g.
//
// B0: ...
//     jmp  B1   (or jcc B1)
//
// B1: jmp  B2
//
// ->
//
// B0: ...
//     jmp  B2   (or jcc B2)
//
// Returns the number of double jumps removed. When \p MarkInvalid is set,
// blocks left without predecessors are marked invalid for later removal.
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Retarget one predecessor \p Pred of BB to BB's target. \p Succ is BB's
    // successor block, or null when BB ends in a tail call, in which case
    // \p SuccSym is the call target and \p Offset the original offset to
    // preserve. Returns true on success.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym,
                             std::optional<uint32_t> Offset) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          // Pred fell through to BB; it now needs an explicit jump unless
          // Succ directly follows it in the layout.
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case. In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
          if (Offset) {
            // Carry the original offset annotation over to the new tail call.
            MCInst *TailCall = Pred->getLastNonPseudoInstr();
            assert(TailCall);
            MIB->setOffset(*TailCall, *Offset);
          }
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // Candidate middle blocks contain exactly one real instruction and are
    // not landing pads.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Iterate over a copy of the predecessor list: checkAndPatch() mutates
    // BB's predecessors while we walk them.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
            MarkInvalid)
          // BB may now be dead; keep it only if still referenced or special.
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}
777 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
778 const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
779 const BinaryBasicBlock *BB, const bool DirectionFlag) {
780 if (BeenOptimized.count(PredBB))
781 return false;
783 const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
785 if (IsForward)
786 ++NumOrigForwardBranches;
787 else
788 ++NumOrigBackwardBranches;
790 if (opts::SctcMode == opts::SctcAlways)
791 return true;
793 if (opts::SctcMode == opts::SctcPreserveDirection)
794 return IsForward == DirectionFlag;
796 const ErrorOr<std::pair<double, double>> Frequency =
797 PredBB->getBranchStats(BB);
799 // It's ok to rewrite the conditional branch if the new target will be
800 // a backward branch.
802 // If no data available for these branches, then it should be ok to
803 // do the optimization since it will reduce code size.
804 if (Frequency.getError())
805 return true;
807 // TODO: should this use misprediction frequency instead?
808 const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
809 (!IsForward && Frequency.get().first <= 0.5);
811 return Result == DirectionFlag;
814 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
815 // Need updated indices to correctly detect branch' direction.
816 BF.getLayout().updateLayoutIndices();
817 BF.markUnreachableBlocks();
819 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
820 MCContext *Ctx = BF.getBinaryContext().Ctx.get();
821 uint64_t NumLocalCTCCandidates = 0;
822 uint64_t NumLocalCTCs = 0;
823 uint64_t LocalCTCTakenCount = 0;
824 uint64_t LocalCTCExecCount = 0;
825 std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
826 NeedsUncondBranch;
828 // Will block be deleted by UCE?
829 auto isValid = [](const BinaryBasicBlock *BB) {
830 return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
833 for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
834 // Locate BB with a single direct tail-call instruction.
835 if (BB->getNumNonPseudos() != 1)
836 continue;
838 MCInst *Instr = BB->getFirstNonPseudoInstr();
839 if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
840 continue;
842 const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
843 if (!CalleeSymbol)
844 continue;
846 // Detect direction of the possible conditional tail call.
847 const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
849 // Iterate through all predecessors.
850 for (BinaryBasicBlock *PredBB : BB->predecessors()) {
851 BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
852 if (!CondSucc)
853 continue;
855 ++NumLocalCTCCandidates;
857 const MCSymbol *TBB = nullptr;
858 const MCSymbol *FBB = nullptr;
859 MCInst *CondBranch = nullptr;
860 MCInst *UncondBranch = nullptr;
861 bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
863 // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
864 if (!Result) {
865 LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
866 PredBB->dump());
867 continue;
870 assert(Result && "internal error analyzing conditional branch");
871 assert(CondBranch && "conditional branch expected");
873 // Skip dynamic branches for now.
874 if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch))
875 continue;
877 // It's possible that PredBB is also a successor to BB that may have
878 // been processed by a previous iteration of the SCTC loop, in which
879 // case it may have been marked invalid. We should skip rewriting in
880 // this case.
881 if (!PredBB->isValid()) {
882 assert(PredBB->isSuccessor(BB) &&
883 "PredBB should be valid if it is not a successor to BB");
884 continue;
887 // We don't want to reverse direction of the branch in new order
888 // without further profile analysis.
889 const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
890 if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
891 continue;
893 // Record this block so that we don't try to optimize it twice.
894 BeenOptimized.insert(PredBB);
896 uint64_t Count = 0;
897 if (CondSucc != BB) {
898 // Patch the new target address into the conditional branch.
899 MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
900 // Since we reversed the condition on the branch we need to change
901 // the target for the unconditional branch or add a unconditional
902 // branch to the old target. This has to be done manually since
903 // fixupBranches is not called after SCTC.
904 NeedsUncondBranch.emplace_back(PredBB, CondSucc);
905 Count = PredBB->getFallthroughBranchInfo().Count;
906 } else {
907 // Change destination of the conditional branch.
908 MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
909 Count = PredBB->getTakenBranchInfo().Count;
911 const uint64_t CTCTakenFreq =
912 Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
914 // Annotate it, so "isCall" returns true for this jcc
915 MIB->setConditionalTailCall(*CondBranch);
916 // Add info about the conditional tail call frequency, otherwise this
917 // info will be lost when we delete the associated BranchInfo entry
918 auto &CTCAnnotation =
919 MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
920 CTCAnnotation = CTCTakenFreq;
921 // Preserve Offset annotation, used in BAT.
922 // Instr is a direct tail call instruction that was created when CTCs are
923 // first expanded, and has the original CTC offset set.
924 if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr))
925 MIB->setOffset(*CondBranch, *Offset);
927 // Remove the unused successor which may be eliminated later
928 // if there are no other users.
929 PredBB->removeSuccessor(BB);
930 // Update BB execution count
931 if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
932 BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
933 else if (CTCTakenFreq > BB->getKnownExecutionCount())
934 BB->setExecutionCount(0);
936 ++NumLocalCTCs;
937 LocalCTCTakenCount += CTCTakenFreq;
938 LocalCTCExecCount += PredBB->getKnownExecutionCount();
941 // Remove the block from CFG if all predecessors were removed.
942 BB->markValid(isValid(BB));
945 // Add unconditional branches at the end of BBs to new successors
946 // as long as the successor is not a fallthrough.
947 for (auto &Entry : NeedsUncondBranch) {
948 BinaryBasicBlock *PredBB = Entry.first;
949 const BinaryBasicBlock *CondSucc = Entry.second;
951 const MCSymbol *TBB = nullptr;
952 const MCSymbol *FBB = nullptr;
953 MCInst *CondBranch = nullptr;
954 MCInst *UncondBranch = nullptr;
955 PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
957 // Find the next valid block. Invalid blocks will be deleted
958 // so they shouldn't be considered fallthrough targets.
959 const BinaryBasicBlock *NextBlock =
960 BF.getLayout().getBasicBlockAfter(PredBB, false);
961 while (NextBlock && !isValid(NextBlock))
962 NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);
964 // Get the unconditional successor to this block.
965 const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
966 assert(PredSucc && "The other branch should be a tail call");
968 const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
970 if (UncondBranch) {
971 if (HasFallthrough)
972 PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
973 else
974 MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
975 } else if (!HasFallthrough) {
976 MCInst Branch;
977 MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
978 PredBB->addInstruction(Branch);
982 if (NumLocalCTCs > 0) {
983 NumDoubleJumps += fixDoubleJumps(BF, true);
984 // Clean-up unreachable tail-call blocks.
985 const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
986 DeletedBlocks += Stats.first;
987 DeletedBytes += Stats.second;
989 assert(BF.validateCFG());
992 LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
993 << " conditional tail calls from a total of "
994 << NumLocalCTCCandidates << " candidates in function " << BF
995 << ". CTCs execution count for this function is "
996 << LocalCTCExecCount << " and CTC taken count is "
997 << LocalCTCTakenCount << "\n";);
999 NumTailCallsPatched += NumLocalCTCs;
1000 NumCandidateTailCalls += NumLocalCTCCandidates;
1001 CTCExecCount += LocalCTCExecCount;
1002 CTCTakenCount += LocalCTCTakenCount;
1004 return NumLocalCTCs > 0;
1007 Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
1008 if (!BC.isX86())
1009 return Error::success();
1011 for (auto &It : BC.getBinaryFunctions()) {
1012 BinaryFunction &Function = It.second;
1014 if (!shouldOptimize(Function))
1015 continue;
1017 if (fixTailCalls(Function)) {
1018 Modified.insert(&Function);
1019 Function.setHasCanonicalCFG(false);
1023 if (NumTailCallsPatched)
1024 BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1025 << " tail calls (" << NumOrigForwardBranches << " forward)"
1026 << " tail calls (" << NumOrigBackwardBranches << " backward)"
1027 << " from a total of " << NumCandidateTailCalls
1028 << " while removing " << NumDoubleJumps << " double jumps"
1029 << " and removing " << DeletedBlocks << " basic blocks"
1030 << " totalling " << DeletedBytes
1031 << " bytes of code. CTCs total execution count is "
1032 << CTCExecCount << " and the number of times CTCs are taken is "
1033 << CTCTakenCount << "\n";
1034 return Error::success();
/// Rewrite instructions of \p Function in place into shorter encodings where
/// the target supports it. Returns the number of instructions shortened.
uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
  uint64_t Count = 0;
  const BinaryContext &BC = Function.getBinaryContext();
  for (BinaryBasicBlock &BB : Function) {
    for (MCInst &Inst : BB) {
      // Skip shortening instructions with Size annotation.
      // A pinned size must be preserved, so the encoding cannot shrink.
      if (BC.MIB->getSize(Inst))
        continue;

      // Keep a copy only when we might print a before/after diff below.
      MCInst OriginalInst;
      if (opts::Verbosity > 2)
        OriginalInst = Inst;

      if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
        continue;

      if (opts::Verbosity > 2) {
        // NOTE(review): the return value of scopeLock() is discarded here.
        // If it returns an RAII guard, the lock is released immediately and
        // the prints below are not actually serialized across the parallel
        // workers — confirm against BinaryContext's API.
        BC.scopeLock();
        BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO: ";
        BC.printInstruction(BC.outs(), OriginalInst, 0, &Function);
        BC.outs() << "BOLT-INFO: to:";
        BC.printInstruction(BC.outs(), Inst, 0, &Function);
      }

      ++Count;
    }
  }

  return Count;
}
1068 Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1069 std::atomic<uint64_t> NumShortened{0};
1070 if (!BC.isX86())
1071 return Error::success();
1073 ParallelUtilities::runOnEachFunction(
1074 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1075 [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1076 nullptr, "ShortenInstructions");
1078 if (NumShortened)
1079 BC.outs() << "BOLT-INFO: " << NumShortened
1080 << " instructions were shortened\n";
1081 return Error::success();
1084 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1085 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1086 for (BinaryBasicBlock &BB : Function) {
1087 MCInst *Inst = BB.getLastNonPseudoInstr();
1088 if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1089 MCInst Trap;
1090 MIB->createTrap(Trap);
1091 BB.addInstruction(Trap);
1092 ++TailCallTraps;
1097 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1098 for (BinaryBasicBlock &BB : Function) {
1099 if (BB.succ_size() != 2)
1100 continue;
1102 BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1103 BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1104 if (CondBB != UncondBB)
1105 continue;
1107 const MCSymbol *TBB = nullptr;
1108 const MCSymbol *FBB = nullptr;
1109 MCInst *CondBranch = nullptr;
1110 MCInst *UncondBranch = nullptr;
1111 bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1113 // analyzeBranch() can fail due to unusual branch instructions,
1114 // e.g. jrcxz, or jump tables (indirect jump).
1115 if (!Result || !CondBranch)
1116 continue;
1118 BB.removeDuplicateConditionalSuccessor(CondBranch);
1119 ++NumUselessCondBranches;
1123 Error Peepholes::runOnFunctions(BinaryContext &BC) {
1124 const char Opts =
1125 std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1126 [](const char A, const PeepholeOpts B) { return A | B; });
1127 if (Opts == PEEP_NONE)
1128 return Error::success();
1130 for (auto &It : BC.getBinaryFunctions()) {
1131 BinaryFunction &Function = It.second;
1132 if (shouldOptimize(Function)) {
1133 if (Opts & PEEP_DOUBLE_JUMPS)
1134 NumDoubleJumps += fixDoubleJumps(Function, false);
1135 if (Opts & PEEP_TAILCALL_TRAPS)
1136 addTailcallTraps(Function);
1137 if (Opts & PEEP_USELESS_BRANCHES)
1138 removeUselessCondBranches(Function);
1139 assert(Function.validateCFG());
1142 BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1143 << " double jumps patched.\n"
1144 << "BOLT-INFO: Peephole: " << TailCallTraps
1145 << " tail call traps inserted.\n"
1146 << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1147 << " useless conditional branches removed.\n";
1148 return Error::success();
/// For loads whose target address can be computed statically and resolves
/// into a read-only, relocation-free section, replace the memory operand
/// with the constant found at that address. Returns true if \p BF changed.
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  // Per-function counters, folded into the pass-wide totals at the end.
  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address;
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      // A relocation at the target means the bytes we read now may be
      // rewritten at link/load time, so the value cannot be folded.
      if (BC.getRelocationAt(TargetAddress) ||
          BC.getDynamicRelocationAt(TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      // The builder may decline the rewrite; such loads are counted as
      // found but not simplified.
      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}
1233 Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1234 for (auto &It : BC.getBinaryFunctions()) {
1235 BinaryFunction &Function = It.second;
1236 if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1237 Modified.insert(&Function);
1240 BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1241 << NumLoadsFound << " loads from a statically computed address.\n"
1242 << "BOLT-INFO: dynamic loads simplified: "
1243 << NumDynamicLoadsSimplified << "\n"
1244 << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
1245 << "\n";
1246 return Error::success();
1249 Error AssignSections::runOnFunctions(BinaryContext &BC) {
1250 for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1251 Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1252 Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1255 // In non-relocation mode functions have pre-assigned section names.
1256 if (!BC.HasRelocations)
1257 return Error::success();
1259 const bool UseColdSection =
1260 BC.NumProfiledFuncs > 0 ||
1261 opts::ReorderFunctions == ReorderFunctions::RT_USER;
1262 for (auto &BFI : BC.getBinaryFunctions()) {
1263 BinaryFunction &Function = BFI.second;
1264 if (opts::isHotTextMover(Function)) {
1265 Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1266 Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1267 continue;
1270 if (!UseColdSection || Function.hasValidIndex())
1271 Function.setCodeSectionName(BC.getMainCodeSectionName());
1272 else
1273 Function.setCodeSectionName(BC.getColdCodeSectionName());
1275 if (Function.isSplit())
1276 Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1278 return Error::success();
/// Report a "profile bias" score. For each considered basic block the sum of
/// incoming edge counts is compared against the sum of outgoing edge counts;
/// in a self-consistent profile they match, so the mean relative difference
/// (and its standard deviation) measures profile inconsistency.
Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        // Unprofiled and zero-count edges contribute nothing.
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    // Track the function with the worst per-function average bias.
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      // NOTE(review): this filter omits the BB.isEntryPoint() exclusion used
      // in the mean computation above, so the two passes may consider
      // different block sets — confirm whether that is intentional.
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report to user
  BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                      (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    BC.outs() << "Worst average bias observed in "
              << WorstBiasFunc->getPrintName() << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
  return Error::success();
}
/// Print summary statistics about the binary and its profile: how many
/// functions are profiled, stale, inferred, or have unknown control flow;
/// top functions by execution count and by requested dyno stats; trapped
/// functions; and suboptimal input layout. May return a fatal error when the
/// stale-profile fraction exceeds opts::StaleThreshold.
Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
  uint64_t NumRegularFunctions = 0;
  uint64_t NumStaleProfileFunctions = 0;
  uint64_t NumAllStaleFunctions = 0;
  uint64_t NumInferredFunctions = 0;
  uint64_t NumNonSimpleProfiledFunctions = 0;
  uint64_t NumUnknownControlFlowFunctions = 0;
  uint64_t TotalSampleCount = 0;
  uint64_t StaleSampleCount = 0;
  uint64_t InferredSampleCount = 0;
  std::vector<const BinaryFunction *> ProfiledFunctions;
  // Printed once, before the first stale function; cleared afterwards.
  const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
  for (auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;

    // Ignore PLT functions for stats.
    if (Function.isPLTFunction())
      continue;

    // Adjustment for BAT mode: the profile for BOLT split fragments is combined
    // so only count the hot fragment.
    const uint64_t Address = Function.getAddress();
    bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() &&
                                          BAT && BAT->isBATFunction(Address) &&
                                          !BAT->fetchParentAddress(Address);

    ++NumRegularFunctions;

    // In BOLTed binaries split functions are non-simple (due to non-relocation
    // mode), but the original function is known to be simple and we have a
    // valid profile for it.
    if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) {
      if (Function.hasProfile())
        ++NumNonSimpleProfiledFunctions;
      continue;
    }

    if (Function.hasUnknownControlFlow()) {
      if (opts::PrintUnknownCFG)
        Function.dump();
      else if (opts::PrintUnknown)
        BC.errs() << "function with unknown control flow: " << Function << '\n';
      ++NumUnknownControlFlowFunctions;
    }

    if (!Function.hasProfile())
      continue;

    uint64_t SampleCount = Function.getRawBranchCount();
    TotalSampleCount += SampleCount;

    if (Function.hasValidProfile()) {
      ProfiledFunctions.push_back(&Function);
      // Inferred profiles are valid but still counted as stale overall.
      if (Function.hasInferredProfile()) {
        ++NumInferredFunctions;
        InferredSampleCount += SampleCount;
        ++NumAllStaleFunctions;
      }
    } else {
      if (opts::ReportStaleFuncs) {
        BC.outs() << StaleFuncsHeader;
        StaleFuncsHeader = "";
        BC.outs() << "  " << Function << '\n';
      }
      ++NumStaleProfileFunctions;
      StaleSampleCount += SampleCount;
      ++NumAllStaleFunctions;
    }
  }
  BC.NumProfiledFuncs = ProfiledFunctions.size();
  BC.NumStaleProfileFuncs = NumStaleProfileFunctions;

  const size_t NumAllProfiledFunctions =
      ProfiledFunctions.size() + NumStaleProfileFunctions;
  BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
            << NumRegularFunctions << " functions in the binary ("
            << format("%.1f", NumAllProfiledFunctions /
                                  (float)NumRegularFunctions * 100.0f)
            << "%) have non-empty execution profile\n";
  if (NumNonSimpleProfiledFunctions) {
    BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
              << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
              << " with profile could not be optimized\n";
  }
  if (NumAllStaleFunctions) {
    const float PctStale =
        NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
    const float PctStaleFuncsWithEqualBlockCount =
        (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
        NumAllStaleFunctions * 100.0f;
    const float PctStaleBlocksWithEqualIcount =
        (float)BC.Stats.NumStaleBlocksWithEqualIcount /
        BC.Stats.NumStaleBlocks * 100.0f;
    // Escalate from warning to error once the threshold is crossed; the
    // hard failure itself happens at the end of this scope.
    auto printErrorOrWarning = [&]() {
      if (PctStale > opts::StaleThreshold)
        BC.errs() << "BOLT-ERROR: ";
      else
        BC.errs() << "BOLT-WARNING: ";
    };
    printErrorOrWarning();
    BC.errs() << NumAllStaleFunctions
              << format(" (%.1f%% of all profiled)", PctStale) << " function"
              << (NumAllStaleFunctions == 1 ? "" : "s")
              << " have invalid (possibly stale) profile."
                 " Use -report-stale to see the list.\n";
    if (TotalSampleCount > 0) {
      printErrorOrWarning();
      BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of "
                << TotalSampleCount << " samples in the binary ("
                << format("%.1f",
                          ((100.0f * (StaleSampleCount + InferredSampleCount)) /
                           TotalSampleCount))
                << "%) belong to functions with invalid"
                   " (possibly stale) profile.\n";
    }
    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
              << " stale function"
              << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
              << format(" (%.1f%% of all stale)",
                        PctStaleFuncsWithEqualBlockCount)
              << " have matching block count.\n";
    BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
              << " stale block"
              << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
              << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
              << " have matching icount.\n";
    if (PctStale > opts::StaleThreshold) {
      return createFatalBOLTError(
          Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
          Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n"));
    }
  }
  if (NumInferredFunctions) {
    // NOTE(review): NumInferredFunctions is uint64_t but is formatted with
    // %d — a printf specifier/argument mismatch; %zu or PRIu64 would match.
    BC.outs() << format(
        "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
        "%.2f%% of stale) functions responsible for %.2f%% samples"
        " (%zu out of %zu)\n",
        NumInferredFunctions,
        100.0 * NumInferredFunctions / NumAllProfiledFunctions,
        100.0 * NumInferredFunctions / NumAllStaleFunctions,
        100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount,
        TotalSampleCount);
    BC.outs() << format(
        "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
        " (%zu out of %zu stale) responsible for %.2f%% samples"
        " (%zu out of %zu stale)\n",
        100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
        BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
        100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
        BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
  }

  if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
    BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects
              << " objects was ignored\n";
  }

  if (ProfiledFunctions.size() > 10) {
    if (opts::Verbosity >= 1) {
      BC.outs() << "BOLT-INFO: top called functions are:\n";
      // Sort descending by execution count.
      llvm::sort(ProfiledFunctions,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return B->getExecutionCount() < A->getExecutionCount();
                 });
      auto SFI = ProfiledFunctions.begin();
      auto SFIend = ProfiledFunctions.end();
      for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
           ++SFI, ++I)
        BC.outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount()
                  << '\n';
    }
  }

  if (!opts::PrintSortedBy.empty()) {
    std::vector<BinaryFunction *> Functions;
    std::map<const BinaryFunction *, DynoStats> Stats;

    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (shouldOptimize(BF) && BF.hasValidProfile()) {
        Functions.push_back(&BF);
        Stats.emplace(&BF, getDynoStats(BF));
      }
    }

    // LAST_DYNO_STAT in the option list means "sort by all categories".
    const bool SortAll =
        llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);

    const bool Ascending =
        opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;

    std::function<bool(const DynoStats &, const DynoStats &)>
        DynoStatsComparator =
            SortAll ? [](const DynoStats &StatsA,
                         const DynoStats &StatsB) { return StatsA < StatsB; }
                    : [](const DynoStats &StatsA, const DynoStats &StatsB) {
                        return StatsA.lessThan(StatsB, opts::PrintSortedBy);
                      };

    llvm::stable_sort(Functions,
                      [Ascending, &Stats, DynoStatsComparator](
                          const BinaryFunction *A, const BinaryFunction *B) {
                        auto StatsItr = Stats.find(A);
                        assert(StatsItr != Stats.end());
                        const DynoStats &StatsA = StatsItr->second;

                        StatsItr = Stats.find(B);
                        assert(StatsItr != Stats.end());
                        const DynoStats &StatsB = StatsItr->second;

                        return Ascending ? DynoStatsComparator(StatsA, StatsB)
                                         : DynoStatsComparator(StatsB, StatsA);
                      });

    BC.outs() << "BOLT-INFO: top functions sorted by ";
    if (SortAll) {
      BC.outs() << "dyno stats";
    } else {
      BC.outs() << "(";
      bool PrintComma = false;
      for (const DynoStats::Category Category : opts::PrintSortedBy) {
        if (PrintComma)
          BC.outs() << ", ";
        BC.outs() << DynoStats::Description(Category);
        PrintComma = true;
      }
      BC.outs() << ")";
    }

    BC.outs() << " are:\n";
    auto SFI = Functions.begin();
    for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
      const DynoStats Stats = getDynoStats(**SFI);
      BC.outs() << "  " << **SFI;
      if (!SortAll) {
        BC.outs() << " (";
        bool PrintComma = false;
        for (const DynoStats::Category Category : opts::PrintSortedBy) {
          if (PrintComma)
            BC.outs() << ", ";
          BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
          PrintComma = true;
        }
        BC.outs() << ")";
      }
      BC.outs() << "\n";
    }
  }

  if (!BC.TrappedFunctions.empty()) {
    BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
              << (BC.TrappedFunctions.size() > 1 ? "s" : "")
              << " will trap on entry. Use -trap-avx512=0 to disable"
                 " traps.";
    if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
      BC.errs() << '\n';
      for (const BinaryFunction *Function : BC.TrappedFunctions)
        BC.errs() << "  " << *Function << '\n';
    } else {
      BC.errs() << " Use -v=1 to see the list.\n";
    }
  }

  // Collect and print information about suboptimal code layout on input.
  if (opts::ReportBadLayout) {
    std::vector<BinaryFunction *> SuboptimalFuncs;
    for (auto &BFI : BC.getBinaryFunctions()) {
      BinaryFunction &BF = BFI.second;
      if (!BF.hasValidProfile())
        continue;

      const uint64_t HotThreshold =
          std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
      bool HotSeen = false;
      // Walk the layout backwards: a zero-count block after (i.e. laid out
      // before, in layout order) a hot one indicates cold code in the middle.
      for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
        if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
          HotSeen = true;
          continue;
        }
        if (HotSeen && BB->getKnownExecutionCount() == 0) {
          SuboptimalFuncs.push_back(&BF);
          break;
        }
      }
    }

    if (!SuboptimalFuncs.empty()) {
      llvm::sort(SuboptimalFuncs,
                 [](const BinaryFunction *A, const BinaryFunction *B) {
                   return A->getKnownExecutionCount() / A->getSize() >
                          B->getKnownExecutionCount() / B->getSize();
                 });

      BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
                << " functions have "
                   "cold code in the middle of hot code. Top functions are:\n";
      for (unsigned I = 0;
           I < std::min(static_cast<size_t>(opts::ReportBadLayout),
                        SuboptimalFuncs.size());
           ++I)
        SuboptimalFuncs[I]->print(BC.outs());
    }
  }

  if (NumUnknownControlFlowFunctions) {
    BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
              << " functions have instructions with unknown control flow";
    if (!opts::PrintUnknown)
      BC.outs() << ". Use -print-unknown to see the list.";
    BC.outs() << '\n';
  }

  return Error::success();
}
1690 Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
1691 for (auto &BFI : BC.getBinaryFunctions())
1692 for (BinaryBasicBlock &BB : BFI.second)
1693 for (MCInst &Instruction : BB)
1694 BC.MIB->lowerTailCall(Instruction);
1695 return Error::success();
1698 Error StripRepRet::runOnFunctions(BinaryContext &BC) {
1699 if (!BC.isX86())
1700 return Error::success();
1702 uint64_t NumPrefixesRemoved = 0;
1703 uint64_t NumBytesSaved = 0;
1704 for (auto &BFI : BC.getBinaryFunctions()) {
1705 for (BinaryBasicBlock &BB : BFI.second) {
1706 auto LastInstRIter = BB.getLastNonPseudo();
1707 if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1708 !BC.MIB->deleteREPPrefix(*LastInstRIter))
1709 continue;
1711 NumPrefixesRemoved += BB.getKnownExecutionCount();
1712 ++NumBytesSaved;
1716 if (NumBytesSaved)
1717 BC.outs() << "BOLT-INFO: removed " << NumBytesSaved
1718 << " 'repz' prefixes"
1719 " with estimated execution count of "
1720 << NumPrefixesRemoved << " times.\n";
1721 return Error::success();
/// Replace direct calls to memcpy()/memcpy@PLT/_memcpy8 with an inline
/// expansion produced by createInlineMemcpy(). x86-only.
Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  uint64_t NumInlined = 0;     // static count of rewritten call sites
  uint64_t NumInlinedDyno = 0; // profile-weighted count
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        // Only direct calls with a single symbolic target operand.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        // NOTE(review): the result of getTargetSymbol() is dereferenced
        // without a null check; presumably guaranteed non-null for an
        // expression operand — confirm against MCPlusBuilder's contract.
        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8")
          continue;

        // IsMemcpy8 selects the _memcpy8 variant of the expansion.
        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        // Splice the expansion in place of the call, then reposition the
        // iterator on the last inserted instruction so the ++II of the
        // enclosing loop continues after it.
        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(IsMemcpy8);
        II = BB.replaceInstruction(II, NewCode);
        std::advance(II, NewCode.size() - 1);
        // A tail call also returned from the caller; preserve that by
        // appending an explicit return after the expansion.
        if (IsTailCall) {
          MCInst Return;
          BC.MIB->createReturn(Return);
          II = BB.insertInstruction(std::next(II), std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
    if (NumInlinedDyno)
      BC.outs() << ". The calls were executed " << NumInlinedDyno
                << " times based on profile.";
    BC.outs() << '\n';
  }
  return Error::success();
}
1774 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1775 if (!BinaryFunctionPass::shouldOptimize(Function))
1776 return false;
1778 for (const std::string &FunctionSpec : Spec) {
1779 StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1780 if (Function.hasNameRegex(FunctionName))
1781 return true;
1784 return false;
1787 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1788 const BinaryFunction &Function) const {
1789 StringRef SitesString;
1790 for (const std::string &FunctionSpec : Spec) {
1791 StringRef FunctionName;
1792 std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1793 if (Function.hasNameRegex(FunctionName))
1794 break;
1795 SitesString = "";
1798 std::set<size_t> Sites;
1799 SmallVector<StringRef, 4> SitesVec;
1800 SitesString.split(SitesVec, ':');
1801 for (StringRef SiteString : SitesVec) {
1802 if (SiteString.empty())
1803 continue;
1804 size_t Result;
1805 if (!SiteString.getAsInteger(10, Result))
1806 Sites.emplace(Result);
1809 return Sites;
/// For selected call sites of memcpy(), insert a size==1 fast path: compare
/// the size argument against 1 and branch either to an inline one-byte copy
/// or to the original call. x86-only.
Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  if (!BC.isX86())
    return Error::success();

  uint64_t NumSpecialized = 0;     // static count of specialized sites
  uint64_t NumSpecializedDyno = 0; // profile-weighted count
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    // An empty set means "all call sites".
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    // Snapshot the block list: the loop body adds blocks to the function.
    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        // Only direct, non-tail calls with a single symbolic target.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        if (BC.MIB->isTailCall(Inst))
          continue;

        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        // Split at the call: OneByteMemcpyBB starts with the call itself,
        // which is then erased and replaced by the inline one-byte copy.
        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          // More instructions follow the call: split them into NextBB.
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          // The call ended the block: continue at its lone successor.
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        // MemcpyBB holds the original (non-specialized) call, reached when
        // the size argument (third integer arg register) is not 1.
        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
                                OneByteMemcpyBB->getLabel(), BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Resume scanning in the fallthrough block.
        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
              << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      BC.outs() << ". The calls were executed " << NumSpecializedDyno
                << " times based on profile.";
    BC.outs() << '\n';
  }
  return Error::success();
}
1909 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1910 const BinaryContext &BC = BF.getBinaryContext();
1911 for (BinaryBasicBlock &BB : BF) {
1912 for (int64_t I = BB.size() - 1; I >= 0; --I) {
1913 MCInst &Inst = BB.getInstructionAtIndex(I);
1914 if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1915 BB.eraseInstructionAtIndex(I);
1920 Error RemoveNops::runOnFunctions(BinaryContext &BC) {
1921 ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1922 runOnFunction(BF);
1925 ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1926 return BF.shouldPreserveNops();
1929 ParallelUtilities::runOnEachFunction(
1930 BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1931 SkipFunc, "RemoveNops");
1932 return Error::success();
1935 } // namespace bolt
1936 } // namespace llvm