Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / bolt / lib / Passes / IndirectCallPromotion.cpp
blob457997d35323d8b02fa52e7979654b11679b1f11
//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the IndirectCallPromotion class.
//
//===----------------------------------------------------------------------===//
#include "bolt/Passes/IndirectCallPromotion.h"
#include "bolt/Passes/BinaryFunctionCallGraph.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Passes/Inliner.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include <iterator>
#include <map>
#include <memory>
#include <utility>
#include <vector>
#define DEBUG_TYPE "ICP"
// Emit debug output X only when the user-selected verbosity is at least
// Level. Used throughout this pass to gate noisy diagnostics.
#define DEBUG_VERBOSE(Level, X)                                                \
  if (opts::Verbosity >= (Level)) {                                            \
    X;                                                                         \
  }
27 using namespace llvm;
28 using namespace bolt;
30 namespace opts {
32 extern cl::OptionCategory BoltOptCategory;
34 extern cl::opt<IndirectCallPromotionType> ICP;
35 extern cl::opt<unsigned> Verbosity;
36 extern cl::opt<unsigned> ExecutionCountThreshold;
38 static cl::opt<unsigned> ICPJTRemainingPercentThreshold(
39 "icp-jt-remaining-percent-threshold",
40 cl::desc("The percentage threshold against remaining unpromoted indirect "
41 "call count for the promotion for jump tables"),
42 cl::init(30), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
44 static cl::opt<unsigned> ICPJTTotalPercentThreshold(
45 "icp-jt-total-percent-threshold",
46 cl::desc(
47 "The percentage threshold against total count for the promotion for "
48 "jump tables"),
49 cl::init(5), cl::Hidden, cl::cat(BoltOptCategory));
51 static cl::opt<unsigned> ICPCallsRemainingPercentThreshold(
52 "icp-calls-remaining-percent-threshold",
53 cl::desc("The percentage threshold against remaining unpromoted indirect "
54 "call count for the promotion for calls"),
55 cl::init(50), cl::Hidden, cl::cat(BoltOptCategory));
57 static cl::opt<unsigned> ICPCallsTotalPercentThreshold(
58 "icp-calls-total-percent-threshold",
59 cl::desc(
60 "The percentage threshold against total count for the promotion for "
61 "calls"),
62 cl::init(30), cl::Hidden, cl::cat(BoltOptCategory));
64 static cl::opt<unsigned> ICPMispredictThreshold(
65 "indirect-call-promotion-mispredict-threshold",
66 cl::desc("misprediction threshold for skipping ICP on an "
67 "indirect call"),
68 cl::init(0), cl::cat(BoltOptCategory));
70 static cl::alias ICPMispredictThresholdAlias(
71 "icp-mp-threshold",
72 cl::desc("alias for --indirect-call-promotion-mispredict-threshold"),
73 cl::aliasopt(ICPMispredictThreshold));
75 static cl::opt<bool> ICPUseMispredicts(
76 "indirect-call-promotion-use-mispredicts",
77 cl::desc("use misprediction frequency for determining whether or not ICP "
78 "should be applied at a callsite. The "
79 "-indirect-call-promotion-mispredict-threshold value will be used "
80 "by this heuristic"),
81 cl::cat(BoltOptCategory));
83 static cl::alias ICPUseMispredictsAlias(
84 "icp-use-mp",
85 cl::desc("alias for --indirect-call-promotion-use-mispredicts"),
86 cl::aliasopt(ICPUseMispredicts));
88 static cl::opt<unsigned>
89 ICPTopN("indirect-call-promotion-topn",
90 cl::desc("limit number of targets to consider when doing indirect "
91 "call promotion. 0 = no limit"),
92 cl::init(3), cl::cat(BoltOptCategory));
94 static cl::alias
95 ICPTopNAlias("icp-topn",
96 cl::desc("alias for --indirect-call-promotion-topn"),
97 cl::aliasopt(ICPTopN));
99 static cl::opt<unsigned> ICPCallsTopN(
100 "indirect-call-promotion-calls-topn",
101 cl::desc("limit number of targets to consider when doing indirect "
102 "call promotion on calls. 0 = no limit"),
103 cl::init(0), cl::cat(BoltOptCategory));
105 static cl::alias ICPCallsTopNAlias(
106 "icp-calls-topn",
107 cl::desc("alias for --indirect-call-promotion-calls-topn"),
108 cl::aliasopt(ICPCallsTopN));
110 static cl::opt<unsigned> ICPJumpTablesTopN(
111 "indirect-call-promotion-jump-tables-topn",
112 cl::desc("limit number of targets to consider when doing indirect "
113 "call promotion on jump tables. 0 = no limit"),
114 cl::init(0), cl::cat(BoltOptCategory));
116 static cl::alias ICPJumpTablesTopNAlias(
117 "icp-jt-topn",
118 cl::desc("alias for --indirect-call-promotion-jump-tables-topn"),
119 cl::aliasopt(ICPJumpTablesTopN));
121 static cl::opt<bool> EliminateLoads(
122 "icp-eliminate-loads",
123 cl::desc("enable load elimination using memory profiling data when "
124 "performing ICP"),
125 cl::init(true), cl::cat(BoltOptCategory));
127 static cl::opt<unsigned> ICPTopCallsites(
128 "icp-top-callsites",
129 cl::desc("optimize hottest calls until at least this percentage of all "
130 "indirect calls frequency is covered. 0 = all callsites"),
131 cl::init(99), cl::Hidden, cl::cat(BoltOptCategory));
133 static cl::list<std::string>
134 ICPFuncsList("icp-funcs", cl::CommaSeparated,
135 cl::desc("list of functions to enable ICP for"),
136 cl::value_desc("func1,func2,func3,..."), cl::Hidden,
137 cl::cat(BoltOptCategory));
139 static cl::opt<bool>
140 ICPOldCodeSequence("icp-old-code-sequence",
141 cl::desc("use old code sequence for promoted calls"),
142 cl::Hidden, cl::cat(BoltOptCategory));
144 static cl::opt<bool> ICPJumpTablesByTarget(
145 "icp-jump-tables-targets",
146 cl::desc(
147 "for jump tables, optimize indirect jmp targets instead of indices"),
148 cl::Hidden, cl::cat(BoltOptCategory));
150 static cl::alias
151 ICPJumpTablesByTargetAlias("icp-jt-targets",
152 cl::desc("alias for --icp-jump-tables-targets"),
153 cl::aliasopt(ICPJumpTablesByTarget));
155 static cl::opt<bool> ICPPeelForInline(
156 "icp-inline", cl::desc("only promote call targets eligible for inlining"),
157 cl::Hidden, cl::cat(BoltOptCategory));
159 } // namespace opts
161 static bool verifyProfile(std::map<uint64_t, BinaryFunction> &BFs) {
162 bool IsValid = true;
163 for (auto &BFI : BFs) {
164 BinaryFunction &BF = BFI.second;
165 if (!BF.isSimple())
166 continue;
167 for (const BinaryBasicBlock &BB : BF) {
168 auto BI = BB.branch_info_begin();
169 for (BinaryBasicBlock *SuccBB : BB.successors()) {
170 if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && BI->Count > 0) {
171 if (BB.getKnownExecutionCount() == 0 ||
172 SuccBB->getKnownExecutionCount() == 0) {
173 errs() << "BOLT-WARNING: profile verification failed after ICP for "
174 "function "
175 << BF << '\n';
176 IsValid = false;
179 ++BI;
183 return IsValid;
186 namespace llvm {
187 namespace bolt {
189 IndirectCallPromotion::Callsite::Callsite(BinaryFunction &BF,
190 const IndirectCallProfile &ICP)
191 : From(BF.getSymbol()), To(ICP.Offset), Mispreds(ICP.Mispreds),
192 Branches(ICP.Count) {
193 if (ICP.Symbol) {
194 To.Sym = ICP.Symbol;
195 To.Addr = 0;
199 void IndirectCallPromotion::printDecision(
200 llvm::raw_ostream &OS,
201 std::vector<IndirectCallPromotion::Callsite> &Targets, unsigned N) const {
202 uint64_t TotalCount = 0;
203 uint64_t TotalMispreds = 0;
204 for (const Callsite &S : Targets) {
205 TotalCount += S.Branches;
206 TotalMispreds += S.Mispreds;
208 if (!TotalCount)
209 TotalCount = 1;
210 if (!TotalMispreds)
211 TotalMispreds = 1;
213 OS << "BOLT-INFO: ICP decision for call site with " << Targets.size()
214 << " targets, Count = " << TotalCount << ", Mispreds = " << TotalMispreds
215 << "\n";
217 size_t I = 0;
218 for (const Callsite &S : Targets) {
219 OS << "Count = " << S.Branches << ", "
220 << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
221 << "Mispreds = " << S.Mispreds << ", "
222 << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds);
223 if (I < N)
224 OS << " * to be optimized *";
225 if (!S.JTIndices.empty()) {
226 OS << " Indices:";
227 for (const uint64_t Idx : S.JTIndices)
228 OS << " " << Idx;
230 OS << "\n";
231 I += S.JTIndices.empty() ? 1 : S.JTIndices.size();
235 // Get list of targets for a given call sorted by most frequently
236 // called first.
237 std::vector<IndirectCallPromotion::Callsite>
238 IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB,
239 const MCInst &Inst) const {
240 BinaryFunction &BF = *BB.getFunction();
241 const BinaryContext &BC = BF.getBinaryContext();
242 std::vector<Callsite> Targets;
244 if (const JumpTable *JT = BF.getJumpTable(Inst)) {
245 // Don't support PIC jump tables for now
246 if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
247 return Targets;
248 const Location From(BF.getSymbol());
249 const std::pair<size_t, size_t> Range =
250 JT->getEntriesForAddress(BC.MIB->getJumpTable(Inst));
251 assert(JT->Counts.empty() || JT->Counts.size() >= Range.second);
252 JumpTable::JumpInfo DefaultJI;
253 const JumpTable::JumpInfo *JI =
254 JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
255 const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
256 assert(JT->Type == JumpTable::JTT_PIC ||
257 JT->EntrySize == BC.AsmInfo->getCodePointerSize());
258 for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
259 MCSymbol *Entry = JT->Entries[I];
260 const BinaryBasicBlock *ToBB = BF.getBasicBlockForLabel(Entry);
261 assert(ToBB || Entry == BF.getFunctionEndLabel() ||
262 Entry == BF.getFunctionEndLabel(FragmentNum::cold()));
263 if (Entry == BF.getFunctionEndLabel() ||
264 Entry == BF.getFunctionEndLabel(FragmentNum::cold()))
265 continue;
266 const Location To(Entry);
267 const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(*ToBB);
268 Targets.emplace_back(From, To, BI.MispredictedCount, BI.Count,
269 I - Range.first);
272 // Sort by symbol then addr.
273 llvm::sort(Targets, [](const Callsite &A, const Callsite &B) {
274 if (A.To.Sym && B.To.Sym)
275 return A.To.Sym < B.To.Sym;
276 else if (A.To.Sym && !B.To.Sym)
277 return true;
278 else if (!A.To.Sym && B.To.Sym)
279 return false;
280 else
281 return A.To.Addr < B.To.Addr;
284 // Targets may contain multiple entries to the same target, but using
285 // different indices. Their profile will report the same number of branches
286 // for different indices if the target is the same. That's because we don't
287 // profile the index value, but only the target via LBR.
288 auto First = Targets.begin();
289 auto Last = Targets.end();
290 auto Result = First;
291 while (++First != Last) {
292 Callsite &A = *Result;
293 const Callsite &B = *First;
294 if (A.To.Sym && B.To.Sym && A.To.Sym == B.To.Sym)
295 A.JTIndices.insert(A.JTIndices.end(), B.JTIndices.begin(),
296 B.JTIndices.end());
297 else
298 *(++Result) = *First;
300 ++Result;
302 LLVM_DEBUG(if (Targets.end() - Result > 0) {
303 dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
304 << " duplicate targets removed\n";
307 Targets.erase(Result, Targets.end());
308 } else {
309 // Don't try to optimize PC relative indirect calls.
310 if (Inst.getOperand(0).isReg() &&
311 Inst.getOperand(0).getReg() == BC.MRI->getProgramCounter())
312 return Targets;
314 const auto ICSP = BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(
315 Inst, "CallProfile");
316 if (ICSP) {
317 for (const IndirectCallProfile &CSP : ICSP.get()) {
318 Callsite Site(BF, CSP);
319 if (Site.isValid())
320 Targets.emplace_back(std::move(Site));
325 // Sort by target count, number of indices in case of jump table, and
326 // mispredicts. We prioritize targets with high count, small number of indices
327 // and high mispredicts. Break ties by selecting targets with lower addresses.
328 llvm::stable_sort(Targets, [](const Callsite &A, const Callsite &B) {
329 if (A.Branches != B.Branches)
330 return A.Branches > B.Branches;
331 if (A.JTIndices.size() != B.JTIndices.size())
332 return A.JTIndices.size() < B.JTIndices.size();
333 if (A.Mispreds != B.Mispreds)
334 return A.Mispreds > B.Mispreds;
335 return A.To.Addr < B.To.Addr;
338 // Remove non-symbol targets
339 llvm::erase_if(Targets, [](const Callsite &CS) { return !CS.To.Sym; });
341 LLVM_DEBUG(if (BF.getJumpTable(Inst)) {
342 uint64_t TotalCount = 0;
343 uint64_t TotalMispreds = 0;
344 for (const Callsite &S : Targets) {
345 TotalCount += S.Branches;
346 TotalMispreds += S.Mispreds;
348 if (!TotalCount)
349 TotalCount = 1;
350 if (!TotalMispreds)
351 TotalMispreds = 1;
353 dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets.size()
354 << ", Count = " << TotalCount << ", Mispreds = " << TotalMispreds
355 << "\n";
357 size_t I = 0;
358 for (const Callsite &S : Targets) {
359 dbgs() << "Count[" << I << "] = " << S.Branches << ", "
360 << format("%.1f", (100.0 * S.Branches) / TotalCount) << ", "
361 << "Mispreds[" << I << "] = " << S.Mispreds << ", "
362 << format("%.1f", (100.0 * S.Mispreds) / TotalMispreds) << "\n";
363 ++I;
367 return Targets;
370 IndirectCallPromotion::JumpTableInfoType
371 IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock &BB,
372 MCInst &CallInst,
373 MCInst *&TargetFetchInst,
374 const JumpTable *JT) const {
375 assert(JT && "Can't get jump table addrs for non-jump tables.");
377 BinaryFunction &Function = *BB.getFunction();
378 BinaryContext &BC = Function.getBinaryContext();
380 if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
381 return JumpTableInfoType();
383 JumpTableInfoType HotTargets;
384 MCInst *MemLocInstr;
385 MCInst *PCRelBaseOut;
386 unsigned BaseReg, IndexReg;
387 int64_t DispValue;
388 const MCExpr *DispExpr;
389 MutableArrayRef<MCInst> Insts(&BB.front(), &CallInst);
390 const IndirectBranchType Type = BC.MIB->analyzeIndirectBranch(
391 CallInst, Insts.begin(), Insts.end(), BC.AsmInfo->getCodePointerSize(),
392 MemLocInstr, BaseReg, IndexReg, DispValue, DispExpr, PCRelBaseOut);
394 assert(MemLocInstr && "There should always be a load for jump tables");
395 if (!MemLocInstr)
396 return JumpTableInfoType();
398 LLVM_DEBUG({
399 dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
400 << "jump table in " << Function << " at @ "
401 << (&CallInst - &BB.front()) << "\n"
402 << "BOLT-INFO: ICP target fetch instructions:\n";
403 BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
404 if (MemLocInstr != &CallInst)
405 BC.printInstruction(dbgs(), CallInst, 0, &Function);
408 DEBUG_VERBOSE(1, {
409 dbgs() << "Jmp info: Type = " << (unsigned)Type << ", "
410 << "BaseReg = " << BC.MRI->getName(BaseReg) << ", "
411 << "IndexReg = " << BC.MRI->getName(IndexReg) << ", "
412 << "DispValue = " << Twine::utohexstr(DispValue) << ", "
413 << "DispExpr = " << DispExpr << ", "
414 << "MemLocInstr = ";
415 BC.printInstruction(dbgs(), *MemLocInstr, 0, &Function);
416 dbgs() << "\n";
419 ++TotalIndexBasedCandidates;
421 auto ErrorOrMemAccessProfile =
422 BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MemLocInstr,
423 "MemoryAccessProfile");
424 if (!ErrorOrMemAccessProfile) {
425 DEBUG_VERBOSE(1, dbgs()
426 << "BOLT-INFO: ICP no memory profiling data found\n");
427 return JumpTableInfoType();
429 MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();
431 uint64_t ArrayStart;
432 if (DispExpr) {
433 ErrorOr<uint64_t> DispValueOrError =
434 BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
435 assert(DispValueOrError && "global symbol needs a value");
436 ArrayStart = *DispValueOrError;
437 } else {
438 ArrayStart = static_cast<uint64_t>(DispValue);
441 if (BaseReg == BC.MRI->getProgramCounter())
442 ArrayStart += Function.getAddress() + MemAccessProfile.NextInstrOffset;
444 // This is a map of [symbol] -> [count, index] and is used to combine indices
445 // into the jump table since there may be multiple addresses that all have the
446 // same entry.
447 std::map<MCSymbol *, std::pair<uint64_t, uint64_t>> HotTargetMap;
448 const std::pair<size_t, size_t> Range = JT->getEntriesForAddress(ArrayStart);
450 for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
451 size_t Index;
452 // Mem data occasionally includes nullprs, ignore them.
453 if (!AccessInfo.MemoryObject && !AccessInfo.Offset)
454 continue;
456 if (AccessInfo.Offset % JT->EntrySize != 0) // ignore bogus data
457 return JumpTableInfoType();
459 if (AccessInfo.MemoryObject) {
460 // Deal with bad/stale data
461 if (!AccessInfo.MemoryObject->getName().startswith(
462 "JUMP_TABLE/" + Function.getOneName().str()))
463 return JumpTableInfoType();
464 Index =
465 (AccessInfo.Offset - (ArrayStart - JT->getAddress())) / JT->EntrySize;
466 } else {
467 Index = (AccessInfo.Offset - ArrayStart) / JT->EntrySize;
470 // If Index is out of range it probably means the memory profiling data is
471 // wrong for this instruction, bail out.
472 if (Index >= Range.second) {
473 LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
474 << ", " << Range.second << "\n");
475 return JumpTableInfoType();
478 // Make sure the hot index points at a legal label corresponding to a BB,
479 // e.g. not the end of function (unreachable) label.
480 if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
481 LLVM_DEBUG({
482 dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
483 << "label " << JT->Entries[Index + Range.first]->getName()
484 << " in jump table:\n";
485 JT->print(dbgs());
486 dbgs() << "HotTargetMap:\n";
487 for (std::pair<MCSymbol *const, std::pair<uint64_t, uint64_t>> &HT :
488 HotTargetMap)
489 dbgs() << "BOLT-INFO: " << HT.first->getName()
490 << " = (count=" << HT.second.first
491 << ", index=" << HT.second.second << ")\n";
493 return JumpTableInfoType();
496 std::pair<uint64_t, uint64_t> &HotTarget =
497 HotTargetMap[JT->Entries[Index + Range.first]];
498 HotTarget.first += AccessInfo.Count;
499 HotTarget.second = Index;
502 llvm::copy(llvm::make_second_range(HotTargetMap),
503 std::back_inserter(HotTargets));
505 // Sort with highest counts first.
506 llvm::sort(reverse(HotTargets));
508 LLVM_DEBUG({
509 dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
510 for (const std::pair<uint64_t, uint64_t> &Target : HotTargets)
511 dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
512 << "Count = " << Target.first << "\n";
515 BC.MIB->getOrCreateAnnotationAs<uint16_t>(CallInst, "JTIndexReg") = IndexReg;
517 TargetFetchInst = MemLocInstr;
519 return HotTargets;
522 IndirectCallPromotion::SymTargetsType
523 IndirectCallPromotion::findCallTargetSymbols(std::vector<Callsite> &Targets,
524 size_t &N, BinaryBasicBlock &BB,
525 MCInst &CallInst,
526 MCInst *&TargetFetchInst) const {
527 const JumpTable *JT = BB.getFunction()->getJumpTable(CallInst);
528 SymTargetsType SymTargets;
530 if (!JT) {
531 for (size_t I = 0; I < N; ++I) {
532 assert(Targets[I].To.Sym && "All ICP targets must be to known symbols");
533 assert(Targets[I].JTIndices.empty() &&
534 "Can't have jump table indices for non-jump tables");
535 SymTargets.emplace_back(Targets[I].To.Sym, 0);
537 return SymTargets;
540 // Use memory profile to select hot targets.
541 JumpTableInfoType HotTargets =
542 maybeGetHotJumpTableTargets(BB, CallInst, TargetFetchInst, JT);
544 auto findTargetsIndex = [&](uint64_t JTIndex) {
545 for (size_t I = 0; I < Targets.size(); ++I)
546 if (llvm::is_contained(Targets[I].JTIndices, JTIndex))
547 return I;
548 LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
549 << " table entry in " << *BB.getFunction() << "\n");
550 llvm_unreachable("Hot indices must be referred to by at least one "
551 "callsite");
554 if (!HotTargets.empty()) {
555 if (opts::Verbosity >= 1)
556 for (size_t I = 0; I < HotTargets.size(); ++I)
557 outs() << "BOLT-INFO: HotTarget[" << I << "] = (" << HotTargets[I].first
558 << ", " << HotTargets[I].second << ")\n";
560 // Recompute hottest targets, now discriminating which index is hot
561 // NOTE: This is a tradeoff. On one hand, we get index information. On the
562 // other hand, info coming from the memory profile is much less accurate
563 // than LBRs. So we may actually end up working with more coarse
564 // profile granularity in exchange for information about indices.
565 std::vector<Callsite> NewTargets;
566 std::map<const MCSymbol *, uint32_t> IndicesPerTarget;
567 uint64_t TotalMemAccesses = 0;
568 for (size_t I = 0; I < HotTargets.size(); ++I) {
569 const uint64_t TargetIndex = findTargetsIndex(HotTargets[I].second);
570 ++IndicesPerTarget[Targets[TargetIndex].To.Sym];
571 TotalMemAccesses += HotTargets[I].first;
573 uint64_t RemainingMemAccesses = TotalMemAccesses;
574 const size_t TopN =
575 opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : opts::ICPTopN;
576 size_t I = 0;
577 for (; I < HotTargets.size(); ++I) {
578 const uint64_t MemAccesses = HotTargets[I].first;
579 if (100 * MemAccesses <
580 TotalMemAccesses * opts::ICPJTTotalPercentThreshold)
581 break;
582 if (100 * MemAccesses <
583 RemainingMemAccesses * opts::ICPJTRemainingPercentThreshold)
584 break;
585 if (TopN && I >= TopN)
586 break;
587 RemainingMemAccesses -= MemAccesses;
589 const uint64_t JTIndex = HotTargets[I].second;
590 Callsite &Target = Targets[findTargetsIndex(JTIndex)];
592 NewTargets.push_back(Target);
593 std::vector<uint64_t>({JTIndex}).swap(NewTargets.back().JTIndices);
594 llvm::erase(Target.JTIndices, JTIndex);
596 // Keep fixCFG counts sane if more indices use this same target later
597 assert(IndicesPerTarget[Target.To.Sym] > 0 && "wrong map");
598 NewTargets.back().Branches =
599 Target.Branches / IndicesPerTarget[Target.To.Sym];
600 NewTargets.back().Mispreds =
601 Target.Mispreds / IndicesPerTarget[Target.To.Sym];
602 assert(Target.Branches >= NewTargets.back().Branches);
603 assert(Target.Mispreds >= NewTargets.back().Mispreds);
604 Target.Branches -= NewTargets.back().Branches;
605 Target.Mispreds -= NewTargets.back().Mispreds;
607 llvm::copy(Targets, std::back_inserter(NewTargets));
608 std::swap(NewTargets, Targets);
609 N = I;
611 if (N == 0 && opts::Verbosity >= 1) {
612 outs() << "BOLT-INFO: ICP failed in " << *BB.getFunction() << " in "
613 << BB.getName() << ": failed to meet thresholds after memory "
614 << "profile data was loaded.\n";
615 return SymTargets;
619 for (size_t I = 0, TgtIdx = 0; I < N; ++TgtIdx) {
620 Callsite &Target = Targets[TgtIdx];
621 assert(Target.To.Sym && "All ICP targets must be to known symbols");
622 assert(!Target.JTIndices.empty() && "Jump tables must have indices");
623 for (uint64_t Idx : Target.JTIndices) {
624 SymTargets.emplace_back(Target.To.Sym, Idx);
625 ++I;
629 return SymTargets;
632 IndirectCallPromotion::MethodInfoType IndirectCallPromotion::maybeGetVtableSyms(
633 BinaryBasicBlock &BB, MCInst &Inst,
634 const SymTargetsType &SymTargets) const {
635 BinaryFunction &Function = *BB.getFunction();
636 BinaryContext &BC = Function.getBinaryContext();
637 std::vector<std::pair<MCSymbol *, uint64_t>> VtableSyms;
638 std::vector<MCInst *> MethodFetchInsns;
639 unsigned VtableReg, MethodReg;
640 uint64_t MethodOffset;
642 assert(!Function.getJumpTable(Inst) &&
643 "Can't get vtable addrs for jump tables.");
645 if (!Function.hasMemoryProfile() || !opts::EliminateLoads)
646 return MethodInfoType();
648 MutableArrayRef<MCInst> Insts(&BB.front(), &Inst + 1);
649 if (!BC.MIB->analyzeVirtualMethodCall(Insts.begin(), Insts.end(),
650 MethodFetchInsns, VtableReg, MethodReg,
651 MethodOffset)) {
652 DEBUG_VERBOSE(
653 1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
654 << Function << " at @ " << (&Inst - &BB.front()) << "\n");
655 return MethodInfoType();
658 ++TotalMethodLoadEliminationCandidates;
660 DEBUG_VERBOSE(1, {
661 dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
662 << " at @ " << (&Inst - &BB.front()) << "\n";
663 dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
664 for (MCInst *Inst : MethodFetchInsns)
665 BC.printInstruction(dbgs(), *Inst, 0, &Function);
667 if (MethodFetchInsns.back() != &Inst)
668 BC.printInstruction(dbgs(), Inst, 0, &Function);
671 // Try to get value profiling data for the method load instruction.
672 auto ErrorOrMemAccessProfile =
673 BC.MIB->tryGetAnnotationAs<MemoryAccessProfile>(*MethodFetchInsns.back(),
674 "MemoryAccessProfile");
675 if (!ErrorOrMemAccessProfile) {
676 DEBUG_VERBOSE(1, dbgs()
677 << "BOLT-INFO: ICP no memory profiling data found\n");
678 return MethodInfoType();
680 MemoryAccessProfile &MemAccessProfile = ErrorOrMemAccessProfile.get();
682 // Find the vtable that each method belongs to.
683 std::map<const MCSymbol *, uint64_t> MethodToVtable;
685 for (const AddressAccess &AccessInfo : MemAccessProfile.AddressAccessInfo) {
686 uint64_t Address = AccessInfo.Offset;
687 if (AccessInfo.MemoryObject)
688 Address += AccessInfo.MemoryObject->getAddress();
690 // Ignore bogus data.
691 if (!Address)
692 continue;
694 const uint64_t VtableBase = Address - MethodOffset;
696 DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
697 << Twine::utohexstr(VtableBase) << "+"
698 << MethodOffset << "/" << AccessInfo.Count << "\n");
700 if (ErrorOr<uint64_t> MethodAddr = BC.getPointerAtAddress(Address)) {
701 BinaryData *MethodBD = BC.getBinaryDataAtAddress(MethodAddr.get());
702 if (!MethodBD) // skip unknown methods
703 continue;
704 MCSymbol *MethodSym = MethodBD->getSymbol();
705 MethodToVtable[MethodSym] = VtableBase;
706 DEBUG_VERBOSE(1, {
707 const BinaryFunction *Method = BC.getFunctionForSymbol(MethodSym);
708 dbgs() << "BOLT-INFO: ICP found method = "
709 << Twine::utohexstr(MethodAddr.get()) << "/"
710 << (Method ? Method->getPrintName() : "") << "\n";
715 // Find the vtable for each target symbol.
716 for (size_t I = 0; I < SymTargets.size(); ++I) {
717 auto Itr = MethodToVtable.find(SymTargets[I].first);
718 if (Itr != MethodToVtable.end()) {
719 if (BinaryData *BD = BC.getBinaryDataContainingAddress(Itr->second)) {
720 const uint64_t Addend = Itr->second - BD->getAddress();
721 VtableSyms.emplace_back(BD->getSymbol(), Addend);
722 continue;
725 // Give up if we can't find the vtable for a method.
726 DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
727 << SymTargets[I].first->getName() << "\n");
728 return MethodInfoType();
731 // Make sure the vtable reg is not clobbered by the argument passing code
732 if (VtableReg != MethodReg) {
733 for (MCInst *CurInst = MethodFetchInsns.front(); CurInst < &Inst;
734 ++CurInst) {
735 const MCInstrDesc &InstrInfo = BC.MII->get(CurInst->getOpcode());
736 if (InstrInfo.hasDefOfPhysReg(*CurInst, VtableReg, *BC.MRI))
737 return MethodInfoType();
741 return MethodInfoType(VtableSyms, MethodFetchInsns);
744 std::vector<std::unique_ptr<BinaryBasicBlock>>
745 IndirectCallPromotion::rewriteCall(
746 BinaryBasicBlock &IndCallBlock, const MCInst &CallInst,
747 MCPlusBuilder::BlocksVectorTy &&ICPcode,
748 const std::vector<MCInst *> &MethodFetchInsns) const {
749 BinaryFunction &Function = *IndCallBlock.getFunction();
750 MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
752 // Create new basic blocks with correct code in each one first.
753 std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
754 const bool IsTailCallOrJT =
755 (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst));
757 // Move instructions from the tail of the original call block
758 // to the merge block.
760 // Remember any pseudo instructions following a tail call. These
761 // must be preserved and moved to the original block.
762 InstructionListType TailInsts;
763 const MCInst *TailInst = &CallInst;
764 if (IsTailCallOrJT)
765 while (TailInst + 1 < &(*IndCallBlock.end()) &&
766 MIB->isPseudo(*(TailInst + 1)))
767 TailInsts.push_back(*++TailInst);
769 InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst);
770 // Link new BBs to the original input offset of the BB where the indirect
771 // call site is, so we can map samples recorded in new BBs back to the
772 // original BB seen in the input binary (if using BAT)
773 const uint32_t OrigOffset = IndCallBlock.getInputOffset();
775 IndCallBlock.eraseInstructions(MethodFetchInsns.begin(),
776 MethodFetchInsns.end());
777 if (IndCallBlock.empty() ||
778 (!MethodFetchInsns.empty() && MethodFetchInsns.back() == &CallInst))
779 IndCallBlock.addInstructions(ICPcode.front().second.begin(),
780 ICPcode.front().second.end());
781 else
782 IndCallBlock.replaceInstruction(std::prev(IndCallBlock.end()),
783 ICPcode.front().second);
784 IndCallBlock.addInstructions(TailInsts.begin(), TailInsts.end());
786 for (auto Itr = ICPcode.begin() + 1; Itr != ICPcode.end(); ++Itr) {
787 MCSymbol *&Sym = Itr->first;
788 InstructionListType &Insts = Itr->second;
789 assert(Sym);
790 std::unique_ptr<BinaryBasicBlock> TBB = Function.createBasicBlock(Sym);
791 TBB->setOffset(OrigOffset);
792 for (MCInst &Inst : Insts) // sanitize new instructions.
793 if (MIB->isCall(Inst))
794 MIB->removeAnnotation(Inst, "CallProfile");
795 TBB->addInstructions(Insts.begin(), Insts.end());
796 NewBBs.emplace_back(std::move(TBB));
799 // Move tail of instructions from after the original call to
800 // the merge block.
801 if (!IsTailCallOrJT)
802 NewBBs.back()->addInstructions(MovedInst.begin(), MovedInst.end());
804 return NewBBs;
807 BinaryBasicBlock *
808 IndirectCallPromotion::fixCFG(BinaryBasicBlock &IndCallBlock,
809 const bool IsTailCall, const bool IsJumpTable,
810 IndirectCallPromotion::BasicBlocksVector &&NewBBs,
811 const std::vector<Callsite> &Targets) const {
812 BinaryFunction &Function = *IndCallBlock.getFunction();
813 using BinaryBranchInfo = BinaryBasicBlock::BinaryBranchInfo;
814 BinaryBasicBlock *MergeBlock = nullptr;
816 // Scale indirect call counts to the execution count of the original
817 // basic block containing the indirect call.
818 uint64_t TotalCount = IndCallBlock.getKnownExecutionCount();
819 uint64_t TotalIndirectBranches = 0;
820 for (const Callsite &Target : Targets)
821 TotalIndirectBranches += Target.Branches;
822 if (TotalIndirectBranches == 0)
823 TotalIndirectBranches = 1;
824 BinaryBasicBlock::BranchInfoType BBI;
825 BinaryBasicBlock::BranchInfoType ScaledBBI;
826 for (const Callsite &Target : Targets) {
827 const size_t NumEntries =
828 std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
829 for (size_t I = 0; I < NumEntries; ++I) {
830 BBI.push_back(
831 BinaryBranchInfo{(Target.Branches + NumEntries - 1) / NumEntries,
832 (Target.Mispreds + NumEntries - 1) / NumEntries});
833 ScaledBBI.push_back(
834 BinaryBranchInfo{uint64_t(TotalCount * Target.Branches /
835 (NumEntries * TotalIndirectBranches)),
836 uint64_t(TotalCount * Target.Mispreds /
837 (NumEntries * TotalIndirectBranches))});
841 if (IsJumpTable) {
842 BinaryBasicBlock *NewIndCallBlock = NewBBs.back().get();
843 IndCallBlock.moveAllSuccessorsTo(NewIndCallBlock);
845 std::vector<MCSymbol *> SymTargets;
846 for (const Callsite &Target : Targets) {
847 const size_t NumEntries =
848 std::max(static_cast<std::size_t>(1UL), Target.JTIndices.size());
849 for (size_t I = 0; I < NumEntries; ++I)
850 SymTargets.push_back(Target.To.Sym);
852 assert(SymTargets.size() > NewBBs.size() - 1 &&
853 "There must be a target symbol associated with each new BB.");
855 for (uint64_t I = 0; I < NewBBs.size(); ++I) {
856 BinaryBasicBlock *SourceBB = I ? NewBBs[I - 1].get() : &IndCallBlock;
857 SourceBB->setExecutionCount(TotalCount);
859 BinaryBasicBlock *TargetBB =
860 Function.getBasicBlockForLabel(SymTargets[I]);
861 SourceBB->addSuccessor(TargetBB, ScaledBBI[I]); // taken
863 TotalCount -= ScaledBBI[I].Count;
864 SourceBB->addSuccessor(NewBBs[I].get(), TotalCount); // fall-through
866 // Update branch info for the indirect jump.
867 BinaryBasicBlock::BinaryBranchInfo &BranchInfo =
868 NewIndCallBlock->getBranchInfo(*TargetBB);
869 if (BranchInfo.Count > BBI[I].Count)
870 BranchInfo.Count -= BBI[I].Count;
871 else
872 BranchInfo.Count = 0;
874 if (BranchInfo.MispredictedCount > BBI[I].MispredictedCount)
875 BranchInfo.MispredictedCount -= BBI[I].MispredictedCount;
876 else
877 BranchInfo.MispredictedCount = 0;
879 } else {
880 assert(NewBBs.size() >= 2);
881 assert(NewBBs.size() % 2 == 1 || IndCallBlock.succ_empty());
882 assert(NewBBs.size() % 2 == 1 || IsTailCall);
884 auto ScaledBI = ScaledBBI.begin();
885 auto updateCurrentBranchInfo = [&] {
886 assert(ScaledBI != ScaledBBI.end());
887 TotalCount -= ScaledBI->Count;
888 ++ScaledBI;
891 if (!IsTailCall) {
892 MergeBlock = NewBBs.back().get();
893 IndCallBlock.moveAllSuccessorsTo(MergeBlock);
896 // Fix up successors and execution counts.
897 updateCurrentBranchInfo();
898 IndCallBlock.addSuccessor(NewBBs[1].get(), TotalCount);
899 IndCallBlock.addSuccessor(NewBBs[0].get(), ScaledBBI[0]);
901 const size_t Adj = IsTailCall ? 1 : 2;
902 for (size_t I = 0; I < NewBBs.size() - Adj; ++I) {
903 assert(TotalCount <= IndCallBlock.getExecutionCount() ||
904 TotalCount <= uint64_t(TotalIndirectBranches));
905 uint64_t ExecCount = ScaledBBI[(I + 1) / 2].Count;
906 if (I % 2 == 0) {
907 if (MergeBlock)
908 NewBBs[I]->addSuccessor(MergeBlock, ScaledBBI[(I + 1) / 2].Count);
909 } else {
910 assert(I + 2 < NewBBs.size());
911 updateCurrentBranchInfo();
912 NewBBs[I]->addSuccessor(NewBBs[I + 2].get(), TotalCount);
913 NewBBs[I]->addSuccessor(NewBBs[I + 1].get(), ScaledBBI[(I + 1) / 2]);
914 ExecCount += TotalCount;
916 NewBBs[I]->setExecutionCount(ExecCount);
919 if (MergeBlock) {
920 // Arrange for the MergeBlock to be the fallthrough for the first
921 // promoted call block.
922 std::unique_ptr<BinaryBasicBlock> MBPtr;
923 std::swap(MBPtr, NewBBs.back());
924 NewBBs.pop_back();
925 NewBBs.emplace(NewBBs.begin() + 1, std::move(MBPtr));
926 // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
927 NewBBs.back()->addSuccessor(MergeBlock, TotalCount); // uncond branch
931 // Update the execution count.
932 NewBBs.back()->setExecutionCount(TotalCount);
934 // Update BB and BB layout.
935 Function.insertBasicBlocks(&IndCallBlock, std::move(NewBBs));
936 assert(Function.validateCFG());
938 return MergeBlock;
// Decide whether the indirect call (or jump-table jump) `Inst` in `BB` is
// worth promoting and, if so, how many of its profiled `Targets` to promote.
//
// Returns N, the number of leading entries of `Targets` to promote; 0 means
// skip this callsite.  `NumCalls` is the total dynamic count of the callsite
// across all targets.  The early-break threshold loops below assume `Targets`
// is ordered hottest-first — presumably guaranteed by getCallTargets();
// verify against the caller.
size_t IndirectCallPromotion::canPromoteCallsite(
    const BinaryBasicBlock &BB, const MCInst &Inst,
    const std::vector<Callsite> &Targets, uint64_t NumCalls) {
  BinaryFunction *BF = BB.getFunction();
  const BinaryContext &BC = BF->getBinaryContext();

  // Promotion adds compare-and-branch code; only hot blocks can pay for it.
  if (BB.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return 0;

  const bool IsJumpTable = BF->getJumpTable(Inst);

  // Fold the branch counts of the first N targets into the pass-wide
  // "frequent" statistics (jump-table jumps and calls tracked separately).
  auto computeStats = [&](size_t N) {
    for (size_t I = 0; I < N; ++I)
      if (IsJumpTable)
        TotalNumFrequentJmps += Targets[I].Branches;
      else
        TotalNumFrequentCalls += Targets[I].Branches;
  };

  // If we have no targets (or no calls), skip this callsite.
  if (Targets.empty() || !NumCalls) {
    if (opts::Verbosity >= 1) {
      const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
      outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx << " in "
             << BB.getName() << ", calls = " << NumCalls
             << ", targets empty or NumCalls == 0.\n";
    }
    return 0;
  }

  // Budget of promoted targets.  Jump tables and calls each have their own
  // overriding command-line knob; a value of 0 leaves the generic ICPTopN.
  size_t TopN = opts::ICPTopN;
  if (IsJumpTable)
    TopN = opts::ICPJumpTablesTopN ? opts::ICPJumpTablesTopN : TopN;
  else
    TopN = opts::ICPCallsTopN ? opts::ICPCallsTopN : TopN;

  // TopN == 0 means "no limit": consider every target.
  const size_t TrialN = TopN ? std::min(TopN, Targets.size()) : Targets.size();

  // In icp-top-callsites mode only callsites pre-annotated with "DoICP"
  // (see runOnFunctions) are eligible.
  if (opts::ICPTopCallsites && !BC.MIB->hasAnnotation(Inst, "DoICP"))
    return 0;

  // Pick the top N targets.
  uint64_t TotalMispredictsTopN = 0;
  size_t N = 0;

  if (opts::ICPUseMispredicts &&
      (!IsJumpTable || opts::ICPJumpTablesByTarget)) {
    // Count total number of mispredictions for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
    double Threshold = double(opts::ICPMispredictThreshold);
    for (size_t I = 0; I < TrialN && Threshold > 0; ++I, ++N) {
      Threshold -= (100.0 * Targets[I].Mispreds) / NumCalls;
      TotalMispredictsTopN += Targets[I].Mispreds;
    }
    computeStats(N);

    // Compute the misprediction frequency of the top N call targets. If this
    // frequency is greater than the threshold, we should try ICP on this
    // callsite.
    const double TopNFrequency = (100.0 * TotalMispredictsTopN) / NumCalls;
    if (TopNFrequency == 0 || TopNFrequency < opts::ICPMispredictThreshold) {
      if (opts::Verbosity >= 1) {
        const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
        outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
               << " in " << BB.getName() << ", calls = " << NumCalls
               << ", top N mis. frequency " << format("%.1f", TopNFrequency)
               << "% < " << opts::ICPMispredictThreshold << "%\n";
      }
      return 0;
    }
  } else {
    size_t MaxTargets = 0;

    // Count total number of calls for (at most) the top N targets.
    // We may choose a smaller N (TrialN vs. N) if the frequency threshold
    // is exceeded by fewer targets.
    const unsigned TotalThreshold = IsJumpTable
                                        ? opts::ICPJTTotalPercentThreshold
                                        : opts::ICPCallsTotalPercentThreshold;
    const unsigned RemainingThreshold =
        IsJumpTable ? opts::ICPJTRemainingPercentThreshold
                    : opts::ICPCallsRemainingPercentThreshold;
    uint64_t NumRemainingCalls = NumCalls;
    // Stop at the first target that falls below either percentage threshold,
    // or once promoting it would exceed the TrialN budget.  Note that a
    // jump-table target may consume several budget slots, one per JT index.
    for (size_t I = 0; I < TrialN; ++I, ++MaxTargets) {
      if (100 * Targets[I].Branches < NumCalls * TotalThreshold)
        break;
      if (100 * Targets[I].Branches < NumRemainingCalls * RemainingThreshold)
        break;
      if (N + (Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size()) >
          TrialN)
        break;
      TotalMispredictsTopN += Targets[I].Mispreds;
      NumRemainingCalls -= Targets[I].Branches;
      N += Targets[I].JTIndices.empty() ? 1 : Targets[I].JTIndices.size();
    }
    computeStats(MaxTargets);

    // Don't check misprediction frequency for jump tables -- we don't really
    // care as long as we are saving loads from the jump table.
    if (!IsJumpTable || opts::ICPJumpTablesByTarget) {
      // Compute the misprediction frequency of the top N call targets. If
      // this frequency is less than the threshold, we should skip ICP at
      // this callsite.
      const double TopNMispredictFrequency =
          (100.0 * TotalMispredictsTopN) / NumCalls;

      if (TopNMispredictFrequency < opts::ICPMispredictThreshold) {
        if (opts::Verbosity >= 1) {
          const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
          outs() << "BOLT-INFO: ICP failed in " << *BF << " @ " << InstIdx
                 << " in " << BB.getName() << ", calls = " << NumCalls
                 << ", top N mispredict frequency "
                 << format("%.1f", TopNMispredictFrequency) << "% < "
                 << opts::ICPMispredictThreshold << "%\n";
        }
        return 0;
      }
    }
  }

  // Filter by inline-ability of target functions, stop at first target that
  // can't be inlined.
  if (!IsJumpTable && opts::ICPPeelForInline) {
    for (size_t I = 0; I < N; ++I) {
      const MCSymbol *TargetSym = Targets[I].To.Sym;
      const BinaryFunction *TargetBF = BC.getFunctionForSymbol(TargetSym);
      if (!TargetBF || !BinaryFunctionPass::shouldOptimize(*TargetBF) ||
          getInliningInfo(*TargetBF).Type == InliningType::INL_NONE) {
        N = I;
        break;
      }
    }
  }

  // Filter functions that can have ICP applied (for debugging)
  if (!opts::ICPFuncsList.empty()) {
    for (std::string &Name : opts::ICPFuncsList)
      if (BF->hasName(Name))
        return N;
    return 0;
  }

  return N;
}
1087 void IndirectCallPromotion::printCallsiteInfo(
1088 const BinaryBasicBlock &BB, const MCInst &Inst,
1089 const std::vector<Callsite> &Targets, const size_t N,
1090 uint64_t NumCalls) const {
1091 BinaryContext &BC = BB.getFunction()->getBinaryContext();
1092 const bool IsTailCall = BC.MIB->isTailCall(Inst);
1093 const bool IsJumpTable = BB.getFunction()->getJumpTable(Inst);
1094 const ptrdiff_t InstIdx = &Inst - &(*BB.begin());
1096 outs() << "BOLT-INFO: ICP candidate branch info: " << *BB.getFunction()
1097 << " @ " << InstIdx << " in " << BB.getName()
1098 << " -> calls = " << NumCalls
1099 << (IsTailCall ? " (tail)" : (IsJumpTable ? " (jump table)" : ""))
1100 << "\n";
1101 for (size_t I = 0; I < N; I++) {
1102 const double Frequency = 100.0 * Targets[I].Branches / NumCalls;
1103 const double MisFrequency = 100.0 * Targets[I].Mispreds / NumCalls;
1104 outs() << "BOLT-INFO: ";
1105 if (Targets[I].To.Sym)
1106 outs() << Targets[I].To.Sym->getName();
1107 else
1108 outs() << Targets[I].To.Addr;
1109 outs() << ", calls = " << Targets[I].Branches
1110 << ", mispreds = " << Targets[I].Mispreds
1111 << ", taken freq = " << format("%.1f", Frequency) << "%"
1112 << ", mis. freq = " << format("%.1f", MisFrequency) << "%";
1113 bool First = true;
1114 for (uint64_t JTIndex : Targets[I].JTIndices) {
1115 outs() << (First ? ", indices = " : ", ") << JTIndex;
1116 First = false;
1118 outs() << "\n";
1121 LLVM_DEBUG({
1122 dbgs() << "BOLT-INFO: ICP original call instruction:";
1123 BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true);
// Pass entry point: walk every eligible function, find hot indirect calls
// and jump-table jumps, and rewrite them into guarded direct branches via
// rewriteCall()/fixCFG().  Prints a statistics summary at the end.
void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
  if (opts::ICP == ICP_NONE)
    return;

  auto &BFs = BC.getBinaryFunctions();

  const bool OptimizeCalls = (opts::ICP == ICP_CALLS || opts::ICP == ICP_ALL);
  const bool OptimizeJumpTables =
      (opts::ICP == ICP_JUMP_TABLES || opts::ICP == ICP_ALL);

  // Jump-table promotion consults liveness (see the FLAGS check below);
  // RegAnalysis in turn requires a call graph.
  std::unique_ptr<RegAnalysis> RA;
  std::unique_ptr<BinaryFunctionCallGraph> CG;
  if (OptimizeJumpTables) {
    CG.reset(new BinaryFunctionCallGraph(buildCallGraph(BC)));
    RA.reset(new RegAnalysis(BC, &BFs, &*CG));
  }

  // If icp-top-callsites is enabled, compute the total number of indirect
  // calls and then optimize the hottest callsites that contribute to that
  // total.
  SetVector<BinaryFunction *> Functions;
  if (opts::ICPTopCallsites == 0) {
    for (auto &KV : BFs)
      Functions.insert(&KV.second);
  } else {
    // (dynamic count, instruction, owning function) per candidate callsite.
    using IndirectCallsite = std::tuple<uint64_t, MCInst *, BinaryFunction *>;
    std::vector<IndirectCallsite> IndirectCalls;
    size_t TotalIndirectCalls = 0;

    // Find all the indirect callsites.
    for (auto &BFIt : BFs) {
      BinaryFunction &Function = BFIt.second;

      if (!shouldOptimize(Function))
        continue;

      const bool HasLayout = !Function.getLayout().block_empty();

      for (BinaryBasicBlock &BB : Function) {
        // Skip cold blocks of split functions.
        if (HasLayout && Function.isSplit() && BB.isCold())
          continue;

        for (MCInst &Inst : BB) {
          const bool IsJumpTable = Function.getJumpTable(Inst);
          const bool HasIndirectCallProfile =
              BC.MIB->hasAnnotation(Inst, "CallProfile");
          const bool IsDirectCall =
              (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0));

          if (!IsDirectCall &&
              ((HasIndirectCallProfile && !IsJumpTable && OptimizeCalls) ||
               (IsJumpTable && OptimizeJumpTables))) {
            // Aggregate the profile counts over all targets of this site.
            uint64_t NumCalls = 0;
            for (const Callsite &BInfo : getCallTargets(BB, Inst))
              NumCalls += BInfo.Branches;
            IndirectCalls.push_back(
                std::make_tuple(NumCalls, &Inst, &Function));
            TotalIndirectCalls += NumCalls;
          }
        }
      }
    }

    // Sort callsites by execution count.
    llvm::sort(reverse(IndirectCalls));

    // Find callsites that contribute to the top "opts::ICPTopCallsites"%
    // number of calls.
    const float TopPerc = opts::ICPTopCallsites / 100.0f;
    int64_t MaxCalls = TotalIndirectCalls * TopPerc;
    uint64_t LastFreq = std::numeric_limits<uint64_t>::max();
    size_t Num = 0;
    for (const IndirectCallsite &IC : IndirectCalls) {
      const uint64_t CurFreq = std::get<0>(IC);
      // Once we decide to stop, include at least all branches that share the
      // same frequency of the last one to avoid non-deterministic behavior
      // (e.g. turning on/off ICP depending on the order of functions)
      if (MaxCalls <= 0 && CurFreq != LastFreq)
        break;
      MaxCalls -= CurFreq;
      LastFreq = CurFreq;
      // Mark this callsite as eligible; canPromoteCallsite() checks "DoICP".
      BC.MIB->addAnnotation(*std::get<1>(IC), "DoICP", true);
      Functions.insert(std::get<2>(IC));
      ++Num;
    }
    outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
           << ", " << Num << " callsites cover " << opts::ICPTopCallsites
           << "% of all indirect calls\n";
  }

  for (BinaryFunction *FuncPtr : Functions) {
    BinaryFunction &Function = *FuncPtr;

    if (!shouldOptimize(Function))
      continue;

    const bool HasLayout = !Function.getLayout().block_empty();

    // Total number of indirect calls issued from the current Function.
    // (a fraction of TotalIndirectCalls)
    uint64_t FuncTotalIndirectCalls = 0;
    uint64_t FuncTotalIndirectJmps = 0;

    // Worklist of blocks to scan.  Promotion appends newly-created merge
    // blocks to it (see below), so this is not a simple range-for.
    std::vector<BinaryBasicBlock *> BBs;
    for (BinaryBasicBlock &BB : Function) {
      // Skip indirect calls in cold blocks.
      if (!HasLayout || !Function.isSplit() || !BB.isCold())
        BBs.push_back(&BB);
    }
    if (BBs.empty())
      continue;

    DataflowInfoManager Info(Function, RA.get(), nullptr);
    while (!BBs.empty()) {
      BinaryBasicBlock *BB = BBs.back();
      BBs.pop_back();

      // Indexed loop: the block's instruction list is mutated by promotion.
      for (unsigned Idx = 0; Idx < BB->size(); ++Idx) {
        MCInst &Inst = BB->getInstructionAtIndex(Idx);
        const ptrdiff_t InstIdx = &Inst - &(*BB->begin());
        const bool IsTailCall = BC.MIB->isTailCall(Inst);
        const bool HasIndirectCallProfile =
            BC.MIB->hasAnnotation(Inst, "CallProfile");
        const bool IsJumpTable = Function.getJumpTable(Inst);

        if (BC.MIB->isCall(Inst))
          TotalCalls += BB->getKnownExecutionCount();

        if (IsJumpTable && !OptimizeJumpTables)
          continue;

        if (!IsJumpTable && (!HasIndirectCallProfile || !OptimizeCalls))
          continue;

        // Ignore direct calls.
        if (BC.MIB->isCall(Inst) && BC.MIB->getTargetSymbol(Inst, 0))
          continue;

        assert((BC.MIB->isCall(Inst) || BC.MIB->isIndirectBranch(Inst)) &&
               "expected a call or an indirect jump instruction");

        if (IsJumpTable)
          ++TotalJumpTableCallsites;
        else
          ++TotalIndirectCallsites;

        std::vector<Callsite> Targets = getCallTargets(*BB, Inst);

        // Compute the total number of calls from this particular callsite.
        uint64_t NumCalls = 0;
        for (const Callsite &BInfo : Targets)
          NumCalls += BInfo.Branches;
        if (!IsJumpTable)
          FuncTotalIndirectCalls += NumCalls;
        else
          FuncTotalIndirectJmps += NumCalls;

        // If FLAGS regs is alive after this jmp site, do not try
        // promoting because we will clobber FLAGS.
        if (IsJumpTable) {
          ErrorOr<const BitVector &> State =
              Info.getLivenessAnalysis().getStateBefore(Inst);
          if (!State || (State && (*State)[BC.MIB->getFlagsReg()])) {
            if (opts::Verbosity >= 1)
              outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                     << InstIdx << " in " << BB->getName()
                     << ", calls = " << NumCalls
                     << (State ? ", cannot clobber flags reg.\n"
                               : ", no liveness data available.\n");
            continue;
          }
        }

        // Should this callsite be optimized? Return the number of targets
        // to use when promoting this call. A value of zero means to skip
        // this callsite.
        size_t N = canPromoteCallsite(*BB, Inst, Targets, NumCalls);

        // If it is a jump table and it failed to meet our initial threshold,
        // proceed to findCallTargetSymbols -- it may reevaluate N if
        // memory profile is present
        if (!N && !IsJumpTable)
          continue;

        if (opts::Verbosity >= 1)
          printCallsiteInfo(*BB, Inst, Targets, N, NumCalls);

        // Find MCSymbols or absolute addresses for each call target.
        MCInst *TargetFetchInst = nullptr;
        const SymTargetsType SymTargets =
            findCallTargetSymbols(Targets, N, *BB, Inst, TargetFetchInst);

        // findCallTargetSymbols may have changed N if mem profile is available
        // for jump tables
        if (!N)
          continue;

        LLVM_DEBUG(printDecision(dbgs(), Targets, N));

        // If we can't resolve any of the target symbols, punt on this callsite.
        // TODO: can this ever happen?
        if (SymTargets.size() < N) {
          const size_t LastTarget = SymTargets.size();
          if (opts::Verbosity >= 1)
            outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                   << InstIdx << " in " << BB->getName()
                   << ", calls = " << NumCalls
                   << ", ICP failed to find target symbol for "
                   << Targets[LastTarget].To.Sym->getName() << "\n";
          continue;
        }

        MethodInfoType MethodInfo;

        if (!IsJumpTable) {
          // Try to eliminate the vtable load for virtual-method calls.
          MethodInfo = maybeGetVtableSyms(*BB, Inst, SymTargets);
          TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1;
          LLVM_DEBUG(dbgs()
                     << "BOLT-INFO: ICP "
                     << (!MethodInfo.first.empty() ? "found" : "did not find")
                     << " vtables for all methods.\n");
        } else if (TargetFetchInst) {
          ++TotalIndexBasedJumps;
          MethodInfo.second.push_back(TargetFetchInst);
        }

        // Generate new promoted call code for this callsite.
        MCPlusBuilder::BlocksVectorTy ICPcode =
            (IsJumpTable && !opts::ICPJumpTablesByTarget)
                ? BC.MIB->jumpTablePromotion(Inst, SymTargets,
                                             MethodInfo.second, BC.Ctx.get())
                : BC.MIB->indirectCallPromotion(
                      Inst, SymTargets, MethodInfo.first, MethodInfo.second,
                      opts::ICPOldCodeSequence, BC.Ctx.get());

        if (ICPcode.empty()) {
          if (opts::Verbosity >= 1)
            outs() << "BOLT-INFO: ICP failed in " << Function << " @ "
                   << InstIdx << " in " << BB->getName()
                   << ", calls = " << NumCalls
                   << ", unable to generate promoted call code.\n";
          continue;
        }

        LLVM_DEBUG({
          uint64_t Offset = Targets[0].From.Addr;
          dbgs() << "BOLT-INFO: ICP indirect call code:\n";
          for (const auto &entry : ICPcode) {
            const MCSymbol *const &Sym = entry.first;
            const InstructionListType &Insts = entry.second;
            if (Sym)
              dbgs() << Sym->getName() << ":\n";
            Offset = BC.printInstructions(dbgs(), Insts.begin(), Insts.end(),
                                          Offset);
          }
          dbgs() << "---------------------------------------------------\n";
        });

        // Rewrite the CFG with the newly generated ICP code.
        std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs =
            rewriteCall(*BB, Inst, std::move(ICPcode), MethodInfo.second);

        // Fix the CFG after inserting the new basic blocks.
        BinaryBasicBlock *MergeBlock =
            fixCFG(*BB, IsTailCall, IsJumpTable, std::move(NewBBs), Targets);

        // Since the tail of the original block was split off and it may contain
        // additional indirect calls, we must add the merge block to the set of
        // blocks to process.
        if (MergeBlock)
          BBs.push_back(MergeBlock);

        if (opts::Verbosity >= 1)
          outs() << "BOLT-INFO: ICP succeeded in " << Function << " @ "
                 << InstIdx << " in " << BB->getName()
                 << " -> calls = " << NumCalls << "\n";

        if (IsJumpTable)
          ++TotalOptimizedJumpTableCallsites;
        else
          ++TotalOptimizedIndirectCallsites;

        Modified.insert(&Function);
      }
    }
    TotalIndirectCalls += FuncTotalIndirectCalls;
    TotalIndirectJmps += FuncTotalIndirectJmps;
  }

  // Final pass-wide statistics summary.
  outs() << "BOLT-INFO: ICP total indirect callsites with profile = "
         << TotalIndirectCallsites << "\n"
         << "BOLT-INFO: ICP total jump table callsites = "
         << TotalJumpTableCallsites << "\n"
         << "BOLT-INFO: ICP total number of calls = " << TotalCalls << "\n"
         << "BOLT-INFO: ICP percentage of calls that are indirect = "
         << format("%.1f", (100.0 * TotalIndirectCalls) / TotalCalls) << "%\n"
         << "BOLT-INFO: ICP percentage of indirect calls that can be "
            "optimized = "
         << format("%.1f", (100.0 * TotalNumFrequentCalls) /
                               std::max<size_t>(TotalIndirectCalls, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect callsites that are "
            "optimized = "
         << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites) /
                               std::max<uint64_t>(TotalIndirectCallsites, 1))
         << "%\n"
         << "BOLT-INFO: ICP number of method load elimination candidates = "
         << TotalMethodLoadEliminationCandidates << "\n"
         << "BOLT-INFO: ICP percentage of method calls candidates that have "
            "loads eliminated = "
         << format("%.1f", (100.0 * TotalMethodLoadsEliminated) /
                               std::max<uint64_t>(
                                   TotalMethodLoadEliminationCandidates, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of indirect branches that are "
            "optimized = "
         << format("%.1f", (100.0 * TotalNumFrequentJmps) /
                               std::max<uint64_t>(TotalIndirectJmps, 1))
         << "%\n"
         << "BOLT-INFO: ICP percentage of jump table callsites that are "
         << "optimized = "
         << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites) /
                               std::max<uint64_t>(TotalJumpTableCallsites, 1))
         << "%\n"
         << "BOLT-INFO: ICP number of jump table callsites that can use hot "
         << "indices = " << TotalIndexBasedCandidates << "\n"
         << "BOLT-INFO: ICP percentage of jump table callsites that use hot "
            "indices = "
         << format("%.1f", (100.0 * TotalIndexBasedJumps) /
                               std::max<uint64_t>(TotalIndexBasedCandidates, 1))
         << "%\n";

  // verifyProfile is only referenced in asserts builds; silence the
  // unused-variable warning in release builds.
  (void)verifyProfile;
#ifndef NDEBUG
  verifyProfile(BFs);
#endif
}
1465 } // namespace bolt
1466 } // namespace llvm