//===- bolt/Passes/IndirectCallPromotion.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the IndirectCallPromotion class.
//
//===----------------------------------------------------------------------===//
13 #include "bolt/Passes/IndirectCallPromotion.h"
14 #include "bolt/Passes/BinaryFunctionCallGraph.h"
15 #include "bolt/Passes/DataflowInfoManager.h"
16 #include "bolt/Passes/Inliner.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/Support/CommandLine.h"
// Debug category used by LLVM_DEBUG/-debug-only for this pass.
#define DEBUG_TYPE "ICP"
22 #define DEBUG_VERBOSE(Level, X) \
23 if (opts::Verbosity >= (Level)) { \
32 extern cl::OptionCategory BoltOptCategory
;
34 extern cl::opt
<IndirectCallPromotionType
> ICP
;
35 extern cl::opt
<unsigned> Verbosity
;
36 extern cl::opt
<unsigned> ExecutionCountThreshold
;
38 static cl::opt
<unsigned> ICPJTRemainingPercentThreshold(
39 "icp-jt-remaining-percent-threshold",
40 cl::desc("The percentage threshold against remaining unpromoted indirect "
41 "call count for the promotion for jump tables"),
42 cl::init(30), cl::ZeroOrMore
, cl::Hidden
, cl::cat(BoltOptCategory
));
44 static cl::opt
<unsigned> ICPJTTotalPercentThreshold(
45 "icp-jt-total-percent-threshold",
47 "The percentage threshold against total count for the promotion for "
49 cl::init(5), cl::Hidden
, cl::cat(BoltOptCategory
));
51 static cl::opt
<unsigned> ICPCallsRemainingPercentThreshold(
52 "icp-calls-remaining-percent-threshold",
53 cl::desc("The percentage threshold against remaining unpromoted indirect "
54 "call count for the promotion for calls"),
55 cl::init(50), cl::Hidden
, cl::cat(BoltOptCategory
));
57 static cl::opt
<unsigned> ICPCallsTotalPercentThreshold(
58 "icp-calls-total-percent-threshold",
60 "The percentage threshold against total count for the promotion for "
62 cl::init(30), cl::Hidden
, cl::cat(BoltOptCategory
));
64 static cl::opt
<unsigned> ICPMispredictThreshold(
65 "indirect-call-promotion-mispredict-threshold",
66 cl::desc("misprediction threshold for skipping ICP on an "
68 cl::init(0), cl::cat(BoltOptCategory
));
70 static cl::alias
ICPMispredictThresholdAlias(
72 cl::desc("alias for --indirect-call-promotion-mispredict-threshold"),
73 cl::aliasopt(ICPMispredictThreshold
));
75 static cl::opt
<bool> ICPUseMispredicts(
76 "indirect-call-promotion-use-mispredicts",
77 cl::desc("use misprediction frequency for determining whether or not ICP "
78 "should be applied at a callsite. The "
79 "-indirect-call-promotion-mispredict-threshold value will be used "
81 cl::cat(BoltOptCategory
));
83 static cl::alias
ICPUseMispredictsAlias(
85 cl::desc("alias for --indirect-call-promotion-use-mispredicts"),
86 cl::aliasopt(ICPUseMispredicts
));
88 static cl::opt
<unsigned>
89 ICPTopN("indirect-call-promotion-topn",
90 cl::desc("limit number of targets to consider when doing indirect "
91 "call promotion. 0 = no limit"),
92 cl::init(3), cl::cat(BoltOptCategory
));
95 ICPTopNAlias("icp-topn",
96 cl::desc("alias for --indirect-call-promotion-topn"),
97 cl::aliasopt(ICPTopN
));
99 static cl::opt
<unsigned> ICPCallsTopN(
100 "indirect-call-promotion-calls-topn",
101 cl::desc("limit number of targets to consider when doing indirect "
102 "call promotion on calls. 0 = no limit"),
103 cl::init(0), cl::cat(BoltOptCategory
));
105 static cl::alias
ICPCallsTopNAlias(
107 cl::desc("alias for --indirect-call-promotion-calls-topn"),
108 cl::aliasopt(ICPCallsTopN
));
110 static cl::opt
<unsigned> ICPJumpTablesTopN(
111 "indirect-call-promotion-jump-tables-topn",
112 cl::desc("limit number of targets to consider when doing indirect "
113 "call promotion on jump tables. 0 = no limit"),
114 cl::init(0), cl::cat(BoltOptCategory
));
116 static cl::alias
ICPJumpTablesTopNAlias(
118 cl::desc("alias for --indirect-call-promotion-jump-tables-topn"),
119 cl::aliasopt(ICPJumpTablesTopN
));
121 static cl::opt
<bool> EliminateLoads(
122 "icp-eliminate-loads",
123 cl::desc("enable load elimination using memory profiling data when "
125 cl::init(true), cl::cat(BoltOptCategory
));
127 static cl::opt
<unsigned> ICPTopCallsites(
129 cl::desc("optimize hottest calls until at least this percentage of all "
130 "indirect calls frequency is covered. 0 = all callsites"),
131 cl::init(99), cl::Hidden
, cl::cat(BoltOptCategory
));
133 static cl::list
<std::string
>
134 ICPFuncsList("icp-funcs", cl::CommaSeparated
,
135 cl::desc("list of functions to enable ICP for"),
136 cl::value_desc("func1,func2,func3,..."), cl::Hidden
,
137 cl::cat(BoltOptCategory
));
140 ICPOldCodeSequence("icp-old-code-sequence",
141 cl::desc("use old code sequence for promoted calls"),
142 cl::Hidden
, cl::cat(BoltOptCategory
));
144 static cl::opt
<bool> ICPJumpTablesByTarget(
145 "icp-jump-tables-targets",
147 "for jump tables, optimize indirect jmp targets instead of indices"),
148 cl::Hidden
, cl::cat(BoltOptCategory
));
151 ICPJumpTablesByTargetAlias("icp-jt-targets",
152 cl::desc("alias for --icp-jump-tables-targets"),
153 cl::aliasopt(ICPJumpTablesByTarget
));
155 static cl::opt
<bool> ICPPeelForInline(
156 "icp-inline", cl::desc("only promote call targets eligible for inlining"),
157 cl::Hidden
, cl::cat(BoltOptCategory
));
161 static bool verifyProfile(std::map
<uint64_t, BinaryFunction
> &BFs
) {
163 for (auto &BFI
: BFs
) {
164 BinaryFunction
&BF
= BFI
.second
;
167 for (const BinaryBasicBlock
&BB
: BF
) {
168 auto BI
= BB
.branch_info_begin();
169 for (BinaryBasicBlock
*SuccBB
: BB
.successors()) {
170 if (BI
->Count
!= BinaryBasicBlock::COUNT_NO_PROFILE
&& BI
->Count
> 0) {
171 if (BB
.getKnownExecutionCount() == 0 ||
172 SuccBB
->getKnownExecutionCount() == 0) {
173 errs() << "BOLT-WARNING: profile verification failed after ICP for "
189 IndirectCallPromotion::Callsite::Callsite(BinaryFunction
&BF
,
190 const IndirectCallProfile
&ICP
)
191 : From(BF
.getSymbol()), To(ICP
.Offset
), Mispreds(ICP
.Mispreds
),
192 Branches(ICP
.Count
) {
199 void IndirectCallPromotion::printDecision(
200 llvm::raw_ostream
&OS
,
201 std::vector
<IndirectCallPromotion::Callsite
> &Targets
, unsigned N
) const {
202 uint64_t TotalCount
= 0;
203 uint64_t TotalMispreds
= 0;
204 for (const Callsite
&S
: Targets
) {
205 TotalCount
+= S
.Branches
;
206 TotalMispreds
+= S
.Mispreds
;
213 OS
<< "BOLT-INFO: ICP decision for call site with " << Targets
.size()
214 << " targets, Count = " << TotalCount
<< ", Mispreds = " << TotalMispreds
218 for (const Callsite
&S
: Targets
) {
219 OS
<< "Count = " << S
.Branches
<< ", "
220 << format("%.1f", (100.0 * S
.Branches
) / TotalCount
) << ", "
221 << "Mispreds = " << S
.Mispreds
<< ", "
222 << format("%.1f", (100.0 * S
.Mispreds
) / TotalMispreds
);
224 OS
<< " * to be optimized *";
225 if (!S
.JTIndices
.empty()) {
227 for (const uint64_t Idx
: S
.JTIndices
)
231 I
+= S
.JTIndices
.empty() ? 1 : S
.JTIndices
.size();
235 // Get list of targets for a given call sorted by most frequently
237 std::vector
<IndirectCallPromotion::Callsite
>
238 IndirectCallPromotion::getCallTargets(BinaryBasicBlock
&BB
,
239 const MCInst
&Inst
) const {
240 BinaryFunction
&BF
= *BB
.getFunction();
241 const BinaryContext
&BC
= BF
.getBinaryContext();
242 std::vector
<Callsite
> Targets
;
244 if (const JumpTable
*JT
= BF
.getJumpTable(Inst
)) {
245 // Don't support PIC jump tables for now
246 if (!opts::ICPJumpTablesByTarget
&& JT
->Type
== JumpTable::JTT_PIC
)
248 const Location
From(BF
.getSymbol());
249 const std::pair
<size_t, size_t> Range
=
250 JT
->getEntriesForAddress(BC
.MIB
->getJumpTable(Inst
));
251 assert(JT
->Counts
.empty() || JT
->Counts
.size() >= Range
.second
);
252 JumpTable::JumpInfo DefaultJI
;
253 const JumpTable::JumpInfo
*JI
=
254 JT
->Counts
.empty() ? &DefaultJI
: &JT
->Counts
[Range
.first
];
255 const size_t JIAdj
= JT
->Counts
.empty() ? 0 : 1;
256 assert(JT
->Type
== JumpTable::JTT_PIC
||
257 JT
->EntrySize
== BC
.AsmInfo
->getCodePointerSize());
258 for (size_t I
= Range
.first
; I
< Range
.second
; ++I
, JI
+= JIAdj
) {
259 MCSymbol
*Entry
= JT
->Entries
[I
];
260 const BinaryBasicBlock
*ToBB
= BF
.getBasicBlockForLabel(Entry
);
261 assert(ToBB
|| Entry
== BF
.getFunctionEndLabel() ||
262 Entry
== BF
.getFunctionEndLabel(FragmentNum::cold()));
263 if (Entry
== BF
.getFunctionEndLabel() ||
264 Entry
== BF
.getFunctionEndLabel(FragmentNum::cold()))
266 const Location
To(Entry
);
267 const BinaryBasicBlock::BinaryBranchInfo
&BI
= BB
.getBranchInfo(*ToBB
);
268 Targets
.emplace_back(From
, To
, BI
.MispredictedCount
, BI
.Count
,
272 // Sort by symbol then addr.
273 llvm::sort(Targets
, [](const Callsite
&A
, const Callsite
&B
) {
274 if (A
.To
.Sym
&& B
.To
.Sym
)
275 return A
.To
.Sym
< B
.To
.Sym
;
276 else if (A
.To
.Sym
&& !B
.To
.Sym
)
278 else if (!A
.To
.Sym
&& B
.To
.Sym
)
281 return A
.To
.Addr
< B
.To
.Addr
;
284 // Targets may contain multiple entries to the same target, but using
285 // different indices. Their profile will report the same number of branches
286 // for different indices if the target is the same. That's because we don't
287 // profile the index value, but only the target via LBR.
288 auto First
= Targets
.begin();
289 auto Last
= Targets
.end();
291 while (++First
!= Last
) {
292 Callsite
&A
= *Result
;
293 const Callsite
&B
= *First
;
294 if (A
.To
.Sym
&& B
.To
.Sym
&& A
.To
.Sym
== B
.To
.Sym
)
295 A
.JTIndices
.insert(A
.JTIndices
.end(), B
.JTIndices
.begin(),
298 *(++Result
) = *First
;
302 LLVM_DEBUG(if (Targets
.end() - Result
> 0) {
303 dbgs() << "BOLT-INFO: ICP: " << (Targets
.end() - Result
)
304 << " duplicate targets removed\n";
307 Targets
.erase(Result
, Targets
.end());
309 // Don't try to optimize PC relative indirect calls.
310 if (Inst
.getOperand(0).isReg() &&
311 Inst
.getOperand(0).getReg() == BC
.MRI
->getProgramCounter())
314 const auto ICSP
= BC
.MIB
->tryGetAnnotationAs
<IndirectCallSiteProfile
>(
315 Inst
, "CallProfile");
317 for (const IndirectCallProfile
&CSP
: ICSP
.get()) {
318 Callsite
Site(BF
, CSP
);
320 Targets
.emplace_back(std::move(Site
));
325 // Sort by target count, number of indices in case of jump table, and
326 // mispredicts. We prioritize targets with high count, small number of indices
327 // and high mispredicts. Break ties by selecting targets with lower addresses.
328 llvm::stable_sort(Targets
, [](const Callsite
&A
, const Callsite
&B
) {
329 if (A
.Branches
!= B
.Branches
)
330 return A
.Branches
> B
.Branches
;
331 if (A
.JTIndices
.size() != B
.JTIndices
.size())
332 return A
.JTIndices
.size() < B
.JTIndices
.size();
333 if (A
.Mispreds
!= B
.Mispreds
)
334 return A
.Mispreds
> B
.Mispreds
;
335 return A
.To
.Addr
< B
.To
.Addr
;
338 // Remove non-symbol targets
339 llvm::erase_if(Targets
, [](const Callsite
&CS
) { return !CS
.To
.Sym
; });
341 LLVM_DEBUG(if (BF
.getJumpTable(Inst
)) {
342 uint64_t TotalCount
= 0;
343 uint64_t TotalMispreds
= 0;
344 for (const Callsite
&S
: Targets
) {
345 TotalCount
+= S
.Branches
;
346 TotalMispreds
+= S
.Mispreds
;
353 dbgs() << "BOLT-INFO: ICP: jump table size = " << Targets
.size()
354 << ", Count = " << TotalCount
<< ", Mispreds = " << TotalMispreds
358 for (const Callsite
&S
: Targets
) {
359 dbgs() << "Count[" << I
<< "] = " << S
.Branches
<< ", "
360 << format("%.1f", (100.0 * S
.Branches
) / TotalCount
) << ", "
361 << "Mispreds[" << I
<< "] = " << S
.Mispreds
<< ", "
362 << format("%.1f", (100.0 * S
.Mispreds
) / TotalMispreds
) << "\n";
370 IndirectCallPromotion::JumpTableInfoType
371 IndirectCallPromotion::maybeGetHotJumpTableTargets(BinaryBasicBlock
&BB
,
373 MCInst
*&TargetFetchInst
,
374 const JumpTable
*JT
) const {
375 assert(JT
&& "Can't get jump table addrs for non-jump tables.");
377 BinaryFunction
&Function
= *BB
.getFunction();
378 BinaryContext
&BC
= Function
.getBinaryContext();
380 if (!Function
.hasMemoryProfile() || !opts::EliminateLoads
)
381 return JumpTableInfoType();
383 JumpTableInfoType HotTargets
;
385 MCInst
*PCRelBaseOut
;
386 unsigned BaseReg
, IndexReg
;
388 const MCExpr
*DispExpr
;
389 MutableArrayRef
<MCInst
> Insts(&BB
.front(), &CallInst
);
390 const IndirectBranchType Type
= BC
.MIB
->analyzeIndirectBranch(
391 CallInst
, Insts
.begin(), Insts
.end(), BC
.AsmInfo
->getCodePointerSize(),
392 MemLocInstr
, BaseReg
, IndexReg
, DispValue
, DispExpr
, PCRelBaseOut
);
394 assert(MemLocInstr
&& "There should always be a load for jump tables");
396 return JumpTableInfoType();
399 dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
400 << "jump table in " << Function
<< " at @ "
401 << (&CallInst
- &BB
.front()) << "\n"
402 << "BOLT-INFO: ICP target fetch instructions:\n";
403 BC
.printInstruction(dbgs(), *MemLocInstr
, 0, &Function
);
404 if (MemLocInstr
!= &CallInst
)
405 BC
.printInstruction(dbgs(), CallInst
, 0, &Function
);
409 dbgs() << "Jmp info: Type = " << (unsigned)Type
<< ", "
410 << "BaseReg = " << BC
.MRI
->getName(BaseReg
) << ", "
411 << "IndexReg = " << BC
.MRI
->getName(IndexReg
) << ", "
412 << "DispValue = " << Twine::utohexstr(DispValue
) << ", "
413 << "DispExpr = " << DispExpr
<< ", "
415 BC
.printInstruction(dbgs(), *MemLocInstr
, 0, &Function
);
419 ++TotalIndexBasedCandidates
;
421 auto ErrorOrMemAccessProfile
=
422 BC
.MIB
->tryGetAnnotationAs
<MemoryAccessProfile
>(*MemLocInstr
,
423 "MemoryAccessProfile");
424 if (!ErrorOrMemAccessProfile
) {
425 DEBUG_VERBOSE(1, dbgs()
426 << "BOLT-INFO: ICP no memory profiling data found\n");
427 return JumpTableInfoType();
429 MemoryAccessProfile
&MemAccessProfile
= ErrorOrMemAccessProfile
.get();
433 ErrorOr
<uint64_t> DispValueOrError
=
434 BC
.getSymbolValue(*BC
.MIB
->getTargetSymbol(DispExpr
));
435 assert(DispValueOrError
&& "global symbol needs a value");
436 ArrayStart
= *DispValueOrError
;
438 ArrayStart
= static_cast<uint64_t>(DispValue
);
441 if (BaseReg
== BC
.MRI
->getProgramCounter())
442 ArrayStart
+= Function
.getAddress() + MemAccessProfile
.NextInstrOffset
;
444 // This is a map of [symbol] -> [count, index] and is used to combine indices
445 // into the jump table since there may be multiple addresses that all have the
447 std::map
<MCSymbol
*, std::pair
<uint64_t, uint64_t>> HotTargetMap
;
448 const std::pair
<size_t, size_t> Range
= JT
->getEntriesForAddress(ArrayStart
);
450 for (const AddressAccess
&AccessInfo
: MemAccessProfile
.AddressAccessInfo
) {
452 // Mem data occasionally includes nullprs, ignore them.
453 if (!AccessInfo
.MemoryObject
&& !AccessInfo
.Offset
)
456 if (AccessInfo
.Offset
% JT
->EntrySize
!= 0) // ignore bogus data
457 return JumpTableInfoType();
459 if (AccessInfo
.MemoryObject
) {
460 // Deal with bad/stale data
461 if (!AccessInfo
.MemoryObject
->getName().startswith(
462 "JUMP_TABLE/" + Function
.getOneName().str()))
463 return JumpTableInfoType();
465 (AccessInfo
.Offset
- (ArrayStart
- JT
->getAddress())) / JT
->EntrySize
;
467 Index
= (AccessInfo
.Offset
- ArrayStart
) / JT
->EntrySize
;
470 // If Index is out of range it probably means the memory profiling data is
471 // wrong for this instruction, bail out.
472 if (Index
>= Range
.second
) {
473 LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range
.first
474 << ", " << Range
.second
<< "\n");
475 return JumpTableInfoType();
478 // Make sure the hot index points at a legal label corresponding to a BB,
479 // e.g. not the end of function (unreachable) label.
480 if (!Function
.getBasicBlockForLabel(JT
->Entries
[Index
+ Range
.first
])) {
482 dbgs() << "BOLT-INFO: hot index " << Index
<< " pointing at bogus "
483 << "label " << JT
->Entries
[Index
+ Range
.first
]->getName()
484 << " in jump table:\n";
486 dbgs() << "HotTargetMap:\n";
487 for (std::pair
<MCSymbol
*const, std::pair
<uint64_t, uint64_t>> &HT
:
489 dbgs() << "BOLT-INFO: " << HT
.first
->getName()
490 << " = (count=" << HT
.second
.first
491 << ", index=" << HT
.second
.second
<< ")\n";
493 return JumpTableInfoType();
496 std::pair
<uint64_t, uint64_t> &HotTarget
=
497 HotTargetMap
[JT
->Entries
[Index
+ Range
.first
]];
498 HotTarget
.first
+= AccessInfo
.Count
;
499 HotTarget
.second
= Index
;
502 llvm::copy(llvm::make_second_range(HotTargetMap
),
503 std::back_inserter(HotTargets
));
505 // Sort with highest counts first.
506 llvm::sort(reverse(HotTargets
));
509 dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
510 for (const std::pair
<uint64_t, uint64_t> &Target
: HotTargets
)
511 dbgs() << "BOLT-INFO: Idx = " << Target
.second
<< ", "
512 << "Count = " << Target
.first
<< "\n";
515 BC
.MIB
->getOrCreateAnnotationAs
<uint16_t>(CallInst
, "JTIndexReg") = IndexReg
;
517 TargetFetchInst
= MemLocInstr
;
522 IndirectCallPromotion::SymTargetsType
523 IndirectCallPromotion::findCallTargetSymbols(std::vector
<Callsite
> &Targets
,
524 size_t &N
, BinaryBasicBlock
&BB
,
526 MCInst
*&TargetFetchInst
) const {
527 const JumpTable
*JT
= BB
.getFunction()->getJumpTable(CallInst
);
528 SymTargetsType SymTargets
;
531 for (size_t I
= 0; I
< N
; ++I
) {
532 assert(Targets
[I
].To
.Sym
&& "All ICP targets must be to known symbols");
533 assert(Targets
[I
].JTIndices
.empty() &&
534 "Can't have jump table indices for non-jump tables");
535 SymTargets
.emplace_back(Targets
[I
].To
.Sym
, 0);
540 // Use memory profile to select hot targets.
541 JumpTableInfoType HotTargets
=
542 maybeGetHotJumpTableTargets(BB
, CallInst
, TargetFetchInst
, JT
);
544 auto findTargetsIndex
= [&](uint64_t JTIndex
) {
545 for (size_t I
= 0; I
< Targets
.size(); ++I
)
546 if (llvm::is_contained(Targets
[I
].JTIndices
, JTIndex
))
548 LLVM_DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
549 << " table entry in " << *BB
.getFunction() << "\n");
550 llvm_unreachable("Hot indices must be referred to by at least one "
554 if (!HotTargets
.empty()) {
555 if (opts::Verbosity
>= 1)
556 for (size_t I
= 0; I
< HotTargets
.size(); ++I
)
557 outs() << "BOLT-INFO: HotTarget[" << I
<< "] = (" << HotTargets
[I
].first
558 << ", " << HotTargets
[I
].second
<< ")\n";
560 // Recompute hottest targets, now discriminating which index is hot
561 // NOTE: This is a tradeoff. On one hand, we get index information. On the
562 // other hand, info coming from the memory profile is much less accurate
563 // than LBRs. So we may actually end up working with more coarse
564 // profile granularity in exchange for information about indices.
565 std::vector
<Callsite
> NewTargets
;
566 std::map
<const MCSymbol
*, uint32_t> IndicesPerTarget
;
567 uint64_t TotalMemAccesses
= 0;
568 for (size_t I
= 0; I
< HotTargets
.size(); ++I
) {
569 const uint64_t TargetIndex
= findTargetsIndex(HotTargets
[I
].second
);
570 ++IndicesPerTarget
[Targets
[TargetIndex
].To
.Sym
];
571 TotalMemAccesses
+= HotTargets
[I
].first
;
573 uint64_t RemainingMemAccesses
= TotalMemAccesses
;
575 opts::ICPJumpTablesTopN
? opts::ICPJumpTablesTopN
: opts::ICPTopN
;
577 for (; I
< HotTargets
.size(); ++I
) {
578 const uint64_t MemAccesses
= HotTargets
[I
].first
;
579 if (100 * MemAccesses
<
580 TotalMemAccesses
* opts::ICPJTTotalPercentThreshold
)
582 if (100 * MemAccesses
<
583 RemainingMemAccesses
* opts::ICPJTRemainingPercentThreshold
)
585 if (TopN
&& I
>= TopN
)
587 RemainingMemAccesses
-= MemAccesses
;
589 const uint64_t JTIndex
= HotTargets
[I
].second
;
590 Callsite
&Target
= Targets
[findTargetsIndex(JTIndex
)];
592 NewTargets
.push_back(Target
);
593 std::vector
<uint64_t>({JTIndex
}).swap(NewTargets
.back().JTIndices
);
594 llvm::erase(Target
.JTIndices
, JTIndex
);
596 // Keep fixCFG counts sane if more indices use this same target later
597 assert(IndicesPerTarget
[Target
.To
.Sym
] > 0 && "wrong map");
598 NewTargets
.back().Branches
=
599 Target
.Branches
/ IndicesPerTarget
[Target
.To
.Sym
];
600 NewTargets
.back().Mispreds
=
601 Target
.Mispreds
/ IndicesPerTarget
[Target
.To
.Sym
];
602 assert(Target
.Branches
>= NewTargets
.back().Branches
);
603 assert(Target
.Mispreds
>= NewTargets
.back().Mispreds
);
604 Target
.Branches
-= NewTargets
.back().Branches
;
605 Target
.Mispreds
-= NewTargets
.back().Mispreds
;
607 llvm::copy(Targets
, std::back_inserter(NewTargets
));
608 std::swap(NewTargets
, Targets
);
611 if (N
== 0 && opts::Verbosity
>= 1) {
612 outs() << "BOLT-INFO: ICP failed in " << *BB
.getFunction() << " in "
613 << BB
.getName() << ": failed to meet thresholds after memory "
614 << "profile data was loaded.\n";
619 for (size_t I
= 0, TgtIdx
= 0; I
< N
; ++TgtIdx
) {
620 Callsite
&Target
= Targets
[TgtIdx
];
621 assert(Target
.To
.Sym
&& "All ICP targets must be to known symbols");
622 assert(!Target
.JTIndices
.empty() && "Jump tables must have indices");
623 for (uint64_t Idx
: Target
.JTIndices
) {
624 SymTargets
.emplace_back(Target
.To
.Sym
, Idx
);
632 IndirectCallPromotion::MethodInfoType
IndirectCallPromotion::maybeGetVtableSyms(
633 BinaryBasicBlock
&BB
, MCInst
&Inst
,
634 const SymTargetsType
&SymTargets
) const {
635 BinaryFunction
&Function
= *BB
.getFunction();
636 BinaryContext
&BC
= Function
.getBinaryContext();
637 std::vector
<std::pair
<MCSymbol
*, uint64_t>> VtableSyms
;
638 std::vector
<MCInst
*> MethodFetchInsns
;
639 unsigned VtableReg
, MethodReg
;
640 uint64_t MethodOffset
;
642 assert(!Function
.getJumpTable(Inst
) &&
643 "Can't get vtable addrs for jump tables.");
645 if (!Function
.hasMemoryProfile() || !opts::EliminateLoads
)
646 return MethodInfoType();
648 MutableArrayRef
<MCInst
> Insts(&BB
.front(), &Inst
+ 1);
649 if (!BC
.MIB
->analyzeVirtualMethodCall(Insts
.begin(), Insts
.end(),
650 MethodFetchInsns
, VtableReg
, MethodReg
,
653 1, dbgs() << "BOLT-INFO: ICP unable to analyze method call in "
654 << Function
<< " at @ " << (&Inst
- &BB
.front()) << "\n");
655 return MethodInfoType();
658 ++TotalMethodLoadEliminationCandidates
;
661 dbgs() << "BOLT-INFO: ICP found virtual method call in " << Function
662 << " at @ " << (&Inst
- &BB
.front()) << "\n";
663 dbgs() << "BOLT-INFO: ICP method fetch instructions:\n";
664 for (MCInst
*Inst
: MethodFetchInsns
)
665 BC
.printInstruction(dbgs(), *Inst
, 0, &Function
);
667 if (MethodFetchInsns
.back() != &Inst
)
668 BC
.printInstruction(dbgs(), Inst
, 0, &Function
);
671 // Try to get value profiling data for the method load instruction.
672 auto ErrorOrMemAccessProfile
=
673 BC
.MIB
->tryGetAnnotationAs
<MemoryAccessProfile
>(*MethodFetchInsns
.back(),
674 "MemoryAccessProfile");
675 if (!ErrorOrMemAccessProfile
) {
676 DEBUG_VERBOSE(1, dbgs()
677 << "BOLT-INFO: ICP no memory profiling data found\n");
678 return MethodInfoType();
680 MemoryAccessProfile
&MemAccessProfile
= ErrorOrMemAccessProfile
.get();
682 // Find the vtable that each method belongs to.
683 std::map
<const MCSymbol
*, uint64_t> MethodToVtable
;
685 for (const AddressAccess
&AccessInfo
: MemAccessProfile
.AddressAccessInfo
) {
686 uint64_t Address
= AccessInfo
.Offset
;
687 if (AccessInfo
.MemoryObject
)
688 Address
+= AccessInfo
.MemoryObject
->getAddress();
690 // Ignore bogus data.
694 const uint64_t VtableBase
= Address
- MethodOffset
;
696 DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP vtable = "
697 << Twine::utohexstr(VtableBase
) << "+"
698 << MethodOffset
<< "/" << AccessInfo
.Count
<< "\n");
700 if (ErrorOr
<uint64_t> MethodAddr
= BC
.getPointerAtAddress(Address
)) {
701 BinaryData
*MethodBD
= BC
.getBinaryDataAtAddress(MethodAddr
.get());
702 if (!MethodBD
) // skip unknown methods
704 MCSymbol
*MethodSym
= MethodBD
->getSymbol();
705 MethodToVtable
[MethodSym
] = VtableBase
;
707 const BinaryFunction
*Method
= BC
.getFunctionForSymbol(MethodSym
);
708 dbgs() << "BOLT-INFO: ICP found method = "
709 << Twine::utohexstr(MethodAddr
.get()) << "/"
710 << (Method
? Method
->getPrintName() : "") << "\n";
715 // Find the vtable for each target symbol.
716 for (size_t I
= 0; I
< SymTargets
.size(); ++I
) {
717 auto Itr
= MethodToVtable
.find(SymTargets
[I
].first
);
718 if (Itr
!= MethodToVtable
.end()) {
719 if (BinaryData
*BD
= BC
.getBinaryDataContainingAddress(Itr
->second
)) {
720 const uint64_t Addend
= Itr
->second
- BD
->getAddress();
721 VtableSyms
.emplace_back(BD
->getSymbol(), Addend
);
725 // Give up if we can't find the vtable for a method.
726 DEBUG_VERBOSE(1, dbgs() << "BOLT-INFO: ICP can't find vtable for "
727 << SymTargets
[I
].first
->getName() << "\n");
728 return MethodInfoType();
731 // Make sure the vtable reg is not clobbered by the argument passing code
732 if (VtableReg
!= MethodReg
) {
733 for (MCInst
*CurInst
= MethodFetchInsns
.front(); CurInst
< &Inst
;
735 const MCInstrDesc
&InstrInfo
= BC
.MII
->get(CurInst
->getOpcode());
736 if (InstrInfo
.hasDefOfPhysReg(*CurInst
, VtableReg
, *BC
.MRI
))
737 return MethodInfoType();
741 return MethodInfoType(VtableSyms
, MethodFetchInsns
);
744 std::vector
<std::unique_ptr
<BinaryBasicBlock
>>
745 IndirectCallPromotion::rewriteCall(
746 BinaryBasicBlock
&IndCallBlock
, const MCInst
&CallInst
,
747 MCPlusBuilder::BlocksVectorTy
&&ICPcode
,
748 const std::vector
<MCInst
*> &MethodFetchInsns
) const {
749 BinaryFunction
&Function
= *IndCallBlock
.getFunction();
750 MCPlusBuilder
*MIB
= Function
.getBinaryContext().MIB
.get();
752 // Create new basic blocks with correct code in each one first.
753 std::vector
<std::unique_ptr
<BinaryBasicBlock
>> NewBBs
;
754 const bool IsTailCallOrJT
=
755 (MIB
->isTailCall(CallInst
) || Function
.getJumpTable(CallInst
));
757 // Move instructions from the tail of the original call block
758 // to the merge block.
760 // Remember any pseudo instructions following a tail call. These
761 // must be preserved and moved to the original block.
762 InstructionListType TailInsts
;
763 const MCInst
*TailInst
= &CallInst
;
765 while (TailInst
+ 1 < &(*IndCallBlock
.end()) &&
766 MIB
->isPseudo(*(TailInst
+ 1)))
767 TailInsts
.push_back(*++TailInst
);
769 InstructionListType MovedInst
= IndCallBlock
.splitInstructions(&CallInst
);
770 // Link new BBs to the original input offset of the BB where the indirect
771 // call site is, so we can map samples recorded in new BBs back to the
772 // original BB seen in the input binary (if using BAT)
773 const uint32_t OrigOffset
= IndCallBlock
.getInputOffset();
775 IndCallBlock
.eraseInstructions(MethodFetchInsns
.begin(),
776 MethodFetchInsns
.end());
777 if (IndCallBlock
.empty() ||
778 (!MethodFetchInsns
.empty() && MethodFetchInsns
.back() == &CallInst
))
779 IndCallBlock
.addInstructions(ICPcode
.front().second
.begin(),
780 ICPcode
.front().second
.end());
782 IndCallBlock
.replaceInstruction(std::prev(IndCallBlock
.end()),
783 ICPcode
.front().second
);
784 IndCallBlock
.addInstructions(TailInsts
.begin(), TailInsts
.end());
786 for (auto Itr
= ICPcode
.begin() + 1; Itr
!= ICPcode
.end(); ++Itr
) {
787 MCSymbol
*&Sym
= Itr
->first
;
788 InstructionListType
&Insts
= Itr
->second
;
790 std::unique_ptr
<BinaryBasicBlock
> TBB
= Function
.createBasicBlock(Sym
);
791 TBB
->setOffset(OrigOffset
);
792 for (MCInst
&Inst
: Insts
) // sanitize new instructions.
793 if (MIB
->isCall(Inst
))
794 MIB
->removeAnnotation(Inst
, "CallProfile");
795 TBB
->addInstructions(Insts
.begin(), Insts
.end());
796 NewBBs
.emplace_back(std::move(TBB
));
799 // Move tail of instructions from after the original call to
802 NewBBs
.back()->addInstructions(MovedInst
.begin(), MovedInst
.end());
808 IndirectCallPromotion::fixCFG(BinaryBasicBlock
&IndCallBlock
,
809 const bool IsTailCall
, const bool IsJumpTable
,
810 IndirectCallPromotion::BasicBlocksVector
&&NewBBs
,
811 const std::vector
<Callsite
> &Targets
) const {
812 BinaryFunction
&Function
= *IndCallBlock
.getFunction();
813 using BinaryBranchInfo
= BinaryBasicBlock::BinaryBranchInfo
;
814 BinaryBasicBlock
*MergeBlock
= nullptr;
816 // Scale indirect call counts to the execution count of the original
817 // basic block containing the indirect call.
818 uint64_t TotalCount
= IndCallBlock
.getKnownExecutionCount();
819 uint64_t TotalIndirectBranches
= 0;
820 for (const Callsite
&Target
: Targets
)
821 TotalIndirectBranches
+= Target
.Branches
;
822 if (TotalIndirectBranches
== 0)
823 TotalIndirectBranches
= 1;
824 BinaryBasicBlock::BranchInfoType BBI
;
825 BinaryBasicBlock::BranchInfoType ScaledBBI
;
826 for (const Callsite
&Target
: Targets
) {
827 const size_t NumEntries
=
828 std::max(static_cast<std::size_t>(1UL), Target
.JTIndices
.size());
829 for (size_t I
= 0; I
< NumEntries
; ++I
) {
831 BinaryBranchInfo
{(Target
.Branches
+ NumEntries
- 1) / NumEntries
,
832 (Target
.Mispreds
+ NumEntries
- 1) / NumEntries
});
834 BinaryBranchInfo
{uint64_t(TotalCount
* Target
.Branches
/
835 (NumEntries
* TotalIndirectBranches
)),
836 uint64_t(TotalCount
* Target
.Mispreds
/
837 (NumEntries
* TotalIndirectBranches
))});
842 BinaryBasicBlock
*NewIndCallBlock
= NewBBs
.back().get();
843 IndCallBlock
.moveAllSuccessorsTo(NewIndCallBlock
);
845 std::vector
<MCSymbol
*> SymTargets
;
846 for (const Callsite
&Target
: Targets
) {
847 const size_t NumEntries
=
848 std::max(static_cast<std::size_t>(1UL), Target
.JTIndices
.size());
849 for (size_t I
= 0; I
< NumEntries
; ++I
)
850 SymTargets
.push_back(Target
.To
.Sym
);
852 assert(SymTargets
.size() > NewBBs
.size() - 1 &&
853 "There must be a target symbol associated with each new BB.");
855 for (uint64_t I
= 0; I
< NewBBs
.size(); ++I
) {
856 BinaryBasicBlock
*SourceBB
= I
? NewBBs
[I
- 1].get() : &IndCallBlock
;
857 SourceBB
->setExecutionCount(TotalCount
);
859 BinaryBasicBlock
*TargetBB
=
860 Function
.getBasicBlockForLabel(SymTargets
[I
]);
861 SourceBB
->addSuccessor(TargetBB
, ScaledBBI
[I
]); // taken
863 TotalCount
-= ScaledBBI
[I
].Count
;
864 SourceBB
->addSuccessor(NewBBs
[I
].get(), TotalCount
); // fall-through
866 // Update branch info for the indirect jump.
867 BinaryBasicBlock::BinaryBranchInfo
&BranchInfo
=
868 NewIndCallBlock
->getBranchInfo(*TargetBB
);
869 if (BranchInfo
.Count
> BBI
[I
].Count
)
870 BranchInfo
.Count
-= BBI
[I
].Count
;
872 BranchInfo
.Count
= 0;
874 if (BranchInfo
.MispredictedCount
> BBI
[I
].MispredictedCount
)
875 BranchInfo
.MispredictedCount
-= BBI
[I
].MispredictedCount
;
877 BranchInfo
.MispredictedCount
= 0;
880 assert(NewBBs
.size() >= 2);
881 assert(NewBBs
.size() % 2 == 1 || IndCallBlock
.succ_empty());
882 assert(NewBBs
.size() % 2 == 1 || IsTailCall
);
884 auto ScaledBI
= ScaledBBI
.begin();
885 auto updateCurrentBranchInfo
= [&] {
886 assert(ScaledBI
!= ScaledBBI
.end());
887 TotalCount
-= ScaledBI
->Count
;
892 MergeBlock
= NewBBs
.back().get();
893 IndCallBlock
.moveAllSuccessorsTo(MergeBlock
);
896 // Fix up successors and execution counts.
897 updateCurrentBranchInfo();
898 IndCallBlock
.addSuccessor(NewBBs
[1].get(), TotalCount
);
899 IndCallBlock
.addSuccessor(NewBBs
[0].get(), ScaledBBI
[0]);
901 const size_t Adj
= IsTailCall
? 1 : 2;
902 for (size_t I
= 0; I
< NewBBs
.size() - Adj
; ++I
) {
903 assert(TotalCount
<= IndCallBlock
.getExecutionCount() ||
904 TotalCount
<= uint64_t(TotalIndirectBranches
));
905 uint64_t ExecCount
= ScaledBBI
[(I
+ 1) / 2].Count
;
908 NewBBs
[I
]->addSuccessor(MergeBlock
, ScaledBBI
[(I
+ 1) / 2].Count
);
910 assert(I
+ 2 < NewBBs
.size());
911 updateCurrentBranchInfo();
912 NewBBs
[I
]->addSuccessor(NewBBs
[I
+ 2].get(), TotalCount
);
913 NewBBs
[I
]->addSuccessor(NewBBs
[I
+ 1].get(), ScaledBBI
[(I
+ 1) / 2]);
914 ExecCount
+= TotalCount
;
916 NewBBs
[I
]->setExecutionCount(ExecCount
);
920 // Arrange for the MergeBlock to be the fallthrough for the first
921 // promoted call block.
922 std::unique_ptr
<BinaryBasicBlock
> MBPtr
;
923 std::swap(MBPtr
, NewBBs
.back());
925 NewBBs
.emplace(NewBBs
.begin() + 1, std::move(MBPtr
));
926 // TODO: is COUNT_FALLTHROUGH_EDGE the right thing here?
927 NewBBs
.back()->addSuccessor(MergeBlock
, TotalCount
); // uncond branch
931 // Update the execution count.
932 NewBBs
.back()->setExecutionCount(TotalCount
);
934 // Update BB and BB layout.
935 Function
.insertBasicBlocks(&IndCallBlock
, std::move(NewBBs
));
936 assert(Function
.validateCFG());
// Decide whether the indirect call / jump-table branch at `Inst` (inside `BB`)
// is profitable to promote. The visible logic computes how many of the hottest
// targets (N) pass the configured frequency / misprediction thresholds and
// accumulates pass-wide statistics along the way.
// NOTE(review): the embedded original line numbers jump in several places
// (e.g. 947 -> 950, 1079 -> 1087), so statements such as early `return 0;`
// bodies and the final return/closing brace were elided by the extraction —
// restore them from the upstream file before relying on this text.
941 size_t IndirectCallPromotion::canPromoteCallsite(
942 const BinaryBasicBlock
&BB
, const MCInst
&Inst
,
943 const std::vector
<Callsite
> &Targets
, uint64_t NumCalls
) {
944 BinaryFunction
*BF
= BB
.getFunction();
945 const BinaryContext
&BC
= BF
->getBinaryContext();
// Skip cold callsites: blocks executed fewer times than the global
// -execution-count-threshold are never promoted.
// NOTE(review): the guard's body (presumably `return 0;`) is elided here.
947 if (BB
.getKnownExecutionCount() < opts::ExecutionCountThreshold
)
// Jump-table callsites and plain indirect calls use different thresholds below.
950 const bool IsJumpTable
= BF
->getJumpTable(Inst
);
// Helper: fold the branch counts of the first N targets into the pass-wide
// "frequent" statistics (jumps vs. calls — the branch separating the two
// accumulations is elided in this extraction).
952 auto computeStats
= [&](size_t N
) {
953 for (size_t I
= 0; I
< N
; ++I
)
955 TotalNumFrequentJmps
+= Targets
[I
].Branches
;
957 TotalNumFrequentCalls
+= Targets
[I
].Branches
;
960 // If we have no targets (or no calls), skip this callsite.
961 if (Targets
.empty() || !NumCalls
) {
962 if (opts::Verbosity
>= 1) {
963 const ptrdiff_t InstIdx
= &Inst
- &(*BB
.begin());
964 outs() << "BOLT-INFO: ICP failed in " << *BF
<< " @ " << InstIdx
<< " in "
965 << BB
.getName() << ", calls = " << NumCalls
966 << ", targets empty or NumCalls == 0.\n";
// Pick the per-kind top-N limit: jump-table and call-specific options
// override the generic -icp-topn value when set (the `if` lines selecting
// between them are elided here).
971 size_t TopN
= opts::ICPTopN
;
973 TopN
= opts::ICPJumpTablesTopN
? opts::ICPJumpTablesTopN
: TopN
;
975 TopN
= opts::ICPCallsTopN
? opts::ICPCallsTopN
: TopN
;
// TrialN caps how many targets the threshold loops below examine.
977 const size_t TrialN
= TopN
? std::min(TopN
, Targets
.size()) : Targets
.size();
// With -icp-top-callsites, only callsites pre-annotated "DoICP" (by
// runOnFunctions) are considered; the skip body is elided here.
979 if (opts::ICPTopCallsites
&& !BC
.MIB
->hasAnnotation(Inst
, "DoICP"))
982 // Pick the top N targets.
983 uint64_t TotalMispredictsTopN
= 0;
// Mispredict-driven selection mode: grow N while the accumulated
// misprediction percentage still leaves the threshold budget positive.
986 if (opts::ICPUseMispredicts
&&
987 (!IsJumpTable
|| opts::ICPJumpTablesByTarget
)) {
988 // Count total number of mispredictions for (at most) the top N targets.
989 // We may choose a smaller N (TrialN vs. N) if the frequency threshold
990 // is exceeded by fewer targets.
991 double Threshold
= double(opts::ICPMispredictThreshold
);
992 for (size_t I
= 0; I
< TrialN
&& Threshold
> 0; ++I
, ++N
) {
993 Threshold
-= (100.0 * Targets
[I
].Mispreds
) / NumCalls
;
994 TotalMispredictsTopN
+= Targets
[I
].Mispreds
;
998 // Compute the misprediction frequency of the top N call targets. If this
999 // frequency is greater than the threshold, we should try ICP on this
1001 const double TopNFrequency
= (100.0 * TotalMispredictsTopN
) / NumCalls
;
1002 if (TopNFrequency
== 0 || TopNFrequency
< opts::ICPMispredictThreshold
) {
1003 if (opts::Verbosity
>= 1) {
1004 const ptrdiff_t InstIdx
= &Inst
- &(*BB
.begin());
1005 outs() << "BOLT-INFO: ICP failed in " << *BF
<< " @ " << InstIdx
1006 << " in " << BB
.getName() << ", calls = " << NumCalls
1007 << ", top N mis. frequency " << format("%.1f", TopNFrequency
)
1008 << "% < " << opts::ICPMispredictThreshold
<< "%\n";
// Frequency-driven selection mode (the `else` branch of the block above —
// the `else` keyword itself is elided). Each candidate target must exceed
// both a total-count and a remaining-count percentage threshold.
1013 size_t MaxTargets
= 0;
1015 // Count total number of calls for (at most) the top N targets.
1016 // We may choose a smaller N (TrialN vs. N) if the frequency threshold
1017 // is exceeded by fewer targets.
1018 const unsigned TotalThreshold
= IsJumpTable
1019 ? opts::ICPJTTotalPercentThreshold
1020 : opts::ICPCallsTotalPercentThreshold
;
1021 const unsigned RemainingThreshold
=
1022 IsJumpTable
? opts::ICPJTRemainingPercentThreshold
1023 : opts::ICPCallsRemainingPercentThreshold
;
1024 uint64_t NumRemainingCalls
= NumCalls
;
// NOTE(review): the bodies of the two threshold guards below (`break`
// statements upstream) and the guard at orig line 1030 are elided.
// A jump-table target may cover several JT indices; N grows by that many.
1025 for (size_t I
= 0; I
< TrialN
; ++I
, ++MaxTargets
) {
1026 if (100 * Targets
[I
].Branches
< NumCalls
* TotalThreshold
)
1028 if (100 * Targets
[I
].Branches
< NumRemainingCalls
* RemainingThreshold
)
1030 if (N
+ (Targets
[I
].JTIndices
.empty() ? 1 : Targets
[I
].JTIndices
.size()) >
1033 TotalMispredictsTopN
+= Targets
[I
].Mispreds
;
1034 NumRemainingCalls
-= Targets
[I
].Branches
;
1035 N
+= Targets
[I
].JTIndices
.empty() ? 1 : Targets
[I
].JTIndices
.size();
// Record frequent-target statistics for the MaxTargets candidates examined.
1037 computeStats(MaxTargets
);
1039 // Don't check misprediction frequency for jump tables -- we don't really
1040 // care as long as we are saving loads from the jump table.
1041 if (!IsJumpTable
|| opts::ICPJumpTablesByTarget
) {
1042 // Compute the misprediction frequency of the top N call targets. If
1043 // this frequency is less than the threshold, we should skip ICP at
1045 const double TopNMispredictFrequency
=
1046 (100.0 * TotalMispredictsTopN
) / NumCalls
;
1048 if (TopNMispredictFrequency
< opts::ICPMispredictThreshold
) {
1049 if (opts::Verbosity
>= 1) {
1050 const ptrdiff_t InstIdx
= &Inst
- &(*BB
.begin());
1051 outs() << "BOLT-INFO: ICP failed in " << *BF
<< " @ " << InstIdx
1052 << " in " << BB
.getName() << ", calls = " << NumCalls
1053 << ", top N mispredict frequency "
1054 << format("%.1f", TopNMispredictFrequency
) << "% < "
1055 << opts::ICPMispredictThreshold
<< "%\n";
1062 // Filter by inline-ability of target functions, stop at first target that
1063 // can't be inlined.
// NOTE(review): the truncation body (shrinking N on the first non-inlinable
// target) is elided after the guard below.
1064 if (!IsJumpTable
&& opts::ICPPeelForInline
) {
1065 for (size_t I
= 0; I
< N
; ++I
) {
1066 const MCSymbol
*TargetSym
= Targets
[I
].To
.Sym
;
1067 const BinaryFunction
*TargetBF
= BC
.getFunctionForSymbol(TargetSym
);
1068 if (!TargetBF
|| !BinaryFunctionPass::shouldOptimize(*TargetBF
) ||
1069 getInliningInfo(*TargetBF
).Type
== InliningType::INL_NONE
) {
1076 // Filter functions that can have ICP applied (for debugging)
// NOTE(review): the match body and the function's final `return N;` /
// closing brace are elided after this point.
1077 if (!opts::ICPFuncsList
.empty()) {
1078 for (std::string
&Name
: opts::ICPFuncsList
)
1079 if (BF
->hasName(Name
))
// Print a verbose diagnostic line for an ICP candidate callsite followed by
// one line per promoted target (N of them): taken/mispredict counts and their
// percentages of NumCalls, plus jump-table indices when present.
// NOTE(review): the embedded original line numbers jump (1094 -> 1096,
// 1115 -> 1122), so some lines — including the declaration of `First` used
// in the JT-indices loop and, presumably, an LLVM_DEBUG wrapper around the
// trailing dbgs() output — were elided by the extraction; restore from the
// upstream file.
1087 void IndirectCallPromotion::printCallsiteInfo(
1088 const BinaryBasicBlock
&BB
, const MCInst
&Inst
,
1089 const std::vector
<Callsite
> &Targets
, const size_t N
,
1090 uint64_t NumCalls
) const {
1091 BinaryContext
&BC
= BB
.getFunction()->getBinaryContext();
1092 const bool IsTailCall
= BC
.MIB
->isTailCall(Inst
);
1093 const bool IsJumpTable
= BB
.getFunction()->getJumpTable(Inst
);
// Index of the instruction within its block, used to identify the callsite
// in the output (same convention as the failure messages elsewhere).
1094 const ptrdiff_t InstIdx
= &Inst
- &(*BB
.begin());
1096 outs() << "BOLT-INFO: ICP candidate branch info: " << *BB
.getFunction()
1097 << " @ " << InstIdx
<< " in " << BB
.getName()
1098 << " -> calls = " << NumCalls
1099 << (IsTailCall
? " (tail)" : (IsJumpTable
? " (jump table)" : ""))
// One output line per selected target.
1101 for (size_t I
= 0; I
< N
; I
++) {
1102 const double Frequency
= 100.0 * Targets
[I
].Branches
/ NumCalls
;
1103 const double MisFrequency
= 100.0 * Targets
[I
].Mispreds
/ NumCalls
;
1104 outs() << "BOLT-INFO: ";
// Prefer the symbolic target name; fall back to the raw address (the `else`
// between these two prints is elided here).
1105 if (Targets
[I
].To
.Sym
)
1106 outs() << Targets
[I
].To
.Sym
->getName();
1108 outs() << Targets
[I
].To
.Addr
;
1109 outs() << ", calls = " << Targets
[I
].Branches
1110 << ", mispreds = " << Targets
[I
].Mispreds
1111 << ", taken freq = " << format("%.1f", Frequency
) << "%"
1112 << ", mis. freq = " << format("%.1f", MisFrequency
) << "%";
// Jump-table targets also list the table indices they cover; `First`
// (declaration elided at orig line 1113) switches the separator.
1114 for (uint64_t JTIndex
: Targets
[I
].JTIndices
) {
1115 outs() << (First
? ", indices = " : ", ") << JTIndex
;
// Debug-stream dump of the original (pre-promotion) call instruction.
1122 dbgs() << "BOLT-INFO: ICP original call instruction:";
1123 BC
.printInstruction(dbgs(), Inst
, Targets
[0].From
.Addr
, nullptr, true);
1127 void IndirectCallPromotion::runOnFunctions(BinaryContext
&BC
) {
1128 if (opts::ICP
== ICP_NONE
)
1131 auto &BFs
= BC
.getBinaryFunctions();
1133 const bool OptimizeCalls
= (opts::ICP
== ICP_CALLS
|| opts::ICP
== ICP_ALL
);
1134 const bool OptimizeJumpTables
=
1135 (opts::ICP
== ICP_JUMP_TABLES
|| opts::ICP
== ICP_ALL
);
1137 std::unique_ptr
<RegAnalysis
> RA
;
1138 std::unique_ptr
<BinaryFunctionCallGraph
> CG
;
1139 if (OptimizeJumpTables
) {
1140 CG
.reset(new BinaryFunctionCallGraph(buildCallGraph(BC
)));
1141 RA
.reset(new RegAnalysis(BC
, &BFs
, &*CG
));
1144 // If icp-top-callsites is enabled, compute the total number of indirect
1145 // calls and then optimize the hottest callsites that contribute to that
1147 SetVector
<BinaryFunction
*> Functions
;
1148 if (opts::ICPTopCallsites
== 0) {
1149 for (auto &KV
: BFs
)
1150 Functions
.insert(&KV
.second
);
1152 using IndirectCallsite
= std::tuple
<uint64_t, MCInst
*, BinaryFunction
*>;
1153 std::vector
<IndirectCallsite
> IndirectCalls
;
1154 size_t TotalIndirectCalls
= 0;
1156 // Find all the indirect callsites.
1157 for (auto &BFIt
: BFs
) {
1158 BinaryFunction
&Function
= BFIt
.second
;
1160 if (!shouldOptimize(Function
))
1163 const bool HasLayout
= !Function
.getLayout().block_empty();
1165 for (BinaryBasicBlock
&BB
: Function
) {
1166 if (HasLayout
&& Function
.isSplit() && BB
.isCold())
1169 for (MCInst
&Inst
: BB
) {
1170 const bool IsJumpTable
= Function
.getJumpTable(Inst
);
1171 const bool HasIndirectCallProfile
=
1172 BC
.MIB
->hasAnnotation(Inst
, "CallProfile");
1173 const bool IsDirectCall
=
1174 (BC
.MIB
->isCall(Inst
) && BC
.MIB
->getTargetSymbol(Inst
, 0));
1176 if (!IsDirectCall
&&
1177 ((HasIndirectCallProfile
&& !IsJumpTable
&& OptimizeCalls
) ||
1178 (IsJumpTable
&& OptimizeJumpTables
))) {
1179 uint64_t NumCalls
= 0;
1180 for (const Callsite
&BInfo
: getCallTargets(BB
, Inst
))
1181 NumCalls
+= BInfo
.Branches
;
1182 IndirectCalls
.push_back(
1183 std::make_tuple(NumCalls
, &Inst
, &Function
));
1184 TotalIndirectCalls
+= NumCalls
;
1190 // Sort callsites by execution count.
1191 llvm::sort(reverse(IndirectCalls
));
1193 // Find callsites that contribute to the top "opts::ICPTopCallsites"%
1195 const float TopPerc
= opts::ICPTopCallsites
/ 100.0f
;
1196 int64_t MaxCalls
= TotalIndirectCalls
* TopPerc
;
1197 uint64_t LastFreq
= std::numeric_limits
<uint64_t>::max();
1199 for (const IndirectCallsite
&IC
: IndirectCalls
) {
1200 const uint64_t CurFreq
= std::get
<0>(IC
);
1201 // Once we decide to stop, include at least all branches that share the
1202 // same frequency of the last one to avoid non-deterministic behavior
1203 // (e.g. turning on/off ICP depending on the order of functions)
1204 if (MaxCalls
<= 0 && CurFreq
!= LastFreq
)
1206 MaxCalls
-= CurFreq
;
1208 BC
.MIB
->addAnnotation(*std::get
<1>(IC
), "DoICP", true);
1209 Functions
.insert(std::get
<2>(IC
));
1212 outs() << "BOLT-INFO: ICP Total indirect calls = " << TotalIndirectCalls
1213 << ", " << Num
<< " callsites cover " << opts::ICPTopCallsites
1214 << "% of all indirect calls\n";
1217 for (BinaryFunction
*FuncPtr
: Functions
) {
1218 BinaryFunction
&Function
= *FuncPtr
;
1220 if (!shouldOptimize(Function
))
1223 const bool HasLayout
= !Function
.getLayout().block_empty();
1225 // Total number of indirect calls issued from the current Function.
1226 // (a fraction of TotalIndirectCalls)
1227 uint64_t FuncTotalIndirectCalls
= 0;
1228 uint64_t FuncTotalIndirectJmps
= 0;
1230 std::vector
<BinaryBasicBlock
*> BBs
;
1231 for (BinaryBasicBlock
&BB
: Function
) {
1232 // Skip indirect calls in cold blocks.
1233 if (!HasLayout
|| !Function
.isSplit() || !BB
.isCold())
1239 DataflowInfoManager
Info(Function
, RA
.get(), nullptr);
1240 while (!BBs
.empty()) {
1241 BinaryBasicBlock
*BB
= BBs
.back();
1244 for (unsigned Idx
= 0; Idx
< BB
->size(); ++Idx
) {
1245 MCInst
&Inst
= BB
->getInstructionAtIndex(Idx
);
1246 const ptrdiff_t InstIdx
= &Inst
- &(*BB
->begin());
1247 const bool IsTailCall
= BC
.MIB
->isTailCall(Inst
);
1248 const bool HasIndirectCallProfile
=
1249 BC
.MIB
->hasAnnotation(Inst
, "CallProfile");
1250 const bool IsJumpTable
= Function
.getJumpTable(Inst
);
1252 if (BC
.MIB
->isCall(Inst
))
1253 TotalCalls
+= BB
->getKnownExecutionCount();
1255 if (IsJumpTable
&& !OptimizeJumpTables
)
1258 if (!IsJumpTable
&& (!HasIndirectCallProfile
|| !OptimizeCalls
))
1261 // Ignore direct calls.
1262 if (BC
.MIB
->isCall(Inst
) && BC
.MIB
->getTargetSymbol(Inst
, 0))
1265 assert((BC
.MIB
->isCall(Inst
) || BC
.MIB
->isIndirectBranch(Inst
)) &&
1266 "expected a call or an indirect jump instruction");
1269 ++TotalJumpTableCallsites
;
1271 ++TotalIndirectCallsites
;
1273 std::vector
<Callsite
> Targets
= getCallTargets(*BB
, Inst
);
1275 // Compute the total number of calls from this particular callsite.
1276 uint64_t NumCalls
= 0;
1277 for (const Callsite
&BInfo
: Targets
)
1278 NumCalls
+= BInfo
.Branches
;
1280 FuncTotalIndirectCalls
+= NumCalls
;
1282 FuncTotalIndirectJmps
+= NumCalls
;
1284 // If FLAGS regs is alive after this jmp site, do not try
1285 // promoting because we will clobber FLAGS.
1287 ErrorOr
<const BitVector
&> State
=
1288 Info
.getLivenessAnalysis().getStateBefore(Inst
);
1289 if (!State
|| (State
&& (*State
)[BC
.MIB
->getFlagsReg()])) {
1290 if (opts::Verbosity
>= 1)
1291 outs() << "BOLT-INFO: ICP failed in " << Function
<< " @ "
1292 << InstIdx
<< " in " << BB
->getName()
1293 << ", calls = " << NumCalls
1294 << (State
? ", cannot clobber flags reg.\n"
1295 : ", no liveness data available.\n");
1300 // Should this callsite be optimized? Return the number of targets
1301 // to use when promoting this call. A value of zero means to skip
1303 size_t N
= canPromoteCallsite(*BB
, Inst
, Targets
, NumCalls
);
1305 // If it is a jump table and it failed to meet our initial threshold,
1306 // proceed to findCallTargetSymbols -- it may reevaluate N if
1307 // memory profile is present
1308 if (!N
&& !IsJumpTable
)
1311 if (opts::Verbosity
>= 1)
1312 printCallsiteInfo(*BB
, Inst
, Targets
, N
, NumCalls
);
1314 // Find MCSymbols or absolute addresses for each call target.
1315 MCInst
*TargetFetchInst
= nullptr;
1316 const SymTargetsType SymTargets
=
1317 findCallTargetSymbols(Targets
, N
, *BB
, Inst
, TargetFetchInst
);
1319 // findCallTargetSymbols may have changed N if mem profile is available
1324 LLVM_DEBUG(printDecision(dbgs(), Targets
, N
));
1326 // If we can't resolve any of the target symbols, punt on this callsite.
1327 // TODO: can this ever happen?
1328 if (SymTargets
.size() < N
) {
1329 const size_t LastTarget
= SymTargets
.size();
1330 if (opts::Verbosity
>= 1)
1331 outs() << "BOLT-INFO: ICP failed in " << Function
<< " @ "
1332 << InstIdx
<< " in " << BB
->getName()
1333 << ", calls = " << NumCalls
1334 << ", ICP failed to find target symbol for "
1335 << Targets
[LastTarget
].To
.Sym
->getName() << "\n";
1339 MethodInfoType MethodInfo
;
1342 MethodInfo
= maybeGetVtableSyms(*BB
, Inst
, SymTargets
);
1343 TotalMethodLoadsEliminated
+= MethodInfo
.first
.empty() ? 0 : 1;
1345 << "BOLT-INFO: ICP "
1346 << (!MethodInfo
.first
.empty() ? "found" : "did not find")
1347 << " vtables for all methods.\n");
1348 } else if (TargetFetchInst
) {
1349 ++TotalIndexBasedJumps
;
1350 MethodInfo
.second
.push_back(TargetFetchInst
);
1353 // Generate new promoted call code for this callsite.
1354 MCPlusBuilder::BlocksVectorTy ICPcode
=
1355 (IsJumpTable
&& !opts::ICPJumpTablesByTarget
)
1356 ? BC
.MIB
->jumpTablePromotion(Inst
, SymTargets
,
1357 MethodInfo
.second
, BC
.Ctx
.get())
1358 : BC
.MIB
->indirectCallPromotion(
1359 Inst
, SymTargets
, MethodInfo
.first
, MethodInfo
.second
,
1360 opts::ICPOldCodeSequence
, BC
.Ctx
.get());
1362 if (ICPcode
.empty()) {
1363 if (opts::Verbosity
>= 1)
1364 outs() << "BOLT-INFO: ICP failed in " << Function
<< " @ "
1365 << InstIdx
<< " in " << BB
->getName()
1366 << ", calls = " << NumCalls
1367 << ", unable to generate promoted call code.\n";
1372 uint64_t Offset
= Targets
[0].From
.Addr
;
1373 dbgs() << "BOLT-INFO: ICP indirect call code:\n";
1374 for (const auto &entry
: ICPcode
) {
1375 const MCSymbol
*const &Sym
= entry
.first
;
1376 const InstructionListType
&Insts
= entry
.second
;
1378 dbgs() << Sym
->getName() << ":\n";
1379 Offset
= BC
.printInstructions(dbgs(), Insts
.begin(), Insts
.end(),
1382 dbgs() << "---------------------------------------------------\n";
1385 // Rewrite the CFG with the newly generated ICP code.
1386 std::vector
<std::unique_ptr
<BinaryBasicBlock
>> NewBBs
=
1387 rewriteCall(*BB
, Inst
, std::move(ICPcode
), MethodInfo
.second
);
1389 // Fix the CFG after inserting the new basic blocks.
1390 BinaryBasicBlock
*MergeBlock
=
1391 fixCFG(*BB
, IsTailCall
, IsJumpTable
, std::move(NewBBs
), Targets
);
1393 // Since the tail of the original block was split off and it may contain
1394 // additional indirect calls, we must add the merge block to the set of
1395 // blocks to process.
1397 BBs
.push_back(MergeBlock
);
1399 if (opts::Verbosity
>= 1)
1400 outs() << "BOLT-INFO: ICP succeeded in " << Function
<< " @ "
1401 << InstIdx
<< " in " << BB
->getName()
1402 << " -> calls = " << NumCalls
<< "\n";
1405 ++TotalOptimizedJumpTableCallsites
;
1407 ++TotalOptimizedIndirectCallsites
;
1409 Modified
.insert(&Function
);
1412 TotalIndirectCalls
+= FuncTotalIndirectCalls
;
1413 TotalIndirectJmps
+= FuncTotalIndirectJmps
;
1416 outs() << "BOLT-INFO: ICP total indirect callsites with profile = "
1417 << TotalIndirectCallsites
<< "\n"
1418 << "BOLT-INFO: ICP total jump table callsites = "
1419 << TotalJumpTableCallsites
<< "\n"
1420 << "BOLT-INFO: ICP total number of calls = " << TotalCalls
<< "\n"
1421 << "BOLT-INFO: ICP percentage of calls that are indirect = "
1422 << format("%.1f", (100.0 * TotalIndirectCalls
) / TotalCalls
) << "%\n"
1423 << "BOLT-INFO: ICP percentage of indirect calls that can be "
1425 << format("%.1f", (100.0 * TotalNumFrequentCalls
) /
1426 std::max
<size_t>(TotalIndirectCalls
, 1))
1428 << "BOLT-INFO: ICP percentage of indirect callsites that are "
1430 << format("%.1f", (100.0 * TotalOptimizedIndirectCallsites
) /
1431 std::max
<uint64_t>(TotalIndirectCallsites
, 1))
1433 << "BOLT-INFO: ICP number of method load elimination candidates = "
1434 << TotalMethodLoadEliminationCandidates
<< "\n"
1435 << "BOLT-INFO: ICP percentage of method calls candidates that have "
1436 "loads eliminated = "
1437 << format("%.1f", (100.0 * TotalMethodLoadsEliminated
) /
1439 TotalMethodLoadEliminationCandidates
, 1))
1441 << "BOLT-INFO: ICP percentage of indirect branches that are "
1443 << format("%.1f", (100.0 * TotalNumFrequentJmps
) /
1444 std::max
<uint64_t>(TotalIndirectJmps
, 1))
1446 << "BOLT-INFO: ICP percentage of jump table callsites that are "
1448 << format("%.1f", (100.0 * TotalOptimizedJumpTableCallsites
) /
1449 std::max
<uint64_t>(TotalJumpTableCallsites
, 1))
1451 << "BOLT-INFO: ICP number of jump table callsites that can use hot "
1452 << "indices = " << TotalIndexBasedCandidates
<< "\n"
1453 << "BOLT-INFO: ICP percentage of jump table callsites that use hot "
1455 << format("%.1f", (100.0 * TotalIndexBasedJumps
) /
1456 std::max
<uint64_t>(TotalIndexBasedCandidates
, 1))
1459 (void)verifyProfile
;