//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <limits>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per-TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per-TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));
namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}
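
// For example, a call site carrying the string attribute
// "function-inline-cost"="42" parses to 42 via these helpers, while a
// missing or non-integer attribute yields std::nullopt.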
namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm
namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about the inline cost of a
// particular instruction.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};
/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as an implementation detail of functions
/// such as llvm::getInlineCost. Note the function_ref constructor parameters -
/// the expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  /// Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }

  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've already determined the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason the analysis can't continue if that's the case, or 'true' if it
  /// may continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }

  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize,
                                unsigned NumCaseCluster) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
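  /// For example, if the caller passes a constant argument, a branch in the
  /// callee conditioned on that argument may fold to a known successor, and
  /// the blocks reachable only on the untaken edge end up in DeadBlocks below.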
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive calls.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non-null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
               ProfileSummaryInfo *PSI = nullptr,
               OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  InlineResult analyze();

  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
    if (SimplifiedValues.contains(I))
      return SimplifiedValues[I];
    return std::nullopt;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

// Considering forming a binary search, we should find the number of nodes,
// which is the same as the number of comparisons when lowered. For a given
// number of clusters, n, we can define a recursive function, f(n), to find
// the number of nodes in the tree. The recursion is:
// f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
// and f(n) = n, when n <= 3.
// This leads to a binary tree where the leaves are either f(2) or f(3)
// when n > 3. So, the number of comparisons from leaves should be n, while
// the number from non-leaf nodes should be:
// 2^(log2(n) - 1) - 1
// = 2^log2(n) * 2^-1 - 1
// = n / 2 - 1.
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
// number of comparisons in a simple closed form:
// n + n / 2 - 1 = n * 3 / 2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
}
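
// A worked instance of the closed form: with n = 8 clusters,
// f(8) = 1 + 2*f(4) = 1 + 2*(1 + 2*f(2)) = 1 + 2*(1 + 4) = 11,
// which matches 3 * 8 / 2 - 1 = 11.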
/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc) {
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }
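
  // For example, if Cost is already near INT_MAX, a further
  // addCost(InstrCost) saturates at INT_MAX rather than overflowing the
  // 'int' Cost field.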
  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
  }

  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Maximum valid cost increased in this function.
    if (JumpTableSize) {
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;
      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      addCost(NumCaseCluster * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
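    // E.g., 10 case clusters yield getExpectedNumberOfCompare(10) = 14
    // compare/branch pairs, i.e. SwitchCost = 28 * InstrCost.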
    addCost(SwitchCost);
  }
  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostBefore = Cost;
    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostAfter = Cost;
    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require an instrumentation or sample profile.
      if (!(PSI->hasInstrumentationProfile() || PSI->hasSampleProfile()))
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call sites.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return std::nullopt if
  // we don't have sufficient profiling information to make the determination.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings, expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^24 (or 2^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (isa_and_present<ConstantInt>(
                  SimplifiedValues.lookup(SI->getCondition())))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
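    // Adding EntryCount / 2 before the unsigned division below rounds the
    // per-call savings to the nearest integer instead of truncating.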
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //             PSI->getOrCompHotCountThreshold()
    //     R > -------------------------------------------------
    //         getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                PSI->getOrCompHotCountThreshold()
    //     R <= ----------------------------------------------------
    //          getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
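    //
    // As a worked illustration with hypothetical numbers: if the hot count
    // threshold is 10000, Size is 100, and the default savings multiplier of
    // 8 is in effect, inlining is accepted once CycleSavings * 8 >=
    // 10000 * 100, i.e. once the per-call-site cycle savings reach 125000.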
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties.
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to
    // inline it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    LLVM_DEBUG(dbgs() << "      Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details.
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    if (InstructionCostDetailMap.contains(I))
      return InstructionCostDetailMap[I];
    return std::nullopt;
  }

  virtual ~InlineCostCallAnalyzer() = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};

// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}
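
// E.g., an internal function whose only remaining live use is this call site
// qualifies; the last-call-to-static bonus below relies on this, since such a
// callee can be dropped entirely after inlining.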
class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 4;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Arg);
    if (CostIt == SROACosts.end())
      return;

    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    set(InlineCostFeatureIndex::load_elimination, 1);
  }

  void onCallPenalty() override {
    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(InlineCostFeatureIndex::call_argument_setup,
              Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
              Call.arg_size() * InstrCost);

    if (IsIndirectCall) {
      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false,
                                true);
      if (CA.analyze().isSuccess()) {
        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::nested_inlines, 1);
      }
    } else {
      onCallPenalty();
    }
  }

  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {
    if (JumpTableSize) {
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::num_loops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::simplified_instructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
    set(InlineCostFeatureIndex::constant_offset_ptr_args,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::load_elimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::callsite_cost,
              -1 * getCallsiteCost(TTI, this->CandidateCall, DL));

    set(InlineCostFeatureIndex::cold_cc_penalty,
        (F.getCallingConv() == CallingConv::Cold));

    set(InlineCostFeatureIndex::last_call_to_static_bonus,
        isSoleCallToLocalFunction(CandidateCall, F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzers - instead, we should abstract it to a common method in the
    // CallAnalyzer.
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}

  const InlineCostFeatures &features() const { return Cost; }
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining the given instruction is always printed.
  // The threshold delta is printed only when it is non-zero. It happens
  // when we decided to give a bonus at a particular instruction.
  std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
  if (!Record)
    OS << "; No analysis for the instruction";
  else {
    OS << "; cost before = " << Record->CostBefore
       << ", cost after = " << Record->CostAfter
       << ", threshold before = " << Record->ThresholdBefore
       << ", threshold after = " << Record->ThresholdAfter << ", ";
    OS << "cost delta = " << Record->getCostDelta();
    if (Record->hasThresholdChanged())
      OS << ", threshold delta = " << Record->getThresholdDelta();
  }
  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
  if (C) {
    OS << ", simplified to ";
    (*C)->print(OS, true);
  }
  OS << "\n";
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}
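
// For example, for `getelementptr {i32, i64}, ptr %p, i64 1, i32 1`, the loop
// adds one stride of the struct for the sequential index (16 bytes with
// typical padding) plus the field offset of element 1 (8 bytes) for the
// struct index, leaving Offset = 24.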
/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (const Use &Op : GEP.indices())
    if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(Op);
  return TTI.getInstructionCost(&GEP, Operands,
                                TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  disableSROA(I.getOperand(0));

  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
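  // For instance, `alloca i32, i32 %n` where %n simplifies to a constant 4096
  // at this call site becomes a 16 KiB static allocation once inlined.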
1459 if (I.isArrayAllocation()) {
1460 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1461 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1462 // Sometimes a dynamic alloca could be converted into a static alloca
1463 // after this constant prop, and become a huge static alloca on an
1464 // unconditional CFG path. Avoid inlining if this is going to happen above
1465 // a threshold.
1466 // FIXME: If the threshold is removed or lowered too much, we could end up
1467 // being too pessimistic and prevent inlining non-problematic code. This
1468 // could result in unintended perf regressions. A better overall strategy
1469 // is needed to track stack usage during inlining.
1470 Type *Ty = I.getAllocatedType();
1471 AllocatedSize = SaturatingMultiplyAdd(
1472 AllocSize->getLimitedValue(),
1473 DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1474 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1475 HasDynamicAlloca = true;
1476 return false;
1480 // Accumulate the allocated size.
1481 if (I.isStaticAlloca()) {
1482 Type *Ty = I.getAllocatedType();
1483 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1484 AllocatedSize);
1487 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1488 // a variety of reasons, and so we would like to not inline them into
1489 // functions which don't currently have a dynamic alloca. This simply
1490 // disables inlining altogether in the presence of a dynamic alloca.
1491 if (!I.isStaticAlloca())
1492 HasDynamicAlloca = true;
1494 return false;
1497 bool CallAnalyzer::visitPHI(PHINode &I) {
1498 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1499 // though we don't want to propagate its bonuses. The idea is to disable
1500 // SROA if it *might* be used in an inappropriate manner.
1502 // Phi nodes are always zero-cost.
1503 // FIXME: Pointer sizes may differ between different address spaces, so do we
1504 // need to use the correct address space in the call to getPointerSizeInBits here?
1505 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1506 // see the ZeroOffset is used as a dummy value, so we can probably use any
1507 // bit width for the ZeroOffset?
1508 APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1509 bool CheckSROA = I.getType()->isPointerTy();
1511 // Track the constant or pointer with constant offset we've seen so far.
1512 Constant *FirstC = nullptr;
1513 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1514 Value *FirstV = nullptr;
1516 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1517 BasicBlock *Pred = I.getIncomingBlock(i);
1518 // If the incoming block is dead, skip the incoming block.
1519 if (DeadBlocks.count(Pred))
1520 continue;
1521 // If the parent block of the phi is not the known successor of the incoming
1522 // block, skip the incoming block.
1523 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1524 if (KnownSuccessor && KnownSuccessor != I.getParent())
1525 continue;
1527 Value *V = I.getIncomingValue(i);
1528 // If the incoming value is this phi itself, skip the incoming value.
1529 if (&I == V)
1530 continue;
1532 Constant *C = dyn_cast<Constant>(V);
1533 if (!C)
1534 C = SimplifiedValues.lookup(V);
1536 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1537 if (!C && CheckSROA)
1538 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1540 if (!C && !BaseAndOffset.first)
1541 // The incoming value is neither a constant nor a pointer with constant
1542 // offset; exit early.
1543 return true;
1545 if (FirstC) {
1546 if (FirstC == C)
1547 // If we've seen a constant incoming value before and it is the same
1548 // constant we see this time, continue checking the next incoming value.
1549 continue;
1550 // Otherwise exit early, because we either see a different constant, or we
1551 // saw a constant before but have a pointer with a constant offset this time.
1552 return true;
1555 if (FirstV) {
1556 // The same logic as above, but check pointer with constant offset here.
1557 if (FirstBaseAndOffset == BaseAndOffset)
1558 continue;
1559 return true;
1562 if (C) {
1563 // This is the 1st time we've seen a constant, record it.
1564 FirstC = C;
1565 continue;
1568 // The remaining case is that this is the 1st time we've seen a pointer with
1569 // constant offset, record it.
1570 FirstV = V;
1571 FirstBaseAndOffset = BaseAndOffset;
1574 // Check if we can map phi to a constant.
1575 if (FirstC) {
1576 SimplifiedValues[&I] = FirstC;
1577 return true;
1580 // Check if we can map phi to a pointer with constant offset.
1581 if (FirstBaseAndOffset.first) {
1582 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1584 if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1585 SROAArgValues[&I] = SROAArg;
1588 return true;
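// A sketch with hypothetical IR: for
//
//   %v = phi i32 [ 7, %bb0 ], [ 7, %bb1 ]
//
// every live incoming value is the same constant, so %v maps to i32 7 in
// SimplifiedValues. Similarly, if all live incoming pointers share one
// (base, offset) pair, the phi inherits that pair in ConstantOffsetPtrs and
// the base's SROA candidacy is preserved.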
1591 /// Check we can fold GEPs of constant-offset call site argument pointers.
1592 /// This requires target data and inbounds GEPs.
1594 /// \return true if the specified GEP can be folded.
1595 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1596 // Check if we have a base + offset for the pointer.
1597 std::pair<Value *, APInt> BaseAndOffset =
1598 ConstantOffsetPtrs.lookup(I.getPointerOperand());
1599 if (!BaseAndOffset.first)
1600 return false;
1602 // Check if the offset of this GEP is constant, and if so accumulate it
1603 // into Offset.
1604 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1605 return false;
1607 // Add the result as a new mapping to Base + Offset.
1608 ConstantOffsetPtrs[&I] = BaseAndOffset;
1610 return true;
1613 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1614 auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1616 // Lambda to check whether a GEP's indices are all constant.
1617 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1618 for (const Use &Op : GEP.indices())
1619 if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
1620 return false;
1621 return true;
1624 if (!DisableGEPConstOperand)
1625 if (simplifyInstruction(I))
1626 return true;
1628 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1629 if (SROAArg)
1630 SROAArgValues[&I] = SROAArg;
1632 // Constant GEPs are modeled as free.
1633 return true;
1636 // Variable GEPs will require math and will disable SROA.
1637 if (SROAArg)
1638 disableSROAForArg(SROAArg);
1639 return isGEPFree(I);
1642 /// Simplify \p I if its operands are constants and update SimplifiedValues.
1643 bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1644 SmallVector<Constant *> COps;
1645 for (Value *Op : I.operands()) {
1646 Constant *COp = dyn_cast<Constant>(Op);
1647 if (!COp)
1648 COp = SimplifiedValues.lookup(Op);
1649 if (!COp)
1650 return false;
1651 COps.push_back(COp);
1653 auto *C = ConstantFoldInstOperands(&I, COps, DL);
1654 if (!C)
1655 return false;
1656 SimplifiedValues[&I] = C;
1657 return true;
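// For instance (hypothetical values): if %x was already simplified to i32 3,
// a subsequent
//
//   %y = add i32 %x, 4
//
// sees all-constant operands after remapping and folds to i32 7 via
// ConstantFoldInstOperands, so users of %y can fold in turn.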
1660 /// Try to simplify a call to llvm.is.constant.
1662 /// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1663 /// we expect calls of this specific intrinsic to be infrequent.
1665 /// FIXME: Given that we know CB's parent (F) caller
1666 /// (CandidateCall->getParent()->getParent()), we might be able to determine
1667 /// whether inlining F into F's caller would change how the call to
1668 /// llvm.is.constant would evaluate.
1669 bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1670 Value *Arg = CB.getArgOperand(0);
1671 auto *C = dyn_cast<Constant>(Arg);
1673 if (!C)
1674 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1676 Type *RT = CB.getFunctionType()->getReturnType();
1677 SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1678 return true;
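// A sketch (hypothetical IR): once an argument is known constant in this
// inline context,
//
//   %r = call i1 @llvm.is.constant.i32(i32 %arg)
//
// is recorded as true here, while an unknown %arg yields false, an
// approximation of how the intrinsic is expected to fold after inlining (see
// the FIXME above).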
1681 bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1682 // As per the langref, "The fourth argument to llvm.objectsize determines if
1683 // the value should be evaluated at runtime."
1684 if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1685 return false;
1687 Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1688 /*MustSucceed=*/true);
1689 Constant *C = dyn_cast_or_null<Constant>(V);
1690 if (C)
1691 SimplifiedValues[&CB] = C;
1692 return C;
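// A sketch (hypothetical IR): with the dynamic-evaluation flag (argument 3)
// being false,
//
//   %sz = call i64 @llvm.objectsize.i64.p0(ptr %a, i1 false, i1 true, i1 false)
//
// lowers to a constant i64 when %a's allocation size is known; when the flag
// is one, we bail out and leave the call to later passes.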
1695 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1696 // Propagate constants through bitcasts.
1697 if (simplifyInstruction(I))
1698 return true;
1700 // Track base/offsets through casts
1701 std::pair<Value *, APInt> BaseAndOffset =
1702 ConstantOffsetPtrs.lookup(I.getOperand(0));
1703 // Casts don't change the offset, just wrap it up.
1704 if (BaseAndOffset.first)
1705 ConstantOffsetPtrs[&I] = BaseAndOffset;
1707 // Also look for SROA candidates here.
1708 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1709 SROAArgValues[&I] = SROAArg;
1711 // Bitcasts are always zero cost.
1712 return true;
1715 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1716 // Propagate constants through ptrtoint.
1717 if (simplifyInstruction(I))
1718 return true;
1720 // Track base/offset pairs when converted to a plain integer, provided the
1721 // integer is large enough to represent the pointer.
1722 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1723 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1724 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1725 std::pair<Value *, APInt> BaseAndOffset =
1726 ConstantOffsetPtrs.lookup(I.getOperand(0));
1727 if (BaseAndOffset.first)
1728 ConstantOffsetPtrs[&I] = BaseAndOffset;
1731 // This is really weird. Technically, ptrtoint will disable SROA. However,
1732 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1733 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1734 // would block SROA would also block SROA if applied directly to a pointer,
1735 // and so we can just add the integer in here. The only places where SROA is
1736 // preserved either cannot fire on an integer, or won't in-and-of themselves
1737 // disable SROA (ext) w/o some later use that we would see and disable.
1738 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1739 SROAArgValues[&I] = SROAArg;
1741 return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1742 TargetTransformInfo::TCC_Free;
1745 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1746 // Propagate constants through inttoptr.
1747 if (simplifyInstruction(I))
1748 return true;
1750 // Track base/offset pairs when round-tripped through a pointer without
1751 // modifications, provided the integer is not too large.
1752 Value *Op = I.getOperand(0);
1753 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1754 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1755 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1756 if (BaseAndOffset.first)
1757 ConstantOffsetPtrs[&I] = BaseAndOffset;
1760 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1761 if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1762 SROAArgValues[&I] = SROAArg;
1764 return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1765 TargetTransformInfo::TCC_Free;
1768 bool CallAnalyzer::visitCastInst(CastInst &I) {
1769 // Propagate constants through casts.
1770 if (simplifyInstruction(I))
1771 return true;
1773 // Disable SROA in the face of arbitrary casts we don't explicitly list
1774 // elsewhere.
1775 disableSROA(I.getOperand(0));
1777 // If this is a floating-point cast, and the target says this operation
1778 // is expensive, this may eventually become a library call. Treat the cost
1779 // as such.
1780 switch (I.getOpcode()) {
1781 case Instruction::FPTrunc:
1782 case Instruction::FPExt:
1783 case Instruction::UIToFP:
1784 case Instruction::SIToFP:
1785 case Instruction::FPToUI:
1786 case Instruction::FPToSI:
1787 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1788 onCallPenalty();
1789 break;
1790 default:
1791 break;
1794 return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1795 TargetTransformInfo::TCC_Free;
1798 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1799 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1802 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1803 // Does the *call site* have the NonNull attribute set on an argument? We
1804 // use the attribute on the call site to memoize any analysis done in the
1805 // caller. This will also trip if the callee function has a non-null
1806 // parameter attribute, but that's a less interesting case because hopefully
1807 // the callee would already have been simplified based on that.
1808 if (Argument *A = dyn_cast<Argument>(V))
1809 if (paramHasAttr(A, Attribute::NonNull))
1810 return true;
1812 // Is this an alloca in the caller? This is distinct from the attribute case
1813 // above because attributes aren't updated within the inliner itself and we
1814 // always want to catch the alloca derived case.
1815 if (isAllocaDerivedArg(V))
1816 // We can actually predict the result of comparisons between an
1817 // alloca-derived value and null. Note that this fires regardless of
1818 // SROA firing.
1819 return true;
1821 return false;
1824 bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1825 // If the normal destination of the invoke or the parent block of the call
1826 // site is unreachable-terminated, there is little point in inlining this
1827 // unless there is literally zero cost.
1828 // FIXME: Note that it is possible that an unreachable-terminated block has a
1829 // hot entry. For example, in the scenario below, inlining hot_call_X() may
1830 // be beneficial:
1831 // main() {
1832 // hot_call_1();
1833 // ...
1834 // hot_call_N();
1835 // exit(0);
1836 // }
1837 // For now, we are not handling this corner case here as it is rare in real
1838 // code. In future, we should elaborate this based on BPI and BFI in more
1839 // general threshold adjusting heuristics in updateThreshold().
1840 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1841 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1842 return false;
1843 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1844 return false;
1846 return true;
1849 bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1850 BlockFrequencyInfo *CallerBFI) {
1851 // If global profile summary is available, then callsite's coldness is
1852 // determined based on that.
1853 if (PSI && PSI->hasProfileSummary())
1854 return PSI->isColdCallSite(Call, CallerBFI);
1856 // Otherwise we need BFI to be available.
1857 if (!CallerBFI)
1858 return false;
1860 // Determine if the callsite is cold relative to caller's entry. We could
1861 // potentially cache the computation of scaled entry frequency, but the added
1862 // complexity is not worth it unless this scaling shows up high in the
1863 // profiles.
1864 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1865 auto CallSiteBB = Call.getParent();
1866 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1867 auto CallerEntryFreq =
1868 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1869 return CallSiteFreq < CallerEntryFreq * ColdProb;
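// Worked example, assuming a relative-frequency knob of 2 (percent): with a
// caller entry frequency of 100, the cold bound is 100 * 2% == 2, so a
// callsite block frequency of 1 is classified cold while a frequency of 5 is
// not.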
1872 std::optional<int>
1873 InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1874 BlockFrequencyInfo *CallerBFI) {
1876 // If global profile summary is available, then callsite's hotness is
1877 // determined based on that.
1878 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1879 return Params.HotCallSiteThreshold;
1881 // Otherwise we need BFI to be available and to have a locally hot callsite
1882 // threshold.
1883 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1884 return std::nullopt;
1886 // Determine if the callsite is hot relative to caller's entry. We could
1887 // potentially cache the computation of scaled entry frequency, but the added
1888 // complexity is not worth it unless this scaling shows up high in the
1889 // profiles.
1890 const BasicBlock *CallSiteBB = Call.getParent();
1891 BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1892 BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
1893 std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
1894 if (Limit && CallSiteFreq >= *Limit)
1895 return Params.LocallyHotCallSiteThreshold;
1897 // Otherwise treat it normally.
1898 return std::nullopt;
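// Worked example, assuming HotCallSiteRelFreq is 60: with a caller entry
// frequency of 10, the limit is 600, so a callsite block frequency of 600 or
// above earns the locally hot threshold, and anything below falls through to
// std::nullopt.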
1901 void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1902 // If no size growth is allowed for this inlining, set Threshold to 0.
1903 if (!allowSizeGrowth(Call)) {
1904 Threshold = 0;
1905 return;
1908 Function *Caller = Call.getCaller();
1910 // return min(A, B) if B is valid.
1911 auto MinIfValid = [](int A, std::optional<int> B) {
1912 return B ? std::min(A, *B) : A;
1915 // return max(A, B) if B is valid.
1916 auto MaxIfValid = [](int A, std::optional<int> B) {
1917 return B ? std::max(A, *B) : A;
1920 // Various bonus percentages. These are multiplied by Threshold to get the
1921 // bonus values.
1922 // SingleBBBonus: This bonus is applied if the callee has a single reachable
1923 // basic block at the given callsite context. This is speculatively applied
1924 // and withdrawn if more than one basic block is seen.
1926 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1927 // of the last call to a static function as inlining such functions is
1928 // guaranteed to reduce code size.
1930 // These bonus percentages may be set to 0 based on properties of the caller
1931 // and the callsite.
1932 int SingleBBBonusPercent = 50;
1933 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1934 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1936 // Lambda to set all the above bonus and bonus percentages to 0.
1937 auto DisallowAllBonuses = [&]() {
1938 SingleBBBonusPercent = 0;
1939 VectorBonusPercent = 0;
1940 LastCallToStaticBonus = 0;
1943 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1944 // and reduce the threshold if the caller has the necessary attribute.
1945 if (Caller->hasMinSize()) {
1946 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1947 // For minsize, we want to disable the single BB bonus and the vector
1948 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1949 // a static function will, at the minimum, eliminate the parameter setup and
1950 // call/return instructions.
1951 SingleBBBonusPercent = 0;
1952 VectorBonusPercent = 0;
1953 } else if (Caller->hasOptSize())
1954 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1956 // Adjust the threshold based on inlinehint attribute and profile based
1957 // hotness information if the caller does not have MinSize attribute.
1958 if (!Caller->hasMinSize()) {
1959 if (Callee.hasFnAttribute(Attribute::InlineHint))
1960 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1962 // FIXME: After switching to the new passmanager, simplify the logic below
1963 // by checking only the callsite hotness/coldness as we will reliably
1964 // have local profile information.
1966 // Callsite hotness and coldness can be determined if sample profile is
1967 // used (which adds hotness metadata to calls) or if caller's
1968 // BlockFrequencyInfo is available.
1969 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1970 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1971 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1972 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1973 // FIXME: This should update the threshold only if it exceeds the
1974 // current threshold, but AutoFDO + ThinLTO currently relies on this
1975 // behavior to prevent inlining of hot callsites during ThinLTO
1976 // compile phase.
1977 Threshold = *HotCallSiteThreshold;
1978 } else if (isColdCallSite(Call, CallerBFI)) {
1979 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1980 // Do not apply bonuses for a cold callsite including the
1981 // LastCallToStatic bonus. While this bonus might result in code size
1982 // reduction, it can cause the size of a non-cold caller to increase
1983 // preventing it from being inlined.
1984 DisallowAllBonuses();
1985 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1986 } else if (PSI) {
1987 // Use callee's global profile information only if we have no way of
1988 // determining this via callsite information.
1989 if (PSI->isFunctionEntryHot(&Callee)) {
1990 LLVM_DEBUG(dbgs() << "Hot callee.\n");
1991 // If callsite hotness can not be determined, we may still know
1992 // that the callee is hot and treat it as a weaker hint for threshold
1993 // increase.
1994 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1995 } else if (PSI->isFunctionEntryCold(&Callee)) {
1996 LLVM_DEBUG(dbgs() << "Cold callee.\n");
1997 // Do not apply bonuses for a cold callee including the
1998 // LastCallToStatic bonus. While this bonus might result in code size
1999 // reduction, it can cause the size of a non-cold caller to increase
2000 // preventing it from being inlined.
2001 DisallowAllBonuses();
2002 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2007 Threshold += TTI.adjustInliningThreshold(&Call);
2009 // Finally, take the target-specific inlining threshold multiplier into
2010 // account.
2011 Threshold *= TTI.getInliningThresholdMultiplier();
2013 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2014 VectorBonus = Threshold * VectorBonusPercent / 100;
2016 // If there is only one call of the function, and it has internal linkage,
2017 // the cost of inlining it drops dramatically. It may seem odd to update
2018 // Cost in updateThreshold, but the bonus depends on the logic in this method.
2019 if (isSoleCallToLocalFunction(Call, F)) {
2020 Cost -= LastCallToStaticBonus;
2021 StaticBonusApplied = LastCallToStaticBonus;
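// Worked example with hypothetical numbers: starting from a base threshold
// of 225, a callee with inlinehint raises Threshold to 325. With
// SingleBBBonusPercent == 50 and a vector bonus percent of 150, the bonuses
// become SingleBBBonus == 325 * 50 / 100 == 162 and VectorBonus ==
// 325 * 150 / 100 == 487 (integer division).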
2025 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2026 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2027 // First try to handle simplified comparisons.
2028 if (simplifyInstruction(I))
2029 return true;
2031 if (I.getOpcode() == Instruction::FCmp)
2032 return false;
2034 // Otherwise look for a comparison between constant offset pointers with
2035 // a common base.
2036 Value *LHSBase, *RHSBase;
2037 APInt LHSOffset, RHSOffset;
2038 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2039 if (LHSBase) {
2040 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2041 if (RHSBase && LHSBase == RHSBase) {
2042 // We have common bases, fold the icmp to a constant based on the
2043 // offsets.
2044 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2045 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2046 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
2047 SimplifiedValues[&I] = C;
2048 ++NumConstantPtrCmps;
2049 return true;
2054 auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2055 for (auto *User : I.users())
2056 if (auto *Instr = dyn_cast<Instruction>(User))
2057 if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2058 return false;
2059 return true;
2062 // If the comparison is an equality comparison with null, we can simplify it
2063 // if we know the value (argument) can't be null
2064 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2065 if (isKnownNonNullInCallee(I.getOperand(0))) {
2066 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2067 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2068 : ConstantInt::getFalse(I.getType());
2069 return true;
2071 // Implicit null checks act as unconditional branches and their comparisons
2072 // should be treated as simplified and free of cost.
2073 if (isImplicitNullCheckCmp(I))
2074 return true;
2076 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
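// A sketch in hypothetical IR: if %p and %q are inbounds GEPs off the same
// %a = alloca [8 x i32] at byte offsets 4 and 8, both carry the common base
// %a in ConstantOffsetPtrs, so
//
//   %c = icmp eq ptr %p, %q
//
// folds to false purely from the constant offsets 4 != 8.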
2079 bool CallAnalyzer::visitSub(BinaryOperator &I) {
2080 // Try to handle a special case: we can fold computing the difference of two
2081 // constant-related pointers.
2082 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2083 Value *LHSBase, *RHSBase;
2084 APInt LHSOffset, RHSOffset;
2085 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2086 if (LHSBase) {
2087 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2088 if (RHSBase && LHSBase == RHSBase) {
2089 // We have common bases, fold the subtract to a constant based on the
2090 // offsets.
2091 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2092 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2093 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2094 SimplifiedValues[&I] = C;
2095 ++NumConstantPtrDiffs;
2096 return true;
2101 // Otherwise, fall back to the generic logic for simplifying and handling
2102 // instructions.
2103 return Base::visitSub(I);
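// E.g. (hypothetical IR): if %pi and %qi were recorded via ptrtoint with a
// common base and offsets 24 and 8,
//
//   %d = sub i64 %pi, %qi
//
// folds to i64 16 from the offsets alone, without knowing the base address.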
2106 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2107 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2108 Constant *CLHS = dyn_cast<Constant>(LHS);
2109 if (!CLHS)
2110 CLHS = SimplifiedValues.lookup(LHS);
2111 Constant *CRHS = dyn_cast<Constant>(RHS);
2112 if (!CRHS)
2113 CRHS = SimplifiedValues.lookup(RHS);
2115 Value *SimpleV = nullptr;
2116 if (auto FI = dyn_cast<FPMathOperator>(&I))
2117 SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2118 FI->getFastMathFlags(), DL);
2119 else
2120 SimpleV =
2121 simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2123 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2124 SimplifiedValues[&I] = C;
2126 if (SimpleV)
2127 return true;
2129 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2130 disableSROA(LHS);
2131 disableSROA(RHS);
2133 // If the instruction is floating point, and the target says this operation
2134 // is expensive, this may eventually become a library call. Treat the cost
2135 // as such, unless it's fneg, which can be implemented with an xor.
2136 using namespace llvm::PatternMatch;
2137 if (I.getType()->isFloatingPointTy() &&
2138 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2139 !match(&I, m_FNeg(m_Value())))
2140 onCallPenalty();
2142 return false;
2145 bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2146 Value *Op = I.getOperand(0);
2147 Constant *COp = dyn_cast<Constant>(Op);
2148 if (!COp)
2149 COp = SimplifiedValues.lookup(Op);
2151 Value *SimpleV = simplifyFNegInst(
2152 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2154 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2155 SimplifiedValues[&I] = C;
2157 if (SimpleV)
2158 return true;
2160 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2161 disableSROA(Op);
2163 return false;
2166 bool CallAnalyzer::visitLoad(LoadInst &I) {
2167 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2168 return true;
2170 // If the data is already loaded from this address and hasn't been clobbered
2171 // by any stores or calls, this load is likely to be redundant and can be
2172 // eliminated.
2173 if (EnableLoadElimination &&
2174 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2175 onLoadEliminationOpportunity();
2176 return true;
2179 onMemAccess();
2180 return false;
2183 bool CallAnalyzer::visitStore(StoreInst &I) {
2184 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2185 return true;
2187 // The store can potentially clobber loads and prevent repeated loads from
2188 // being eliminated.
2189 // FIXME:
2190 // 1. We can probably keep an initial set of eliminable loads subtracted
2191 // from the cost even when we finally see a store. We just need to disable
2192 // *further* accumulation of elimination savings.
2193 // 2. We should probably at some point thread MemorySSA for the callee into
2194 // this and then use that to actually compute *really* precise savings.
2195 disableLoadElimination();
2197 onMemAccess();
2198 return false;
2201 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2202 // Constant folding for extract value is trivial.
2203 if (simplifyInstruction(I))
2204 return true;
2206 // SROA can't look through these, but they may be free.
2207 return Base::visitExtractValue(I);
2210 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2211 // Constant folding for insert value is trivial.
2212 if (simplifyInstruction(I))
2213 return true;
2215 // SROA can't look through these, but they may be free.
2216 return Base::visitInsertValue(I);
2219 /// Try to simplify a call site.
2221 /// Takes a concrete function and callsite and tries to actually simplify it by
2222 /// analyzing the arguments and call itself with instsimplify. Returns true if
2223 /// it has simplified the callsite to some other entity (a constant), making it
2224 /// free.
2225 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2226 // FIXME: Using the instsimplify logic directly for this is inefficient
2227 // because we have to continually rebuild the argument list even when no
2228 // simplifications can be performed. Until that is fixed with remapping
2229 // inside of instsimplify, directly constant fold calls here.
2230 if (!canConstantFoldCallTo(&Call, F))
2231 return false;
2233 // Try to re-map the arguments to constants.
2234 SmallVector<Constant *, 4> ConstantArgs;
2235 ConstantArgs.reserve(Call.arg_size());
2236 for (Value *I : Call.args()) {
2237 Constant *C = dyn_cast<Constant>(I);
2238 if (!C)
2239 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2240 if (!C)
2241 return false; // This argument doesn't map to a constant.
2243 ConstantArgs.push_back(C);
2245 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2246 SimplifiedValues[&Call] = C;
2247 return true;
2250 return false;
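// For instance (a sketch): once both operands map to constants, a foldable
// call such as
//
//   %m = call i32 @llvm.umax.i32(i32 3, i32 9)
//
// becomes i32 9 in SimplifiedValues and is treated as free; if any argument
// fails to map to a constant, we give up without folding.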
2253 bool CallAnalyzer::visitCallBase(CallBase &Call) {
2254 if (!onCallBaseVisitStart(Call))
2255 return true;
2257 if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2258 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2259 // This aborts the entire analysis.
2260 ExposesReturnsTwice = true;
2261 return false;
2263 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2264 ContainsNoDuplicateCall = true;
2266 Function *F = Call.getCalledFunction();
2267 bool IsIndirectCall = !F;
2268 if (IsIndirectCall) {
2269 // Check if this happens to be an indirect function call to a known function
2270 // in this inline context. If not, we've done all we can.
2271 Value *Callee = Call.getCalledOperand();
2272 F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2273 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2274 onCallArgumentSetup(Call);
2276 if (!Call.onlyReadsMemory())
2277 disableLoadElimination();
2278 return Base::visitCallBase(Call);
2282 assert(F && "Expected a call to a known function");
2284 // When we have a concrete function, first try to simplify it directly.
2285 if (simplifyCallSite(F, Call))
2286 return true;
2288 // Next check if it is an intrinsic we know about.
2289 // FIXME: Lift this into part of the InstVisitor.
2290 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2291 switch (II->getIntrinsicID()) {
2292 default:
2293 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2294 disableLoadElimination();
2295 return Base::visitCallBase(Call);
2297 case Intrinsic::load_relative:
2298 onLoadRelativeIntrinsic();
2299 return false;
2301 case Intrinsic::memset:
2302 case Intrinsic::memcpy:
2303 case Intrinsic::memmove:
2304 disableLoadElimination();
2305 // SROA can usually chew through these intrinsics, but they aren't free.
2306 return false;
2307 case Intrinsic::icall_branch_funnel:
2308 case Intrinsic::localescape:
2309 HasUninlineableIntrinsic = true;
2310 return false;
2311 case Intrinsic::vastart:
2312 InitsVargArgs = true;
2313 return false;
2314 case Intrinsic::launder_invariant_group:
2315 case Intrinsic::strip_invariant_group:
2316 if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2317 SROAArgValues[II] = SROAArg;
2318 return true;
2319 case Intrinsic::is_constant:
2320 return simplifyIntrinsicCallIsConstant(Call);
2321 case Intrinsic::objectsize:
2322 return simplifyIntrinsicCallObjectSize(Call);
2326 if (F == Call.getFunction()) {
2327 // This flag will fully abort the analysis, so don't bother with anything
2328 // else.
2329 IsRecursiveCall = true;
2330 if (!AllowRecursiveCall)
2331 return false;
2334 if (TTI.isLoweredToCall(F)) {
2335 onLoweredCall(F, Call, IsIndirectCall);
2338 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2339 disableLoadElimination();
2340 return Base::visitCallBase(Call);
2343 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2344 // At least one return instruction will be free after inlining.
2345 bool Free = !HasReturn;
2346 HasReturn = true;
2347 return Free;
2350 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2351 // We model unconditional branches as essentially free -- they really
2352 // shouldn't exist at all, but handling them makes the behavior of the
2353 // inliner more regular and predictable. Interestingly, conditional branches
2354 // which will fold away are also free.
2355 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2356 BI.getMetadata(LLVMContext::MD_make_implicit) ||
2357 isa_and_nonnull<ConstantInt>(
2358 SimplifiedValues.lookup(BI.getCondition()));
2361 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2362 bool CheckSROA = SI.getType()->isPointerTy();
2363 Value *TrueVal = SI.getTrueValue();
2364 Value *FalseVal = SI.getFalseValue();
2366 Constant *TrueC = dyn_cast<Constant>(TrueVal);
2367 if (!TrueC)
2368 TrueC = SimplifiedValues.lookup(TrueVal);
2369 Constant *FalseC = dyn_cast<Constant>(FalseVal);
2370 if (!FalseC)
2371 FalseC = SimplifiedValues.lookup(FalseVal);
2372 Constant *CondC =
2373 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2375 if (!CondC) {
2376 // Select C, X, X => X
2377 if (TrueC == FalseC && TrueC) {
2378 SimplifiedValues[&SI] = TrueC;
2379 return true;
2382 if (!CheckSROA)
2383 return Base::visitSelectInst(SI);
2385 std::pair<Value *, APInt> TrueBaseAndOffset =
2386 ConstantOffsetPtrs.lookup(TrueVal);
2387 std::pair<Value *, APInt> FalseBaseAndOffset =
2388 ConstantOffsetPtrs.lookup(FalseVal);
2389 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2390 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2392 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2393 SROAArgValues[&SI] = SROAArg;
2394 return true;
2397 return Base::visitSelectInst(SI);
2400 // Select condition is a constant.
2401 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2402 : (CondC->isNullValue()) ? FalseVal
2403 : nullptr;
2404 if (!SelectedV) {
2405 // Condition is a vector constant that is not all 1s or all 0s. If all
2406 // operands are constants, ConstantFoldSelectInstruction() can handle
2407 // cases such as vector selects.
2408 if (TrueC && FalseC) {
2409 if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2410 SimplifiedValues[&SI] = C;
2411 return true;
2414 return Base::visitSelectInst(SI);
2417 // Condition is either all 1s or all 0s. SI can be simplified.
2418 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2419 SimplifiedValues[&SI] = SelectedC;
2420 return true;
2423 if (!CheckSROA)
2424 return true;
2426 std::pair<Value *, APInt> BaseAndOffset =
2427 ConstantOffsetPtrs.lookup(SelectedV);
2428 if (BaseAndOffset.first) {
2429 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2431 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2432 SROAArgValues[&SI] = SROAArg;
2435 return true;
2438 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2439 // We model unconditional switches as free, see the comments on handling
2440 // branches.
2441 if (isa<ConstantInt>(SI.getCondition()))
2442 return true;
2443 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2444 if (isa<ConstantInt>(V))
2445 return true;
2447 // Assume the most general case where the switch is lowered into
2448 // either a jump table, bit test, or a balanced binary tree consisting of
2449 // case clusters without merging adjacent clusters with the same
2450 // destination. We do not consider the switches that are lowered with a mix
2451 // of jump table/bit test/binary search tree. The cost of the switch is
2452 // proportional to the size of the tree or the size of jump table range.
2454 // NB: We convert large switches which are just used to initialize large phi
2455 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2456 // inlining those. It will prevent inlining in cases where the optimization
2457 // does not (yet) fire.
2459 unsigned JumpTableSize = 0;
2460 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2461 unsigned NumCaseCluster =
2462 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2464 onFinalizeSwitch(JumpTableSize, NumCaseCluster);
2465 return false;
2468 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2469 // We never want to inline functions that contain an indirectbr. This is
2470 // incorrect because all the blockaddresses (in static global initializers
2471 // for example) would be referring to the original function, and this
2472 // indirect jump would jump from the inlined copy of the function into the
2473 // original function which is extremely undefined behavior.
2474 // FIXME: This logic isn't really right; we can safely inline functions with
2475 // indirectbr's as long as no other function or global references the
2476 // blockaddress of a block within the current function.
2477 HasIndirectBr = true;
2478 return false;
2481 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2482 // FIXME: It's not clear that a single instruction is an accurate model for
2483 // the inline cost of a resume instruction.
2484 return false;
2487 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2488 // FIXME: It's not clear that a single instruction is an accurate model for
2489 // the inline cost of a cleanupret instruction.
2490 return false;
2493 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2494 // FIXME: It's not clear that a single instruction is an accurate model for
2495 // the inline cost of a catchret instruction.
2496 return false;
2499 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2500 // FIXME: It might be reasonable to discount the cost of instructions leading
2501 // to unreachable as they have the lowest possible impact on both runtime and
2502 // code size.
2503 return true; // No actual code is needed for unreachable.
2506 bool CallAnalyzer::visitInstruction(Instruction &I) {
2507 // Some instructions are free. All of the free intrinsics can also be
2508 // handled by SROA, etc.
2509 if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
2510 TargetTransformInfo::TCC_Free)
2511 return true;
2513 // We found something we don't understand or can't handle. Mark any SROA-able
2514 // values in the operand list as no longer viable.
2515 for (const Use &Op : I.operands())
2516 disableSROA(Op);
2518 return false;
2521 /// Analyze a basic block for its contribution to the inline cost.
2523 /// This method walks the analyzer over every instruction in the given basic
2524 /// block and accounts for their cost during inlining at this callsite. It
2525 /// aborts early if the threshold has been exceeded or an impossible-to-inline
2526 /// construct has been detected. It returns false if inlining is no longer
2527 /// viable, and true if inlining remains viable.
2528 InlineResult
2529 CallAnalyzer::analyzeBlock(BasicBlock *BB,
2530 SmallPtrSetImpl<const Value *> &EphValues) {
2531 for (Instruction &I : *BB) {
2532 // FIXME: Currently, the number of instructions in a function regardless of
2533 // our ability to simplify them during inline to constants or dead code,
2534 // are actually used by the vector bonus heuristic. As long as that's true,
2535 // we have to special case debug intrinsics here to prevent differences in
2536 // inlining due to debug symbols. Eventually, the number of unsimplified
2537 // instructions shouldn't factor into the cost computation, but until then,
2538 // hack around it here.
2539 // Similarly, skip pseudo-probes.
2540 if (I.isDebugOrPseudoInst())
2541 continue;
2543 // Skip ephemeral values.
2544 if (EphValues.count(&I))
2545 continue;
2547 ++NumInstructions;
2548 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2549 ++NumVectorInstructions;
2551 // If the instruction simplified to a constant, there is no cost to this
2552 // instruction. Visit the instructions using our InstVisitor to account for
2553 // all of the per-instruction logic. The visit tree returns true if we
2554 // consumed the instruction in any way, and false if the instruction's base
2555 // cost should count against inlining.
2556 onInstructionAnalysisStart(&I);
2558 if (Base::visit(&I))
2559 ++NumInstructionsSimplified;
2560 else
2561 onMissedSimplification();
2563 onInstructionAnalysisFinish(&I);
2564 using namespace ore;
2565 // If the visit this instruction detected an uninlinable pattern, abort.
2566 InlineResult IR = InlineResult::success();
2567 if (IsRecursiveCall && !AllowRecursiveCall)
2568 IR = InlineResult::failure("recursive");
2569 else if (ExposesReturnsTwice)
2570 IR = InlineResult::failure("exposes returns twice");
2571 else if (HasDynamicAlloca)
2572 IR = InlineResult::failure("dynamic alloca");
2573 else if (HasIndirectBr)
2574 IR = InlineResult::failure("indirect branch");
2575 else if (HasUninlineableIntrinsic)
2576 IR = InlineResult::failure("uninlinable intrinsic");
2577 else if (InitsVargArgs)
2578 IR = InlineResult::failure("varargs");
2579 if (!IR.isSuccess()) {
2580 if (ORE)
2581 ORE->emit([&]() {
2582 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2583 &CandidateCall)
2584 << NV("Callee", &F) << " has uninlinable pattern ("
2585 << NV("InlineResult", IR.getFailureReason())
2586 << ") and cost is not fully computed";
2588 return IR;
2591 // If the caller is a recursive function then we don't want to inline
2592 // functions which allocate a lot of stack space because it would increase
2593 // the caller stack usage dramatically.
2594 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2595 auto IR =
2596 InlineResult::failure("recursive and allocates too much stack space");
2597 if (ORE)
2598 ORE->emit([&]() {
2599 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2600 &CandidateCall)
2601 << NV("Callee", &F) << " is "
2602 << NV("InlineResult", IR.getFailureReason())
2603 << ". Cost is not fully computed";
2605 return IR;
2608 if (shouldStop())
2609 return InlineResult::failure(
2610 "Call site analysis is not favorable to inlining.");
2613 return InlineResult::success();
2616 /// Compute the base pointer and cumulative constant offsets for V.
2618 /// This strips all constant offsets off of V, leaving it the base pointer, and
2619 /// accumulates the total constant offset applied in the returned constant. It
2620 /// returns null if V is not a pointer, and returns the constant '0' if there are
2621 /// no constant offsets applied.
2622 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2623 if (!V->getType()->isPointerTy())
2624 return nullptr;
2626 unsigned AS = V->getType()->getPointerAddressSpace();
2627 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2628 APInt Offset = APInt::getZero(IntPtrWidth);
2630 // Even though we don't look through PHI nodes, we could be called on an
2631 // instruction in an unreachable block, which may be on a cycle.
2632 SmallPtrSet<Value *, 4> Visited;
2633 Visited.insert(V);
2634 do {
2635 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2636 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2637 return nullptr;
2638 V = GEP->getPointerOperand();
2639 } else if (Operator::getOpcode(V) == Instruction::BitCast) {
2640 V = cast<Operator>(V)->getOperand(0);
2641 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2642 if (GA->isInterposable())
2643 break;
2644 V = GA->getAliasee();
2645 } else {
2646 break;
2648 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2649 } while (Visited.insert(V).second);
2651 Type *IdxPtrTy = DL.getIndexType(V->getType());
2652 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2655 /// Find dead blocks due to deleted CFG edges during inlining.
2657 /// If we know the successor of the current block, \p CurrBB, has to be \p
2658 /// NextBB, the other successors of \p CurrBB are dead if these successors have
2659 /// no live incoming CFG edges. If one block is found to be dead, we can
2660 /// continue growing the dead block list by checking the successors of the dead
2661 /// blocks to see if all their incoming edges are dead or not.
2662 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2663 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2664 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2665 // known successor which is not the one under exam.
2666 return (DeadBlocks.count(Pred) ||
2667 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2670 auto IsNewlyDead = [&](BasicBlock *BB) {
2671 // If all the edges to a block are dead, the block is also dead.
2672 return (!DeadBlocks.count(BB) &&
2673 llvm::all_of(predecessors(BB),
2674 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2677 for (BasicBlock *Succ : successors(CurrBB)) {
2678 if (Succ == NextBB || !IsNewlyDead(Succ))
2679 continue;
2680 SmallVector<BasicBlock *, 4> NewDead;
2681 NewDead.push_back(Succ);
2682 while (!NewDead.empty()) {
2683 BasicBlock *Dead = NewDead.pop_back_val();
2684 if (DeadBlocks.insert(Dead).second)
2685 // Continue growing the dead block lists.
2686 for (BasicBlock *S : successors(Dead))
2687 if (IsNewlyDead(S))
2688 NewDead.push_back(S);
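// A sketch: suppose CurrBB conditionally branches to NextBB or S, and the
// condition is known, so KnownSuccessors[CurrBB] == NextBB. If S has no
// other live predecessors it becomes dead, and if S was in turn the sole
// live predecessor of T, the worklist grows to mark T dead as well:
//
//   CurrBB -> NextBB      (live edge)
//         \-> S -> T      (both newly dead)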
2693 /// Analyze a call site for potential inlining.
2695 /// Returns true if inlining this call is viable, and false if it is not
2696 /// viable. It computes the cost and adjusts the threshold based on numerous
2697 /// factors and heuristics. If this method returns false but the computed cost
2698 /// is below the computed threshold, then inlining was forcibly disabled by
2699 /// some artifact of the routine.
2700 InlineResult CallAnalyzer::analyze() {
2701 ++NumCallsAnalyzed;
2703 auto Result = onAnalysisStart();
2704 if (!Result.isSuccess())
2705 return Result;
2707 if (F.empty())
2708 return InlineResult::success();
2710 Function *Caller = CandidateCall.getFunction();
2711 // Check if the caller function is recursive itself.
2712 for (User *U : Caller->users()) {
2713 CallBase *Call = dyn_cast<CallBase>(U);
2714 if (Call && Call->getFunction() == Caller) {
2715 IsCallerRecursive = true;
2716 break;
2720 // Populate our simplified values by mapping from function arguments to call
2721 // arguments with known important simplifications.
2722 auto CAI = CandidateCall.arg_begin();
2723 for (Argument &FAI : F.args()) {
2724 assert(CAI != CandidateCall.arg_end());
2725 if (Constant *C = dyn_cast<Constant>(CAI))
2726 SimplifiedValues[&FAI] = C;
2728 Value *PtrArg = *CAI;
2729 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2730 ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2732 // We can SROA any pointer arguments derived from alloca instructions.
2733 if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2734 SROAArgValues[&FAI] = SROAArg;
2735 onInitializeSROAArg(SROAArg);
2736 EnabledSROAAllocas.insert(SROAArg);
2739 ++CAI;
2741 NumConstantArgs = SimplifiedValues.size();
2742 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2743 NumAllocaArgs = SROAArgValues.size();
2745 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2746 // the ephemeral values multiple times (and they're completely determined by
2747 // the callee, so this is purely duplicate work).
2748 SmallPtrSet<const Value *, 32> EphValues;
2749 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2751 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2752 // adding basic blocks of the callee which can be proven to be dead for this
2753 // particular call site in order to get more accurate cost estimates. This
2754 // requires a somewhat heavyweight iteration pattern: we need to walk the
2755 // basic blocks in a breadth-first order as we insert live successors. To
2756 // accomplish this while favoring small iterations (we exit early once we
2757 // cross our threshold), we use a small-size optimized SetVector.
2758 typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2759 BBSetVector BBWorklist;
2760 BBWorklist.insert(&F.getEntryBlock());
2762 // Note that we *must not* cache the size: this loop grows the worklist.
2763 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2764 if (shouldStop())
2765 break;
2767 BasicBlock *BB = BBWorklist[Idx];
2768 if (BB->empty())
2769 continue;
2771 onBlockStart(BB);
2773 // Disallow inlining a blockaddress with uses other than strictly callbr.
2774 // A blockaddress only has defined behavior for an indirect branch in the
2775 // same function, and we do not currently support inlining indirect
2776 // branches. But, the inliner may not see an indirect branch that ends up
2777 // being dead code at a particular call site. If the blockaddress escapes
2778 // the function, e.g., via a global variable, inlining may lead to an
2779 // invalid cross-function reference.
2780 // FIXME: pr/39560: continue relaxing this overt restriction.
2781 if (BB->hasAddressTaken())
2782 for (User *U : BlockAddress::get(&*BB)->users())
2783 if (!isa<CallBrInst>(*U))
2784 return InlineResult::failure("blockaddress used outside of callbr");
2786 // Analyze the cost of this block. If we blow through the threshold, this
2787 // returns false, and we can bail out.
2788 InlineResult IR = analyzeBlock(BB, EphValues);
2789 if (!IR.isSuccess())
2790 return IR;
2792 Instruction *TI = BB->getTerminator();
2794 // Add in the live successors by first checking whether we have terminator
2795 // that may be simplified based on the values simplified by this call.
2796 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2797 if (BI->isConditional()) {
2798 Value *Cond = BI->getCondition();
2799 if (ConstantInt *SimpleCond =
2800 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2801 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2802 BBWorklist.insert(NextBB);
2803 KnownSuccessors[BB] = NextBB;
2804 findDeadBlocks(BB, NextBB);
2805 continue;
2808 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2809 Value *Cond = SI->getCondition();
2810 if (ConstantInt *SimpleCond =
2811 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2812 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2813 BBWorklist.insert(NextBB);
2814 KnownSuccessors[BB] = NextBB;
2815 findDeadBlocks(BB, NextBB);
2816 continue;
2820 // If we're unable to select a particular successor, just count all of
2821 // them.
2822 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
2823 ++TIdx)
2824 BBWorklist.insert(TI->getSuccessor(TIdx));
2826 onBlockAnalyzed(BB);
2829 // If this is a noduplicate call, we can still inline as long as
2830 // inlining this would cause the removal of the caller (so the instruction
2831 // is not actually duplicated, just moved).
2832 if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
2833 return InlineResult::failure("noduplicate");
2835 // If the callee's stack size exceeds the user-specified threshold,
2836 // do not let it be inlined.
2837 // The command line option overrides a limit set in the function attributes.
2838 size_t FinalStackSizeThreshold = StackSizeThreshold;
2839 if (!StackSizeThreshold.getNumOccurrences())
2840 if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2841 Caller, InlineConstants::MaxInlineStackSizeAttributeName))
2842 FinalStackSizeThreshold = *AttrMaxStackSize;
2843 if (AllocatedSize > FinalStackSizeThreshold)
2844 return InlineResult::failure("stacksize");
2846 return finalizeAnalysis();
2849 void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2850 #define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2851 if (PrintInstructionComments)
2852 F.print(OS, &Writer);
2853 DEBUG_PRINT_STAT(NumConstantArgs);
2854 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2855 DEBUG_PRINT_STAT(NumAllocaArgs);
2856 DEBUG_PRINT_STAT(NumConstantPtrCmps);
2857 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2858 DEBUG_PRINT_STAT(NumInstructionsSimplified);
2859 DEBUG_PRINT_STAT(NumInstructions);
2860 DEBUG_PRINT_STAT(SROACostSavings);
2861 DEBUG_PRINT_STAT(SROACostSavingsLost);
2862 DEBUG_PRINT_STAT(LoadEliminationCost);
2863 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2864 DEBUG_PRINT_STAT(Cost);
2865 DEBUG_PRINT_STAT(Threshold);
2866 #undef DEBUG_PRINT_STAT
2869 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2870 /// Dump stats about this call's analysis.
2871 LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2872 #endif
2874 /// Test that there are no attribute conflicts between Caller and Callee
2875 /// that prevent inlining.
2876 static bool functionsHaveCompatibleAttributes(
2877 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2878 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2879 // Note that CalleeTLI must be a copy, not a reference. The legacy pass manager
2880 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2881 // object, and always returns the same object (which is overwritten on each
2882 // GetTLI call). Therefore we copy the first result.
2883 auto CalleeTLI = GetTLI(*Callee);
2884 return (IgnoreTTIInlineCompatible ||
2885 TTI.areInlineCompatible(Caller, Callee)) &&
2886 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2887 InlineCallerSupersetNoBuiltin) &&
2888 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2891 int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
2892 const DataLayout &DL) {
2893 int64_t Cost = 0;
2894 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2895 if (Call.isByValArgument(I)) {
2896 // We approximate the number of loads and stores needed by dividing the
2897 // size of the byval type by the target's pointer size.
2898 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2899 unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
2900 unsigned AS = PTy->getAddressSpace();
2901 unsigned PointerSize = DL.getPointerSizeInBits(AS);
2902 // Ceiling division.
2903 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2905 // If it generates more than 8 stores it is likely to be expanded as an
2906 // inline memcpy so we take that as an upper bound. Otherwise we assume
2907 // one load and one store per word copied.
2908 // FIXME: The maxStoresPerMemcpy setting from the target should be used
2909 // here instead of a magic number of 8, but it's not available via
2910 // DataLayout.
2911 NumStores = std::min(NumStores, 8U);
2913 Cost += 2 * NumStores * InstrCost;
2914 } else {
2915 // For non-byval arguments subtract off one instruction per call
2916 // argument.
2917 Cost += InstrCost;
2920 // The call instruction also disappears after inlining.
2921 Cost += InstrCost;
2922 Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);
2924 return std::min<int64_t>(Cost, INT_MAX);
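// Worked example (hypothetical): a 128-byte byval argument with 64-bit
// pointers gives NumStores = ceil(1024 / 64) = 16, clamped to 8, for a cost
// of 2 * 8 * InstrCost on that argument; every other argument adds one
// InstrCost, and a final InstrCost accounts for the call instruction itself
// disappearing.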
2927 InlineCost llvm::getInlineCost(
2928 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2929 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2930 function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2931 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2932 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2933 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2934 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
2937 std::optional<int> llvm::getInliningCostEstimate(
2938 CallBase &Call, TargetTransformInfo &CalleeTTI,
2939 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2940 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2941 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2942 const InlineParams Params = {/* DefaultThreshold*/ 0,
2943 /*HintThreshold*/ {},
2944 /*ColdThreshold*/ {},
2945 /*OptSizeThreshold*/ {},
2946 /*OptMinSizeThreshold*/ {},
2947 /*HotCallSiteThreshold*/ {},
2948 /*LocallyHotCallSiteThreshold*/ {},
2949 /*ColdCallSiteThreshold*/ {},
2950 /*ComputeFullInlineCost*/ true,
2951 /*EnableDeferral*/ true};
2953 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
2954 GetAssumptionCache, GetBFI, PSI, ORE, true,
2955 /*IgnoreThreshold*/ true);
2956 auto R = CA.analyze();
2957 if (!R.isSuccess())
2958 return std::nullopt;
2959 return CA.getCost();
2962 std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
2963 CallBase &Call, TargetTransformInfo &CalleeTTI,
2964 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2965 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2966 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2967 InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
2968 ORE, *Call.getCalledFunction(), Call);
2969 auto R = CFA.analyze();
2970 if (!R.isSuccess())
2971 return std::nullopt;
2972 return CFA.features();
2975 std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
2976 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
2977 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
2979 // Cannot inline indirect calls.
2980 if (!Callee)
2981 return InlineResult::failure("indirect call");
2983 // When a callee coroutine function is inlined into a caller coroutine
2984 // function before the coro-split pass, the coro-early pass cannot handle
2985 // it well. So we don't inline a coroutine function that has not yet been
2986 // split.
2987 if (Callee->isPresplitCoroutine())
2988 return InlineResult::failure("unsplited coroutine call");
2990 // Never inline calls with byval arguments that does not have the alloca
2991 // address space. Since byval arguments can be replaced with a copy to an
2992 // alloca, the inlined code would need to be adjusted to handle that the
2993 // argument is in the alloca address space (so it is a little bit complicated
2994 // to solve).
2995 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2996 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2997 if (Call.isByValArgument(I)) {
2998 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2999 if (PTy->getAddressSpace() != AllocaAS)
3000 return InlineResult::failure("byval arguments without alloca"
3001 " address space");
3004 // Calls to functions with always-inline attributes should be inlined
3005 // whenever possible.
3006 if (Call.hasFnAttr(Attribute::AlwaysInline)) {
3007 if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
3008 return InlineResult::failure("noinline call site attribute");
3010 auto IsViable = isInlineViable(*Callee);
3011 if (IsViable.isSuccess())
3012 return InlineResult::success();
3013 return InlineResult::failure(IsViable.getFailureReason());
3016 // Never inline functions with conflicting attributes (unless callee has
3017 // always-inline attribute).
3018 Function *Caller = Call.getCaller();
3019 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
3020 return InlineResult::failure("conflicting attributes");
3022 // Don't inline this call if the caller has the optnone attribute.
3023 if (Caller->hasOptNone())
3024 return InlineResult::failure("optnone attribute");
3026 // Don't inline a function that treats null pointer as valid into a caller
3027 // that does not have this attribute.
3028 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3029 return InlineResult::failure("nullptr definitions incompatible");
3031 // Don't inline functions which can be interposed at link-time.
3032 if (Callee->isInterposable())
3033 return InlineResult::failure("interposable");
3035 // Don't inline functions marked noinline.
3036 if (Callee->hasFnAttribute(Attribute::NoInline))
3037 return InlineResult::failure("noinline function attribute");
3039 // Don't inline call sites marked noinline.
3040 if (Call.isNoInline())
3041 return InlineResult::failure("noinline call site attribute");
3043 return std::nullopt;
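
// Note the three-way contract above: a success result means "must inline"
// (always-inline and viable), a failure result means "must not inline", and
// std::nullopt means the attributes alone do not decide, so the full cost
// analysis should run. The getInlineCost overload below consumes it exactly
// this way.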

InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n");

  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make the cost-benefit-based decision explicit. We use always/never
  // here since the threshold is not meaningful, as it is not what drives the
  // cost-benefit analysis.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways("benefit over cost",
                                   CA.getCostBenefitPair());
    else
      return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
  }

  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(CA.getCost(), CA.getThreshold(),
                           CA.getStaticBonusApplied());

  // No details on how the decision was made, simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways("empty function")
             : InlineCost::getNever(ShouldInline.getFailureReason());
}

InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the current
          // backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing
          // this correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}
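
// isInlineViable is a correctness-only check: it is what the always-inline
// path in getAttributeBasedInliningDecision relies on, independent of any
// cost model or threshold.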

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that the optimization level is O3 (in the getInlineParams variant
  // that takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}
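
// For example, if -inline-threshold=500 is given on the command line,
// Params.DefaultThreshold becomes 500 regardless of the Threshold argument,
// and OptSizeThreshold/OptMinSizeThreshold are left unset, so the 500 also
// governs callees with size attributes.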

InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}
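
// In summary: -O3 and above map to OptAggressiveThreshold, -Os to
// OptSizeThreshold, -Oz to OptMinSizeThreshold, and everything else to
// DefaultThreshold.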

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has
  // effect only when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}
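
// A hypothetical front-end wiring for -O3 would therefore be:
//   InlineParams P = getInlineParams(/*OptLevel=*/3, /*SizeOptLevel=*/0);
// which starts from OptAggressiveThreshold and also seeds
// Params.LocallyHotCallSiteThreshold, whereas getInlineParams(2, 1) for
// -O2 -Os starts from OptSizeThreshold.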

PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  DataLayout DL(M);
  TargetTransformInfo TTI(DL);
  // FIXME: Redesign the usage of InlineParams to expand the scope of this
  // pass. In the current implementation, the type of InlineParams doesn't
  // matter as the pass serves only for verification of the inliner's
  // decisions. We can add a flag which determines InlineParams for this run.
  // Right now, the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        Function *CalledFunction = CI->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
                                    GetAssumptionCache, nullptr, &PSI, &ORE);
        ICCA.analyze();
        OS << " Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CI->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}