//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"
STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}
namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

class InlineCostCallAnalyzer;
// This struct is used to store information about inline cost of a
// particular instruction
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};
/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as implementation detail of functions such
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
/// expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  // Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }

  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// proceed.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }

  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize,
                                unsigned NumCaseCluster) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}
  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }
  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive call.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);
public:
  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
               ProfileSummaryInfo *PSI = nullptr,
               OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  InlineResult analyze();

  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
    if (SimplifiedValues.contains(I))
      return SimplifiedValues[I];
    return std::nullopt;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;
  // When forming a binary search tree, we want to find the number of nodes,
  // which is the same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is:
  // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  // and f(n) = n, when n <= 3.
  // This leads to a binary tree where each leaf is either f(2) or f(3)
  // when n > 3. So, the number of comparisons from leaves should be n, while
  // the number of non-leaf nodes should be:
  // 2^(log2(n) - 1) - 1
  // = 2^log2(n) * 2^-1 - 1
  // = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  // n + n / 2 - 1 = n * 3 / 2 - 1
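  // As a sanity check of the closed form: with n = 8 clusters the recursion
  // gives f(4) = 1 + f(2) + f(2) = 5 and f(8) = 1 + f(4) + f(4) = 11, and the
  // closed form gives 8 * 3 / 2 - 1 = 11 as well.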
  int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
    return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
  }
/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost its inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);

  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);
  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc) {
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
  }
  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Maximum valid cost increased in this function.
    if (JumpTableSize) {
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 4 * InstrCost;
      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      addCost(NumCaseCluster * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;

    addCost(SwitchCost);
  }

  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }
  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostBefore = Cost;
    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostAfter = Cost;
    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
  }
  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!(PSI->hasInstrumentationProfile() || PSI->hasSampleProfile()))
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call site.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }
  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }
  // Determine whether we should inline the given call site, taking into account
  // both the size cost and the cycle savings. Return std::nullopt if we don't
  // have sufficient profiling information to determine.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
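    // (A billion folded instructions at a count of 10^^15 each is 10^^24
    // total cycles, which is roughly 2^^80 and still leaves ample headroom
    // below 2^^128.)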
    APInt CycleSavings(128, 0);

    for (BasicBlock &BB : F) {
      APInt CurrentSavings(128, 0);
      for (Instruction &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (isa_and_present<ConstantInt>(
                  SimplifiedValues.lookup(SI->getCondition())))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }
    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);
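    // Adding EntryCount / 2 before the unsigned division rounds the per-call
    // savings to the nearest integer rather than truncating toward zero; e.g.
    // total savings of 10 over 4 calls yields (10 + 2) / 4 = 3 instead of 2.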
    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);
    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //             PSI->getOrCompHotCountThreshold()
    // R > -------------------------------------------------
    //     getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                PSI->getOrCompHotCountThreshold()
    // R <= ----------------------------------------------------
    //      getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
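    //
    // As a worked example, if PSI->getOrCompHotCountThreshold() is 1000 and
    // the savings/profitable multipliers resolve to 8 and 4 (the option
    // defaults above), the checks below accept the inlining when
    // CycleSavings * 8 >= 1000 * Size and reject it when
    // CycleSavings * 4 < 1000 * Size.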
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }
  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }
public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details
  InlineCostAnnotationWriter Writer;

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    if (InstructionCostDetailMap.contains(I))
      return InstructionCostDetailMap[I];
    return std::nullopt;
  }

  virtual ~InlineCostCallAnalyzer() = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};
// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}
class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 4;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROACosts.find(Arg);
    if (CostIt == SROACosts.end())
      return;
    increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
    SROACostSavingOpportunities -= CostIt->second;
    SROACosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    set(InlineCostFeatureIndex::load_elimination, 1);
  }

  void onCallPenalty() override {
    increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
  }

  void onCallArgumentSetup(const CallBase &Call) override {
    increment(InlineCostFeatureIndex::call_argument_setup,
              Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    increment(InlineCostFeatureIndex::lowered_call_arg_setup,
              Call.arg_size() * InstrCost);

    if (IsIndirectCall) {
      InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
                                         /*HintThreshold*/ {},
                                         /*ColdThreshold*/ {},
                                         /*OptSizeThreshold*/ {},
                                         /*OptMinSizeThreshold*/ {},
                                         /*HotCallSiteThreshold*/ {},
                                         /*LocallyHotCallSiteThreshold*/ {},
                                         /*ColdCallSiteThreshold*/ {},
                                         /*ComputeFullInlineCost*/ true,
                                         /*EnableDeferral*/ true};
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;

      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false,
                                /*IgnoreThreshold*/ true);
      if (CA.analyze().isSuccess()) {
        increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
                  CA.getCost());
        increment(InlineCostFeatureIndex::nested_inlines, 1);
      }
    } else {
      onCallPenalty();
    }
  }
  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {

    if (JumpTableSize) {
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                NumCaseCluster * CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::num_loops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::simplified_instructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
    set(InlineCostFeatureIndex::constant_offset_ptr_args,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::load_elimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::callsite_cost,
              -1 * getCallsiteCost(TTI, this->CandidateCall, DL));

    set(InlineCostFeatureIndex::cold_cc_penalty,
        (F.getCallingConv() == CallingConv::Cold));

    set(InlineCostFeatureIndex::last_call_to_static_bonus,
        isSoleCallToLocalFunction(CandidateCall, F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzer - instead, we should abstract it to a common method in the
    // CallAnalyzer.
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }
public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}

  const InlineCostFeatures &features() const { return Cost; }
};
/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining of the given instruction is printed always.
  // The threshold delta is printed only when it is non-zero. It happens
  // when we decided to give a bonus at a particular instruction.
  std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
  if (!Record)
    OS << "; No analysis for the instruction";
  else {
    OS << "; cost before = " << Record->CostBefore
       << ", cost after = " << Record->CostAfter
       << ", threshold before = " << Record->ThresholdBefore
       << ", threshold after = " << Record->ThresholdAfter << ", ";
    OS << "cost delta = " << Record->getCostDelta();
    if (Record->hasThresholdChanged())
      OS << ", threshold delta = " << Record->getThresholdDelta();
  }
  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
  if (C) {
    OS << ", simplified to ";
    (*C)->print(OS, true);
  }
  OS << "\n";
}
/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (const Use &Op : GEP.indices())
    if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(Op);
  return TTI.getInstructionCost(&GEP, Operands,
                                TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  disableSROA(I.getOperand(0));

  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      // Sometimes a dynamic alloca could be converted into a static alloca
      // after this constant prop, and become a huge static alloca on an
      // unconditional CFG path. Avoid inlining if this is going to happen above
      // a threshold.
      // FIXME: If the threshold is removed or lowered too much, we could end up
      // being too pessimistic and prevent inlining non-problematic code. This
      // could result in unintended perf regressions. A better overall strategy
      // is needed to track stack usage during inlining.
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(),
          DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
        HasDynamicAlloca = true;
      return false;
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
                                  AllocatedSize);
  }

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  if (!I.isStaticAlloca())
    HasDynamicAlloca = true;

  return false;
}
&I
) {
1498 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1499 // though we don't want to propagate it's bonuses. The idea is to disable
1500 // SROA if it *might* be used in an inappropriate manner.
1502 // Phi nodes are always zero-cost.
1503 // FIXME: Pointer sizes may differ between different address spaces, so do we
1504 // need to use correct address space in the call to getPointerSizeInBits here?
1505 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1506 // see the ZeroOffset is used as a dummy value, so we can probably use any
1507 // bit width for the ZeroOffset?
1508 APInt ZeroOffset
= APInt::getZero(DL
.getPointerSizeInBits(0));
1509 bool CheckSROA
= I
.getType()->isPointerTy();
1511 // Track the constant or pointer with constant offset we've seen so far.
1512 Constant
*FirstC
= nullptr;
1513 std::pair
<Value
*, APInt
> FirstBaseAndOffset
= {nullptr, ZeroOffset
};
1514 Value
*FirstV
= nullptr;
1516 for (unsigned i
= 0, e
= I
.getNumIncomingValues(); i
!= e
; ++i
) {
1517 BasicBlock
*Pred
= I
.getIncomingBlock(i
);
1518 // If the incoming block is dead, skip the incoming block.
1519 if (DeadBlocks
.count(Pred
))
1521 // If the parent block of phi is not the known successor of the incoming
1522 // block, skip the incoming block.
1523 BasicBlock
*KnownSuccessor
= KnownSuccessors
[Pred
];
1524 if (KnownSuccessor
&& KnownSuccessor
!= I
.getParent())
1527 Value
*V
= I
.getIncomingValue(i
);
1528 // If the incoming value is this phi itself, skip the incoming value.
1532 Constant
*C
= dyn_cast
<Constant
>(V
);
1534 C
= SimplifiedValues
.lookup(V
);
1536 std::pair
<Value
*, APInt
> BaseAndOffset
= {nullptr, ZeroOffset
};
1537 if (!C
&& CheckSROA
)
1538 BaseAndOffset
= ConstantOffsetPtrs
.lookup(V
);
1540 if (!C
&& !BaseAndOffset
.first
)
1541 // The incoming value is neither a constant nor a pointer with constant
1542 // offset, exit early.
1547 // If we've seen a constant incoming value before and it is the same
1548 // constant we see this time, continue checking the next incoming value.
1550 // Otherwise early exit because we either see a different constant or saw
1551 // a constant before but we have a pointer with constant offset this time.
1556 // The same logic as above, but check pointer with constant offset here.
1557 if (FirstBaseAndOffset
== BaseAndOffset
)
1563 // This is the 1st time we've seen a constant, record it.
1568 // The remaining case is that this is the 1st time we've seen a pointer with
1569 // constant offset, record it.
1571 FirstBaseAndOffset
= BaseAndOffset
;
1574 // Check if we can map phi to a constant.
1576 SimplifiedValues
[&I
] = FirstC
;
1580 // Check if we can map phi to a pointer with constant offset.
1581 if (FirstBaseAndOffset
.first
) {
1582 ConstantOffsetPtrs
[&I
] = FirstBaseAndOffset
;
1584 if (auto *SROAArg
= getSROAArgForValueOrNull(FirstV
))
1585 SROAArgValues
[&I
] = SROAArg
;
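// Illustrative example (not part of the original comments): for
//   %p = phi ptr [ %a, %bb1 ], [ %a, %bb2 ]
// where both live incoming values carry the same (base, constant-offset) pair
// derived from a call-site argument, the phi inherits that pair and stays an
// SROA candidate; if instead all live incoming values simplify to one constant,
// the phi itself is recorded in SimplifiedValues and costs nothing.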
/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into the running offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}
bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (const Use &Op : GEP.indices())
      if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
        return false;
    return true;
  };

  if (!DisableGEPConstOperand)
    if (simplifyInstruction(I))
      return true;

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROAArg)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROAArg)
    disableSROAForArg(SROAArg);
  return isGEPFree(I);
}
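// Illustrative example (not part of the original comments): if the call-site
// argument %arg is tracked as `alloca + 16`, then
//   %q = getelementptr inbounds i32, ptr %arg, i64 4
// folds to `alloca + 32`, is modeled as free, and keeps the argument eligible
// for SROA savings; a GEP with an unsimplified variable index instead disables
// SROA for that argument and is charged via isGEPFree().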
/// Simplify \p I if its operands are constants and update SimplifiedValues.
bool CallAnalyzer::simplifyInstruction(Instruction &I) {
  SmallVector<Constant *> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = ConstantFoldInstOperands(&I, COps, DL);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}
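// Illustrative example (not part of the original comments): if argument %x is
// the constant 3 at this call site, `%y = shl i32 %x, 1` constant-folds to 6
// via ConstantFoldInstOperands, so %y is recorded in SimplifiedValues and no
// per-instruction cost is charged for it.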
/// Try to simplify a call to llvm.is.constant.
///
/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
/// we expect calls of this specific intrinsic to be infrequent.
///
/// FIXME: Given that we know CB's parent (F) caller
/// (CandidateCall->getParent()->getParent()), we might be able to determine
/// whether inlining F into F's caller would change how the call to
/// llvm.is.constant would evaluate.
bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
  Value *Arg = CB.getArgOperand(0);
  auto *C = dyn_cast<Constant>(Arg);
  if (!C)
    C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));

  Type *RT = CB.getFunctionType()->getReturnType();
  SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
  return true;
}
bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
  // As per the langref, "The fourth argument to llvm.objectsize determines if
  // the value should be evaluated at runtime."
  if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
    return false;

  Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
                                 /*MustSucceed=*/true);
  Constant *C = dyn_cast_or_null<Constant>(V);
  if (C)
    SimplifiedValues[&CB] = C;
  return C;
}
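// Illustrative example (not part of the original comments): inside the callee,
// `call i1 @llvm.is.constant.i32(i32 %n)` folds to 1 when the call site passes
// a literal for %n and to 0 otherwise, and an @llvm.objectsize call whose
// fourth argument is false is lowered to a constant size where possible; both
// results then feed later branch simplification during this analysis.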
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize == DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  if (auto *SROAArg = getSROAArgForValueOrNull(Op))
    SROAArgValues[&I] = SROAArg;

  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}
bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I))
    return true;

  // Disable SROA in the face of arbitrary casts we don't explicitly list
  // elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      onCallPenalty();
    break;
  default:
    break;
  }

  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}
bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}
bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, inlining hot_call_X() into a caller that calls
  // exit(0) immediately afterwards may still be beneficial.
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}
bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
                                            BlockFrequencyInfo *CallerBFI) {
  // If a global profile summary is available, then the callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(Call, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to the caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}
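// Illustrative arithmetic (not part of the original comments): with
// ColdCallSiteRelFreq set to 2, a call site whose block frequency is below 2%
// of the caller's entry frequency is classified as cold and is then evaluated
// against the much smaller ColdCallSiteThreshold in updateThreshold().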
std::optional<int>
InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                                BlockFrequencyInfo *CallerBFI) {
  // If a global profile summary is available, then the callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return std::nullopt;

  // Determine if the callsite is hot relative to the caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BasicBlock *CallSiteBB = Call.getParent();
  BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
  std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
  if (Limit && CallSiteFreq >= *Limit)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return std::nullopt;
}
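// Illustrative arithmetic (not part of the original comments): if the caller's
// entry frequency is 8 and HotCallSiteRelFreq is 60, a call site whose block
// frequency is at least 8 * 60 = 480 is treated as locally hot and receives
// Params.LocallyHotCallSiteThreshold instead of the regular threshold.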
void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, std::optional<int> B) {
    return B ? std::min(A, *B) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, std::optional<int> B) {
    return B ? std::max(A, *B) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = *HotCallSiteThreshold;
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use the callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  Threshold += TTI.adjustInliningThreshold(&Call);

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (isSoleCallToLocalFunction(Call, F)) {
    Cost -= LastCallToStaticBonus;
    StaticBonusApplied = LastCallToStaticBonus;
  }
}
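// Illustrative arithmetic (not part of the original comments): starting from a
// base threshold of 225, a callee carrying inlinehint in a caller compiled
// without optsize is raised to the hint threshold (325 by default); with a
// vector bonus percent of 150 this gives SingleBBBonus = 325 * 50 / 100 = 162
// and VectorBonus = 325 * 150 / 100 = 487, both of which are withdrawn later if
// the callee turns out not to qualify for them.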
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  auto isImplicitNullCheckCmp = [](const CmpInst &I) {
    for (auto *User : I.users())
      if (auto *Instr = dyn_cast<Instruction>(User))
        if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
          return false;
    return true;
  };

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
    if (isKnownNonNullInCallee(I.getOperand(0))) {
      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                        : ConstantInt::getFalse(I.getType());
      return true;
    }
    // Implicit null checks act as unconditional branches and their comparisons
    // should be treated as simplified and free of cost.
    if (isImplicitNullCheckCmp(I))
      return true;
  }
  return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
}
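// Illustrative example (not part of the original comments): when both operands
// of `%c = icmp ult ptr %p, %q` are tracked as the same call-site alloca base
// with constant offsets 8 and 24, the compare folds to true, NumConstantPtrCmps
// is bumped, and any branch on %c can later be resolved to a single successor.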
bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
                            FI->getFastMathFlags(), DL);
  else
    SimpleV =
        simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such. Unless it's fneg which can be implemented with an xor.
  using namespace llvm::PatternMatch;
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
      !match(&I, m_FNeg(m_Value())))
    onCallPenalty();

  return false;
}
bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
  Value *Op = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Op);
  if (!COp)
    COp = SimplifiedValues.lookup(Op);

  Value *SimpleV = simplifyFNegInst(
      COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
  disableSROA(Op);

  return false;
}
bool CallAnalyzer::visitLoad(LoadInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    onLoadEliminationOpportunity();
    return true;
  }

  return false;
}
bool CallAnalyzer::visitStore(StoreInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();

  return false;
}
bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitExtractValue(I);
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitInsertValue(I);
}
/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(&Call, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(Call.arg_size());
  for (Value *I : Call.args()) {
    Constant *C = dyn_cast<Constant>(I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
    SimplifiedValues[&Call] = C;
    return true;
  }

  return false;
}
bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (!onCallBaseVisitStart(Call))
    return true;

  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  Function *F = Call.getCalledFunction();
  bool IsIndirectCall = !F;
  if (IsIndirectCall) {
    // Check if this happens to be an indirect function call to a known
    // function in this inline context. If not, we've done all we can.
    Value *Callee = Call.getCalledOperand();
    F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
    if (!F || F->getFunctionType() != Call.getFunctionType()) {
      onCallArgumentSetup(Call);

      if (!Call.onlyReadsMemory())
        disableLoadElimination();
      return Base::visitCallBase(Call);
    }
  }

  assert(F && "Expected a call to a known function");

  // When we have a concrete function, first try to simplify it directly.
  if (simplifyCallSite(F, Call))
    return true;

  // Next check if it is an intrinsic we know about.
  // FIXME: Lift this into part of the InstVisitor.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
    switch (II->getIntrinsicID()) {
    default:
      if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
        disableLoadElimination();
      return Base::visitCallBase(Call);

    case Intrinsic::load_relative:
      onLoadRelativeIntrinsic();
      return false;

    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      disableLoadElimination();
      // SROA can usually chew through these intrinsics, but they aren't free.
      return false;
    case Intrinsic::icall_branch_funnel:
    case Intrinsic::localescape:
      HasUninlineableIntrinsic = true;
      return false;
    case Intrinsic::vastart:
      InitsVargArgs = true;
      return false;
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
        SROAArgValues[II] = SROAArg;
      return true;
    case Intrinsic::is_constant:
      return simplifyIntrinsicCallIsConstant(Call);
    case Intrinsic::objectsize:
      return simplifyIntrinsicCallObjectSize(Call);
    }
  }

  if (F == Call.getFunction()) {
    // This flag will fully abort the analysis, so don't bother with anything
    // else.
    IsRecursiveCall = true;
    if (!AllowRecursiveCall)
      return false;
  }

  if (TTI.isLoweredToCall(F)) {
    onLoweredCall(F, Call, IsIndirectCall);
  }

  if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
    disableLoadElimination();
  return Base::visitCallBase(Call);
}
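// Illustrative example (not part of the original comments): an indirect call
// `call void %fp()` where %fp has been simplified to a known function @g with a
// matching signature is analyzed as a direct call to @g (constant folding,
// intrinsic handling, lowered-call costs); otherwise only the argument setup
// cost is charged and load elimination is disabled when the call may write
// memory.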
bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}
bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         BI.getMetadata(LLVMContext::MD_make_implicit) ||
         isa_and_nonnull<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()  ? TrueVal
                     : (CondC->isNullValue()) ? FalseVal
                                              : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantFoldSelectInstruction() can handle the
    // cases such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}
bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  unsigned JumpTableSize = 0;
  BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);

  onFinalizeSwitch(JumpTableSize, NumCaseCluster);
  return false;
}
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}
bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}
bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
      TargetTransformInfo::TCC_Free)
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (const Use &Op : I.operands())
    disableSROA(Op);

  return false;
}
/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           SmallPtrSetImpl<const Value *> &EphValues) {
  for (Instruction &I : *BB) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    // Similarly, skip pseudo-probes.
    if (I.isDebugOrPseudoInst())
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    onInstructionAnalysisStart(&I);

    if (Base::visit(&I))
      ++NumInstructionsSimplified;
    else
      onMissedSimplification();

    onInstructionAnalysisFinish(&I);
    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    InlineResult IR = InlineResult::success();
    if (IsRecursiveCall && !AllowRecursiveCall)
      IR = InlineResult::failure("recursive");
    else if (ExposesReturnsTwice)
      IR = InlineResult::failure("exposes returns twice");
    else if (HasDynamicAlloca)
      IR = InlineResult::failure("dynamic alloca");
    else if (HasIndirectBr)
      IR = InlineResult::failure("indirect branch");
    else if (HasUninlineableIntrinsic)
      IR = InlineResult::failure("uninlinable intrinsic");
    else if (InitsVargArgs)
      IR = InlineResult::failure("varargs");
    if (!IR.isSuccess()) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " has uninlinable pattern ("
                 << NV("InlineResult", IR.getFailureReason())
                 << ") and cost is not fully computed";
        });
      return IR;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
      auto IR = InlineResult::failure(
          "recursive and allocates too much stack space");
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " is "
                 << NV("InlineResult", IR.getFailureReason())
                 << ". Cost is not fully computed";
        });
      return IR;
    }

    if (shouldStop())
      return InlineResult::failure(
          "Call site analysis is not favorable to inlining.");
  }

  return InlineResult::success();
}
/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getZero(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        break;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IdxPtrTy = DL.getIndexType(V->getType());
  return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
}
/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under exam.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead).second)
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}
/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = CandidateCall.arg_begin();
  for (Argument &FAI : F.args()) {
    assert(CAI != CandidateCall.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
        SROAArgValues[&FAI] = SROAArg;
        onInitializeSROAArg(SROAArg);
        EnabledSROAAllocas.insert(SROAArg);
      }
    }
    ++CAI;
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    BasicBlock *BB = BBWorklist[Idx];

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches. But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    onBlockAnalyzed(BB);
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
    return InlineResult::failure("noduplicate");

  // If the callee's stack size exceeds the user-specified threshold,
  // do not let it be inlined.
  // The command line option overrides a limit set in the function attributes.
  size_t FinalStackSizeThreshold = StackSizeThreshold;
  if (!StackSizeThreshold.getNumOccurrences())
    if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
            Caller, InlineConstants::MaxInlineStackSizeAttributeName))
      FinalStackSizeThreshold = *AttrMaxStackSize;
  if (AllocatedSize > FinalStackSizeThreshold)
    return InlineResult::failure("stacksize");

  return finalizeAnalysis();
}
void InlineCostCallAnalyzer::print(raw_ostream &OS) {
#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
  if (PrintInstructionComments)
    F.print(OS, &Writer);
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
#endif
/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(
    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
  // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
  // object, and always returns the same object (which is overwritten on each
  // GetTLI call). Therefore we copy the first result.
  auto CalleeTLI = GetTLI(*Callee);
  return (IgnoreTTIInlineCompatible ||
          TTI.areInlineCompatible(Caller, Callee)) &&
         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
                                             InlineCallerSupersetNoBuiltin) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}
int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
                          const DataLayout &DL) {
  int64_t Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InstrCost;
  Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);

  return std::min<int64_t>(Cost, INT_MAX);
}
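// Illustrative arithmetic (not part of the original comments): for a byval
// argument of a 64-byte struct on a target with 64-bit pointers,
// TypeSize = 512 bits and PointerSize = 64, so NumStores = (512 + 63) / 64 = 8
// and the argument contributes 2 * 8 * InstrCost; a plain scalar argument
// contributes a single InstrCost, plus one more InstrCost for the call
// instruction that disappears after inlining.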
InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
}
std::optional<int> llvm::getInliningCostEstimate(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  const InlineParams Params = {/* DefaultThreshold*/ 0,
                               /*HintThreshold*/ {},
                               /*ColdThreshold*/ {},
                               /*OptSizeThreshold*/ {},
                               /*OptMinSizeThreshold*/ {},
                               /*HotCallSiteThreshold*/ {},
                               /*LocallyHotCallSiteThreshold*/ {},
                               /*ColdCallSiteThreshold*/ {},
                               /*ComputeFullInlineCost*/ true,
                               /*EnableDeferral*/ true};

  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE, true,
                            /*IgnoreThreshold*/ true);
  auto R = CA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CA.getCost();
}
std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
                                 ORE, *Call.getCalledFunction(), Call);
  auto R = CFA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CFA.features();
}
std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure("indirect call");

  // When a callee coroutine function is inlined into a caller coroutine
  // function before the coro-split pass, coro-early cannot handle it very
  // well. So we won't inline a coroutine function that has not been split yet.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure("unsplited coroutine call");

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure("byval arguments without alloca"
                                     " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
      return InlineResult::failure("noinline call site attribute");

    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
    return InlineResult::failure("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure("noinline call site attribute");

  return std::nullopt;
}
InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n");

  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make cost benefit based decision explicit.
  // We use always/never here since threshold is not meaningful,
  // as it's not what drives cost-benefit analysis.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways("benefit over cost",
                                   CA.getCostBenefitPair());
    return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
  }

  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(CA.getCost(), CA.getThreshold(),
                           CA.getStaticBonusApplied());

  // No details on how the decision was made, simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways("empty function")
             : InlineCost::getNever(ShouldInline.getFailureReason());
}
InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the current
          // backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing so
          // correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}
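
// Illustrative sketch, not compiled into this file: isInlineViable() is the
// gate that even always_inline callees must pass, since it rejects constructs
// the inliner cannot transplant (indirect branches, blockaddress uses outside
// callbr, exposed returns_twice calls, and the intrinsics handled above). The
// helper below is hypothetical.
#if 0
static bool canForceInline(Function &Callee) {
  InlineResult Viable = isInlineViable(Callee);
  if (Viable.isSuccess())
    return true;
  errs() << "cannot inline " << Callee.getName() << ": "
         << Viable.getFailureReason() << "\n";
  return false;
}
#endif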
// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  //  If the -inline-threshold flag is explicitly specified, that is used
  //  irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }

  return Params;
}
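
// Illustrative sketch, not compiled into this file: the precedence implemented
// above. An explicit -inline-threshold on the command line wins over the
// Threshold argument; the hint/hot/cold knobs always mirror their cl::opt
// counterparts. The concrete numbers assume the stock flag defaults
// (-inline-threshold=225, -inlinehint-threshold=325) and are examples only.
#if 0
static void thresholdPrecedenceExample() {
  InlineParams P = getInlineParams(/*Threshold=*/100);
  // With no explicit -inline-threshold, the argument is used verbatim.
  assert(P.DefaultThreshold == 100);
  // HintThreshold tracks -inlinehint-threshold (default 325).
  assert(P.HintThreshold && *P.HintThreshold == 325);
}
#endif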
InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}
// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has
  // effect only when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}
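
// Illustrative sketch, not compiled into this file: how the opt/size levels
// map onto thresholds via computeThresholdFromOptLevels(). The named
// InlineConstants are the ones referenced above; their numeric values can
// drift between releases, so they are not asserted here.
#if 0
static void optLevelMappingExample() {
  // -O3: OptAggressiveThreshold, and LocallyHotCallSiteThreshold is populated.
  InlineParams O3 = getInlineParams(/*OptLevel=*/3, /*SizeOptLevel=*/0);
  // -Os maps to OptSizeThreshold, -Oz to OptMinSizeThreshold.
  InlineParams Os = getInlineParams(/*OptLevel=*/2, /*SizeOptLevel=*/1);
  InlineParams Oz = getInlineParams(/*OptLevel=*/2, /*SizeOptLevel=*/2);
  (void)O3;
  (void)Os;
  (void)Oz;
}
#endif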
PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  DataLayout DL(M);
  TargetTransformInfo TTI(DL);
  // FIXME: Redesign the usage of InlineParams to expand the scope of this
  // pass. In the current implementation, the type of InlineParams doesn't
  // matter as the pass serves only for verification of the inliner's
  // decisions. We could add a flag which determines the InlineParams for this
  // run. Right now, the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        Function *CalledFunction = CI->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
                                    GetAssumptionCache, nullptr, &PSI, &ORE);
        ICCA.analyze();
        OS << "       Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CI->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}
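
// Illustrative usage, not compiled into this file: this printer pass is
// normally driven through the new pass manager via opt. The pass-name
// spelling below assumes the registration in PassRegistry.def and may differ
// between releases:
//
//   opt -passes='print<inline-cost>' -disable-output input.ll
//
// Each direct, defined callee in the function is re-analyzed with the default
// InlineParams and its per-instruction cost annotations are printed to OS.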