//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <climits>
#include <cstdint>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::desc("Default amount of inlining to perform"));

// We introduce this option since there is a minor compile-time win by avoiding
// addition of TTI attributes (target-features in particular) to inline
// candidates when they are guaranteed to be the same as top level methods in
// some use cases. If we avoid adding the attribute, we need an option to avoid
// checking these attributes.
static cl::opt<bool> IgnoreTTIInlineCompatible(
    "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
    cl::desc("Ignore TTI attributes compatibility check between callee/caller "
             "during inline cost calculation"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instruction based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225),
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

// InlineSavingsMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8),
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
// specified explicitly in command line options. This option is exposed
// for tuning and testing.
static cl::opt<int> InlineSavingsProfitableMultiplier(
    "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
    cl::desc("A multiplier on top of cycle savings to decide whether the "
             "savings won't justify the cost"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<uint64_t> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<int>
    InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
              cl::desc("Cost of a single instruction when inlining"));

static cl::opt<int>
    MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
                  cl::desc("Cost of load/store instruction when inlining"));

static cl::opt<int> CallPenalty(
    "inline-call-penalty", cl::Hidden, cl::init(25),
    cl::desc("Call penalty that is applied per callsite when inlining"));

static cl::opt<size_t>
    StackSizeThreshold("inline-max-stacksize", cl::Hidden,
                       cl::init(std::numeric_limits<size_t>::max()),
                       cl::desc("Do not inline functions with a stack size "
                                "that exceeds the specified limit"));

static cl::opt<size_t> RecurStackSizeThreshold(
    "recursive-inline-max-stacksize", cl::Hidden,
    cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
    cl::desc("Do not inline recursive functions with a stack "
             "size that exceeds the specified limit"));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));

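// Helpers for reading an integer value out of a string function or call-site
// attribute (e.g. an attribute spelled "function-inline-cost"="15"). A missing
// or non-numeric attribute value yields std::nullopt.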
namespace llvm {
std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
  if (Attr.isValid()) {
    int AttrValue = 0;
    if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
      return AttrValue;
  }
  return std::nullopt;
}

std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
  return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
}

std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
  return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
}

namespace InlineConstants {
int getInstrCost() { return InstrCost; }

} // namespace InlineConstants

} // namespace llvm

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about inline cost of a
// particular instruction
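// The deltas (CostAfter - CostBefore and ThresholdAfter - ThresholdBefore)
// give that instruction's individual contribution, which is what the
// instruction annotation writer prints.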
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as implementation detail of functions such
/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
/// expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() = default;
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Getter for TargetLibraryInfo
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  // Called before a basic block was analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction was analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction was analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }

  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }

  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called when we visit a CallBase, before the analysis starts. Return false
  /// to stop further processing of the instruction.
  virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for a load or store.
  virtual void onMemAccess() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                                bool DefaultDestUndefined) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}
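
  /// Handle a use of an SROA candidate: credit the expected savings when
  /// \p DoNotDisable is set, otherwise treat the use as one that blocks SROA
  /// for the underlying alloca.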
  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SmallPtrSet<BasicBlock *, 16> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination = true;

  /// Whether we allow inlining for recursive call.
  bool AllowRecursiveCall = false;

  SmallPtrSet<Value *, 16> LoadAddrSet;

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  /// Use a value in its given form directly if possible, otherwise try looking
  /// for it in SimplifiedValues.
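  /// For example, getDirectOrSimplifiedValue&lt;ConstantInt&gt;(V) returns V itself
  /// if it already is a ConstantInt, otherwise the ConstantInt that V has been
  /// simplified to (if any), and nullptr when neither holds.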
  template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
    if (auto *Direct = dyn_cast<T>(V))
      return Direct;
    return dyn_cast_if_present<T>(SimplifiedValues.lookup(V));
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  bool simplifyInstruction(Instruction &I);
  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
  bool simplifyIntrinsicCallObjectSize(CallBase &CB);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
  bool isLoweredToCall(Function *F, CallBase &Call);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(
      Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        GetTLI(GetTLI), PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
        CandidateCall(Call) {}

  InlineResult analyze();

  std::optional<Constant *> getSimplifiedValue(Instruction *I) {
    auto It = SimplifiedValues.find(I);
    if (It != SimplifiedValues.end())
      return It->second;
    return std::nullopt;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  // Considering forming a binary search, we should find the number of nodes,
  // which is the same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is:
  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  //   f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaves are either f(2) or f(3)
  // when n > 3. So, the number of comparisons from leaves should be n, while
  // the number of comparisons from non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //     = 2^log2(n) * 2^-1 - 1
  //     = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1.
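  // For example, with n = 8 case clusters the estimate is 8 * 3 / 2 - 1 = 11
  // comparisons.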
  int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
    return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
  }
};

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;

  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only with the
  // flag PrintInstructionComments on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// The amount of StaticBonus applied.
  int StaticBonusApplied = 0;

  /// Attempt to evaluate indirect calls to boost their inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-threshold analysis.
  bool DecidedByCostThreshold = false;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  // The cost-benefit pair computed by cost-benefit analysis.
  std::optional<CostBenefitPair> CostBenefit;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  std::optional<int> getHotCallSiteThreshold(CallBase &Call,
                                             BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
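  /// Both the increment and the running total are clamped to the int range so
  /// that repeated additions saturate rather than overflow.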
  void addCost(int64_t Inc) {
    Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
    Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  bool onCallBaseVisitStart(CallBase &Call) override {
    if (std::optional<int> AttrCallThresholdBonus =
            getStringFnAttrAsInt(Call, "call-threshold-bonus"))
      Threshold += *AttrCallThresholdBonus;

    if (std::optional<int> AttrCallCost =
            getStringFnAttrAsInt(Call, "call-inline-cost")) {
      addCost(*AttrCallCost);
      // Prevent further processing of the call since we want to override its
      // inline cost, not just add to it.
      return false;
    }
    return true;
  }

  void onCallPenalty() override { addCost(CallPenalty); }

  void onMemAccess() override { addCost(MemAccessCost); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);
  }

  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, GetTLI, PSI, ORE,
                                false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
                                       CallPenalty));
  }

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUndefined) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Maximum valid cost increased in this function.
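    // For example, a 16-entry jump table with a reachable default destination
    // costs (2 + 16 + 2) * InstrCost: the default compare-and-branch, one
    // entry per case, and the table load plus indirect jump.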
    if (JumpTableSize) {
      // Suppose a default branch includes one compare and one conditional
      // branch if it's reachable.
      if (!DefaultDestUndefined)
        addCost(2 * InstrCost);
      // Suppose a jump table requires one load and one jump instruction.
      int64_t JTCost =
          static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
      addCost(JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      // We can reduce a set of instructions if the default branch is
      // undefined.
      addCost((NumCaseCluster - DefaultDestUndefined) * 2 * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);
    int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;

    addCost(SwitchCost);
  }

  void onMissedSimplification() override { addCost(InstrCost); }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACostSavings += SROAArgCost;
    SROAArgCosts[Arg] = SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InstrCost;
    SROACostSavings += InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      if (*ProfileCount == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction was assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostBefore = Cost;
    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostAfter = Cost;
    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call site.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount->getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
    if (InlineSavingsMultiplier.getNumOccurrences())
      return InlineSavingsMultiplier;
    return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
  }

  // A helper function to choose between command line override and default.
  unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
    if (InlineSavingsProfitableMultiplier.getNumOccurrences())
      return InlineSavingsProfitableMultiplier;
    return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
  }

  void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
    if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
            CandidateCall, "inline-cycle-savings-for-test")) {
      CycleSavings = *AttrCycleSavings;
    }

    if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
            CandidateCall, "inline-runtime-cost-for-test")) {
      Size = *AttrRuntimeCost;
    }
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return std::nullopt if
  // we don't have sufficient profiling information to determine.
  std::optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return std::nullopt;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return std::nullopt;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              isa_and_nonnull<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InstrCost;
          }
        } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
          if (isa_and_present<ConstantInt>(
                  SimplifiedValues.lookup(SI->getCondition())))
            CurrentSavings += InstrCost;
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      CurrentSavings *= *ProfileCount;
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
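    // (Adding EntryCount / 2 before the unsigned division rounds the result to
    // the nearest integer instead of truncating.)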
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount && EntryProfileCount->getCount());
    auto EntryCount = EntryProfileCount->getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
    CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);

    // Remove the cost of the cold basic blocks to model the runtime cost more
    // accurately. Both machine block placement and function splitting could
    // place cold blocks further from hot blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
    CostBenefit.emplace(APInt(128, Size), CycleSavings);

    // Let R be the ratio of CycleSavings to Size. We accept the inlining
    // opportunity if R is really high and reject if R is really low. If R is
    // somewhere in the middle, we fall back to the cost-based analysis.
    //
    // Specifically, let R = CycleSavings / Size, we accept the inlining
    // opportunity if:
    //
    //             PSI->getOrCompHotCountThreshold()
    // R > -------------------------------------------------
    //     getInliningCostBenefitAnalysisSavingsMultiplier()
    //
    // and reject the inlining opportunity if:
    //
    //                PSI->getOrCompHotCountThreshold()
    // R <= ----------------------------------------------------
    //      getInliningCostBenefitAnalysisProfitableMultiplier()
    //
    // Otherwise, we fall back to the cost-based analysis.
    //
    // Implementation-wise, use multiplication (CycleSavings * Multiplier,
    // HotCountThreshold * Size) rather than division to avoid precision loss.
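    // For example, with CycleSavings = 1000, Size = 10, a hot-count threshold
    // of 4000, and a savings multiplier of 8: 1000 * 8 = 8000 is less than
    // 4000 * 10 = 40000, so the call is not accepted outright and the
    // profitable-multiplier check below decides between rejection and the
    // cost-based fallback.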
    APInt Threshold(128, PSI->getOrCompHotCountThreshold());
    Threshold *= Size;

    APInt UpperBoundCycleSavings = CycleSavings;
    UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
    if (UpperBoundCycleSavings.uge(Threshold))
      return true;

    APInt LowerBoundCycleSavings = CycleSavings;
    LowerBoundCycleSavings *=
        getInliningCostBenefitAnalysisProfitableMultiplier();
    if (LowerBoundCycleSavings.ult(Threshold))
      return false;

    // Otherwise, fall back to the cost-based analysis.
    return std::nullopt;
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::LoopPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (std::optional<int> AttrCost =
            getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
      Cost = *AttrCost;

    if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
            CandidateCall,
            InlineConstants::FunctionInlineCostMultiplierAttributeName))
      Cost *= *AttrCostMult;

    if (std::optional<int> AttrThreshold =
            getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
      Threshold = *AttrThreshold;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (*Result)
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold)
      return InlineResult::success();

    DecidedByCostThreshold = true;
    return Cost < std::max(1, Threshold)
               ? InlineResult::success()
               : InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    if (IgnoreThreshold || ComputeFullInlineCost)
      return false;
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost < Threshold)
      return false;
    DecidedByCostThreshold = true;
    return true;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InstrCost;
  }
onAnalysisStart() override
{
1072 // Perform some tweaks to the cost and threshold based on the direct
1073 // callsite information.
1075 // We want to more aggressively inline vector-dense kernels, so up the
1076 // threshold, and we'll lower it if the % of vector instructions gets too
1077 // low. Note that these bonuses are some what arbitrary and evolved over
1078 // time by accident as much as because they are principled bonuses.
1080 // FIXME: It would be nice to remove all such bonuses. At least it would be
1081 // nice to base the bonus values on something more scientific.
1082 assert(NumInstructions
== 0);
1083 assert(NumVectorInstructions
== 0);
1085 // Update the threshold based on callsite properties
1086 updateThreshold(CandidateCall
, F
);
1088 // While Threshold depends on commandline options that can take negative
1089 // values, we want to enforce the invariant that the computed threshold and
1090 // bonuses are non-negative.
1091 assert(Threshold
>= 0);
1092 assert(SingleBBBonus
>= 0);
1093 assert(VectorBonus
>= 0);
1095 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1096 // this Threshold any time, and cost cannot decrease, we can stop processing
1097 // the rest of the function body.
1098 Threshold
+= (SingleBBBonus
+ VectorBonus
);
1100 // Give out bonuses for the callsite, as the instructions setting them up
1101 // will be gone after inlining.
1102 addCost(-getCallsiteCost(TTI
, this->CandidateCall
, DL
));
1104 // If this function uses the coldcc calling convention, prefer not to inline
1106 if (F
.getCallingConv() == CallingConv::Cold
)
1107 Cost
+= InlineConstants::ColdccPenalty
;
1109 LLVM_DEBUG(dbgs() << " Initial cost: " << Cost
<< "\n");
1111 // Check if we're done. This can happen due to bonuses and penalties.
1112 if (Cost
>= Threshold
&& !ComputeFullInlineCost
)
1113 return InlineResult::failure("high cost");
1115 return InlineResult::success();

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI, PSI,
                     ORE),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {
    AllowRecursiveCall = *Params.AllowRecursiveCall;
  }

  /// Annotation Writer for instruction details
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print(raw_ostream &OS);

  std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    auto It = InstructionCostDetailMap.find(I);
    if (It != InstructionCostDetailMap.end())
      return It->second;
    return std::nullopt;
  }

  virtual ~InlineCostCallAnalyzer() = default;
  int getThreshold() const { return Threshold; }
  int getCost() const { return Cost; }
  int getStaticBonusApplied() const { return StaticBonusApplied; }
  std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
  bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
  bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
};

// Return true if CB is the sole call to local function Callee.
static bool isSoleCallToLocalFunction(const CallBase &CB,
                                      const Function &Callee) {
  return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
         &Callee == CB.getCalledFunction();
}

class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
private:
  InlineCostFeatures Cost = {};

  // FIXME: These constants are taken from the heuristic-based cost visitor.
  // These should be removed entirely in a later revision to avoid reliance on
  // heuristics in the ML inliner.
  static constexpr int JTCostMultiplier = 2;
  static constexpr int CaseClusterCostMultiplier = 2;
  static constexpr int SwitchDefaultDestCostMultiplier = 2;
  static constexpr int SwitchCostMultiplier = 2;

  // FIXME: These are taken from the heuristic-based cost visitor: we should
  // eventually abstract these to the CallAnalyzer to avoid duplication.
  unsigned SROACostSavingOpportunities = 0;
  int VectorBonus = 0;
  int SingleBBBonus = 0;
  int Threshold = 5;

  DenseMap<AllocaInst *, unsigned> SROACosts;

  void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
    Cost[static_cast<size_t>(Feature)] += Delta;
  }

  void set(InlineCostFeatureIndex Feature, int64_t Value) {
    Cost[static_cast<size_t>(Feature)] = Value;
  }
*Arg
) override
{
1202 auto CostIt
= SROACosts
.find(Arg
);
1203 if (CostIt
== SROACosts
.end())
1206 increment(InlineCostFeatureIndex::sroa_losses
, CostIt
->second
);
1207 SROACostSavingOpportunities
-= CostIt
->second
;
1208 SROACosts
.erase(CostIt
);
1211 void onDisableLoadElimination() override
{
1212 set(InlineCostFeatureIndex::load_elimination
, 1);
1215 void onCallPenalty() override
{
1216 increment(InlineCostFeatureIndex::call_penalty
, CallPenalty
);
1219 void onCallArgumentSetup(const CallBase
&Call
) override
{
1220 increment(InlineCostFeatureIndex::call_argument_setup
,
1221 Call
.arg_size() * InstrCost
);
1224 void onLoadRelativeIntrinsic() override
{
1225 increment(InlineCostFeatureIndex::load_relative_intrinsic
, 3 * InstrCost
);
1228 void onLoweredCall(Function
*F
, CallBase
&Call
,
1229 bool IsIndirectCall
) override
{
1230 increment(InlineCostFeatureIndex::lowered_call_arg_setup
,
1231 Call
.arg_size() * InstrCost
);
1233 if (IsIndirectCall
) {
1234 InlineParams IndirectCallParams
= {/* DefaultThreshold*/ 0,
1235 /*HintThreshold*/ {},
1236 /*ColdThreshold*/ {},
1237 /*OptSizeThreshold*/ {},
1238 /*OptMinSizeThreshold*/ {},
1239 /*HotCallSiteThreshold*/ {},
1240 /*LocallyHotCallSiteThreshold*/ {},
1241 /*ColdCallSiteThreshold*/ {},
1242 /*ComputeFullInlineCost*/ true,
1243 /*EnableDeferral*/ true};
1244 IndirectCallParams
.DefaultThreshold
=
1245 InlineConstants::IndirectCallThreshold
;
1247 InlineCostCallAnalyzer
CA(*F
, Call
, IndirectCallParams
, TTI
,
1248 GetAssumptionCache
, GetBFI
, GetTLI
, PSI
, ORE
,
1250 if (CA
.analyze().isSuccess()) {
1251 increment(InlineCostFeatureIndex::nested_inline_cost_estimate
,
1253 increment(InlineCostFeatureIndex::nested_inlines
, 1);

  void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
                        bool DefaultDestUndefined) override {
    if (JumpTableSize) {
      if (!DefaultDestUndefined)
        increment(InlineCostFeatureIndex::switch_default_dest_penalty,
                  SwitchDefaultDestCostMultiplier * InstrCost);
      int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
                       JTCostMultiplier * InstrCost;
      increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
      return;
    }

    if (NumCaseCluster <= 3) {
      increment(InlineCostFeatureIndex::case_cluster_penalty,
                (NumCaseCluster - DefaultDestUndefined) *
                    CaseClusterCostMultiplier * InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare =
        getExpectedNumberOfCompare(NumCaseCluster);

    int64_t SwitchCost =
        ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
    increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
  }

  void onMissedSimplification() override {
    increment(InlineCostFeatureIndex::unsimplified_common_instructions,
              InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
    SROACosts[Arg] = SROAArgCost;
    SROACostSavingOpportunities += SROAArgCost;
  }

  void onAggregateSROAUse(AllocaInst *Arg) override {
    SROACosts.find(Arg)->second += InstrCost;
    SROACostSavingOpportunities += InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (BB->getTerminator()->getNumSuccessors() > 1)
      set(InlineCostFeatureIndex::is_multiple_blocks, 1);
    Threshold -= SingleBBBonus;
  }

  InlineResult finalizeAnalysis() override {
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        increment(InlineCostFeatureIndex::num_loops,
                  InlineConstants::LoopPenalty);
      }
    }
    set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
    set(InlineCostFeatureIndex::simplified_instructions,
        NumInstructionsSimplified);
    set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
    set(InlineCostFeatureIndex::constant_offset_ptr_args,
        NumConstantOffsetPtrArgs);
    set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);

    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    set(InlineCostFeatureIndex::threshold, Threshold);

    return InlineResult::success();
  }

  bool shouldStop() override { return false; }

  void onLoadEliminationOpportunity() override {
    increment(InlineCostFeatureIndex::load_elimination, 1);
  }

  InlineResult onAnalysisStart() override {
    increment(InlineCostFeatureIndex::callsite_cost,
              -1 * getCallsiteCost(TTI, this->CandidateCall, DL));

    set(InlineCostFeatureIndex::cold_cc_penalty,
        (F.getCallingConv() == CallingConv::Cold));

    set(InlineCostFeatureIndex::last_call_to_static_bonus,
        isSoleCallToLocalFunction(CandidateCall, F));

    // FIXME: we shouldn't repeat this logic in both the Features and Cost
    // analyzer - instead, we should abstract it to a common method in the
    // CallAnalyzer.
    int SingleBBBonusPercent = 50;
    int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
    Threshold += TTI.adjustInliningThreshold(&CandidateCall);
    Threshold *= TTI.getInliningThresholdMultiplier();
    SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
    VectorBonus = Threshold * VectorBonusPercent / 100;
    Threshold += (SingleBBBonus + VectorBonus);

    return InlineResult::success();
  }

public:
  InlineCostFeaturesAnalyzer(
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
      function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, GetTLI,
                     PSI, ORE) {}

  const InlineCostFeatures &features() const { return Cost; }
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining of the given instruction is printed always.
  // The threshold delta is printed only when it is non-zero. It happens
  // when we decided to give a bonus at a particular instruction.
  std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
  if (!Record)
    OS << "; No analysis for the instruction";
  else {
    OS << "; cost before = " << Record->CostBefore
       << ", cost after = " << Record->CostAfter
       << ", threshold before = " << Record->ThresholdBefore
       << ", threshold after = " << Record->ThresholdAfter << ", ";
    OS << "cost delta = " << Record->getCostDelta();
    if (Record->hasThresholdChanged())
      OS << ", threshold delta = " << Record->getThresholdDelta();
  }
  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
  if (C) {
    OS << ", simplified to ";
    (*C)->print(OS, true);
  }
  OS << "\n";
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
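/// For example, a GEP through a struct field at byte offset 8 followed by
/// index 3 into an i32 array accumulates a constant offset of 8 + 3 * 4 = 20
/// bytes.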
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC =
        getDirectOrSimplifiedValue<ConstantInt>(GTI.getOperand());
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (const Use &Op : GEP.indices())
    if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(Op);
  return TTI.getInstructionCost(&GEP, Operands,
                                TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  disableSROA(I.getOperand(0));

  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      // Sometimes a dynamic alloca could be converted into a static alloca
      // after this constant prop, and become a huge static alloca on an
      // unconditional CFG path. Avoid inlining if this is going to happen above
      // a threshold.
      // FIXME: If the threshold is removed or lowered too much, we could end up
      // being too pessimistic and prevent inlining non-problematic code. This
      // could result in unintended perf regressions. A better overall strategy
      // is needed to track stack usage during inlining.
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(),
          DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
        HasDynamicAlloca = true;
      return false;
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
                                  AllocatedSize);
  }

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  if (!I.isStaticAlloca())
    HasDynamicAlloca = true;

  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = getDirectOrSimplifiedValue<Constant>(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer with
    // constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}
/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into the base-and-offset pair.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}
bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (const Use &Op : GEP.indices())
      if (!getDirectOrSimplifiedValue<Constant>(Op))
        return false;
    return true;
  };

  if (!DisableGEPConstOperand)
    if (simplifyInstruction(I))
      return true;

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROAArg)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROAArg)
    disableSROAForArg(SROAArg);
  return isGEPFree(I);
}
/// Simplify \p I if its operands are constants and update SimplifiedValues.
bool CallAnalyzer::simplifyInstruction(Instruction &I) {
  SmallVector<Constant *> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = getDirectOrSimplifiedValue<Constant>(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = ConstantFoldInstOperands(&I, COps, DL);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}
/// Try to simplify a call to llvm.is.constant.
///
/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
/// we expect calls of this specific intrinsic to be infrequent.
///
/// FIXME: Given that we know CB's parent (F) caller
/// (CandidateCall->getParent()->getParent()), we might be able to determine
/// whether inlining F into F's caller would change how the call to
/// llvm.is.constant would evaluate.
bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
  Value *Arg = CB.getArgOperand(0);
  auto *C = getDirectOrSimplifiedValue<Constant>(Arg);

  Type *RT = CB.getFunctionType()->getReturnType();
  SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
  return true;
}
bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
  // As per the langref, "The fourth argument to llvm.objectsize determines if
  // the value should be evaluated at runtime."
  if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
    return false;

  Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
                                 /*MustSucceed=*/true);
  Constant *C = dyn_cast_or_null<Constant>(V);
  if (C)
    SimplifiedValues[&CB] = C;
  return C;
}
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize == DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  if (auto *SROAArg = getSROAArgForValueOrNull(Op))
    SROAArgValues[&I] = SROAArg;

  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}
bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I))
    return true;

  // Disable SROA in the face of arbitrary casts we don't explicitly list
  // elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      onCallPenalty();
    break;
  default:
    break;
  }

  return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}
bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}
bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, inlining a hot call that is followed by a call to
  // an unreachable function may still be beneficial.
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}
bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
                                            BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(Call, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}
std::optional<int>
InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                                BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return std::nullopt;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BasicBlock *CallSiteBB = Call.getParent();
  BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
  std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
  if (Limit && CallSiteFreq >= *Limit)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return std::nullopt;
}
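// Illustrative numbers (assumptions, not taken from this file): with a
// cold-callsite-rel-freq of 2, a call site whose block frequency is below 2%
// of the caller's entry frequency is treated as cold above; with a
// hot-callsite-rel-freq of 60, a call site executed at least 60x the caller's
// entry frequency qualifies for the locally-hot threshold.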
void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, std::optional<int> B) {
    return B ? std::min(A, *B) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, std::optional<int> B) {
    return B ? std::max(A, *B) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = TTI.getInliningLastCallToStaticBonus();

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = *HotCallSiteThreshold;
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  Threshold += TTI.adjustInliningThreshold(&Call);

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (isSoleCallToLocalFunction(Call, F)) {
    Cost -= LastCallToStaticBonus;
    StaticBonusApplied = LastCallToStaticBonus;
  }
}
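// Worked example of the bonus arithmetic above (illustrative only): with the
// default -inline-threshold of 225 and SingleBBBonusPercent = 50, a
// single-block callee is speculatively granted SingleBBBonus = 225 * 50 / 100
// = 112 of extra budget; a vector bonus percent of, say, 150 (a value assumed
// here for illustration) would similarly yield VectorBonus = 337.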
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      SimplifiedValues[&I] = ConstantInt::getBool(
          I.getType(),
          ICmpInst::compare(LHSOffset, RHSOffset, I.getPredicate()));
      ++NumConstantPtrCmps;
      return true;
    }
  }

  auto isImplicitNullCheckCmp = [](const CmpInst &I) {
    for (auto *User : I.users())
      if (auto *Instr = dyn_cast<Instruction>(User))
        if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
          return false;
    return true;
  };

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
    if (isKnownNonNullInCallee(I.getOperand(0))) {
      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                        : ConstantInt::getFalse(I.getType());
      return true;
    }
    // Implicit null checks act as unconditional branches and their comparisons
    // should be treated as simplified and free of cost.
    if (isImplicitNullCheckCmp(I))
      return true;
  }
  return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
}
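// Illustrative example (not part of the original source): if both operands of
// an icmp are GEPs off the same alloca-derived argument, e.g. &a[4] and &a[8]
// for an i32 array, ConstantOffsetPtrs records offsets 16 and 32 from the
// common base, so the comparison above folds to a constant without needing to
// know the pointer values themselves.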
bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = getDirectOrSimplifiedValue<Constant>(LHS);
  Constant *CRHS = getDirectOrSimplifiedValue<Constant>(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
                            FI->getFastMathFlags(), DL);
  else
    SimpleV =
        simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such. Unless it's fneg which can be implemented with an xor.
  using namespace llvm::PatternMatch;
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
      !match(&I, m_FNeg(m_Value())))
    onCallPenalty();

  return false;
}
bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
  Value *Op = I.getOperand(0);
  Constant *COp = getDirectOrSimplifiedValue<Constant>(Op);

  Value *SimpleV = simplifyFNegInst(
      COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
  disableSROA(Op);

  return false;
}
bool CallAnalyzer::visitLoad(LoadInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    onLoadEliminationOpportunity();
    return true;
  }

  return false;
}
bool CallAnalyzer::visitStore(StoreInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();

  return false;
}
bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitExtractValue(I);
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I))
    return true;

  // SROA can't look through these, but they may be free.
  return Base::visitInsertValue(I);
}
/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(&Call, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(Call.arg_size());
  for (Value *I : Call.args()) {
    Constant *C = getDirectOrSimplifiedValue<Constant>(I);
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
    SimplifiedValues[&Call] = C;
    return true;
  }

  return false;
}
bool CallAnalyzer::isLoweredToCall(Function *F, CallBase &Call) {
  const TargetLibraryInfo *TLI = GetTLI ? &GetTLI(*F) : nullptr;
  LibFunc LF;
  if (!TLI || !TLI->getLibFunc(*F, LF) || !TLI->has(LF))
    return TTI.isLoweredToCall(F);

  switch (LF) {
  case LibFunc_memcpy_chk:
  case LibFunc_memmove_chk:
  case LibFunc_mempcpy_chk:
  case LibFunc_memset_chk: {
    // Calls to __memcpy_chk whose length is known to fit within the object
    // size will eventually be replaced by inline stores. Therefore, these
    // should not incur a call penalty. This is only really relevant on
    // platforms whose headers redirect memcpy to __memcpy_chk (e.g. Darwin), as
    // other platforms use memcpy intrinsics, which are already exempt from the
    // call penalty.
    auto *LenOp = getDirectOrSimplifiedValue<ConstantInt>(Call.getOperand(2));
    auto *ObjSizeOp =
        getDirectOrSimplifiedValue<ConstantInt>(Call.getOperand(3));
    if (LenOp && ObjSizeOp &&
        LenOp->getLimitedValue() <= ObjSizeOp->getLimitedValue())
      return false;
    break;
  }
  default:
    break;
  }

  return TTI.isLoweredToCall(F);
}
bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (!onCallBaseVisitStart(Call))
    return true;

  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  Function *F = Call.getCalledFunction();
  bool IsIndirectCall = !F;
  if (IsIndirectCall) {
    // Check if this happens to be an indirect function call to a known
    // function in this inline context. If not, we've done all we can.
    Value *Callee = Call.getCalledOperand();
    F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
    if (!F || F->getFunctionType() != Call.getFunctionType()) {
      onCallArgumentSetup(Call);

      if (!Call.onlyReadsMemory())
        disableLoadElimination();
      return Base::visitCallBase(Call);
    }
  }

  assert(F && "Expected a call to a known function");

  // When we have a concrete function, first try to simplify it directly.
  if (simplifyCallSite(F, Call))
    return true;

  // Next check if it is an intrinsic we know about.
  // FIXME: Lift this into part of the InstVisitor.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
    switch (II->getIntrinsicID()) {
    default:
      if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
        disableLoadElimination();
      return Base::visitCallBase(Call);

    case Intrinsic::load_relative:
      onLoadRelativeIntrinsic();
      return false;

    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      disableLoadElimination();
      // SROA can usually chew through these intrinsics, but they aren't free.
      return false;
    case Intrinsic::icall_branch_funnel:
    case Intrinsic::localescape:
      HasUninlineableIntrinsic = true;
      return false;
    case Intrinsic::vastart:
      InitsVargArgs = true;
      return false;
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
        SROAArgValues[II] = SROAArg;
      return true;
    case Intrinsic::is_constant:
      return simplifyIntrinsicCallIsConstant(Call);
    case Intrinsic::objectsize:
      return simplifyIntrinsicCallObjectSize(Call);
    }
  }

  if (F == Call.getFunction()) {
    // This flag will fully abort the analysis, so don't bother with anything
    // else.
    IsRecursiveCall = true;
    if (!AllowRecursiveCall)
      return false;
  }

  if (isLoweredToCall(F, Call)) {
    onLoweredCall(F, Call, IsIndirectCall);
  }

  if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
    disableLoadElimination();
  return Base::visitCallBase(Call);
}
bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() ||
         getDirectOrSimplifiedValue<ConstantInt>(BI.getCondition()) ||
         BI.getMetadata(LLVMContext::MD_make_implicit);
}
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = getDirectOrSimplifiedValue<Constant>(TrueVal);
  Constant *FalseC = getDirectOrSimplifiedValue<Constant>(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()  ? TrueVal
                     : (CondC->isNullValue()) ? FalseVal
                                              : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantFoldSelectInstruction() can handle the
    // cases such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}
bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (getDirectOrSimplifiedValue<ConstantInt>(SI.getCondition()))
    return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  unsigned JumpTableSize = 0;
  BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);

  onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined());
  return false;
}
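// Illustrative sketch of the cost model above (numbers are assumptions): a
// switch over 100 densely packed cases is likely lowered to a jump table, so
// its cost tracks the table range, while 100 widely scattered cases form a
// balanced tree of case clusters roughly log2(100) ~= 7 comparisons deep;
// onFinalizeSwitch lets the derived analyzer charge the callsite accordingly.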
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}
bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
      TargetTransformInfo::TCC_Free)
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (const Use &Op : I.operands())
    disableSROA(Op);

  return false;
}
/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           SmallPtrSetImpl<const Value *> &EphValues) {
  for (Instruction &I : *BB) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    // Similarly, skip pseudo-probes.
    if (I.isDebugOrPseudoInst())
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    onInstructionAnalysisStart(&I);

    if (Base::visit(&I))
      ++NumInstructionsSimplified;
    else
      onMissedSimplification();

    onInstructionAnalysisFinish(&I);
    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    InlineResult IR = InlineResult::success();
    if (IsRecursiveCall && !AllowRecursiveCall)
      IR = InlineResult::failure("recursive");
    else if (ExposesReturnsTwice)
      IR = InlineResult::failure("exposes returns twice");
    else if (HasDynamicAlloca)
      IR = InlineResult::failure("dynamic alloca");
    else if (HasIndirectBr)
      IR = InlineResult::failure("indirect branch");
    else if (HasUninlineableIntrinsic)
      IR = InlineResult::failure("uninlinable intrinsic");
    else if (InitsVargArgs)
      IR = InlineResult::failure("varargs");
    if (!IR.isSuccess()) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " has uninlinable pattern ("
                 << NV("InlineResult", IR.getFailureReason())
                 << ") and cost is not fully computed";
        });
      return IR;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
      auto IR =
          InlineResult::failure("recursive and allocates too much stack space");
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " is "
                 << NV("InlineResult", IR.getFailureReason())
                 << ". Cost is not fully computed";
        });
      return IR;
    }

    if (shouldStop())
      return InlineResult::failure(
          "Call site analysis is not favorable to inlining.");
  }

  return InlineResult::success();
}
/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getZero(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        break;
      V = GEP->getPointerOperand();
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IdxPtrTy = DL.getIndexType(V->getType());
  return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
}
/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead).second)
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}
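// Illustrative example (not from the original source): if a callee contains
//   br i1 %flag, label %then, label %else
// and %flag simplifies to true for this call site, the caller of this helper
// records %then as the known successor; %else is then dead here unless it has
// another live incoming edge, and the loop above keeps extending the dead set
// through the exclusive successors of %else.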
/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = CandidateCall.arg_begin();
  for (Argument &FAI : F.args()) {
    assert(CAI != CandidateCall.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
        SROAArgValues[&FAI] = SROAArg;
        onInitializeSROAArg(SROAArg);
        EnabledSROAAllocas.insert(SROAArg);
      }
    }
    ++CAI;
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    if (shouldStop())
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    onBlockStart(BB);

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches. But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (BasicBlock *Succ : successors(BB))
      BBWorklist.insert(Succ);

    onBlockAnalyzed(BB);
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
    return InlineResult::failure("noduplicate");

  // If the callee's stack size exceeds the user-specified threshold,
  // do not let it be inlined.
  // The command line option overrides a limit set in the function attributes.
  size_t FinalStackSizeThreshold = StackSizeThreshold;
  if (!StackSizeThreshold.getNumOccurrences())
    if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
            Caller, InlineConstants::MaxInlineStackSizeAttributeName))
      FinalStackSizeThreshold = *AttrMaxStackSize;
  if (AllocatedSize > FinalStackSizeThreshold)
    return InlineResult::failure("stacksize");

  return finalizeAnalysis();
}
void InlineCostCallAnalyzer::print(raw_ostream &OS) {
#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
  if (PrintInstructionComments)
    F.print(OS, &Writer);
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
#endif
/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(
    Function *Caller, Function *Callee, TargetTransformInfo &TTI,
    function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
  // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
  // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
  // object, and always returns the same object (which is overwritten on each
  // GetTLI call). Therefore we copy the first result.
  auto CalleeTLI = GetTLI(*Callee);
  return (IgnoreTTIInlineCompatible ||
          TTI.areInlineCompatible(Caller, Callee)) &&
         GetTLI(*Caller).areInlineCompatible(CalleeTLI,
                                             InlineCallerSupersetNoBuiltin) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}
int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
                          const DataLayout &DL) {
  int64_t Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InstrCost;
  Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);

  return std::min<int64_t>(Cost, INT_MAX);
}
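// Worked example (illustrative): a byval argument of 256 bits on a target with
// 64-bit pointers needs ceil(256 / 64) = 4 word copies, so the loop above adds
// 2 * 4 * InstrCost for the loads and stores; anything above 8 stores is
// clamped because such a copy would likely be emitted as an inline memcpy
// instead.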
InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
}
std::optional<int> llvm::getInliningCostEstimate(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  const InlineParams Params = {/* DefaultThreshold*/ 0,
                               /*HintThreshold*/ {},
                               /*ColdThreshold*/ {},
                               /*OptSizeThreshold*/ {},
                               /*OptMinSizeThreshold*/ {},
                               /*HotCallSiteThreshold*/ {},
                               /*LocallyHotCallSiteThreshold*/ {},
                               /*ColdCallSiteThreshold*/ {},
                               /*ComputeFullInlineCost*/ true,
                               /*EnableDeferral*/ true};

  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, GetTLI, PSI, ORE, true,
                            /*IgnoreThreshold*/ true);
  auto R = CA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CA.getCost();
}
std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, GetTLI,
                                 PSI, ORE, *Call.getCalledFunction(), Call);
  auto R = CFA.analyze();
  if (!R.isSuccess())
    return std::nullopt;
  return CFA.features();
}
std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure("indirect call");

  // When a callee coroutine function is inlined into a caller coroutine
  // function before the coro-split pass, the coro-early pass cannot handle
  // this well, so we don't inline a coroutine function that has not yet been
  // split.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure("unsplited coroutine call");

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure("byval arguments without alloca"
                                     " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
      return InlineResult::failure("noinline call site attribute");

    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
    return InlineResult::failure("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure("noinline call site attribute");

  return std::nullopt;
}
InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n");

  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, GetTLI, PSI, ORE);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make cost-benefit-based decisions explicit.
  // We use always/never here since the threshold is not meaningful,
  // as it is not what drives cost-benefit analysis.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways("benefit over cost",
                                   CA.getCostBenefitPair());
    return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
  }

  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(CA.getCost(), CA.getThreshold(),
                           CA.getStaticBonusApplied());

  // No details on how the decision was made; simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways("empty function")
             : InlineCost::getNever(ShouldInline.getFailureReason());
}
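
// Illustrative sketch (assumed client code, not part of this file): an
// inliner pass would typically interpret the returned InlineCost along these
// lines, using the isAlways/isNever/getCost/getThreshold accessors from
// llvm/Analysis/InlineCost.h. Variable names are placeholders.
// \code
//   InlineCost IC = getInlineCost(CB, Params, CalleeTTI, GetAssumptionCache,
//                                 GetTLI, GetBFI, PSI, ORE);
//   if (IC.isAlways())
//     ; // inline unconditionally
//   else if (IC.isNever())
//     ; // never inline this call site
//   else if (IC.getCost() < IC.getThreshold())
//     ; // profitable under the current threshold
// \endcode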
InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the current
          // backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing
          // this correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}
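
// Illustrative sketch (assumed client code, not part of this file): forced
// inlining, e.g. for alwaysinline callees, is typically gated on
// isInlineViable before performing the transformation; the AlwaysInliner
// follows this pattern. InlineFunction and InlineFunctionInfo live in
// llvm/Transforms/Utils/Cloning.h.
// \code
//   if (isInlineViable(*Callee).isSuccess()) {
//     InlineFunctionInfo IFI;
//     InlineFunction(*CB, IFI);
//   }
// \endcode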
// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that the optimization level is O3 (in the getInlineParams variant
  // that takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inlinehint-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}
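
// Worked example (illustrative only; the numbers are placeholders): with
// -inline-threshold=500 given explicitly, getInlineParams(75) still yields
// DefaultThreshold == 500, because the flag overrides the passed-in value;
// without the flag, DefaultThreshold == 75 and the size/cold knobs above are
// populated as well.
// \code
//   InlineParams P = llvm::getInlineParams(/*Threshold=*/75);
//   // P.DefaultThreshold == 75, or 500 if -inline-threshold=500 was given.
// \endcode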
InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}
// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}
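
// Worked example (illustrative only): computeThresholdFromOptLevels(3, 0)
// returns InlineConstants::OptAggressiveThreshold (the -O3 setting),
// (2, 1) returns InlineConstants::OptSizeThreshold (-Os), (2, 2) returns
// InlineConstants::OptMinSizeThreshold (-Oz), and (2, 0) falls back to the
// -inlinedefault-threshold value (225 unless overridden).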
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has an
  // effect only when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}
PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  TargetTransformInfo TTI(M->getDataLayout());
  // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
  // In the current implementation, the type of InlineParams doesn't matter as
  // the pass serves only for verification of the inliner's decisions.
  // We can add a flag which determines the InlineParams for this run. Right
  // now, the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *CalledFunction = CB->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI,
                                    GetAssumptionCache, nullptr, nullptr, &PSI,
                                    &ORE);
        ICCA.analyze();
        OS << " Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CB->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}
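
// Usage note (the pass-name spelling is an assumption; the registration lives
// in PassRegistry.def, not in this file): this printer pass is typically
// exercised from the command line along the lines of
// \code
//   opt -passes='print<inline-cost>' -disable-output input.ll
// \endcode
// which prints the analyzer's per-instruction annotations for each direct
// call to a defined callee in the function being analyzed.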