1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements inline cost analysis.
12 //===----------------------------------------------------------------------===//
14 #include "llvm/Analysis/InlineCost.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/CFG.h"
25 #include "llvm/Analysis/InstructionSimplify.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/Analysis/ValueTracking.h"
29 #include "llvm/Config/llvm-config.h"
30 #include "llvm/IR/CallSite.h"
31 #include "llvm/IR/CallingConv.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/GetElementPtrTypeIterator.h"
34 #include "llvm/IR/GlobalAlias.h"
35 #include "llvm/IR/InstVisitor.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Operator.h"
38 #include "llvm/Support/Debug.h"
39 #include "llvm/Support/raw_ostream.h"
41 using namespace llvm;
43 #define DEBUG_TYPE "inline-cost"
45 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
47 static cl::opt<int> InlineThreshold(
48 "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
49 cl::desc("Control the amount of inlining to perform (default = 225)"));
51 static cl::opt<int> HintThreshold(
52 "inlinehint-threshold", cl::Hidden, cl::init(325),
53 cl::desc("Threshold for inlining functions with inline hint"));
55 static cl::opt<int>
56 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
57 cl::init(45),
58 cl::desc("Threshold for inlining cold callsites"));
60 // We introduce this threshold to help performance of instrumentation-based
61 // PGO before we actually hook up the inliner with analysis passes such as BPI
62 // and BFI.
63 static cl::opt<int> ColdThreshold(
64 "inlinecold-threshold", cl::Hidden, cl::init(45),
65 cl::desc("Threshold for inlining functions with cold attribute"));
67 static cl::opt<int>
68 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
69 cl::ZeroOrMore,
70 cl::desc("Threshold for hot callsites "));
72 static cl::opt<int> LocallyHotCallSiteThreshold(
73 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
74 cl::desc("Threshold for locally hot callsites "));
76 static cl::opt<int> ColdCallSiteRelFreq(
77 "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
78 cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
79 "entry frequency, for a callsite to be cold in the absence of "
80 "profile information."));
82 static cl::opt<int> HotCallSiteRelFreq(
83 "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
84 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
85 "entry frequency, for a callsite to be hot in the absence of "
86 "profile information."));
88 static cl::opt<bool> OptComputeFullInlineCost(
89 "inline-cost-full", cl::Hidden, cl::init(false),
90 cl::desc("Compute the full inline cost of a call site even when the cost "
91 "exceeds the threshold."));
93 namespace {
95 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
96 typedef InstVisitor<CallAnalyzer, bool> Base;
97 friend class InstVisitor<CallAnalyzer, bool>;
99 /// The TargetTransformInfo available for this compilation.
100 const TargetTransformInfo &TTI;
102 /// Getter for the cache of @llvm.assume intrinsics.
103 std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
105 /// Getter for BlockFrequencyInfo
106 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
108 /// Profile summary information.
109 ProfileSummaryInfo *PSI;
111 /// The called function.
112 Function &F;
114 // Cache the DataLayout since we use it a lot.
115 const DataLayout &DL;
117 /// The OptimizationRemarkEmitter available for this compilation.
118 OptimizationRemarkEmitter *ORE;
120 /// The candidate callsite being analyzed. Please do not use this to do
121 /// analysis in the caller function; we want the inline cost query to be
122 /// easily cacheable. Instead, use the cover function paramHasAttr.
123 CallSite CandidateCS;
125 /// Tunable parameters that control the analysis.
126 const InlineParams &Params;
128 int Threshold;
129 int Cost;
130 bool ComputeFullInlineCost;
132 bool IsCallerRecursive;
133 bool IsRecursiveCall;
134 bool ExposesReturnsTwice;
135 bool HasDynamicAlloca;
136 bool ContainsNoDuplicateCall;
137 bool HasReturn;
138 bool HasIndirectBr;
139 bool HasUninlineableIntrinsic;
140 bool InitsVargArgs;
142 /// Number of bytes allocated statically by the callee.
143 uint64_t AllocatedSize;
144 unsigned NumInstructions, NumVectorInstructions;
145 int VectorBonus, TenPercentVectorBonus;
146 // Bonus to be applied when the callee has only one reachable basic block.
147 int SingleBBBonus;
149 /// While we walk the potentially-inlined instructions, we build up and
150 /// maintain a mapping of simplified values specific to this callsite. The
151 /// idea is to propagate any special information we have about arguments to
152 /// this call through the inlinable section of the function, and account for
153 /// likely simplifications post-inlining. The most important aspect we track
154 /// is CFG altering simplifications -- when we prove a basic block dead, that
155 /// can cause dramatic shifts in the cost of inlining a function.
156 DenseMap<Value *, Constant *> SimplifiedValues;
158 /// Keep track of the values which map back (through function arguments) to
159 /// allocas on the caller stack which could be simplified through SROA.
160 DenseMap<Value *, Value *> SROAArgValues;
162 /// The mapping of caller Alloca values to their accumulated cost savings. If
163 /// we have to disable SROA for one of the allocas, this tells us how much
164 /// cost must be added.
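/// For example, if an alloca-derived pointer feeds two simple loads, the
/// corresponding entry accumulates 2 * InstrCost of expected savings; if SROA
/// is later disabled for that alloca, the same amount is added back to Cost.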
165 DenseMap<Value *, int> SROAArgCosts;
167 /// Keep track of values which map to a pointer base and constant offset.
168 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
170 /// Keep track of dead blocks due to the constant arguments.
171 SetVector<BasicBlock *> DeadBlocks;
173 /// The mapping of the blocks to their known unique successors due to the
174 /// constant arguments.
175 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
177 /// Model the elimination of repeated loads that is expected to happen
178 /// whenever we simplify away the stores that would otherwise cause them to be
179 /// loads.
180 bool EnableLoadElimination;
181 SmallPtrSet<Value *, 16> LoadAddrSet;
182 int LoadEliminationCost;
184 // Custom simplification helper routines.
185 bool isAllocaDerivedArg(Value *V);
186 bool lookupSROAArgAndCost(Value *V, Value *&Arg,
187 DenseMap<Value *, int>::iterator &CostIt);
188 void disableSROA(DenseMap<Value *, int>::iterator CostIt);
189 void disableSROA(Value *V);
190 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
191 void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
192 int InstructionCost);
193 void disableLoadElimination();
194 bool isGEPFree(GetElementPtrInst &GEP);
195 bool canFoldInboundsGEP(GetElementPtrInst &I);
196 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
197 bool simplifyCallSite(Function *F, CallSite CS);
198 template <typename Callable>
199 bool simplifyInstruction(Instruction &I, Callable Evaluate);
200 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
202 /// Return true if the given argument to the function being considered for
203 /// inlining has the given attribute set either at the call site or the
204 /// function declaration. Primarily used to inspect call site specific
205 /// attributes since these can be more precise than the ones on the callee
206 /// itself.
207 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
209 /// Return true if the given value is known non null within the callee if
210 /// inlined through this particular callsite.
211 bool isKnownNonNullInCallee(Value *V);
213 /// Update Threshold based on callsite properties such as callee
214 /// attributes and callee hotness for PGO builds. The Callee is explicitly
215 /// passed to support analyzing indirect calls whose target is inferred by
216 /// analysis.
217 void updateThreshold(CallSite CS, Function &Callee);
219 /// Return true if size growth is allowed when inlining the callee at CS.
220 bool allowSizeGrowth(CallSite CS);
222 /// Return true if \p CS is a cold callsite.
223 bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
225 /// Return a higher threshold if \p CS is a hot callsite.
226 Optional<int> getHotCallSiteThreshold(CallSite CS,
227 BlockFrequencyInfo *CallerBFI);
229 // Custom analysis routines.
230 InlineResult analyzeBlock(BasicBlock *BB,
231 SmallPtrSetImpl<const Value *> &EphValues);
233 // Disable several entry points to the visitor so we don't accidentally use
234 // them by declaring but not defining them here.
235 void visit(Module *);
236 void visit(Module &);
237 void visit(Function *);
238 void visit(Function &);
239 void visit(BasicBlock *);
240 void visit(BasicBlock &);
242 // Provide base case for our instruction visit.
243 bool visitInstruction(Instruction &I);
245 // Our visit overrides.
246 bool visitAlloca(AllocaInst &I);
247 bool visitPHI(PHINode &I);
248 bool visitGetElementPtr(GetElementPtrInst &I);
249 bool visitBitCast(BitCastInst &I);
250 bool visitPtrToInt(PtrToIntInst &I);
251 bool visitIntToPtr(IntToPtrInst &I);
252 bool visitCastInst(CastInst &I);
253 bool visitUnaryInstruction(UnaryInstruction &I);
254 bool visitCmpInst(CmpInst &I);
255 bool visitSub(BinaryOperator &I);
256 bool visitBinaryOperator(BinaryOperator &I);
257 bool visitLoad(LoadInst &I);
258 bool visitStore(StoreInst &I);
259 bool visitExtractValue(ExtractValueInst &I);
260 bool visitInsertValue(InsertValueInst &I);
261 bool visitCallSite(CallSite CS);
262 bool visitReturnInst(ReturnInst &RI);
263 bool visitBranchInst(BranchInst &BI);
264 bool visitSelectInst(SelectInst &SI);
265 bool visitSwitchInst(SwitchInst &SI);
266 bool visitIndirectBrInst(IndirectBrInst &IBI);
267 bool visitResumeInst(ResumeInst &RI);
268 bool visitCleanupReturnInst(CleanupReturnInst &RI);
269 bool visitCatchReturnInst(CatchReturnInst &RI);
270 bool visitUnreachableInst(UnreachableInst &I);
272 public:
273 CallAnalyzer(const TargetTransformInfo &TTI,
274 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
275 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
276 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
277 Function &Callee, CallSite CSArg, const InlineParams &Params)
278 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
279 PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
280 CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
281 Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
282 Params.ComputeFullInlineCost || ORE),
283 IsCallerRecursive(false), IsRecursiveCall(false),
284 ExposesReturnsTwice(false), HasDynamicAlloca(false),
285 ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
286 HasUninlineableIntrinsic(false), InitsVargArgs(false), AllocatedSize(0),
287 NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
288 SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
289 NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
290 NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
291 NumInstructionsSimplified(0), SROACostSavings(0),
292 SROACostSavingsLost(0) {}
294 InlineResult analyzeCall(CallSite CS);
296 int getThreshold() { return Threshold; }
297 int getCost() { return Cost; }
299 // Keep a bunch of stats about the cost savings found so we can print them
300 // out when debugging.
301 unsigned NumConstantArgs;
302 unsigned NumConstantOffsetPtrArgs;
303 unsigned NumAllocaArgs;
304 unsigned NumConstantPtrCmps;
305 unsigned NumConstantPtrDiffs;
306 unsigned NumInstructionsSimplified;
307 unsigned SROACostSavings;
308 unsigned SROACostSavingsLost;
310 void dump();
313 } // namespace
315 /// Test whether the given value is an Alloca-derived function argument.
316 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
317 return SROAArgValues.count(V);
320 /// Lookup the SROA-candidate argument and cost iterator which V maps to.
321 /// Returns false if V does not map to a SROA-candidate.
322 bool CallAnalyzer::lookupSROAArgAndCost(
323 Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
324 if (SROAArgValues.empty() || SROAArgCosts.empty())
325 return false;
327 DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
328 if (ArgIt == SROAArgValues.end())
329 return false;
331 Arg = ArgIt->second;
332 CostIt = SROAArgCosts.find(Arg);
333 return CostIt != SROAArgCosts.end();
336 /// Disable SROA for the candidate marked by this cost iterator.
338 /// This marks the candidate as no longer viable for SROA, and adds the cost
339 /// savings associated with it back into the inline cost measurement.
340 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
341 // If we're no longer able to perform SROA we need to undo its cost savings
342 // and prevent subsequent analysis.
343 Cost += CostIt->second;
344 SROACostSavings -= CostIt->second;
345 SROACostSavingsLost += CostIt->second;
346 SROAArgCosts.erase(CostIt);
347 disableLoadElimination();
350 /// If 'V' maps to a SROA candidate, disable SROA for it.
351 void CallAnalyzer::disableSROA(Value *V) {
352 Value *SROAArg;
353 DenseMap<Value *, int>::iterator CostIt;
354 if (lookupSROAArgAndCost(V, SROAArg, CostIt))
355 disableSROA(CostIt);
358 /// Accumulate the given cost for a particular SROA candidate.
359 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
360 int InstructionCost) {
361 CostIt->second += InstructionCost;
362 SROACostSavings += InstructionCost;
365 void CallAnalyzer::disableLoadElimination() {
366 if (EnableLoadElimination) {
367 Cost += LoadEliminationCost;
368 LoadEliminationCost = 0;
369 EnableLoadElimination = false;
373 /// Accumulate a constant GEP offset into an APInt if possible.
375 /// Returns false if unable to compute the offset for any reason. Respects any
376 /// simplified values known during the analysis of this callsite.
377 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
378 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
379 assert(IntPtrWidth == Offset.getBitWidth());
381 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
382 GTI != GTE; ++GTI) {
383 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
384 if (!OpC)
385 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
386 OpC = dyn_cast<ConstantInt>(SimpleOp);
387 if (!OpC)
388 return false;
389 if (OpC->isZero())
390 continue;
392 // Handle a struct index, which adds its field offset to the pointer.
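// For example, indexing field 1 of a struct { i32, i64 } typically adds that
// field's layout offset (8 bytes under common data layouts) to Offset.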
393 if (StructType *STy = GTI.getStructTypeOrNull()) {
394 unsigned ElementIdx = OpC->getZExtValue();
395 const StructLayout *SL = DL.getStructLayout(STy);
396 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
397 continue;
400 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
401 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
403 return true;
406 /// Use TTI to check whether a GEP is free.
408 /// Respects any simplified values known during the analysis of this callsite.
409 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
410 SmallVector<Value *, 4> Operands;
411 Operands.push_back(GEP.getOperand(0));
412 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
413 if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
414 Operands.push_back(SimpleOp);
415 else
416 Operands.push_back(*I);
417 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
420 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
421 // Check whether inlining will turn a dynamic alloca into a static
422 // alloca and handle that case.
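// For example, an 'alloca i32, i64 %n' whose %n operand has been simplified to
// the constant 16 is accounted as a static 64-byte allocation after inlining.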
423 if (I.isArrayAllocation()) {
424 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
425 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
426 Type *Ty = I.getAllocatedType();
427 AllocatedSize = SaturatingMultiplyAdd(
428 AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
429 return Base::visitAlloca(I);
433 // Accumulate the allocated size.
434 if (I.isStaticAlloca()) {
435 Type *Ty = I.getAllocatedType();
436 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
439 // We will happily inline static alloca instructions.
440 if (I.isStaticAlloca())
441 return Base::visitAlloca(I);
443 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
444 // a variety of reasons, and so we would like to not inline them into
445 // functions which don't currently have a dynamic alloca. This simply
446 // disables inlining altogether in the presence of a dynamic alloca.
447 HasDynamicAlloca = true;
448 return false;
451 bool CallAnalyzer::visitPHI(PHINode &I) {
452 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
453 // though we don't want to propagate its bonuses. The idea is to disable
454 // SROA if it *might* be used in an inappropriate manner.
456 // Phi nodes are always zero-cost.
457 // FIXME: Pointer sizes may differ between different address spaces, so do we
458 // need to use correct address space in the call to getPointerSizeInBits here?
459 // Or could we skip the getPointerSizeInBits call completely? As far as I can
460 // see the ZeroOffset is used as a dummy value, so we can probably use any
461 // bit width for the ZeroOffset?
462 APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
463 bool CheckSROA = I.getType()->isPointerTy();
465 // Track the constant or pointer with constant offset we've seen so far.
466 Constant *FirstC = nullptr;
467 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
468 Value *FirstV = nullptr;
470 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
471 BasicBlock *Pred = I.getIncomingBlock(i);
472 // If the incoming block is dead, skip the incoming block.
473 if (DeadBlocks.count(Pred))
474 continue;
475 // If the parent block of phi is not the known successor of the incoming
476 // block, skip the incoming block.
477 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
478 if (KnownSuccessor && KnownSuccessor != I.getParent())
479 continue;
481 Value *V = I.getIncomingValue(i);
482 // If the incoming value is this phi itself, skip the incoming value.
483 if (&I == V)
484 continue;
486 Constant *C = dyn_cast<Constant>(V);
487 if (!C)
488 C = SimplifiedValues.lookup(V);
490 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
491 if (!C && CheckSROA)
492 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
494 if (!C && !BaseAndOffset.first)
495 // The incoming value is neither a constant nor a pointer with constant
496 // offset, exit early.
497 return true;
499 if (FirstC) {
500 if (FirstC == C)
501 // If we've seen a constant incoming value before and it is the same
502 // constant we see this time, continue checking the next incoming value.
503 continue;
504 // Otherwise early exit because we either see a different constant or saw
505 // a constant before but we have a pointer with constant offset this time.
506 return true;
509 if (FirstV) {
510 // The same logic as above, but check pointer with constant offset here.
511 if (FirstBaseAndOffset == BaseAndOffset)
512 continue;
513 return true;
516 if (C) {
517 // This is the 1st time we've seen a constant, record it.
518 FirstC = C;
519 continue;
522 // The remaining case is that this is the 1st time we've seen a pointer with
523 // constant offset, record it.
524 FirstV = V;
525 FirstBaseAndOffset = BaseAndOffset;
528 // Check if we can map phi to a constant.
529 if (FirstC) {
530 SimplifiedValues[&I] = FirstC;
531 return true;
534 // Check if we can map phi to a pointer with constant offset.
535 if (FirstBaseAndOffset.first) {
536 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
538 Value *SROAArg;
539 DenseMap<Value *, int>::iterator CostIt;
540 if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
541 SROAArgValues[&I] = SROAArg;
544 return true;
547 /// Check we can fold GEPs of constant-offset call site argument pointers.
548 /// This requires target data and inbounds GEPs.
550 /// \return true if the specified GEP can be folded.
551 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
552 // Check if we have a base + offset for the pointer.
553 std::pair<Value *, APInt> BaseAndOffset =
554 ConstantOffsetPtrs.lookup(I.getPointerOperand());
555 if (!BaseAndOffset.first)
556 return false;
558 // Check if the offset of this GEP is constant, and if so accumulate it
559 // into Offset.
560 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
561 return false;
563 // Add the result as a new mapping to Base + Offset.
564 ConstantOffsetPtrs[&I] = BaseAndOffset;
566 return true;
569 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
570 Value *SROAArg;
571 DenseMap<Value *, int>::iterator CostIt;
572 bool SROACandidate =
573 lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
575 // Lambda to check whether a GEP's indices are all constant.
576 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
577 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
578 if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
579 return false;
580 return true;
583 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
584 if (SROACandidate)
585 SROAArgValues[&I] = SROAArg;
587 // Constant GEPs are modeled as free.
588 return true;
591 // Variable GEPs will require math and will disable SROA.
592 if (SROACandidate)
593 disableSROA(CostIt);
594 return isGEPFree(I);
597 /// Simplify \p I if its operands are constants and update SimplifiedValues.
598 /// \p Evaluate is a callable specific to instruction type that evaluates the
599 /// instruction when all the operands are constants.
600 template <typename Callable>
601 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
602 SmallVector<Constant *, 2> COps;
603 for (Value *Op : I.operands()) {
604 Constant *COp = dyn_cast<Constant>(Op);
605 if (!COp)
606 COp = SimplifiedValues.lookup(Op);
607 if (!COp)
608 return false;
609 COps.push_back(COp);
611 auto *C = Evaluate(COps);
612 if (!C)
613 return false;
614 SimplifiedValues[&I] = C;
615 return true;
618 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
619 // Propagate constants through bitcasts.
620 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
621 return ConstantExpr::getBitCast(COps[0], I.getType());
623 return true;
625 // Track base/offsets through casts
626 std::pair<Value *, APInt> BaseAndOffset =
627 ConstantOffsetPtrs.lookup(I.getOperand(0));
628 // Casts don't change the offset, just wrap it up.
629 if (BaseAndOffset.first)
630 ConstantOffsetPtrs[&I] = BaseAndOffset;
632 // Also look for SROA candidates here.
633 Value *SROAArg;
634 DenseMap<Value *, int>::iterator CostIt;
635 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
636 SROAArgValues[&I] = SROAArg;
638 // Bitcasts are always zero cost.
639 return true;
642 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
643 // Propagate constants through ptrtoint.
644 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
645 return ConstantExpr::getPtrToInt(COps[0], I.getType());
647 return true;
649 // Track base/offset pairs when converted to a plain integer provided the
650 // integer is large enough to represent the pointer.
651 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
652 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
653 if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
654 std::pair<Value *, APInt> BaseAndOffset =
655 ConstantOffsetPtrs.lookup(I.getOperand(0));
656 if (BaseAndOffset.first)
657 ConstantOffsetPtrs[&I] = BaseAndOffset;
660 // This is really weird. Technically, ptrtoint will disable SROA. However,
661 // unless that ptrtoint is *used* somewhere in the live basic blocks after
662 // inlining, it will be nuked, and SROA should proceed. All of the uses which
663 // would block SROA would also block SROA if applied directly to a pointer,
664 // and so we can just add the integer in here. The only places where SROA is
665 // preserved either cannot fire on an integer, or won't in-and-of themselves
666 // disable SROA (ext) w/o some later use that we would see and disable.
667 Value *SROAArg;
668 DenseMap<Value *, int>::iterator CostIt;
669 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
670 SROAArgValues[&I] = SROAArg;
672 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
675 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
676 // Propagate constants through inttoptr.
677 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
678 return ConstantExpr::getIntToPtr(COps[0], I.getType());
680 return true;
682 // Track base/offset pairs when round-tripped through a pointer without
683 // modifications provided the integer is not too large.
684 Value *Op = I.getOperand(0);
685 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
686 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
687 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
688 if (BaseAndOffset.first)
689 ConstantOffsetPtrs[&I] = BaseAndOffset;
692 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
693 Value *SROAArg;
694 DenseMap<Value *, int>::iterator CostIt;
695 if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
696 SROAArgValues[&I] = SROAArg;
698 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
701 bool CallAnalyzer::visitCastInst(CastInst &I) {
702 // Propagate constants through casts.
703 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
704 return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
706 return true;
708 // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
709 disableSROA(I.getOperand(0));
711 // If this is a floating-point cast, and the target says this operation
712 // is expensive, this may eventually become a library call. Treat the cost
713 // as such.
714 switch (I.getOpcode()) {
715 case Instruction::FPTrunc:
716 case Instruction::FPExt:
717 case Instruction::UIToFP:
718 case Instruction::SIToFP:
719 case Instruction::FPToUI:
720 case Instruction::FPToSI:
721 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
722 Cost += InlineConstants::CallPenalty;
723 default:
724 break;
727 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
730 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
731 Value *Operand = I.getOperand(0);
732 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
733 return ConstantFoldInstOperands(&I, COps[0], DL);
735 return true;
737 // Disable any SROA on the argument to arbitrary unary operators.
738 disableSROA(Operand);
740 return false;
743 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
744 return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
747 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
748 // Does the *call site* have the NonNull attribute set on an argument? We
749 // use the attribute on the call site to memoize any analysis done in the
750 // caller. This will also trip if the callee function has a non-null
751 // parameter attribute, but that's a less interesting case because hopefully
752 // the callee would already have been simplified based on that.
753 if (Argument *A = dyn_cast<Argument>(V))
754 if (paramHasAttr(A, Attribute::NonNull))
755 return true;
757 // Is this an alloca in the caller? This is distinct from the attribute case
758 // above because attributes aren't updated within the inliner itself and we
759 // always want to catch the alloca derived case.
760 if (isAllocaDerivedArg(V))
761 // We can actually predict the result of comparisons between an
762 // alloca-derived value and null. Note that this fires regardless of
763 // SROA firing.
764 return true;
766 return false;
769 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
770 // If the normal destination of the invoke or the parent block of the call
771 // site is unreachable-terminated, there is little point in inlining this
772 // unless there is literally zero cost.
773 // FIXME: Note that it is possible that an unreachable-terminated block has a
774 // hot entry. For example, in the scenario below inlining hot_call_X() may be
775 // beneficial:
776 // main() {
777 // hot_call_1();
778 // ...
779 // hot_call_N()
780 // exit(0);
781 // }
782 // For now, we are not handling this corner case here as it is rare in real
783 // code. In the future, we should elaborate this based on BPI and BFI in more
784 // general threshold-adjusting heuristics in updateThreshold().
785 Instruction *Instr = CS.getInstruction();
786 if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
787 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
788 return false;
789 } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
790 return false;
792 return true;
795 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
796 // If global profile summary is available, then callsite's coldness is
797 // determined based on that.
798 if (PSI && PSI->hasProfileSummary())
799 return PSI->isColdCallSite(CS, CallerBFI);
801 // Otherwise we need BFI to be available.
802 if (!CallerBFI)
803 return false;
805 // Determine if the callsite is cold relative to caller's entry. We could
806 // potentially cache the computation of scaled entry frequency, but the added
807 // complexity is not worth it unless this scaling shows up high in the
808 // profiles.
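// For example, with the default cold-callsite-rel-freq of 2, a callsite whose
// block frequency is below 2% of the caller's entry frequency is treated as
// cold when no profile summary is available.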
809 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
810 auto CallSiteBB = CS.getInstruction()->getParent();
811 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
812 auto CallerEntryFreq =
813 CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
814 return CallSiteFreq < CallerEntryFreq * ColdProb;
817 Optional<int>
818 CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
819 BlockFrequencyInfo *CallerBFI) {
821 // If global profile summary is available, then callsite's hotness is
822 // determined based on that.
823 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
824 return Params.HotCallSiteThreshold;
826 // Otherwise we need BFI to be available and to have a locally hot callsite
827 // threshold.
828 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
829 return None;
831 // Determine if the callsite is hot relative to caller's entry. We could
832 // potentially cache the computation of scaled entry frequency, but the added
833 // complexity is not worth it unless this scaling shows up high in the
834 // profiles.
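// For example, with the default hot-callsite-rel-freq of 60, the callsite's
// block must execute at least 60 times as often as the caller's entry block
// to receive the locally hot threshold.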
835 auto CallSiteBB = CS.getInstruction()->getParent();
836 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
837 auto CallerEntryFreq = CallerBFI->getEntryFreq();
838 if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
839 return Params.LocallyHotCallSiteThreshold;
841 // Otherwise treat it normally.
842 return None;
845 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
846 // If no size growth is allowed for this inlining, set Threshold to 0.
847 if (!allowSizeGrowth(CS)) {
848 Threshold = 0;
849 return;
852 Function *Caller = CS.getCaller();
854 // return min(A, B) if B is valid.
855 auto MinIfValid = [](int A, Optional<int> B) {
856 return B ? std::min(A, B.getValue()) : A;
859 // return max(A, B) if B is valid.
860 auto MaxIfValid = [](int A, Optional<int> B) {
861 return B ? std::max(A, B.getValue()) : A;
864 // Various bonus percentages. These are multiplied by Threshold to get the
865 // bonus values.
866 // SingleBBBonus: This bonus is applied if the callee has a single reachable
867 // basic block at the given callsite context. This is speculatively applied
868 // and withdrawn if more than one basic block is seen.
870 // Vector bonuses: We want to more aggressively inline vector-dense kernels
871 // and apply this bonus based on the percentage of vector instructions. A
872 // bonus is applied if the vector instructions exceed 50% and half that amount
873 // is applied if it exceeds 10%. Note that these bonuses are somewhat
874 // arbitrary and evolved over time by accident as much as because they are
875 // principled bonuses.
876 // FIXME: It would be nice to base the bonus values on something more
877 // scientific.
879 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
880 // of the last call to a static function as inlining such functions is
881 // guaranteed to reduce code size.
883 // These bonus percentages may be set to 0 based on properties of the caller
884 // and the callsite.
885 int SingleBBBonusPercent = 50;
886 int VectorBonusPercent = 150;
887 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
889 // Lambda to set all the above bonus and bonus percentages to 0.
890 auto DisallowAllBonuses = [&]() {
891 SingleBBBonusPercent = 0;
892 VectorBonusPercent = 0;
893 LastCallToStaticBonus = 0;
896 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
897 // and reduce the threshold if the caller has the necessary attribute.
898 if (Caller->optForMinSize()) {
899 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
900 // For minsize, we want to disable the single BB bonus and the vector
901 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
902 // a static function will, at the minimum, eliminate the parameter setup and
903 // call/return instructions.
904 SingleBBBonusPercent = 0;
905 VectorBonusPercent = 0;
906 } else if (Caller->optForSize())
907 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
909 // Adjust the threshold based on inlinehint attribute and profile based
910 // hotness information if the caller does not have MinSize attribute.
911 if (!Caller->optForMinSize()) {
912 if (Callee.hasFnAttribute(Attribute::InlineHint))
913 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
915 // FIXME: After switching to the new passmanager, simplify the logic below
916 // by checking only the callsite hotness/coldness as we will reliably
917 // have local profile information.
919 // Callsite hotness and coldness can be determined if sample profile is
920 // used (which adds hotness metadata to calls) or if caller's
921 // BlockFrequencyInfo is available.
922 BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
923 auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
924 if (!Caller->optForSize() && HotCallSiteThreshold) {
925 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
926 // FIXME: This should update the threshold only if it exceeds the
927 // current threshold, but AutoFDO + ThinLTO currently relies on this
928 // behavior to prevent inlining of hot callsites during ThinLTO
929 // compile phase.
930 Threshold = HotCallSiteThreshold.getValue();
931 } else if (isColdCallSite(CS, CallerBFI)) {
932 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
933 // Do not apply bonuses for a cold callsite including the
934 // LastCallToStatic bonus. While this bonus might result in code size
935 // reduction, it can cause the size of a non-cold caller to increase
936 // preventing it from being inlined.
937 DisallowAllBonuses();
938 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
939 } else if (PSI) {
940 // Use callee's global profile information only if we have no way of
941 // determining this via callsite information.
942 if (PSI->isFunctionEntryHot(&Callee)) {
943 LLVM_DEBUG(dbgs() << "Hot callee.\n");
944 // If callsite hotness cannot be determined, we may still know
945 // that the callee is hot and treat it as a weaker hint for threshold
946 // increase.
947 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
948 } else if (PSI->isFunctionEntryCold(&Callee)) {
949 LLVM_DEBUG(dbgs() << "Cold callee.\n");
950 // Do not apply bonuses for a cold callee including the
951 // LastCallToStatic bonus. While this bonus might result in code size
952 // reduction, it can cause the size of a non-cold caller to increase
953 // preventing it from being inlined.
954 DisallowAllBonuses();
955 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
960 // Finally, take the target-specific inlining threshold multiplier into
961 // account.
962 Threshold *= TTI.getInliningThresholdMultiplier();
964 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
965 VectorBonus = Threshold * VectorBonusPercent / 100;
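// For example, with the default threshold of 225, a target multiplier of 1,
// and no attribute or profile adjustments, this yields
// SingleBBBonus = 225 * 50 / 100 = 112 and VectorBonus = 225 * 150 / 100 = 337.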
967 bool OnlyOneCallAndLocalLinkage =
968 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
969 // If there is only one call of the function, and it has internal linkage,
970 // the cost of inlining it drops dramatically. It may seem odd to update
971 // Cost in updateThreshold, but the bonus depends on the logic in this method.
972 if (OnlyOneCallAndLocalLinkage)
973 Cost -= LastCallToStaticBonus;
976 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
977 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
978 // First try to handle simplified comparisons.
979 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
980 return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
982 return true;
984 if (I.getOpcode() == Instruction::FCmp)
985 return false;
987 // Otherwise look for a comparison between constant offset pointers with
988 // a common base.
989 Value *LHSBase, *RHSBase;
990 APInt LHSOffset, RHSOffset;
991 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
992 if (LHSBase) {
993 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
994 if (RHSBase && LHSBase == RHSBase) {
995 // We have common bases, fold the icmp to a constant based on the
996 // offsets.
997 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
998 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
999 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1000 SimplifiedValues[&I] = C;
1001 ++NumConstantPtrCmps;
1002 return true;
1007 // If the comparison is an equality comparison with null, we can simplify it
1008 // if we know the value (argument) can't be null.
1009 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1010 isKnownNonNullInCallee(I.getOperand(0))) {
1011 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1012 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1013 : ConstantInt::getFalse(I.getType());
1014 return true;
1016 // Finally check for SROA candidates in comparisons.
1017 Value *SROAArg;
1018 DenseMap<Value *, int>::iterator CostIt;
1019 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
1020 if (isa<ConstantPointerNull>(I.getOperand(1))) {
1021 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1022 return true;
1025 disableSROA(CostIt);
1028 return false;
1031 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1032 // Try to handle a special case: we can fold computing the difference of two
1033 // constant-related pointers.
1034 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1035 Value *LHSBase, *RHSBase;
1036 APInt LHSOffset, RHSOffset;
1037 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1038 if (LHSBase) {
1039 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1040 if (RHSBase && LHSBase == RHSBase) {
1041 // We have common bases, fold the subtract to a constant based on the
1042 // offsets.
1043 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1044 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1045 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1046 SimplifiedValues[&I] = C;
1047 ++NumConstantPtrDiffs;
1048 return true;
1053 // Otherwise, fall back to the generic logic for simplifying and handling
1054 // instructions.
1055 return Base::visitSub(I);
1058 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1059 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1060 Constant *CLHS = dyn_cast<Constant>(LHS);
1061 if (!CLHS)
1062 CLHS = SimplifiedValues.lookup(LHS);
1063 Constant *CRHS = dyn_cast<Constant>(RHS);
1064 if (!CRHS)
1065 CRHS = SimplifiedValues.lookup(RHS);
1067 Value *SimpleV = nullptr;
1068 if (auto FI = dyn_cast<FPMathOperator>(&I))
1069 SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
1070 CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
1071 else
1072 SimpleV =
1073 SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1075 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1076 SimplifiedValues[&I] = C;
1078 if (SimpleV)
1079 return true;
1081 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1082 disableSROA(LHS);
1083 disableSROA(RHS);
1085 // If the instruction is floating point, and the target says this operation
1086 // is expensive, this may eventually become a library call. Treat the cost
1087 // as such.
1088 if (I.getType()->isFloatingPointTy() &&
1089 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1090 Cost += InlineConstants::CallPenalty;
1092 return false;
1095 bool CallAnalyzer::visitLoad(LoadInst &I) {
1096 Value *SROAArg;
1097 DenseMap<Value *, int>::iterator CostIt;
1098 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1099 if (I.isSimple()) {
1100 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1101 return true;
1104 disableSROA(CostIt);
1107 // If the data is already loaded from this address and hasn't been clobbered
1108 // by any stores or calls, this load is likely to be redundant and can be
1109 // eliminated.
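// For example, a second unordered load from an address already in LoadAddrSet
// is credited InstrCost of expected savings in LoadEliminationCost; that credit
// is withdrawn if a store or clobbering call is seen later.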
1110 if (EnableLoadElimination &&
1111 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1112 LoadEliminationCost += InlineConstants::InstrCost;
1113 return true;
1116 return false;
1119 bool CallAnalyzer::visitStore(StoreInst &I) {
1120 Value *SROAArg;
1121 DenseMap<Value *, int>::iterator CostIt;
1122 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1123 if (I.isSimple()) {
1124 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1125 return true;
1128 disableSROA(CostIt);
1131 // The store can potentially clobber loads and prevent repeated loads from
1132 // being eliminated.
1133 // FIXME:
1134 // 1. We can probably keep an initial set of eliminatable loads subtracted
1135 // from the cost even when we finally see a store. We just need to disable
1136 // *further* accumulation of elimination savings.
1137 // 2. We should probably at some point thread MemorySSA for the callee into
1138 // this and then use that to actually compute *really* precise savings.
1139 disableLoadElimination();
1140 return false;
1143 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1144 // Constant folding for extract value is trivial.
1145 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1146 return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1148 return true;
1150 // SROA can look through these but give them a cost.
1151 return false;
1154 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1155 // Constant folding for insert value is trivial.
1156 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1157 return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1158 /*InsertedValueOperand*/ COps[1],
1159 I.getIndices());
1161 return true;
1163 // SROA can look through these but give them a cost.
1164 return false;
1167 /// Try to simplify a call site.
1169 /// Takes a concrete function and callsite and tries to actually simplify it by
1170 /// analyzing the arguments and call itself with instsimplify. Returns true if
1171 /// it has simplified the callsite to some other entity (a constant), making it
1172 /// free.
1173 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
1174 // FIXME: Using the instsimplify logic directly for this is inefficient
1175 // because we have to continually rebuild the argument list even when no
1176 // simplifications can be performed. Until that is fixed with remapping
1177 // inside of instsimplify, directly constant fold calls here.
1178 if (!canConstantFoldCallTo(CS, F))
1179 return false;
1181 // Try to re-map the arguments to constants.
1182 SmallVector<Constant *, 4> ConstantArgs;
1183 ConstantArgs.reserve(CS.arg_size());
1184 for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
1185 ++I) {
1186 Constant *C = dyn_cast<Constant>(*I);
1187 if (!C)
1188 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
1189 if (!C)
1190 return false; // This argument doesn't map to a constant.
1192 ConstantArgs.push_back(C);
1194 if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
1195 SimplifiedValues[CS.getInstruction()] = C;
1196 return true;
1199 return false;
1202 bool CallAnalyzer::visitCallSite(CallSite CS) {
1203 if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
1204 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1205 // This aborts the entire analysis.
1206 ExposesReturnsTwice = true;
1207 return false;
1209 if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
1210 ContainsNoDuplicateCall = true;
1212 if (Function *F = CS.getCalledFunction()) {
1213 // When we have a concrete function, first try to simplify it directly.
1214 if (simplifyCallSite(F, CS))
1215 return true;
1217 // Next check if it is an intrinsic we know about.
1218 // FIXME: Lift this into part of the InstVisitor.
1219 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
1220 switch (II->getIntrinsicID()) {
1221 default:
1222 if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1223 disableLoadElimination();
1224 return Base::visitCallSite(CS);
1226 case Intrinsic::load_relative:
1227 // This is normally lowered to 4 LLVM instructions.
1228 Cost += 3 * InlineConstants::InstrCost;
1229 return false;
1231 case Intrinsic::memset:
1232 case Intrinsic::memcpy:
1233 case Intrinsic::memmove:
1234 disableLoadElimination();
1235 // SROA can usually chew through these intrinsics, but they aren't free.
1236 return false;
1237 case Intrinsic::icall_branch_funnel:
1238 case Intrinsic::localescape:
1239 HasUninlineableIntrinsic = true;
1240 return false;
1241 case Intrinsic::vastart:
1242 InitsVargArgs = true;
1243 return false;
1247 if (F == CS.getInstruction()->getFunction()) {
1248 // This flag will fully abort the analysis, so don't bother with anything
1249 // else.
1250 IsRecursiveCall = true;
1251 return false;
1254 if (TTI.isLoweredToCall(F)) {
1255 // We account for the average 1 instruction per call argument setup
1256 // here.
1257 Cost += CS.arg_size() * InlineConstants::InstrCost;
1259 // Everything other than inline ASM will also have a significant cost
1260 // merely from making the call.
1261 if (!isa<InlineAsm>(CS.getCalledValue()))
1262 Cost += InlineConstants::CallPenalty;
1265 if (!CS.onlyReadsMemory())
1266 disableLoadElimination();
1267 return Base::visitCallSite(CS);
1270 // Otherwise we're in a very special case -- an indirect function call. See
1271 // if we can be particularly clever about this.
1272 Value *Callee = CS.getCalledValue();
1274 // First, pay the price of the argument setup. We account for the average
1275 // 1 instruction per call argument setup here.
1276 Cost += CS.arg_size() * InlineConstants::InstrCost;
1278 // Next, check if this happens to be an indirect function call to a known
1279 // function in this inline context. If not, we've done all we can.
1280 Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1281 if (!F) {
1282 if (!CS.onlyReadsMemory())
1283 disableLoadElimination();
1284 return Base::visitCallSite(CS);
1287 // If we have a constant that we are calling as a function, we can peer
1288 // through it and see the function target. This happens not infrequently
1289 // during devirtualization and so we want to give it a hefty bonus for
1290 // inlining, but cap that bonus in the event that inlining wouldn't pan
1291 // out. Pretend to inline the function, with a custom threshold.
1292 auto IndirectCallParams = Params;
1293 IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
1294 CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
1295 IndirectCallParams);
1296 if (CA.analyzeCall(CS)) {
1297 // We were able to inline the indirect call! Subtract the cost from the
1298 // threshold to get the bonus we want to apply, but don't go below zero.
1299 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
1302 if (!F->onlyReadsMemory())
1303 disableLoadElimination();
1304 return Base::visitCallSite(CS);
1307 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1308 // At least one return instruction will be free after inlining.
1309 bool Free = !HasReturn;
1310 HasReturn = true;
1311 return Free;
1314 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1315 // We model unconditional branches as essentially free -- they really
1316 // shouldn't exist at all, but handling them makes the behavior of the
1317 // inliner more regular and predictable. Interestingly, conditional branches
1318 // which will fold away are also free.
1319 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1320 dyn_cast_or_null<ConstantInt>(
1321 SimplifiedValues.lookup(BI.getCondition()));
1324 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1325 bool CheckSROA = SI.getType()->isPointerTy();
1326 Value *TrueVal = SI.getTrueValue();
1327 Value *FalseVal = SI.getFalseValue();
1329 Constant *TrueC = dyn_cast<Constant>(TrueVal);
1330 if (!TrueC)
1331 TrueC = SimplifiedValues.lookup(TrueVal);
1332 Constant *FalseC = dyn_cast<Constant>(FalseVal);
1333 if (!FalseC)
1334 FalseC = SimplifiedValues.lookup(FalseVal);
1335 Constant *CondC =
1336 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1338 if (!CondC) {
1339 // Select C, X, X => X
1340 if (TrueC == FalseC && TrueC) {
1341 SimplifiedValues[&SI] = TrueC;
1342 return true;
1345 if (!CheckSROA)
1346 return Base::visitSelectInst(SI);
1348 std::pair<Value *, APInt> TrueBaseAndOffset =
1349 ConstantOffsetPtrs.lookup(TrueVal);
1350 std::pair<Value *, APInt> FalseBaseAndOffset =
1351 ConstantOffsetPtrs.lookup(FalseVal);
1352 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1353 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1355 Value *SROAArg;
1356 DenseMap<Value *, int>::iterator CostIt;
1357 if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
1358 SROAArgValues[&SI] = SROAArg;
1359 return true;
1362 return Base::visitSelectInst(SI);
1365 // Select condition is a constant.
1366 Value *SelectedV = CondC->isAllOnesValue()
1367 ? TrueVal
1368 : (CondC->isNullValue()) ? FalseVal : nullptr;
1369 if (!SelectedV) {
1370 // Condition is a vector constant that is not all 1s or all 0s. If all
1371 // operands are constants, ConstantExpr::getSelect() can handle the cases
1372 // such as select vectors.
1373 if (TrueC && FalseC) {
1374 if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1375 SimplifiedValues[&SI] = C;
1376 return true;
1379 return Base::visitSelectInst(SI);
1382 // Condition is either all 1s or all 0s. SI can be simplified.
1383 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1384 SimplifiedValues[&SI] = SelectedC;
1385 return true;
1388 if (!CheckSROA)
1389 return true;
1391 std::pair<Value *, APInt> BaseAndOffset =
1392 ConstantOffsetPtrs.lookup(SelectedV);
1393 if (BaseAndOffset.first) {
1394 ConstantOffsetPtrs[&SI] = BaseAndOffset;
1396 Value *SROAArg;
1397 DenseMap<Value *, int>::iterator CostIt;
1398 if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
1399 SROAArgValues[&SI] = SROAArg;
1402 return true;
1405 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1406 // We model unconditional switches as free, see the comments on handling
1407 // branches.
1408 if (isa<ConstantInt>(SI.getCondition()))
1409 return true;
1410 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1411 if (isa<ConstantInt>(V))
1412 return true;
1414 // Assume the most general case where the switch is lowered into
1415 // either a jump table, bit test, or a balanced binary tree consisting of
1416 // case clusters without merging adjacent clusters with the same
1417 // destination. We do not consider the switches that are lowered with a mix
1418 // of jump table/bit test/binary search tree. The cost of the switch is
1419 // proportional to the size of the tree or the size of jump table range.
1421 // NB: We convert large switches which are just used to initialize large phi
1422 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1423 // inlining those. It will prevent inlining in cases where the optimization
1424 // does not (yet) fire.
1426 // Maximum valid cost increase in this function.
1427 int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
1429 // Exit early for a large switch, assuming one case needs at least one
1430 // instruction.
1431 // FIXME: This is not true for a bit test, but ignore such case for now to
1432 // save compile-time.
1433 int64_t CostLowerBound =
1434 std::min((int64_t)CostUpperBound,
1435 (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
1437 if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
1438 Cost = CostLowerBound;
1439 return false;
1442 unsigned JumpTableSize = 0;
1443 unsigned NumCaseCluster =
1444 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1446 // If suitable for a jump table, consider the cost for the table size and
1447 // branch to destination.
1448 if (JumpTableSize) {
1449 int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1450 4 * InlineConstants::InstrCost;
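// For example, a jump table covering 16 entries is charged 16 * InstrCost for
// the table plus the fixed 4 * InstrCost above, i.e. 20 * InstrCost added to
// the running Cost (capped at CostUpperBound).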
1452 Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
1453 return false;
1456 // Considering forming a binary search, we should find the number of nodes
1457 // which is same as the number of comparisons when lowered. For a given
1458 // number of clusters, n, we can define a recursive function, f(n), to find
1459 // the number of nodes in the tree. The recursion is :
1460 // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
1461 // and f(n) = n, when n <= 3.
1462 // This will lead to a binary tree where the leaf should be either f(2) or f(3)
1463 // when n > 3. So, the number of comparisons from leaves should be n, while
1464 // the number of non-leaf should be :
1465 // 2^(log2(n) - 1) - 1
1466 // = 2^log2(n) * 2^-1 - 1
1467 // = n / 2 - 1.
1468 // Considering comparisons from leaf and non-leaf nodes, we can estimate the
1469 // number of comparisons in a simple closed form :
1470 // n + n / 2 - 1 = n * 3 / 2 - 1
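// For example, 8 case clusters give 3 * 8 / 2 - 1 = 11 expected comparisons,
// charged below as 11 * 2 * InstrCost (one compare and one conditional branch
// per comparison).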
1471 if (NumCaseCluster <= 3) {
1472 // Suppose a comparison includes one compare and one conditional branch.
1473 Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1474 return false;
1477 int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1478 int64_t SwitchCost =
1479 ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1481 Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
1482 return false;
1485 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1486 // We never want to inline functions that contain an indirectbr. This is
1487 // incorrect because all the blockaddresses (in static global initializers
1488 // for example) would be referring to the original function, and this
1489 // indirect jump would jump from the inlined copy of the function into the
1490 // original function which is extremely undefined behavior.
1491 // FIXME: This logic isn't really right; we can safely inline functions with
1492 // indirectbr's as long as no other function or global references the
1493 // blockaddress of a block within the current function.
1494 HasIndirectBr = true;
1495 return false;
1498 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1499 // FIXME: It's not clear that a single instruction is an accurate model for
1500 // the inline cost of a resume instruction.
1501 return false;
1502 }
1504 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1505 // FIXME: It's not clear that a single instruction is an accurate model for
1506 // the inline cost of a cleanupret instruction.
1507 return false;
1508 }
1510 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1511 // FIXME: It's not clear that a single instruction is an accurate model for
1512 // the inline cost of a catchret instruction.
1513 return false;
1514 }
1516 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1517 // FIXME: It might be reasonable to discount the cost of instructions leading
1518 // to unreachable as they have the lowest possible impact on both runtime and
1519 // code size.
1520 return true; // No actual code is needed for unreachable.
1521 }
1523 bool CallAnalyzer::visitInstruction(Instruction &I) {
1524 // Some instructions are free. All of the free intrinsics can also be
1525 // handled by SROA, etc.
1526 if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1527 return true;
1529 // We found something we don't understand or can't handle. Mark any SROA-able
1530 // values in the operand list as no longer viable.
1531 for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1532 disableSROA(*OI);
1534 return false;
1535 }
1537 /// Analyze a basic block for its contribution to the inline cost.
1539 /// This method walks the analyzer over every instruction in the given basic
1540 /// block and accounts for their cost during inlining at this callsite. It
1541 /// aborts early if the threshold has been exceeded or an impossible to inline
1542 /// construct has been detected. It returns false if inlining is no longer
1543 /// viable, and true if inlining remains viable.
1544 InlineResult
1545 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1546 SmallPtrSetImpl<const Value *> &EphValues) {
1547 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1548 // FIXME: Currently, the number of instructions in a function, regardless of
1549 // our ability to simplify them during inlining to constants or dead code,
1550 // is actually used by the vector bonus heuristic. As long as that's true,
1551 // we have to special case debug intrinsics here to prevent differences in
1552 // inlining due to debug symbols. Eventually, the number of unsimplified
1553 // instructions shouldn't factor into the cost computation, but until then,
1554 // hack around it here.
1555 if (isa<DbgInfoIntrinsic>(I))
1556 continue;
1558 // Skip ephemeral values.
1559 if (EphValues.count(&*I))
1560 continue;
1562 ++NumInstructions;
1563 if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1564 ++NumVectorInstructions;
1566 // If the instruction simplified to a constant, there is no cost to this
1567 // instruction. Visit the instructions using our InstVisitor to account for
1568 // all of the per-instruction logic. The visit tree returns true if we
1569 // consumed the instruction in any way, and false if the instruction's base
1570 // cost should count against inlining.
1571 if (Base::visit(&*I))
1572 ++NumInstructionsSimplified;
1573 else
1574 Cost += InlineConstants::InstrCost;
1576 using namespace ore;
1577 // If visiting this instruction detected an uninlinable pattern, abort.
1578 InlineResult IR;
1579 if (IsRecursiveCall)
1580 IR = "recursive";
1581 else if (ExposesReturnsTwice)
1582 IR = "exposes returns twice";
1583 else if (HasDynamicAlloca)
1584 IR = "dynamic alloca";
1585 else if (HasIndirectBr)
1586 IR = "indirect branch";
1587 else if (HasUninlineableIntrinsic)
1588 IR = "uninlinable intrinsic";
1589 else if (InitsVargArgs)
1590 IR = "varargs";
1591 if (!IR) {
1592 if (ORE)
1593 ORE->emit([&]() {
1594 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1595 CandidateCS.getInstruction())
1596 << NV("Callee", &F) << " has uninlinable pattern ("
1597 << NV("InlineResult", IR.message)
1598 << ") and cost is not fully computed";
1599 });
1600 return IR;
1601 }
1603 // If the caller is a recursive function then we don't want to inline
1604 // functions which allocate a lot of stack space because it would increase
1605 // the caller stack usage dramatically.
1606 if (IsCallerRecursive &&
1607 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1608 InlineResult IR = "recursive and allocates too much stack space";
1609 if (ORE)
1610 ORE->emit([&]() {
1611 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1612 CandidateCS.getInstruction())
1613 << NV("Callee", &F) << " is " << NV("InlineResult", IR.message)
1614 << ". Cost is not fully computed";
1615 });
1616 return IR;
1617 }
1619 // Check if we've passed the maximum possible threshold so we don't spin in
1620 // huge basic blocks that will never inline.
1621 if (Cost >= Threshold && !ComputeFullInlineCost)
1622 return false;
1623 }
1625 return true;
1626 }
1628 /// Compute the base pointer and cumulative constant offsets for V.
1630 /// This strips all constant offsets off of V, leaving it the base pointer, and
1631 /// accumulates the total constant offset applied in the returned constant. It
1632 /// returns null if V is not a pointer, and returns the constant '0' if there are
1633 /// no constant offsets applied.
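/// For example, for an inbounds GEP that indexes four elements into an i32
/// array, V is rewritten to the array's base pointer and the returned constant
/// holds the byte offset 16.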
1634 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1635 if (!V->getType()->isPointerTy())
1636 return nullptr;
1638 unsigned AS = V->getType()->getPointerAddressSpace();
1639 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1640 APInt Offset = APInt::getNullValue(IntPtrWidth);
1642 // Even though we don't look through PHI nodes, we could be called on an
1643 // instruction in an unreachable block, which may be on a cycle.
1644 SmallPtrSet<Value *, 4> Visited;
1645 Visited.insert(V);
1646 do {
1647 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1648 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1649 return nullptr;
1650 V = GEP->getPointerOperand();
1651 } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1652 V = cast<Operator>(V)->getOperand(0);
1653 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1654 if (GA->isInterposable())
1655 break;
1656 V = GA->getAliasee();
1657 } else {
1658 break;
1659 }
1660 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1661 } while (Visited.insert(V).second);
1663 Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1664 return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1665 }
1667 /// Find dead blocks due to deleted CFG edges during inlining.
1669 /// If we know the successor of the current block, \p CurrBB, has to be \p
1670 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1671 /// no live incoming CFG edges. If one block is found to be dead, we can
1672 /// continue growing the dead block list by checking the successors of the dead
1673 /// blocks to see if all their incoming edges are dead or not.
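/// For example, if CurrBB ends in a conditional branch whose condition was
/// simplified to a constant, NextBB is its only live successor; the other
/// successor becomes dead once it has no remaining live predecessors, and its
/// own successors are then examined in the same way.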
1674 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1675 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1676 // A CFG edge is dead if the predecessor is dead or the predecessor has a
1677 // known successor which is not the one under examination.
1678 return (DeadBlocks.count(Pred) ||
1679 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1680 };
1682 auto IsNewlyDead = [&](BasicBlock *BB) {
1683 // If all the edges to a block are dead, the block is also dead.
1684 return (!DeadBlocks.count(BB) &&
1685 llvm::all_of(predecessors(BB),
1686 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1687 };
1689 for (BasicBlock *Succ : successors(CurrBB)) {
1690 if (Succ == NextBB || !IsNewlyDead(Succ))
1691 continue;
1692 SmallVector<BasicBlock *, 4> NewDead;
1693 NewDead.push_back(Succ);
1694 while (!NewDead.empty()) {
1695 BasicBlock *Dead = NewDead.pop_back_val();
1696 if (DeadBlocks.insert(Dead))
1697 // Continue growing the dead block lists.
1698 for (BasicBlock *S : successors(Dead))
1699 if (IsNewlyDead(S))
1700 NewDead.push_back(S);
1701 }
1702 }
1703 }
1705 /// Analyze a call site for potential inlining.
1707 /// Returns true if inlining this call is viable, and false if it is not
1708 /// viable. It computes the cost and adjusts the threshold based on numerous
1709 /// factors and heuristics. If this method returns false but the computed cost
1710 /// is below the computed threshold, then inlining was forcibly disabled by
1711 /// some artifact of the routine.
1712 InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
1713 ++NumCallsAnalyzed;
1715 // Perform some tweaks to the cost and threshold based on the direct
1716 // callsite information.
1718 // We want to more aggressively inline vector-dense kernels, so up the
1719 // threshold, and we'll lower it if the % of vector instructions gets too
1720 // low. Note that these bonuses are somewhat arbitrary and evolved over time
1721 // by accident as much as because they are principled bonuses.
1723 // FIXME: It would be nice to remove all such bonuses. At least it would be
1724 // nice to base the bonus values on something more scientific.
1725 assert(NumInstructions == 0);
1726 assert(NumVectorInstructions == 0);
1728 // Update the threshold based on callsite properties
1729 updateThreshold(CS, F);
1731 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1732 // this Threshold at any point, then since cost cannot decrease, we can stop
1733 // processing the rest of the function body.
1734 Threshold += (SingleBBBonus + VectorBonus);
1736 // Give out bonuses for the callsite, as the instructions setting them up
1737 // will be gone after inlining.
1738 Cost -= getCallsiteCost(CS, DL);
1740 // If this function uses the coldcc calling convention, prefer not to inline
1741 // it.
1742 if (F.getCallingConv() == CallingConv::Cold)
1743 Cost += InlineConstants::ColdccPenalty;
1745 // Check if we're done. This can happen due to bonuses and penalties.
1746 if (Cost >= Threshold && !ComputeFullInlineCost)
1747 return "high cost";
1749 if (F.empty())
1750 return true;
1752 Function *Caller = CS.getInstruction()->getFunction();
1753 // Check if the caller function is recursive itself.
1754 for (User *U : Caller->users()) {
1755 CallSite Site(U);
1756 if (!Site)
1757 continue;
1758 Instruction *I = Site.getInstruction();
1759 if (I->getFunction() == Caller) {
1760 IsCallerRecursive = true;
1761 break;
1762 }
1763 }
1765 // Populate our simplified values by mapping from function arguments to call
1766 // arguments with known important simplifications.
1767 CallSite::arg_iterator CAI = CS.arg_begin();
1768 for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1769 FAI != FAE; ++FAI, ++CAI) {
1770 assert(CAI != CS.arg_end());
1771 if (Constant *C = dyn_cast<Constant>(CAI))
1772 SimplifiedValues[&*FAI] = C;
1774 Value *PtrArg = *CAI;
1775 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1776 ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1778 // We can SROA any pointer arguments derived from alloca instructions.
1779 if (isa<AllocaInst>(PtrArg)) {
1780 SROAArgValues[&*FAI] = PtrArg;
1781 SROAArgCosts[PtrArg] = 0;
1782 }
1783 }
1784 }
1785 NumConstantArgs = SimplifiedValues.size();
1786 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1787 NumAllocaArgs = SROAArgValues.size();
1789 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1790 // the ephemeral values multiple times (and they're completely determined by
1791 // the callee, so this is purely duplicate work).
1792 SmallPtrSet<const Value *, 32> EphValues;
1793 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1795 // The worklist of live basic blocks in the callee *after* inlining. We avoid
1796 // adding basic blocks of the callee which can be proven to be dead for this
1797 // particular call site in order to get more accurate cost estimates. This
1798 // requires a somewhat heavyweight iteration pattern: we need to walk the
1799 // basic blocks in a breadth-first order as we insert live successors. To
1800 // accomplish this, and because we exit early once we cross our threshold, we
1801 // use a small-size optimized SetVector.
1802 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1803 SmallPtrSet<BasicBlock *, 16>>
1804 BBSetVector;
1805 BBSetVector BBWorklist;
1806 BBWorklist.insert(&F.getEntryBlock());
1807 bool SingleBB = true;
1808 // Note that we *must not* cache the size, this loop grows the worklist.
1809 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1810 // Bail out the moment we cross the threshold. This means we'll under-count
1811 // the cost, but only when undercounting doesn't matter.
1812 if (Cost >= Threshold && !ComputeFullInlineCost)
1813 break;
1815 BasicBlock *BB = BBWorklist[Idx];
1816 if (BB->empty())
1817 continue;
1819 // Disallow inlining a blockaddress. A blockaddress only has defined
1820 // behavior for an indirect branch in the same function, and we do not
1821 // currently support inlining indirect branches. But, the inliner may not
1822 // see an indirect branch that ends up being dead code at a particular call
1823 // site. If the blockaddress escapes the function, e.g., via a global
1824 // variable, inlining may lead to an invalid cross-function reference.
1825 if (BB->hasAddressTaken())
1826 return "blockaddress";
1828 // Analyze the cost of this block. If we blow through the threshold, this
1829 // returns false, and we can bail out.
1830 InlineResult IR = analyzeBlock(BB, EphValues);
1831 if (!IR)
1832 return IR;
1834 TerminatorInst *TI = BB->getTerminator();
1836 // Add in the live successors by first checking whether we have a terminator
1837 // that may be simplified based on the values simplified by this call.
1838 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1839 if (BI->isConditional()) {
1840 Value *Cond = BI->getCondition();
1841 if (ConstantInt *SimpleCond =
1842 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1843 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1844 BBWorklist.insert(NextBB);
1845 KnownSuccessors[BB] = NextBB;
1846 findDeadBlocks(BB, NextBB);
1847 continue;
1848 }
1849 }
1850 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1851 Value *Cond = SI->getCondition();
1852 if (ConstantInt *SimpleCond =
1853 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1854 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1855 BBWorklist.insert(NextBB);
1856 KnownSuccessors[BB] = NextBB;
1857 findDeadBlocks(BB, NextBB);
1858 continue;
1859 }
1860 }
1862 // If we're unable to select a particular successor, just count all of
1863 // them.
1864 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1865 ++TIdx)
1866 BBWorklist.insert(TI->getSuccessor(TIdx));
1868 // If we had any successors at this point, then post-inlining is likely to
1869 // have them as well. Note that we assume any basic blocks which existed
1870 // due to branches or switches which folded above will also fold after
1871 // inlining.
1872 if (SingleBB && TI->getNumSuccessors() > 1) {
1873 // Take off the bonus we applied to the threshold.
1874 Threshold -= SingleBBBonus;
1875 SingleBB = false;
1876 }
1877 }
1879 bool OnlyOneCallAndLocalLinkage =
1880 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1881 // If this is a noduplicate call, we can still inline as long as
1882 // inlining this would cause the removal of the caller (so the instruction
1883 // is not actually duplicated, just moved).
1884 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1885 return "noduplicate";
1887 // We applied the maximum possible vector bonus at the beginning. Now,
1888 // subtract the excess bonus, if any, from the Threshold before
1889 // comparing against Cost.
1890 if (NumVectorInstructions <= NumInstructions / 10)
1891 Threshold -= VectorBonus;
1892 else if (NumVectorInstructions <= NumInstructions / 2)
1893 Threshold -= VectorBonus/2;
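// Note that clamping the threshold to at least 1 in the comparison below means
// a call site whose final cost is zero or negative is still treated as
// profitable even when the threshold has been driven to zero or below.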
1895 return Cost < std::max(1, Threshold);
1896 }
1898 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1899 /// Dump stats about this call's analysis.
1900 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1901 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
1902 DEBUG_PRINT_STAT(NumConstantArgs);
1903 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1904 DEBUG_PRINT_STAT(NumAllocaArgs);
1905 DEBUG_PRINT_STAT(NumConstantPtrCmps);
1906 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1907 DEBUG_PRINT_STAT(NumInstructionsSimplified);
1908 DEBUG_PRINT_STAT(NumInstructions);
1909 DEBUG_PRINT_STAT(SROACostSavings);
1910 DEBUG_PRINT_STAT(SROACostSavingsLost);
1911 DEBUG_PRINT_STAT(LoadEliminationCost);
1912 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1913 DEBUG_PRINT_STAT(Cost);
1914 DEBUG_PRINT_STAT(Threshold);
1915 #undef DEBUG_PRINT_STAT
1916 }
1917 #endif
1919 /// Test that there are no attribute conflicts between Caller and Callee
1920 /// that prevent inlining.
1921 static bool functionsHaveCompatibleAttributes(Function *Caller,
1922 Function *Callee,
1923 TargetTransformInfo &TTI) {
1924 return TTI.areInlineCompatible(Caller, Callee) &&
1925 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1926 }
1928 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1929 int Cost = 0;
1930 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1931 if (CS.isByValArgument(I)) {
1932 // We approximate the number of loads and stores needed by dividing the
1933 // size of the byval type by the target's pointer size.
1934 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1935 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1936 unsigned AS = PTy->getAddressSpace();
1937 unsigned PointerSize = DL.getPointerSizeInBits(AS);
1938 // Ceiling division.
1939 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1941 // If it generates more than 8 stores it is likely to be expanded as an
1942 // inline memcpy so we take that as an upper bound. Otherwise we assume
1943 // one load and one store per word copied.
1944 // FIXME: The maxStoresPerMemcpy setting from the target should be used
1945 // here instead of a magic number of 8, but it's not available via
1946 // DataLayout.
1947 NumStores = std::min(NumStores, 8U);
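// For example, a 256-bit byval aggregate copied with 64-bit pointers needs
// ceil(256 / 64) = 4 words, so the estimate below charges 2 * 4 * InstrCost
// for the loads and stores.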
1949 Cost += 2 * NumStores * InlineConstants::InstrCost;
1950 } else {
1951 // For non-byval arguments subtract off one instruction per call
1952 // argument.
1953 Cost += InlineConstants::InstrCost;
1954 }
1955 }
1956 // The call instruction also disappears after inlining.
1957 Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1958 return Cost;
1959 }
1961 InlineCost llvm::getInlineCost(
1962 CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
1963 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1964 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1965 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1966 return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
1967 GetAssumptionCache, GetBFI, PSI, ORE);
1968 }
1970 InlineCost llvm::getInlineCost(
1971 CallSite CS, Function *Callee, const InlineParams &Params,
1972 TargetTransformInfo &CalleeTTI,
1973 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1974 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1975 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1977 // Cannot inline indirect calls.
1978 if (!Callee)
1979 return llvm::InlineCost::getNever("indirect call");
1981 // Never inline calls with byval arguments that do not have the alloca
1982 // address space. Since byval arguments can be replaced with a copy to an
1983 // alloca, the inlined code would need to be adjusted to handle that the
1984 // argument is in the alloca address space (so it is a little bit complicated
1985 // to solve).
1986 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
1987 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
1988 if (CS.isByValArgument(I)) {
1989 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1990 if (PTy->getAddressSpace() != AllocaAS)
1991 return llvm::InlineCost::getNever("byval arguments without alloca"
1992 " address space");
1993 }
1995 // Calls to functions with always-inline attributes should be inlined
1996 // whenever possible.
1997 if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1998 if (isInlineViable(*Callee))
1999 return llvm::InlineCost::getAlways("always inline attribute");
2000 return llvm::InlineCost::getNever("inapplicable always inline attribute");
2001 }
2003 // Never inline functions with conflicting attributes (unless callee has
2004 // always-inline attribute).
2005 Function *Caller = CS.getCaller();
2006 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
2007 return llvm::InlineCost::getNever("conflicting attributes");
2009 // Don't inline this call if the caller has the optnone attribute.
2010 if (Caller->hasFnAttribute(Attribute::OptimizeNone))
2011 return llvm::InlineCost::getNever("optnone attribute");
2013 // Don't inline a function that treats null pointer as valid into a caller
2014 // that does not have this attribute.
2015 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2016 return llvm::InlineCost::getNever("nullptr definitions incompatible");
2018 // Don't inline functions which can be interposed at link-time.
2019 if (Callee->isInterposable())
2020 return llvm::InlineCost::getNever("interposable");
2022 // Don't inline functions marked noinline.
2023 if (Callee->hasFnAttribute(Attribute::NoInline))
2024 return llvm::InlineCost::getNever("noinline function attribute");
2026 // Don't inline call sites marked noinline.
2027 if (CS.isNoInline())
2028 return llvm::InlineCost::getNever("noinline call site attribute");
2030 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
2031 << "... (caller:" << Caller->getName() << ")\n");
2033 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
2034 Params);
2035 InlineResult ShouldInline = CA.analyzeCall(CS);
2037 LLVM_DEBUG(CA.dump());
2039 // Check if there was a reason to force inlining or no inlining.
2040 if (!ShouldInline && CA.getCost() < CA.getThreshold())
2041 return InlineCost::getNever(ShouldInline.message);
2042 if (ShouldInline && CA.getCost() >= CA.getThreshold())
2043 return InlineCost::getAlways("empty function");
2045 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2046 }
2048 bool llvm::isInlineViable(Function &F) {
2049 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2050 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2051 // Disallow inlining of functions which contain indirect branches or
2052 // blockaddresses.
2053 if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
2054 return false;
2056 for (auto &II : *BI) {
2057 CallSite CS(&II);
2058 if (!CS)
2059 continue;
2061 // Disallow recursive calls.
2062 if (&F == CS.getCalledFunction())
2063 return false;
2065 // Disallow calls which expose returns-twice to a function not previously
2066 // attributed as such.
2067 if (!ReturnsTwice && CS.isCall() &&
2068 cast<CallInst>(CS.getInstruction())->canReturnTwice())
2069 return false;
2071 if (CS.getCalledFunction())
2072 switch (CS.getCalledFunction()->getIntrinsicID()) {
2073 default:
2074 break;
2075 // Disallow inlining of @llvm.icall.branch.funnel because the current
2076 // backend can't separate call targets from call arguments.
2077 case llvm::Intrinsic::icall_branch_funnel:
2078 // Disallow inlining functions that call @llvm.localescape. Doing this
2079 // correctly would require major changes to the inliner.
2080 case llvm::Intrinsic::localescape:
2081 // Disallow inlining of functions that initialize VarArgs with va_start.
2082 case llvm::Intrinsic::vastart:
2083 return false;
2084 }
2085 }
2086 }
2088 return true;
2089 }
2091 // APIs to create InlineParams based on command line flags and/or other
2092 // parameters.
2094 InlineParams llvm::getInlineParams(int Threshold) {
2095 InlineParams Params;
2097 // This field is the threshold to use for a callee by default. This is
2098 // derived from one or more of:
2099 // * optimization or size-optimization levels,
2100 // * a value passed to createFunctionInliningPass function, or
2101 // * the -inline-threshold flag.
2102 // If the -inline-threshold flag is explicitly specified, that is used
2103 // irrespective of anything else.
2104 if (InlineThreshold.getNumOccurrences() > 0)
2105 Params.DefaultThreshold = InlineThreshold;
2106 else
2107 Params.DefaultThreshold = Threshold;
2109 // Set the HintThreshold knob from the -inlinehint-threshold.
2110 Params.HintThreshold = HintThreshold;
2112 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2113 Params.HotCallSiteThreshold = HotCallSiteThreshold;
2115 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2116 // populate LocallyHotCallSiteThreshold. Later, we populate
2117 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2118 // we know that optimization level is O3 (in the getInlineParams variant that
2119 // takes the opt and size levels).
2120 // FIXME: Remove this check (and make the assignment unconditional) after
2121 // addressing size regression issues at O2.
2122 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2123 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2125 // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2126 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2128 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2129 // -inlinehint-threshold commandline option is not explicitly given. If that
2130 // option is present, then its value applies even for callees with size and
2131 // minsize attributes.
2132 // If the -inline-threshold is not specified, set the ColdThreshold from the
2133 // -inlinecold-threshold even if it is not explicitly passed. If
2134 // -inline-threshold is specified, then -inlinecold-threshold needs to be
2135 // explicitly specified to set the ColdThreshold knob.
2136 if (InlineThreshold.getNumOccurrences() == 0) {
2137 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2138 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2139 Params.ColdThreshold = ColdThreshold;
2140 } else if (ColdThreshold.getNumOccurrences() > 0) {
2141 Params.ColdThreshold = ColdThreshold;
2142 }
2143 return Params;
2144 }
2146 InlineParams llvm::getInlineParams() {
2147 return getInlineParams(InlineThreshold);
2148 }
2150 // Compute the default threshold for inlining based on the opt level and the
2151 // size opt level.
2152 static int computeThresholdFromOptLevels(unsigned OptLevel,
2153 unsigned SizeOptLevel) {
2154 if (OptLevel > 2)
2155 return InlineConstants::OptAggressiveThreshold;
2156 if (SizeOptLevel == 1) // -Os
2157 return InlineConstants::OptSizeThreshold;
2158 if (SizeOptLevel == 2) // -Oz
2159 return InlineConstants::OptMinSizeThreshold;
2160 return InlineThreshold;
2161 }
2163 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2164 auto Params =
2165 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2166 // At O3, use the value of -locally-hot-callsite-threshold option to populate
2167 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2168 // when it is specified explicitly.
2169 if (OptLevel > 2)
2170 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2171 return Params;
2172 }