[x86] fix assert with horizontal math + broadcast of vector (PR43402)
[llvm-core.git] / lib / Analysis / InlineCost.cpp
blob57dee459fc2cbe12ba11bd564ad59e8f1e4e9431
1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements inline cost analysis.
11 //===----------------------------------------------------------------------===//
13 #include "llvm/Analysis/InlineCost.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/SetVector.h"
16 #include "llvm/ADT/SmallPtrSet.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/Analysis/AssumptionCache.h"
20 #include "llvm/Analysis/BlockFrequencyInfo.h"
21 #include "llvm/Analysis/CodeMetrics.h"
22 #include "llvm/Analysis/ConstantFolding.h"
23 #include "llvm/Analysis/CFG.h"
24 #include "llvm/Analysis/InstructionSimplify.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/Analysis/ValueTracking.h"
29 #include "llvm/Config/llvm-config.h"
30 #include "llvm/IR/CallingConv.h"
31 #include "llvm/IR/DataLayout.h"
32 #include "llvm/IR/Dominators.h"
33 #include "llvm/IR/GetElementPtrTypeIterator.h"
34 #include "llvm/IR/GlobalAlias.h"
35 #include "llvm/IR/InstVisitor.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Operator.h"
38 #include "llvm/IR/PatternMatch.h"
39 #include "llvm/Support/Debug.h"
40 #include "llvm/Support/raw_ostream.h"
42 using namespace llvm;
44 #define DEBUG_TYPE "inline-cost"
46 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
48 static cl::opt<int> InlineThreshold(
49 "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
50 cl::desc("Control the amount of inlining to perform (default = 225)"));
52 static cl::opt<int> HintThreshold(
53 "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
54 cl::desc("Threshold for inlining functions with inline hint"));
56 static cl::opt<int>
57 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
58 cl::init(45), cl::ZeroOrMore,
59 cl::desc("Threshold for inlining cold callsites"));
61 // We introduce this threshold to help performance of instrumentation based
62 // PGO before we actually hook up inliner with analysis passes such as BPI and
63 // BFI.
64 static cl::opt<int> ColdThreshold(
65 "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
66 cl::desc("Threshold for inlining functions with cold attribute"));
68 static cl::opt<int>
69 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
70 cl::ZeroOrMore,
71 cl::desc("Threshold for hot callsites "));
73 static cl::opt<int> LocallyHotCallSiteThreshold(
74 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
75 cl::desc("Threshold for locally hot callsites "));
77 static cl::opt<int> ColdCallSiteRelFreq(
78 "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
79 cl::desc("Maximum block frequency, expressed as a percentage of caller's "
80 "entry frequency, for a callsite to be cold in the absence of "
81 "profile information."));
83 static cl::opt<int> HotCallSiteRelFreq(
84 "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
85 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
86 "entry frequency, for a callsite to be hot in the absence of "
87 "profile information."));
89 static cl::opt<bool> OptComputeFullInlineCost(
90 "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
91 cl::desc("Compute the full inline cost of a call site even when the cost "
92 "exceeds the threshold."));
94 namespace {
96 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
97 typedef InstVisitor<CallAnalyzer, bool> Base;
98 friend class InstVisitor<CallAnalyzer, bool>;
100 /// The TargetTransformInfo available for this compilation.
101 const TargetTransformInfo &TTI;
103 /// Getter for the cache of @llvm.assume intrinsics.
104 std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
106 /// Getter for BlockFrequencyInfo
107 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
109 /// Profile summary information.
110 ProfileSummaryInfo *PSI;
112 /// The called function.
113 Function &F;
115 // Cache the DataLayout since we use it a lot.
116 const DataLayout &DL;
118 /// The OptimizationRemarkEmitter available for this compilation.
119 OptimizationRemarkEmitter *ORE;
121 /// The candidate callsite being analyzed. Please do not use this to do
122 /// analysis in the caller function; we want the inline cost query to be
123 /// easily cacheable. Instead, use the cover function paramHasAttr.
124 CallBase &CandidateCall;
126 /// Tunable parameters that control the analysis.
127 const InlineParams &Params;
129 /// Upper bound for the inlining cost. Bonuses are being applied to account
130 /// for speculative "expected profit" of the inlining decision.
131 int Threshold;
133 /// Inlining cost measured in abstract units, accounts for all the
134 /// instructions expected to be executed for a given function invocation.
135 /// Instructions that are statically proven to be dead based on call-site
136 /// arguments are not counted here.
137 int Cost = 0;
139 bool ComputeFullInlineCost;
141 bool IsCallerRecursive = false;
142 bool IsRecursiveCall = false;
143 bool ExposesReturnsTwice = false;
144 bool HasDynamicAlloca = false;
145 bool ContainsNoDuplicateCall = false;
146 bool HasReturn = false;
147 bool HasIndirectBr = false;
148 bool HasUninlineableIntrinsic = false;
149 bool InitsVargArgs = false;
151 /// Number of bytes allocated statically by the callee.
152 uint64_t AllocatedSize = 0;
153 unsigned NumInstructions = 0;
154 unsigned NumVectorInstructions = 0;
156 /// Bonus to be applied when percentage of vector instructions in callee is
157 /// high (see more details in updateThreshold).
158 int VectorBonus = 0;
159 /// Bonus to be applied when the callee has only one reachable basic block.
160 int SingleBBBonus = 0;
162 /// While we walk the potentially-inlined instructions, we build up and
163 /// maintain a mapping of simplified values specific to this callsite. The
164 /// idea is to propagate any special information we have about arguments to
165 /// this call through the inlinable section of the function, and account for
166 /// likely simplifications post-inlining. The most important aspect we track
167 /// is CFG altering simplifications -- when we prove a basic block dead, that
168 /// can cause dramatic shifts in the cost of inlining a function.
169 DenseMap<Value *, Constant *> SimplifiedValues;
171 /// Keep track of the values which map back (through function arguments) to
172 /// allocas on the caller stack which could be simplified through SROA.
173 DenseMap<Value *, Value *> SROAArgValues;
175 /// The mapping of caller Alloca values to their accumulated cost savings. If
176 /// we have to disable SROA for one of the allocas, this tells us how much
177 /// cost must be added.
178 DenseMap<Value *, int> SROAArgCosts;
180 /// Keep track of values which map to a pointer base and constant offset.
181 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
183 /// Keep track of dead blocks due to the constant arguments.
184 SetVector<BasicBlock *> DeadBlocks;
186 /// The mapping of the blocks to their known unique successors due to the
187 /// constant arguments.
188 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
190 /// Model the elimination of repeated loads that is expected to happen
191 /// whenever we simplify away the stores that would otherwise cause them to be
192 /// loads.
193 bool EnableLoadElimination;
194 SmallPtrSet<Value *, 16> LoadAddrSet;
195 int LoadEliminationCost = 0;
197 // Custom simplification helper routines.
198 bool isAllocaDerivedArg(Value *V);
199 bool lookupSROAArgAndCost(Value *V, Value *&Arg,
200 DenseMap<Value *, int>::iterator &CostIt);
201 void disableSROA(DenseMap<Value *, int>::iterator CostIt);
202 void disableSROA(Value *V);
203 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
204 void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
205 int InstructionCost);
206 void disableLoadElimination();
207 bool isGEPFree(GetElementPtrInst &GEP);
208 bool canFoldInboundsGEP(GetElementPtrInst &I);
209 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
210 bool simplifyCallSite(Function *F, CallBase &Call);
211 template <typename Callable>
212 bool simplifyInstruction(Instruction &I, Callable Evaluate);
213 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
215 /// Return true if the given argument to the function being considered for
216 /// inlining has the given attribute set either at the call site or the
217 /// function declaration. Primarily used to inspect call site specific
218 /// attributes since these can be more precise than the ones on the callee
219 /// itself.
220 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
222 /// Return true if the given value is known non null within the callee if
223 /// inlined through this particular callsite.
224 bool isKnownNonNullInCallee(Value *V);
226 /// Update Threshold based on callsite properties such as callee
227 /// attributes and callee hotness for PGO builds. The Callee is explicitly
228 /// passed to support analyzing indirect calls whose target is inferred by
229 /// analysis.
230 void updateThreshold(CallBase &Call, Function &Callee);
232 /// Return true if size growth is allowed when inlining the callee at \p Call.
233 bool allowSizeGrowth(CallBase &Call);
235 /// Return true if \p Call is a cold callsite.
236 bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
238 /// Return a higher threshold if \p Call is a hot callsite.
239 Optional<int> getHotCallSiteThreshold(CallBase &Call,
240 BlockFrequencyInfo *CallerBFI);
242 // Custom analysis routines.
243 InlineResult analyzeBlock(BasicBlock *BB,
244 SmallPtrSetImpl<const Value *> &EphValues);
246 /// Handle a capped 'int' increment for Cost.
247 void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
248 assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
249 Cost = (int)std::min(UpperBound, Cost + Inc);
252 // Disable several entry points to the visitor so we don't accidentally use
253 // them by declaring but not defining them here.
254 void visit(Module *);
255 void visit(Module &);
256 void visit(Function *);
257 void visit(Function &);
258 void visit(BasicBlock *);
259 void visit(BasicBlock &);
261 // Provide base case for our instruction visit.
262 bool visitInstruction(Instruction &I);
264 // Our visit overrides.
265 bool visitAlloca(AllocaInst &I);
266 bool visitPHI(PHINode &I);
267 bool visitGetElementPtr(GetElementPtrInst &I);
268 bool visitBitCast(BitCastInst &I);
269 bool visitPtrToInt(PtrToIntInst &I);
270 bool visitIntToPtr(IntToPtrInst &I);
271 bool visitCastInst(CastInst &I);
272 bool visitUnaryInstruction(UnaryInstruction &I);
273 bool visitCmpInst(CmpInst &I);
274 bool visitSub(BinaryOperator &I);
275 bool visitBinaryOperator(BinaryOperator &I);
276 bool visitFNeg(UnaryOperator &I);
277 bool visitLoad(LoadInst &I);
278 bool visitStore(StoreInst &I);
279 bool visitExtractValue(ExtractValueInst &I);
280 bool visitInsertValue(InsertValueInst &I);
281 bool visitCallBase(CallBase &Call);
282 bool visitReturnInst(ReturnInst &RI);
283 bool visitBranchInst(BranchInst &BI);
284 bool visitSelectInst(SelectInst &SI);
285 bool visitSwitchInst(SwitchInst &SI);
286 bool visitIndirectBrInst(IndirectBrInst &IBI);
287 bool visitResumeInst(ResumeInst &RI);
288 bool visitCleanupReturnInst(CleanupReturnInst &RI);
289 bool visitCatchReturnInst(CatchReturnInst &RI);
290 bool visitUnreachableInst(UnreachableInst &I);
292 public:
293 CallAnalyzer(const TargetTransformInfo &TTI,
294 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
295 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
296 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
297 Function &Callee, CallBase &Call, const InlineParams &Params)
298 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
299 PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
300 CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
301 ComputeFullInlineCost(OptComputeFullInlineCost ||
302 Params.ComputeFullInlineCost || ORE),
303 EnableLoadElimination(true) {}
305 InlineResult analyzeCall(CallBase &Call);
307 int getThreshold() { return Threshold; }
308 int getCost() { return Cost; }
310 // Keep a bunch of stats about the cost savings found so we can print them
311 // out when debugging.
312 unsigned NumConstantArgs = 0;
313 unsigned NumConstantOffsetPtrArgs = 0;
314 unsigned NumAllocaArgs = 0;
315 unsigned NumConstantPtrCmps = 0;
316 unsigned NumConstantPtrDiffs = 0;
317 unsigned NumInstructionsSimplified = 0;
318 unsigned SROACostSavings = 0;
319 unsigned SROACostSavingsLost = 0;
321 void dump();
324 } // namespace
326 /// Test whether the given value is an Alloca-derived function argument.
327 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
328 return SROAArgValues.count(V);
331 /// Lookup the SROA-candidate argument and cost iterator which V maps to.
332 /// Returns false if V does not map to a SROA-candidate.
333 bool CallAnalyzer::lookupSROAArgAndCost(
334 Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
335 if (SROAArgValues.empty() || SROAArgCosts.empty())
336 return false;
338 DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
339 if (ArgIt == SROAArgValues.end())
340 return false;
342 Arg = ArgIt->second;
343 CostIt = SROAArgCosts.find(Arg);
344 return CostIt != SROAArgCosts.end();
347 /// Disable SROA for the candidate marked by this cost iterator.
349 /// This marks the candidate as no longer viable for SROA, and adds the cost
350 /// savings associated with it back into the inline cost measurement.
351 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
352 // If we're no longer able to perform SROA we need to undo its cost savings
353 // and prevent subsequent analysis.
354 addCost(CostIt->second);
355 SROACostSavings -= CostIt->second;
356 SROACostSavingsLost += CostIt->second;
357 SROAArgCosts.erase(CostIt);
358 disableLoadElimination();
361 /// If 'V' maps to a SROA candidate, disable SROA for it.
362 void CallAnalyzer::disableSROA(Value *V) {
363 Value *SROAArg;
364 DenseMap<Value *, int>::iterator CostIt;
365 if (lookupSROAArgAndCost(V, SROAArg, CostIt))
366 disableSROA(CostIt);
369 /// Accumulate the given cost for a particular SROA candidate.
370 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
371 int InstructionCost) {
372 CostIt->second += InstructionCost;
373 SROACostSavings += InstructionCost;
376 void CallAnalyzer::disableLoadElimination() {
377 if (EnableLoadElimination) {
378 addCost(LoadEliminationCost);
379 LoadEliminationCost = 0;
380 EnableLoadElimination = false;
384 /// Accumulate a constant GEP offset into an APInt if possible.
386 /// Returns false if unable to compute the offset for any reason. Respects any
387 /// simplified values known during the analysis of this callsite.
388 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
389 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
390 assert(IntPtrWidth == Offset.getBitWidth());
392 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
393 GTI != GTE; ++GTI) {
394 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
395 if (!OpC)
396 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
397 OpC = dyn_cast<ConstantInt>(SimpleOp);
398 if (!OpC)
399 return false;
400 if (OpC->isZero())
401 continue;
403 // Handle a struct index, which adds its field offset to the pointer.
404 if (StructType *STy = GTI.getStructTypeOrNull()) {
405 unsigned ElementIdx = OpC->getZExtValue();
406 const StructLayout *SL = DL.getStructLayout(STy);
407 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
408 continue;
411 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
412 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
414 return true;
417 /// Use TTI to check whether a GEP is free.
419 /// Respects any simplified values known during the analysis of this callsite.
420 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
421 SmallVector<Value *, 4> Operands;
422 Operands.push_back(GEP.getOperand(0));
423 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
424 if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
425 Operands.push_back(SimpleOp);
426 else
427 Operands.push_back(*I);
428 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
431 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
432 // Check whether inlining will turn a dynamic alloca into a static
433 // alloca and handle that case.
434 if (I.isArrayAllocation()) {
435 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
436 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
437 Type *Ty = I.getAllocatedType();
438 AllocatedSize = SaturatingMultiplyAdd(
439 AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
440 return Base::visitAlloca(I);
444 // Accumulate the allocated size.
445 if (I.isStaticAlloca()) {
446 Type *Ty = I.getAllocatedType();
447 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
450 // We will happily inline static alloca instructions.
451 if (I.isStaticAlloca())
452 return Base::visitAlloca(I);
454 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
455 // a variety of reasons, and so we would like to not inline them into
456 // functions which don't currently have a dynamic alloca. This simply
457 // disables inlining altogether in the presence of a dynamic alloca.
458 HasDynamicAlloca = true;
459 return false;
462 bool CallAnalyzer::visitPHI(PHINode &I) {
463 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
464 // though we don't want to propagate it's bonuses. The idea is to disable
465 // SROA if it *might* be used in an inappropriate manner.
467 // Phi nodes are always zero-cost.
468 // FIXME: Pointer sizes may differ between different address spaces, so do we
469 // need to use correct address space in the call to getPointerSizeInBits here?
470 // Or could we skip the getPointerSizeInBits call completely? As far as I can
471 // see the ZeroOffset is used as a dummy value, so we can probably use any
472 // bit width for the ZeroOffset?
473 APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
474 bool CheckSROA = I.getType()->isPointerTy();
476 // Track the constant or pointer with constant offset we've seen so far.
477 Constant *FirstC = nullptr;
478 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
479 Value *FirstV = nullptr;
481 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
482 BasicBlock *Pred = I.getIncomingBlock(i);
483 // If the incoming block is dead, skip the incoming block.
484 if (DeadBlocks.count(Pred))
485 continue;
486 // If the parent block of phi is not the known successor of the incoming
487 // block, skip the incoming block.
488 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
489 if (KnownSuccessor && KnownSuccessor != I.getParent())
490 continue;
492 Value *V = I.getIncomingValue(i);
493 // If the incoming value is this phi itself, skip the incoming value.
494 if (&I == V)
495 continue;
497 Constant *C = dyn_cast<Constant>(V);
498 if (!C)
499 C = SimplifiedValues.lookup(V);
501 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
502 if (!C && CheckSROA)
503 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
505 if (!C && !BaseAndOffset.first)
506 // The incoming value is neither a constant nor a pointer with constant
507 // offset, exit early.
508 return true;
510 if (FirstC) {
511 if (FirstC == C)
512 // If we've seen a constant incoming value before and it is the same
513 // constant we see this time, continue checking the next incoming value.
514 continue;
515 // Otherwise early exit because we either see a different constant or saw
516 // a constant before but we have a pointer with constant offset this time.
517 return true;
520 if (FirstV) {
521 // The same logic as above, but check pointer with constant offset here.
522 if (FirstBaseAndOffset == BaseAndOffset)
523 continue;
524 return true;
527 if (C) {
528 // This is the 1st time we've seen a constant, record it.
529 FirstC = C;
530 continue;
533 // The remaining case is that this is the 1st time we've seen a pointer with
534 // constant offset, record it.
535 FirstV = V;
536 FirstBaseAndOffset = BaseAndOffset;
539 // Check if we can map phi to a constant.
540 if (FirstC) {
541 SimplifiedValues[&I] = FirstC;
542 return true;
545 // Check if we can map phi to a pointer with constant offset.
546 if (FirstBaseAndOffset.first) {
547 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
549 Value *SROAArg;
550 DenseMap<Value *, int>::iterator CostIt;
551 if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
552 SROAArgValues[&I] = SROAArg;
555 return true;
558 /// Check we can fold GEPs of constant-offset call site argument pointers.
559 /// This requires target data and inbounds GEPs.
561 /// \return true if the specified GEP can be folded.
562 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
563 // Check if we have a base + offset for the pointer.
564 std::pair<Value *, APInt> BaseAndOffset =
565 ConstantOffsetPtrs.lookup(I.getPointerOperand());
566 if (!BaseAndOffset.first)
567 return false;
569 // Check if the offset of this GEP is constant, and if so accumulate it
570 // into Offset.
571 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
572 return false;
574 // Add the result as a new mapping to Base + Offset.
575 ConstantOffsetPtrs[&I] = BaseAndOffset;
577 return true;
580 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
581 Value *SROAArg;
582 DenseMap<Value *, int>::iterator CostIt;
583 bool SROACandidate =
584 lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
586 // Lambda to check whether a GEP's indices are all constant.
587 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
588 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
589 if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
590 return false;
591 return true;
594 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
595 if (SROACandidate)
596 SROAArgValues[&I] = SROAArg;
598 // Constant GEPs are modeled as free.
599 return true;
602 // Variable GEPs will require math and will disable SROA.
603 if (SROACandidate)
604 disableSROA(CostIt);
605 return isGEPFree(I);
608 /// Simplify \p I if its operands are constants and update SimplifiedValues.
609 /// \p Evaluate is a callable specific to instruction type that evaluates the
610 /// instruction when all the operands are constants.
611 template <typename Callable>
612 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
613 SmallVector<Constant *, 2> COps;
614 for (Value *Op : I.operands()) {
615 Constant *COp = dyn_cast<Constant>(Op);
616 if (!COp)
617 COp = SimplifiedValues.lookup(Op);
618 if (!COp)
619 return false;
620 COps.push_back(COp);
622 auto *C = Evaluate(COps);
623 if (!C)
624 return false;
625 SimplifiedValues[&I] = C;
626 return true;
629 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
630 // Propagate constants through bitcasts.
631 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
632 return ConstantExpr::getBitCast(COps[0], I.getType());
634 return true;
636 // Track base/offsets through casts
637 std::pair<Value *, APInt> BaseAndOffset =
638 ConstantOffsetPtrs.lookup(I.getOperand(0));
639 // Casts don't change the offset, just wrap it up.
640 if (BaseAndOffset.first)
641 ConstantOffsetPtrs[&I] = BaseAndOffset;
643 // Also look for SROA candidates here.
644 Value *SROAArg;
645 DenseMap<Value *, int>::iterator CostIt;
646 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
647 SROAArgValues[&I] = SROAArg;
649 // Bitcasts are always zero cost.
650 return true;
653 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
654 // Propagate constants through ptrtoint.
655 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
656 return ConstantExpr::getPtrToInt(COps[0], I.getType());
658 return true;
660 // Track base/offset pairs when converted to a plain integer provided the
661 // integer is large enough to represent the pointer.
662 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
663 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
664 if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
665 std::pair<Value *, APInt> BaseAndOffset =
666 ConstantOffsetPtrs.lookup(I.getOperand(0));
667 if (BaseAndOffset.first)
668 ConstantOffsetPtrs[&I] = BaseAndOffset;
671 // This is really weird. Technically, ptrtoint will disable SROA. However,
672 // unless that ptrtoint is *used* somewhere in the live basic blocks after
673 // inlining, it will be nuked, and SROA should proceed. All of the uses which
674 // would block SROA would also block SROA if applied directly to a pointer,
675 // and so we can just add the integer in here. The only places where SROA is
676 // preserved either cannot fire on an integer, or won't in-and-of themselves
677 // disable SROA (ext) w/o some later use that we would see and disable.
678 Value *SROAArg;
679 DenseMap<Value *, int>::iterator CostIt;
680 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
681 SROAArgValues[&I] = SROAArg;
683 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
686 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
687 // Propagate constants through ptrtoint.
688 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
689 return ConstantExpr::getIntToPtr(COps[0], I.getType());
691 return true;
693 // Track base/offset pairs when round-tripped through a pointer without
694 // modifications provided the integer is not too large.
695 Value *Op = I.getOperand(0);
696 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
697 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
698 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
699 if (BaseAndOffset.first)
700 ConstantOffsetPtrs[&I] = BaseAndOffset;
703 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
704 Value *SROAArg;
705 DenseMap<Value *, int>::iterator CostIt;
706 if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
707 SROAArgValues[&I] = SROAArg;
709 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
712 bool CallAnalyzer::visitCastInst(CastInst &I) {
713 // Propagate constants through casts.
714 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
715 return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
717 return true;
719 // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
720 disableSROA(I.getOperand(0));
722 // If this is a floating-point cast, and the target says this operation
723 // is expensive, this may eventually become a library call. Treat the cost
724 // as such.
725 switch (I.getOpcode()) {
726 case Instruction::FPTrunc:
727 case Instruction::FPExt:
728 case Instruction::UIToFP:
729 case Instruction::SIToFP:
730 case Instruction::FPToUI:
731 case Instruction::FPToSI:
732 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
733 addCost(InlineConstants::CallPenalty);
734 break;
735 default:
736 break;
739 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
742 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
743 Value *Operand = I.getOperand(0);
744 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
745 return ConstantFoldInstOperands(&I, COps[0], DL);
747 return true;
749 // Disable any SROA on the argument to arbitrary unary instructions.
750 disableSROA(Operand);
752 return false;
755 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
756 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
759 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
760 // Does the *call site* have the NonNull attribute set on an argument? We
761 // use the attribute on the call site to memoize any analysis done in the
762 // caller. This will also trip if the callee function has a non-null
763 // parameter attribute, but that's a less interesting case because hopefully
764 // the callee would already have been simplified based on that.
765 if (Argument *A = dyn_cast<Argument>(V))
766 if (paramHasAttr(A, Attribute::NonNull))
767 return true;
769 // Is this an alloca in the caller? This is distinct from the attribute case
770 // above because attributes aren't updated within the inliner itself and we
771 // always want to catch the alloca derived case.
772 if (isAllocaDerivedArg(V))
773 // We can actually predict the result of comparisons between an
774 // alloca-derived value and null. Note that this fires regardless of
775 // SROA firing.
776 return true;
778 return false;
781 bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
782 // If the normal destination of the invoke or the parent block of the call
783 // site is unreachable-terminated, there is little point in inlining this
784 // unless there is literally zero cost.
785 // FIXME: Note that it is possible that an unreachable-terminated block has a
786 // hot entry. For example, in below scenario inlining hot_call_X() may be
787 // beneficial :
788 // main() {
789 // hot_call_1();
790 // ...
791 // hot_call_N()
792 // exit(0);
793 // }
794 // For now, we are not handling this corner case here as it is rare in real
795 // code. In future, we should elaborate this based on BPI and BFI in more
796 // general threshold adjusting heuristics in updateThreshold().
797 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
798 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
799 return false;
800 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
801 return false;
803 return true;
806 bool CallAnalyzer::isColdCallSite(CallBase &Call,
807 BlockFrequencyInfo *CallerBFI) {
808 // If global profile summary is available, then callsite's coldness is
809 // determined based on that.
810 if (PSI && PSI->hasProfileSummary())
811 return PSI->isColdCallSite(CallSite(&Call), CallerBFI);
813 // Otherwise we need BFI to be available.
814 if (!CallerBFI)
815 return false;
817 // Determine if the callsite is cold relative to caller's entry. We could
818 // potentially cache the computation of scaled entry frequency, but the added
819 // complexity is not worth it unless this scaling shows up high in the
820 // profiles.
821 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
822 auto CallSiteBB = Call.getParent();
823 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
824 auto CallerEntryFreq =
825 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
826 return CallSiteFreq < CallerEntryFreq * ColdProb;
829 Optional<int>
830 CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
831 BlockFrequencyInfo *CallerBFI) {
833 // If global profile summary is available, then callsite's hotness is
834 // determined based on that.
835 if (PSI && PSI->hasProfileSummary() &&
836 PSI->isHotCallSite(CallSite(&Call), CallerBFI))
837 return Params.HotCallSiteThreshold;
839 // Otherwise we need BFI to be available and to have a locally hot callsite
840 // threshold.
841 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
842 return None;
844 // Determine if the callsite is hot relative to caller's entry. We could
845 // potentially cache the computation of scaled entry frequency, but the added
846 // complexity is not worth it unless this scaling shows up high in the
847 // profiles.
848 auto CallSiteBB = Call.getParent();
849 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
850 auto CallerEntryFreq = CallerBFI->getEntryFreq();
851 if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
852 return Params.LocallyHotCallSiteThreshold;
854 // Otherwise treat it normally.
855 return None;
858 void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
859 // If no size growth is allowed for this inlining, set Threshold to 0.
860 if (!allowSizeGrowth(Call)) {
861 Threshold = 0;
862 return;
865 Function *Caller = Call.getCaller();
867 // return min(A, B) if B is valid.
868 auto MinIfValid = [](int A, Optional<int> B) {
869 return B ? std::min(A, B.getValue()) : A;
872 // return max(A, B) if B is valid.
873 auto MaxIfValid = [](int A, Optional<int> B) {
874 return B ? std::max(A, B.getValue()) : A;
877 // Various bonus percentages. These are multiplied by Threshold to get the
878 // bonus values.
879 // SingleBBBonus: This bonus is applied if the callee has a single reachable
880 // basic block at the given callsite context. This is speculatively applied
881 // and withdrawn if more than one basic block is seen.
883 // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
884 // of the last call to a static function as inlining such functions is
885 // guaranteed to reduce code size.
887 // These bonus percentages may be set to 0 based on properties of the caller
888 // and the callsite.
889 int SingleBBBonusPercent = 50;
890 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
891 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
893 // Lambda to set all the above bonus and bonus percentages to 0.
894 auto DisallowAllBonuses = [&]() {
895 SingleBBBonusPercent = 0;
896 VectorBonusPercent = 0;
897 LastCallToStaticBonus = 0;
900 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
901 // and reduce the threshold if the caller has the necessary attribute.
902 if (Caller->hasMinSize()) {
903 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
904 // For minsize, we want to disable the single BB bonus and the vector
905 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
906 // a static function will, at the minimum, eliminate the parameter setup and
907 // call/return instructions.
908 SingleBBBonusPercent = 0;
909 VectorBonusPercent = 0;
910 } else if (Caller->hasOptSize())
911 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
913 // Adjust the threshold based on inlinehint attribute and profile based
914 // hotness information if the caller does not have MinSize attribute.
915 if (!Caller->hasMinSize()) {
916 if (Callee.hasFnAttribute(Attribute::InlineHint))
917 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
919 // FIXME: After switching to the new passmanager, simplify the logic below
920 // by checking only the callsite hotness/coldness as we will reliably
921 // have local profile information.
923 // Callsite hotness and coldness can be determined if sample profile is
924 // used (which adds hotness metadata to calls) or if caller's
925 // BlockFrequencyInfo is available.
926 BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
927 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
928 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
929 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
930 // FIXME: This should update the threshold only if it exceeds the
931 // current threshold, but AutoFDO + ThinLTO currently relies on this
932 // behavior to prevent inlining of hot callsites during ThinLTO
933 // compile phase.
934 Threshold = HotCallSiteThreshold.getValue();
935 } else if (isColdCallSite(Call, CallerBFI)) {
936 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
937 // Do not apply bonuses for a cold callsite including the
938 // LastCallToStatic bonus. While this bonus might result in code size
939 // reduction, it can cause the size of a non-cold caller to increase
940 // preventing it from being inlined.
941 DisallowAllBonuses();
942 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
943 } else if (PSI) {
944 // Use callee's global profile information only if we have no way of
945 // determining this via callsite information.
946 if (PSI->isFunctionEntryHot(&Callee)) {
947 LLVM_DEBUG(dbgs() << "Hot callee.\n");
948 // If callsite hotness can not be determined, we may still know
949 // that the callee is hot and treat it as a weaker hint for threshold
950 // increase.
951 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
952 } else if (PSI->isFunctionEntryCold(&Callee)) {
953 LLVM_DEBUG(dbgs() << "Cold callee.\n");
954 // Do not apply bonuses for a cold callee including the
955 // LastCallToStatic bonus. While this bonus might result in code size
956 // reduction, it can cause the size of a non-cold caller to increase
957 // preventing it from being inlined.
958 DisallowAllBonuses();
959 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
964 // Finally, take the target-specific inlining threshold multiplier into
965 // account.
966 Threshold *= TTI.getInliningThresholdMultiplier();
968 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
969 VectorBonus = Threshold * VectorBonusPercent / 100;
971 bool OnlyOneCallAndLocalLinkage =
972 F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
973 // If there is only one call of the function, and it has internal linkage,
974 // the cost of inlining it drops dramatically. It may seem odd to update
975 // Cost in updateThreshold, but the bonus depends on the logic in this method.
976 if (OnlyOneCallAndLocalLinkage)
977 Cost -= LastCallToStaticBonus;
980 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
981 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
982 // First try to handle simplified comparisons.
983 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
984 return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
986 return true;
988 if (I.getOpcode() == Instruction::FCmp)
989 return false;
991 // Otherwise look for a comparison between constant offset pointers with
992 // a common base.
993 Value *LHSBase, *RHSBase;
994 APInt LHSOffset, RHSOffset;
995 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
996 if (LHSBase) {
997 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
998 if (RHSBase && LHSBase == RHSBase) {
999 // We have common bases, fold the icmp to a constant based on the
1000 // offsets.
1001 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1002 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1003 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1004 SimplifiedValues[&I] = C;
1005 ++NumConstantPtrCmps;
1006 return true;
1011 // If the comparison is an equality comparison with null, we can simplify it
1012 // if we know the value (argument) can't be null
1013 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1014 isKnownNonNullInCallee(I.getOperand(0))) {
1015 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1016 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1017 : ConstantInt::getFalse(I.getType());
1018 return true;
1020 // Finally check for SROA candidates in comparisons.
1021 Value *SROAArg;
1022 DenseMap<Value *, int>::iterator CostIt;
1023 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
1024 if (isa<ConstantPointerNull>(I.getOperand(1))) {
1025 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1026 return true;
1029 disableSROA(CostIt);
1032 return false;
1035 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1036 // Try to handle a special case: we can fold computing the difference of two
1037 // constant-related pointers.
1038 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1039 Value *LHSBase, *RHSBase;
1040 APInt LHSOffset, RHSOffset;
1041 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1042 if (LHSBase) {
1043 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1044 if (RHSBase && LHSBase == RHSBase) {
1045 // We have common bases, fold the subtract to a constant based on the
1046 // offsets.
1047 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1048 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1049 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1050 SimplifiedValues[&I] = C;
1051 ++NumConstantPtrDiffs;
1052 return true;
1057 // Otherwise, fall back to the generic logic for simplifying and handling
1058 // instructions.
1059 return Base::visitSub(I);
1062 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1063 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1064 Constant *CLHS = dyn_cast<Constant>(LHS);
1065 if (!CLHS)
1066 CLHS = SimplifiedValues.lookup(LHS);
1067 Constant *CRHS = dyn_cast<Constant>(RHS);
1068 if (!CRHS)
1069 CRHS = SimplifiedValues.lookup(RHS);
1071 Value *SimpleV = nullptr;
1072 if (auto FI = dyn_cast<FPMathOperator>(&I))
1073 SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
1074 CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
1075 else
1076 SimpleV =
1077 SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1079 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1080 SimplifiedValues[&I] = C;
1082 if (SimpleV)
1083 return true;
1085 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1086 disableSROA(LHS);
1087 disableSROA(RHS);
1089 // If the instruction is floating point, and the target says this operation
1090 // is expensive, this may eventually become a library call. Treat the cost
1091 // as such. Unless it's fneg which can be implemented with an xor.
1092 using namespace llvm::PatternMatch;
1093 if (I.getType()->isFloatingPointTy() &&
1094 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
1095 !match(&I, m_FNeg(m_Value())))
1096 addCost(InlineConstants::CallPenalty);
1098 return false;
1101 bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
1102 Value *Op = I.getOperand(0);
1103 Constant *COp = dyn_cast<Constant>(Op);
1104 if (!COp)
1105 COp = SimplifiedValues.lookup(Op);
1107 Value *SimpleV = SimplifyFNegInst(COp ? COp : Op,
1108 cast<FPMathOperator>(I).getFastMathFlags(),
1109 DL);
1111 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1112 SimplifiedValues[&I] = C;
1114 if (SimpleV)
1115 return true;
1117 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
1118 disableSROA(Op);
1120 return false;
1123 bool CallAnalyzer::visitLoad(LoadInst &I) {
1124 Value *SROAArg;
1125 DenseMap<Value *, int>::iterator CostIt;
1126 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1127 if (I.isSimple()) {
1128 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1129 return true;
1132 disableSROA(CostIt);
1135 // If the data is already loaded from this address and hasn't been clobbered
1136 // by any stores or calls, this load is likely to be redundant and can be
1137 // eliminated.
1138 if (EnableLoadElimination &&
1139 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1140 LoadEliminationCost += InlineConstants::InstrCost;
1141 return true;
1144 return false;
1147 bool CallAnalyzer::visitStore(StoreInst &I) {
1148 Value *SROAArg;
1149 DenseMap<Value *, int>::iterator CostIt;
1150 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1151 if (I.isSimple()) {
1152 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1153 return true;
1156 disableSROA(CostIt);
1159 // The store can potentially clobber loads and prevent repeated loads from
1160 // being eliminated.
1161 // FIXME:
1162 // 1. We can probably keep an initial set of eliminatable loads substracted
1163 // from the cost even when we finally see a store. We just need to disable
1164 // *further* accumulation of elimination savings.
1165 // 2. We should probably at some point thread MemorySSA for the callee into
1166 // this and then use that to actually compute *really* precise savings.
1167 disableLoadElimination();
1168 return false;
1171 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1172 // Constant folding for extract value is trivial.
1173 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1174 return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1176 return true;
1178 // SROA can look through these but give them a cost.
1179 return false;
1182 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1183 // Constant folding for insert value is trivial.
1184 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1185 return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1186 /*InsertedValueOperand*/ COps[1],
1187 I.getIndices());
1189 return true;
1191 // SROA can look through these but give them a cost.
1192 return false;
1195 /// Try to simplify a call site.
1197 /// Takes a concrete function and callsite and tries to actually simplify it by
1198 /// analyzing the arguments and call itself with instsimplify. Returns true if
1199 /// it has simplified the callsite to some other entity (a constant), making it
1200 /// free.
1201 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
1202 // FIXME: Using the instsimplify logic directly for this is inefficient
1203 // because we have to continually rebuild the argument list even when no
1204 // simplifications can be performed. Until that is fixed with remapping
1205 // inside of instsimplify, directly constant fold calls here.
1206 if (!canConstantFoldCallTo(&Call, F))
1207 return false;
1209 // Try to re-map the arguments to constants.
1210 SmallVector<Constant *, 4> ConstantArgs;
1211 ConstantArgs.reserve(Call.arg_size());
1212 for (Value *I : Call.args()) {
1213 Constant *C = dyn_cast<Constant>(I);
1214 if (!C)
1215 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
1216 if (!C)
1217 return false; // This argument doesn't map to a constant.
1219 ConstantArgs.push_back(C);
1221 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
1222 SimplifiedValues[&Call] = C;
1223 return true;
1226 return false;
1229 bool CallAnalyzer::visitCallBase(CallBase &Call) {
1230 if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
1231 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1232 // This aborts the entire analysis.
1233 ExposesReturnsTwice = true;
1234 return false;
1236 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
1237 ContainsNoDuplicateCall = true;
1239 if (Function *F = Call.getCalledFunction()) {
1240 // When we have a concrete function, first try to simplify it directly.
1241 if (simplifyCallSite(F, Call))
1242 return true;
1244 // Next check if it is an intrinsic we know about.
1245 // FIXME: Lift this into part of the InstVisitor.
1246 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
1247 switch (II->getIntrinsicID()) {
1248 default:
1249 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1250 disableLoadElimination();
1251 return Base::visitCallBase(Call);
1253 case Intrinsic::load_relative:
1254 // This is normally lowered to 4 LLVM instructions.
1255 addCost(3 * InlineConstants::InstrCost);
1256 return false;
1258 case Intrinsic::memset:
1259 case Intrinsic::memcpy:
1260 case Intrinsic::memmove:
1261 disableLoadElimination();
1262 // SROA can usually chew through these intrinsics, but they aren't free.
1263 return false;
1264 case Intrinsic::icall_branch_funnel:
1265 case Intrinsic::localescape:
1266 HasUninlineableIntrinsic = true;
1267 return false;
1268 case Intrinsic::vastart:
1269 InitsVargArgs = true;
1270 return false;
1274 if (F == Call.getFunction()) {
1275 // This flag will fully abort the analysis, so don't bother with anything
1276 // else.
1277 IsRecursiveCall = true;
1278 return false;
1281 if (TTI.isLoweredToCall(F)) {
1282 // We account for the average 1 instruction per call argument setup
1283 // here.
1284 addCost(Call.arg_size() * InlineConstants::InstrCost);
1286 // Everything other than inline ASM will also have a significant cost
1287 // merely from making the call.
1288 if (!isa<InlineAsm>(Call.getCalledValue()))
1289 addCost(InlineConstants::CallPenalty);
1292 if (!Call.onlyReadsMemory())
1293 disableLoadElimination();
1294 return Base::visitCallBase(Call);
1297 // Otherwise we're in a very special case -- an indirect function call. See
1298 // if we can be particularly clever about this.
1299 Value *Callee = Call.getCalledValue();
1301 // First, pay the price of the argument setup. We account for the average
1302 // 1 instruction per call argument setup here.
1303 addCost(Call.arg_size() * InlineConstants::InstrCost);
1305 // Next, check if this happens to be an indirect function call to a known
1306 // function in this inline context. If not, we've done all we can.
1307 Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1308 if (!F) {
1309 if (!Call.onlyReadsMemory())
1310 disableLoadElimination();
1311 return Base::visitCallBase(Call);
1314 // If we have a constant that we are calling as a function, we can peer
1315 // through it and see the function target. This happens not infrequently
1316 // during devirtualization and so we want to give it a hefty bonus for
1317 // inlining, but cap that bonus in the event that inlining wouldn't pan
1318 // out. Pretend to inline the function, with a custom threshold.
1319 auto IndirectCallParams = Params;
1320 IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
1321 CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call,
1322 IndirectCallParams);
1323 if (CA.analyzeCall(Call)) {
1324 // We were able to inline the indirect call! Subtract the cost from the
1325 // threshold to get the bonus we want to apply, but don't go below zero.
1326 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
1329 if (!F->onlyReadsMemory())
1330 disableLoadElimination();
1331 return Base::visitCallBase(Call);
1334 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1335 // At least one return instruction will be free after inlining.
1336 bool Free = !HasReturn;
1337 HasReturn = true;
1338 return Free;
1341 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1342 // We model unconditional branches as essentially free -- they really
1343 // shouldn't exist at all, but handling them makes the behavior of the
1344 // inliner more regular and predictable. Interestingly, conditional branches
1345 // which will fold away are also free.
1346 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1347 dyn_cast_or_null<ConstantInt>(
1348 SimplifiedValues.lookup(BI.getCondition()));
1351 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1352 bool CheckSROA = SI.getType()->isPointerTy();
1353 Value *TrueVal = SI.getTrueValue();
1354 Value *FalseVal = SI.getFalseValue();
1356 Constant *TrueC = dyn_cast<Constant>(TrueVal);
1357 if (!TrueC)
1358 TrueC = SimplifiedValues.lookup(TrueVal);
1359 Constant *FalseC = dyn_cast<Constant>(FalseVal);
1360 if (!FalseC)
1361 FalseC = SimplifiedValues.lookup(FalseVal);
1362 Constant *CondC =
1363 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1365 if (!CondC) {
1366 // Select C, X, X => X
1367 if (TrueC == FalseC && TrueC) {
1368 SimplifiedValues[&SI] = TrueC;
1369 return true;
1372 if (!CheckSROA)
1373 return Base::visitSelectInst(SI);
1375 std::pair<Value *, APInt> TrueBaseAndOffset =
1376 ConstantOffsetPtrs.lookup(TrueVal);
1377 std::pair<Value *, APInt> FalseBaseAndOffset =
1378 ConstantOffsetPtrs.lookup(FalseVal);
1379 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1380 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1382 Value *SROAArg;
1383 DenseMap<Value *, int>::iterator CostIt;
1384 if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
1385 SROAArgValues[&SI] = SROAArg;
1386 return true;
1389 return Base::visitSelectInst(SI);
1392 // Select condition is a constant.
1393 Value *SelectedV = CondC->isAllOnesValue()
1394 ? TrueVal
1395 : (CondC->isNullValue()) ? FalseVal : nullptr;
1396 if (!SelectedV) {
1397 // Condition is a vector constant that is not all 1s or all 0s. If all
1398 // operands are constants, ConstantExpr::getSelect() can handle the cases
1399 // such as select vectors.
1400 if (TrueC && FalseC) {
1401 if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1402 SimplifiedValues[&SI] = C;
1403 return true;
1406 return Base::visitSelectInst(SI);
1409 // Condition is either all 1s or all 0s. SI can be simplified.
1410 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1411 SimplifiedValues[&SI] = SelectedC;
1412 return true;
1415 if (!CheckSROA)
1416 return true;
1418 std::pair<Value *, APInt> BaseAndOffset =
1419 ConstantOffsetPtrs.lookup(SelectedV);
1420 if (BaseAndOffset.first) {
1421 ConstantOffsetPtrs[&SI] = BaseAndOffset;
1423 Value *SROAArg;
1424 DenseMap<Value *, int>::iterator CostIt;
1425 if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
1426 SROAArgValues[&SI] = SROAArg;
1429 return true;
1432 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1433 // We model unconditional switches as free, see the comments on handling
1434 // branches.
1435 if (isa<ConstantInt>(SI.getCondition()))
1436 return true;
1437 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1438 if (isa<ConstantInt>(V))
1439 return true;
1441 // Assume the most general case where the switch is lowered into
1442 // either a jump table, bit test, or a balanced binary tree consisting of
1443 // case clusters without merging adjacent clusters with the same
1444 // destination. We do not consider the switches that are lowered with a mix
1445 // of jump table/bit test/binary search tree. The cost of the switch is
1446 // proportional to the size of the tree or the size of jump table range.
1448 // NB: We convert large switches which are just used to initialize large phi
1449 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1450 // inlining those. It will prevent inlining in cases where the optimization
1451 // does not (yet) fire.
1453 // Maximum valid cost increased in this function.
1454 int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
1456 unsigned JumpTableSize = 0;
1457 unsigned NumCaseCluster =
1458 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1460 // If suitable for a jump table, consider the cost for the table size and
1461 // branch to destination.
1462 if (JumpTableSize) {
1463 int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1464 4 * InlineConstants::InstrCost;
1466 addCost(JTCost, (int64_t)CostUpperBound);
1467 return false;
1470 // Considering forming a binary search, we should find the number of nodes
1471 // which is same as the number of comparisons when lowered. For a given
1472 // number of clusters, n, we can define a recursive function, f(n), to find
1473 // the number of nodes in the tree. The recursion is :
1474 // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
1475 // and f(n) = n, when n <= 3.
1476 // This will lead a binary tree where the leaf should be either f(2) or f(3)
1477 // when n > 3. So, the number of comparisons from leaves should be n, while
1478 // the number of non-leaf should be :
1479 // 2^(log2(n) - 1) - 1
1480 // = 2^log2(n) * 2^-1 - 1
1481 // = n / 2 - 1.
1482 // Considering comparisons from leaf and non-leaf nodes, we can estimate the
1483 // number of comparisons in a simple closed form :
1484 // n + n / 2 - 1 = n * 3 / 2 - 1
1485 if (NumCaseCluster <= 3) {
1486 // Suppose a comparison includes one compare and one conditional branch.
1487 addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
1488 return false;
1491 int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1492 int64_t SwitchCost =
1493 ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1495 addCost(SwitchCost, (int64_t)CostUpperBound);
1496 return false;
1499 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1500 // We never want to inline functions that contain an indirectbr. This is
1501 // incorrect because all the blockaddress's (in static global initializers
1502 // for example) would be referring to the original function, and this
1503 // indirect jump would jump from the inlined copy of the function into the
1504 // original function which is extremely undefined behavior.
1505 // FIXME: This logic isn't really right; we can safely inline functions with
1506 // indirectbr's as long as no other function or global references the
1507 // blockaddress of a block within the current function.
1508 HasIndirectBr = true;
1509 return false;
1512 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1513 // FIXME: It's not clear that a single instruction is an accurate model for
1514 // the inline cost of a resume instruction.
1515 return false;
1518 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1519 // FIXME: It's not clear that a single instruction is an accurate model for
1520 // the inline cost of a cleanupret instruction.
1521 return false;
1524 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1525 // FIXME: It's not clear that a single instruction is an accurate model for
1526 // the inline cost of a catchret instruction.
1527 return false;
1530 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1531 // FIXME: It might be reasonably to discount the cost of instructions leading
1532 // to unreachable as they have the lowest possible impact on both runtime and
1533 // code size.
1534 return true; // No actual code is needed for unreachable.
1537 bool CallAnalyzer::visitInstruction(Instruction &I) {
1538 // Some instructions are free. All of the free intrinsics can also be
1539 // handled by SROA, etc.
1540 if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1541 return true;
1543 // We found something we don't understand or can't handle. Mark any SROA-able
1544 // values in the operand list as no longer viable.
1545 for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1546 disableSROA(*OI);
1548 return false;
1551 /// Analyze a basic block for its contribution to the inline cost.
1553 /// This method walks the analyzer over every instruction in the given basic
1554 /// block and accounts for their cost during inlining at this callsite. It
1555 /// aborts early if the threshold has been exceeded or an impossible to inline
1556 /// construct has been detected. It returns false if inlining is no longer
1557 /// viable, and true if inlining remains viable.
1558 InlineResult
1559 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1560 SmallPtrSetImpl<const Value *> &EphValues) {
1561 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1562 // FIXME: Currently, the number of instructions in a function regardless of
1563 // our ability to simplify them during inline to constants or dead code,
1564 // are actually used by the vector bonus heuristic. As long as that's true,
1565 // we have to special case debug intrinsics here to prevent differences in
1566 // inlining due to debug symbols. Eventually, the number of unsimplified
1567 // instructions shouldn't factor into the cost computation, but until then,
1568 // hack around it here.
1569 if (isa<DbgInfoIntrinsic>(I))
1570 continue;
1572 // Skip ephemeral values.
1573 if (EphValues.count(&*I))
1574 continue;
1576 ++NumInstructions;
1577 if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1578 ++NumVectorInstructions;
1580 // If the instruction simplified to a constant, there is no cost to this
1581 // instruction. Visit the instructions using our InstVisitor to account for
1582 // all of the per-instruction logic. The visit tree returns true if we
1583 // consumed the instruction in any way, and false if the instruction's base
1584 // cost should count against inlining.
1585 if (Base::visit(&*I))
1586 ++NumInstructionsSimplified;
1587 else
1588 addCost(InlineConstants::InstrCost);
1590 using namespace ore;
1591 // If the visit this instruction detected an uninlinable pattern, abort.
1592 InlineResult IR;
1593 if (IsRecursiveCall)
1594 IR = "recursive";
1595 else if (ExposesReturnsTwice)
1596 IR = "exposes returns twice";
1597 else if (HasDynamicAlloca)
1598 IR = "dynamic alloca";
1599 else if (HasIndirectBr)
1600 IR = "indirect branch";
1601 else if (HasUninlineableIntrinsic)
1602 IR = "uninlinable intrinsic";
1603 else if (InitsVargArgs)
1604 IR = "varargs";
1605 if (!IR) {
1606 if (ORE)
1607 ORE->emit([&]() {
1608 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1609 &CandidateCall)
1610 << NV("Callee", &F) << " has uninlinable pattern ("
1611 << NV("InlineResult", IR.message)
1612 << ") and cost is not fully computed";
1614 return IR;
1617 // If the caller is a recursive function then we don't want to inline
1618 // functions which allocate a lot of stack space because it would increase
1619 // the caller stack usage dramatically.
1620 if (IsCallerRecursive &&
1621 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1622 InlineResult IR = "recursive and allocates too much stack space";
1623 if (ORE)
1624 ORE->emit([&]() {
1625 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1626 &CandidateCall)
1627 << NV("Callee", &F) << " is " << NV("InlineResult", IR.message)
1628 << ". Cost is not fully computed";
1630 return IR;
1633 // Check if we've passed the maximum possible threshold so we don't spin in
1634 // huge basic blocks that will never inline.
1635 if (Cost >= Threshold && !ComputeFullInlineCost)
1636 return false;
1639 return true;
1642 /// Compute the base pointer and cumulative constant offsets for V.
1644 /// This strips all constant offsets off of V, leaving it the base pointer, and
1645 /// accumulates the total constant offset applied in the returned constant. It
1646 /// returns 0 if V is not a pointer, and returns the constant '0' if there are
1647 /// no constant offsets applied.
1648 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1649 if (!V->getType()->isPointerTy())
1650 return nullptr;
1652 unsigned AS = V->getType()->getPointerAddressSpace();
1653 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1654 APInt Offset = APInt::getNullValue(IntPtrWidth);
1656 // Even though we don't look through PHI nodes, we could be called on an
1657 // instruction in an unreachable block, which may be on a cycle.
1658 SmallPtrSet<Value *, 4> Visited;
1659 Visited.insert(V);
1660 do {
1661 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1662 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1663 return nullptr;
1664 V = GEP->getPointerOperand();
1665 } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1666 V = cast<Operator>(V)->getOperand(0);
1667 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1668 if (GA->isInterposable())
1669 break;
1670 V = GA->getAliasee();
1671 } else {
1672 break;
1674 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1675 } while (Visited.insert(V).second);
1677 Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1678 return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1681 /// Find dead blocks due to deleted CFG edges during inlining.
1683 /// If we know the successor of the current block, \p CurrBB, has to be \p
1684 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1685 /// no live incoming CFG edges. If one block is found to be dead, we can
1686 /// continue growing the dead block list by checking the successors of the dead
1687 /// blocks to see if all their incoming edges are dead or not.
1688 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1689 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1690 // A CFG edge is dead if the predecessor is dead or the predecessor has a
1691 // known successor which is not the one under exam.
1692 return (DeadBlocks.count(Pred) ||
1693 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1696 auto IsNewlyDead = [&](BasicBlock *BB) {
1697 // If all the edges to a block are dead, the block is also dead.
1698 return (!DeadBlocks.count(BB) &&
1699 llvm::all_of(predecessors(BB),
1700 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1703 for (BasicBlock *Succ : successors(CurrBB)) {
1704 if (Succ == NextBB || !IsNewlyDead(Succ))
1705 continue;
1706 SmallVector<BasicBlock *, 4> NewDead;
1707 NewDead.push_back(Succ);
1708 while (!NewDead.empty()) {
1709 BasicBlock *Dead = NewDead.pop_back_val();
1710 if (DeadBlocks.insert(Dead))
1711 // Continue growing the dead block lists.
1712 for (BasicBlock *S : successors(Dead))
1713 if (IsNewlyDead(S))
1714 NewDead.push_back(S);
1719 /// Analyze a call site for potential inlining.
1721 /// Returns true if inlining this call is viable, and false if it is not
1722 /// viable. It computes the cost and adjusts the threshold based on numerous
1723 /// factors and heuristics. If this method returns false but the computed cost
1724 /// is below the computed threshold, then inlining was forcibly disabled by
1725 /// some artifact of the routine.
1726 InlineResult CallAnalyzer::analyzeCall(CallBase &Call) {
1727 ++NumCallsAnalyzed;
1729 // Perform some tweaks to the cost and threshold based on the direct
1730 // callsite information.
1732 // We want to more aggressively inline vector-dense kernels, so up the
1733 // threshold, and we'll lower it if the % of vector instructions gets too
1734 // low. Note that these bonuses are some what arbitrary and evolved over time
1735 // by accident as much as because they are principled bonuses.
1737 // FIXME: It would be nice to remove all such bonuses. At least it would be
1738 // nice to base the bonus values on something more scientific.
1739 assert(NumInstructions == 0);
1740 assert(NumVectorInstructions == 0);
1742 // Update the threshold based on callsite properties
1743 updateThreshold(Call, F);
1745 // While Threshold depends on commandline options that can take negative
1746 // values, we want to enforce the invariant that the computed threshold and
1747 // bonuses are non-negative.
1748 assert(Threshold >= 0);
1749 assert(SingleBBBonus >= 0);
1750 assert(VectorBonus >= 0);
1752 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1753 // this Threshold any time, and cost cannot decrease, we can stop processing
1754 // the rest of the function body.
1755 Threshold += (SingleBBBonus + VectorBonus);
1757 // Give out bonuses for the callsite, as the instructions setting them up
1758 // will be gone after inlining.
1759 addCost(-getCallsiteCost(Call, DL));
1761 // If this function uses the coldcc calling convention, prefer not to inline
1762 // it.
1763 if (F.getCallingConv() == CallingConv::Cold)
1764 Cost += InlineConstants::ColdccPenalty;
1766 // Check if we're done. This can happen due to bonuses and penalties.
1767 if (Cost >= Threshold && !ComputeFullInlineCost)
1768 return "high cost";
1770 if (F.empty())
1771 return true;
1773 Function *Caller = Call.getFunction();
1774 // Check if the caller function is recursive itself.
1775 for (User *U : Caller->users()) {
1776 CallBase *Call = dyn_cast<CallBase>(U);
1777 if (Call && Call->getFunction() == Caller) {
1778 IsCallerRecursive = true;
1779 break;
1783 // Populate our simplified values by mapping from function arguments to call
1784 // arguments with known important simplifications.
1785 auto CAI = Call.arg_begin();
1786 for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1787 FAI != FAE; ++FAI, ++CAI) {
1788 assert(CAI != Call.arg_end());
1789 if (Constant *C = dyn_cast<Constant>(CAI))
1790 SimplifiedValues[&*FAI] = C;
1792 Value *PtrArg = *CAI;
1793 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1794 ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1796 // We can SROA any pointer arguments derived from alloca instructions.
1797 if (isa<AllocaInst>(PtrArg)) {
1798 SROAArgValues[&*FAI] = PtrArg;
1799 SROAArgCosts[PtrArg] = 0;
1803 NumConstantArgs = SimplifiedValues.size();
1804 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1805 NumAllocaArgs = SROAArgValues.size();
1807 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1808 // the ephemeral values multiple times (and they're completely determined by
1809 // the callee, so this is purely duplicate work).
1810 SmallPtrSet<const Value *, 32> EphValues;
1811 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1813 // The worklist of live basic blocks in the callee *after* inlining. We avoid
1814 // adding basic blocks of the callee which can be proven to be dead for this
1815 // particular call site in order to get more accurate cost estimates. This
1816 // requires a somewhat heavyweight iteration pattern: we need to walk the
1817 // basic blocks in a breadth-first order as we insert live successors. To
1818 // accomplish this, prioritizing for small iterations because we exit after
1819 // crossing our threshold, we use a small-size optimized SetVector.
1820 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1821 SmallPtrSet<BasicBlock *, 16>>
1822 BBSetVector;
1823 BBSetVector BBWorklist;
1824 BBWorklist.insert(&F.getEntryBlock());
1825 bool SingleBB = true;
1826 // Note that we *must not* cache the size, this loop grows the worklist.
1827 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1828 // Bail out the moment we cross the threshold. This means we'll under-count
1829 // the cost, but only when undercounting doesn't matter.
1830 if (Cost >= Threshold && !ComputeFullInlineCost)
1831 break;
1833 BasicBlock *BB = BBWorklist[Idx];
1834 if (BB->empty())
1835 continue;
1837 // Disallow inlining a blockaddress with uses other than strictly callbr.
1838 // A blockaddress only has defined behavior for an indirect branch in the
1839 // same function, and we do not currently support inlining indirect
1840 // branches. But, the inliner may not see an indirect branch that ends up
1841 // being dead code at a particular call site. If the blockaddress escapes
1842 // the function, e.g., via a global variable, inlining may lead to an
1843 // invalid cross-function reference.
1844 // FIXME: pr/39560: continue relaxing this overt restriction.
1845 if (BB->hasAddressTaken())
1846 for (User *U : BlockAddress::get(&*BB)->users())
1847 if (!isa<CallBrInst>(*U))
1848 return "blockaddress used outside of callbr";
1850 // Analyze the cost of this block. If we blow through the threshold, this
1851 // returns false, and we can bail on out.
1852 InlineResult IR = analyzeBlock(BB, EphValues);
1853 if (!IR)
1854 return IR;
1856 Instruction *TI = BB->getTerminator();
1858 // Add in the live successors by first checking whether we have terminator
1859 // that may be simplified based on the values simplified by this call.
1860 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1861 if (BI->isConditional()) {
1862 Value *Cond = BI->getCondition();
1863 if (ConstantInt *SimpleCond =
1864 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1865 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1866 BBWorklist.insert(NextBB);
1867 KnownSuccessors[BB] = NextBB;
1868 findDeadBlocks(BB, NextBB);
1869 continue;
1872 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1873 Value *Cond = SI->getCondition();
1874 if (ConstantInt *SimpleCond =
1875 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1876 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1877 BBWorklist.insert(NextBB);
1878 KnownSuccessors[BB] = NextBB;
1879 findDeadBlocks(BB, NextBB);
1880 continue;
1884 // If we're unable to select a particular successor, just count all of
1885 // them.
1886 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1887 ++TIdx)
1888 BBWorklist.insert(TI->getSuccessor(TIdx));
1890 // If we had any successors at this point, than post-inlining is likely to
1891 // have them as well. Note that we assume any basic blocks which existed
1892 // due to branches or switches which folded above will also fold after
1893 // inlining.
1894 if (SingleBB && TI->getNumSuccessors() > 1) {
1895 // Take off the bonus we applied to the threshold.
1896 Threshold -= SingleBBBonus;
1897 SingleBB = false;
1901 bool OnlyOneCallAndLocalLinkage =
1902 F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
1903 // If this is a noduplicate call, we can still inline as long as
1904 // inlining this would cause the removal of the caller (so the instruction
1905 // is not actually duplicated, just moved).
1906 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1907 return "noduplicate";
1909 // Loops generally act a lot like calls in that they act like barriers to
1910 // movement, require a certain amount of setup, etc. So when optimising for
1911 // size, we penalise any call sites that perform loops. We do this after all
1912 // other costs here, so will likely only be dealing with relatively small
1913 // functions (and hence DT and LI will hopefully be cheap).
1914 if (Caller->hasMinSize()) {
1915 DominatorTree DT(F);
1916 LoopInfo LI(DT);
1917 int NumLoops = 0;
1918 for (Loop *L : LI) {
1919 // Ignore loops that will not be executed
1920 if (DeadBlocks.count(L->getHeader()))
1921 continue;
1922 NumLoops++;
1924 addCost(NumLoops * InlineConstants::CallPenalty);
1927 // We applied the maximum possible vector bonus at the beginning. Now,
1928 // subtract the excess bonus, if any, from the Threshold before
1929 // comparing against Cost.
1930 if (NumVectorInstructions <= NumInstructions / 10)
1931 Threshold -= VectorBonus;
1932 else if (NumVectorInstructions <= NumInstructions / 2)
1933 Threshold -= VectorBonus/2;
1935 return Cost < std::max(1, Threshold);
1938 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1939 /// Dump stats about this call's analysis.
1940 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1941 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
1942 DEBUG_PRINT_STAT(NumConstantArgs);
1943 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1944 DEBUG_PRINT_STAT(NumAllocaArgs);
1945 DEBUG_PRINT_STAT(NumConstantPtrCmps);
1946 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1947 DEBUG_PRINT_STAT(NumInstructionsSimplified);
1948 DEBUG_PRINT_STAT(NumInstructions);
1949 DEBUG_PRINT_STAT(SROACostSavings);
1950 DEBUG_PRINT_STAT(SROACostSavingsLost);
1951 DEBUG_PRINT_STAT(LoadEliminationCost);
1952 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1953 DEBUG_PRINT_STAT(Cost);
1954 DEBUG_PRINT_STAT(Threshold);
1955 #undef DEBUG_PRINT_STAT
1957 #endif
1959 /// Test that there are no attribute conflicts between Caller and Callee
1960 /// that prevent inlining.
1961 static bool functionsHaveCompatibleAttributes(Function *Caller,
1962 Function *Callee,
1963 TargetTransformInfo &TTI) {
1964 return TTI.areInlineCompatible(Caller, Callee) &&
1965 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1968 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
1969 int Cost = 0;
1970 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
1971 if (Call.isByValArgument(I)) {
1972 // We approximate the number of loads and stores needed by dividing the
1973 // size of the byval type by the target's pointer size.
1974 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
1975 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1976 unsigned AS = PTy->getAddressSpace();
1977 unsigned PointerSize = DL.getPointerSizeInBits(AS);
1978 // Ceiling division.
1979 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1981 // If it generates more than 8 stores it is likely to be expanded as an
1982 // inline memcpy so we take that as an upper bound. Otherwise we assume
1983 // one load and one store per word copied.
1984 // FIXME: The maxStoresPerMemcpy setting from the target should be used
1985 // here instead of a magic number of 8, but it's not available via
1986 // DataLayout.
1987 NumStores = std::min(NumStores, 8U);
1989 Cost += 2 * NumStores * InlineConstants::InstrCost;
1990 } else {
1991 // For non-byval arguments subtract off one instruction per call
1992 // argument.
1993 Cost += InlineConstants::InstrCost;
1996 // The call instruction also disappears after inlining.
1997 Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1998 return Cost;
2001 InlineCost llvm::getInlineCost(
2002 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2003 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2004 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2005 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2006 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2007 GetAssumptionCache, GetBFI, PSI, ORE);
2010 InlineCost llvm::getInlineCost(
2011 CallBase &Call, Function *Callee, const InlineParams &Params,
2012 TargetTransformInfo &CalleeTTI,
2013 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2014 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2015 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2017 // Cannot inline indirect calls.
2018 if (!Callee)
2019 return llvm::InlineCost::getNever("indirect call");
2021 // Never inline calls with byval arguments that does not have the alloca
2022 // address space. Since byval arguments can be replaced with a copy to an
2023 // alloca, the inlined code would need to be adjusted to handle that the
2024 // argument is in the alloca address space (so it is a little bit complicated
2025 // to solve).
2026 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2027 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2028 if (Call.isByValArgument(I)) {
2029 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2030 if (PTy->getAddressSpace() != AllocaAS)
2031 return llvm::InlineCost::getNever("byval arguments without alloca"
2032 " address space");
2035 // Calls to functions with always-inline attributes should be inlined
2036 // whenever possible.
2037 if (Call.hasFnAttr(Attribute::AlwaysInline)) {
2038 auto IsViable = isInlineViable(*Callee);
2039 if (IsViable)
2040 return llvm::InlineCost::getAlways("always inline attribute");
2041 return llvm::InlineCost::getNever(IsViable.message);
2044 // Never inline functions with conflicting attributes (unless callee has
2045 // always-inline attribute).
2046 Function *Caller = Call.getCaller();
2047 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
2048 return llvm::InlineCost::getNever("conflicting attributes");
2050 // Don't inline this call if the caller has the optnone attribute.
2051 if (Caller->hasOptNone())
2052 return llvm::InlineCost::getNever("optnone attribute");
2054 // Don't inline a function that treats null pointer as valid into a caller
2055 // that does not have this attribute.
2056 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2057 return llvm::InlineCost::getNever("nullptr definitions incompatible");
2059 // Don't inline functions which can be interposed at link-time.
2060 if (Callee->isInterposable())
2061 return llvm::InlineCost::getNever("interposable");
2063 // Don't inline functions marked noinline.
2064 if (Callee->hasFnAttribute(Attribute::NoInline))
2065 return llvm::InlineCost::getNever("noinline function attribute");
2067 // Don't inline call sites marked noinline.
2068 if (Call.isNoInline())
2069 return llvm::InlineCost::getNever("noinline call site attribute");
2071 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
2072 << "... (caller:" << Caller->getName() << ")\n");
2074 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee,
2075 Call, Params);
2076 InlineResult ShouldInline = CA.analyzeCall(Call);
2078 LLVM_DEBUG(CA.dump());
2080 // Check if there was a reason to force inlining or no inlining.
2081 if (!ShouldInline && CA.getCost() < CA.getThreshold())
2082 return InlineCost::getNever(ShouldInline.message);
2083 if (ShouldInline && CA.getCost() >= CA.getThreshold())
2084 return InlineCost::getAlways("empty function");
2086 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2089 InlineResult llvm::isInlineViable(Function &F) {
2090 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2091 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2092 // Disallow inlining of functions which contain indirect branches.
2093 if (isa<IndirectBrInst>(BI->getTerminator()))
2094 return "contains indirect branches";
2096 // Disallow inlining of blockaddresses which are used by non-callbr
2097 // instructions.
2098 if (BI->hasAddressTaken())
2099 for (User *U : BlockAddress::get(&*BI)->users())
2100 if (!isa<CallBrInst>(*U))
2101 return "blockaddress used outside of callbr";
2103 for (auto &II : *BI) {
2104 CallBase *Call = dyn_cast<CallBase>(&II);
2105 if (!Call)
2106 continue;
2108 // Disallow recursive calls.
2109 if (&F == Call->getCalledFunction())
2110 return "recursive call";
2112 // Disallow calls which expose returns-twice to a function not previously
2113 // attributed as such.
2114 if (!ReturnsTwice && isa<CallInst>(Call) &&
2115 cast<CallInst>(Call)->canReturnTwice())
2116 return "exposes returns-twice attribute";
2118 if (Call->getCalledFunction())
2119 switch (Call->getCalledFunction()->getIntrinsicID()) {
2120 default:
2121 break;
2122 // Disallow inlining of @llvm.icall.branch.funnel because current
2123 // backend can't separate call targets from call arguments.
2124 case llvm::Intrinsic::icall_branch_funnel:
2125 return "disallowed inlining of @llvm.icall.branch.funnel";
2126 // Disallow inlining functions that call @llvm.localescape. Doing this
2127 // correctly would require major changes to the inliner.
2128 case llvm::Intrinsic::localescape:
2129 return "disallowed inlining of @llvm.localescape";
2130 // Disallow inlining of functions that initialize VarArgs with va_start.
2131 case llvm::Intrinsic::vastart:
2132 return "contains VarArgs initialized with va_start";
2137 return true;
2140 // APIs to create InlineParams based on command line flags and/or other
2141 // parameters.
2143 InlineParams llvm::getInlineParams(int Threshold) {
2144 InlineParams Params;
2146 // This field is the threshold to use for a callee by default. This is
2147 // derived from one or more of:
2148 // * optimization or size-optimization levels,
2149 // * a value passed to createFunctionInliningPass function, or
2150 // * the -inline-threshold flag.
2151 // If the -inline-threshold flag is explicitly specified, that is used
2152 // irrespective of anything else.
2153 if (InlineThreshold.getNumOccurrences() > 0)
2154 Params.DefaultThreshold = InlineThreshold;
2155 else
2156 Params.DefaultThreshold = Threshold;
2158 // Set the HintThreshold knob from the -inlinehint-threshold.
2159 Params.HintThreshold = HintThreshold;
2161 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2162 Params.HotCallSiteThreshold = HotCallSiteThreshold;
2164 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2165 // populate LocallyHotCallSiteThreshold. Later, we populate
2166 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2167 // we know that optimization level is O3 (in the getInlineParams variant that
2168 // takes the opt and size levels).
2169 // FIXME: Remove this check (and make the assignment unconditional) after
2170 // addressing size regression issues at O2.
2171 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2172 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2174 // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2175 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2177 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2178 // -inlinehint-threshold commandline option is not explicitly given. If that
2179 // option is present, then its value applies even for callees with size and
2180 // minsize attributes.
2181 // If the -inline-threshold is not specified, set the ColdThreshold from the
2182 // -inlinecold-threshold even if it is not explicitly passed. If
2183 // -inline-threshold is specified, then -inlinecold-threshold needs to be
2184 // explicitly specified to set the ColdThreshold knob
2185 if (InlineThreshold.getNumOccurrences() == 0) {
2186 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2187 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2188 Params.ColdThreshold = ColdThreshold;
2189 } else if (ColdThreshold.getNumOccurrences() > 0) {
2190 Params.ColdThreshold = ColdThreshold;
2192 return Params;
2195 InlineParams llvm::getInlineParams() {
2196 return getInlineParams(InlineThreshold);
2199 // Compute the default threshold for inlining based on the opt level and the
2200 // size opt level.
2201 static int computeThresholdFromOptLevels(unsigned OptLevel,
2202 unsigned SizeOptLevel) {
2203 if (OptLevel > 2)
2204 return InlineConstants::OptAggressiveThreshold;
2205 if (SizeOptLevel == 1) // -Os
2206 return InlineConstants::OptSizeThreshold;
2207 if (SizeOptLevel == 2) // -Oz
2208 return InlineConstants::OptMinSizeThreshold;
2209 return InlineThreshold;
2212 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2213 auto Params =
2214 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2215 // At O3, use the value of -locally-hot-callsite-threshold option to populate
2216 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2217 // when it is specified explicitly.
2218 if (OptLevel > 2)
2219 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2220 return Params;