1 //===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This is an AMDGPU-specific replacement of the standard inliner.
11 /// The main purpose is to account for the fact that calls are not only
11 /// expensive
12 /// on the AMDGPU, but much more expensive if a private memory pointer is
13 /// passed to a function as an argument. In this situation, we are unable to
14 /// eliminate private memory in the caller unless inlined and end up with slow
15 /// and expensive scratch access. Thus, we boost the inline threshold for such
15 /// calls.
18 //===----------------------------------------------------------------------===//
22 #include "llvm/Transforms/IPO.h"
23 #include "llvm/Analysis/AssumptionCache.h"
24 #include "llvm/Analysis/CallGraph.h"
25 #include "llvm/Analysis/InlineCost.h"
26 #include "llvm/Analysis/ValueTracking.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/IR/CallSite.h"
29 #include "llvm/IR/DataLayout.h"
30 #include "llvm/IR/Instructions.h"
31 #include "llvm/IR/Module.h"
32 #include "llvm/IR/Type.h"
33 #include "llvm/Support/CommandLine.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Transforms/IPO/Inliner.h"
39 #define DEBUG_TYPE "inline"
// Hidden command-line option "amdgpu-inline-arg-alloca-cost" (initial value
// 1500). getInlineThreshold() adds this value to the computed threshold.
// NOTE(review): the declaration line carrying the option's value type is not
// visible in this view.
42 ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden
, cl::init(1500),
43 cl::desc("Cost of alloca argument"));
// Hidden command-line option "amdgpu-inline-arg-alloca-cutoff" (unsigned,
// initial value 256). getInlineThreshold() compares the accumulated alloca
// size of a call's pointer arguments against this value.
45 // If the amount of scratch memory to eliminate exceeds our ability to allocate
46 // it into registers we gain nothing by aggressively inlining functions for that
48 static cl::opt
<unsigned>
49 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden
, cl::init(256),
50 cl::desc("Maximum alloca size to use for inline cost"));
// Hidden command-line option "amdgpu-inline-max-bb" (size_t, initial value
// 300). getInlineCost() refuses to inline when the combined basic-block count
// of caller and callee exceeds it; a value of 0 disables that check.
52 // Inliner constraint to achieve reasonable compilation time
53 static cl::opt
<size_t>
54 MaxBB("amdgpu-inline-max-bb", cl::Hidden
, cl::init(300),
55 cl::desc("Maximum BB number allowed in a function after inlining"
56 " (compile time constraint)"));
// Inliner pass for AMDGPU built on LegacyInlinerBase. It overrides
// getInlineCost(), runOnSCC() and getAnalysisUsage(), and adds a
// target-specific getInlineThreshold() helper.
// NOTE(review): access specifiers and closing braces are not visible in this
// view of the class.
60 class AMDGPUInliner
: public LegacyInlinerBase
{
// Constructor: passes ID to the base class, registers the pass with the global
// PassRegistry, and initializes Params from getInlineParams().
63 AMDGPUInliner() : LegacyInlinerBase(ID
) {
64 initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
65 Params
= getInlineParams();
68 static char ID
; // Pass identification, replacement for typeid
// Returns the inline threshold to use for the given call site.
70 unsigned getInlineThreshold(CallSite CS
) const;
// Computes the inline cost for the given call site.
72 InlineCost
getInlineCost(CallSite CS
) override
;
// Caches the TargetTransformInfo wrapper pass, then runs the base inliner on
// the SCC.
74 bool runOnSCC(CallGraphSCC
&SCC
) override
;
// Declares TargetTransformInfoWrapperPass as required in addition to the base
// class requirements.
76 void getAnalysisUsage(AnalysisUsage
&AU
) const override
;
// Set in runOnSCC(); read by getInlineThreshold() and getInlineCost().
79 TargetTransformInfoWrapperPass
*TTIWP
;
84 } // end anonymous namespace
// Definition of the static pass identifier declared in the class; this is the
// ID the constructor hands to LegacyInlinerBase.
86 char AMDGPUInliner::ID
= 0;
// Pass registration for AMDGPUInliner under the name "amdgpu-inline", with the
// description "AMDGPU Function Integration/Inlining". The passes listed between
// BEGIN and END are declared as dependencies of this pass.
87 INITIALIZE_PASS_BEGIN(AMDGPUInliner
, "amdgpu-inline",
88 "AMDGPU Function Integration/Inlining", false, false)
89 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker
)
90 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass
)
91 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass
)
92 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass
)
93 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass
)
94 INITIALIZE_PASS_END(AMDGPUInliner
, "amdgpu-inline",
95 "AMDGPU Function Integration/Inlining", false, false)
// Factory for the AMDGPU inliner pass: returns a newly allocated AMDGPUInliner
// as a Pass pointer.
// NOTE(review): presumably the caller (e.g. a pass manager) takes ownership of
// the returned object -- not shown in this file.
97 Pass
*llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
// Runs the inliner on one call-graph SCC. Stores the
// TargetTransformInfoWrapperPass analysis in TTIWP first, so the cost and
// threshold helpers can query target info, then returns whatever
// LegacyInlinerBase::runOnSCC() returns.
99 bool AMDGPUInliner::runOnSCC(CallGraphSCC
&SCC
) {
100 TTIWP
= &getAnalysis
<TargetTransformInfoWrapperPass
>();
101 return LegacyInlinerBase::runOnSCC(SCC
);
// Declares the analyses this pass needs: TargetTransformInfoWrapperPass
// (fetched in runOnSCC()) plus everything LegacyInlinerBase declares.
104 void AMDGPUInliner::getAnalysisUsage(AnalysisUsage
&AU
) const {
105 AU
.addRequired
<TargetTransformInfoWrapperPass
>();
106 LegacyInlinerBase::getAnalysisUsage(AU
);
// Computes the inline threshold for call site CS.
//
// Starts from Params.DefaultThreshold. If the callee is a defined function
// carrying the inlinehint attribute, the hint threshold is set and larger, and
// the caller is not minsize, the threshold becomes HintThreshold multiplied by
// the target's inlining threshold multiplier. The function then sums the sizes
// of static allocas underlying the call's pointer arguments, and ArgAllocaCost
// may be added to the threshold.
//
// NOTE(review): several statements (the guards of early exits and the bodies
// of some `if`s) are not visible in this view, so the exact conditions under
// which the bonus is applied cannot be confirmed from here.
109 unsigned AMDGPUInliner::getInlineThreshold(CallSite CS
) const {
110 int Thres
= Params
.DefaultThreshold
;
112 Function
*Caller
= CS
.getCaller();
113 // Listen to the inlinehint attribute when it would increase the threshold
114 // and the caller does not need to minimize its size.
115 Function
*Callee
= CS
.getCalledFunction();
// InlineHint is true only for a non-null callee that has a definition, so the
// dereference of Callee in the multiplier lookup below is safe.
116 bool InlineHint
= Callee
&& !Callee
->isDeclaration() &&
117 Callee
->hasFnAttribute(Attribute::InlineHint
);
118 if (InlineHint
&& Params
.HintThreshold
&& Params
.HintThreshold
> Thres
119 && !Caller
->hasFnAttribute(Attribute::MinSize
))
120 Thres
= Params
.HintThreshold
.getValue() *
121 TTIWP
->getTTI(*Callee
).getInliningThresholdMultiplier();
123 const DataLayout
&DL
= Caller
->getParent()->getDataLayout();
// NOTE(review): the condition guarding this early return is not visible here.
125 return (unsigned)Thres
;
127 // If we have a pointer to private array passed into a function
128 // it will not be optimized out, leaving scratch usage.
129 // Increase the inline threshold to allow inlining in this case.
130 uint64_t AllocaSize
= 0;
// AIVisited records allocas already seen; the insert() check below uses it to
// detect an alloca that is reached through more than one argument.
131 SmallPtrSet
<const AllocaInst
*, 8> AIVisited
;
132 for (Value
*PtrArg
: CS
.args()) {
133 PointerType
*Ty
= dyn_cast
<PointerType
>(PtrArg
->getType());
// This test matches arguments that are not pointers, or pointers outside the
// private and flat address spaces.
// NOTE(review): the statement it controls is not visible here.
134 if (!Ty
|| (Ty
->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS
&&
135 Ty
->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS
))
// Strip the argument down to its underlying object and check whether that is
// an alloca.
138 PtrArg
= GetUnderlyingObject(PtrArg
, DL
);
139 if (const AllocaInst
*AI
= dyn_cast
<AllocaInst
>(PtrArg
)) {
// This test matches non-static allocas and allocas already in AIVisited.
// NOTE(review): the statement it controls is not visible here.
140 if (!AI
->isStaticAlloca() || !AIVisited
.insert(AI
).second
)
142 AllocaSize
+= DL
.getTypeAllocSize(AI
->getAllocatedType());
143 // If the amount of stack memory is excessive we will not be able
144 // to get rid of the scratch anyway, bail out.
145 if (AllocaSize
> ArgAllocaCutoff
) {
// NOTE(review): the lines between the cutoff check and this addition are not
// visible here, so the exact guard on the bonus is unknown.
152 Thres
+= ArgAllocaCost
;
154 return (unsigned)Thres
;
157 // Check if call is just a wrapper around another call.
158 // In this case we only have call and ret instructions.
// The visible checks are: the callee is known and has exactly one basic block,
// the first non-PHI instruction of that block is a CallInst, and the
// instruction immediately after it is a ReturnInst. A debug message is printed
// when all three hold.
// NOTE(review): none of the return statements are visible in this view, so the
// value returned on each path cannot be confirmed from here.
159 static bool isWrapperOnlyCall(CallSite CS
) {
160 Function
*Callee
= CS
.getCalledFunction();
// Matches an unknown callee or a callee whose basic-block count is not 1.
161 if (!Callee
|| Callee
->size() != 1)
163 const BasicBlock
&BB
= Callee
->getEntryBlock();
164 if (const Instruction
*I
= BB
.getFirstNonPHI()) {
165 if (!isa
<CallInst
>(I
)) {
// Inspect the instruction that directly follows the call.
168 if (isa
<ReturnInst
>(*std::next(I
->getIterator()))) {
169 LLVM_DEBUG(dbgs() << " Wrapper only call detected: "
170 << Callee
->getName() << '\n');
177 InlineCost
AMDGPUInliner::getInlineCost(CallSite CS
) {
178 Function
*Callee
= CS
.getCalledFunction();
179 Function
*Caller
= CS
.getCaller();
181 if (!Callee
|| Callee
->isDeclaration())
182 return llvm::InlineCost::getNever("undefined callee");
185 return llvm::InlineCost::getNever("noinline");
187 TargetTransformInfo
&TTI
= TTIWP
->getTTI(*Callee
);
188 if (!TTI
.areInlineCompatible(Caller
, Callee
))
189 return llvm::InlineCost::getNever("incompatible");
191 if (CS
.hasFnAttr(Attribute::AlwaysInline
)) {
192 auto IsViable
= isInlineViable(*Callee
);
194 return llvm::InlineCost::getAlways("alwaysinline viable");
195 return llvm::InlineCost::getNever(IsViable
.message
);
198 if (isWrapperOnlyCall(CS
))
199 return llvm::InlineCost::getAlways("wrapper-only call");
201 InlineParams LocalParams
= Params
;
202 LocalParams
.DefaultThreshold
= (int)getInlineThreshold(CS
);
203 bool RemarksEnabled
= false;
204 const auto &BBs
= Caller
->getBasicBlockList();
206 auto DI
= OptimizationRemark(DEBUG_TYPE
, "", DebugLoc(), &BBs
.front());
208 RemarksEnabled
= true;
211 OptimizationRemarkEmitter
ORE(Caller
);
212 std::function
<AssumptionCache
&(Function
&)> GetAssumptionCache
=
213 [this](Function
&F
) -> AssumptionCache
& {
214 return ACT
->getAssumptionCache(F
);
217 auto IC
= llvm::getInlineCost(cast
<CallBase
>(*CS
.getInstruction()), Callee
,
218 LocalParams
, TTI
, GetAssumptionCache
, None
, PSI
,
219 RemarksEnabled
? &ORE
: nullptr);
221 if (IC
&& !IC
.isAlways() && !Callee
->hasFnAttribute(Attribute::InlineHint
)) {
222 // Single BB does not increase total BB amount, thus subtract 1
223 size_t Size
= Caller
->size() + Callee
->size() - 1;
224 if (MaxBB
&& Size
> MaxBB
)
225 return llvm::InlineCost::getNever("max number of bb exceeded");