//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads from
/// offsets from the kernarg base pointer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
namespace {

class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

public:
  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;

  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }
  // Set the initial number of user SGPRs that remain available for preloading
  // kernel arguments.
  void setInitialFreeUserSGPRsCount() {
    const unsigned MaxUserSGPRs = ST.getMaxNumUserSGPRs();
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);

    NumFreeUserSGPRs = MaxUserSGPRs - UserSGPRInfo.getNumUsedUserSGPRs();
  }
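
  // Try to reserve user SGPRs to preload an argument of AllocSize bytes found
  // at ArgOffset in the kernarg segment, including any padding registers
  // needed to reach the argument's alignment. Returns true if the argument
  // fits (or can share the previous argument's SGPR), false otherwise.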
  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
                            uint64_t LastExplicitArgOffset) {
    // Check if this argument may be loaded into the same register as the
    // previous argument.
    if (!isAligned(Align(4), ArgOffset) && AllocSize < 4)
      return true;

    // Pad SGPRs for kernarg alignment.
    unsigned Padding = ArgOffset - LastExplicitArgOffset;
    unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
    unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
    if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
      return false;

    NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
    return true;
  }
};
class AMDGPULowerKernelArguments : public FunctionPass {
public:
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace
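
// Find the insertion point for the kernarg loads in the entry block: skip
// past any leading static allocas, but stop before a dynamic alloca whose
// size may itself depend on a loaded kernel argument.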
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

    // If this is a dynamic alloca, the value may depend on the loaded
    // kernargs, so loads will need to be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }

  return InsPt;
}
static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();

  Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;
  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");
  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  uint64_t ExplicitArgOffset = 0;
  // Preloaded kernel arguments must be sequential.
  bool InPreloadSequence = true;
  PreloadKernelArgInfo PreloadInfo(F, ST);
  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
    uint64_t LastExplicitArgOffset = ExplicitArgOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
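    // For example, with (i32, i64) kernel arguments the i32 occupies bytes
    // [0, 4) and the running offset of 4 is rounded up to the i64's 8-byte
    // ABI alignment, so the i64 is loaded from BaseOffset + 8 and
    // ExplicitArgOffset advances to 16.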

    // Try to preload this argument into user SGPRs.
    if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
        !Arg.getType()->isAggregateType())
      if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
                                           LastExplicitArgOffset))
        continue;

    InPreloadSequence = false;

    if (Arg.use_empty())
      continue;

    // If this is byval, the loads are already explicit in the function. We just
    // need to rewrite the pointer values.
    if (IsByRef) {
      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".byval.kernarg.offset");

      Value *CastOffsetPtr =
          Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
      Arg.replaceAllUsesWith(CastOffsetPtr);
      continue;
    }
    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }
    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
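
    // Worked example of the sub-dword path below: an i16 argument at
    // EltOffset 38 has AlignDownOffset 36 and OffsetDiff 2, so a naturally
    // aligned i32 is loaded from offset 36 and the argument is recovered by
    // a 16-bit lshr followed by a trunc to i16.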
    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      // TODO: Update this for GFX12 which does have scalar sub-dword loads.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }
    if (IsV3 && Size >= 32) {
      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }

    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
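
    // The kernarg segment is written by the runtime before the dispatch and
    // never modified while the kernel runs, which is why the load above can
    // be marked invariant. Pointer argument attributes are re-attached to the
    // load as metadata below.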
    MDBuilder MDB(Ctx);

    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable_or_null,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), DerefOrNullBytes))));
      }

      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), ParamAlign->value()))));
      }
    }
    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }
  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  return true;
}
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  auto &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return lowerKernelArguments(F, TM);
}
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
                      "AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
                    "AMDGPU Lower Kernel Arguments", false, false)

char AMDGPULowerKernelArguments::ID = 0;
FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}
PreservedAnalyses
AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool Changed = lowerKernelArguments(F, TM);
  if (Changed) {
    // TODO: Preserves a lot more.
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }

  return PreservedAnalyses::all();
}