1 //===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 /// \file This pass replaces accesses to kernel arguments with loads from
11 /// offsets from the kernarg base pointer.
13 //===----------------------------------------------------------------------===//
16 #include "AMDGPUSubtarget.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "llvm/ADT/StringRef.h"
19 #include "llvm/Analysis/DivergenceAnalysis.h"
20 #include "llvm/Analysis/Loads.h"
21 #include "llvm/CodeGen/Passes.h"
22 #include "llvm/CodeGen/TargetPassConfig.h"
23 #include "llvm/IR/Attributes.h"
24 #include "llvm/IR/BasicBlock.h"
25 #include "llvm/IR/Constants.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/IRBuilder.h"
29 #include "llvm/IR/InstrTypes.h"
30 #include "llvm/IR/Instruction.h"
31 #include "llvm/IR/Instructions.h"
32 #include "llvm/IR/LLVMContext.h"
33 #include "llvm/IR/MDBuilder.h"
34 #include "llvm/IR/Metadata.h"
35 #include "llvm/IR/Operator.h"
36 #include "llvm/IR/Type.h"
37 #include "llvm/IR/Value.h"
38 #include "llvm/Pass.h"
39 #include "llvm/Support/Casting.h"
41 #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
/// Function pass that, per the file header, replaces accesses to kernel
/// arguments with loads from offsets from the kernarg base pointer.
/// NOTE(review): several lines of this declaration are not visible in this
/// listing; only the members shown below are documented.
47 class AMDGPULowerKernelArguments
: public FunctionPass
{
/// Constructs the pass, forwarding the pass ID to the FunctionPass base.
51 AMDGPULowerKernelArguments() : FunctionPass(ID
) {}
/// Per-function entry point; the body is defined out of line.
53 bool runOnFunction(Function
&F
) override
;
/// Declares TargetPassConfig as a required analysis; runOnFunction fetches it
/// with getAnalysis<TargetPassConfig>().
55 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
56 AU
.addRequired
<TargetPassConfig
>();
61 } // end anonymous namespace
/// Rewrites every use of each argument of \p F into a value loaded from the
/// kernarg segment pointer.
///
/// Outline of what the visible code does:
///  - checks the calling convention against AMDGPU_KERNEL and whether the
///    function has any arguments;
///  - emits a call to the amdgcn_kernarg_segment_ptr intrinsic at the first
///    instruction of the entry block;
///  - for each argument, computes its offset, loads it from that offset and
///    replaces all uses of the argument with the loaded value;
///  - records the alignment of the segment pointer as a return attribute.
///
/// NOTE(review): this listing is missing lines (e.g. the bodies of the early
/// guards and several closing braces); comments describe only what is shown.
63 bool AMDGPULowerKernelArguments::runOnFunction(Function
&F
) {
64 CallingConv::ID CC
= F
.getCallingConv();
// Guard: only AMDGPU_KERNEL functions that have at least one argument are of
// interest. The guarded statement is not visible in this listing.
65 if (CC
!= CallingConv::AMDGPU_KERNEL
|| F
.arg_empty())
// Fetch the target machine through the required TargetPassConfig analysis and
// from it the GCN subtarget for this function.
68 auto &TPC
= getAnalysis
<TargetPassConfig
>();
70 const TargetMachine
&TM
= TPC
.getTM
<TargetMachine
>();
71 const GCNSubtarget
&ST
= TM
.getSubtarget
<GCNSubtarget
>(F
);
72 LLVMContext
&Ctx
= F
.getParent()->getContext();
73 const DataLayout
&DL
= F
.getParent()->getDataLayout();
// All new instructions are inserted before the first instruction of the entry
// block.
74 BasicBlock
&EntryBlock
= *F
.begin();
75 IRBuilder
<> Builder(&*EntryBlock
.begin());
77 const unsigned KernArgBaseAlign
= 16; // FIXME: Increase if necessary
// Offset added to every argument's position within the segment.
78 const uint64_t BaseOffset
= ST
.getExplicitKernelArgOffset(F
);
80 // FIXME: Alignment is broken with explicit arg offset.
81 const uint64_t TotalKernArgSize
= ST
.getKernArgSegmentSize(F
);
// Guard on an empty kernarg segment. The guarded statement is not visible in
// this listing.
82 if (TotalKernArgSize
== 0)
// Materialize the kernarg segment base pointer, named
// "<function>.kernarg.segment".
85 CallInst
*KernArgSegment
=
86 Builder
.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr
, nullptr,
87 F
.getName() + ".kernarg.segment");
// Mark the returned pointer nonnull and dereferenceable for the whole segment
// size.
89 KernArgSegment
->addAttribute(AttributeList::ReturnIndex
, Attribute::NonNull
);
90 KernArgSegment
->addAttribute(AttributeList::ReturnIndex
,
91 Attribute::getWithDereferenceableBytes(Ctx
, TotalKernArgSize
));
// AS is the address space of the segment pointer; pointer casts below stay in
// it. MaxAlign tracks the largest ABI alignment seen among the arguments, and
// ExplicitArgOffset is the running offset of the arguments processed so far.
93 unsigned AS
= KernArgSegment
->getType()->getPointerAddressSpace();
94 unsigned MaxAlign
= 1;
95 uint64_t ExplicitArgOffset
= 0;
97 for (Argument
&Arg
: F
.args()) {
98 Type
*ArgTy
= Arg
.getType();
99 unsigned Align
= DL
.getABITypeAlignment(ArgTy
);
100 MaxAlign
= std::max(Align
, MaxAlign
);
// Size is in bits, AllocSize is the value returned by getTypeAllocSize.
101 unsigned Size
= DL
.getTypeSizeInBits(ArgTy
);
102 unsigned AllocSize
= DL
.getTypeAllocSize(ArgTy
);
105 // Clover seems to always pad i8/i16 to i32, but doesn't properly align
// them.
107 // Make sure the struct elements have correct size and alignment for ext
108 // args. These seem to be padded up to 4-bytes but not correctly aligned.
// NOTE(review): the remainder of the IsExtArg condition is not visible in this
// listing.
109 bool IsExtArg
= AllocSize
< 32 && (Arg
.hasZExtAttr() || Arg
.hasSExtAttr()) &&
// EltOffset is this argument's offset from the segment base: the running
// offset rounded up to the ABI alignment, plus BaseOffset. The running offset
// then advances past this argument by its alloc size.
114 uint64_t EltOffset
= alignTo(ExplicitArgOffset
, Align
) + BaseOffset
;
115 ExplicitArgOffset
= alignTo(ExplicitArgOffset
, Align
) + AllocSize
;
// Special cases for pointer arguments. The statements guarded by the two
// conditions below are not visible in this listing.
120 if (PointerType
*PT
= dyn_cast
<PointerType
>(ArgTy
)) {
121 // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
122 // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
123 // can't represent this with range metadata because it's only allowed for
// integer types. NOTE(review): tail of this sentence was missing and has been
// restored from context -- confirm.
125 if (PT
->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
&&
126 ST
.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS
)
129 // FIXME: We can replace this with equivalent alias.scope/noalias
130 // metadata, but this appears to be a lot of work.
131 if (Arg
.hasNoAliasAttr())
// Three-element vectors get special handling further down (loaded through a
// four-element vector type when at least 32 bits wide).
135 VectorType
*VT
= dyn_cast
<VectorType
>(ArgTy
);
136 bool IsV3
= VT
&& VT
->getNumElements() == 3;
137 VectorType
*V4Ty
= nullptr;
// AlignDownOffset is EltOffset rounded down to a multiple of 4; OffsetDiff is
// the number of bytes between the two. AdjustedAlign, derived from the base
// alignment and the aligned-down offset, is the alignment given to the load.
139 int64_t AlignDownOffset
= alignDown(EltOffset
, 4);
140 int64_t OffsetDiff
= EltOffset
- AlignDownOffset
;
141 unsigned AdjustedAlign
= MinAlign(KernArgBaseAlign
, AlignDownOffset
);
144 if (Size
< 32 && !ArgTy
->isAggregateType()) { // FIXME: Handle aggregate types
145 // Since we don't have sub-dword scalar loads, avoid doing an extload by
146 // loading earlier than the argument address, and extracting the relevant
// bits.
149 // Additionally widen any sub-dword load to i32 even if suitably aligned,
150 // so that CSE between different argument loads works easily.
// Sub-dword path: the address is named "<arg>.kernarg.offset.align.down" and
// cast to an i32 pointer in address space AS. The GEP operands are not
// visible in this listing.
152 ArgPtr
= Builder
.CreateConstInBoundsGEP1_64(
155 Arg
.getName() + ".kernarg.offset.align.down");
156 ArgPtr
= Builder
.CreateBitCast(ArgPtr
,
157 Builder
.getInt32Ty()->getPointerTo(AS
),
158 ArgPtr
->getName() + ".cast");
// Other path: the address is named "<arg>.kernarg.offset" and cast to a
// pointer to the argument's own type in address space AS.
160 ArgPtr
= Builder
.CreateConstInBoundsGEP1_64(
163 Arg
.getName() + ".kernarg.offset");
164 ArgPtr
= Builder
.CreateBitCast(ArgPtr
, ArgTy
->getPointerTo(AS
),
165 ArgPtr
->getName() + ".cast");
// An argument is not expected to be both an ext arg and a 3-element vector.
168 assert((!IsExtArg
|| !IsV3
) && "incompatible situation");
170 if (IsV3
&& Size
>= 32) {
171 V4Ty
= VectorType::get(VT
->getVectorElementType(), 4);
172 // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
173 ArgPtr
= Builder
.CreateBitCast(ArgPtr
, V4Ty
->getPointerTo(AS
));
// Load the argument with the adjusted alignment and tag the load with
// invariant.load metadata.
176 LoadInst
*Load
= Builder
.CreateAlignedLoad(ArgPtr
, AdjustedAlign
);
177 Load
->setMetadata(LLVMContext::MD_invariant_load
, MDNode::get(Ctx
, {}));
// For pointer arguments, carry the nonnull, dereferenceable,
// dereferenceable_or_null and align parameter attributes over to the load as
// metadata of the same names.
181 if (isa
<PointerType
>(ArgTy
)) {
182 if (Arg
.hasNonNullAttr())
183 Load
->setMetadata(LLVMContext::MD_nonnull
, MDNode::get(Ctx
, {}));
185 uint64_t DerefBytes
= Arg
.getDereferenceableBytes();
186 if (DerefBytes
!= 0) {
188 LLVMContext::MD_dereferenceable
,
191 ConstantInt::get(Builder
.getInt64Ty(), DerefBytes
))));
194 uint64_t DerefOrNullBytes
= Arg
.getDereferenceableOrNullBytes();
195 if (DerefOrNullBytes
!= 0) {
197 LLVMContext::MD_dereferenceable_or_null
,
199 MDB
.createConstant(ConstantInt::get(Builder
.getInt64Ty(),
200 DerefOrNullBytes
))));
203 unsigned ParamAlign
= Arg
.getParamAlignment();
204 if (ParamAlign
!= 0) {
206 LLVMContext::MD_align
,
208 MDB
.createConstant(ConstantInt::get(Builder
.getInt64Ty(),
213 // TODO: Convert noalias arg to !noalias
// Replace the argument's uses. Sub-dword non-aggregate values are extracted
// from the loaded i32 first.
215 if (Size
< 32 && !ArgTy
->isAggregateType()) {
// When an ext argument starts exactly on the dword boundary, attach range
// metadata to the load: [minIntN(Size), maxIntN(Size) + 1) for sext and
// [0, maxUIntN(Size) + 1) for zext.
216 if (IsExtArg
&& OffsetDiff
== 0) {
217 Type
*I32Ty
= Builder
.getInt32Ty();
218 bool IsSext
= Arg
.hasSExtAttr();
219 Metadata
*LowAndHigh
[] = {
220 ConstantAsMetadata::get(
221 ConstantInt::get(I32Ty
, IsSext
? minIntN(Size
) : 0)),
222 ConstantAsMetadata::get(
223 ConstantInt::get(I32Ty
,
224 IsSext
? maxIntN(Size
) + 1 : maxUIntN(Size
) + 1))
227 Load
->setMetadata(LLVMContext::MD_range
, MDNode::get(Ctx
, LowAndHigh
));
// Shift the loaded value right by OffsetDiff bytes (no shift when it is 0),
// truncate to the argument's bit width and bitcast to the argument type; the
// result, named "<arg>.load", replaces the argument.
230 Value
*ExtractBits
= OffsetDiff
== 0 ?
231 Load
: Builder
.CreateLShr(Load
, OffsetDiff
* 8);
233 IntegerType
*ArgIntTy
= Builder
.getIntNTy(Size
);
234 Value
*Trunc
= Builder
.CreateTrunc(ExtractBits
, ArgIntTy
);
235 Value
*NewVal
= Builder
.CreateBitCast(Trunc
, ArgTy
,
236 Arg
.getName() + ".load");
237 Arg
.replaceAllUsesWith(NewVal
);
// Vector path: a shuffle of the widened load with an undef V4Ty operand
// replaces the argument. The branch condition and the shuffle mask are not
// visible in this listing.
239 Value
*Shuf
= Builder
.CreateShuffleVector(Load
, UndefValue::get(V4Ty
),
241 Arg
.getName() + ".load");
242 Arg
.replaceAllUsesWith(Shuf
);
// Remaining case: the load itself, renamed "<arg>.load", replaces the
// argument.
244 Load
->setName(Arg
.getName() + ".load");
245 Arg
.replaceAllUsesWith(Load
);
// Record the alignment of the kernarg segment pointer: the larger of
// KernArgBaseAlign and the largest ABI alignment seen among the arguments.
249 KernArgSegment
->addAttribute(
250 AttributeList::ReturnIndex
,
251 Attribute::getWithAlignment(Ctx
, std::max(KernArgBaseAlign
, MaxAlign
)));
// Pass registration: the pass is registered under DEBUG_TYPE
// ("amdgpu-lower-kernel-arguments") with the description
// "AMDGPU Lower Kernel Arguments".
// NOTE(review): the trailing arguments of INITIALIZE_PASS_END are not visible
// in this listing.
256 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments
, DEBUG_TYPE
,
257 "AMDGPU Lower Kernel Arguments", false, false)
258 INITIALIZE_PASS_END(AMDGPULowerKernelArguments
, DEBUG_TYPE
, "AMDGPU Lower Kernel Arguments",
// Definition of the pass ID that the constructor forwards to FunctionPass.
261 char AMDGPULowerKernelArguments::ID
= 0;
263 FunctionPass
*llvm::createAMDGPULowerKernelArgumentsPass() {
264 return new AMDGPULowerKernelArguments();