//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads from
/// offsets from the kernarg base pointer.
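///
/// For example (an illustrative sketch, not verbatim pass output; value names,
/// the address space, and the starting offset vary by subtarget), a direct use
/// of an i32 kernel argument %x is rewritten to read from the segment instead:
///
///   %segment = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
///   %x.kernarg.offset = getelementptr inbounds i8,
///                         i8 addrspace(4)* %segment, i64 0
///   %x.kernarg.offset.cast = bitcast i8 addrspace(4)* %x.kernarg.offset
///                              to i32 addrspace(4)*
///   %x.load = load i32, i32 addrspace(4)* %x.kernarg.offset.cast, align 16,
///               !invariant.load !0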
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"

using namespace llvm;

namespace {

class AMDGPULowerKernelArguments : public FunctionPass {
public:
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  auto &TPC = getAnalysis<TargetPassConfig>();

  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getParent()->getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&*EntryBlock.begin());

  const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);

  unsigned MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");

  KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
  KernArgSegment->addAttribute(AttributeList::ReturnIndex,
    Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
  uint64_t ExplicitArgOffset = 0;

  for (Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned Align = DL.getABITypeAlignment(ArgTy);
    unsigned Size = DL.getTypeSizeInBits(ArgTy);
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;

    if (Arg.use_empty())
      continue;

    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
          ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }

    VectorType *VT = dyn_cast<VectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);

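    // A worked example of the sub-dword path below (values assumed for
    // illustration only): an i16 argument at EltOffset 6 gives
    // AlignDownOffset = 4 and OffsetDiff = 2, so a full i32 is loaded from
    // offset 4 and the argument is recovered later by shifting right
    // OffsetDiff * 8 = 16 bits and truncating back to i16.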
    Value *ArgPtr;
    if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.

      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
        KernArgSegment,
        AlignDownOffset,
        Arg.getName() + ".kernarg.offset.align.down");
      ArgPtr = Builder.CreateBitCast(ArgPtr,
                                     Builder.getInt32Ty()->getPointerTo(AS),
                                     ArgPtr->getName() + ".cast");
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
        KernArgSegment,
        EltOffset,
        Arg.getName() + ".kernarg.offset");
      ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
                                     ArgPtr->getName() + ".cast");
    }

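    // For 3-element vectors whose total size is at least 32 bits, the pointer
    // is re-cast below so that a 4-element vector is loaded instead; the extra
    // lane is dropped by the shuffle near the end of the loop (illustration:
    // a <3 x i32> argument is loaded as <4 x i32> and shuffled back with mask
    // <0, 1, 2>).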
    if (IsV3 && Size >= 32) {
      V4Ty = VectorType::get(VT->getVectorElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
    }

    LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    MDBuilder MDB(Ctx);

    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
          LLVMContext::MD_dereferenceable,
          MDNode::get(Ctx,
                      MDB.createConstant(
                        ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
          LLVMContext::MD_dereferenceable_or_null,
          MDNode::get(Ctx,
                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                          DerefOrNullBytes))));
      }

      unsigned ParamAlign = Arg.getParamAlignment();
      if (ParamAlign != 0) {
        Load->setMetadata(
          LLVMContext::MD_align,
          MDNode::get(Ctx,
                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                          ParamAlign))));
      }
    }

    // TODO: Convert noalias arg to !noalias

    if (Size < 32 && !ArgTy->isAggregateType()) {
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
                                                {0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

  KernArgSegment->addAttribute(
    AttributeList::ReturnIndex,
    Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
                      "AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
                    "AMDGPU Lower Kernel Arguments", false, false)

char AMDGPULowerKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}