//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "R600Subtarget.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
23 #include "llvm/CodeGen/MachineScheduler.h"
24 #include "llvm/CodeGen/TargetFrameLowering.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/IR/IntrinsicsR600.h"
28 #include "llvm/IR/MDBuilder.h"
33 #define DEBUG_TYPE "amdgpu-subtarget"
35 AMDGPUSubtarget::AMDGPUSubtarget(Triple TT
) : TargetTriple(std::move(TT
)) {}
37 bool AMDGPUSubtarget::useRealTrue16Insts() const {
38 return hasTrue16BitInsts() && EnableRealTrue16Insts
;
41 // Returns the maximum per-workgroup LDS allocation size (in bytes) that still
42 // allows the given function to achieve an occupancy of NWaves waves per
43 // SIMD / EU, taking into account only the function's *maximum* workgroup size.
45 AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves
,
46 const Function
&F
) const {
47 const unsigned WaveSize
= getWavefrontSize();
48 const unsigned WorkGroupSize
= getFlatWorkGroupSizes(F
).second
;
49 const unsigned WavesPerWorkgroup
=
50 std::max(1u, (WorkGroupSize
+ WaveSize
- 1) / WaveSize
);
52 const unsigned WorkGroupsPerCU
=
53 std::max(1u, (NWaves
* getEUsPerCU()) / WavesPerWorkgroup
);
55 return getLocalMemorySize() / WorkGroupsPerCU
;
58 // FIXME: Should return min,max range.
60 // Returns the maximum occupancy, in number of waves per SIMD / EU, that can
61 // be achieved when only the given function is running on the machine; and
62 // taking into account the overall number of wave slots, the (maximum) workgroup
63 // size, and the per-workgroup LDS allocation size.
64 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes
,
65 const Function
&F
) const {
66 const unsigned MaxWorkGroupSize
= getFlatWorkGroupSizes(F
).second
;
67 const unsigned MaxWorkGroupsPerCu
= getMaxWorkGroupsPerCU(MaxWorkGroupSize
);
68 if (!MaxWorkGroupsPerCu
)
71 const unsigned WaveSize
= getWavefrontSize();
73 // FIXME: Do we need to account for alignment requirement of LDS rounding the
75 // Compute restriction based on LDS usage
76 unsigned NumGroups
= getLocalMemorySize() / (Bytes
? Bytes
: 1u);
78 // This can be queried with more LDS than is possible, so just assume the
83 NumGroups
= std::min(MaxWorkGroupsPerCu
, NumGroups
);
85 // Round to the number of waves per CU.
86 const unsigned MaxGroupNumWaves
= divideCeil(MaxWorkGroupSize
, WaveSize
);
87 unsigned MaxWaves
= NumGroups
* MaxGroupNumWaves
;
89 // Number of waves per EU (SIMD).
90 MaxWaves
= divideCeil(MaxWaves
, getEUsPerCU());
92 // Clamp to the maximum possible number of waves.
93 MaxWaves
= std::min(MaxWaves
, getMaxWavesPerEU());
95 // FIXME: Needs to be a multiple of the group size?
96 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
98 assert(MaxWaves
> 0 && MaxWaves
<= getMaxWavesPerEU() &&
99 "computed invalid occupancy");
104 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction
&MF
) const {
105 const auto *MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
106 return getOccupancyWithLocalMemSize(MFI
->getLDSSize(), MF
.getFunction());
109 std::pair
<unsigned, unsigned>
110 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC
) const {
112 case CallingConv::AMDGPU_VS
:
113 case CallingConv::AMDGPU_LS
:
114 case CallingConv::AMDGPU_HS
:
115 case CallingConv::AMDGPU_ES
:
116 case CallingConv::AMDGPU_GS
:
117 case CallingConv::AMDGPU_PS
:
118 return std::pair(1, getWavefrontSize());
120 return std::pair(1u, getMaxFlatWorkGroupSize());
124 std::pair
<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
125 const Function
&F
) const {
126 // Default minimum/maximum flat work group sizes.
127 std::pair
<unsigned, unsigned> Default
=
128 getDefaultFlatWorkGroupSize(F
.getCallingConv());
130 // Requested minimum/maximum flat work group sizes.
131 std::pair
<unsigned, unsigned> Requested
= AMDGPU::getIntegerPairAttribute(
132 F
, "amdgpu-flat-work-group-size", Default
);
134 // Make sure requested minimum is less than requested maximum.
135 if (Requested
.first
> Requested
.second
)
138 // Make sure requested values do not violate subtarget's specifications.
139 if (Requested
.first
< getMinFlatWorkGroupSize())
141 if (Requested
.second
> getMaxFlatWorkGroupSize())
147 std::pair
<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
148 std::pair
<unsigned, unsigned> Requested
,
149 std::pair
<unsigned, unsigned> FlatWorkGroupSizes
) const {
150 // Default minimum/maximum number of waves per execution unit.
151 std::pair
<unsigned, unsigned> Default(1, getMaxWavesPerEU());
153 // If minimum/maximum flat work group sizes were explicitly requested using
154 // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
155 // number of waves per execution unit to values implied by requested
156 // minimum/maximum flat work group sizes.
157 unsigned MinImpliedByFlatWorkGroupSize
=
158 getWavesPerEUForWorkGroup(FlatWorkGroupSizes
.second
);
159 Default
.first
= MinImpliedByFlatWorkGroupSize
;
161 // Make sure requested minimum is less than requested maximum.
162 if (Requested
.second
&& Requested
.first
> Requested
.second
)
165 // Make sure requested values do not violate subtarget's specifications.
166 if (Requested
.first
< getMinWavesPerEU() ||
167 Requested
.second
> getMaxWavesPerEU())
170 // Make sure requested values are compatible with values implied by requested
171 // minimum/maximum flat work group sizes.
172 if (Requested
.first
< MinImpliedByFlatWorkGroupSize
)
178 std::pair
<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
179 const Function
&F
, std::pair
<unsigned, unsigned> FlatWorkGroupSizes
) const {
180 // Default minimum/maximum number of waves per execution unit.
181 std::pair
<unsigned, unsigned> Default(1, getMaxWavesPerEU());
183 // Requested minimum/maximum number of waves per execution unit.
184 std::pair
<unsigned, unsigned> Requested
=
185 AMDGPU::getIntegerPairAttribute(F
, "amdgpu-waves-per-eu", Default
, true);
186 return getEffectiveWavesPerEU(Requested
, FlatWorkGroupSizes
);
189 static unsigned getReqdWorkGroupSize(const Function
&Kernel
, unsigned Dim
) {
190 auto *Node
= Kernel
.getMetadata("reqd_work_group_size");
191 if (Node
&& Node
->getNumOperands() == 3)
192 return mdconst::extract
<ConstantInt
>(Node
->getOperand(Dim
))->getZExtValue();
193 return std::numeric_limits
<unsigned>::max();
196 bool AMDGPUSubtarget::isMesaKernel(const Function
&F
) const {
197 return isMesa3DOS() && !AMDGPU::isShader(F
.getCallingConv());
200 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function
&Kernel
,
201 unsigned Dimension
) const {
202 unsigned ReqdSize
= getReqdWorkGroupSize(Kernel
, Dimension
);
203 if (ReqdSize
!= std::numeric_limits
<unsigned>::max())
205 return getFlatWorkGroupSizes(Kernel
).second
- 1;
208 bool AMDGPUSubtarget::isSingleLaneExecution(const Function
&Func
) const {
209 for (int I
= 0; I
< 3; ++I
) {
210 if (getMaxWorkitemID(Func
, I
) > 0)
217 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction
*I
) const {
218 Function
*Kernel
= I
->getParent()->getParent();
219 unsigned MinSize
= 0;
220 unsigned MaxSize
= getFlatWorkGroupSizes(*Kernel
).second
;
221 bool IdQuery
= false;
223 // If reqd_work_group_size is present it narrows value down.
224 if (auto *CI
= dyn_cast
<CallInst
>(I
)) {
225 const Function
*F
= CI
->getCalledFunction();
227 unsigned Dim
= UINT_MAX
;
228 switch (F
->getIntrinsicID()) {
229 case Intrinsic::amdgcn_workitem_id_x
:
230 case Intrinsic::r600_read_tidig_x
:
233 case Intrinsic::r600_read_local_size_x
:
236 case Intrinsic::amdgcn_workitem_id_y
:
237 case Intrinsic::r600_read_tidig_y
:
240 case Intrinsic::r600_read_local_size_y
:
243 case Intrinsic::amdgcn_workitem_id_z
:
244 case Intrinsic::r600_read_tidig_z
:
247 case Intrinsic::r600_read_local_size_z
:
255 unsigned ReqdSize
= getReqdWorkGroupSize(*Kernel
, Dim
);
256 if (ReqdSize
!= std::numeric_limits
<unsigned>::max())
257 MinSize
= MaxSize
= ReqdSize
;
265 // Range metadata is [Lo, Hi). For ID query we need to pass max size
266 // as Hi. For size query we need to pass Hi + 1.
272 APInt Lower
{32, MinSize
};
273 APInt Upper
{32, MaxSize
};
274 if (auto *CI
= dyn_cast
<CallBase
>(I
)) {
275 ConstantRange
Range(Lower
, Upper
);
276 CI
->addRangeRetAttr(Range
);
278 MDBuilder
MDB(I
->getContext());
279 MDNode
*MaxWorkGroupSizeRange
= MDB
.createRange(Lower
, Upper
);
280 I
->setMetadata(LLVMContext::MD_range
, MaxWorkGroupSizeRange
);
285 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function
&F
) const {
286 assert(AMDGPU::isKernel(F
.getCallingConv()));
288 // We don't allocate the segment if we know the implicit arguments weren't
289 // used, even if the ABI implies we need them.
290 if (F
.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
296 // Assume all implicit inputs are used by default
297 const Module
*M
= F
.getParent();
299 AMDGPU::getAMDHSACodeObjectVersion(*M
) >= AMDGPU::AMDHSA_COV5
? 256 : 56;
300 return F
.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
304 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function
&F
,
305 Align
&MaxAlign
) const {
306 assert(F
.getCallingConv() == CallingConv::AMDGPU_KERNEL
||
307 F
.getCallingConv() == CallingConv::SPIR_KERNEL
);
309 const DataLayout
&DL
= F
.getDataLayout();
310 uint64_t ExplicitArgBytes
= 0;
313 for (const Argument
&Arg
: F
.args()) {
314 if (Arg
.hasAttribute("amdgpu-hidden-argument"))
317 const bool IsByRef
= Arg
.hasByRefAttr();
318 Type
*ArgTy
= IsByRef
? Arg
.getParamByRefType() : Arg
.getType();
319 Align Alignment
= DL
.getValueOrABITypeAlignment(
320 IsByRef
? Arg
.getParamAlign() : std::nullopt
, ArgTy
);
321 uint64_t AllocSize
= DL
.getTypeAllocSize(ArgTy
);
322 ExplicitArgBytes
= alignTo(ExplicitArgBytes
, Alignment
) + AllocSize
;
323 MaxAlign
= std::max(MaxAlign
, Alignment
);
326 return ExplicitArgBytes
;
329 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function
&F
,
330 Align
&MaxAlign
) const {
331 if (F
.getCallingConv() != CallingConv::AMDGPU_KERNEL
&&
332 F
.getCallingConv() != CallingConv::SPIR_KERNEL
)
335 uint64_t ExplicitArgBytes
= getExplicitKernArgSize(F
, MaxAlign
);
337 unsigned ExplicitOffset
= getExplicitKernelArgOffset();
339 uint64_t TotalSize
= ExplicitOffset
+ ExplicitArgBytes
;
340 unsigned ImplicitBytes
= getImplicitArgNumBytes(F
);
341 if (ImplicitBytes
!= 0) {
342 const Align Alignment
= getAlignmentForImplicitArgPtr();
343 TotalSize
= alignTo(ExplicitArgBytes
, Alignment
) + ImplicitBytes
;
344 MaxAlign
= std::max(MaxAlign
, Alignment
);
347 // Being able to dereference past the end is useful for emitting scalar loads.
348 return alignTo(TotalSize
, 4);
351 AMDGPUDwarfFlavour
AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
352 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
353 : AMDGPUDwarfFlavour::Wave64
;
356 const AMDGPUSubtarget
&AMDGPUSubtarget::get(const MachineFunction
&MF
) {
357 if (MF
.getTarget().getTargetTriple().getArch() == Triple::amdgcn
)
358 return static_cast<const AMDGPUSubtarget
&>(MF
.getSubtarget
<GCNSubtarget
>());
359 return static_cast<const AMDGPUSubtarget
&>(MF
.getSubtarget
<R600Subtarget
>());
362 const AMDGPUSubtarget
&AMDGPUSubtarget::get(const TargetMachine
&TM
, const Function
&F
) {
363 if (TM
.getTargetTriple().getArch() == Triple::amdgcn
)
364 return static_cast<const AMDGPUSubtarget
&>(TM
.getSubtarget
<GCNSubtarget
>(F
));
365 return static_cast<const AMDGPUSubtarget
&>(
366 TM
.getSubtarget
<R600Subtarget
>(F
));
369 SmallVector
<unsigned>
370 AMDGPUSubtarget::getMaxNumWorkGroups(const Function
&F
) const {
371 return AMDGPU::getIntegerVecAttribute(F
, "amdgpu-max-num-workgroups", 3,
372 std::numeric_limits
<uint32_t>::max());