//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}
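// Worked example (illustrative numbers, not tied to any particular
// subtarget): with a 64-lane wavefront and a maximum flat workgroup size of
// 256, each workgroup needs ceil(256 / 64) = 4 waves. Requesting NWaves = 8
// on a CU with 4 EUs then allows (8 * 4) / 4 = 8 workgroups per CU, so
// 64 KiB of LDS leaves 65536 / 8 = 8192 bytes of LDS per workgroup.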
// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine, taking
// into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding
  // the size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  // MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}
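// Worked example (illustrative numbers, assuming getMaxWorkGroupsPerCU
// allows at least 4 groups): with Bytes = 16384 and 64 KiB of LDS,
// NumGroups = 65536 / 16384 = 4 workgroups fit. With a maximum flat
// workgroup size of 256 and a 64-lane wavefront, each group contributes
// ceil(256 / 64) = 4 waves, so MaxWaves = 4 * 4 = 16 per CU, i.e.
// ceil(16 / 4) = 4 waves per EU on a CU with 4 EUs, before clamping to
// getMaxWavesPerEU().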
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
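// For example, a kernel carrying the IR attribute
//   "amdgpu-flat-work-group-size"="128,256"
// requests between 128 and 256 lanes per workgroup; the pair is returned
// as-is unless it is inverted or falls outside the subtarget's limits, in
// which case the calling-convention default is used instead.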
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set default
  // minimum/maximum number of waves per execution unit to values implied by
  // requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by
  // requested minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
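// Worked example (assuming a hypothetical subtarget where
// getWavesPerEUForWorkGroup(256) returns 2 and getMaxWavesPerEU() returns
// 10): a request of (4, 8) is returned unchanged, while a request of (1, 8)
// is rejected in favor of the default (2, 10) because its minimum is below
// the value implied by a maximum flat workgroup size of 256.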
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}
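// The request typically comes from frontend-emitted IR such as
//   "amdgpu-waves-per-eu"="4,8"
// where, because only the first value is required (the trailing 'true'
// argument above), the maximum may be omitted to leave it unconstrained.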
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto *Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}
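// For example, an OpenCL kernel compiled with a fixed workgroup size carries
// metadata of the form
//   !reqd_work_group_size !{i32 64, i32 1, i32 1}
// for which Dim = 0 yields 64 while Dim = 1 and Dim = 2 yield 1.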
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}
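// For example, a call to llvm.amdgcn.workitem.id.x in a kernel with
// reqd_work_group_size {64, 1, 1} is annotated with the half-open range
// [0, 64), whereas a size query (r600.read.local.size.x) in the same kernel
// gets [64, 65), i.e. exactly 64.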
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}
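// A frontend can also narrow the segment explicitly, e.g. with
//   "amdgpu-implicitarg-num-bytes"="48"
// (illustrative value); the attribute takes precedence over the
// code-object-version default computed above.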
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      continue;

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
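// Worked example: for an AMDGPU_KERNEL signature (i32, double, i8) the
// running offset is alignTo(0, 4) + 4 = 4, then alignTo(4, 8) + 8 = 16,
// then alignTo(16, 1) + 1 = 17, so this returns 17 with MaxAlign = 8.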
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}
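// Continuing the example above (17 explicit bytes, and assuming an explicit
// kernarg offset of 0 and an 8-byte implicit-argument alignment): with a
// 256-byte implicit segment, TotalSize = alignTo(17, 8) + 256 = 280, which
// is already a multiple of 4, so the segment size is 280 bytes.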
AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
                                        std::numeric_limits<uint32_t>::max());
}
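// For example, "amdgpu-max-num-workgroups"="16,8,4" bounds a kernel's grid
// to at most 16 x 8 x 4 workgroups; absent the attribute, each dimension
// defaults to UINT32_MAX, i.e. effectively unbounded.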