//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnablePowerSched(
    "amdgpu-enable-power-sched",
    cl::desc("Enable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(3), cl::Hidden);

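// Illustrative use of the flags above (hypothetical invocation; defaults from
// the declarations apply when a flag is omitted):
//   llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-enable-power-sched \
//       -amdgpu-use-aa-in-codegen=0 -amdgpu-nsa-threshold=5 input.ll
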
GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
      !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    // If there is no default wave size it must be a generation before gfx10,
    // these have FeatureWavefrontSize64 in their definition already. For gfx10+
    // set wave32 as a default.
    ToggleFeature(AMDGPU::FeatureWavefrontSize32);
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }

  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

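// For illustration only: on an HSA target the feature string assembled above
// takes roughly the form
//   "+promote-alloca,+load-store-opt,+enable-ds128,+flat-for-global,
//    +unaligned-access-mode,+trap-handler,+enable-prt-strict-null,<FS>"
// before being handed to ParseSubtargetFeatures(); the user-supplied FS is
// appended last, so it overrides the defaults prepended here.
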
void GCNSubtarget::checkSubtargetFeatures(const Function &F) const {
  LLVMContext &Ctx = F.getContext();
  if (hasFeature(AMDGPU::FeatureWavefrontSize32) ==
      hasFeature(AMDGPU::FeatureWavefrontSize64)) {
    Ctx.diagnose(DiagnosticInfoUnsupported(
        F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
  }
}

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
  InlineAsmLoweringInfo =
      std::make_unique<InlineAsmLowering>(getTargetLowering());
  Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
  RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
  InstSelector =
      std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  default:
    return 2;
  }
}

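// For context (informal note): the value returned above is the number of
// scalar (SGPR/literal) operands a single VALU instruction may read through
// the constant bus; pre-GFX10 targets allow one, GFX10+ generally allows two,
// with the 64-bit shifts listed above kept at one.
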
/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

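// Rough worked example (illustrative numbers only): with a 64-lane wavefront,
// a 256-lane maximum workgroup (4 waves per group) and 4 EUs per CU, a request
// of NWaves = 8 gives WorkGroupsPerCU = max(1, (8 * 4) / 4) = 8, so each
// workgroup may use getLocalMemorySize() / 8 bytes of LDS
// (65536 / 8 = 8192 on a 64 KiB device).
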
// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine; and
// taking into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

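// Rough worked example (illustrative numbers only): with 64 KiB of LDS, a
// 16 KiB request, a 256-lane maximum workgroup, a 64-lane wavefront and
// 4 EUs per CU, NumGroups = 65536 / 16384 = 4 (then clamped by
// getMaxWorkGroupsPerCU), MaxGroupNumWaves = divideCeil(256, 64) = 4, so
// MaxWaves = 16 waves per CU, i.e. divideCeil(16, 4) = 4 waves per EU before
// the final clamp against getMaxWavesPerEU().
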
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

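// For reference, the attribute consumed above is spelled on the IR function
// as, e.g., "amdgpu-flat-work-group-size"="64,256"; inverted pairs or values
// outside the subtarget's supported range fall back to the calling-convention
// default computed above.
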
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

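// For reference, the corresponding IR attribute looks like, e.g.,
// "amdgpu-waves-per-eu"="2,4"; the requested range is then reconciled with
// the flat workgroup sizes by getEffectiveWavesPerEU() above.
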
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else {
    MinSize++;
    MaxSize++;
  }

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

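// Illustrative result (assuming a 256-lane maximum workgroup and no
// reqd_work_group_size metadata): a workitem-id query is annotated with the
// half-open range [0, 256), while a local-size query gets [1, 257).
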
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

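// Rough illustration (hypothetical kernel on a COv5 HSA target, assuming an
// 8-byte implicit-arg alignment): two i32 arguments plus one 8-byte pointer
// pack into 16 explicit bytes, so with 256 implicit bytes the reported size
// is alignTo(alignTo(16, 8) + 256, 4) = 272.
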
AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
  if (isWave32()) {
    // Fix implicit $vcc operands after MIParser has verified that they match
    // the instruction definitions.
    for (auto &MBB : MF) {
      for (auto &MI : MBB)
        InstrInfo.fixImplicitOperands(MI);
    }
  }
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
                                                   getGeneration());
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(),
               getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

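// Informal example: if LDS usage limits a kernel to 6 waves per EU, its SGPR
// count to 10 and its VGPR count to 4, the reported occupancy is
// min(6, 10, 4) = 4 waves per EU.
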
unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

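// Back-of-the-envelope count (assuming the usual field widths of 4 SGPRs for
// the private segment buffer and 2 for each pointer/ID field above): the user
// portion sums to 16, plus 5 system SGPRs and 1 synthetic SGPR, i.e. a worst
// case of 22 preloaded SGPRs.
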
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // data dependencies with zero latency.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit *, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return NSAThreshold;
}

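// For reference, the per-function override consumed above is written as,
// e.g., "amdgpu-nsa-threshold"="4" on the IR function; both the command-line
// flag and the attribute are clamped to a minimum of 2 addresses.
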
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");

  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

  if (hasPrivateSegmentSize())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
}

void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
}