//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;
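  // Illustrative example for the checks above: if FS explicitly enables
  // "+wavefrontsize32", the loop appends "-wavefrontsize16,-wavefrontsize64,"
  // so only the requested wavefront size remains set once FS is appended.
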
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }
  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }
  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;
  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }
  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;
  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    BackOffBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),
    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableTgSplit(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnablePRTStrictNull(false),
    GFX7GFX8GFX9Insts(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasFmaMixInsts(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAScalar(false),
    HasSDWAOutModsVOPC(false),
    HasPackedFP32Ops(false),
    HasImageInsts(false),
    HasExtendedImageInsts(false),
    HasNSAEncoding(false),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    EnableFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    ScalarizeGlobal(false),
    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),
    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  default:
    return 2;
  }
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

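// Worked example for the calculation above (illustrative numbers only): with
// 65536 bytes of LDS, MaxWaves = 10, WorkGroupsPerCu = 8 and NWaves = 5, the
// budget is 65536 * 10 / 8 / 5 = 16384 bytes.
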
// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

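// Worked example for the calculation above (illustrative numbers only): 65536
// bytes of LDS with Bytes = 16384 gives NumGroups = 4; MaxWorkGroupSize = 256
// and WaveSize = 64 give MaxGroupNumWaves = 4, so MaxWaves = 16 before it is
// clamped to getMaxWavesPerEU().
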
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

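// Illustrative IR-level usage of the attribute above (hypothetical kernel): an
// attribute such as "amdgpu-flat-work-group-size"="128,256" parses to the pair
// {128, 256} and is honored only if it stays within the subtarget's limits.
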
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

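// Illustrative IR-level usage of the attribute above (hypothetical kernel):
// "amdgpu-waves-per-eu"="2,4" requests 2 to 4 waves per EU; the request falls
// back to the defaults when it violates the subtarget limits or the flat work
// group size implications.
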
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

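// Illustrative metadata shape for the helper above (hypothetical kernel): with
// !reqd_work_group_size !0 and !0 = !{i32 64, i32 1, i32 1}, this returns 64
// for Dim == 0 and 1 for the other two dimensions.
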
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

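// Illustrative result of the function above: a call to
// llvm.amdgcn.workitem.id.x in a kernel with a required X size of 64 receives
// !range metadata of [0, 64), while a local-size query receives [64, 65) so
// the size value itself stays inside the range.
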
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

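// Worked example for the calculation above (illustrative numbers only):
// ExplicitOffset = 36 and ExplicitArgBytes = 10 with no implicit bytes gives
// TotalSize = 46, which is reported as alignTo(46, 4) = 48 bytes.
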
AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

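// Worked example for the calculation above (illustrative numbers only): with
// Granule = 4 and VGPRs = 10, RoundedRegs = 12; a subtarget with 256 total
// VGPRs then yields std::min(std::max(256 / 12, 1u), MaxWaves), i.e.
// std::min(21u, MaxWaves).
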
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

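// Illustrative IR-level usage of the attribute above (hypothetical kernel):
// "amdgpu-num-sgpr"="64" requests a 64-SGPR budget. The request is dropped if
// it does not exceed the reserved SGPR count or conflicts with the
// waves-per-EU bounds, and it is raised to cover the preloaded input SGPRs.
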
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs.
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
  // Max number of system SGPRs.
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

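// With the per-field maxima above this sums to 16 user SGPRs plus 5 system
// SGPRs, i.e. a conservative total of 21 preloaded SGPRs.
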
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }
  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }
  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }
  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}