1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
32 #define DEBUG_TYPE "amdgpu-subtarget"
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
43 static cl::opt
<bool> DisablePowerSched(
44 "amdgpu-disable-power-sched",
45 cl::desc("Disable scheduling to minimize mAI power bursts"),
48 GCNSubtarget::~GCNSubtarget() = default;
51 R600Subtarget::initializeSubtargetDependencies(const Triple
&TT
,
52 StringRef GPU
, StringRef FS
) {
53 SmallString
<256> FullFS("+promote-alloca,");
55 ParseSubtargetFeatures(GPU
, FullFS
);
57 // FIXME: I don't think Evergreen has any useful support for
58 // denormals, but should be checked. Should we issue a warning somewhere
59 // if someone tries to enable these?
60 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS
) {
61 FP32Denormals
= false;
64 HasMulU24
= getGeneration() >= EVERGREEN
;
65 HasMulI24
= hasCaymanISA();
71 GCNSubtarget::initializeSubtargetDependencies(const Triple
&TT
,
72 StringRef GPU
, StringRef FS
) {
73 // Determine default and user-specified characteristics
74 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
75 // enabled, but some instructions do not respect them and they run at the
76 // double precision rate, so don't enable by default.
78 // We want to be able to turn these off, but making this a subtarget feature
79 // for SI has the unhelpful behavior that it unsets everything else if you
82 // Similarly we want enable-prt-strict-null to be on by default and not to
83 // unset everything else if it is disabled
85 // Assuming ECC is enabled is the conservative default.
86 SmallString
<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
88 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
89 FullFS
+= "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
91 // FIXME: I don't think Evergreen has any useful support for
92 // denormals, but should be checked. Should we issue a warning somewhere
93 // if someone tries to enable these?
94 if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS
) {
95 FullFS
+= "+fp64-fp16-denormals,";
97 FullFS
+= "-fp32-denormals,";
100 FullFS
+= "+enable-prt-strict-null,"; // This is overridden by a disable in FS
102 // Disable mutually exclusive bits.
103 if (FS
.find_lower("+wavefrontsize") != StringRef::npos
) {
104 if (FS
.find_lower("wavefrontsize16") == StringRef::npos
)
105 FullFS
+= "-wavefrontsize16,";
106 if (FS
.find_lower("wavefrontsize32") == StringRef::npos
)
107 FullFS
+= "-wavefrontsize32,";
108 if (FS
.find_lower("wavefrontsize64") == StringRef::npos
)
109 FullFS
+= "-wavefrontsize64,";
114 ParseSubtargetFeatures(GPU
, FullFS
);
116 // We don't support FP64 for EG/NI atm.
117 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS
));
119 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
120 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
121 // variants of MUBUF instructions.
122 if (!hasAddr64() && !FS
.contains("flat-for-global")) {
123 FlatForGlobal
= true;
126 // Set defaults if needed.
127 if (MaxPrivateElementSize
== 0)
128 MaxPrivateElementSize
= 4;
130 if (LDSBankCount
== 0)
133 if (TT
.getArch() == Triple::amdgcn
) {
134 if (LocalMemorySize
== 0)
135 LocalMemorySize
= 32768;
137 // Do something sensible for unspecified target.
138 if (!HasMovrel
&& !HasVGPRIndexMode
)
142 // Don't crash on invalid devices.
143 if (WavefrontSize
== 0)
146 HasFminFmaxLegacy
= getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS
;
148 if (DoesNotSupportXNACK
&& EnableXNACK
) {
149 ToggleFeature(AMDGPU::FeatureXNACK
);
153 // ECC is on by default, but turn it off if the hardware doesn't support it
154 // anyway. This matters for the gfx9 targets with d16 loads, but don't support
156 if (DoesNotSupportSRAMECC
&& EnableSRAMECC
) {
157 ToggleFeature(AMDGPU::FeatureSRAMECC
);
158 EnableSRAMECC
= false;
164 AMDGPUSubtarget::AMDGPUSubtarget(const Triple
&TT
) :
166 Has16BitInsts(false),
167 HasMadMixInsts(false),
168 FP32Denormals(false),
171 HasVOP3PInsts(false),
174 HasInv2PiInlineImm(false),
175 HasFminFmaxLegacy(true),
176 EnablePromoteAlloca(false),
177 HasTrigReducedRange(false),
183 GCNSubtarget::GCNSubtarget(const Triple
&TT
, StringRef GPU
, StringRef FS
,
184 const GCNTargetMachine
&TM
) :
185 AMDGPUGenSubtargetInfo(TT
, GPU
, FS
),
188 Gen(TT
.getOS() == Triple::AMDHSA
? SEA_ISLANDS
: SOUTHERN_ISLANDS
),
189 InstrItins(getInstrItineraryForCPU(GPU
)),
191 MaxPrivateElementSize(0),
194 HalfRate64Ops(false),
196 FP64FP16Denormals(false),
197 FlatForGlobal(false),
198 AutoWaitcntBeforeBarrier(false),
200 UnalignedScratchAccess(false),
201 UnalignedBufferAccess(false),
203 HasApertureRegs(false),
205 DoesNotSupportXNACK(false),
209 EnableLoadStoreOpt(false),
210 EnableUnsafeDSOffsetFolding(false),
211 EnableSIScheduler(false),
213 EnablePRTStrictNull(false),
222 GFX7GFX8GFX9Insts(false),
224 HasSMemRealTime(false),
226 HasFmaMixInsts(false),
228 HasVGPRIndexMode(false),
229 HasScalarStores(false),
230 HasScalarAtomics(false),
232 HasSDWAScalar(false),
235 HasSDWAOutModsVOPC(false),
239 HasNSAEncoding(false),
248 HasPkFmacF16Inst(false),
249 HasAtomicFaddInsts(false),
250 EnableSRAMECC(false),
251 DoesNotSupportSRAMECC(false),
252 HasNoSdstCMPX(false),
254 HasRegisterBanking(false),
255 HasVOP3Literal(false),
256 HasNoDataDepHazard(false),
257 FlatAddressSpace(false),
258 FlatInstOffsets(false),
259 FlatGlobalInsts(false),
260 FlatScratchInsts(false),
261 ScalarFlatScratchInsts(false),
262 AddNoCarryInsts(false),
263 HasUnpackedD16VMem(false),
264 LDSMisalignedBug(false),
265 HasMFMAInlineLiteralBug(false),
267 ScalarizeGlobal(false),
269 HasVcmpxPermlaneHazard(false),
270 HasVMEMtoScalarWriteHazard(false),
271 HasSMEMtoVectorWriteHazard(false),
272 HasInstFwdPrefetchBug(false),
273 HasVcmpxExecWARHazard(false),
274 HasLdsBranchVmemWARHazard(false),
275 HasNSAtoVMEMBug(false),
276 HasOffset3fBug(false),
277 HasFlatSegmentOffsetBug(false),
279 FeatureDisable(false),
280 InstrInfo(initializeSubtargetDependencies(TT
, GPU
, FS
)),
282 FrameLowering(TargetFrameLowering::StackGrowsUp
, getStackAlignment(), 0) {
283 MaxWavesPerEU
= AMDGPU::IsaInfo::getMaxWavesPerEU(this);
284 CallLoweringInfo
.reset(new AMDGPUCallLowering(*getTargetLowering()));
285 Legalizer
.reset(new AMDGPULegalizerInfo(*this, TM
));
286 RegBankInfo
.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
287 InstSelector
.reset(new AMDGPUInstructionSelector(
288 *this, *static_cast<AMDGPURegisterBankInfo
*>(RegBankInfo
.get()), TM
));
291 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode
) const {
292 if (getGeneration() < GFX10
)
296 case AMDGPU::V_LSHLREV_B64
:
297 case AMDGPU::V_LSHLREV_B64_gfx10
:
298 case AMDGPU::V_LSHL_B64
:
299 case AMDGPU::V_LSHRREV_B64
:
300 case AMDGPU::V_LSHRREV_B64_gfx10
:
301 case AMDGPU::V_LSHR_B64
:
302 case AMDGPU::V_ASHRREV_I64
:
303 case AMDGPU::V_ASHRREV_I64_gfx10
:
304 case AMDGPU::V_ASHR_I64
:
311 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves
,
312 const Function
&F
) const {
314 return getLocalMemorySize();
315 unsigned WorkGroupSize
= getFlatWorkGroupSizes(F
).second
;
316 unsigned WorkGroupsPerCu
= getMaxWorkGroupsPerCU(WorkGroupSize
);
317 if (!WorkGroupsPerCu
)
319 unsigned MaxWaves
= getMaxWavesPerEU();
320 return getLocalMemorySize() * MaxWaves
/ WorkGroupsPerCu
/ NWaves
;
323 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes
,
324 const Function
&F
) const {
325 unsigned WorkGroupSize
= getFlatWorkGroupSizes(F
).second
;
326 unsigned WorkGroupsPerCu
= getMaxWorkGroupsPerCU(WorkGroupSize
);
327 if (!WorkGroupsPerCu
)
329 unsigned MaxWaves
= getMaxWavesPerEU();
330 unsigned Limit
= getLocalMemorySize() * MaxWaves
/ WorkGroupsPerCu
;
331 unsigned NumWaves
= Limit
/ (Bytes
? Bytes
: 1u);
332 NumWaves
= std::min(NumWaves
, MaxWaves
);
333 NumWaves
= std::max(NumWaves
, 1u);
338 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction
&MF
) const {
339 const auto *MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
340 return getOccupancyWithLocalMemSize(MFI
->getLDSSize(), MF
.getFunction());
343 std::pair
<unsigned, unsigned>
344 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC
) const {
346 case CallingConv::AMDGPU_CS
:
347 case CallingConv::AMDGPU_KERNEL
:
348 case CallingConv::SPIR_KERNEL
:
349 return std::make_pair(getWavefrontSize() * 2,
350 std::max(getWavefrontSize() * 4, 256u));
351 case CallingConv::AMDGPU_VS
:
352 case CallingConv::AMDGPU_LS
:
353 case CallingConv::AMDGPU_HS
:
354 case CallingConv::AMDGPU_ES
:
355 case CallingConv::AMDGPU_GS
:
356 case CallingConv::AMDGPU_PS
:
357 return std::make_pair(1, getWavefrontSize());
359 return std::make_pair(1, 16 * getWavefrontSize());
363 std::pair
<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
364 const Function
&F
) const {
365 // FIXME: 1024 if function.
366 // Default minimum/maximum flat work group sizes.
367 std::pair
<unsigned, unsigned> Default
=
368 getDefaultFlatWorkGroupSize(F
.getCallingConv());
370 // Requested minimum/maximum flat work group sizes.
371 std::pair
<unsigned, unsigned> Requested
= AMDGPU::getIntegerPairAttribute(
372 F
, "amdgpu-flat-work-group-size", Default
);
374 // Make sure requested minimum is less than requested maximum.
375 if (Requested
.first
> Requested
.second
)
378 // Make sure requested values do not violate subtarget's specifications.
379 if (Requested
.first
< getMinFlatWorkGroupSize())
381 if (Requested
.second
> getMaxFlatWorkGroupSize())
387 std::pair
<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
388 const Function
&F
) const {
389 // Default minimum/maximum number of waves per execution unit.
390 std::pair
<unsigned, unsigned> Default(1, getMaxWavesPerEU());
392 // Default/requested minimum/maximum flat work group sizes.
393 std::pair
<unsigned, unsigned> FlatWorkGroupSizes
= getFlatWorkGroupSizes(F
);
395 // If minimum/maximum flat work group sizes were explicitly requested using
396 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
397 // number of waves per execution unit to values implied by requested
398 // minimum/maximum flat work group sizes.
399 unsigned MinImpliedByFlatWorkGroupSize
=
400 getMaxWavesPerEU(FlatWorkGroupSizes
.second
);
401 bool RequestedFlatWorkGroupSize
= false;
403 if (F
.hasFnAttribute("amdgpu-flat-work-group-size")) {
404 Default
.first
= MinImpliedByFlatWorkGroupSize
;
405 RequestedFlatWorkGroupSize
= true;
408 // Requested minimum/maximum number of waves per execution unit.
409 std::pair
<unsigned, unsigned> Requested
= AMDGPU::getIntegerPairAttribute(
410 F
, "amdgpu-waves-per-eu", Default
, true);
412 // Make sure requested minimum is less than requested maximum.
413 if (Requested
.second
&& Requested
.first
> Requested
.second
)
416 // Make sure requested values do not violate subtarget's specifications.
417 if (Requested
.first
< getMinWavesPerEU() ||
418 Requested
.first
> getMaxWavesPerEU())
420 if (Requested
.second
> getMaxWavesPerEU())
423 // Make sure requested values are compatible with values implied by requested
424 // minimum/maximum flat work group sizes.
425 if (RequestedFlatWorkGroupSize
&&
426 Requested
.first
< MinImpliedByFlatWorkGroupSize
)
432 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction
*I
) const {
433 Function
*Kernel
= I
->getParent()->getParent();
434 unsigned MinSize
= 0;
435 unsigned MaxSize
= getFlatWorkGroupSizes(*Kernel
).second
;
436 bool IdQuery
= false;
438 // If reqd_work_group_size is present it narrows value down.
439 if (auto *CI
= dyn_cast
<CallInst
>(I
)) {
440 const Function
*F
= CI
->getCalledFunction();
442 unsigned Dim
= UINT_MAX
;
443 switch (F
->getIntrinsicID()) {
444 case Intrinsic::amdgcn_workitem_id_x
:
445 case Intrinsic::r600_read_tidig_x
:
448 case Intrinsic::r600_read_local_size_x
:
451 case Intrinsic::amdgcn_workitem_id_y
:
452 case Intrinsic::r600_read_tidig_y
:
455 case Intrinsic::r600_read_local_size_y
:
458 case Intrinsic::amdgcn_workitem_id_z
:
459 case Intrinsic::r600_read_tidig_z
:
462 case Intrinsic::r600_read_local_size_z
:
469 if (auto Node
= Kernel
->getMetadata("reqd_work_group_size"))
470 if (Node
->getNumOperands() == 3)
471 MinSize
= MaxSize
= mdconst::extract
<ConstantInt
>(
472 Node
->getOperand(Dim
))->getZExtValue();
480 // Range metadata is [Lo, Hi). For ID query we need to pass max size
481 // as Hi. For size query we need to pass Hi + 1.
487 MDBuilder
MDB(I
->getContext());
488 MDNode
*MaxWorkGroupSizeRange
= MDB
.createRange(APInt(32, MinSize
),
490 I
->setMetadata(LLVMContext::MD_range
, MaxWorkGroupSizeRange
);
494 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function
&F
,
495 unsigned &MaxAlign
) const {
496 assert(F
.getCallingConv() == CallingConv::AMDGPU_KERNEL
||
497 F
.getCallingConv() == CallingConv::SPIR_KERNEL
);
499 const DataLayout
&DL
= F
.getParent()->getDataLayout();
500 uint64_t ExplicitArgBytes
= 0;
503 for (const Argument
&Arg
: F
.args()) {
504 Type
*ArgTy
= Arg
.getType();
506 unsigned Align
= DL
.getABITypeAlignment(ArgTy
);
507 uint64_t AllocSize
= DL
.getTypeAllocSize(ArgTy
);
508 ExplicitArgBytes
= alignTo(ExplicitArgBytes
, Align
) + AllocSize
;
509 MaxAlign
= std::max(MaxAlign
, Align
);
512 return ExplicitArgBytes
;
515 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function
&F
,
516 unsigned &MaxAlign
) const {
517 uint64_t ExplicitArgBytes
= getExplicitKernArgSize(F
, MaxAlign
);
519 unsigned ExplicitOffset
= getExplicitKernelArgOffset(F
);
521 uint64_t TotalSize
= ExplicitOffset
+ ExplicitArgBytes
;
522 unsigned ImplicitBytes
= getImplicitArgNumBytes(F
);
523 if (ImplicitBytes
!= 0) {
524 unsigned Alignment
= getAlignmentForImplicitArgPtr();
525 TotalSize
= alignTo(ExplicitArgBytes
, Alignment
) + ImplicitBytes
;
528 // Being able to dereference past the end is useful for emitting scalar loads.
529 return alignTo(TotalSize
, 4);
532 R600Subtarget::R600Subtarget(const Triple
&TT
, StringRef GPU
, StringRef FS
,
533 const TargetMachine
&TM
) :
534 R600GenSubtargetInfo(TT
, GPU
, FS
),
537 FrameLowering(TargetFrameLowering::StackGrowsUp
, getStackAlignment(), 0),
541 HasVertexCache(false),
546 TLInfo(TM
, initializeSubtargetDependencies(TT
, GPU
, FS
)),
547 InstrItins(getInstrItineraryForCPU(GPU
)) { }
549 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy
&Policy
,
550 unsigned NumRegionInstrs
) const {
551 // Track register pressure so the scheduler can try to decrease
552 // pressure once register usage is above the threshold defined by
553 // SIRegisterInfo::getRegPressureSetLimit()
554 Policy
.ShouldTrackPressure
= true;
556 // Enabling both top down and bottom up scheduling seems to give us less
557 // register spills than just using one of these approaches on its own.
558 Policy
.OnlyTopDown
= false;
559 Policy
.OnlyBottomUp
= false;
561 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
562 if (!enableSIScheduler())
563 Policy
.ShouldTrackLaneMasks
= true;
566 bool GCNSubtarget::hasMadF16() const {
567 return InstrInfo
.pseudoToMCOpcode(AMDGPU::V_MAD_F16
) != -1;
570 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs
) const {
571 if (getGeneration() >= AMDGPUSubtarget::GFX10
)
572 return getMaxWavesPerEU();
574 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
) {
596 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs
) const {
597 unsigned MaxWaves
= getMaxWavesPerEU();
598 unsigned Granule
= getVGPRAllocGranule();
601 unsigned RoundedRegs
= ((VGPRs
+ Granule
- 1) / Granule
) * Granule
;
602 return std::min(getTotalNumVGPRs() / RoundedRegs
, MaxWaves
);
605 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction
&MF
) const {
606 const SIMachineFunctionInfo
&MFI
= *MF
.getInfo
<SIMachineFunctionInfo
>();
607 if (getGeneration() >= AMDGPUSubtarget::GFX10
)
608 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
610 if (MFI
.hasFlatScratchInit()) {
611 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
612 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
613 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS
)
614 return 4; // FLAT_SCRATCH, VCC (in that order).
617 if (isXNACKEnabled())
618 return 4; // XNACK, VCC (in that order).
622 unsigned GCNSubtarget::computeOccupancy(const MachineFunction
&MF
,
625 unsigned NumVGPRs
) const {
627 std::min(getMaxWavesPerEU(),
628 getOccupancyWithLocalMemSize(LDSSize
, MF
.getFunction()));
630 Occupancy
= std::min(Occupancy
, getOccupancyWithNumSGPRs(NumSGPRs
));
632 Occupancy
= std::min(Occupancy
, getOccupancyWithNumVGPRs(NumVGPRs
));
636 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction
&MF
) const {
637 const Function
&F
= MF
.getFunction();
638 const SIMachineFunctionInfo
&MFI
= *MF
.getInfo
<SIMachineFunctionInfo
>();
640 // Compute maximum number of SGPRs function can use using default/requested
641 // minimum number of waves per execution unit.
642 std::pair
<unsigned, unsigned> WavesPerEU
= MFI
.getWavesPerEU();
643 unsigned MaxNumSGPRs
= getMaxNumSGPRs(WavesPerEU
.first
, false);
644 unsigned MaxAddressableNumSGPRs
= getMaxNumSGPRs(WavesPerEU
.first
, true);
646 // Check if maximum number of SGPRs was explicitly requested using
647 // "amdgpu-num-sgpr" attribute.
648 if (F
.hasFnAttribute("amdgpu-num-sgpr")) {
649 unsigned Requested
= AMDGPU::getIntegerAttribute(
650 F
, "amdgpu-num-sgpr", MaxNumSGPRs
);
652 // Make sure requested value does not violate subtarget's specifications.
653 if (Requested
&& (Requested
<= getReservedNumSGPRs(MF
)))
656 // If more SGPRs are required to support the input user/system SGPRs,
657 // increase to accommodate them.
659 // FIXME: This really ends up using the requested number of SGPRs + number
660 // of reserved special registers in total. Theoretically you could re-use
661 // the last input registers for these special registers, but this would
662 // require a lot of complexity to deal with the weird aliasing.
663 unsigned InputNumSGPRs
= MFI
.getNumPreloadedSGPRs();
664 if (Requested
&& Requested
< InputNumSGPRs
)
665 Requested
= InputNumSGPRs
;
667 // Make sure requested value is compatible with values implied by
668 // default/requested minimum/maximum number of waves per execution unit.
669 if (Requested
&& Requested
> getMaxNumSGPRs(WavesPerEU
.first
, false))
671 if (WavesPerEU
.second
&&
672 Requested
&& Requested
< getMinNumSGPRs(WavesPerEU
.second
))
676 MaxNumSGPRs
= Requested
;
679 if (hasSGPRInitBug())
680 MaxNumSGPRs
= AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG
;
682 return std::min(MaxNumSGPRs
- getReservedNumSGPRs(MF
),
683 MaxAddressableNumSGPRs
);
686 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction
&MF
) const {
687 const Function
&F
= MF
.getFunction();
688 const SIMachineFunctionInfo
&MFI
= *MF
.getInfo
<SIMachineFunctionInfo
>();
690 // Compute maximum number of VGPRs function can use using default/requested
691 // minimum number of waves per execution unit.
692 std::pair
<unsigned, unsigned> WavesPerEU
= MFI
.getWavesPerEU();
693 unsigned MaxNumVGPRs
= getMaxNumVGPRs(WavesPerEU
.first
);
695 // Check if maximum number of VGPRs was explicitly requested using
696 // "amdgpu-num-vgpr" attribute.
697 if (F
.hasFnAttribute("amdgpu-num-vgpr")) {
698 unsigned Requested
= AMDGPU::getIntegerAttribute(
699 F
, "amdgpu-num-vgpr", MaxNumVGPRs
);
701 // Make sure requested value is compatible with values implied by
702 // default/requested minimum/maximum number of waves per execution unit.
703 if (Requested
&& Requested
> getMaxNumVGPRs(WavesPerEU
.first
))
705 if (WavesPerEU
.second
&&
706 Requested
&& Requested
< getMinNumVGPRs(WavesPerEU
.second
))
710 MaxNumVGPRs
= Requested
;
717 struct MemOpClusterMutation
: ScheduleDAGMutation
{
718 const SIInstrInfo
*TII
;
720 MemOpClusterMutation(const SIInstrInfo
*tii
) : TII(tii
) {}
722 void apply(ScheduleDAGInstrs
*DAG
) override
{
723 SUnit
*SUa
= nullptr;
724 // Search for two consecutive memory operations and link them
725 // to prevent scheduler from moving them apart.
726 // In DAG pre-process SUnits are in the original order of
727 // the instructions before scheduling.
728 for (SUnit
&SU
: DAG
->SUnits
) {
729 MachineInstr
&MI2
= *SU
.getInstr();
730 if (!MI2
.mayLoad() && !MI2
.mayStore()) {
739 MachineInstr
&MI1
= *SUa
->getInstr();
740 if ((TII
->isVMEM(MI1
) && TII
->isVMEM(MI2
)) ||
741 (TII
->isFLAT(MI1
) && TII
->isFLAT(MI2
)) ||
742 (TII
->isSMRD(MI1
) && TII
->isSMRD(MI2
)) ||
743 (TII
->isDS(MI1
) && TII
->isDS(MI2
))) {
744 SU
.addPredBarrier(SUa
);
746 for (const SDep
&SI
: SU
.Preds
) {
747 if (SI
.getSUnit() != SUa
)
748 SUa
->addPred(SDep(SI
.getSUnit(), SDep::Artificial
));
751 if (&SU
!= &DAG
->ExitSU
) {
752 for (const SDep
&SI
: SUa
->Succs
) {
753 if (SI
.getSUnit() != &SU
)
754 SI
.getSUnit()->addPred(SDep(&SU
, SDep::Artificial
));
764 struct FillMFMAShadowMutation
: ScheduleDAGMutation
{
765 const SIInstrInfo
*TII
;
769 FillMFMAShadowMutation(const SIInstrInfo
*tii
) : TII(tii
) {}
771 bool isSALU(const SUnit
*SU
) const {
772 const MachineInstr
*MI
= SU
->getInstr();
773 return MI
&& TII
->isSALU(*MI
) && !MI
->isTerminator();
776 bool canAddEdge(const SUnit
*Succ
, const SUnit
*Pred
) const {
777 if (Pred
->NodeNum
< Succ
->NodeNum
)
780 SmallVector
<const SUnit
*, 64> Succs({Succ
}), Preds({Pred
});
782 for (unsigned I
= 0; I
< Succs
.size(); ++I
) {
783 for (const SDep
&SI
: Succs
[I
]->Succs
) {
784 const SUnit
*SU
= SI
.getSUnit();
785 if (SU
!= Succs
[I
] && llvm::find(Succs
, SU
) == Succs
.end())
790 SmallPtrSet
<const SUnit
*, 32> Visited
;
791 while (!Preds
.empty()) {
792 const SUnit
*SU
= Preds
.pop_back_val();
793 if (llvm::find(Succs
, SU
) != Succs
.end())
796 for (const SDep
&SI
: SU
->Preds
)
797 if (SI
.getSUnit() != SU
&& !Visited
.count(SI
.getSUnit()))
798 Preds
.push_back(SI
.getSUnit());
804 // Link as many SALU instructions in a chain as possible. Return the size
805 // of the chain. Links up to MaxChain instructions.
806 unsigned linkSALUChain(SUnit
*From
, SUnit
*To
, unsigned MaxChain
,
807 SmallPtrSetImpl
<SUnit
*> &Visited
) const {
808 SmallVector
<SUnit
*, 8> Worklist({To
});
811 while (!Worklist
.empty() && MaxChain
-- > 0) {
812 SUnit
*SU
= Worklist
.pop_back_val();
813 if (!Visited
.insert(SU
).second
)
816 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG
->dumpNode(*From
);
817 dbgs() << "to\n"; DAG
->dumpNode(*SU
); dbgs() << '\n');
819 if (SU
->addPred(SDep(From
, SDep::Artificial
), false))
822 for (SDep
&SI
: From
->Succs
) {
823 SUnit
*SUv
= SI
.getSUnit();
824 if (SUv
!= From
&& TII
->isVALU(*SUv
->getInstr()) && canAddEdge(SUv
, SU
))
825 SUv
->addPred(SDep(SU
, SDep::Artificial
), false);
828 for (SDep
&SI
: SU
->Succs
) {
829 SUnit
*Succ
= SI
.getSUnit();
830 if (Succ
!= SU
&& isSALU(Succ
) && canAddEdge(From
, Succ
))
831 Worklist
.push_back(Succ
);
838 void apply(ScheduleDAGInstrs
*DAGInstrs
) override
{
839 const GCNSubtarget
&ST
= DAGInstrs
->MF
.getSubtarget
<GCNSubtarget
>();
840 if (!ST
.hasMAIInsts() || DisablePowerSched
)
842 DAG
= static_cast<ScheduleDAGMI
*>(DAGInstrs
);
843 const TargetSchedModel
*TSchedModel
= DAGInstrs
->getSchedModel();
844 if (!TSchedModel
|| DAG
->SUnits
.empty())
847 // Scan for MFMA long latency instructions and try to add a dependency
848 // of available SALU instructions to give them a chance to fill MFMA
849 // shadow. That is desirable to fill MFMA shadow with SALU instructions
850 // rather than VALU to prevent power consumption bursts and throttle.
851 auto LastSALU
= DAG
->SUnits
.begin();
852 auto E
= DAG
->SUnits
.end();
853 SmallPtrSet
<SUnit
*, 32> Visited
;
854 for (SUnit
&SU
: DAG
->SUnits
) {
855 MachineInstr
&MAI
= *SU
.getInstr();
856 if (!TII
->isMAI(MAI
) ||
857 MAI
.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32
||
858 MAI
.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32
)
861 unsigned Lat
= TSchedModel
->computeInstrLatency(&MAI
) - 1;
863 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG
->dumpNode(SU
);
864 dbgs() << "Need " << Lat
865 << " instructions to cover latency.\n");
867 // Find up to Lat independent scalar instructions as early as
868 // possible such that they can be scheduled after this MFMA.
869 for ( ; Lat
&& LastSALU
!= E
; ++LastSALU
) {
870 if (Visited
.count(&*LastSALU
))
873 if (!isSALU(&*LastSALU
) || !canAddEdge(&*LastSALU
, &SU
))
876 Lat
-= linkSALUChain(&SU
, &*LastSALU
, Lat
, Visited
);
883 void GCNSubtarget::getPostRAMutations(
884 std::vector
<std::unique_ptr
<ScheduleDAGMutation
>> &Mutations
) const {
885 Mutations
.push_back(std::make_unique
<MemOpClusterMutation
>(&InstrInfo
));
886 Mutations
.push_back(std::make_unique
<FillMFMAShadowMutation
>(&InstrInfo
));
889 const AMDGPUSubtarget
&AMDGPUSubtarget::get(const MachineFunction
&MF
) {
890 if (MF
.getTarget().getTargetTriple().getArch() == Triple::amdgcn
)
891 return static_cast<const AMDGPUSubtarget
&>(MF
.getSubtarget
<GCNSubtarget
>());
893 return static_cast<const AMDGPUSubtarget
&>(MF
.getSubtarget
<R600Subtarget
>());
896 const AMDGPUSubtarget
&AMDGPUSubtarget::get(const TargetMachine
&TM
, const Function
&F
) {
897 if (TM
.getTargetTriple().getArch() == Triple::amdgcn
)
898 return static_cast<const AMDGPUSubtarget
&>(TM
.getSubtarget
<GCNSubtarget
>(F
));
900 return static_cast<const AMDGPUSubtarget
&>(TM
.getSubtarget
<R600Subtarget
>(F
));