[x86] fix assert with horizontal math + broadcast of vector (PR43402)
[llvm-core.git] / lib / Target / AMDGPU / AMDGPUSubtarget.cpp
blobcf89af68d6f80d8819c43e1bea4ad280c8754848
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
30 using namespace llvm;
32 #define DEBUG_TYPE "amdgpu-subtarget"
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
// Boolean command-line option "amdgpu-disable-power-sched" (initial value
// false). FillMFMAShadowMutation::apply checks it and returns without adding
// any scheduling edges when it is set.
43 static cl::opt<bool> DisablePowerSched(
44 "amdgpu-disable-power-sched",
45 cl::desc("Disable scheduling to minimize mAI power bursts"),
46 cl::init(false));
// Out-of-line defaulted destructor.
// NOTE(review): presumably defined here so member types that are incomplete
// in the header can be destroyed in this translation unit -- confirm.
48 GCNSubtarget::~GCNSubtarget() = default;
50 R600Subtarget &
51 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
52 StringRef GPU, StringRef FS) {
53 SmallString<256> FullFS("+promote-alloca,");
54 FullFS += FS;
55 ParseSubtargetFeatures(GPU, FullFS);
57 // FIXME: I don't think think Evergreen has any useful support for
58 // denormals, but should be checked. Should we issue a warning somewhere
59 // if someone tries to enable these?
60 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
61 FP32Denormals = false;
64 HasMulU24 = getGeneration() >= EVERGREEN;
65 HasMulI24 = hasCaymanISA();
67 return *this;
70 GCNSubtarget &
71 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
72 StringRef GPU, StringRef FS) {
73 // Determine default and user-specified characteristics
74 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
75 // enabled, but some instructions do not respect them and they run at the
76 // double precision rate, so don't enable by default.
78 // We want to be able to turn these off, but making this a subtarget feature
79 // for SI has the unhelpful behavior that it unsets everything else if you
80 // disable it.
82 // Similarly we want enable-prt-strict-null to be on by default and not to
83 // unset everything else if it is disabled
85 // Assuming ECC is enabled is the conservative default.
86 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
88 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
89 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
91 // FIXME: I don't think think Evergreen has any useful support for
92 // denormals, but should be checked. Should we issue a warning somewhere
93 // if someone tries to enable these?
94 if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
95 FullFS += "+fp64-fp16-denormals,";
96 } else {
97 FullFS += "-fp32-denormals,";
100 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
102 // Disable mutually exclusive bits.
103 if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
104 if (FS.find_lower("wavefrontsize16") == StringRef::npos)
105 FullFS += "-wavefrontsize16,";
106 if (FS.find_lower("wavefrontsize32") == StringRef::npos)
107 FullFS += "-wavefrontsize32,";
108 if (FS.find_lower("wavefrontsize64") == StringRef::npos)
109 FullFS += "-wavefrontsize64,";
112 FullFS += FS;
114 ParseSubtargetFeatures(GPU, FullFS);
116 // We don't support FP64 for EG/NI atm.
117 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
119 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
120 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
121 // variants of MUBUF instructions.
122 if (!hasAddr64() && !FS.contains("flat-for-global")) {
123 FlatForGlobal = true;
126 // Set defaults if needed.
127 if (MaxPrivateElementSize == 0)
128 MaxPrivateElementSize = 4;
130 if (LDSBankCount == 0)
131 LDSBankCount = 32;
133 if (TT.getArch() == Triple::amdgcn) {
134 if (LocalMemorySize == 0)
135 LocalMemorySize = 32768;
137 // Do something sensible for unspecified target.
138 if (!HasMovrel && !HasVGPRIndexMode)
139 HasMovrel = true;
142 // Don't crash on invalid devices.
143 if (WavefrontSize == 0)
144 WavefrontSize = 64;
146 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
148 if (DoesNotSupportXNACK && EnableXNACK) {
149 ToggleFeature(AMDGPU::FeatureXNACK);
150 EnableXNACK = false;
153 // ECC is on by default, but turn it off if the hardware doesn't support it
154 // anyway. This matters for the gfx9 targets with d16 loads, but don't support
155 // ECC.
156 if (DoesNotSupportSRAMECC && EnableSRAMECC) {
157 ToggleFeature(AMDGPU::FeatureSRAMECC);
158 EnableSRAMECC = false;
161 return *this;
164 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
165 TargetTriple(TT),
166 Has16BitInsts(false),
167 HasMadMixInsts(false),
168 FP32Denormals(false),
169 FPExceptions(false),
170 HasSDWA(false),
171 HasVOP3PInsts(false),
172 HasMulI24(true),
173 HasMulU24(true),
174 HasInv2PiInlineImm(false),
175 HasFminFmaxLegacy(true),
176 EnablePromoteAlloca(false),
177 HasTrigReducedRange(false),
178 MaxWavesPerEU(10),
179 LocalMemorySize(0),
180 WavefrontSize(0)
183 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
184 const GCNTargetMachine &TM) :
185 AMDGPUGenSubtargetInfo(TT, GPU, FS),
186 AMDGPUSubtarget(TT),
187 TargetTriple(TT),
188 Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
189 InstrItins(getInstrItineraryForCPU(GPU)),
190 LDSBankCount(0),
191 MaxPrivateElementSize(0),
193 FastFMAF32(false),
194 HalfRate64Ops(false),
196 FP64FP16Denormals(false),
197 FlatForGlobal(false),
198 AutoWaitcntBeforeBarrier(false),
199 CodeObjectV3(false),
200 UnalignedScratchAccess(false),
201 UnalignedBufferAccess(false),
203 HasApertureRegs(false),
204 EnableXNACK(false),
205 DoesNotSupportXNACK(false),
206 EnableCuMode(false),
207 TrapHandler(false),
209 EnableLoadStoreOpt(false),
210 EnableUnsafeDSOffsetFolding(false),
211 EnableSIScheduler(false),
212 EnableDS128(false),
213 EnablePRTStrictNull(false),
214 DumpCode(false),
216 FP64(false),
217 GCN3Encoding(false),
218 CIInsts(false),
219 GFX8Insts(false),
220 GFX9Insts(false),
221 GFX10Insts(false),
222 GFX7GFX8GFX9Insts(false),
223 SGPRInitBug(false),
224 HasSMemRealTime(false),
225 HasIntClamp(false),
226 HasFmaMixInsts(false),
227 HasMovrel(false),
228 HasVGPRIndexMode(false),
229 HasScalarStores(false),
230 HasScalarAtomics(false),
231 HasSDWAOmod(false),
232 HasSDWAScalar(false),
233 HasSDWASdst(false),
234 HasSDWAMac(false),
235 HasSDWAOutModsVOPC(false),
236 HasDPP(false),
237 HasDPP8(false),
238 HasR128A16(false),
239 HasNSAEncoding(false),
240 HasDLInsts(false),
241 HasDot1Insts(false),
242 HasDot2Insts(false),
243 HasDot3Insts(false),
244 HasDot4Insts(false),
245 HasDot5Insts(false),
246 HasDot6Insts(false),
247 HasMAIInsts(false),
248 HasPkFmacF16Inst(false),
249 HasAtomicFaddInsts(false),
250 EnableSRAMECC(false),
251 DoesNotSupportSRAMECC(false),
252 HasNoSdstCMPX(false),
253 HasVscnt(false),
254 HasRegisterBanking(false),
255 HasVOP3Literal(false),
256 HasNoDataDepHazard(false),
257 FlatAddressSpace(false),
258 FlatInstOffsets(false),
259 FlatGlobalInsts(false),
260 FlatScratchInsts(false),
261 ScalarFlatScratchInsts(false),
262 AddNoCarryInsts(false),
263 HasUnpackedD16VMem(false),
264 LDSMisalignedBug(false),
265 HasMFMAInlineLiteralBug(false),
267 ScalarizeGlobal(false),
269 HasVcmpxPermlaneHazard(false),
270 HasVMEMtoScalarWriteHazard(false),
271 HasSMEMtoVectorWriteHazard(false),
272 HasInstFwdPrefetchBug(false),
273 HasVcmpxExecWARHazard(false),
274 HasLdsBranchVmemWARHazard(false),
275 HasNSAtoVMEMBug(false),
276 HasOffset3fBug(false),
277 HasFlatSegmentOffsetBug(false),
279 FeatureDisable(false),
280 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
281 TLInfo(TM, *this),
282 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
283 MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
284 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
285 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
286 RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
287 InstSelector.reset(new AMDGPUInstructionSelector(
288 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
291 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
292 if (getGeneration() < GFX10)
293 return 1;
295 switch (Opcode) {
296 case AMDGPU::V_LSHLREV_B64:
297 case AMDGPU::V_LSHLREV_B64_gfx10:
298 case AMDGPU::V_LSHL_B64:
299 case AMDGPU::V_LSHRREV_B64:
300 case AMDGPU::V_LSHRREV_B64_gfx10:
301 case AMDGPU::V_LSHR_B64:
302 case AMDGPU::V_ASHRREV_I64:
303 case AMDGPU::V_ASHRREV_I64_gfx10:
304 case AMDGPU::V_ASHR_I64:
305 return 1;
308 return 2;
311 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
312 const Function &F) const {
313 if (NWaves == 1)
314 return getLocalMemorySize();
315 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
316 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
317 if (!WorkGroupsPerCu)
318 return 0;
319 unsigned MaxWaves = getMaxWavesPerEU();
320 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
323 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
324 const Function &F) const {
325 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
326 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
327 if (!WorkGroupsPerCu)
328 return 0;
329 unsigned MaxWaves = getMaxWavesPerEU();
330 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
331 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
332 NumWaves = std::min(NumWaves, MaxWaves);
333 NumWaves = std::max(NumWaves, 1u);
334 return NumWaves;
337 unsigned
338 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
339 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
340 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
343 std::pair<unsigned, unsigned>
344 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
345 switch (CC) {
346 case CallingConv::AMDGPU_CS:
347 case CallingConv::AMDGPU_KERNEL:
348 case CallingConv::SPIR_KERNEL:
349 return std::make_pair(getWavefrontSize() * 2,
350 std::max(getWavefrontSize() * 4, 256u));
351 case CallingConv::AMDGPU_VS:
352 case CallingConv::AMDGPU_LS:
353 case CallingConv::AMDGPU_HS:
354 case CallingConv::AMDGPU_ES:
355 case CallingConv::AMDGPU_GS:
356 case CallingConv::AMDGPU_PS:
357 return std::make_pair(1, getWavefrontSize());
358 default:
359 return std::make_pair(1, 16 * getWavefrontSize());
363 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
364 const Function &F) const {
365 // FIXME: 1024 if function.
366 // Default minimum/maximum flat work group sizes.
367 std::pair<unsigned, unsigned> Default =
368 getDefaultFlatWorkGroupSize(F.getCallingConv());
370 // Requested minimum/maximum flat work group sizes.
371 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
372 F, "amdgpu-flat-work-group-size", Default);
374 // Make sure requested minimum is less than requested maximum.
375 if (Requested.first > Requested.second)
376 return Default;
378 // Make sure requested values do not violate subtarget's specifications.
379 if (Requested.first < getMinFlatWorkGroupSize())
380 return Default;
381 if (Requested.second > getMaxFlatWorkGroupSize())
382 return Default;
384 return Requested;
387 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
388 const Function &F) const {
389 // Default minimum/maximum number of waves per execution unit.
390 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
392 // Default/requested minimum/maximum flat work group sizes.
393 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
395 // If minimum/maximum flat work group sizes were explicitly requested using
396 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
397 // number of waves per execution unit to values implied by requested
398 // minimum/maximum flat work group sizes.
399 unsigned MinImpliedByFlatWorkGroupSize =
400 getMaxWavesPerEU(FlatWorkGroupSizes.second);
401 bool RequestedFlatWorkGroupSize = false;
403 if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
404 Default.first = MinImpliedByFlatWorkGroupSize;
405 RequestedFlatWorkGroupSize = true;
408 // Requested minimum/maximum number of waves per execution unit.
409 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
410 F, "amdgpu-waves-per-eu", Default, true);
412 // Make sure requested minimum is less than requested maximum.
413 if (Requested.second && Requested.first > Requested.second)
414 return Default;
416 // Make sure requested values do not violate subtarget's specifications.
417 if (Requested.first < getMinWavesPerEU() ||
418 Requested.first > getMaxWavesPerEU())
419 return Default;
420 if (Requested.second > getMaxWavesPerEU())
421 return Default;
423 // Make sure requested values are compatible with values implied by requested
424 // minimum/maximum flat work group sizes.
425 if (RequestedFlatWorkGroupSize &&
426 Requested.first < MinImpliedByFlatWorkGroupSize)
427 return Default;
429 return Requested;
432 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
433 Function *Kernel = I->getParent()->getParent();
434 unsigned MinSize = 0;
435 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
436 bool IdQuery = false;
438 // If reqd_work_group_size is present it narrows value down.
439 if (auto *CI = dyn_cast<CallInst>(I)) {
440 const Function *F = CI->getCalledFunction();
441 if (F) {
442 unsigned Dim = UINT_MAX;
443 switch (F->getIntrinsicID()) {
444 case Intrinsic::amdgcn_workitem_id_x:
445 case Intrinsic::r600_read_tidig_x:
446 IdQuery = true;
447 LLVM_FALLTHROUGH;
448 case Intrinsic::r600_read_local_size_x:
449 Dim = 0;
450 break;
451 case Intrinsic::amdgcn_workitem_id_y:
452 case Intrinsic::r600_read_tidig_y:
453 IdQuery = true;
454 LLVM_FALLTHROUGH;
455 case Intrinsic::r600_read_local_size_y:
456 Dim = 1;
457 break;
458 case Intrinsic::amdgcn_workitem_id_z:
459 case Intrinsic::r600_read_tidig_z:
460 IdQuery = true;
461 LLVM_FALLTHROUGH;
462 case Intrinsic::r600_read_local_size_z:
463 Dim = 2;
464 break;
465 default:
466 break;
468 if (Dim <= 3) {
469 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
470 if (Node->getNumOperands() == 3)
471 MinSize = MaxSize = mdconst::extract<ConstantInt>(
472 Node->getOperand(Dim))->getZExtValue();
477 if (!MaxSize)
478 return false;
480 // Range metadata is [Lo, Hi). For ID query we need to pass max size
481 // as Hi. For size query we need to pass Hi + 1.
482 if (IdQuery)
483 MinSize = 0;
484 else
485 ++MaxSize;
487 MDBuilder MDB(I->getContext());
488 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
489 APInt(32, MaxSize));
490 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
491 return true;
494 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
495 unsigned &MaxAlign) const {
496 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
497 F.getCallingConv() == CallingConv::SPIR_KERNEL);
499 const DataLayout &DL = F.getParent()->getDataLayout();
500 uint64_t ExplicitArgBytes = 0;
501 MaxAlign = 1;
503 for (const Argument &Arg : F.args()) {
504 Type *ArgTy = Arg.getType();
506 unsigned Align = DL.getABITypeAlignment(ArgTy);
507 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
508 ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
509 MaxAlign = std::max(MaxAlign, Align);
512 return ExplicitArgBytes;
515 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
516 unsigned &MaxAlign) const {
517 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
519 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
521 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
522 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
523 if (ImplicitBytes != 0) {
524 unsigned Alignment = getAlignmentForImplicitArgPtr();
525 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
528 // Being able to dereference past the end is useful for emitting scalar loads.
529 return alignTo(TotalSize, 4);
532 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
533 const TargetMachine &TM) :
534 R600GenSubtargetInfo(TT, GPU, FS),
535 AMDGPUSubtarget(TT),
536 InstrInfo(*this),
537 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
538 FMA(false),
539 CaymanISA(false),
540 CFALUBug(false),
541 HasVertexCache(false),
542 R600ALUInst(false),
543 FP64(false),
544 TexVTXClauseSize(0),
545 Gen(R600),
546 TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
547 InstrItins(getInstrItineraryForCPU(GPU)) { }
549 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
550 unsigned NumRegionInstrs) const {
551 // Track register pressure so the scheduler can try to decrease
552 // pressure once register usage is above the threshold defined by
553 // SIRegisterInfo::getRegPressureSetLimit()
554 Policy.ShouldTrackPressure = true;
556 // Enabling both top down and bottom up scheduling seems to give us less
557 // register spills than just using one of these approaches on its own.
558 Policy.OnlyTopDown = false;
559 Policy.OnlyBottomUp = false;
561 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
562 if (!enableSIScheduler())
563 Policy.ShouldTrackLaneMasks = true;
566 bool GCNSubtarget::hasMadF16() const {
567 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
570 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
571 if (getGeneration() >= AMDGPUSubtarget::GFX10)
572 return getMaxWavesPerEU();
574 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
575 if (SGPRs <= 80)
576 return 10;
577 if (SGPRs <= 88)
578 return 9;
579 if (SGPRs <= 100)
580 return 8;
581 return 7;
583 if (SGPRs <= 48)
584 return 10;
585 if (SGPRs <= 56)
586 return 9;
587 if (SGPRs <= 64)
588 return 8;
589 if (SGPRs <= 72)
590 return 7;
591 if (SGPRs <= 80)
592 return 6;
593 return 5;
596 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
597 unsigned MaxWaves = getMaxWavesPerEU();
598 unsigned Granule = getVGPRAllocGranule();
599 if (VGPRs < Granule)
600 return MaxWaves;
601 unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
602 return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
605 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
606 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
607 if (getGeneration() >= AMDGPUSubtarget::GFX10)
608 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
610 if (MFI.hasFlatScratchInit()) {
611 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
612 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
613 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
614 return 4; // FLAT_SCRATCH, VCC (in that order).
617 if (isXNACKEnabled())
618 return 4; // XNACK, VCC (in that order).
619 return 2; // VCC.
622 unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
623 unsigned LDSSize,
624 unsigned NumSGPRs,
625 unsigned NumVGPRs) const {
626 unsigned Occupancy =
627 std::min(getMaxWavesPerEU(),
628 getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
629 if (NumSGPRs)
630 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
631 if (NumVGPRs)
632 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
633 return Occupancy;
636 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
637 const Function &F = MF.getFunction();
638 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
640 // Compute maximum number of SGPRs function can use using default/requested
641 // minimum number of waves per execution unit.
642 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
643 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
644 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
646 // Check if maximum number of SGPRs was explicitly requested using
647 // "amdgpu-num-sgpr" attribute.
648 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
649 unsigned Requested = AMDGPU::getIntegerAttribute(
650 F, "amdgpu-num-sgpr", MaxNumSGPRs);
652 // Make sure requested value does not violate subtarget's specifications.
653 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
654 Requested = 0;
656 // If more SGPRs are required to support the input user/system SGPRs,
657 // increase to accommodate them.
659 // FIXME: This really ends up using the requested number of SGPRs + number
660 // of reserved special registers in total. Theoretically you could re-use
661 // the last input registers for these special registers, but this would
662 // require a lot of complexity to deal with the weird aliasing.
663 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
664 if (Requested && Requested < InputNumSGPRs)
665 Requested = InputNumSGPRs;
667 // Make sure requested value is compatible with values implied by
668 // default/requested minimum/maximum number of waves per execution unit.
669 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
670 Requested = 0;
671 if (WavesPerEU.second &&
672 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
673 Requested = 0;
675 if (Requested)
676 MaxNumSGPRs = Requested;
679 if (hasSGPRInitBug())
680 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
682 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
683 MaxAddressableNumSGPRs);
686 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
687 const Function &F = MF.getFunction();
688 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
690 // Compute maximum number of VGPRs function can use using default/requested
691 // minimum number of waves per execution unit.
692 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
693 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
695 // Check if maximum number of VGPRs was explicitly requested using
696 // "amdgpu-num-vgpr" attribute.
697 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
698 unsigned Requested = AMDGPU::getIntegerAttribute(
699 F, "amdgpu-num-vgpr", MaxNumVGPRs);
701 // Make sure requested value is compatible with values implied by
702 // default/requested minimum/maximum number of waves per execution unit.
703 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
704 Requested = 0;
705 if (WavesPerEU.second &&
706 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
707 Requested = 0;
709 if (Requested)
710 MaxNumVGPRs = Requested;
713 return MaxNumVGPRs;
716 namespace {
717 struct MemOpClusterMutation : ScheduleDAGMutation {
718 const SIInstrInfo *TII;
720 MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
722 void apply(ScheduleDAGInstrs *DAG) override {
723 SUnit *SUa = nullptr;
724 // Search for two consequent memory operations and link them
725 // to prevent scheduler from moving them apart.
726 // In DAG pre-process SUnits are in the original order of
727 // the instructions before scheduling.
728 for (SUnit &SU : DAG->SUnits) {
729 MachineInstr &MI2 = *SU.getInstr();
730 if (!MI2.mayLoad() && !MI2.mayStore()) {
731 SUa = nullptr;
732 continue;
734 if (!SUa) {
735 SUa = &SU;
736 continue;
739 MachineInstr &MI1 = *SUa->getInstr();
740 if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
741 (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
742 (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
743 (TII->isDS(MI1) && TII->isDS(MI2))) {
744 SU.addPredBarrier(SUa);
746 for (const SDep &SI : SU.Preds) {
747 if (SI.getSUnit() != SUa)
748 SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
751 if (&SU != &DAG->ExitSU) {
752 for (const SDep &SI : SUa->Succs) {
753 if (SI.getSUnit() != &SU)
754 SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
759 SUa = &SU;
764 struct FillMFMAShadowMutation : ScheduleDAGMutation {
765 const SIInstrInfo *TII;
767 ScheduleDAGMI *DAG;
769 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
771 bool isSALU(const SUnit *SU) const {
772 const MachineInstr *MI = SU->getInstr();
773 return MI && TII->isSALU(*MI) && !MI->isTerminator();
776 bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
777 if (Pred->NodeNum < Succ->NodeNum)
778 return true;
780 SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
782 for (unsigned I = 0; I < Succs.size(); ++I) {
783 for (const SDep &SI : Succs[I]->Succs) {
784 const SUnit *SU = SI.getSUnit();
785 if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
786 Succs.push_back(SU);
790 SmallPtrSet<const SUnit*, 32> Visited;
791 while (!Preds.empty()) {
792 const SUnit *SU = Preds.pop_back_val();
793 if (llvm::find(Succs, SU) != Succs.end())
794 return false;
795 Visited.insert(SU);
796 for (const SDep &SI : SU->Preds)
797 if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
798 Preds.push_back(SI.getSUnit());
801 return true;
804 // Link as much SALU intructions in chain as possible. Return the size
805 // of the chain. Links up to MaxChain instructions.
806 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
807 SmallPtrSetImpl<SUnit *> &Visited) const {
808 SmallVector<SUnit *, 8> Worklist({To});
809 unsigned Linked = 0;
811 while (!Worklist.empty() && MaxChain-- > 0) {
812 SUnit *SU = Worklist.pop_back_val();
813 if (!Visited.insert(SU).second)
814 continue;
816 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
817 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
819 if (SU->addPred(SDep(From, SDep::Artificial), false))
820 ++Linked;
822 for (SDep &SI : From->Succs) {
823 SUnit *SUv = SI.getSUnit();
824 if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
825 SUv->addPred(SDep(SU, SDep::Artificial), false);
828 for (SDep &SI : SU->Succs) {
829 SUnit *Succ = SI.getSUnit();
830 if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
831 Worklist.push_back(Succ);
835 return Linked;
838 void apply(ScheduleDAGInstrs *DAGInstrs) override {
839 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
840 if (!ST.hasMAIInsts() || DisablePowerSched)
841 return;
842 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
843 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
844 if (!TSchedModel || DAG->SUnits.empty())
845 return;
847 // Scan for MFMA long latency instructions and try to add a dependency
848 // of available SALU instructions to give them a chance to fill MFMA
849 // shadow. That is desirable to fill MFMA shadow with SALU instructions
850 // rather than VALU to prevent power consumption bursts and throttle.
851 auto LastSALU = DAG->SUnits.begin();
852 auto E = DAG->SUnits.end();
853 SmallPtrSet<SUnit*, 32> Visited;
854 for (SUnit &SU : DAG->SUnits) {
855 MachineInstr &MAI = *SU.getInstr();
856 if (!TII->isMAI(MAI) ||
857 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
858 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
859 continue;
861 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
863 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
864 dbgs() << "Need " << Lat
865 << " instructions to cover latency.\n");
867 // Find up to Lat independent scalar instructions as early as
868 // possible such that they can be scheduled after this MFMA.
869 for ( ; Lat && LastSALU != E; ++LastSALU) {
870 if (Visited.count(&*LastSALU))
871 continue;
873 if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
874 continue;
876 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
881 } // namespace
883 void GCNSubtarget::getPostRAMutations(
884 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
885 Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
886 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
889 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
890 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
891 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
892 else
893 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
896 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
897 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
898 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
899 else
900 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));