//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

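// Compose the final feature string for R600: start from the always-on
// defaults, then append the user-provided string so explicit user settings
// take precedence, and parse the result.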
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
  //
  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

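// Before GFX10 a VALU instruction may read at most one constant bus operand
// (an SGPR or a literal). GFX10 raises the limit to two, except for the
// 64-bit shifts listed below, which still accept only one.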
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

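// Largest LDS allocation that still sustains NWaves waves per EU: local
// memory on a CU is shared by all work groups resident there, so the budget
// scales with the ratio of resident waves to resident work groups.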
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

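// The inverse question: given an LDS footprint in bytes, how many waves per
// EU can still be resident? The result is clamped to [1, MaxWavesPerEU].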
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

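// Default flat work group size range by calling convention: compute kernels
// default to a multi-wave range, graphics shaders to at most one wave, and
// everything else is capped at 16 waves.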
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

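// Attach !range metadata to a work-item id or local-size intrinsic call,
// derived from the kernel's flat work group size range and, if present, its
// reqd_work_group_size metadata. Returns false if no useful bound is known.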
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the maximum
  // size as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

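// Sum of the ABI-aligned sizes of the kernel's explicit arguments; the
// largest alignment seen is reported through MaxAlign.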
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

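// Total kernarg segment size: the explicit arguments plus any target-implicit
// offset and implicit arguments, rounded up to a 4-byte multiple.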
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

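// MAD_F16 is available only if the pseudo instruction maps to an actual MC
// opcode on this subtarget.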
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

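// Occupancy (waves per EU) as limited by SGPR usage; the break points differ
// between the SI/CI and VI+ SGPR files. On GFX10 SGPRs no longer limit
// occupancy.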
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

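// Occupancy as limited by VGPR usage: round the count up to the allocation
// granule, then see how many such allocations fit in the VGPR file.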
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(getTotalNumVGPRs() / RoundedRegs, MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

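// Combined occupancy: the minimum of the LDS-limited value and, when register
// counts are given, the SGPR- and VGPR-limited values. A count of zero means
// that register file imposes no limit.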
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
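// Post-RA DAG mutation that chains consecutive memory operations of the same
// kind (VMEM, FLAT, SMRD or DS) with artificial edges so the scheduler keeps
// them together.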
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-processing the SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

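// Post-RA DAG mutation that fills the latency shadow of long-running MFMA
// instructions with independent SALU work, keeping the scalar unit busy
// instead of adding VALU work that would increase power draw.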
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

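  // Returns true if an artificial edge from Pred to Succ can be added without
  // creating a cycle, i.e. there is no existing path from Succ back to Pred.
  // The node-number check is a cheap early-out; otherwise walk the DAG.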
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Links up to
  // MaxChain instructions and returns the size of the chain.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to make available SALU
    // instructions depend on them, giving those SALU instructions a chance to
    // fill the MFMA shadow. Filling the shadow with SALU rather than VALU
    // instructions prevents power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}