Revert r354244 "[DAGCombiner] Eliminate dead stores to stack."
[llvm-complete.git] / lib / Target / AMDGPU / AMDGPUSubtarget.cpp
blobcdfbfb5154e81000563bda3ca6b1981904372511
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
30 using namespace llvm;
32 #define DEBUG_TYPE "amdgpu-subtarget"
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
// Out-of-line, compiler-generated destructor for GCNSubtarget.
// NOTE(review): presumably defined in this file so that members held through
// smart pointers are destroyed where their pointee types are complete --
// confirm against the class declaration.
43 GCNSubtarget::~GCNSubtarget() = default;
45 R600Subtarget &
46 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
47 StringRef GPU, StringRef FS) {
48 SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
49 FullFS += FS;
50 ParseSubtargetFeatures(GPU, FullFS);
52 // FIXME: I don't think think Evergreen has any useful support for
53 // denormals, but should be checked. Should we issue a warning somewhere
54 // if someone tries to enable these?
55 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
56 FP32Denormals = false;
59 HasMulU24 = getGeneration() >= EVERGREEN;
60 HasMulI24 = hasCaymanISA();
62 return *this;
65 GCNSubtarget &
66 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
67 StringRef GPU, StringRef FS) {
68 // Determine default and user-specified characteristics
69 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
70 // enabled, but some instructions do not respect them and they run at the
71 // double precision rate, so don't enable by default.
73 // We want to be able to turn these off, but making this a subtarget feature
74 // for SI has the unhelpful behavior that it unsets everything else if you
75 // disable it.
77 // Similarly we want enable-prt-strict-null to be on by default and not to
78 // unset everything else if it is disabled
80 SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
82 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
83 FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";
85 // FIXME: I don't think think Evergreen has any useful support for
86 // denormals, but should be checked. Should we issue a warning somewhere
87 // if someone tries to enable these?
88 if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
89 FullFS += "+fp64-fp16-denormals,";
90 } else {
91 FullFS += "-fp32-denormals,";
94 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
96 FullFS += FS;
98 ParseSubtargetFeatures(GPU, FullFS);
100 // We don't support FP64 for EG/NI atm.
101 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
103 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
104 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
105 // variants of MUBUF instructions.
106 if (!hasAddr64() && !FS.contains("flat-for-global")) {
107 FlatForGlobal = true;
110 // Set defaults if needed.
111 if (MaxPrivateElementSize == 0)
112 MaxPrivateElementSize = 4;
114 if (LDSBankCount == 0)
115 LDSBankCount = 32;
117 if (TT.getArch() == Triple::amdgcn) {
118 if (LocalMemorySize == 0)
119 LocalMemorySize = 32768;
121 // Do something sensible for unspecified target.
122 if (!HasMovrel && !HasVGPRIndexMode)
123 HasMovrel = true;
126 // Don't crash on invalid devices.
127 if (WavefrontSize == 0)
128 WavefrontSize = 64;
130 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
132 return *this;
135 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
136 TargetTriple(TT),
137 Has16BitInsts(false),
138 HasMadMixInsts(false),
139 FP32Denormals(false),
140 FPExceptions(false),
141 HasSDWA(false),
142 HasVOP3PInsts(false),
143 HasMulI24(true),
144 HasMulU24(true),
145 HasInv2PiInlineImm(false),
146 HasFminFmaxLegacy(true),
147 EnablePromoteAlloca(false),
148 HasTrigReducedRange(false),
149 LocalMemorySize(0),
150 WavefrontSize(0)
153 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
154 const GCNTargetMachine &TM) :
155 AMDGPUGenSubtargetInfo(TT, GPU, FS),
156 AMDGPUSubtarget(TT),
157 TargetTriple(TT),
158 Gen(SOUTHERN_ISLANDS),
159 InstrItins(getInstrItineraryForCPU(GPU)),
160 LDSBankCount(0),
161 MaxPrivateElementSize(0),
163 FastFMAF32(false),
164 HalfRate64Ops(false),
166 FP64FP16Denormals(false),
167 DX10Clamp(false),
168 FlatForGlobal(false),
169 AutoWaitcntBeforeBarrier(false),
170 CodeObjectV3(false),
171 UnalignedScratchAccess(false),
172 UnalignedBufferAccess(false),
174 HasApertureRegs(false),
175 EnableXNACK(false),
176 TrapHandler(false),
177 DebuggerInsertNops(false),
178 DebuggerEmitPrologue(false),
180 EnableHugePrivateBuffer(false),
181 EnableLoadStoreOpt(false),
182 EnableUnsafeDSOffsetFolding(false),
183 EnableSIScheduler(false),
184 EnableDS128(false),
185 EnablePRTStrictNull(false),
186 DumpCode(false),
188 FP64(false),
189 GCN3Encoding(false),
190 CIInsts(false),
191 VIInsts(false),
192 GFX9Insts(false),
193 SGPRInitBug(false),
194 HasSMemRealTime(false),
195 HasIntClamp(false),
196 HasFmaMixInsts(false),
197 HasMovrel(false),
198 HasVGPRIndexMode(false),
199 HasScalarStores(false),
200 HasScalarAtomics(false),
201 HasSDWAOmod(false),
202 HasSDWAScalar(false),
203 HasSDWASdst(false),
204 HasSDWAMac(false),
205 HasSDWAOutModsVOPC(false),
206 HasDPP(false),
207 HasR128A16(false),
208 HasDLInsts(false),
209 HasDot1Insts(false),
210 HasDot2Insts(false),
211 EnableSRAMECC(false),
212 FlatAddressSpace(false),
213 FlatInstOffsets(false),
214 FlatGlobalInsts(false),
215 FlatScratchInsts(false),
216 AddNoCarryInsts(false),
217 HasUnpackedD16VMem(false),
219 ScalarizeGlobal(false),
221 FeatureDisable(false),
222 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
223 TLInfo(TM, *this),
224 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
225 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
226 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
227 RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
228 InstSelector.reset(new AMDGPUInstructionSelector(
229 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
232 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
233 const Function &F) const {
234 if (NWaves == 1)
235 return getLocalMemorySize();
236 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
237 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
238 unsigned MaxWaves = getMaxWavesPerEU();
239 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
242 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
243 const Function &F) const {
244 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
245 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
246 unsigned MaxWaves = getMaxWavesPerEU();
247 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
248 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
249 NumWaves = std::min(NumWaves, MaxWaves);
250 NumWaves = std::max(NumWaves, 1u);
251 return NumWaves;
254 unsigned
255 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
256 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
257 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
260 std::pair<unsigned, unsigned>
261 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
262 switch (CC) {
263 case CallingConv::AMDGPU_CS:
264 case CallingConv::AMDGPU_KERNEL:
265 case CallingConv::SPIR_KERNEL:
266 return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
267 case CallingConv::AMDGPU_VS:
268 case CallingConv::AMDGPU_LS:
269 case CallingConv::AMDGPU_HS:
270 case CallingConv::AMDGPU_ES:
271 case CallingConv::AMDGPU_GS:
272 case CallingConv::AMDGPU_PS:
273 return std::make_pair(1, getWavefrontSize());
274 default:
275 return std::make_pair(1, 16 * getWavefrontSize());
279 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
280 const Function &F) const {
281 // FIXME: 1024 if function.
282 // Default minimum/maximum flat work group sizes.
283 std::pair<unsigned, unsigned> Default =
284 getDefaultFlatWorkGroupSize(F.getCallingConv());
286 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
287 // starts using "amdgpu-flat-work-group-size" attribute.
288 Default.second = AMDGPU::getIntegerAttribute(
289 F, "amdgpu-max-work-group-size", Default.second);
290 Default.first = std::min(Default.first, Default.second);
292 // Requested minimum/maximum flat work group sizes.
293 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
294 F, "amdgpu-flat-work-group-size", Default);
296 // Make sure requested minimum is less than requested maximum.
297 if (Requested.first > Requested.second)
298 return Default;
300 // Make sure requested values do not violate subtarget's specifications.
301 if (Requested.first < getMinFlatWorkGroupSize())
302 return Default;
303 if (Requested.second > getMaxFlatWorkGroupSize())
304 return Default;
306 return Requested;
309 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
310 const Function &F) const {
311 // Default minimum/maximum number of waves per execution unit.
312 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
314 // Default/requested minimum/maximum flat work group sizes.
315 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
317 // If minimum/maximum flat work group sizes were explicitly requested using
318 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
319 // number of waves per execution unit to values implied by requested
320 // minimum/maximum flat work group sizes.
321 unsigned MinImpliedByFlatWorkGroupSize =
322 getMaxWavesPerEU(FlatWorkGroupSizes.second);
323 bool RequestedFlatWorkGroupSize = false;
325 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
326 // starts using "amdgpu-flat-work-group-size" attribute.
327 if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
328 F.hasFnAttribute("amdgpu-flat-work-group-size")) {
329 Default.first = MinImpliedByFlatWorkGroupSize;
330 RequestedFlatWorkGroupSize = true;
333 // Requested minimum/maximum number of waves per execution unit.
334 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
335 F, "amdgpu-waves-per-eu", Default, true);
337 // Make sure requested minimum is less than requested maximum.
338 if (Requested.second && Requested.first > Requested.second)
339 return Default;
341 // Make sure requested values do not violate subtarget's specifications.
342 if (Requested.first < getMinWavesPerEU() ||
343 Requested.first > getMaxWavesPerEU())
344 return Default;
345 if (Requested.second > getMaxWavesPerEU())
346 return Default;
348 // Make sure requested values are compatible with values implied by requested
349 // minimum/maximum flat work group sizes.
350 if (RequestedFlatWorkGroupSize &&
351 Requested.first < MinImpliedByFlatWorkGroupSize)
352 return Default;
354 return Requested;
357 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
358 Function *Kernel = I->getParent()->getParent();
359 unsigned MinSize = 0;
360 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
361 bool IdQuery = false;
363 // If reqd_work_group_size is present it narrows value down.
364 if (auto *CI = dyn_cast<CallInst>(I)) {
365 const Function *F = CI->getCalledFunction();
366 if (F) {
367 unsigned Dim = UINT_MAX;
368 switch (F->getIntrinsicID()) {
369 case Intrinsic::amdgcn_workitem_id_x:
370 case Intrinsic::r600_read_tidig_x:
371 IdQuery = true;
372 LLVM_FALLTHROUGH;
373 case Intrinsic::r600_read_local_size_x:
374 Dim = 0;
375 break;
376 case Intrinsic::amdgcn_workitem_id_y:
377 case Intrinsic::r600_read_tidig_y:
378 IdQuery = true;
379 LLVM_FALLTHROUGH;
380 case Intrinsic::r600_read_local_size_y:
381 Dim = 1;
382 break;
383 case Intrinsic::amdgcn_workitem_id_z:
384 case Intrinsic::r600_read_tidig_z:
385 IdQuery = true;
386 LLVM_FALLTHROUGH;
387 case Intrinsic::r600_read_local_size_z:
388 Dim = 2;
389 break;
390 default:
391 break;
393 if (Dim <= 3) {
394 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
395 if (Node->getNumOperands() == 3)
396 MinSize = MaxSize = mdconst::extract<ConstantInt>(
397 Node->getOperand(Dim))->getZExtValue();
402 if (!MaxSize)
403 return false;
405 // Range metadata is [Lo, Hi). For ID query we need to pass max size
406 // as Hi. For size query we need to pass Hi + 1.
407 if (IdQuery)
408 MinSize = 0;
409 else
410 ++MaxSize;
412 MDBuilder MDB(I->getContext());
413 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
414 APInt(32, MaxSize));
415 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
416 return true;
419 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
420 unsigned &MaxAlign) const {
421 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
422 F.getCallingConv() == CallingConv::SPIR_KERNEL);
424 const DataLayout &DL = F.getParent()->getDataLayout();
425 uint64_t ExplicitArgBytes = 0;
426 MaxAlign = 1;
428 for (const Argument &Arg : F.args()) {
429 Type *ArgTy = Arg.getType();
431 unsigned Align = DL.getABITypeAlignment(ArgTy);
432 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
433 ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
434 MaxAlign = std::max(MaxAlign, Align);
437 return ExplicitArgBytes;
440 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
441 unsigned &MaxAlign) const {
442 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
444 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
446 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
447 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
448 if (ImplicitBytes != 0) {
449 unsigned Alignment = getAlignmentForImplicitArgPtr();
450 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
453 // Being able to dereference past the end is useful for emitting scalar loads.
454 return alignTo(TotalSize, 4);
457 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
458 const TargetMachine &TM) :
459 R600GenSubtargetInfo(TT, GPU, FS),
460 AMDGPUSubtarget(TT),
461 InstrInfo(*this),
462 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
463 FMA(false),
464 CaymanISA(false),
465 CFALUBug(false),
466 DX10Clamp(false),
467 HasVertexCache(false),
468 R600ALUInst(false),
469 FP64(false),
470 TexVTXClauseSize(0),
471 Gen(R600),
472 TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
473 InstrItins(getInstrItineraryForCPU(GPU)) { }
475 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
476 unsigned NumRegionInstrs) const {
477 // Track register pressure so the scheduler can try to decrease
478 // pressure once register usage is above the threshold defined by
479 // SIRegisterInfo::getRegPressureSetLimit()
480 Policy.ShouldTrackPressure = true;
482 // Enabling both top down and bottom up scheduling seems to give us less
483 // register spills than just using one of these approaches on its own.
484 Policy.OnlyTopDown = false;
485 Policy.OnlyBottomUp = false;
487 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
488 if (!enableSIScheduler())
489 Policy.ShouldTrackLaneMasks = true;
492 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
493 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
494 if (SGPRs <= 80)
495 return 10;
496 if (SGPRs <= 88)
497 return 9;
498 if (SGPRs <= 100)
499 return 8;
500 return 7;
502 if (SGPRs <= 48)
503 return 10;
504 if (SGPRs <= 56)
505 return 9;
506 if (SGPRs <= 64)
507 return 8;
508 if (SGPRs <= 72)
509 return 7;
510 if (SGPRs <= 80)
511 return 6;
512 return 5;
515 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
516 if (VGPRs <= 24)
517 return 10;
518 if (VGPRs <= 28)
519 return 9;
520 if (VGPRs <= 32)
521 return 8;
522 if (VGPRs <= 36)
523 return 7;
524 if (VGPRs <= 40)
525 return 6;
526 if (VGPRs <= 48)
527 return 5;
528 if (VGPRs <= 64)
529 return 4;
530 if (VGPRs <= 84)
531 return 3;
532 if (VGPRs <= 128)
533 return 2;
534 return 1;
537 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
538 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
539 if (MFI.hasFlatScratchInit()) {
540 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
541 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
542 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
543 return 4; // FLAT_SCRATCH, VCC (in that order).
546 if (isXNACKEnabled())
547 return 4; // XNACK, VCC (in that order).
548 return 2; // VCC.
551 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
552 const Function &F = MF.getFunction();
553 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
555 // Compute maximum number of SGPRs function can use using default/requested
556 // minimum number of waves per execution unit.
557 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
558 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
559 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
561 // Check if maximum number of SGPRs was explicitly requested using
562 // "amdgpu-num-sgpr" attribute.
563 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
564 unsigned Requested = AMDGPU::getIntegerAttribute(
565 F, "amdgpu-num-sgpr", MaxNumSGPRs);
567 // Make sure requested value does not violate subtarget's specifications.
568 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
569 Requested = 0;
571 // If more SGPRs are required to support the input user/system SGPRs,
572 // increase to accommodate them.
574 // FIXME: This really ends up using the requested number of SGPRs + number
575 // of reserved special registers in total. Theoretically you could re-use
576 // the last input registers for these special registers, but this would
577 // require a lot of complexity to deal with the weird aliasing.
578 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
579 if (Requested && Requested < InputNumSGPRs)
580 Requested = InputNumSGPRs;
582 // Make sure requested value is compatible with values implied by
583 // default/requested minimum/maximum number of waves per execution unit.
584 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
585 Requested = 0;
586 if (WavesPerEU.second &&
587 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
588 Requested = 0;
590 if (Requested)
591 MaxNumSGPRs = Requested;
594 if (hasSGPRInitBug())
595 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
597 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
598 MaxAddressableNumSGPRs);
601 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
602 const Function &F = MF.getFunction();
603 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
605 // Compute maximum number of VGPRs function can use using default/requested
606 // minimum number of waves per execution unit.
607 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
608 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
610 // Check if maximum number of VGPRs was explicitly requested using
611 // "amdgpu-num-vgpr" attribute.
612 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
613 unsigned Requested = AMDGPU::getIntegerAttribute(
614 F, "amdgpu-num-vgpr", MaxNumVGPRs);
616 // Make sure requested value is compatible with values implied by
617 // default/requested minimum/maximum number of waves per execution unit.
618 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
619 Requested = 0;
620 if (WavesPerEU.second &&
621 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
622 Requested = 0;
624 if (Requested)
625 MaxNumVGPRs = Requested;
628 return MaxNumVGPRs;
631 namespace {
632 struct MemOpClusterMutation : ScheduleDAGMutation {
633 const SIInstrInfo *TII;
635 MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
637 void apply(ScheduleDAGInstrs *DAGInstrs) override {
638 ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
640 SUnit *SUa = nullptr;
641 // Search for two consequent memory operations and link them
642 // to prevent scheduler from moving them apart.
643 // In DAG pre-process SUnits are in the original order of
644 // the instructions before scheduling.
645 for (SUnit &SU : DAG->SUnits) {
646 MachineInstr &MI2 = *SU.getInstr();
647 if (!MI2.mayLoad() && !MI2.mayStore()) {
648 SUa = nullptr;
649 continue;
651 if (!SUa) {
652 SUa = &SU;
653 continue;
656 MachineInstr &MI1 = *SUa->getInstr();
657 if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
658 (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
659 (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
660 (TII->isDS(MI1) && TII->isDS(MI2))) {
661 SU.addPredBarrier(SUa);
663 for (const SDep &SI : SU.Preds) {
664 if (SI.getSUnit() != SUa)
665 SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
668 if (&SU != &DAG->ExitSU) {
669 for (const SDep &SI : SUa->Succs) {
670 if (SI.getSUnit() != &SU)
671 SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
676 SUa = &SU;
680 } // namespace
682 void GCNSubtarget::getPostRAMutations(
683 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
684 Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
687 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
688 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
689 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
690 else
691 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
694 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
695 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
696 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
697 else
698 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));