[AMDGPU] New gfx940 mfma instructions
[llvm-project.git] / llvm / lib / Target / AMDGPU / AMDGPUSubtarget.cpp
blob572df05d0c4ff23089ec223b236f6f4d31994ad4
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "R600Subtarget.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
33 using namespace llvm;
35 #define DEBUG_TYPE "amdgpu-subtarget"
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #undef AMDGPUSubtarget
43 static cl::opt<bool> DisablePowerSched(
44 "amdgpu-disable-power-sched",
45 cl::desc("Disable scheduling to minimize mAI power bursts"),
46 cl::init(false));
48 static cl::opt<bool> EnableVGPRIndexMode(
49 "amdgpu-vgpr-index-mode",
50 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51 cl::init(false));
53 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
54 cl::desc("Enable the use of AA during codegen."),
55 cl::init(true));
57 GCNSubtarget::~GCNSubtarget() = default;
59 GCNSubtarget &
60 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
61 StringRef GPU, StringRef FS) {
62 // Determine default and user-specified characteristics
64 // We want to be able to turn these off, but making this a subtarget feature
65 // for SI has the unhelpful behavior that it unsets everything else if you
66 // disable it.
68 // Similarly we want enable-prt-strict-null to be on by default and not to
69 // unset everything else if it is disabled
71 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
73 // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
74 if (isAmdHsaOS())
75 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
77 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
79 // Disable mutually exclusive bits.
80 if (FS.contains_insensitive("+wavefrontsize")) {
81 if (!FS.contains_insensitive("wavefrontsize16"))
82 FullFS += "-wavefrontsize16,";
83 if (!FS.contains_insensitive("wavefrontsize32"))
84 FullFS += "-wavefrontsize32,";
85 if (!FS.contains_insensitive("wavefrontsize64"))
86 FullFS += "-wavefrontsize64,";
89 FullFS += FS;
91 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
93 // Implement the "generic" processors, which acts as the default when no
94 // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
95 // the first amdgcn target that supports flat addressing. Other OSes defaults
96 // to the first amdgcn target.
97 if (Gen == AMDGPUSubtarget::INVALID) {
98 Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
99 : AMDGPUSubtarget::SOUTHERN_ISLANDS;
102 // We don't support FP64 for EG/NI atm.
103 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
105 // Targets must either support 64-bit offsets for MUBUF instructions, and/or
106 // support flat operations, otherwise they cannot access a 64-bit global
107 // address space
108 assert(hasAddr64() || hasFlat());
109 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
110 // that do not support ADDR64 variants of MUBUF instructions. Such targets
111 // cannot use a 64 bit offset with a MUBUF instruction to access the global
112 // address space
113 if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
114 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
115 FlatForGlobal = true;
117 // Unless +-flat-for-global is specified, use MUBUF instructions for global
118 // address space access if flat operations are not available.
119 if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
120 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
121 FlatForGlobal = false;
124 // Set defaults if needed.
125 if (MaxPrivateElementSize == 0)
126 MaxPrivateElementSize = 4;
128 if (LDSBankCount == 0)
129 LDSBankCount = 32;
131 if (TT.getArch() == Triple::amdgcn) {
132 if (LocalMemorySize == 0)
133 LocalMemorySize = 32768;
135 // Do something sensible for unspecified target.
136 if (!HasMovrel && !HasVGPRIndexMode)
137 HasMovrel = true;
140 // Don't crash on invalid devices.
141 if (WavefrontSizeLog2 == 0)
142 WavefrontSizeLog2 = 5;
144 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
145 HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
147 TargetID.setTargetIDFromFeaturesString(FS);
149 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
150 << TargetID.getXnackSetting() << '\n');
151 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
152 << TargetID.getSramEccSetting() << '\n');
154 return *this;
157 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
158 TargetTriple(TT),
159 GCN3Encoding(false),
160 Has16BitInsts(false),
161 HasMadMixInsts(false),
162 HasMadMacF32Insts(false),
163 HasDsSrc2Insts(false),
164 HasSDWA(false),
165 HasVOP3PInsts(false),
166 HasMulI24(true),
167 HasMulU24(true),
168 HasSMulHi(false),
169 HasInv2PiInlineImm(false),
170 HasFminFmaxLegacy(true),
171 EnablePromoteAlloca(false),
172 HasTrigReducedRange(false),
173 MaxWavesPerEU(10),
174 LocalMemorySize(0),
175 WavefrontSizeLog2(0)
178 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
179 const GCNTargetMachine &TM)
180 : // clang-format off
181 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
182 AMDGPUSubtarget(TT),
183 TargetTriple(TT),
184 TargetID(*this),
185 Gen(INVALID),
186 InstrItins(getInstrItineraryForCPU(GPU)),
187 LDSBankCount(0),
188 MaxPrivateElementSize(0),
190 FastFMAF32(false),
191 FastDenormalF32(false),
192 HalfRate64Ops(false),
193 FullRate64Ops(false),
195 FlatForGlobal(false),
196 AutoWaitcntBeforeBarrier(false),
197 BackOffBarrier(false),
198 UnalignedScratchAccess(false),
199 UnalignedAccessMode(false),
201 HasApertureRegs(false),
202 SupportsXNACK(false),
203 EnableXNACK(false),
204 EnableTgSplit(false),
205 EnableCuMode(false),
206 TrapHandler(false),
208 EnableLoadStoreOpt(false),
209 EnableUnsafeDSOffsetFolding(false),
210 EnableSIScheduler(false),
211 EnableDS128(false),
212 EnablePRTStrictNull(false),
213 DumpCode(false),
215 FP64(false),
216 CIInsts(false),
217 GFX8Insts(false),
218 GFX9Insts(false),
219 GFX90AInsts(false),
220 GFX940Insts(false),
221 GFX10Insts(false),
222 GFX10_3Insts(false),
223 GFX7GFX8GFX9Insts(false),
224 SGPRInitBug(false),
225 NegativeScratchOffsetBug(false),
226 NegativeUnalignedScratchOffsetBug(false),
227 HasSMemRealTime(false),
228 HasIntClamp(false),
229 HasFmaMixInsts(false),
230 HasMovrel(false),
231 HasVGPRIndexMode(false),
232 HasScalarStores(false),
233 HasScalarAtomics(false),
234 HasSDWAOmod(false),
235 HasSDWAScalar(false),
236 HasSDWASdst(false),
237 HasSDWAMac(false),
238 HasSDWAOutModsVOPC(false),
239 HasDPP(false),
240 HasDPP8(false),
241 Has64BitDPP(false),
242 HasPackedFP32Ops(false),
243 HasImageInsts(false),
244 HasExtendedImageInsts(false),
245 HasR128A16(false),
246 HasGFX10A16(false),
247 HasG16(false),
248 HasNSAEncoding(false),
249 NSAMaxSize(0),
250 GFX10_AEncoding(false),
251 GFX10_BEncoding(false),
252 HasDLInsts(false),
253 HasDot1Insts(false),
254 HasDot2Insts(false),
255 HasDot3Insts(false),
256 HasDot4Insts(false),
257 HasDot5Insts(false),
258 HasDot6Insts(false),
259 HasDot7Insts(false),
260 HasMAIInsts(false),
261 HasPkFmacF16Inst(false),
262 HasAtomicFaddInsts(false),
263 SupportsSRAMECC(false),
264 EnableSRAMECC(false),
265 HasNoSdstCMPX(false),
266 HasVscnt(false),
267 HasGetWaveIdInst(false),
268 HasSMemTimeInst(false),
269 HasShaderCyclesRegister(false),
270 HasVOP3Literal(false),
271 HasNoDataDepHazard(false),
272 FlatAddressSpace(false),
273 FlatInstOffsets(false),
274 FlatGlobalInsts(false),
275 FlatScratchInsts(false),
276 ScalarFlatScratchInsts(false),
277 HasArchitectedFlatScratch(false),
278 EnableFlatScratch(false),
279 AddNoCarryInsts(false),
280 HasUnpackedD16VMem(false),
281 LDSMisalignedBug(false),
282 HasMFMAInlineLiteralBug(false),
283 UnalignedBufferAccess(false),
284 UnalignedDSAccess(false),
285 HasPackedTID(false),
287 ScalarizeGlobal(false),
289 HasVcmpxPermlaneHazard(false),
290 HasVMEMtoScalarWriteHazard(false),
291 HasSMEMtoVectorWriteHazard(false),
292 HasInstFwdPrefetchBug(false),
293 HasVcmpxExecWARHazard(false),
294 HasLdsBranchVmemWARHazard(false),
295 HasNSAtoVMEMBug(false),
296 HasNSAClauseBug(false),
297 HasOffset3fBug(false),
298 HasFlatSegmentOffsetBug(false),
299 HasImageStoreD16Bug(false),
300 HasImageGather4D16Bug(false),
302 FeatureDisable(false),
303 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
304 TLInfo(TM, *this),
305 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
306 // clang-format on
307 MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
308 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
309 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
310 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
311 RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
312 InstSelector.reset(new AMDGPUInstructionSelector(
313 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
316 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
317 if (getGeneration() < GFX10)
318 return 1;
320 switch (Opcode) {
321 case AMDGPU::V_LSHLREV_B64_e64:
322 case AMDGPU::V_LSHLREV_B64_gfx10:
323 case AMDGPU::V_LSHL_B64_e64:
324 case AMDGPU::V_LSHRREV_B64_e64:
325 case AMDGPU::V_LSHRREV_B64_gfx10:
326 case AMDGPU::V_LSHR_B64_e64:
327 case AMDGPU::V_ASHRREV_I64_e64:
328 case AMDGPU::V_ASHRREV_I64_gfx10:
329 case AMDGPU::V_ASHR_I64_e64:
330 return 1;
333 return 2;
336 /// This list was mostly derived from experimentation.
337 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
338 switch (Opcode) {
339 case AMDGPU::V_CVT_F16_F32_e32:
340 case AMDGPU::V_CVT_F16_F32_e64:
341 case AMDGPU::V_CVT_F16_U16_e32:
342 case AMDGPU::V_CVT_F16_U16_e64:
343 case AMDGPU::V_CVT_F16_I16_e32:
344 case AMDGPU::V_CVT_F16_I16_e64:
345 case AMDGPU::V_RCP_F16_e64:
346 case AMDGPU::V_RCP_F16_e32:
347 case AMDGPU::V_RSQ_F16_e64:
348 case AMDGPU::V_RSQ_F16_e32:
349 case AMDGPU::V_SQRT_F16_e64:
350 case AMDGPU::V_SQRT_F16_e32:
351 case AMDGPU::V_LOG_F16_e64:
352 case AMDGPU::V_LOG_F16_e32:
353 case AMDGPU::V_EXP_F16_e64:
354 case AMDGPU::V_EXP_F16_e32:
355 case AMDGPU::V_SIN_F16_e64:
356 case AMDGPU::V_SIN_F16_e32:
357 case AMDGPU::V_COS_F16_e64:
358 case AMDGPU::V_COS_F16_e32:
359 case AMDGPU::V_FLOOR_F16_e64:
360 case AMDGPU::V_FLOOR_F16_e32:
361 case AMDGPU::V_CEIL_F16_e64:
362 case AMDGPU::V_CEIL_F16_e32:
363 case AMDGPU::V_TRUNC_F16_e64:
364 case AMDGPU::V_TRUNC_F16_e32:
365 case AMDGPU::V_RNDNE_F16_e64:
366 case AMDGPU::V_RNDNE_F16_e32:
367 case AMDGPU::V_FRACT_F16_e64:
368 case AMDGPU::V_FRACT_F16_e32:
369 case AMDGPU::V_FREXP_MANT_F16_e64:
370 case AMDGPU::V_FREXP_MANT_F16_e32:
371 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
372 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
373 case AMDGPU::V_LDEXP_F16_e64:
374 case AMDGPU::V_LDEXP_F16_e32:
375 case AMDGPU::V_LSHLREV_B16_e64:
376 case AMDGPU::V_LSHLREV_B16_e32:
377 case AMDGPU::V_LSHRREV_B16_e64:
378 case AMDGPU::V_LSHRREV_B16_e32:
379 case AMDGPU::V_ASHRREV_I16_e64:
380 case AMDGPU::V_ASHRREV_I16_e32:
381 case AMDGPU::V_ADD_U16_e64:
382 case AMDGPU::V_ADD_U16_e32:
383 case AMDGPU::V_SUB_U16_e64:
384 case AMDGPU::V_SUB_U16_e32:
385 case AMDGPU::V_SUBREV_U16_e64:
386 case AMDGPU::V_SUBREV_U16_e32:
387 case AMDGPU::V_MUL_LO_U16_e64:
388 case AMDGPU::V_MUL_LO_U16_e32:
389 case AMDGPU::V_ADD_F16_e64:
390 case AMDGPU::V_ADD_F16_e32:
391 case AMDGPU::V_SUB_F16_e64:
392 case AMDGPU::V_SUB_F16_e32:
393 case AMDGPU::V_SUBREV_F16_e64:
394 case AMDGPU::V_SUBREV_F16_e32:
395 case AMDGPU::V_MUL_F16_e64:
396 case AMDGPU::V_MUL_F16_e32:
397 case AMDGPU::V_MAX_F16_e64:
398 case AMDGPU::V_MAX_F16_e32:
399 case AMDGPU::V_MIN_F16_e64:
400 case AMDGPU::V_MIN_F16_e32:
401 case AMDGPU::V_MAX_U16_e64:
402 case AMDGPU::V_MAX_U16_e32:
403 case AMDGPU::V_MIN_U16_e64:
404 case AMDGPU::V_MIN_U16_e32:
405 case AMDGPU::V_MAX_I16_e64:
406 case AMDGPU::V_MAX_I16_e32:
407 case AMDGPU::V_MIN_I16_e64:
408 case AMDGPU::V_MIN_I16_e32:
409 case AMDGPU::V_MAD_F16_e64:
410 case AMDGPU::V_MAD_U16_e64:
411 case AMDGPU::V_MAD_I16_e64:
412 case AMDGPU::V_FMA_F16_e64:
413 case AMDGPU::V_DIV_FIXUP_F16_e64:
414 // On gfx10, all 16-bit instructions preserve the high bits.
415 return getGeneration() <= AMDGPUSubtarget::GFX9;
416 case AMDGPU::V_MADAK_F16:
417 case AMDGPU::V_MADMK_F16:
418 case AMDGPU::V_MAC_F16_e64:
419 case AMDGPU::V_MAC_F16_e32:
420 case AMDGPU::V_FMAMK_F16:
421 case AMDGPU::V_FMAAK_F16:
422 case AMDGPU::V_FMAC_F16_e64:
423 case AMDGPU::V_FMAC_F16_e32:
424 // In gfx9, the preferred handling of the unused high 16-bits changed. Most
425 // instructions maintain the legacy behavior of 0ing. Some instructions
426 // changed to preserving the high bits.
427 return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
428 case AMDGPU::V_MAD_MIXLO_F16:
429 case AMDGPU::V_MAD_MIXHI_F16:
430 default:
431 return false;
435 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
436 const Function &F) const {
437 if (NWaves == 1)
438 return getLocalMemorySize();
439 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
440 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
441 if (!WorkGroupsPerCu)
442 return 0;
443 unsigned MaxWaves = getMaxWavesPerEU();
444 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
447 // FIXME: Should return min,max range.
448 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
449 const Function &F) const {
450 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
451 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
452 if (!MaxWorkGroupsPerCu)
453 return 0;
455 const unsigned WaveSize = getWavefrontSize();
457 // FIXME: Do we need to account for alignment requirement of LDS rounding the
458 // size up?
459 // Compute restriction based on LDS usage
460 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
462 // This can be queried with more LDS than is possible, so just assume the
463 // worst.
464 if (NumGroups == 0)
465 return 1;
467 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
469 // Round to the number of waves.
470 const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
471 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
473 // Clamp to the maximum possible number of waves.
474 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
476 // FIXME: Needs to be a multiple of the group size?
477 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
479 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
480 "computed invalid occupancy");
481 return MaxWaves;
484 unsigned
485 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
486 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
487 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
490 std::pair<unsigned, unsigned>
491 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
492 switch (CC) {
493 case CallingConv::AMDGPU_VS:
494 case CallingConv::AMDGPU_LS:
495 case CallingConv::AMDGPU_HS:
496 case CallingConv::AMDGPU_ES:
497 case CallingConv::AMDGPU_GS:
498 case CallingConv::AMDGPU_PS:
499 return std::make_pair(1, getWavefrontSize());
500 default:
501 return std::make_pair(1u, getMaxFlatWorkGroupSize());
505 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
506 const Function &F) const {
507 // Default minimum/maximum flat work group sizes.
508 std::pair<unsigned, unsigned> Default =
509 getDefaultFlatWorkGroupSize(F.getCallingConv());
511 // Requested minimum/maximum flat work group sizes.
512 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
513 F, "amdgpu-flat-work-group-size", Default);
515 // Make sure requested minimum is less than requested maximum.
516 if (Requested.first > Requested.second)
517 return Default;
519 // Make sure requested values do not violate subtarget's specifications.
520 if (Requested.first < getMinFlatWorkGroupSize())
521 return Default;
522 if (Requested.second > getMaxFlatWorkGroupSize())
523 return Default;
525 return Requested;
528 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
529 const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
530 // Default minimum/maximum number of waves per execution unit.
531 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
533 // If minimum/maximum flat work group sizes were explicitly requested using
534 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
535 // number of waves per execution unit to values implied by requested
536 // minimum/maximum flat work group sizes.
537 unsigned MinImpliedByFlatWorkGroupSize =
538 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
539 Default.first = MinImpliedByFlatWorkGroupSize;
541 // Requested minimum/maximum number of waves per execution unit.
542 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
543 F, "amdgpu-waves-per-eu", Default, true);
545 // Make sure requested minimum is less than requested maximum.
546 if (Requested.second && Requested.first > Requested.second)
547 return Default;
549 // Make sure requested values do not violate subtarget's specifications.
550 if (Requested.first < getMinWavesPerEU() ||
551 Requested.second > getMaxWavesPerEU())
552 return Default;
554 // Make sure requested values are compatible with values implied by requested
555 // minimum/maximum flat work group sizes.
556 if (Requested.first < MinImpliedByFlatWorkGroupSize)
557 return Default;
559 return Requested;
562 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
563 auto Node = Kernel.getMetadata("reqd_work_group_size");
564 if (Node && Node->getNumOperands() == 3)
565 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
566 return std::numeric_limits<unsigned>::max();
569 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
570 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
573 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
574 unsigned Dimension) const {
575 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
576 if (ReqdSize != std::numeric_limits<unsigned>::max())
577 return ReqdSize - 1;
578 return getFlatWorkGroupSizes(Kernel).second - 1;
581 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
582 Function *Kernel = I->getParent()->getParent();
583 unsigned MinSize = 0;
584 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
585 bool IdQuery = false;
587 // If reqd_work_group_size is present it narrows value down.
588 if (auto *CI = dyn_cast<CallInst>(I)) {
589 const Function *F = CI->getCalledFunction();
590 if (F) {
591 unsigned Dim = UINT_MAX;
592 switch (F->getIntrinsicID()) {
593 case Intrinsic::amdgcn_workitem_id_x:
594 case Intrinsic::r600_read_tidig_x:
595 IdQuery = true;
596 LLVM_FALLTHROUGH;
597 case Intrinsic::r600_read_local_size_x:
598 Dim = 0;
599 break;
600 case Intrinsic::amdgcn_workitem_id_y:
601 case Intrinsic::r600_read_tidig_y:
602 IdQuery = true;
603 LLVM_FALLTHROUGH;
604 case Intrinsic::r600_read_local_size_y:
605 Dim = 1;
606 break;
607 case Intrinsic::amdgcn_workitem_id_z:
608 case Intrinsic::r600_read_tidig_z:
609 IdQuery = true;
610 LLVM_FALLTHROUGH;
611 case Intrinsic::r600_read_local_size_z:
612 Dim = 2;
613 break;
614 default:
615 break;
618 if (Dim <= 3) {
619 unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
620 if (ReqdSize != std::numeric_limits<unsigned>::max())
621 MinSize = MaxSize = ReqdSize;
626 if (!MaxSize)
627 return false;
629 // Range metadata is [Lo, Hi). For ID query we need to pass max size
630 // as Hi. For size query we need to pass Hi + 1.
631 if (IdQuery)
632 MinSize = 0;
633 else
634 ++MaxSize;
636 MDBuilder MDB(I->getContext());
637 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
638 APInt(32, MaxSize));
639 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
640 return true;
643 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
644 assert(AMDGPU::isKernel(F.getCallingConv()));
646 // We don't allocate the segment if we know the implicit arguments weren't
647 // used, even if the ABI implies we need them.
648 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
649 return 0;
651 if (isMesaKernel(F))
652 return 16;
654 // Assume all implicit inputs are used by default
655 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
658 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
659 Align &MaxAlign) const {
660 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
661 F.getCallingConv() == CallingConv::SPIR_KERNEL);
663 const DataLayout &DL = F.getParent()->getDataLayout();
664 uint64_t ExplicitArgBytes = 0;
665 MaxAlign = Align(1);
667 for (const Argument &Arg : F.args()) {
668 const bool IsByRef = Arg.hasByRefAttr();
669 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
670 MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
671 if (!Alignment)
672 Alignment = DL.getABITypeAlign(ArgTy);
674 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
675 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
676 MaxAlign = max(MaxAlign, Alignment);
679 return ExplicitArgBytes;
682 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
683 Align &MaxAlign) const {
684 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
686 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
688 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
689 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
690 if (ImplicitBytes != 0) {
691 const Align Alignment = getAlignmentForImplicitArgPtr();
692 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
693 MaxAlign = std::max(MaxAlign, Alignment);
696 // Being able to dereference past the end is useful for emitting scalar loads.
697 return alignTo(TotalSize, 4);
700 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
701 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
702 : AMDGPUDwarfFlavour::Wave64;
705 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
706 unsigned NumRegionInstrs) const {
707 // Track register pressure so the scheduler can try to decrease
708 // pressure once register usage is above the threshold defined by
709 // SIRegisterInfo::getRegPressureSetLimit()
710 Policy.ShouldTrackPressure = true;
712 // Enabling both top down and bottom up scheduling seems to give us less
713 // register spills than just using one of these approaches on its own.
714 Policy.OnlyTopDown = false;
715 Policy.OnlyBottomUp = false;
717 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
718 if (!enableSIScheduler())
719 Policy.ShouldTrackLaneMasks = true;
722 bool GCNSubtarget::hasMadF16() const {
723 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
726 bool GCNSubtarget::useVGPRIndexMode() const {
727 return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
730 bool GCNSubtarget::useAA() const { return UseAA; }
732 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
733 if (getGeneration() >= AMDGPUSubtarget::GFX10)
734 return getMaxWavesPerEU();
736 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
737 if (SGPRs <= 80)
738 return 10;
739 if (SGPRs <= 88)
740 return 9;
741 if (SGPRs <= 100)
742 return 8;
743 return 7;
745 if (SGPRs <= 48)
746 return 10;
747 if (SGPRs <= 56)
748 return 9;
749 if (SGPRs <= 64)
750 return 8;
751 if (SGPRs <= 72)
752 return 7;
753 if (SGPRs <= 80)
754 return 6;
755 return 5;
758 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
759 unsigned MaxWaves = getMaxWavesPerEU();
760 unsigned Granule = getVGPRAllocGranule();
761 if (VGPRs < Granule)
762 return MaxWaves;
763 unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
764 return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
767 unsigned
768 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
769 if (getGeneration() >= AMDGPUSubtarget::GFX10)
770 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
772 if (HasFlatScratch || HasArchitectedFlatScratch) {
773 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
774 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
775 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
776 return 4; // FLAT_SCRATCH, VCC (in that order).
779 if (isXNACKEnabled())
780 return 4; // XNACK, VCC (in that order).
781 return 2; // VCC.
784 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
785 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
786 return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
789 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
790 // In principle we do not need to reserve SGPR pair used for flat_scratch if
791 // we know flat instructions do not access the stack anywhere in the
792 // program. For now assume it's needed if we have flat instructions.
793 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
794 return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
797 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
798 unsigned NumSGPRs,
799 unsigned NumVGPRs) const {
800 unsigned Occupancy =
801 std::min(getMaxWavesPerEU(),
802 getOccupancyWithLocalMemSize(LDSSize, F));
803 if (NumSGPRs)
804 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
805 if (NumVGPRs)
806 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
807 return Occupancy;
810 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
811 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
812 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
813 // Compute maximum number of SGPRs function can use using default/requested
814 // minimum number of waves per execution unit.
815 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
816 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
818 // Check if maximum number of SGPRs was explicitly requested using
819 // "amdgpu-num-sgpr" attribute.
820 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
821 unsigned Requested = AMDGPU::getIntegerAttribute(
822 F, "amdgpu-num-sgpr", MaxNumSGPRs);
824 // Make sure requested value does not violate subtarget's specifications.
825 if (Requested && (Requested <= ReservedNumSGPRs))
826 Requested = 0;
828 // If more SGPRs are required to support the input user/system SGPRs,
829 // increase to accommodate them.
831 // FIXME: This really ends up using the requested number of SGPRs + number
832 // of reserved special registers in total. Theoretically you could re-use
833 // the last input registers for these special registers, but this would
834 // require a lot of complexity to deal with the weird aliasing.
835 unsigned InputNumSGPRs = PreloadedSGPRs;
836 if (Requested && Requested < InputNumSGPRs)
837 Requested = InputNumSGPRs;
839 // Make sure requested value is compatible with values implied by
840 // default/requested minimum/maximum number of waves per execution unit.
841 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
842 Requested = 0;
843 if (WavesPerEU.second &&
844 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
845 Requested = 0;
847 if (Requested)
848 MaxNumSGPRs = Requested;
851 if (hasSGPRInitBug())
852 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
854 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
857 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
858 const Function &F = MF.getFunction();
859 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
860 return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
861 getReservedNumSGPRs(MF));
/// Worst-case number of SGPRs preloaded with user and system values
/// (16 user + 5 system = 21), used for IR-level queries where the real
/// preloaded count is not yet known.
static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}
882 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
883 return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
884 getReservedNumSGPRs(F));
887 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
888 const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
889 // Compute maximum number of VGPRs function can use using default/requested
890 // minimum number of waves per execution unit.
891 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
893 // Check if maximum number of VGPRs was explicitly requested using
894 // "amdgpu-num-vgpr" attribute.
895 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
896 unsigned Requested = AMDGPU::getIntegerAttribute(
897 F, "amdgpu-num-vgpr", MaxNumVGPRs);
899 if (hasGFX90AInsts())
900 Requested *= 2;
902 // Make sure requested value is compatible with values implied by
903 // default/requested minimum/maximum number of waves per execution unit.
904 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
905 Requested = 0;
906 if (WavesPerEU.second &&
907 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
908 Requested = 0;
910 if (Requested)
911 MaxNumVGPRs = Requested;
914 return MaxNumVGPRs;
917 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
918 return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
921 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
922 const Function &F = MF.getFunction();
923 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
924 return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
927 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
928 int UseOpIdx, SDep &Dep) const {
929 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
930 !Def->isInstr() || !Use->isInstr())
931 return;
933 MachineInstr *DefI = Def->getInstr();
934 MachineInstr *UseI = Use->getInstr();
936 if (DefI->isBundle()) {
937 const SIRegisterInfo *TRI = getRegisterInfo();
938 auto Reg = Dep.getReg();
939 MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
940 MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
941 unsigned Lat = 0;
942 for (++I; I != E && I->isBundledWithPred(); ++I) {
943 if (I->modifiesRegister(Reg, TRI))
944 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
945 else if (Lat)
946 --Lat;
948 Dep.setLatency(Lat);
949 } else if (UseI->isBundle()) {
950 const SIRegisterInfo *TRI = getRegisterInfo();
951 auto Reg = Dep.getReg();
952 MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
953 MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
954 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
955 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
956 if (I->readsRegister(Reg, TRI))
957 break;
958 --Lat;
960 Dep.setLatency(Lat);
961 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
962 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
963 // implicit operands which come from the MCInstrDesc, which can fool
964 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
965 // pseudo operands.
966 Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
967 DefI, DefOpIdx, UseI, UseOpIdx));
971 namespace {
972 struct FillMFMAShadowMutation : ScheduleDAGMutation {
973 const SIInstrInfo *TII;
975 ScheduleDAGMI *DAG;
977 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
979 bool isSALU(const SUnit *SU) const {
980 const MachineInstr *MI = SU->getInstr();
981 return MI && TII->isSALU(*MI) && !MI->isTerminator();
984 bool isVALU(const SUnit *SU) const {
985 const MachineInstr *MI = SU->getInstr();
986 return MI && TII->isVALU(*MI);
989 bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
990 if (Pred->NodeNum < Succ->NodeNum)
991 return true;
993 SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
995 for (unsigned I = 0; I < Succs.size(); ++I) {
996 for (const SDep &SI : Succs[I]->Succs) {
997 const SUnit *SU = SI.getSUnit();
998 if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
999 Succs.push_back(SU);
1003 SmallPtrSet<const SUnit*, 32> Visited;
1004 while (!Preds.empty()) {
1005 const SUnit *SU = Preds.pop_back_val();
1006 if (llvm::is_contained(Succs, SU))
1007 return false;
1008 Visited.insert(SU);
1009 for (const SDep &SI : SU->Preds)
1010 if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
1011 Preds.push_back(SI.getSUnit());
1014 return true;
1017 // Link as many SALU instructions in chain as possible. Return the size
1018 // of the chain. Links up to MaxChain instructions.
1019 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
1020 SmallPtrSetImpl<SUnit *> &Visited) const {
1021 SmallVector<SUnit *, 8> Worklist({To});
1022 unsigned Linked = 0;
1024 while (!Worklist.empty() && MaxChain-- > 0) {
1025 SUnit *SU = Worklist.pop_back_val();
1026 if (!Visited.insert(SU).second)
1027 continue;
1029 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
1030 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
1032 if (SU->addPred(SDep(From, SDep::Artificial), false))
1033 ++Linked;
1035 for (SDep &SI : From->Succs) {
1036 SUnit *SUv = SI.getSUnit();
1037 if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
1038 SUv->addPred(SDep(SU, SDep::Artificial), false);
1041 for (SDep &SI : SU->Succs) {
1042 SUnit *Succ = SI.getSUnit();
1043 if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
1044 Worklist.push_back(Succ);
1048 return Linked;
1051 void apply(ScheduleDAGInstrs *DAGInstrs) override {
1052 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1053 if (!ST.hasMAIInsts() || DisablePowerSched)
1054 return;
1055 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1056 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1057 if (!TSchedModel || DAG->SUnits.empty())
1058 return;
1060 // Scan for MFMA long latency instructions and try to add a dependency
1061 // of available SALU instructions to give them a chance to fill MFMA
1062 // shadow. That is desirable to fill MFMA shadow with SALU instructions
1063 // rather than VALU to prevent power consumption bursts and throttle.
1064 auto LastSALU = DAG->SUnits.begin();
1065 auto E = DAG->SUnits.end();
1066 SmallPtrSet<SUnit*, 32> Visited;
1067 for (SUnit &SU : DAG->SUnits) {
1068 MachineInstr &MAI = *SU.getInstr();
1069 if (!TII->isMAI(MAI) ||
1070 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1071 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1072 continue;
1074 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1076 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1077 dbgs() << "Need " << Lat
1078 << " instructions to cover latency.\n");
1080 // Find up to Lat independent scalar instructions as early as
1081 // possible such that they can be scheduled after this MFMA.
1082 for ( ; Lat && LastSALU != E; ++LastSALU) {
1083 if (Visited.count(&*LastSALU))
1084 continue;
1086 if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1087 continue;
1089 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1094 } // namespace
1096 void GCNSubtarget::getPostRAMutations(
1097 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1098 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1101 std::unique_ptr<ScheduleDAGMutation>
1102 GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1103 return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
1106 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1107 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1108 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1109 else
1110 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1113 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1114 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1115 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1116 else
1117 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));