[AMDGPU] New gfx940 mfma instructions
[llvm-project.git] / llvm / lib / Target / AMDGPU / AMDGPUSubtarget.cpp
blob572df05d0c4ff23089ec223b236f6f4d31994ad4
1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPUInstructionSelector.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "R600Subtarget.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
33 using namespace llvm;
35 #define DEBUG_TYPE "amdgpu-subtarget"
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #undef AMDGPUSubtarget
43 static cl::opt<bool> DisablePowerSched(
44 "amdgpu-disable-power-sched",
45 cl::desc("Disable scheduling to minimize mAI power bursts"),
46 cl::init(false));
48 static cl::opt<bool> EnableVGPRIndexMode(
49 "amdgpu-vgpr-index-mode",
50 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
51 cl::init(false));
53 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
54 cl::desc("Enable the use of AA during codegen."),
55 cl::init(true));
57 GCNSubtarget::~GCNSubtarget() = default;
59 GCNSubtarget &
60 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
61 StringRef GPU, StringRef FS) {
62 // Determine default and user-specified characteristics
64 // We want to be able to turn these off, but making this a subtarget feature
65 // for SI has the unhelpful behavior that it unsets everything else if you
66 // disable it.
68 // Similarly we want enable-prt-strict-null to be on by default and not to
69 // unset everything else if it is disabled
71 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
73 // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
74 if (isAmdHsaOS())
75 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
77 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
79 // Disable mutually exclusive bits.
80 if (FS.contains_insensitive("+wavefrontsize")) {
81 if (!FS.contains_insensitive("wavefrontsize16"))
82 FullFS += "-wavefrontsize16,";
83 if (!FS.contains_insensitive("wavefrontsize32"))
84 FullFS += "-wavefrontsize32,";
85 if (!FS.contains_insensitive("wavefrontsize64"))
86 FullFS += "-wavefrontsize64,";
89 FullFS += FS;
91 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
93 // Implement the "generic" processors, which acts as the default when no
94 // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
95 // the first amdgcn target that supports flat addressing. Other OSes defaults
96 // to the first amdgcn target.
97 if (Gen == AMDGPUSubtarget::INVALID) {
98 Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
99 : AMDGPUSubtarget::SOUTHERN_ISLANDS;
102 // We don't support FP64 for EG/NI atm.
103 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
105 // Targets must either support 64-bit offsets for MUBUF instructions, and/or
106 // support flat operations, otherwise they cannot access a 64-bit global
107 // address space
108 assert(hasAddr64() || hasFlat());
109 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
110 // that do not support ADDR64 variants of MUBUF instructions. Such targets
111 // cannot use a 64 bit offset with a MUBUF instruction to access the global
112 // address space
113 if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
114 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
115 FlatForGlobal = true;
117 // Unless +-flat-for-global is specified, use MUBUF instructions for global
118 // address space access if flat operations are not available.
119 if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
120 ToggleFeature(AMDGPU::FeatureFlatForGlobal);
121 FlatForGlobal = false;
124 // Set defaults if needed.
125 if (MaxPrivateElementSize == 0)
126 MaxPrivateElementSize = 4;
128 if (LDSBankCount == 0)
129 LDSBankCount = 32;
131 if (TT.getArch() == Triple::amdgcn) {
132 if (LocalMemorySize == 0)
133 LocalMemorySize = 32768;
135 // Do something sensible for unspecified target.
136 if (!HasMovrel && !HasVGPRIndexMode)
137 HasMovrel = true;
140 // Don't crash on invalid devices.
141 if (WavefrontSizeLog2 == 0)
142 WavefrontSizeLog2 = 5;
144 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
145 HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;
147 TargetID.setTargetIDFromFeaturesString(FS);
149 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
150 << TargetID.getXnackSetting() << '\n');
151 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
152 << TargetID.getSramEccSetting() << '\n');
154 return *this;
157 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
158 TargetTriple(TT),
159 GCN3Encoding(false),
160 Has16BitInsts(false),
161 HasMadMixInsts(false),
162 HasMadMacF32Insts(false),
163 HasDsSrc2Insts(false),
164 HasSDWA(false),
165 HasVOP3PInsts(false),
166 HasMulI24(true),
167 HasMulU24(true),
168 HasSMulHi(false),
169 HasInv2PiInlineImm(false),
170 HasFminFmaxLegacy(true),
171 EnablePromoteAlloca(false),
172 HasTrigReducedRange(false),
173 MaxWavesPerEU(10),
174 LocalMemorySize(0),
175 WavefrontSizeLog2(0)
178 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
179 const GCNTargetMachine &TM)
180 : // clang-format off
181 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
182 AMDGPUSubtarget(TT),
183 TargetTriple(TT),
184 TargetID(*this),
185 Gen(INVALID),
186 InstrItins(getInstrItineraryForCPU(GPU)),
187 LDSBankCount(0),
188 MaxPrivateElementSize(0),
190 FastFMAF32(false),
191 FastDenormalF32(false),
192 HalfRate64Ops(false),
193 FullRate64Ops(false),
195 FlatForGlobal(false),
196 AutoWaitcntBeforeBarrier(false),
197 BackOffBarrier(false),
198 UnalignedScratchAccess(false),
199 UnalignedAccessMode(false),
201 HasApertureRegs(false),
202 SupportsXNACK(false),
203 EnableXNACK(false),
204 EnableTgSplit(false),
205 EnableCuMode(false),
206 TrapHandler(false),
208 EnableLoadStoreOpt(false),
209 EnableUnsafeDSOffsetFolding(false),
210 EnableSIScheduler(false),
211 EnableDS128(false),
212 EnablePRTStrictNull(false),
213 DumpCode(false),
215 FP64(false),
216 CIInsts(false),
217 GFX8Insts(false),
218 GFX9Insts(false),
219 GFX90AInsts(false),
220 GFX940Insts(false),
221 GFX10Insts(false),
222 GFX10_3Insts(false),
223 GFX7GFX8GFX9Insts(false),
224 SGPRInitBug(false),
225 NegativeScratchOffsetBug(false),
226 NegativeUnalignedScratchOffsetBug(false),
227 HasSMemRealTime(false),
228 HasIntClamp(false),
229 HasFmaMixInsts(false),
230 HasMovrel(false),
231 HasVGPRIndexMode(false),
232 HasScalarStores(false),
233 HasScalarAtomics(false),
234 HasSDWAOmod(false),
235 HasSDWAScalar(false),
236 HasSDWASdst(false),
237 HasSDWAMac(false),
238 HasSDWAOutModsVOPC(false),
239 HasDPP(false),
240 HasDPP8(false),
241 Has64BitDPP(false),
242 HasPackedFP32Ops(false),
243 HasImageInsts(false),
244 HasExtendedImageInsts(false),
245 HasR128A16(false),
246 HasGFX10A16(false),
247 HasG16(false),
248 HasNSAEncoding(false),
249 NSAMaxSize(0),
250 GFX10_AEncoding(false),
251 GFX10_BEncoding(false),
252 HasDLInsts(false),
253 HasDot1Insts(false),
254 HasDot2Insts(false),
255 HasDot3Insts(false),
256 HasDot4Insts(false),
257 HasDot5Insts(false),
258 HasDot6Insts(false),
259 HasDot7Insts(false),
260 HasMAIInsts(false),
261 HasPkFmacF16Inst(false),
262 HasAtomicFaddInsts(false),
263 SupportsSRAMECC(false),
264 EnableSRAMECC(false),
265 HasNoSdstCMPX(false),
266 HasVscnt(false),
267 HasGetWaveIdInst(false),
268 HasSMemTimeInst(false),
269 HasShaderCyclesRegister(false),
270 HasVOP3Literal(false),
271 HasNoDataDepHazard(false),
272 FlatAddressSpace(false),
273 FlatInstOffsets(false),
274 FlatGlobalInsts(false),
275 FlatScratchInsts(false),
276 ScalarFlatScratchInsts(false),
277 HasArchitectedFlatScratch(false),
278 EnableFlatScratch(false),
279 AddNoCarryInsts(false),
280 HasUnpackedD16VMem(false),
281 LDSMisalignedBug(false),
282 HasMFMAInlineLiteralBug(false),
283 UnalignedBufferAccess(false),
284 UnalignedDSAccess(false),
285 HasPackedTID(false),
287 ScalarizeGlobal(false),
289 HasVcmpxPermlaneHazard(false),
290 HasVMEMtoScalarWriteHazard(false),
291 HasSMEMtoVectorWriteHazard(false),
292 HasInstFwdPrefetchBug(false),
293 HasVcmpxExecWARHazard(false),
294 HasLdsBranchVmemWARHazard(false),
295 HasNSAtoVMEMBug(false),
296 HasNSAClauseBug(false),
297 HasOffset3fBug(false),
298 HasFlatSegmentOffsetBug(false),
299 HasImageStoreD16Bug(false),
300 HasImageGather4D16Bug(false),
302 FeatureDisable(false),
303 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
304 TLInfo(TM, *this),
305 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
306 // clang-format on
307 MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
308 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
309 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
310 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
311 RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
312 InstSelector.reset(new AMDGPUInstructionSelector(
313 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
316 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
317 if (getGeneration() < GFX10)
318 return 1;
320 switch (Opcode) {
321 case AMDGPU::V_LSHLREV_B64_e64:
322 case AMDGPU::V_LSHLREV_B64_gfx10:
323 case AMDGPU::V_LSHL_B64_e64:
324 case AMDGPU::V_LSHRREV_B64_e64:
325 case AMDGPU::V_LSHRREV_B64_gfx10:
326 case AMDGPU::V_LSHR_B64_e64:
327 case AMDGPU::V_ASHRREV_I64_e64:
328 case AMDGPU::V_ASHRREV_I64_gfx10:
329 case AMDGPU::V_ASHR_I64_e64:
330 return 1;
333 return 2;
336 /// This list was mostly derived from experimentation.
337 bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
338 switch (Opcode) {
339 case AMDGPU::V_CVT_F16_F32_e32:
340 case AMDGPU::V_CVT_F16_F32_e64:
341 case AMDGPU::V_CVT_F16_U16_e32:
342 case AMDGPU::V_CVT_F16_U16_e64:
343 case AMDGPU::V_CVT_F16_I16_e32:
344 case AMDGPU::V_CVT_F16_I16_e64:
345 case AMDGPU::V_RCP_F16_e64:
346 case AMDGPU::V_RCP_F16_e32:
347 case AMDGPU::V_RSQ_F16_e64:
348 case AMDGPU::V_RSQ_F16_e32:
349 case AMDGPU::V_SQRT_F16_e64:
350 case AMDGPU::V_SQRT_F16_e32:
351 case AMDGPU::V_LOG_F16_e64:
352 case AMDGPU::V_LOG_F16_e32:
353 case AMDGPU::V_EXP_F16_e64:
354 case AMDGPU::V_EXP_F16_e32:
355 case AMDGPU::V_SIN_F16_e64:
356 case AMDGPU::V_SIN_F16_e32:
357 case AMDGPU::V_COS_F16_e64:
358 case AMDGPU::V_COS_F16_e32:
359 case AMDGPU::V_FLOOR_F16_e64:
360 case AMDGPU::V_FLOOR_F16_e32:
361 case AMDGPU::V_CEIL_F16_e64:
362 case AMDGPU::V_CEIL_F16_e32:
363 case AMDGPU::V_TRUNC_F16_e64:
364 case AMDGPU::V_TRUNC_F16_e32:
365 case AMDGPU::V_RNDNE_F16_e64:
366 case AMDGPU::V_RNDNE_F16_e32:
367 case AMDGPU::V_FRACT_F16_e64:
368 case AMDGPU::V_FRACT_F16_e32:
369 case AMDGPU::V_FREXP_MANT_F16_e64:
370 case AMDGPU::V_FREXP_MANT_F16_e32:
371 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
372 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
373 case AMDGPU::V_LDEXP_F16_e64:
374 case AMDGPU::V_LDEXP_F16_e32:
375 case AMDGPU::V_LSHLREV_B16_e64:
376 case AMDGPU::V_LSHLREV_B16_e32:
377 case AMDGPU::V_LSHRREV_B16_e64:
378 case AMDGPU::V_LSHRREV_B16_e32:
379 case AMDGPU::V_ASHRREV_I16_e64:
380 case AMDGPU::V_ASHRREV_I16_e32:
381 case AMDGPU::V_ADD_U16_e64:
382 case AMDGPU::V_ADD_U16_e32:
383 case AMDGPU::V_SUB_U16_e64:
384 case AMDGPU::V_SUB_U16_e32:
385 case AMDGPU::V_SUBREV_U16_e64:
386 case AMDGPU::V_SUBREV_U16_e32:
387 case AMDGPU::V_MUL_LO_U16_e64:
388 case AMDGPU::V_MUL_LO_U16_e32:
389 case AMDGPU::V_ADD_F16_e64:
390 case AMDGPU::V_ADD_F16_e32:
391 case AMDGPU::V_SUB_F16_e64:
392 case AMDGPU::V_SUB_F16_e32:
393 case AMDGPU::V_SUBREV_F16_e64:
394 case AMDGPU::V_SUBREV_F16_e32:
395 case AMDGPU::V_MUL_F16_e64:
396 case AMDGPU::V_MUL_F16_e32:
397 case AMDGPU::V_MAX_F16_e64:
398 case AMDGPU::V_MAX_F16_e32:
399 case AMDGPU::V_MIN_F16_e64:
400 case AMDGPU::V_MIN_F16_e32:
401 case AMDGPU::V_MAX_U16_e64:
402 case AMDGPU::V_MAX_U16_e32:
403 case AMDGPU::V_MIN_U16_e64:
404 case AMDGPU::V_MIN_U16_e32:
405 case AMDGPU::V_MAX_I16_e64:
406 case AMDGPU::V_MAX_I16_e32:
407 case AMDGPU::V_MIN_I16_e64:
408 case AMDGPU::V_MIN_I16_e32:
409 case AMDGPU::V_MAD_F16_e64:
410 case AMDGPU::V_MAD_U16_e64:
411 case AMDGPU::V_MAD_I16_e64:
412 case AMDGPU::V_FMA_F16_e64:
413 case AMDGPU::V_DIV_FIXUP_F16_e64:
414 // On gfx10, all 16-bit instructions preserve the high bits.
415 return getGeneration() <= AMDGPUSubtarget::GFX9;
416 case AMDGPU::V_MADAK_F16:
417 case AMDGPU::V_MADMK_F16:
418 case AMDGPU::V_MAC_F16_e64:
419 case AMDGPU::V_MAC_F16_e32:
420 case AMDGPU::V_FMAMK_F16:
421 case AMDGPU::V_FMAAK_F16:
422 case AMDGPU::V_FMAC_F16_e64:
423 case AMDGPU::V_FMAC_F16_e32:
424 // In gfx9, the preferred handling of the unused high 16-bits changed. Most
425 // instructions maintain the legacy behavior of 0ing. Some instructions
426 // changed to preserving the high bits.
427 return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
428 case AMDGPU::V_MAD_MIXLO_F16:
429 case AMDGPU::V_MAD_MIXHI_F16:
430 default:
431 return false;
435 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
436 const Function &F) const {
437 if (NWaves == 1)
438 return getLocalMemorySize();
439 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
440 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
441 if (!WorkGroupsPerCu)
442 return 0;
443 unsigned MaxWaves = getMaxWavesPerEU();
444 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
447 // FIXME: Should return min,max range.
448 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
449 const Function &F) const {
450 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
451 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
452 if (!MaxWorkGroupsPerCu)
453 return 0;
455 const unsigned WaveSize = getWavefrontSize();
457 // FIXME: Do we need to account for alignment requirement of LDS rounding the
458 // size up?
459 // Compute restriction based on LDS usage
460 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
462 // This can be queried with more LDS than is possible, so just assume the
463 // worst.
464 if (NumGroups == 0)
465 return 1;
467 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
469 // Round to the number of waves.
470 const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
471 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
473 // Clamp to the maximum possible number of waves.
474 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
476 // FIXME: Needs to be a multiple of the group size?
477 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
479 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
480 "computed invalid occupancy");
481 return MaxWaves;
484 unsigned
485 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
486 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
487 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
490 std::pair<unsigned, unsigned>
491 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
492 switch (CC) {
493 case CallingConv::AMDGPU_VS:
494 case CallingConv::AMDGPU_LS:
495 case CallingConv::AMDGPU_HS:
496 case CallingConv::AMDGPU_ES:
497 case CallingConv::AMDGPU_GS:
498 case CallingConv::AMDGPU_PS:
499 return std::make_pair(1, getWavefrontSize());
500 default:
501 return std::make_pair(1u, getMaxFlatWorkGroupSize());
505 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
506 const Function &F) const {
507 // Default minimum/maximum flat work group sizes.
508 std::pair<unsigned, unsigned> Default =
509 getDefaultFlatWorkGroupSize(F.getCallingConv());
511 // Requested minimum/maximum flat work group sizes.
512 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
513 F, "amdgpu-flat-work-group-size", Default);
515 // Make sure requested minimum is less than requested maximum.
516 if (Requested.first > Requested.second)
517 return Default;
519 // Make sure requested values do not violate subtarget's specifications.
520 if (Requested.first < getMinFlatWorkGroupSize())
521 return Default;
522 if (Requested.second > getMaxFlatWorkGroupSize())
523 return Default;
525 return Requested;
528 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
529 const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
530 // Default minimum/maximum number of waves per execution unit.
531 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
533 // If minimum/maximum flat work group sizes were explicitly requested using
534 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
535 // number of waves per execution unit to values implied by requested
536 // minimum/maximum flat work group sizes.
537 unsigned MinImpliedByFlatWorkGroupSize =
538 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
539 Default.first = MinImpliedByFlatWorkGroupSize;
541 // Requested minimum/maximum number of waves per execution unit.
542 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
543 F, "amdgpu-waves-per-eu", Default, true);
545 // Make sure requested minimum is less than requested maximum.
546 if (Requested.second && Requested.first > Requested.second)
547 return Default;
549 // Make sure requested values do not violate subtarget's specifications.
550 if (Requested.first < getMinWavesPerEU() ||
551 Requested.second > getMaxWavesPerEU())
552 return Default;
554 // Make sure requested values are compatible with values implied by requested
555 // minimum/maximum flat work group sizes.
556 if (Requested.first < MinImpliedByFlatWorkGroupSize)
557 return Default;
559 return Requested;
562 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
563 auto Node = Kernel.getMetadata("reqd_work_group_size");
564 if (Node && Node->getNumOperands() == 3)
565 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
566 return std::numeric_limits<unsigned>::max();
569 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
570 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
573 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
574 unsigned Dimension) const {
575 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
576 if (ReqdSize != std::numeric_limits<unsigned>::max())
577 return ReqdSize - 1;
578 return getFlatWorkGroupSizes(Kernel).second - 1;
581 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
582 Function *Kernel = I->getParent()->getParent();
583 unsigned MinSize = 0;
584 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
585 bool IdQuery = false;
587 // If reqd_work_group_size is present it narrows value down.
588 if (auto *CI = dyn_cast<CallInst>(I)) {
589 const Function *F = CI->getCalledFunction();
590 if (F) {
591 unsigned Dim = UINT_MAX;
592 switch (F->getIntrinsicID()) {
593 case Intrinsic::amdgcn_workitem_id_x:
594 case Intrinsic::r600_read_tidig_x:
595 IdQuery = true;
596 LLVM_FALLTHROUGH;
597 case Intrinsic::r600_read_local_size_x:
598 Dim = 0;
599 break;
600 case Intrinsic::amdgcn_workitem_id_y:
601 case Intrinsic::r600_read_tidig_y:
602 IdQuery = true;
603 LLVM_FALLTHROUGH;
604 case Intrinsic::r600_read_local_size_y:
605 Dim = 1;
606 break;
607 case Intrinsic::amdgcn_workitem_id_z:
608 case Intrinsic::r600_read_tidig_z:
609 IdQuery = true;
610 LLVM_FALLTHROUGH;
611 case Intrinsic::r600_read_local_size_z:
612 Dim = 2;
613 break;
614 default:
615 break;
618 if (Dim <= 3) {
619 unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
620 if (ReqdSize != std::numeric_limits<unsigned>::max())
621 MinSize = MaxSize = ReqdSize;
626 if (!MaxSize)
627 return false;
629 // Range metadata is [Lo, Hi). For ID query we need to pass max size
630 // as Hi. For size query we need to pass Hi + 1.
631 if (IdQuery)
632 MinSize = 0;
633 else
634 ++MaxSize;
636 MDBuilder MDB(I->getContext());
637 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
638 APInt(32, MaxSize));
639 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
640 return true;
643 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
644 assert(AMDGPU::isKernel(F.getCallingConv()));
646 // We don't allocate the segment if we know the implicit arguments weren't
647 // used, even if the ABI implies we need them.
648 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
649 return 0;
651 if (isMesaKernel(F))
652 return 16;
654 // Assume all implicit inputs are used by default
655 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
658 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
659 Align &MaxAlign) const {
660 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
661 F.getCallingConv() == CallingConv::SPIR_KERNEL);
663 const DataLayout &DL = F.getParent()->getDataLayout();
664 uint64_t ExplicitArgBytes = 0;
665 MaxAlign = Align(1);
667 for (const Argument &Arg : F.args()) {
668 const bool IsByRef = Arg.hasByRefAttr();
669 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
670 MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
671 if (!Alignment)
672 Alignment = DL.getABITypeAlign(ArgTy);
674 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
675 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
676 MaxAlign = max(MaxAlign, Alignment);
679 return ExplicitArgBytes;
682 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
683 Align &MaxAlign) const {
684 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
686 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
688 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
689 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
690 if (ImplicitBytes != 0) {
691 const Align Alignment = getAlignmentForImplicitArgPtr();
692 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
693 MaxAlign = std::max(MaxAlign, Alignment);
696 // Being able to dereference past the end is useful for emitting scalar loads.
697 return alignTo(TotalSize, 4);
700 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
701 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
702 : AMDGPUDwarfFlavour::Wave64;
705 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
706 unsigned NumRegionInstrs) const {
707 // Track register pressure so the scheduler can try to decrease
708 // pressure once register usage is above the threshold defined by
709 // SIRegisterInfo::getRegPressureSetLimit()
710 Policy.ShouldTrackPressure = true;
712 // Enabling both top down and bottom up scheduling seems to give us less
713 // register spills than just using one of these approaches on its own.
714 Policy.OnlyTopDown = false;
715 Policy.OnlyBottomUp = false;
717 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
718 if (!enableSIScheduler())
719 Policy.ShouldTrackLaneMasks = true;
722 bool GCNSubtarget::hasMadF16() const {
723 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
726 bool GCNSubtarget::useVGPRIndexMode() const {
727 return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
730 bool GCNSubtarget::useAA() const { return UseAA; }
732 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
733 if (getGeneration() >= AMDGPUSubtarget::GFX10)
734 return getMaxWavesPerEU();
736 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
737 if (SGPRs <= 80)
738 return 10;
739 if (SGPRs <= 88)
740 return 9;
741 if (SGPRs <= 100)
742 return 8;
743 return 7;
745 if (SGPRs <= 48)
746 return 10;
747 if (SGPRs <= 56)
748 return 9;
749 if (SGPRs <= 64)
750 return 8;
751 if (SGPRs <= 72)
752 return 7;
753 if (SGPRs <= 80)
754 return 6;
755 return 5;
758 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
759 unsigned MaxWaves = getMaxWavesPerEU();
760 unsigned Granule = getVGPRAllocGranule();
761 if (VGPRs < Granule)
762 return MaxWaves;
763 unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
764 return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
767 unsigned
768 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
769 if (getGeneration() >= AMDGPUSubtarget::GFX10)
770 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
772 if (HasFlatScratch || HasArchitectedFlatScratch) {
773 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
774 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
775 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
776 return 4; // FLAT_SCRATCH, VCC (in that order).
779 if (isXNACKEnabled())
780 return 4; // XNACK, VCC (in that order).
781 return 2; // VCC.
784 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
785 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
786 return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
789 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
790 // In principle we do not need to reserve SGPR pair used for flat_scratch if
791 // we know flat instructions do not access the stack anywhere in the
792 // program. For now assume it's needed if we have flat instructions.
793 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
794 return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
797 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
798 unsigned NumSGPRs,
799 unsigned NumVGPRs) const {
800 unsigned Occupancy =
801 std::min(getMaxWavesPerEU(),
802 getOccupancyWithLocalMemSize(LDSSize, F));
803 if (NumSGPRs)
804 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
805 if (NumVGPRs)
806 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
807 return Occupancy;
810 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
811 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
812 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
813 // Compute maximum number of SGPRs function can use using default/requested
814 // minimum number of waves per execution unit.
815 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
816 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
818 // Check if maximum number of SGPRs was explicitly requested using
819 // "amdgpu-num-sgpr" attribute.
820 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
821 unsigned Requested = AMDGPU::getIntegerAttribute(
822 F, "amdgpu-num-sgpr", MaxNumSGPRs);
824 // Make sure requested value does not violate subtarget's specifications.
825 if (Requested && (Requested <= ReservedNumSGPRs))
826 Requested = 0;
828 // If more SGPRs are required to support the input user/system SGPRs,
829 // increase to accommodate them.
831 // FIXME: This really ends up using the requested number of SGPRs + number
832 // of reserved special registers in total. Theoretically you could re-use
833 // the last input registers for these special registers, but this would
834 // require a lot of complexity to deal with the weird aliasing.
835 unsigned InputNumSGPRs = PreloadedSGPRs;
836 if (Requested && Requested < InputNumSGPRs)
837 Requested = InputNumSGPRs;
839 // Make sure requested value is compatible with values implied by
840 // default/requested minimum/maximum number of waves per execution unit.
841 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
842 Requested = 0;
843 if (WavesPerEU.second &&
844 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
845 Requested = 0;
847 if (Requested)
848 MaxNumSGPRs = Requested;
851 if (hasSGPRInitBug())
852 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
854 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
857 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
858 const Function &F = MF.getFunction();
859 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
860 return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
861 getReservedNumSGPRs(MF));
/// Worst-case number of SGPRs preloaded with user and system values
/// (16 user + 5 system = 21), used for IR-level queries where the real
/// preloaded count is not yet known.
static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}
882 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
883 return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
884 getReservedNumSGPRs(F));
887 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
888 const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
889 // Compute maximum number of VGPRs function can use using default/requested
890 // minimum number of waves per execution unit.
891 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
893 // Check if maximum number of VGPRs was explicitly requested using
894 // "amdgpu-num-vgpr" attribute.
895 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
896 unsigned Requested = AMDGPU::getIntegerAttribute(
897 F, "amdgpu-num-vgpr", MaxNumVGPRs);
899 if (hasGFX90AInsts())
900 Requested *= 2;
902 // Make sure requested value is compatible with values implied by
903 // default/requested minimum/maximum number of waves per execution unit.
904 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
905 Requested = 0;
906 if (WavesPerEU.second &&
907 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
908 Requested = 0;
910 if (Requested)
911 MaxNumVGPRs = Requested;
914 return MaxNumVGPRs;
917 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
918 return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
921 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
922 const Function &F = MF.getFunction();
923 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
924 return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
927 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
928 int UseOpIdx, SDep &Dep) const {
929 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
930 !Def->isInstr() || !Use->isInstr())
931 return;
933 MachineInstr *DefI = Def->getInstr();
934 MachineInstr *UseI = Use->getInstr();
936 if (DefI->isBundle()) {
937 const SIRegisterInfo *TRI = getRegisterInfo();
938 auto Reg = Dep.getReg();
939 MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
940 MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
941 unsigned Lat = 0;
942 for (++I; I != E && I->isBundledWithPred(); ++I) {
943 if (I->modifiesRegister(Reg, TRI))
944 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
945 else if (Lat)
946 --Lat;
948 Dep.setLatency(Lat);
949 } else if (UseI->isBundle()) {
950 const SIRegisterInfo *TRI = getRegisterInfo();
951 auto Reg = Dep.getReg();
952 MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
953 MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
954 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
955 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
956 if (I->readsRegister(Reg, TRI))
957 break;
958 --Lat;
960 Dep.setLatency(Lat);
961 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
962 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
963 // implicit operands which come from the MCInstrDesc, which can fool
964 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
965 // pseudo operands.
966 Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
967 DefI, DefOpIdx, UseI, UseOpIdx));
971 namespace {
972 struct FillMFMAShadowMutation : ScheduleDAGMutation {
973 const SIInstrInfo *TII;
975 ScheduleDAGMI *DAG;
977 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
979 bool isSALU(const SUnit *SU) const {
980 const MachineInstr *MI = SU->getInstr();
981 return MI && TII->isSALU(*MI) && !MI->isTerminator();
984 bool isVALU(const SUnit *SU) const {
985 const MachineInstr *MI = SU->getInstr();
986 return MI && TII->isVALU(*MI);
989 bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
990 if (Pred->NodeNum < Succ->NodeNum)
991 return true;
993 SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
995 for (unsigned I = 0; I < Succs.size(); ++I) {
996 for (const SDep &SI : Succs[I]->Succs) {
997 const SUnit *SU = SI.getSUnit();
998 if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
999 Succs.push_back(SU);
1003 SmallPtrSet<const SUnit*, 32> Visited;
1004 while (!Preds.empty()) {
1005 const SUnit *SU = Preds.pop_back_val();
1006 if (llvm::is_contained(Succs, SU))
1007 return false;
1008 Visited.insert(SU);
1009 for (const SDep &SI : SU->Preds)
1010 if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
1011 Preds.push_back(SI.getSUnit());
1014 return true;
1017 // Link as many SALU instructions in chain as possible. Return the size
1018 // of the chain. Links up to MaxChain instructions.
1019 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
1020 SmallPtrSetImpl<SUnit *> &Visited) const {
1021 SmallVector<SUnit *, 8> Worklist({To});
1022 unsigned Linked = 0;
1024 while (!Worklist.empty() && MaxChain-- > 0) {
1025 SUnit *SU = Worklist.pop_back_val();
1026 if (!Visited.insert(SU).second)
1027 continue;
1029 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
1030 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
1032 if (SU->addPred(SDep(From, SDep::Artificial), false))
1033 ++Linked;
1035 for (SDep &SI : From->Succs) {
1036 SUnit *SUv = SI.getSUnit();
1037 if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
1038 SUv->addPred(SDep(SU, SDep::Artificial), false);
1041 for (SDep &SI : SU->Succs) {
1042 SUnit *Succ = SI.getSUnit();
1043 if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
1044 Worklist.push_back(Succ);
1048 return Linked;
1051 void apply(ScheduleDAGInstrs *DAGInstrs) override {
1052 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1053 if (!ST.hasMAIInsts() || DisablePowerSched)
1054 return;
1055 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
1056 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1057 if (!TSchedModel || DAG->SUnits.empty())
1058 return;
1060 // Scan for MFMA long latency instructions and try to add a dependency
1061 // of available SALU instructions to give them a chance to fill MFMA
1062 // shadow. That is desirable to fill MFMA shadow with SALU instructions
1063 // rather than VALU to prevent power consumption bursts and throttle.
1064 auto LastSALU = DAG->SUnits.begin();
1065 auto E = DAG->SUnits.end();
1066 SmallPtrSet<SUnit*, 32> Visited;
1067 for (SUnit &SU : DAG->SUnits) {
1068 MachineInstr &MAI = *SU.getInstr();
1069 if (!TII->isMAI(MAI) ||
1070 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1071 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1072 continue;
1074 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1076 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1077 dbgs() << "Need " << Lat
1078 << " instructions to cover latency.\n");
1080 // Find up to Lat independent scalar instructions as early as
1081 // possible such that they can be scheduled after this MFMA.
1082 for ( ; Lat && LastSALU != E; ++LastSALU) {
1083 if (Visited.count(&*LastSALU))
1084 continue;
1086 if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1087 continue;
1089 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1094 } // namespace
1096 void GCNSubtarget::getPostRAMutations(
1097 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1098 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1101 std::unique_ptr<ScheduleDAGMutation>
1102 GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
1103 return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
1106 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1107 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1108 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1109 else
1110 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1113 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1114 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1115 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1116 else
1117 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));