lib/Target/AMDGPU/AMDGPUSubtarget.cpp

   1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "AMDGPUSubtarget.h"
  15 #include "AMDGPU.h"
  16 #include "AMDGPUTargetMachine.h"
  17 #include "AMDGPUCallLowering.h"
  18 #include "AMDGPUInstructionSelector.h"
  19 #include "AMDGPULegalizerInfo.h"
  20 #include "AMDGPURegisterBankInfo.h"
  21 #include "SIMachineFunctionInfo.h"
  22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  23 #include "llvm/ADT/SmallString.h"
  24 #include "llvm/CodeGen/MachineScheduler.h"
  25 #include "llvm/MC/MCSubtargetInfo.h"
  26 #include "llvm/IR/MDBuilder.h"
  27 #include "llvm/CodeGen/TargetFrameLowering.h"
  28 #include <algorithm>
  29
  30 using namespace llvm;
  31
  32 #define DEBUG_TYPE "amdgpu-subtarget"
  33
  34 #define GET_SUBTARGETINFO_TARGET_DESC
  35 #define GET_SUBTARGETINFO_CTOR
  36 #define AMDGPUSubtarget GCNSubtarget
  37 #include "AMDGPUGenSubtargetInfo.inc"
  38 #define GET_SUBTARGETINFO_TARGET_DESC
  39 #define GET_SUBTARGETINFO_CTOR
  40 #undef AMDGPUSubtarget
  41 #include "R600GenSubtargetInfo.inc"
  42
  43 static cl::opt<bool> DisablePowerSched(
  44   "amdgpu-disable-power-sched",
  45   cl::desc("Disable scheduling to minimize mAI power bursts"),
  46   cl::init(false));
  47
  48 GCNSubtarget::~GCNSubtarget() = default;
  49
  50 R600Subtarget &
  51 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
  52                                                StringRef GPU, StringRef FS) {
  53   SmallString<256> FullFS("+promote-alloca,");
  54   FullFS += FS;
  55   ParseSubtargetFeatures(GPU, FullFS);
  56
  57   // FIXME: I don't think think Evergreen has any useful support for
  58   // denormals, but should be checked. Should we issue a warning somewhere
  59   // if someone tries to enable these?
  60   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
  61     FP32Denormals = false;
  62   }
  63
  64   HasMulU24 = getGeneration() >= EVERGREEN;
  65   HasMulI24 = hasCaymanISA();
  66
  67   return *this;
  68 }
  69
  70 GCNSubtarget &
  71 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
  72                                               StringRef GPU, StringRef FS) {
  73   // Determine default and user-specified characteristics
  74   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  75   // enabled, but some instructions do not respect them and they run at the
  76   // double precision rate, so don't enable by default.
  77   //
  78   // We want to be able to turn these off, but making this a subtarget feature
  79   // for SI has the unhelpful behavior that it unsets everything else if you
  80   // disable it.
  81   //
  82   // Similarly we want enable-prt-strict-null to be on by default and not to
  83   // unset everything else if it is disabled
  84
  85   // Assuming ECC is enabled is the conservative default.
  86   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
  87
  88   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
  89     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
  90
  91   // FIXME: I don't think think Evergreen has any useful support for
  92   // denormals, but should be checked. Should we issue a warning somewhere
  93   // if someone tries to enable these?
  94   if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
  95     FullFS += "+fp64-fp16-denormals,";
  96   } else {
  97     FullFS += "-fp32-denormals,";
  98   }
  99
 100   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
 101
 102   // Disable mutually exclusive bits.
 103   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
 104     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
 105       FullFS += "-wavefrontsize16,";
 106     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
 107       FullFS += "-wavefrontsize32,";
 108     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
 109       FullFS += "-wavefrontsize64,";
 110   }
 111
 112   FullFS += FS;
 113
 114   ParseSubtargetFeatures(GPU, FullFS);
 115
 116   // We don't support FP64 for EG/NI atm.
 117   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
 118
 119   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
 120   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
 121   // variants of MUBUF instructions.
 122   if (!hasAddr64() && !FS.contains("flat-for-global")) {
 123     FlatForGlobal = true;
 124   }
 125
 126   // Set defaults if needed.
 127   if (MaxPrivateElementSize == 0)
 128     MaxPrivateElementSize = 4;
 129
 130   if (LDSBankCount == 0)
 131     LDSBankCount = 32;
 132
 133   if (TT.getArch() == Triple::amdgcn) {
 134     if (LocalMemorySize == 0)
 135       LocalMemorySize = 32768;
 136
 137     // Do something sensible for unspecified target.
 138     if (!HasMovrel && !HasVGPRIndexMode)
 139       HasMovrel = true;
 140   }
 141
 142   // Don't crash on invalid devices.
 143   if (WavefrontSize == 0)
 144     WavefrontSize = 64;
 145
 146   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
 147
 148   if (DoesNotSupportXNACK && EnableXNACK) {
 149     ToggleFeature(AMDGPU::FeatureXNACK);
 150     EnableXNACK = false;
 151   }
 152
 153   // ECC is on by default, but turn it off if the hardware doesn't support it
 154   // anyway. This matters for the gfx9 targets with d16 loads, but don't support
 155   // ECC.
 156   if (DoesNotSupportSRAMECC && EnableSRAMECC) {
 157     ToggleFeature(AMDGPU::FeatureSRAMECC);
 158     EnableSRAMECC = false;
 159   }
 160
 161   return *this;
 162 }
 163
 164 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
 165   TargetTriple(TT),
 166   Has16BitInsts(false),
 167   HasMadMixInsts(false),
 168   FP32Denormals(false),
 169   FPExceptions(false),
 170   HasSDWA(false),
 171   HasVOP3PInsts(false),
 172   HasMulI24(true),
 173   HasMulU24(true),
 174   HasInv2PiInlineImm(false),
 175   HasFminFmaxLegacy(true),
 176   EnablePromoteAlloca(false),
 177   HasTrigReducedRange(false),
 178   MaxWavesPerEU(10),
 179   LocalMemorySize(0),
 180   WavefrontSize(0)
 181   { }
 182
 183 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 184                            const GCNTargetMachine &TM) :
 185     AMDGPUGenSubtargetInfo(TT, GPU, FS),
 186     AMDGPUSubtarget(TT),
 187     TargetTriple(TT),
 188     Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
 189     InstrItins(getInstrItineraryForCPU(GPU)),
 190     LDSBankCount(0),
 191     MaxPrivateElementSize(0),
 192
 193     FastFMAF32(false),
 194     HalfRate64Ops(false),
 195
 196     FP64FP16Denormals(false),
 197     FlatForGlobal(false),
 198     AutoWaitcntBeforeBarrier(false),
 199     CodeObjectV3(false),
 200     UnalignedScratchAccess(false),
 201     UnalignedBufferAccess(false),
 202
 203     HasApertureRegs(false),
 204     EnableXNACK(false),
 205     DoesNotSupportXNACK(false),
 206     EnableCuMode(false),
 207     TrapHandler(false),
 208
 209     EnableLoadStoreOpt(false),
 210     EnableUnsafeDSOffsetFolding(false),
 211     EnableSIScheduler(false),
 212     EnableDS128(false),
 213     EnablePRTStrictNull(false),
 214     DumpCode(false),
 215
 216     FP64(false),
 217     GCN3Encoding(false),
 218     CIInsts(false),
 219     GFX8Insts(false),
 220     GFX9Insts(false),
 221     GFX10Insts(false),
 222     GFX7GFX8GFX9Insts(false),
 223     SGPRInitBug(false),
 224     HasSMemRealTime(false),
 225     HasIntClamp(false),
 226     HasFmaMixInsts(false),
 227     HasMovrel(false),
 228     HasVGPRIndexMode(false),
 229     HasScalarStores(false),
 230     HasScalarAtomics(false),
 231     HasSDWAOmod(false),
 232     HasSDWAScalar(false),
 233     HasSDWASdst(false),
 234     HasSDWAMac(false),
 235     HasSDWAOutModsVOPC(false),
 236     HasDPP(false),
 237     HasDPP8(false),
 238     HasR128A16(false),
 239     HasNSAEncoding(false),
 240     HasDLInsts(false),
 241     HasDot1Insts(false),
 242     HasDot2Insts(false),
 243     HasDot3Insts(false),
 244     HasDot4Insts(false),
 245     HasDot5Insts(false),
 246     HasDot6Insts(false),
 247     HasMAIInsts(false),
 248     HasPkFmacF16Inst(false),
 249     HasAtomicFaddInsts(false),
 250     EnableSRAMECC(false),
 251     DoesNotSupportSRAMECC(false),
 252     HasNoSdstCMPX(false),
 253     HasVscnt(false),
 254     HasRegisterBanking(false),
 255     HasVOP3Literal(false),
 256     HasNoDataDepHazard(false),
 257     FlatAddressSpace(false),
 258     FlatInstOffsets(false),
 259     FlatGlobalInsts(false),
 260     FlatScratchInsts(false),
 261     ScalarFlatScratchInsts(false),
 262     AddNoCarryInsts(false),
 263     HasUnpackedD16VMem(false),
 264     LDSMisalignedBug(false),
 265     HasMFMAInlineLiteralBug(false),
 266
 267     ScalarizeGlobal(false),
 268
 269     HasVcmpxPermlaneHazard(false),
 270     HasVMEMtoScalarWriteHazard(false),
 271     HasSMEMtoVectorWriteHazard(false),
 272     HasInstFwdPrefetchBug(false),
 273     HasVcmpxExecWARHazard(false),
 274     HasLdsBranchVmemWARHazard(false),
 275     HasNSAtoVMEMBug(false),
 276     HasOffset3fBug(false),
 277     HasFlatSegmentOffsetBug(false),
 278
 279     FeatureDisable(false),
 280     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
 281     TLInfo(TM, *this),
 282     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
 283   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
 284   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
 285   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
 286   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
 287   InstSelector.reset(new AMDGPUInstructionSelector(
 288   *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
 289 }
 290
 291 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
 292   if (getGeneration() < GFX10)
 293     return 1;
 294
 295   switch (Opcode) {
 296   case AMDGPU::V_LSHLREV_B64:
 297   case AMDGPU::V_LSHLREV_B64_gfx10:
 298   case AMDGPU::V_LSHL_B64:
 299   case AMDGPU::V_LSHRREV_B64:
 300   case AMDGPU::V_LSHRREV_B64_gfx10:
 301   case AMDGPU::V_LSHR_B64:
 302   case AMDGPU::V_ASHRREV_I64:
 303   case AMDGPU::V_ASHRREV_I64_gfx10:
 304   case AMDGPU::V_ASHR_I64:
 305     return 1;
 306   }
 307
 308   return 2;
 309 }
 310
 311 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
 312   const Function &F) const {
 313   if (NWaves == 1)
 314     return getLocalMemorySize();
 315   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
 316   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
 317   if (!WorkGroupsPerCu)
 318     return 0;
 319   unsigned MaxWaves = getMaxWavesPerEU();
 320   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
 321 }
 322
 323 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
 324   const Function &F) const {
 325   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
 326   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
 327   if (!WorkGroupsPerCu)
 328     return 0;
 329   unsigned MaxWaves = getMaxWavesPerEU();
 330   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
 331   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
 332   NumWaves = std::min(NumWaves, MaxWaves);
 333   NumWaves = std::max(NumWaves, 1u);
 334   return NumWaves;
 335 }
 336
 337 unsigned
 338 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
 339   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
 340   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
 341 }
 342
 343 std::pair<unsigned, unsigned>
 344 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
 345   switch (CC) {
 346   case CallingConv::AMDGPU_CS:
 347   case CallingConv::AMDGPU_KERNEL:
 348   case CallingConv::SPIR_KERNEL:
 349     return std::make_pair(getWavefrontSize() * 2,
 350                           std::max(getWavefrontSize() * 4, 256u));
 351   case CallingConv::AMDGPU_VS:
 352   case CallingConv::AMDGPU_LS:
 353   case CallingConv::AMDGPU_HS:
 354   case CallingConv::AMDGPU_ES:
 355   case CallingConv::AMDGPU_GS:
 356   case CallingConv::AMDGPU_PS:
 357     return std::make_pair(1, getWavefrontSize());
 358   default:
 359     return std::make_pair(1, 16 * getWavefrontSize());
 360   }
 361 }
 362
 363 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
 364   const Function &F) const {
 365   // FIXME: 1024 if function.
 366   // Default minimum/maximum flat work group sizes.
 367   std::pair<unsigned, unsigned> Default =
 368     getDefaultFlatWorkGroupSize(F.getCallingConv());
 369
 370   // Requested minimum/maximum flat work group sizes.
 371   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
 372     F, "amdgpu-flat-work-group-size", Default);
 373
 374   // Make sure requested minimum is less than requested maximum.
 375   if (Requested.first > Requested.second)
 376     return Default;
 377
 378   // Make sure requested values do not violate subtarget's specifications.
 379   if (Requested.first < getMinFlatWorkGroupSize())
 380     return Default;
 381   if (Requested.second > getMaxFlatWorkGroupSize())
 382     return Default;
 383
 384   return Requested;
 385 }
 386
 387 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
 388   const Function &F) const {
 389   // Default minimum/maximum number of waves per execution unit.
 390   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
 391
 392   // Default/requested minimum/maximum flat work group sizes.
 393   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
 394
 395   // If minimum/maximum flat work group sizes were explicitly requested using
 396   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
 397   // number of waves per execution unit to values implied by requested
 398   // minimum/maximum flat work group sizes.
 399   unsigned MinImpliedByFlatWorkGroupSize =
 400     getMaxWavesPerEU(FlatWorkGroupSizes.second);
 401   bool RequestedFlatWorkGroupSize = false;
 402
 403   if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
 404     Default.first = MinImpliedByFlatWorkGroupSize;
 405     RequestedFlatWorkGroupSize = true;
 406   }
 407
 408   // Requested minimum/maximum number of waves per execution unit.
 409   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
 410     F, "amdgpu-waves-per-eu", Default, true);
 411
 412   // Make sure requested minimum is less than requested maximum.
 413   if (Requested.second && Requested.first > Requested.second)
 414     return Default;
 415
 416   // Make sure requested values do not violate subtarget's specifications.
 417   if (Requested.first < getMinWavesPerEU() ||
 418       Requested.first > getMaxWavesPerEU())
 419     return Default;
 420   if (Requested.second > getMaxWavesPerEU())
 421     return Default;
 422
 423   // Make sure requested values are compatible with values implied by requested
 424   // minimum/maximum flat work group sizes.
 425   if (RequestedFlatWorkGroupSize &&
 426       Requested.first < MinImpliedByFlatWorkGroupSize)
 427     return Default;
 428
 429   return Requested;
 430 }
 431
 432 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
 433   Function *Kernel = I->getParent()->getParent();
 434   unsigned MinSize = 0;
 435   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
 436   bool IdQuery = false;
 437
 438   // If reqd_work_group_size is present it narrows value down.
 439   if (auto *CI = dyn_cast<CallInst>(I)) {
 440     const Function *F = CI->getCalledFunction();
 441     if (F) {
 442       unsigned Dim = UINT_MAX;
 443       switch (F->getIntrinsicID()) {
 444       case Intrinsic::amdgcn_workitem_id_x:
 445       case Intrinsic::r600_read_tidig_x:
 446         IdQuery = true;
 447         LLVM_FALLTHROUGH;
 448       case Intrinsic::r600_read_local_size_x:
 449         Dim = 0;
 450         break;
 451       case Intrinsic::amdgcn_workitem_id_y:
 452       case Intrinsic::r600_read_tidig_y:
 453         IdQuery = true;
 454         LLVM_FALLTHROUGH;
 455       case Intrinsic::r600_read_local_size_y:
 456         Dim = 1;
 457         break;
 458       case Intrinsic::amdgcn_workitem_id_z:
 459       case Intrinsic::r600_read_tidig_z:
 460         IdQuery = true;
 461         LLVM_FALLTHROUGH;
 462       case Intrinsic::r600_read_local_size_z:
 463         Dim = 2;
 464         break;
 465       default:
 466         break;
 467       }
 468       if (Dim <= 3) {
 469         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
 470           if (Node->getNumOperands() == 3)
 471             MinSize = MaxSize = mdconst::extract<ConstantInt>(
 472                                   Node->getOperand(Dim))->getZExtValue();
 473       }
 474     }
 475   }
 476
 477   if (!MaxSize)
 478     return false;
 479
 480   // Range metadata is [Lo, Hi). For ID query we need to pass max size
 481   // as Hi. For size query we need to pass Hi + 1.
 482   if (IdQuery)
 483     MinSize = 0;
 484   else
 485     ++MaxSize;
 486
 487   MDBuilder MDB(I->getContext());
 488   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
 489                                                   APInt(32, MaxSize));
 490   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 491   return true;
 492 }
 493
 494 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
 495                                                  unsigned &MaxAlign) const {
 496   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
 497          F.getCallingConv() == CallingConv::SPIR_KERNEL);
 498
 499   const DataLayout &DL = F.getParent()->getDataLayout();
 500   uint64_t ExplicitArgBytes = 0;
 501   MaxAlign = 1;
 502
 503   for (const Argument &Arg : F.args()) {
 504     Type *ArgTy = Arg.getType();
 505
 506     unsigned Align = DL.getABITypeAlignment(ArgTy);
 507     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
 508     ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
 509     MaxAlign = std::max(MaxAlign, Align);
 510   }
 511
 512   return ExplicitArgBytes;
 513 }
 514
 515 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
 516                                                 unsigned &MaxAlign) const {
 517   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
 518
 519   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
 520
 521   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
 522   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
 523   if (ImplicitBytes != 0) {
 524     unsigned Alignment = getAlignmentForImplicitArgPtr();
 525     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
 526   }
 527
 528   // Being able to dereference past the end is useful for emitting scalar loads.
 529   return alignTo(TotalSize, 4);
 530 }
 531
 532 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
 533                              const TargetMachine &TM) :
 534   R600GenSubtargetInfo(TT, GPU, FS),
 535   AMDGPUSubtarget(TT),
 536   InstrInfo(*this),
 537   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
 538   FMA(false),
 539   CaymanISA(false),
 540   CFALUBug(false),
 541   HasVertexCache(false),
 542   R600ALUInst(false),
 543   FP64(false),
 544   TexVTXClauseSize(0),
 545   Gen(R600),
 546   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
 547   InstrItins(getInstrItineraryForCPU(GPU)) { }
 548
 549 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
 550                                       unsigned NumRegionInstrs) const {
 551   // Track register pressure so the scheduler can try to decrease
 552   // pressure once register usage is above the threshold defined by
 553   // SIRegisterInfo::getRegPressureSetLimit()
 554   Policy.ShouldTrackPressure = true;
 555
 556   // Enabling both top down and bottom up scheduling seems to give us less
 557   // register spills than just using one of these approaches on its own.
 558   Policy.OnlyTopDown = false;
 559   Policy.OnlyBottomUp = false;
 560
 561   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
 562   if (!enableSIScheduler())
 563     Policy.ShouldTrackLaneMasks = true;
 564 }
 565
 566 bool GCNSubtarget::hasMadF16() const {
 567   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
 568 }
 569
 570 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
 571   if (getGeneration() >= AMDGPUSubtarget::GFX10)
 572     return getMaxWavesPerEU();
 573
 574   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
 575     if (SGPRs <= 80)
 576       return 10;
 577     if (SGPRs <= 88)
 578       return 9;
 579     if (SGPRs <= 100)
 580       return 8;
 581     return 7;
 582   }
 583   if (SGPRs <= 48)
 584     return 10;
 585   if (SGPRs <= 56)
 586     return 9;
 587   if (SGPRs <= 64)
 588     return 8;
 589   if (SGPRs <= 72)
 590     return 7;
 591   if (SGPRs <= 80)
 592     return 6;
 593   return 5;
 594 }
 595
 596 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
 597   unsigned MaxWaves = getMaxWavesPerEU();
 598   unsigned Granule = getVGPRAllocGranule();
 599   if (VGPRs < Granule)
 600     return MaxWaves;
 601   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
 602   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
 603 }
 604
 605 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
 606   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
 607   if (getGeneration() >= AMDGPUSubtarget::GFX10)
 608     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
 609
 610   if (MFI.hasFlatScratchInit()) {
 611     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
 612       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
 613     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
 614       return 4; // FLAT_SCRATCH, VCC (in that order).
 615   }
 616
 617   if (isXNACKEnabled())
 618     return 4; // XNACK, VCC (in that order).
 619   return 2; // VCC.
 620 }
 621
 622 unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
 623                                         unsigned LDSSize,
 624                                         unsigned NumSGPRs,
 625                                         unsigned NumVGPRs) const {
 626   unsigned Occupancy =
 627     std::min(getMaxWavesPerEU(),
 628              getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
 629   if (NumSGPRs)
 630     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
 631   if (NumVGPRs)
 632     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
 633   return Occupancy;
 634 }
 635
 636 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
 637   const Function &F = MF.getFunction();
 638   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
 639
 640   // Compute maximum number of SGPRs function can use using default/requested
 641   // minimum number of waves per execution unit.
 642   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
 643   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
 644   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
 645
 646   // Check if maximum number of SGPRs was explicitly requested using
 647   // "amdgpu-num-sgpr" attribute.
 648   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
 649     unsigned Requested = AMDGPU::getIntegerAttribute(
 650       F, "amdgpu-num-sgpr", MaxNumSGPRs);
 651
 652     // Make sure requested value does not violate subtarget's specifications.
 653     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
 654       Requested = 0;
 655
 656     // If more SGPRs are required to support the input user/system SGPRs,
 657     // increase to accommodate them.
 658     //
 659     // FIXME: This really ends up using the requested number of SGPRs + number
 660     // of reserved special registers in total. Theoretically you could re-use
 661     // the last input registers for these special registers, but this would
 662     // require a lot of complexity to deal with the weird aliasing.
 663     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
 664     if (Requested && Requested < InputNumSGPRs)
 665       Requested = InputNumSGPRs;
 666
 667     // Make sure requested value is compatible with values implied by
 668     // default/requested minimum/maximum number of waves per execution unit.
 669     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
 670       Requested = 0;
 671     if (WavesPerEU.second &&
 672         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
 673       Requested = 0;
 674
 675     if (Requested)
 676       MaxNumSGPRs = Requested;
 677   }
 678
 679   if (hasSGPRInitBug())
 680     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
 681
 682   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
 683                   MaxAddressableNumSGPRs);
 684 }
 685
 686 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
 687   const Function &F = MF.getFunction();
 688   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
 689
 690   // Compute maximum number of VGPRs function can use using default/requested
 691   // minimum number of waves per execution unit.
 692   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
 693   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
 694
 695   // Check if maximum number of VGPRs was explicitly requested using
 696   // "amdgpu-num-vgpr" attribute.
 697   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
 698     unsigned Requested = AMDGPU::getIntegerAttribute(
 699       F, "amdgpu-num-vgpr", MaxNumVGPRs);
 700
 701     // Make sure requested value is compatible with values implied by
 702     // default/requested minimum/maximum number of waves per execution unit.
 703     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
 704       Requested = 0;
 705     if (WavesPerEU.second &&
 706         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
 707       Requested = 0;
 708
 709     if (Requested)
 710       MaxNumVGPRs = Requested;
 711   }
 712
 713   return MaxNumVGPRs;
 714 }
 715
 716 namespace {
 717 struct MemOpClusterMutation : ScheduleDAGMutation {
 718   const SIInstrInfo *TII;
 719
 720   MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
 721
 722   void apply(ScheduleDAGInstrs *DAG) override {
 723     SUnit *SUa = nullptr;
 724     // Search for two consequent memory operations and link them
 725     // to prevent scheduler from moving them apart.
 726     // In DAG pre-process SUnits are in the original order of
 727     // the instructions before scheduling.
 728     for (SUnit &SU : DAG->SUnits) {
 729       MachineInstr &MI2 = *SU.getInstr();
 730       if (!MI2.mayLoad() && !MI2.mayStore()) {
 731         SUa = nullptr;
 732         continue;
 733       }
 734       if (!SUa) {
 735         SUa = &SU;
 736         continue;
 737       }
 738
 739       MachineInstr &MI1 = *SUa->getInstr();
 740       if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
 741           (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
 742           (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
 743           (TII->isDS(MI1)   && TII->isDS(MI2))) {
 744         SU.addPredBarrier(SUa);
 745
 746         for (const SDep &SI : SU.Preds) {
 747           if (SI.getSUnit() != SUa)
 748             SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
 749         }
 750
 751         if (&SU != &DAG->ExitSU) {
 752           for (const SDep &SI : SUa->Succs) {
 753             if (SI.getSUnit() != &SU)
 754               SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
 755           }
 756         }
 757       }
 758
 759       SUa = &SU;
 760     }
 761   }
 762 };
 763
 764 struct FillMFMAShadowMutation : ScheduleDAGMutation {
 765   const SIInstrInfo *TII;
 766
 767   ScheduleDAGMI *DAG;
 768
 769   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
 770
 771   bool isSALU(const SUnit *SU) const {
 772     const MachineInstr *MI = SU->getInstr();
 773     return MI && TII->isSALU(*MI) && !MI->isTerminator();
 774   }
 775
 776   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
 777     if (Pred->NodeNum < Succ->NodeNum)
 778       return true;
 779
 780     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
 781
 782     for (unsigned I = 0; I < Succs.size(); ++I) {
 783       for (const SDep &SI : Succs[I]->Succs) {
 784         const SUnit *SU = SI.getSUnit();
 785         if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
 786           Succs.push_back(SU);
 787       }
 788     }
 789
 790     SmallPtrSet<const SUnit*, 32> Visited;
 791     while (!Preds.empty()) {
 792       const SUnit *SU = Preds.pop_back_val();
 793       if (llvm::find(Succs, SU) != Succs.end())
 794         return false;
 795       Visited.insert(SU);
 796       for (const SDep &SI : SU->Preds)
 797         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
 798           Preds.push_back(SI.getSUnit());
 799     }
 800
 801     return true;
 802   }
 803
 804   // Link as much SALU intructions in chain as possible. Return the size
 805   // of the chain. Links up to MaxChain instructions.
 806   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
 807                          SmallPtrSetImpl<SUnit *> &Visited) const {
 808     SmallVector<SUnit *, 8> Worklist({To});
 809     unsigned Linked = 0;
 810
 811     while (!Worklist.empty() && MaxChain-- > 0) {
 812       SUnit *SU = Worklist.pop_back_val();
 813       if (!Visited.insert(SU).second)
 814         continue;
 815
 816       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
 817                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
 818
 819       if (SU->addPred(SDep(From, SDep::Artificial), false))
 820         ++Linked;
 821
 822       for (SDep &SI : From->Succs) {
 823         SUnit *SUv = SI.getSUnit();
 824         if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
 825           SUv->addPred(SDep(SU, SDep::Artificial), false);
 826       }
 827
 828       for (SDep &SI : SU->Succs) {
 829         SUnit *Succ = SI.getSUnit();
 830         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
 831           Worklist.push_back(Succ);
 832       }
 833     }
 834
 835     return Linked;
 836   }
 837
 838   void apply(ScheduleDAGInstrs *DAGInstrs) override {
 839     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
 840     if (!ST.hasMAIInsts() || DisablePowerSched)
 841       return;
 842     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
 843     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
 844     if (!TSchedModel || DAG->SUnits.empty())
 845       return;
 846
 847     // Scan for MFMA long latency instructions and try to add a dependency
 848     // of available SALU instructions to give them a chance to fill MFMA
 849     // shadow. That is desirable to fill MFMA shadow with SALU instructions
 850     // rather than VALU to prevent power consumption bursts and throttle.
 851     auto LastSALU = DAG->SUnits.begin();
 852     auto E = DAG->SUnits.end();
 853     SmallPtrSet<SUnit*, 32> Visited;
 854     for (SUnit &SU : DAG->SUnits) {
 855       MachineInstr &MAI = *SU.getInstr();
 856       if (!TII->isMAI(MAI) ||
 857            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
 858            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
 859         continue;
 860
 861       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
 862
 863       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
 864                  dbgs() << "Need " << Lat
 865                         << " instructions to cover latency.\n");
 866
 867       // Find up to Lat independent scalar instructions as early as
 868       // possible such that they can be scheduled after this MFMA.
 869       for ( ; Lat && LastSALU != E; ++LastSALU) {
 870         if (Visited.count(&*LastSALU))
 871           continue;
 872
 873         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
 874           continue;
 875
 876         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
 877       }
 878     }
 879   }
 880 };
 881 } // namespace
 882
 883 void GCNSubtarget::getPostRAMutations(
 884     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
 885   Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
 886   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
 887 }
 888
 889 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
 890   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
 891     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
 892   else
 893     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
 894 }
 895
 896 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
 897   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
 898     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
 899   else
 900     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
 901 }