//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

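// Compose the final feature string for R600: start from the always-on
// defaults, then append the user-provided string so explicit user settings
// take precedence, and parse the result.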
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
  //
  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

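// Before GFX10 a VALU instruction may read at most one constant bus operand
// (an SGPR or a literal). GFX10 raises the limit to two, except for the
// 64-bit shifts listed below, which still accept only one.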
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

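// Largest LDS allocation that still sustains NWaves waves per EU: local
// memory on a CU is shared by all work groups resident there, so the budget
// scales with the ratio of resident waves to resident work groups.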
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

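// The inverse question: given an LDS footprint in bytes, how many waves per
// EU can still be resident? The result is clamped to [1, MaxWavesPerEU].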
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

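// Default flat work group size range by calling convention: compute kernels
// default to a multi-wave range, graphics shaders to at most one wave, and
// everything else is capped at 16 waves.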
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

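// Attach !range metadata to a work-item id or local-size intrinsic call,
// derived from the kernel's flat work group size range and, if present, its
// reqd_work_group_size metadata. Returns false if no useful bound is known.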
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the maximum
  // size as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

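// Sum of the ABI-aligned sizes of the kernel's explicit arguments; the
// largest alignment seen is reported through MaxAlign.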
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

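// Total kernarg segment size: the explicit arguments plus any target-implicit
// offset and implicit arguments, rounded up to a 4-byte multiple.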
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

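// MAD_F16 is available only if the pseudo instruction maps to an actual MC
// opcode on this subtarget.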
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

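// Occupancy (waves per EU) as limited by SGPR usage; the break points differ
// between the SI/CI and VI+ SGPR files. On GFX10 SGPRs no longer limit
// occupancy.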
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

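// Occupancy as limited by VGPR usage: round the count up to the allocation
// granule, then see how many such allocations fit in the VGPR file.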
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(getTotalNumVGPRs() / RoundedRegs, MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

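// Combined occupancy: the minimum of the LDS-limited value and, when register
// counts are given, the SGPR- and VGPR-limited values. A count of zero means
// that register file imposes no limit.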
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
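// Post-RA DAG mutation that chains consecutive memory operations of the same
// kind (VMEM, FLAT, SMRD or DS) with artificial edges so the scheduler keeps
// them together.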
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-processing the SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

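// Post-RA DAG mutation that fills the latency shadow of long-running MFMA
// instructions with independent SALU work, keeping the scalar unit busy
// instead of adding VALU work that would increase power draw.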
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

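  // Returns true if an artificial edge from Pred to Succ can be added without
  // creating a cycle, i.e. there is no existing path from Succ back to Pred.
  // The node-number check is a cheap early-out; otherwise walk the DAG.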
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Links up to
  // MaxChain instructions and returns the size of the chain.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to make available SALU
    // instructions depend on them, giving those SALU instructions a chance to
    // fill the MFMA shadow. Filling the shadow with SALU rather than VALU
    // instructions prevents power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}