//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
32 #define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);
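
  // The user feature string FS is appended after the defaults above; since
  // later entries take precedence, a user-specified "-load-store-opt" (for
  // example) overrides the "+load-store-opt" default.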

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    Gen(SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    MaxPrivateElementSize(0),

    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnablePRTStrictNull(false),

    HasSMemRealTime(false),
    HasFmaMixInsts(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAScalar(false),
    HasSDWAOutModsVOPC(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
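  // As a concrete illustration: with the usual wavefront size of 64, compute
  // kernels default to a flat work group size range of [128, 256], graphics
  // shader stages to [1, 64], and everything else to [1, 1024].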
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
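  // For example, a workitem.id.x query with MaxSize of 256 gets !range [0, 256),
  // while a local-size query keeps MinSize and uses MaxSize + 1 as the bound.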
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;
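
  // Arguments are laid out back to back at their ABI alignment; e.g. a
  // hypothetical (i32, double) kernel places the i32 at offset 0 and the
  // double at offset 8, giving 16 explicit bytes with MaxAlign of 8.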

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  HasVertexCache(false),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
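    // For instance, two back-to-back DS loads in the input order end up with
    // an artificial edge between them, so nothing is scheduled in between.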
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}