lib/Target/AMDGPU/AMDGPUSubtarget.h

   1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //==-----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// AMDGPU specific subclass of TargetSubtarget.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
  15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
  16
  17 #include "AMDGPU.h"
  18 #include "AMDGPUCallLowering.h"
  19 #include "R600FrameLowering.h"
  20 #include "R600ISelLowering.h"
  21 #include "R600InstrInfo.h"
  22 #include "SIFrameLowering.h"
  23 #include "SIISelLowering.h"
  24 #include "SIInstrInfo.h"
  25 #include "Utils/AMDGPUBaseInfo.h"
  26 #include "llvm/ADT/Triple.h"
  27 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
  28 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
  29 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
  30 #include "llvm/CodeGen/MachineFunction.h"
  31 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
  32 #include "llvm/MC/MCInstrItineraries.h"
  33 #include "llvm/Support/MathExtras.h"
  34 #include <cassert>
  35 #include <cstdint>
  36 #include <memory>
  37 #include <utility>
  38
  39 #define GET_SUBTARGETINFO_HEADER
  40 #include "AMDGPUGenSubtargetInfo.inc"
  41 #define GET_SUBTARGETINFO_HEADER
  42 #include "R600GenSubtargetInfo.inc"
  43
  44 namespace llvm {
  45
  46 class StringRef;
  47
  48 class AMDGPUSubtarget {
       // State and queries shared by the two concrete subtargets declared later
       // in this header (GCNSubtarget and R600Subtarget). Both derive from this
       // class and implement the pure virtual work-group / wave limits.
  49 public:
       /// Hardware generations. The numeric order matters: the derived classes
       /// compare getGeneration() against these values with <, >= and friends.
  50   enum Generation {
  51     R600 = 0,
  52     R700 = 1,
  53     EVERGREEN = 2,
  54     NORTHERN_ISLANDS = 3,
  55     SOUTHERN_ISLANDS = 4,
  56     SEA_ISLANDS = 5,
  57     VOLCANIC_ISLANDS = 6,
  58     GFX9 = 7
  59   };
  60
  61 private:
       // Target triple; its OS component is what the is*OS() queries inspect.
  62   Triple TargetTriple;
  63
  64 protected:
       // Feature flags and sizes exposed through the accessors below. Nothing in
       // this header assigns them.
       // NOTE(review): presumably filled in by the derived classes' feature
       // parsing / constructors -- confirm against the .cpp file.
  65   bool Has16BitInsts;
  66   bool HasMadMixInsts;
  67   bool FP32Denormals;
  68   bool FPExceptions;
  69   bool HasSDWA;
  70   bool HasVOP3PInsts;
  71   bool HasMulI24;
  72   bool HasMulU24;
  73   bool HasInv2PiInlineImm;
  74   bool HasFminFmaxLegacy;
  75   bool EnablePromoteAlloca;
  76   bool HasTrigReducedRange;
  77   int LocalMemorySize;
  78   unsigned WavefrontSize;
  79
  80 public:
  81   AMDGPUSubtarget(const Triple &TT);
  82
       /// Look up the AMDGPUSubtarget for a machine function, or for an IR
       /// function under a given target machine.
  83   static const AMDGPUSubtarget &get(const MachineFunction &MF);
  84   static const AMDGPUSubtarget &get(const TargetMachine &TM,
  85                                     const Function &F);
  86
  87   /// \returns Default range flat work group size for a calling convention.
  88   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
  89
  90   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
  91   /// for function \p F, or minimum/maximum flat work group sizes explicitly
  92   /// requested using "amdgpu-flat-work-group-size" attribute attached to
  93   /// function \p F.
  94   ///
  95   /// \returns Subtarget's default values if explicitly requested values cannot
  96   /// be converted to integer, or violate subtarget's specifications.
  97   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
  98
  99   /// \returns Subtarget's default pair of minimum/maximum number of waves per
 100   /// execution unit for function \p F, or minimum/maximum number of waves per
 101   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
 102   /// attached to function \p F.
 103   ///
 104   /// \returns Subtarget's default values if explicitly requested values cannot
 105   /// be converted to integer, violate subtarget's specifications, or are not
 106   /// compatible with minimum/maximum number of waves limited by flat work group
 107   /// size, register usage, and/or lds usage.
 108   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
 109
 110   /// Return the amount of LDS that can be used that will not restrict the
 111   /// occupancy lower than WaveCount.
 112   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
 113                                            const Function &) const;
 114
 115   /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wavecount
 116   /// if the given LDS memory size is the only constraint.
 117   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
 118
       /// Overload taking a machine function instead of an explicit byte count.
 119   unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
 120
       /// \returns true if the OS component of the target triple is AMDHSA.
 121   bool isAmdHsaOS() const {
 122     return TargetTriple.getOS() == Triple::AMDHSA;
 123   }
 124
       /// \returns true if the OS component of the target triple is AMDPAL.
 125   bool isAmdPalOS() const {
 126     return TargetTriple.getOS() == Triple::AMDPAL;
 127   }
 128
       /// \returns true if the OS component of the target triple is Mesa3D.
 129   bool isMesa3DOS() const {
 130     return TargetTriple.getOS() == Triple::Mesa3D;
 131   }
 132
       /// \returns true on Mesa3D when the calling convention of \p F is not one
       /// that AMDGPU::isShader accepts.
 133   bool isMesaKernel(const Function &F) const {
 134     return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
 135   }
 136
       /// \returns true on AMDHSA, or when \p F is a Mesa kernel.
 137   bool isAmdHsaOrMesa(const Function &F) const {
 138     return isAmdHsaOS() || isMesaKernel(F);
 139   }
 140
       // Plain accessors for the protected feature flags declared above.
 141   bool has16BitInsts() const {
 142     return Has16BitInsts;
 143   }
 144
 145   bool hasMadMixInsts() const {
 146     return HasMadMixInsts;
 147   }
 148
 149   bool hasFP32Denormals() const {
 150     return FP32Denormals;
 151   }
 152
 153   bool hasFPExceptions() const {
 154     return FPExceptions;
 155   }
 156
 157   bool hasSDWA() const {
 158     return HasSDWA;
 159   }
 160
 161   bool hasVOP3PInsts() const {
 162     return HasVOP3PInsts;
 163   }
 164
 165   bool hasMulI24() const {
 166     return HasMulI24;
 167   }
 168
 169   bool hasMulU24() const {
 170     return HasMulU24;
 171   }
 172
 173   bool hasInv2PiInlineImm() const {
 174     return HasInv2PiInlineImm;
 175   }
 176
 177   bool hasFminFmaxLegacy() const {
 178     return HasFminFmaxLegacy;
 179   }
 180
 181   bool hasTrigReducedRange() const {
 182     return HasTrigReducedRange;
 183   }
 184
 185   bool isPromoteAllocaEnabled() const {
 186     return EnablePromoteAlloca;
 187   }
 188
 189   unsigned getWavefrontSize() const {
 190     return WavefrontSize;
 191   }
 192
 193   int getLocalMemorySize() const {
 194     return LocalMemorySize;
 195   }
 196
       /// \returns 8 on AMDHSA and 4 on every other OS.
 197   unsigned getAlignmentForImplicitArgPtr() const {
 198     return isAmdHsaOS() ? 8 : 4;
 199   }
 200
 201   /// \returns The offset in bytes from the start of the input buffer of the
 202   /// first explicit kernel argument: 0 when isAmdHsaOrMesa(\p F) holds and 36
       /// otherwise.
 203   unsigned getExplicitKernelArgOffset(const Function &F) const {
 204     return isAmdHsaOrMesa(F) ? 0 : 36;
 205   }
 206
 207   /// \returns Maximum number of work groups per compute unit supported by the
 208   /// subtarget and limited by given \p FlatWorkGroupSize.
 209   virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
 210
 211   /// \returns Minimum flat work group size supported by the subtarget.
 212   virtual unsigned getMinFlatWorkGroupSize() const = 0;
 213
 214   /// \returns Maximum flat work group size supported by the subtarget.
 215   virtual unsigned getMaxFlatWorkGroupSize() const = 0;
 216
 217   /// \returns Maximum number of waves per execution unit supported by the
 218   /// subtarget and limited by given \p FlatWorkGroupSize.
 219   virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const  = 0;
 220
 221   /// \returns Minimum number of waves per execution unit supported by the
 222   /// subtarget.
 223   virtual unsigned getMinWavesPerEU() const = 0;
 224
       /// Non-virtual overload without a work group size limit; returns the
       /// constant 10.
 225   unsigned getMaxWavesPerEU() const { return 10; }
 226
 227   /// Creates value range metadata on a workitemid.* intrinsic call or load.
 228   bool makeLIDRangeMetadata(Instruction *I) const;
 229
 230   /// \returns Number of bytes of arguments that are passed to a shader or
 231   /// kernel in addition to the explicit ones declared for the function.
       ///
       /// Mesa kernels always report 16. Otherwise the value comes from the
       /// "amdgpu-implicitarg-num-bytes" function attribute, queried through
       /// AMDGPU::getIntegerAttribute with 0 as its last argument (presumably the
       /// default when the attribute is absent -- confirm in AMDGPUBaseInfo).
 232   unsigned getImplicitArgNumBytes(const Function &F) const {
 233     if (isMesaKernel(F))
 234       return 16;
 235     return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
 236   }
       // Kernel argument size queries. \p MaxAlign is taken by non-const
       // reference; NOTE(review): presumably an output parameter -- confirm
       // against the definitions.
 237   uint64_t getExplicitKernArgSize(const Function &F,
 238                                   unsigned &MaxAlign) const;
 239   unsigned getKernArgSegmentSize(const Function &F,
 240                                  unsigned &MaxAlign) const;
 241
       // Virtual because the class declares virtual functions and is used as a
       // base class. NOTE(review): `= default` would be the idiomatic spelling.
 242   virtual ~AMDGPUSubtarget() {}
 243 };
 244
 245 class GCNSubtarget : public AMDGPUGenSubtargetInfo,
 246                      public AMDGPUSubtarget {
       // Concrete subtarget built on the tablegen-generated
       // AMDGPUGenSubtargetInfo and the shared AMDGPUSubtarget base. It holds
       // the SI instruction info, target lowering and frame lowering objects,
       // the GlobalISel helper objects, and the feature flags that the accessors
       // below expose.
 247 public:
       /// Values returned by getTrapHandlerAbi().
 248   enum TrapHandlerAbi {
 249     TrapHandlerAbiNone = 0,
 250     TrapHandlerAbiHsa = 1
 251   };
 252
       /// Trap identifier constants.
 253   enum TrapID {
 254     TrapIDHardwareReserved = 0,
 255     TrapIDHSADebugTrap = 1,
 256     TrapIDLLVMTrap = 2,
 257     TrapIDLLVMDebugTrap = 3,
 258     TrapIDDebugBreakpoint = 7,
 259     TrapIDDebugReserved8 = 8,
 260     TrapIDDebugReservedFE = 0xfe,
 261     TrapIDDebugReservedFF = 0xff
 262   };
 263
 264   enum TrapRegValues {
 265     LLVMTrapHandlerRegValue = 1
 266   };
 267
 268 private:
 269   /// GlobalISel related APIs.
 270   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
 271   std::unique_ptr<InstructionSelector> InstSelector;
 272   std::unique_ptr<LegalizerInfo> Legalizer;
 273   std::unique_ptr<RegisterBankInfo> RegBankInfo;
 274
 275 protected:
 276   // Basic subtarget description.
       // NOTE(review): AMDGPUSubtarget also declares a private TargetTriple; the
       // member below is a second, separate copy owned by this class.
 277   Triple TargetTriple;
       // Stored as unsigned; getGeneration() converts it to Generation.
 278   unsigned Gen;
 279   InstrItineraryData InstrItins;
 280   int LDSBankCount;
 281   unsigned MaxPrivateElementSize;
 282
 283   // Possibly statically set by tablegen, but may want to be overridden.
 284   bool FastFMAF32;
 285   bool HalfRate64Ops;
 286
 287   // Dynamically set bits that enable features.
 288   bool FP64FP16Denormals;
 289   bool DX10Clamp;
 290   bool FlatForGlobal;
 291   bool AutoWaitcntBeforeBarrier;
 292   bool CodeObjectV3;
 293   bool UnalignedScratchAccess;
 294   bool UnalignedBufferAccess;
 295   bool HasApertureRegs;
 296   bool EnableXNACK;
 297   bool TrapHandler;
 298   bool DebuggerInsertNops;
 299   bool DebuggerEmitPrologue;
 300
 301   // Used as options.
 302   bool EnableHugePrivateBuffer;
 303   bool EnableLoadStoreOpt;
 304   bool EnableUnsafeDSOffsetFolding;
 305   bool EnableSIScheduler;
 306   bool EnableDS128;
 307   bool EnablePRTStrictNull;
 308   bool DumpCode;
 309
 310   // Subtarget properties statically set by tablegen.
 311   bool FP64;
 312   bool FMA;
 313   bool MIMG_R128;
 314   bool IsGCN;
 315   bool GCN3Encoding;
 316   bool CIInsts;
 317   bool VIInsts;
 318   bool GFX9Insts;
 319   bool SGPRInitBug;
 320   bool HasSMemRealTime;
 321   bool HasIntClamp;
 322   bool HasFmaMixInsts;
 323   bool HasMovrel;
 324   bool HasVGPRIndexMode;
 325   bool HasScalarStores;
 326   bool HasScalarAtomics;
 327   bool HasSDWAOmod;
 328   bool HasSDWAScalar;
 329   bool HasSDWASdst;
 330   bool HasSDWAMac;
 331   bool HasSDWAOutModsVOPC;
 332   bool HasDPP;
 333   bool HasR128A16;
 334   bool HasDLInsts;
 335   bool HasDot1Insts;
 336   bool HasDot2Insts;
 337   bool EnableSRAMECC;
 338   bool FlatAddressSpace;
 339   bool FlatInstOffsets;
 340   bool FlatGlobalInsts;
 341   bool FlatScratchInsts;
 342   bool AddNoCarryInsts;
 343   bool HasUnpackedD16VMem;
 344   bool R600ALUInst;
 345   bool CaymanISA;
 346   bool CFALUBug;
 347   bool HasVertexCache;
 348   short TexVTXClauseSize;
       // Read and written through get/setScalarizeGlobalBehavior().
 349   bool ScalarizeGlobal;
 350
 351   // Dummy feature to use for assembler in tablegen.
 352   bool FeatureDisable;
 353
 354   SelectionDAGTargetInfo TSInfo;
 355 private:
       // Target hook objects handed out by getInstrInfo(), getTargetLowering()
       // and getFrameLowering().
 356   SIInstrInfo InstrInfo;
 357   SITargetLowering TLInfo;
 358   SIFrameLowering FrameLowering;
 359
 360 public:
 361   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 362                const GCNTargetMachine &TM);
 363   ~GCNSubtarget() override;
 364
 365   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
 366                                                    StringRef GPU, StringRef FS);
 367
       // Overrides returning the objects owned by this subtarget. The GlobalISel
       // getters return whatever the corresponding unique_ptr currently holds.
 368   const SIInstrInfo *getInstrInfo() const override {
 369     return &InstrInfo;
 370   }
 371
 372   const SIFrameLowering *getFrameLowering() const override {
 373     return &FrameLowering;
 374   }
 375
 376   const SITargetLowering *getTargetLowering() const override {
 377     return &TLInfo;
 378   }
 379
       // The register info lives inside the instruction info object.
 380   const SIRegisterInfo *getRegisterInfo() const override {
 381     return &InstrInfo.getRegisterInfo();
 382   }
 383
 384   const CallLowering *getCallLowering() const override {
 385     return CallLoweringInfo.get();
 386   }
 387
 388   const InstructionSelector *getInstructionSelector() const override {
 389     return InstSelector.get();
 390   }
 391
 392   const LegalizerInfo *getLegalizerInfo() const override {
 393     return Legalizer.get();
 394   }
 395
 396   const RegisterBankInfo *getRegBankInfo() const override {
 397     return RegBankInfo.get();
 398   }
 399
 400   // Nothing implemented, just prevent crashes on use.
 401   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
 402     return &TSInfo;
 403   }
 404
 405   const InstrItineraryData *getInstrItineraryData() const override {
 406     return &InstrItins;
 407   }
 408
 409   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 410
       /// \returns The stored generation number converted to Generation.
       // NOTE(review): C-style cast; static_cast<Generation>(Gen) would be the
       // idiomatic spelling.
 411   Generation getGeneration() const {
 412     return (Generation)Gen;
 413   }
 414
       /// \returns Log2_32 of the wavefront size.
 415   unsigned getWavefrontSizeLog2() const {
 416     return Log2_32(WavefrontSize);
 417   }
 418
 419   int getLDSBankCount() const {
 420     return LDSBankCount;
 421   }
 422
 423   unsigned getMaxPrivateElementSize() const {
 424     return MaxPrivateElementSize;
 425   }
 426
 427   bool hasIntClamp() const {
 428     return HasIntClamp;
 429   }
 430
 431   bool hasFP64() const {
 432     return FP64;
 433   }
 434
 435   bool hasMIMG_R128() const {
 436     return MIMG_R128;
 437   }
 438
       /// Reports the same FP64 flag as hasFP64().
 439   bool hasHWFP64() const {
 440     return FP64;
 441   }
 442
 443   bool hasFastFMAF32() const {
 444     return FastFMAF32;
 445   }
 446
 447   bool hasHalfRate64Ops() const {
 448     return HalfRate64Ops;
 449   }
 450
       /// \returns true for generations before VOLCANIC_ISLANDS.
 451   bool hasAddr64() const {
 452     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
 453   }
 454
       // The bit-operation queries below are unconditionally true for this
       // subtarget class (R600Subtarget gates the same queries on generation).
 455   bool hasBFE() const {
 456     return true;
 457   }
 458
 459   bool hasBFI() const {
 460     return true;
 461   }
 462
       /// Forwards to hasBFE().
 463   bool hasBFM() const {
 464     return hasBFE();
 465   }
 466
       /// \p Size is ignored; the answer is always true.
 467   bool hasBCNT(unsigned Size) const {
 468     return true;
 469   }
 470
 471   bool hasFFBL() const {
 472     return true;
 473   }
 474
 475   bool hasFFBH() const {
 476     return true;
 477   }
 478
       /// \returns true from GFX9 onwards.
 479   bool hasMed3_16() const {
 480     return getGeneration() >= AMDGPUSubtarget::GFX9;
 481   }
 482
       /// \returns true from GFX9 onwards.
 483   bool hasMin3Max3_16() const {
 484     return getGeneration() >= AMDGPUSubtarget::GFX9;
 485   }
 486
 487   bool hasFmaMixInsts() const {
 488     return HasFmaMixInsts;
 489   }
 490
 491   bool hasCARRY() const {
 492     return true;
 493   }
 494
 495   bool hasFMA() const {
 496     return FMA;
 497   }
 498
       /// Tied to the GFX9Insts flag.
 499   bool hasSwap() const {
 500     return GFX9Insts;
 501   }
 502
       /// \returns TrapHandlerAbiHsa on AMDHSA and TrapHandlerAbiNone on every
       /// other OS.
 503   TrapHandlerAbi getTrapHandlerAbi() const {
 504     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
 505   }
 506
 507   bool enableHugePrivateBuffer() const {
 508     return EnableHugePrivateBuffer;
 509   }
 510
 511   bool unsafeDSOffsetFoldingEnabled() const {
 512     return EnableUnsafeDSOffsetFolding;
 513   }
 514
 515   bool dumpCode() const {
 516     return DumpCode;
 517   }
 518
 519   /// Return the amount of LDS that can be used that will not restrict the
 520   /// occupancy lower than WaveCount.
       // NOTE(review): AMDGPUSubtarget declares a member with the same name and
       // signature; this declaration hides it for calls made through
       // GCNSubtarget.
 521   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
 522                                            const Function &) const;
 523
       // FP16 and FP64 denormal support share the single FP64FP16Denormals flag.
 524   bool hasFP16Denormals() const {
 525     return FP64FP16Denormals;
 526   }
 527
 528   bool hasFP64Denormals() const {
 529     return FP64FP16Denormals;
 530   }
 531
       /// \returns true from GFX9 onwards.
 532   bool supportsMinMaxDenormModes() const {
 533     return getGeneration() >= AMDGPUSubtarget::GFX9;
 534   }
 535
 536   bool enableDX10Clamp() const {
 537     return DX10Clamp;
 538   }
 539
       /// \returns true when the calling convention of the function behind \p MF
       /// satisfies AMDGPU::isCompute.
 540   bool enableIEEEBit(const MachineFunction &MF) const {
 541     return AMDGPU::isCompute(MF.getFunction().getCallingConv());
 542   }
 543
 544   bool useFlatForGlobal() const {
 545     return FlatForGlobal;
 546   }
 547
 548   /// \returns If target supports ds_read/write_b128 and user enables generation
 549   /// of ds_read/write_b128.
 550   bool useDS128() const {
 551     return CIInsts && EnableDS128;
 552   }
 553
 554   /// \returns If MUBUF instructions always perform range checking, even for
 555   /// buffer resources used for private memory access.
 556   bool privateMemoryResourceIsRangeChecked() const {
 557     return getGeneration() < AMDGPUSubtarget::GFX9;
 558   }
 559
 560   /// \returns If target requires PRT Strict NULL support (zero result registers
 561   /// for sparse texture support).
 562   bool usePRTStrictNull() const {
 563     return EnablePRTStrictNull;
 564   }
 565
 566   bool hasAutoWaitcntBeforeBarrier() const {
 567     return AutoWaitcntBeforeBarrier;
 568   }
 569
       /// \returns The CodeObjectV3 flag on AMDHSA; always false on other OSes.
 570   bool hasCodeObjectV3() const {
 571     // FIXME: Need to add code object v3 support for mesa and pal.
 572     return isAmdHsaOS() ? CodeObjectV3 : false;
 573   }
 574
 575   bool hasUnalignedBufferAccess() const {
 576     return UnalignedBufferAccess;
 577   }
 578
 579   bool hasUnalignedScratchAccess() const {
 580     return UnalignedScratchAccess;
 581   }
 582
 583   bool hasApertureRegs() const {
 584     return HasApertureRegs;
 585   }
 586
 587   bool isTrapHandlerEnabled() const {
 588     return TrapHandler;
 589   }
 590
 591   bool isXNACKEnabled() const {
 592     return EnableXNACK;
 593   }
 594
 595   bool hasFlatAddressSpace() const {
 596     return FlatAddressSpace;
 597   }
 598
 599   bool hasFlatInstOffsets() const {
 600     return FlatInstOffsets;
 601   }
 602
 603   bool hasFlatGlobalInsts() const {
 604     return FlatGlobalInsts;
 605   }
 606
 607   bool hasFlatScratchInsts() const {
 608     return FlatScratchInsts;
 609   }
 610
       /// \returns true only for generations after GFX9. GFX9 is the largest
       /// value in the Generation enum visible in this header.
 611   bool hasFlatLgkmVMemCountInOrder() const {
 612     return getGeneration() > GFX9;
 613   }
 614
       /// \returns true from GFX9 onwards.
 615   bool hasD16LoadStore() const {
 616     return getGeneration() >= GFX9;
 617   }
 618
 619   /// Return if most LDS instructions have an m0 use that requires m0 to be
 620   /// initialized.
 621   bool ldsRequiresM0Init() const {
 622     return getGeneration() < GFX9;
 623   }
 624
 625   bool hasAddNoCarry() const {
 626     return AddNoCarryInsts;
 627   }
 628
 629   bool hasUnpackedD16VMem() const {
 630     return HasUnpackedD16VMem;
 631   }
 632
 633   // Covers VS/PS/CS graphics shaders
 634   bool isMesaGfxShader(const Function &F) const {
 635     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
 636   }
 637
       /// \returns true from SEA_ISLANDS onwards.
 638   bool hasMad64_32() const {
 639     return getGeneration() >= SEA_ISLANDS;
 640   }
 641
 642   bool hasSDWAOmod() const {
 643     return HasSDWAOmod;
 644   }
 645
 646   bool hasSDWAScalar() const {
 647     return HasSDWAScalar;
 648   }
 649
 650   bool hasSDWASdst() const {
 651     return HasSDWASdst;
 652   }
 653
 654   bool hasSDWAMac() const {
 655     return HasSDWAMac;
 656   }
 657
 658   bool hasSDWAOutModsVOPC() const {
 659     return HasSDWAOutModsVOPC;
 660   }
 661
       /// \returns true for generations before SEA_ISLANDS.
 662   bool vmemWriteNeedsExpWaitcnt() const {
 663     return getGeneration() < SEA_ISLANDS;
 664   }
 665
 666   bool hasDLInsts() const {
 667     return HasDLInsts;
 668   }
 669
 670   bool hasDot1Insts() const {
 671     return HasDot1Insts;
 672   }
 673
 674   bool hasDot2Insts() const {
 675     return HasDot2Insts;
 676   }
 677
 678   bool isSRAMECCEnabled() const {
 679     return EnableSRAMECC;
 680   }
 681
 682   // Scratch is allocated in 256 dword per wave blocks for the entire
 683   // wavefront. When viewed from the perspective of an arbitrary workitem, this
 684   // is 4-byte aligned.
 685   //
 686   // Only 4-byte alignment is really needed to access anything. Transformations
 687   // on the pointer value itself may rely on the alignment / known low bits of
 688   // the pointer. Set this to something above the minimum to avoid needing
 689   // dynamic realignment in common cases.
 690   unsigned getStackAlignment() const {
 691     return 16;
 692   }
 693
 694   bool enableMachineScheduler() const override {
 695     return true;
 696   }
 697
 698   bool enableSubRegLiveness() const override {
 699     return true;
 700   }
 701
 702   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
 703   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
 704
 705   /// \returns Number of execution units per compute unit supported by the
 706   /// subtarget.
 707   unsigned getEUsPerCU() const {
 708     return AMDGPU::IsaInfo::getEUsPerCU(this);
 709   }
 710
 711   /// \returns Maximum number of waves per compute unit supported by the
 712   /// subtarget without any kind of limitation.
 713   unsigned getMaxWavesPerCU() const {
 714     return AMDGPU::IsaInfo::getMaxWavesPerCU(this);
 715   }
 716
 717   /// \returns Maximum number of waves per compute unit supported by the
 718   /// subtarget and limited by given \p FlatWorkGroupSize.
 719   unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
 720     return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
 721   }
 722
 723   /// \returns Maximum number of waves per execution unit supported by the
 724   /// subtarget without any kind of limitation.
 725   unsigned getMaxWavesPerEU() const {
 726     return AMDGPU::IsaInfo::getMaxWavesPerEU();
 727   }
 728
 729   /// \returns Number of waves per work group supported by the subtarget and
 730   /// limited by given \p FlatWorkGroupSize.
 731   unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
 732     return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize);
 733   }
 734
 735   // static wrappers
 736   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
 737
 738   // XXX - Why is this here if it isn't in the default pass set?
 739   bool enableEarlyIfConversion() const override {
 740     return true;
 741   }
 742
 743   void overrideSchedPolicy(MachineSchedPolicy &Policy,
 744                            unsigned NumRegionInstrs) const override;
 745
       /// \returns The constant 16.
 746   unsigned getMaxNumUserSGPRs() const {
 747     return 16;
 748   }
 749
 750   bool hasSMemRealTime() const {
 751     return HasSMemRealTime;
 752   }
 753
 754   bool hasMovrel() const {
 755     return HasMovrel;
 756   }
 757
 758   bool hasVGPRIndexMode() const {
 759     return HasVGPRIndexMode;
 760   }
 761
       /// \returns true when movrel is unavailable, or when \p UserEnable is set
       /// and the subtarget has VGPR index mode.
 762   bool useVGPRIndexMode(bool UserEnable) const {
 763     return !hasMovrel() || (UserEnable && hasVGPRIndexMode());
 764   }
 765
       /// \returns true from VOLCANIC_ISLANDS onwards.
 766   bool hasScalarCompareEq64() const {
 767     return getGeneration() >= VOLCANIC_ISLANDS;
 768   }
 769
 770   bool hasScalarStores() const {
 771     return HasScalarStores;
 772   }
 773
 774   bool hasScalarAtomics() const {
 775     return HasScalarAtomics;
 776   }
 777
       /// Tied to the VIInsts flag.
 778   bool hasLDSFPAtomics() const {
 779     return VIInsts;
 780   }
 781
 782   bool hasDPP() const {
 783     return HasDPP;
 784   }
 785
 786   bool hasR128A16() const {
 787     return HasR128A16;
 788   }
 789
 790   bool enableSIScheduler() const {
 791     return EnableSIScheduler;
 792   }
 793
       /// \returns true only when both debugger options are enabled.
 794   bool debuggerSupported() const {
 795     return debuggerInsertNops() && debuggerEmitPrologue();
 796   }
 797
 798   bool debuggerInsertNops() const {
 799     return DebuggerInsertNops;
 800   }
 801
 802   bool debuggerEmitPrologue() const {
 803     return DebuggerEmitPrologue;
 804   }
 805
 806   bool loadStoreOptEnabled() const {
 807     return EnableLoadStoreOpt;
 808   }
 809
 810   bool hasSGPRInitBug() const {
 811     return SGPRInitBug;
 812   }
 813
       /// \returns true for every generation except SOUTHERN_ISLANDS.
 814   bool has12DWordStoreHazard() const {
 815     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
 816   }
 817
 818   /// \returns true if the subtarget supports DWORDX3 load/store instructions.
 819   bool hasDwordx3LoadStores() const {
 820     return CIInsts;
 821   }
 822
       /// \returns true from GFX9 onwards.
 823   bool hasSMovFedHazard() const {
 824     return getGeneration() >= AMDGPUSubtarget::GFX9;
 825   }
 826
       /// \returns true from GFX9 onwards.
 827   bool hasReadM0MovRelInterpHazard() const {
 828     return getGeneration() >= AMDGPUSubtarget::GFX9;
 829   }
 830
       /// \returns true from VOLCANIC_ISLANDS onwards.
 831   bool hasReadM0SendMsgHazard() const {
 832     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
 833   }
 834
 835   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
 836   /// SGPRs
 837   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
 838
 839   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
 840   /// VGPRs
 841   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 842
 843   /// \returns true if the flat_scratch register should be initialized with the
 844   /// pointer to the wave's scratch memory rather than a size and offset.
 845   bool flatScratchIsPointer() const {
 846     return getGeneration() >= AMDGPUSubtarget::GFX9;
 847   }
 848
 849   /// \returns true if the machine has merged shaders in which s0-s7 are
 850   /// reserved by the hardware and user SGPRs start at s8
 851   bool hasMergedShaders() const {
 852     return getGeneration() >= GFX9;
 853   }
 854
 855   /// \returns SGPR allocation granularity supported by the subtarget.
 856   unsigned getSGPRAllocGranule() const {
 857     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
 858   }
 859
 860   /// \returns SGPR encoding granularity supported by the subtarget.
 861   unsigned getSGPREncodingGranule() const {
 862     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
 863   }
 864
 865   /// \returns Total number of SGPRs supported by the subtarget.
 866   unsigned getTotalNumSGPRs() const {
 867     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
 868   }
 869
 870   /// \returns Addressable number of SGPRs supported by the subtarget.
 871   unsigned getAddressableNumSGPRs() const {
 872     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
 873   }
 874
 875   /// \returns Minimum number of SGPRs that meets the given number of waves per
 876   /// execution unit requirement supported by the subtarget.
 877   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
 878     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
 879   }
 880
 881   /// \returns Maximum number of SGPRs that meets the given number of waves per
 882   /// execution unit requirement supported by the subtarget.
 883   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
 884     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
 885   }
 886
 887   /// \returns Reserved number of SGPRs for given function \p MF.
 888   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
 889
 890   /// \returns Maximum number of SGPRs that meets number of waves per execution
 891   /// unit requirement for function \p MF, or number of SGPRs explicitly
 892   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
 893   ///
 894   /// \returns Value that meets number of waves per execution unit requirement
 895   /// if explicitly requested value cannot be converted to integer, violates
 896   /// subtarget's specifications, or does not meet number of waves per execution
 897   /// unit requirement.
 898   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
 899
 900   /// \returns VGPR allocation granularity supported by the subtarget.
 901   unsigned getVGPRAllocGranule() const {
 902     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
 903   }
 904
 905   /// \returns VGPR encoding granularity supported by the subtarget.
 906   unsigned getVGPREncodingGranule() const {
 907     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
 908   }
 909
 910   /// \returns Total number of VGPRs supported by the subtarget.
 911   unsigned getTotalNumVGPRs() const {
 912     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
 913   }
 914
 915   /// \returns Addressable number of VGPRs supported by the subtarget.
 916   unsigned getAddressableNumVGPRs() const {
 917     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
 918   }
 919
 920   /// \returns Minimum number of VGPRs that meets given number of waves per
 921   /// execution unit requirement supported by the subtarget.
 922   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
 923     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
 924   }
 925
 926   /// \returns Maximum number of VGPRs that meets given number of waves per
 927   /// execution unit requirement supported by the subtarget.
 928   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
 929     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
 930   }
 931
 932   /// \returns Maximum number of VGPRs that meets number of waves per execution
 933   /// unit requirement for function \p MF, or number of VGPRs explicitly
 934   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
 935   ///
 936   /// \returns Value that meets number of waves per execution unit requirement
 937   /// if explicitly requested value cannot be converted to integer, violates
 938   /// subtarget's specifications, or does not meet number of waves per execution
 939   /// unit requirement.
 940   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
 941
       // NOTE(review): uses std::vector, but <vector> is not among the headers
       // this file includes directly; it is presumably pulled in transitively.
 942   void getPostRAMutations(
 943       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
 944       const override;
 945
 946   /// \returns Maximum number of work groups per compute unit supported by the
 947   /// subtarget and limited by given \p FlatWorkGroupSize.
 948   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
 949     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
 950   }
 951
 952   /// \returns Minimum flat work group size supported by the subtarget.
 953   unsigned getMinFlatWorkGroupSize() const override {
 954     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
 955   }
 956
 957   /// \returns Maximum flat work group size supported by the subtarget.
 958   unsigned getMaxFlatWorkGroupSize() const override {
 959     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
 960   }
 961
 962   /// \returns Maximum number of waves per execution unit supported by the
 963   /// subtarget and limited by given \p FlatWorkGroupSize.
 964   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
 965     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
 966   }
 967
 968   /// \returns Minimum number of waves per execution unit supported by the
 969   /// subtarget.
 970   unsigned getMinWavesPerEU() const override {
 971     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
 972   }
 973 };
 974
 975 class R600Subtarget final : public R600GenSubtargetInfo,
 976                             public AMDGPUSubtarget {
       // Concrete subtarget built on the tablegen-generated R600GenSubtargetInfo
       // and the shared AMDGPUSubtarget base. It holds the R600 instruction
       // info, frame lowering and target lowering objects. Most capability
       // queries below are gated on the generation being EVERGREEN or later.
 977 private:
 978   R600InstrInfo InstrInfo;
 979   R600FrameLowering FrameLowering;
       // Feature flags. Nothing in this header assigns them.
       // NOTE(review): presumably set by ParseSubtargetFeatures -- confirm.
 980   bool FMA;
 981   bool CaymanISA;
 982   bool CFALUBug;
 983   bool DX10Clamp;
 984   bool HasVertexCache;
 985   bool R600ALUInst;
 986   bool FP64;
 987   short TexVTXClauseSize;
       // Stored directly as a Generation here (GCNSubtarget stores an unsigned).
 988   Generation Gen;
 989   R600TargetLowering TLInfo;
 990   InstrItineraryData InstrItins;
 991   SelectionDAGTargetInfo TSInfo;
 992
 993 public:
 994   R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
 995                 const TargetMachine &TM);
 996
       // Overrides returning the objects owned by this subtarget.
 997   const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
 998
 999   const R600FrameLowering *getFrameLowering() const override {
1000     return &FrameLowering;
1001   }
1002
1003   const R600TargetLowering *getTargetLowering() const override {
1004     return &TLInfo;
1005   }
1006
       // The register info lives inside the instruction info object.
1007   const R600RegisterInfo *getRegisterInfo() const override {
1008     return &InstrInfo.getRegisterInfo();
1009   }
1010
1011   const InstrItineraryData *getInstrItineraryData() const override {
1012     return &InstrItins;
1013   }
1014
1015   // Nothing implemented, just prevent crashes on use.
1016   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
1017     return &TSInfo;
1018   }
1019
1020   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
1021
1022   Generation getGeneration() const {
1023     return Gen;
1024   }
1025
       /// \returns The constant 4 (GCNSubtarget reports 16).
1026   unsigned getStackAlignment() const {
1027     return 4;
1028   }
1029
1030   R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
1031                                                  StringRef GPU, StringRef FS);
1032
       // Capability queries: each one below that compares getGeneration() is
       // true from EVERGREEN onwards.
1033   bool hasBFE() const {
1034     return (getGeneration() >= EVERGREEN);
1035   }
1036
1037   bool hasBFI() const {
1038     return (getGeneration() >= EVERGREEN);
1039   }
1040
       /// \returns true only for \p Size == 32 on EVERGREEN or later; false for
       /// every other size.
1041   bool hasBCNT(unsigned Size) const {
1042     if (Size == 32)
1043       return (getGeneration() >= EVERGREEN);
1044
1045     return false;
1046   }
1047
1048   bool hasBORROW() const {
1049     return (getGeneration() >= EVERGREEN);
1050   }
1051
1052   bool hasCARRY() const {
1053     return (getGeneration() >= EVERGREEN);
1054   }
1055
1056   bool hasCaymanISA() const {
1057     return CaymanISA;
1058   }
1059
1060   bool hasFFBL() const {
1061     return (getGeneration() >= EVERGREEN);
1062   }
1063
1064   bool hasFFBH() const {
1065     return (getGeneration() >= EVERGREEN);
1066   }
1067
       // Plain accessors for the private feature flags.
1068   bool hasFMA() const { return FMA; }
1069
1070   bool hasCFAluBug() const { return CFALUBug; }
1071
1072   bool hasVertexCache() const { return HasVertexCache; }
1073
1074   short getTexVTXClauseSize() const { return TexVTXClauseSize; }
1075
1076   bool enableMachineScheduler() const override {
1077     return true;
1078   }
1079
1080   bool enableSubRegLiveness() const override {
1081     return true;
1082   }
1083
1084   /// \returns Maximum number of work groups per compute unit supported by the
1085   /// subtarget and limited by given \p FlatWorkGroupSize.
1086   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1087     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1088   }
1089
1090   /// \returns Minimum flat work group size supported by the subtarget.
1091   unsigned getMinFlatWorkGroupSize() const override {
1092     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1093   }
1094
1095   /// \returns Maximum flat work group size supported by the subtarget.
1096   unsigned getMaxFlatWorkGroupSize() const override {
1097     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1098   }
1099
1100   /// \returns Maximum number of waves per execution unit supported by the
1101   /// subtarget and limited by given \p FlatWorkGroupSize.
1102   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
1103     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
1104   }
1105
1106   /// \returns Minimum number of waves per execution unit supported by the
1107   /// subtarget.
1108   unsigned getMinWavesPerEU() const override {
1109     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1110   }
1111 };
1112
1113 } // end namespace llvm
1114
1115 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H