lib/Target/AMDGPU/AMDGPUSubtarget.h

   1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //==-----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// AMDGPU specific subclass of TargetSubtarget.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
  16 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
  17
  18 #include "AMDGPU.h"
  19 #include "AMDGPUCallLowering.h"
  20 #include "R600FrameLowering.h"
  21 #include "R600ISelLowering.h"
  22 #include "R600InstrInfo.h"
  23 #include "SIFrameLowering.h"
  24 #include "SIISelLowering.h"
  25 #include "SIInstrInfo.h"
  26 #include "Utils/AMDGPUBaseInfo.h"
  27 #include "llvm/ADT/Triple.h"
  28 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
  29 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
  30 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
  33 #include "llvm/MC/MCInstrItineraries.h"
  34 #include "llvm/Support/MathExtras.h"
  35 #include <cassert>
  36 #include <cstdint>
  37 #include <memory>
  38 #include <utility>
  39
  40 #define GET_SUBTARGETINFO_HEADER
  41 #include "AMDGPUGenSubtargetInfo.inc"
  42 #define GET_SUBTARGETINFO_HEADER
  43 #include "R600GenSubtargetInfo.inc"
  44
  45 namespace llvm {
  46
  47 class StringRef;
  48
  49 class AMDGPUSubtarget {
  50 public:
  51   enum Generation {
  52     R600 = 0,
  53     R700 = 1,
  54     EVERGREEN = 2,
  55     NORTHERN_ISLANDS = 3,
  56     SOUTHERN_ISLANDS = 4,
  57     SEA_ISLANDS = 5,
  58     VOLCANIC_ISLANDS = 6,
  59     GFX9 = 7
  60   };
  61
  62 private:
  63   Triple TargetTriple;
  64
  65 protected:
  66   bool Has16BitInsts;
  67   bool HasMadMixInsts;
  68   bool FP32Denormals;
  69   bool FPExceptions;
  70   bool HasSDWA;
  71   bool HasVOP3PInsts;
  72   bool HasMulI24;
  73   bool HasMulU24;
  74   bool HasInv2PiInlineImm;
  75   bool HasFminFmaxLegacy;
  76   bool EnablePromoteAlloca;
  77   bool HasTrigReducedRange;
  78   int LocalMemorySize;
  79   unsigned WavefrontSize;
  80
  81 public:
  82   AMDGPUSubtarget(const Triple &TT);
  83
  84   static const AMDGPUSubtarget &get(const MachineFunction &MF);
  85   static const AMDGPUSubtarget &get(const TargetMachine &TM,
  86                                     const Function &F);
  87
  88   /// \returns Default range flat work group size for a calling convention.
  89   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
  90
  91   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
  92   /// for function \p F, or minimum/maximum flat work group sizes explicitly
  93   /// requested using "amdgpu-flat-work-group-size" attribute attached to
  94   /// function \p F.
  95   ///
  96   /// \returns Subtarget's default values if explicitly requested values cannot
  97   /// be converted to integer, or violate subtarget's specifications.
  98   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
  99
 100   /// \returns Subtarget's default pair of minimum/maximum number of waves per
 101   /// execution unit for function \p F, or minimum/maximum number of waves per
 102   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
 103   /// attached to function \p F.
 104   ///
 105   /// \returns Subtarget's default values if explicitly requested values cannot
 106   /// be converted to integer, violate subtarget's specifications, or are not
 107   /// compatible with minimum/maximum number of waves limited by flat work group
 108   /// size, register usage, and/or lds usage.
 109   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
 110
 111   /// Return the amount of LDS that can be used that will not restrict the
 112   /// occupancy lower than WaveCount.
 113   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
 114                                            const Function &) const;
 115
 116   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
 117   /// the given LDS memory size is the only constraint.
 118   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
 119
 120   unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
 121
 122   bool isAmdHsaOS() const {
 123     return TargetTriple.getOS() == Triple::AMDHSA;
 124   }
 125
 126   bool isAmdPalOS() const {
 127     return TargetTriple.getOS() == Triple::AMDPAL;
 128   }
 129
 130   bool isMesa3DOS() const {
 131     return TargetTriple.getOS() == Triple::Mesa3D;
 132   }
 133
 134   bool isMesaKernel(const Function &F) const {
 135     return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
 136   }
 137
 138   bool isAmdCodeObjectV2(const Function &F) const {
 139     return isAmdHsaOS() || isMesaKernel(F);
 140   }
 141
 142   bool has16BitInsts() const {
 143     return Has16BitInsts;
 144   }
 145
 146   bool hasMadMixInsts() const {
 147     return HasMadMixInsts;
 148   }
 149
 150   bool hasFP32Denormals() const {
 151     return FP32Denormals;
 152   }
 153
 154   bool hasFPExceptions() const {
 155     return FPExceptions;
 156   }
 157
 158   bool hasSDWA() const {
 159     return HasSDWA;
 160   }
 161
 162   bool hasVOP3PInsts() const {
 163     return HasVOP3PInsts;
 164   }
 165
 166   bool hasMulI24() const {
 167     return HasMulI24;
 168   }
 169
 170   bool hasMulU24() const {
 171     return HasMulU24;
 172   }
 173
 174   bool hasInv2PiInlineImm() const {
 175     return HasInv2PiInlineImm;
 176   }
 177
 178   bool hasFminFmaxLegacy() const {
 179     return HasFminFmaxLegacy;
 180   }
 181
 182   bool hasTrigReducedRange() const {
 183     return HasTrigReducedRange;
 184   }
 185
 186   bool isPromoteAllocaEnabled() const {
 187     return EnablePromoteAlloca;
 188   }
 189
 190   unsigned getWavefrontSize() const {
 191     return WavefrontSize;
 192   }
 193
 194   int getLocalMemorySize() const {
 195     return LocalMemorySize;
 196   }
 197
 198   unsigned getAlignmentForImplicitArgPtr() const {
 199     return isAmdHsaOS() ? 8 : 4;
 200   }
 201
 202   /// Returns the offset in bytes from the start of the input buffer
 203   ///        of the first explicit kernel argument.
 204   unsigned getExplicitKernelArgOffset(const Function &F) const {
 205     return isAmdCodeObjectV2(F) ? 0 : 36;
 206   }
 207
 208   /// \returns Maximum number of work groups per compute unit supported by the
 209   /// subtarget and limited by given \p FlatWorkGroupSize.
 210   virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
 211
 212   /// \returns Minimum flat work group size supported by the subtarget.
 213   virtual unsigned getMinFlatWorkGroupSize() const = 0;
 214
 215   /// \returns Maximum flat work group size supported by the subtarget.
 216   virtual unsigned getMaxFlatWorkGroupSize() const = 0;
 217
 218   /// \returns Maximum number of waves per execution unit supported by the
 219   /// subtarget and limited by given \p FlatWorkGroupSize.
 220   virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const  = 0;
 221
 222   /// \returns Minimum number of waves per execution unit supported by the
 223   /// subtarget.
 224   virtual unsigned getMinWavesPerEU() const = 0;
 225
 226   unsigned getMaxWavesPerEU() const { return 10; }
 227
 228   /// Creates value range metadata on an workitemid.* inrinsic call or load.
 229   bool makeLIDRangeMetadata(Instruction *I) const;
 230
 231   /// \returns Number of bytes of arguments that are passed to a shader or
 232   /// kernel in addition to the explicit ones declared for the function.
 233   unsigned getImplicitArgNumBytes(const Function &F) const {
 234     if (isMesaKernel(F))
 235       return 16;
 236     return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
 237   }
 238   uint64_t getExplicitKernArgSize(const Function &F,
 239                                   unsigned &MaxAlign) const;
 240   unsigned getKernArgSegmentSize(const Function &F,
 241                                  unsigned &MaxAlign) const;
 242
 243   virtual ~AMDGPUSubtarget() {}
 244 };
 245
 246 class GCNSubtarget : public AMDGPUGenSubtargetInfo,
 247                      public AMDGPUSubtarget {
 248 public:
 249   enum {
 250     ISAVersion0_0_0,
 251     ISAVersion6_0_0,
 252     ISAVersion6_0_1,
 253     ISAVersion7_0_0,
 254     ISAVersion7_0_1,
 255     ISAVersion7_0_2,
 256     ISAVersion7_0_3,
 257     ISAVersion7_0_4,
 258     ISAVersion8_0_1,
 259     ISAVersion8_0_2,
 260     ISAVersion8_0_3,
 261     ISAVersion8_1_0,
 262     ISAVersion9_0_0,
 263     ISAVersion9_0_2,
 264     ISAVersion9_0_4,
 265     ISAVersion9_0_6,
 266   };
 267
 268   enum TrapHandlerAbi {
 269     TrapHandlerAbiNone = 0,
 270     TrapHandlerAbiHsa = 1
 271   };
 272
 273   enum TrapID {
 274     TrapIDHardwareReserved = 0,
 275     TrapIDHSADebugTrap = 1,
 276     TrapIDLLVMTrap = 2,
 277     TrapIDLLVMDebugTrap = 3,
 278     TrapIDDebugBreakpoint = 7,
 279     TrapIDDebugReserved8 = 8,
 280     TrapIDDebugReservedFE = 0xfe,
 281     TrapIDDebugReservedFF = 0xff
 282   };
 283
 284   enum TrapRegValues {
 285     LLVMTrapHandlerRegValue = 1
 286   };
 287
 288 private:
 289   /// GlobalISel related APIs.
 290   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
 291   std::unique_ptr<InstructionSelector> InstSelector;
 292   std::unique_ptr<LegalizerInfo> Legalizer;
 293   std::unique_ptr<RegisterBankInfo> RegBankInfo;
 294
 295 protected:
 296   // Basic subtarget description.
 297   Triple TargetTriple;
 298   unsigned Gen;
 299   unsigned IsaVersion;
 300   InstrItineraryData InstrItins;
 301   int LDSBankCount;
 302   unsigned MaxPrivateElementSize;
 303
 304   // Possibly statically set by tablegen, but may want to be overridden.
 305   bool FastFMAF32;
 306   bool HalfRate64Ops;
 307
 308   // Dynamially set bits that enable features.
 309   bool FP64FP16Denormals;
 310   bool DX10Clamp;
 311   bool FlatForGlobal;
 312   bool AutoWaitcntBeforeBarrier;
 313   bool CodeObjectV3;
 314   bool UnalignedScratchAccess;
 315   bool UnalignedBufferAccess;
 316   bool HasApertureRegs;
 317   bool EnableXNACK;
 318   bool TrapHandler;
 319   bool DebuggerInsertNops;
 320   bool DebuggerEmitPrologue;
 321
 322   // Used as options.
 323   bool EnableHugePrivateBuffer;
 324   bool EnableVGPRSpilling;
 325   bool EnableLoadStoreOpt;
 326   bool EnableUnsafeDSOffsetFolding;
 327   bool EnableSIScheduler;
 328   bool EnableDS128;
 329   bool DumpCode;
 330
 331   // Subtarget statically properties set by tablegen
 332   bool FP64;
 333   bool FMA;
 334   bool MIMG_R128;
 335   bool IsGCN;
 336   bool GCN3Encoding;
 337   bool CIInsts;
 338   bool VIInsts;
 339   bool GFX9Insts;
 340   bool SGPRInitBug;
 341   bool HasSMemRealTime;
 342   bool HasIntClamp;
 343   bool HasFmaMixInsts;
 344   bool HasMovrel;
 345   bool HasVGPRIndexMode;
 346   bool HasScalarStores;
 347   bool HasScalarAtomics;
 348   bool HasSDWAOmod;
 349   bool HasSDWAScalar;
 350   bool HasSDWASdst;
 351   bool HasSDWAMac;
 352   bool HasSDWAOutModsVOPC;
 353   bool HasDPP;
 354   bool HasR128A16;
 355   bool HasDLInsts;
 356   bool D16PreservesUnusedBits;
 357   bool FlatAddressSpace;
 358   bool FlatInstOffsets;
 359   bool FlatGlobalInsts;
 360   bool FlatScratchInsts;
 361   bool AddNoCarryInsts;
 362   bool HasUnpackedD16VMem;
 363   bool R600ALUInst;
 364   bool CaymanISA;
 365   bool CFALUBug;
 366   bool HasVertexCache;
 367   short TexVTXClauseSize;
 368   bool ScalarizeGlobal;
 369
 370   // Dummy feature to use for assembler in tablegen.
 371   bool FeatureDisable;
 372
 373   SelectionDAGTargetInfo TSInfo;
 374 private:
 375   SIInstrInfo InstrInfo;
 376   SITargetLowering TLInfo;
 377   SIFrameLowering FrameLowering;
 378
 379 public:
 380   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 381                const GCNTargetMachine &TM);
 382   ~GCNSubtarget() override;
 383
 384   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
 385                                                    StringRef GPU, StringRef FS);
 386
 387   const SIInstrInfo *getInstrInfo() const override {
 388     return &InstrInfo;
 389   }
 390
 391   const SIFrameLowering *getFrameLowering() const override {
 392     return &FrameLowering;
 393   }
 394
 395   const SITargetLowering *getTargetLowering() const override {
 396     return &TLInfo;
 397   }
 398
 399   const SIRegisterInfo *getRegisterInfo() const override {
 400     return &InstrInfo.getRegisterInfo();
 401   }
 402
 403   const CallLowering *getCallLowering() const override {
 404     return CallLoweringInfo.get();
 405   }
 406
 407   const InstructionSelector *getInstructionSelector() const override {
 408     return InstSelector.get();
 409   }
 410
 411   const LegalizerInfo *getLegalizerInfo() const override {
 412     return Legalizer.get();
 413   }
 414
 415   const RegisterBankInfo *getRegBankInfo() const override {
 416     return RegBankInfo.get();
 417   }
 418
 419   // Nothing implemented, just prevent crashes on use.
 420   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
 421     return &TSInfo;
 422   }
 423
 424   const InstrItineraryData *getInstrItineraryData() const override {
 425     return &InstrItins;
 426   }
 427
 428   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 429
 430   Generation getGeneration() const {
 431     return (Generation)Gen;
 432   }
 433
 434   unsigned getWavefrontSizeLog2() const {
 435     return Log2_32(WavefrontSize);
 436   }
 437
 438   int getLDSBankCount() const {
 439     return LDSBankCount;
 440   }
 441
 442   unsigned getMaxPrivateElementSize() const {
 443     return MaxPrivateElementSize;
 444   }
 445
 446   bool hasIntClamp() const {
 447     return HasIntClamp;
 448   }
 449
 450   bool hasFP64() const {
 451     return FP64;
 452   }
 453
 454   bool hasMIMG_R128() const {
 455     return MIMG_R128;
 456   }
 457
 458   bool hasHWFP64() const {
 459     return FP64;
 460   }
 461
 462   bool hasFastFMAF32() const {
 463     return FastFMAF32;
 464   }
 465
 466   bool hasHalfRate64Ops() const {
 467     return HalfRate64Ops;
 468   }
 469
 470   bool hasAddr64() const {
 471     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
 472   }
 473
 474   bool hasBFE() const {
 475     return true;
 476   }
 477
 478   bool hasBFI() const {
 479     return true;
 480   }
 481
 482   bool hasBFM() const {
 483     return hasBFE();
 484   }
 485
 486   bool hasBCNT(unsigned Size) const {
 487     return true;
 488   }
 489
 490   bool hasFFBL() const {
 491     return true;
 492   }
 493
 494   bool hasFFBH() const {
 495     return true;
 496   }
 497
 498   bool hasMed3_16() const {
 499     return getGeneration() >= AMDGPUSubtarget::GFX9;
 500   }
 501
 502   bool hasMin3Max3_16() const {
 503     return getGeneration() >= AMDGPUSubtarget::GFX9;
 504   }
 505
 506   bool hasFmaMixInsts() const {
 507     return HasFmaMixInsts;
 508   }
 509
 510   bool hasCARRY() const {
 511     return true;
 512   }
 513
 514   bool hasFMA() const {
 515     return FMA;
 516   }
 517
 518   TrapHandlerAbi getTrapHandlerAbi() const {
 519     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
 520   }
 521
 522   bool enableHugePrivateBuffer() const {
 523     return EnableHugePrivateBuffer;
 524   }
 525
 526   bool unsafeDSOffsetFoldingEnabled() const {
 527     return EnableUnsafeDSOffsetFolding;
 528   }
 529
 530   bool dumpCode() const {
 531     return DumpCode;
 532   }
 533
 534   /// Return the amount of LDS that can be used that will not restrict the
 535   /// occupancy lower than WaveCount.
 536   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
 537                                            const Function &) const;
 538
 539   bool hasFP16Denormals() const {
 540     return FP64FP16Denormals;
 541   }
 542
 543   bool hasFP64Denormals() const {
 544     return FP64FP16Denormals;
 545   }
 546
 547   bool supportsMinMaxDenormModes() const {
 548     return getGeneration() >= AMDGPUSubtarget::GFX9;
 549   }
 550
 551   bool enableDX10Clamp() const {
 552     return DX10Clamp;
 553   }
 554
 555   bool enableIEEEBit(const MachineFunction &MF) const {
 556     return AMDGPU::isCompute(MF.getFunction().getCallingConv());
 557   }
 558
 559   bool useFlatForGlobal() const {
 560     return FlatForGlobal;
 561   }
 562
 563   /// \returns If target supports ds_read/write_b128 and user enables generation
 564   /// of ds_read/write_b128.
 565   bool useDS128() const {
 566     return CIInsts && EnableDS128;
 567   }
 568
 569   /// \returns If MUBUF instructions always perform range checking, even for
 570   /// buffer resources used for private memory access.
 571   bool privateMemoryResourceIsRangeChecked() const {
 572     return getGeneration() < AMDGPUSubtarget::GFX9;
 573   }
 574
 575   bool hasAutoWaitcntBeforeBarrier() const {
 576     return AutoWaitcntBeforeBarrier;
 577   }
 578
 579   bool hasCodeObjectV3() const {
 580     return CodeObjectV3;
 581   }
 582
 583   bool hasUnalignedBufferAccess() const {
 584     return UnalignedBufferAccess;
 585   }
 586
 587   bool hasUnalignedScratchAccess() const {
 588     return UnalignedScratchAccess;
 589   }
 590
 591   bool hasApertureRegs() const {
 592     return HasApertureRegs;
 593   }
 594
 595   bool isTrapHandlerEnabled() const {
 596     return TrapHandler;
 597   }
 598
 599   bool isXNACKEnabled() const {
 600     return EnableXNACK;
 601   }
 602
 603   bool hasFlatAddressSpace() const {
 604     return FlatAddressSpace;
 605   }
 606
 607   bool hasFlatInstOffsets() const {
 608     return FlatInstOffsets;
 609   }
 610
 611   bool hasFlatGlobalInsts() const {
 612     return FlatGlobalInsts;
 613   }
 614
 615   bool hasFlatScratchInsts() const {
 616     return FlatScratchInsts;
 617   }
 618
 619   bool hasFlatLgkmVMemCountInOrder() const {
 620     return getGeneration() > GFX9;
 621   }
 622
 623   bool hasD16LoadStore() const {
 624     return getGeneration() >= GFX9;
 625   }
 626
 627   /// Return if most LDS instructions have an m0 use that require m0 to be
 628   /// iniitalized.
 629   bool ldsRequiresM0Init() const {
 630     return getGeneration() < GFX9;
 631   }
 632
 633   bool hasAddNoCarry() const {
 634     return AddNoCarryInsts;
 635   }
 636
 637   bool hasUnpackedD16VMem() const {
 638     return HasUnpackedD16VMem;
 639   }
 640
 641   // Covers VS/PS/CS graphics shaders
 642   bool isMesaGfxShader(const Function &F) const {
 643     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
 644   }
 645
 646   bool hasMad64_32() const {
 647     return getGeneration() >= SEA_ISLANDS;
 648   }
 649
 650   bool hasSDWAOmod() const {
 651     return HasSDWAOmod;
 652   }
 653
 654   bool hasSDWAScalar() const {
 655     return HasSDWAScalar;
 656   }
 657
 658   bool hasSDWASdst() const {
 659     return HasSDWASdst;
 660   }
 661
 662   bool hasSDWAMac() const {
 663     return HasSDWAMac;
 664   }
 665
 666   bool hasSDWAOutModsVOPC() const {
 667     return HasSDWAOutModsVOPC;
 668   }
 669
 670   bool vmemWriteNeedsExpWaitcnt() const {
 671     return getGeneration() < SEA_ISLANDS;
 672   }
 673
 674   bool hasDLInsts() const {
 675     return HasDLInsts;
 676   }
 677
 678   bool d16PreservesUnusedBits() const {
 679     return D16PreservesUnusedBits;
 680   }
 681
 682   // Scratch is allocated in 256 dword per wave blocks for the entire
 683   // wavefront. When viewed from the perspecive of an arbitrary workitem, this
 684   // is 4-byte aligned.
 685   //
 686   // Only 4-byte alignment is really needed to access anything. Transformations
 687   // on the pointer value itself may rely on the alignment / known low bits of
 688   // the pointer. Set this to something above the minimum to avoid needing
 689   // dynamic realignment in common cases.
 690   unsigned getStackAlignment() const {
 691     return 16;
 692   }
 693
 694   bool enableMachineScheduler() const override {
 695     return true;
 696   }
 697
 698   bool enableSubRegLiveness() const override {
 699     return true;
 700   }
 701
 702   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
 703   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
 704
 705   /// \returns Number of execution units per compute unit supported by the
 706   /// subtarget.
 707   unsigned getEUsPerCU() const {
 708     return AMDGPU::IsaInfo::getEUsPerCU(this);
 709   }
 710
 711   /// \returns Maximum number of waves per compute unit supported by the
 712   /// subtarget without any kind of limitation.
 713   unsigned getMaxWavesPerCU() const {
 714     return AMDGPU::IsaInfo::getMaxWavesPerCU(this);
 715   }
 716
 717   /// \returns Maximum number of waves per compute unit supported by the
 718   /// subtarget and limited by given \p FlatWorkGroupSize.
 719   unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
 720     return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
 721   }
 722
 723   /// \returns Maximum number of waves per execution unit supported by the
 724   /// subtarget without any kind of limitation.
 725   unsigned getMaxWavesPerEU() const {
 726     return AMDGPU::IsaInfo::getMaxWavesPerEU();
 727   }
 728
 729   /// \returns Number of waves per work group supported by the subtarget and
 730   /// limited by given \p FlatWorkGroupSize.
 731   unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
 732     return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize);
 733   }
 734
 735   // static wrappers
 736   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
 737
 738   // XXX - Why is this here if it isn't in the default pass set?
 739   bool enableEarlyIfConversion() const override {
 740     return true;
 741   }
 742
 743   void overrideSchedPolicy(MachineSchedPolicy &Policy,
 744                            unsigned NumRegionInstrs) const override;
 745
 746   bool isVGPRSpillingEnabled(const Function &F) const;
 747
 748   unsigned getMaxNumUserSGPRs() const {
 749     return 16;
 750   }
 751
 752   bool hasSMemRealTime() const {
 753     return HasSMemRealTime;
 754   }
 755
 756   bool hasMovrel() const {
 757     return HasMovrel;
 758   }
 759
 760   bool hasVGPRIndexMode() const {
 761     return HasVGPRIndexMode;
 762   }
 763
 764   bool useVGPRIndexMode(bool UserEnable) const {
 765     return !hasMovrel() || (UserEnable && hasVGPRIndexMode());
 766   }
 767
 768   bool hasScalarCompareEq64() const {
 769     return getGeneration() >= VOLCANIC_ISLANDS;
 770   }
 771
 772   bool hasScalarStores() const {
 773     return HasScalarStores;
 774   }
 775
 776   bool hasScalarAtomics() const {
 777     return HasScalarAtomics;
 778   }
 779
 780
 781   bool hasDPP() const {
 782     return HasDPP;
 783   }
 784
 785   bool hasR128A16() const {
 786     return HasR128A16;
 787   }
 788
 789   bool enableSIScheduler() const {
 790     return EnableSIScheduler;
 791   }
 792
 793   bool debuggerSupported() const {
 794     return debuggerInsertNops() && debuggerEmitPrologue();
 795   }
 796
 797   bool debuggerInsertNops() const {
 798     return DebuggerInsertNops;
 799   }
 800
 801   bool debuggerEmitPrologue() const {
 802     return DebuggerEmitPrologue;
 803   }
 804
 805   bool loadStoreOptEnabled() const {
 806     return EnableLoadStoreOpt;
 807   }
 808
 809   bool hasSGPRInitBug() const {
 810     return SGPRInitBug;
 811   }
 812
 813   bool has12DWordStoreHazard() const {
 814     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
 815   }
 816
 817   bool hasSMovFedHazard() const {
 818     return getGeneration() >= AMDGPUSubtarget::GFX9;
 819   }
 820
 821   bool hasReadM0MovRelInterpHazard() const {
 822     return getGeneration() >= AMDGPUSubtarget::GFX9;
 823   }
 824
 825   bool hasReadM0SendMsgHazard() const {
 826     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
 827   }
 828
 829   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
 830   /// SGPRs
 831   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
 832
 833   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
 834   /// VGPRs
 835   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 836
 837   /// \returns true if the flat_scratch register should be initialized with the
 838   /// pointer to the wave's scratch memory rather than a size and offset.
 839   bool flatScratchIsPointer() const {
 840     return getGeneration() >= AMDGPUSubtarget::GFX9;
 841   }
 842
 843   /// \returns true if the machine has merged shaders in which s0-s7 are
 844   /// reserved by the hardware and user SGPRs start at s8
 845   bool hasMergedShaders() const {
 846     return getGeneration() >= GFX9;
 847   }
 848
 849   /// \returns SGPR allocation granularity supported by the subtarget.
 850   unsigned getSGPRAllocGranule() const {
 851     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
 852   }
 853
 854   /// \returns SGPR encoding granularity supported by the subtarget.
 855   unsigned getSGPREncodingGranule() const {
 856     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
 857   }
 858
 859   /// \returns Total number of SGPRs supported by the subtarget.
 860   unsigned getTotalNumSGPRs() const {
 861     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
 862   }
 863
 864   /// \returns Addressable number of SGPRs supported by the subtarget.
 865   unsigned getAddressableNumSGPRs() const {
 866     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
 867   }
 868
 869   /// \returns Minimum number of SGPRs that meets the given number of waves per
 870   /// execution unit requirement supported by the subtarget.
 871   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
 872     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
 873   }
 874
 875   /// \returns Maximum number of SGPRs that meets the given number of waves per
 876   /// execution unit requirement supported by the subtarget.
 877   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
 878     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
 879   }
 880
 881   /// \returns Reserved number of SGPRs for given function \p MF.
 882   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
 883
 884   /// \returns Maximum number of SGPRs that meets number of waves per execution
 885   /// unit requirement for function \p MF, or number of SGPRs explicitly
 886   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
 887   ///
 888   /// \returns Value that meets number of waves per execution unit requirement
 889   /// if explicitly requested value cannot be converted to integer, violates
 890   /// subtarget's specifications, or does not meet number of waves per execution
 891   /// unit requirement.
 892   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
 893
 894   /// \returns VGPR allocation granularity supported by the subtarget.
 895   unsigned getVGPRAllocGranule() const {
 896     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
 897   }
 898
 899   /// \returns VGPR encoding granularity supported by the subtarget.
 900   unsigned getVGPREncodingGranule() const {
 901     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
 902   }
 903
 904   /// \returns Total number of VGPRs supported by the subtarget.
 905   unsigned getTotalNumVGPRs() const {
 906     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
 907   }
 908
 909   /// \returns Addressable number of VGPRs supported by the subtarget.
 910   unsigned getAddressableNumVGPRs() const {
 911     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
 912   }
 913
 914   /// \returns Minimum number of VGPRs that meets given number of waves per
 915   /// execution unit requirement supported by the subtarget.
 916   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
 917     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
 918   }
 919
 920   /// \returns Maximum number of VGPRs that meets given number of waves per
 921   /// execution unit requirement supported by the subtarget.
 922   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
 923     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
 924   }
 925
 926   /// \returns Maximum number of VGPRs that meets number of waves per execution
 927   /// unit requirement for function \p MF, or number of VGPRs explicitly
 928   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
 929   ///
 930   /// \returns Value that meets number of waves per execution unit requirement
 931   /// if explicitly requested value cannot be converted to integer, violates
 932   /// subtarget's specifications, or does not meet number of waves per execution
 933   /// unit requirement.
 934   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
 935
 936   void getPostRAMutations(
 937       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
 938       const override;
 939
 940   /// \returns Maximum number of work groups per compute unit supported by the
 941   /// subtarget and limited by given \p FlatWorkGroupSize.
 942   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
 943     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
 944   }
 945
 946   /// \returns Minimum flat work group size supported by the subtarget.
 947   unsigned getMinFlatWorkGroupSize() const override {
 948     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
 949   }
 950
 951   /// \returns Maximum flat work group size supported by the subtarget.
 952   unsigned getMaxFlatWorkGroupSize() const override {
 953     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
 954   }
 955
 956   /// \returns Maximum number of waves per execution unit supported by the
 957   /// subtarget and limited by given \p FlatWorkGroupSize.
 958   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
 959     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
 960   }
 961
 962   /// \returns Minimum number of waves per execution unit supported by the
 963   /// subtarget.
 964   unsigned getMinWavesPerEU() const override {
 965     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
 966   }
 967 };
 968
 969 class R600Subtarget final : public R600GenSubtargetInfo,
 970                             public AMDGPUSubtarget {
 971 private:
 972   R600InstrInfo InstrInfo;
 973   R600FrameLowering FrameLowering;
 974   bool FMA;
 975   bool CaymanISA;
 976   bool CFALUBug;
 977   bool DX10Clamp;
 978   bool HasVertexCache;
 979   bool R600ALUInst;
 980   bool FP64;
 981   short TexVTXClauseSize;
 982   Generation Gen;
 983   R600TargetLowering TLInfo;
 984   InstrItineraryData InstrItins;
 985   SelectionDAGTargetInfo TSInfo;
 986
 987 public:
 988   R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
 989                 const TargetMachine &TM);
 990
 991   const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
 992
 993   const R600FrameLowering *getFrameLowering() const override {
 994     return &FrameLowering;
 995   }
 996
 997   const R600TargetLowering *getTargetLowering() const override {
 998     return &TLInfo;
 999   }
1000
1001   const R600RegisterInfo *getRegisterInfo() const override {
1002     return &InstrInfo.getRegisterInfo();
1003   }
1004
1005   const InstrItineraryData *getInstrItineraryData() const override {
1006     return &InstrItins;
1007   }
1008
1009   // Nothing implemented, just prevent crashes on use.
1010   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
1011     return &TSInfo;
1012   }
1013
1014   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
1015
1016   Generation getGeneration() const {
1017     return Gen;
1018   }
1019
1020   unsigned getStackAlignment() const {
1021     return 4;
1022   }
1023
1024   R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
1025                                                  StringRef GPU, StringRef FS);
1026
1027   bool hasBFE() const {
1028     return (getGeneration() >= EVERGREEN);
1029   }
1030
1031   bool hasBFI() const {
1032     return (getGeneration() >= EVERGREEN);
1033   }
1034
1035   bool hasBCNT(unsigned Size) const {
1036     if (Size == 32)
1037       return (getGeneration() >= EVERGREEN);
1038
1039     return false;
1040   }
1041
1042   bool hasBORROW() const {
1043     return (getGeneration() >= EVERGREEN);
1044   }
1045
1046   bool hasCARRY() const {
1047     return (getGeneration() >= EVERGREEN);
1048   }
1049
1050   bool hasCaymanISA() const {
1051     return CaymanISA;
1052   }
1053
1054   bool hasFFBL() const {
1055     return (getGeneration() >= EVERGREEN);
1056   }
1057
1058   bool hasFFBH() const {
1059     return (getGeneration() >= EVERGREEN);
1060   }
1061
1062   bool hasFMA() const { return FMA; }
1063
1064   bool hasCFAluBug() const { return CFALUBug; }
1065
1066   bool hasVertexCache() const { return HasVertexCache; }
1067
1068   short getTexVTXClauseSize() const { return TexVTXClauseSize; }
1069
1070   bool enableMachineScheduler() const override {
1071     return true;
1072   }
1073
1074   bool enableSubRegLiveness() const override {
1075     return true;
1076   }
1077
1078   /// \returns Maximum number of work groups per compute unit supported by the
1079   /// subtarget and limited by given \p FlatWorkGroupSize.
1080   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1081     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1082   }
1083
1084   /// \returns Minimum flat work group size supported by the subtarget.
1085   unsigned getMinFlatWorkGroupSize() const override {
1086     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1087   }
1088
1089   /// \returns Maximum flat work group size supported by the subtarget.
1090   unsigned getMaxFlatWorkGroupSize() const override {
1091     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1092   }
1093
1094   /// \returns Maximum number of waves per execution unit supported by the
1095   /// subtarget and limited by given \p FlatWorkGroupSize.
1096   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
1097     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
1098   }
1099
1100   /// \returns Minimum number of waves per execution unit supported by the
1101   /// subtarget.
1102   unsigned getMinWavesPerEU() const override {
1103     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1104   }
1105 };
1106
1107 } // end namespace llvm
1108
1109 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H