[Codegen] Alter the default promotion for saturating adds and subs
[llvm-complete.git] / lib / Target / AMDGPU / AMDGPUTargetTransformInfo.h
blobc7cd2f2c7bfd096b9968995913eaff0073265239
1 //===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// This file describes a TargetTransformInfo::Concept conforming object specific to the
11 /// AMDGPU target machine. It uses the target's detailed information to
12 /// provide more precise answers to certain TTI queries, while letting the
13 /// target independent and default TTI implementations handle the rest.
15 //===----------------------------------------------------------------------===//
17 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
20 #include "AMDGPU.h"
21 #include "AMDGPUSubtarget.h"
22 #include "AMDGPUTargetMachine.h"
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ArrayRef.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/CodeGen/BasicTTIImpl.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/MC/SubtargetFeature.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <cassert>
33 namespace llvm {
35 class AMDGPUTargetLowering;
36 class Loop;
37 class ScalarEvolution;
38 class Type;
39 class Value;
41 class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
42 using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
43 using TTI = TargetTransformInfo;
45 friend BaseT;
47 Triple TargetTriple;
49 const TargetSubtargetInfo *ST;
50 const TargetLoweringBase *TLI;
52 const TargetSubtargetInfo *getST() const { return ST; }
53 const TargetLoweringBase *getTLI() const { return TLI; }
55 public:
56 explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
57 : BaseT(TM, F.getParent()->getDataLayout()),
58 TargetTriple(TM->getTargetTriple()),
59 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
60 TLI(ST->getTargetLowering()) {}
62 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
63 TTI::UnrollingPreferences &UP);
/// TTI implementation for the GCN (Southern Islands and later) subtargets.
/// Most queries are declared here and defined in the corresponding .cpp file;
/// only trivially small helpers are defined inline.
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const AMDGPUTargetLowering *TLI;
  // Shared, generation-independent TTI logic (unrolling preferences).
  AMDGPUTTIImpl CommonTTI;
  // True for graphics calling conventions; set from the function in the ctor.
  bool IsGraphicsShader;

  // Subtarget features that may differ between caller and callee without
  // blocking inlining — presumably consulted by areInlineCompatible below
  // (defined out-of-line); TODO confirm against the .cpp file.
  const FeatureBitset InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt,
    AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding,
    AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca,
    AMDGPU::FeatureUnalignedBufferAccess,
    AMDGPU::FeatureUnalignedScratchAccess,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug,
    AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,
    AMDGPU::FeatureCodeObjectV3,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32,
    AMDGPU::HalfRate64Ops
  };

  const GCNSubtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  // Cost (in TCC_Basic units) of an instruction issued at full rate.
  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  // Cost of an instruction issued at half rate (2x a full-rate op).
  static inline int getHalfRateInstrCost() {
    return 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost() {
    return 3 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
  inline int get64BitInstrCost() const {
    return ST->hasHalfRate64Ops() ?
      getHalfRateInstrCost() : getQuarterRateInstrCost();
  }

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()),
      CommonTTI(TM, F),
      IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}

  // GCN execution is divergent across lanes, so control flow can diverge.
  bool hasBranchDivergence() { return true; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);

  // Population count is supported in fast hardware for any power-of-2 width.
  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getHardwareNumberOfRegisters(bool Vector) const;
  unsigned getNumberOfRegisters(bool Vector) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                  unsigned Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                   unsigned Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                    unsigned Alignment,
                                    unsigned AddrSpace) const;

  unsigned getMaxInterleaveFactor(unsigned VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  int getArithmeticInstrCost(
    unsigned Opcode, Type *Ty,
    TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
    TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
    TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
    TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
    ArrayRef<const Value *> Args = ArrayRef<const Value *>());

  unsigned getCFInstrCost(unsigned Opcode);

  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

  // Returns the flat address space number, or -1 (no flat address space) for
  // graphics shaders so InferAddressSpaces is skipped for them.
  unsigned getFlatAddressSpace() const {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
    if (IsGraphicsShader)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;
  bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                        Value *OldV, Value *NewV) const;

  // Splitting vectors is considered free on this target.
  unsigned getVectorSplitCost() { return 0; }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                          Type *SubTp);

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  // Inlining is heavily favored on AMDGPU (tuning constant).
  unsigned getInliningThresholdMultiplier() { return 7; }

  // No extra inlining bonus for vector-heavy callees.
  int getInlinerVectorBonusPercent() { return 0; }

  int getArithmeticReductionCost(unsigned Opcode,
                                 Type *Ty,
                                 bool IsPairwise);
  int getMinMaxReductionCost(Type *Ty, Type *CondTy,
                             bool IsPairwiseForm,
                             bool IsUnsigned);
};
/// TTI implementation for the older R600 (pre-GCN) subtargets.  All queries
/// are declared here and defined out-of-line; shared generation-independent
/// logic is delegated to the embedded AMDGPUTTIImpl.
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
  using BaseT = BasicTTIImplBase<R600TTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const R600Subtarget *ST;
  const AMDGPUTargetLowering *TLI;
  // Shared, generation-independent TTI logic (unrolling preferences).
  AMDGPUTTIImpl CommonTTI;

public:
  explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()),
      CommonTTI(TM, F) {}

  const R600Subtarget *getST() const { return ST; }
  const AMDGPUTargetLowering *getTLI() const { return TLI; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);
  unsigned getHardwareNumberOfRegisters(bool Vec) const;
  unsigned getNumberOfRegisters(bool Vec) const;
  unsigned getRegisterBitWidth(bool Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                   unsigned Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                    unsigned Alignment,
                                    unsigned AddrSpace) const;
  unsigned getMaxInterleaveFactor(unsigned VF);
  unsigned getCFInstrCost(unsigned Opcode);
  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};
259 } // end namespace llvm
261 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H