//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <limits>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"
static cl::opt<unsigned> UnrollThresholdPrivate(
    "amdgpu-unroll-threshold-private",
    cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
    cl::init(2500), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
    "amdgpu-unroll-threshold-local",
    cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
    cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
    "amdgpu-unroll-threshold-if",
    cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
    cl::init(150), cl::Hidden);
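
// Like any cl::opt, these thresholds can be overridden per invocation on the
// opt/llc command line, e.g. -amdgpu-unroll-threshold-private=3000 (the value
// is only an example; the defaults above are what normally apply).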

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  const Instruction *I = dyn_cast<Instruction>(Cond);
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop *SubLoop) {
            return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth + 1))
      return true;
  }
  return false;
}
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = std::numeric_limits<unsigned>::max();

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size that can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;
  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop *SubLoop) {
          return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition is
      // defined by a PHI belonging to the loop. This may help to eliminate
      // the if region and potentially even the PHI itself, saving on both
      // divergence and registers used for the PHI.
      // Add a small bonus for each such "if" statement.
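      //
      // Illustrative example (not from a specific test): in
      //   for (i = 0; i < n; ++i) {
      //     if (cond) { ... }    // cond is a PHI updated inside this loop
      //     cond = f(i);
      //   }
      // each unrolled copy of the "if" can often be folded once the PHI's
      // incoming value for that iteration is known.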
      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
        if (UP.Threshold < MaxBoost && Br->isConditional()) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            UP.Threshold += UnrollThresholdIf;
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
        Threshold = ThresholdPrivate;
      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        Type *Ty = Alloca->getAllocatedType();
        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
        if (AllocaSize > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
                 AS == AMDGPUAS::REGION_ADDRESS) {
        LocalGEPsSeen++;
        // Inhibit unrolling for local memory if we have seen addressing that
        // is not based on a variable (global or argument); most likely we will
        // be unable to combine the accesses.
        // Do not unroll too-deep inner loops for local memory, to give a
        // chance to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
      }

      // Check if the GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop *SubLoop) {
              return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. Allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
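      //
      // Illustrative example (not from the sources): a loop like
      //   float tmp[8];
      //   for (i = 0; i < 8; ++i) tmp[i] = ...;
      // addresses the alloca with the induction variable; once fully
      // unrolled, every index is a constant and SROA can usually promote
      // "tmp" to registers instead of leaving it in scratch.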
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }
  }
}

unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  // The concept of vector registers doesn't really exist. Some packed vector
  // operations operate on the normal 32-bit registers.
  return 256;
}

unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
  // This is really the number of registers to fill when vectorizing /
  // interleaving loops, so we lie to avoid trying to use all registers.
  return getHardwareNumberOfRegisters(Vec) >> 3;
}

unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * LoadSize;
  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
    // TODO: Support element-size less than 32bit?
    return 128 / LoadSize;

  return VF;
}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;

  return VF;
}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
    return 512;
  }

  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
      AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 128;
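
  // Scratch (private) accesses are limited by the subtarget's maximum private
  // element size; e.g. a 16-byte limit would give 8 * 16 = 128 bits below
  // (the 16-byte figure is only an illustration of the arithmetic).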
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  llvm_unreachable("unhandled address space");
}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            unsigned Alignment,
                                            unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
  }
  return true;
}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal >
        static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
    return true;
  }
  default:
    return false;
  }
}
int GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {
  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but only the legal
  // types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    // i32
    return LT.first * NElts * getFullRateInstrCost();
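
  // Note on the i64 case below: a 64-bit multiply is expanded into several
  // 32-bit multiplies (low/high partial products) plus the adds that combine
  // them, which is roughly what the 4x quarter-rate + 4x full-rate formula
  // models (our reading of the formula, not an upstream comment).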
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();

      if (!ST->hasFP32Denormals()) {
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                           bool IsPairwise) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getFullRateInstrCost();
}

int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
                                       bool IsPairwise,
                                       bool IsUnsigned) {
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions (which support
  // 16-bit types only).
  if (IsPairwise ||
      !ST->hasVOP3PInsts() ||
      OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);

  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  return LT.first * getHalfRateInstrCost();
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                   unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}
static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or
    // byval. Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(),
                                                Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(),
                                                Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
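/// For example (illustrative), the value of the llvm.amdgcn.workitem.id.x
/// intrinsic differs in every lane, while a kernel argument passed in an SGPR
/// is the same for the whole wavefront.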
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private and flat address spaces are divergent, because
  // threads can execute the load instruction with the same inputs and get
  // different results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // its original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
      return true;
    }
  }
  return false;
}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {
  switch (IID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);
    return true;
  default:
    return false;
  }
}

bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
  IntrinsicInst *II, Value *OldV, Value *NewV) const {
  auto IntrID = II->getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
    if (!IsVolatile->isZero())
      return false;
    Module *M = II->getParent()->getParent()->getParent();
    Type *DestTy = II->getType();
    Type *SrcTy = NewV->getType();
    Function *NewDecl =
      Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
    II->setArgOperand(0, NewV);
    II->setCalledFunction(NewDecl);
    return true;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
    LLVMContext &Ctx = NewV->getType()->getContext();
    ConstantInt *NewVal = (TrueAS == NewAS) ?
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
    II->replaceAllUsesWith(NewVal);
    II->eraseFromParent();
    return true;
  }
  default:
    return false;
  }
}
unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
  if (ST->hasVOP3PInsts()) {
    VectorType *VT = cast<VectorType>(Tp);
    if (VT->getNumElements() == 2 &&
        DL.getTypeSizeInBits(VT->getElementType()) == 16) {
      // With op_sel, VOP3P instructions can freely access the low half or
      // high half of a register, so any swizzle is free.
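      // For example, reversing a <2 x half> value can typically be folded
      // into the op_sel bits of the instruction that consumes it instead of
      // emitting a separate shuffle (an illustration, not an exhaustive rule).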

      switch (Kind) {
      case TTI::SK_Broadcast:
      case TTI::SK_Reverse:
      case TTI::SK_PermuteSingleSrc:
        return 0;
      default:
        break;
      }
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
    TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
    TM.getSubtargetImpl(*Callee)->getFeatureBits();

  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
  // no way to support merge for backend defined attributes.
  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
  return CallerMode.isInlineCompatible(CalleeMode);
}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
  return getHardwareNumberOfRegisters(Vec);
}

unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
  return 32;
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
  return 32;
}

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
        AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             unsigned Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              unsigned Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               unsigned Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF == 1)
    return 1;

  return 8;
}

unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                    unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
  CommonTTI.getUnrollingPreferences(L, SE, UP);
}