llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

   1 //===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "NVPTXTargetTransformInfo.h"
  10 #include "NVPTXUtilities.h"
  11 #include "llvm/Analysis/LoopInfo.h"
  12 #include "llvm/Analysis/TargetTransformInfo.h"
  13 #include "llvm/Analysis/ValueTracking.h"
  14 #include "llvm/CodeGen/BasicTTIImpl.h"
  15 #include "llvm/CodeGen/CostTable.h"
  16 #include "llvm/CodeGen/TargetLowering.h"
  17 #include "llvm/IR/IntrinsicsNVPTX.h"
  18 #include "llvm/Support/Debug.h"
  19 using namespace llvm;
  20
  21 #define DEBUG_TYPE "NVPTXtti"
  22
  23 // Whether the given intrinsic reads threadIdx.x/y/z.
  24 static bool readsThreadIndex(const IntrinsicInst *II) {
  25   switch (II->getIntrinsicID()) {
  26     default: return false;
  27     case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  28     case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  29     case Intrinsic::nvvm_read_ptx_sreg_tid_z:
  30       return true;
  31   }
  32 }
  33
  34 static bool readsLaneId(const IntrinsicInst *II) {
  35   return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
  36 }
  37
  38 // Whether the given intrinsic is an atomic instruction in PTX.
  39 static bool isNVVMAtomic(const IntrinsicInst *II) {
  40   switch (II->getIntrinsicID()) {
  41     default: return false;
  42     case Intrinsic::nvvm_atomic_load_inc_32:
  43     case Intrinsic::nvvm_atomic_load_dec_32:
  44
  45     case Intrinsic::nvvm_atomic_add_gen_f_cta:
  46     case Intrinsic::nvvm_atomic_add_gen_f_sys:
  47     case Intrinsic::nvvm_atomic_add_gen_i_cta:
  48     case Intrinsic::nvvm_atomic_add_gen_i_sys:
  49     case Intrinsic::nvvm_atomic_and_gen_i_cta:
  50     case Intrinsic::nvvm_atomic_and_gen_i_sys:
  51     case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  52     case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  53     case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  54     case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  55     case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  56     case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  57     case Intrinsic::nvvm_atomic_max_gen_i_cta:
  58     case Intrinsic::nvvm_atomic_max_gen_i_sys:
  59     case Intrinsic::nvvm_atomic_min_gen_i_cta:
  60     case Intrinsic::nvvm_atomic_min_gen_i_sys:
  61     case Intrinsic::nvvm_atomic_or_gen_i_cta:
  62     case Intrinsic::nvvm_atomic_or_gen_i_sys:
  63     case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  64     case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  65     case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  66     case Intrinsic::nvvm_atomic_xor_gen_i_sys:
  67       return true;
  68   }
  69 }
  70
  71 bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  72   // Without inter-procedural analysis, we conservatively assume that arguments
  73   // to __device__ functions are divergent.
  74   if (const Argument *Arg = dyn_cast<Argument>(V))
  75     return !isKernelFunction(*Arg->getParent());
  76
  77   if (const Instruction *I = dyn_cast<Instruction>(V)) {
  78     // Without pointer analysis, we conservatively assume values loaded from
  79     // generic or local address space are divergent.
  80     if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
  81       unsigned AS = LI->getPointerAddressSpace();
  82       return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
  83     }
  84     // Atomic instructions may cause divergence. Atomic instructions are
  85     // executed sequentially across all threads in a warp. Therefore, an earlier
  86     // executed thread may see different memory inputs than a later executed
  87     // thread. For example, suppose *a = 0 initially.
  88     //
  89     //   atom.global.add.s32 d, [a], 1
  90     //
  91     // returns 0 for the first thread that enters the critical region, and 1 for
  92     // the second thread.
  93     if (I->isAtomic())
  94       return true;
  95     if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
  96       // Instructions that read threadIdx are obviously divergent.
  97       if (readsThreadIndex(II) || readsLaneId(II))
  98         return true;
  99       // Handle the NVPTX atomic instrinsics that cannot be represented as an
 100       // atomic IR instruction.
 101       if (isNVVMAtomic(II))
 102         return true;
 103     }
 104     // Conservatively consider the return value of function calls as divergent.
 105     // We could analyze callees with bodies more precisely using
 106     // inter-procedural analysis.
 107     if (isa<CallInst>(I))
 108       return true;
 109   }
 110
 111   return false;
 112 }
 113
 114 // Convert NVVM intrinsics to target-generic LLVM code where possible.
 115 static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
 116   // Each NVVM intrinsic we can simplify can be replaced with one of:
 117   //
 118   //  * an LLVM intrinsic,
 119   //  * an LLVM cast operation,
 120   //  * an LLVM binary operation, or
 121   //  * ad-hoc LLVM IR for the particular operation.
 122
 123   // Some transformations are only valid when the module's
 124   // flush-denormals-to-zero (ftz) setting is true/false, whereas other
 125   // transformations are valid regardless of the module's ftz setting.
 126   enum FtzRequirementTy {
 127     FTZ_Any,       // Any ftz setting is ok.
 128     FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
 129     FTZ_MustBeOff, // Transformation is valid only if ftz is off.
 130   };
 131   // Classes of NVVM intrinsics that can't be replaced one-to-one with a
 132   // target-generic intrinsic, cast op, or binary op but that we can nonetheless
 133   // simplify.
 134   enum SpecialCase {
 135     SPC_Reciprocal,
 136   };
 137
 138   // SimplifyAction is a poor-man's variant (plus an additional flag) that
 139   // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
 140   struct SimplifyAction {
 141     // Invariant: At most one of these Optionals has a value.
 142     Optional<Intrinsic::ID> IID;
 143     Optional<Instruction::CastOps> CastOp;
 144     Optional<Instruction::BinaryOps> BinaryOp;
 145     Optional<SpecialCase> Special;
 146
 147     FtzRequirementTy FtzRequirement = FTZ_Any;
 148
 149     SimplifyAction() = default;
 150
 151     SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
 152         : IID(IID), FtzRequirement(FtzReq) {}
 153
 154     // Cast operations don't have anything to do with FTZ, so we skip that
 155     // argument.
 156     SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
 157
 158     SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
 159         : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
 160
 161     SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
 162         : Special(Special), FtzRequirement(FtzReq) {}
 163   };
 164
 165   // Try to generate a SimplifyAction describing how to replace our
 166   // IntrinsicInstr with target-generic LLVM IR.
 167   const SimplifyAction Action = [II]() -> SimplifyAction {
 168     switch (II->getIntrinsicID()) {
 169     // NVVM intrinsics that map directly to LLVM intrinsics.
 170     case Intrinsic::nvvm_ceil_d:
 171       return {Intrinsic::ceil, FTZ_Any};
 172     case Intrinsic::nvvm_ceil_f:
 173       return {Intrinsic::ceil, FTZ_MustBeOff};
 174     case Intrinsic::nvvm_ceil_ftz_f:
 175       return {Intrinsic::ceil, FTZ_MustBeOn};
 176     case Intrinsic::nvvm_fabs_d:
 177       return {Intrinsic::fabs, FTZ_Any};
 178     case Intrinsic::nvvm_fabs_f:
 179       return {Intrinsic::fabs, FTZ_MustBeOff};
 180     case Intrinsic::nvvm_fabs_ftz_f:
 181       return {Intrinsic::fabs, FTZ_MustBeOn};
 182     case Intrinsic::nvvm_floor_d:
 183       return {Intrinsic::floor, FTZ_Any};
 184     case Intrinsic::nvvm_floor_f:
 185       return {Intrinsic::floor, FTZ_MustBeOff};
 186     case Intrinsic::nvvm_floor_ftz_f:
 187       return {Intrinsic::floor, FTZ_MustBeOn};
 188     case Intrinsic::nvvm_fma_rn_d:
 189       return {Intrinsic::fma, FTZ_Any};
 190     case Intrinsic::nvvm_fma_rn_f:
 191       return {Intrinsic::fma, FTZ_MustBeOff};
 192     case Intrinsic::nvvm_fma_rn_ftz_f:
 193       return {Intrinsic::fma, FTZ_MustBeOn};
 194     case Intrinsic::nvvm_fmax_d:
 195       return {Intrinsic::maxnum, FTZ_Any};
 196     case Intrinsic::nvvm_fmax_f:
 197       return {Intrinsic::maxnum, FTZ_MustBeOff};
 198     case Intrinsic::nvvm_fmax_ftz_f:
 199       return {Intrinsic::maxnum, FTZ_MustBeOn};
 200     case Intrinsic::nvvm_fmin_d:
 201       return {Intrinsic::minnum, FTZ_Any};
 202     case Intrinsic::nvvm_fmin_f:
 203       return {Intrinsic::minnum, FTZ_MustBeOff};
 204     case Intrinsic::nvvm_fmin_ftz_f:
 205       return {Intrinsic::minnum, FTZ_MustBeOn};
 206     case Intrinsic::nvvm_round_d:
 207       return {Intrinsic::round, FTZ_Any};
 208     case Intrinsic::nvvm_round_f:
 209       return {Intrinsic::round, FTZ_MustBeOff};
 210     case Intrinsic::nvvm_round_ftz_f:
 211       return {Intrinsic::round, FTZ_MustBeOn};
 212     case Intrinsic::nvvm_sqrt_rn_d:
 213       return {Intrinsic::sqrt, FTZ_Any};
 214     case Intrinsic::nvvm_sqrt_f:
 215       // nvvm_sqrt_f is a special case.  For  most intrinsics, foo_ftz_f is the
 216       // ftz version, and foo_f is the non-ftz version.  But nvvm_sqrt_f adopts
 217       // the ftz-ness of the surrounding code.  sqrt_rn_f and sqrt_rn_ftz_f are
 218       // the versions with explicit ftz-ness.
 219       return {Intrinsic::sqrt, FTZ_Any};
 220     case Intrinsic::nvvm_sqrt_rn_f:
 221       return {Intrinsic::sqrt, FTZ_MustBeOff};
 222     case Intrinsic::nvvm_sqrt_rn_ftz_f:
 223       return {Intrinsic::sqrt, FTZ_MustBeOn};
 224     case Intrinsic::nvvm_trunc_d:
 225       return {Intrinsic::trunc, FTZ_Any};
 226     case Intrinsic::nvvm_trunc_f:
 227       return {Intrinsic::trunc, FTZ_MustBeOff};
 228     case Intrinsic::nvvm_trunc_ftz_f:
 229       return {Intrinsic::trunc, FTZ_MustBeOn};
 230
 231     // NVVM intrinsics that map to LLVM cast operations.
 232     //
 233     // Note that llvm's target-generic conversion operators correspond to the rz
 234     // (round to zero) versions of the nvvm conversion intrinsics, even though
 235     // most everything else here uses the rn (round to nearest even) nvvm ops.
 236     case Intrinsic::nvvm_d2i_rz:
 237     case Intrinsic::nvvm_f2i_rz:
 238     case Intrinsic::nvvm_d2ll_rz:
 239     case Intrinsic::nvvm_f2ll_rz:
 240       return {Instruction::FPToSI};
 241     case Intrinsic::nvvm_d2ui_rz:
 242     case Intrinsic::nvvm_f2ui_rz:
 243     case Intrinsic::nvvm_d2ull_rz:
 244     case Intrinsic::nvvm_f2ull_rz:
 245       return {Instruction::FPToUI};
 246     case Intrinsic::nvvm_i2d_rz:
 247     case Intrinsic::nvvm_i2f_rz:
 248     case Intrinsic::nvvm_ll2d_rz:
 249     case Intrinsic::nvvm_ll2f_rz:
 250       return {Instruction::SIToFP};
 251     case Intrinsic::nvvm_ui2d_rz:
 252     case Intrinsic::nvvm_ui2f_rz:
 253     case Intrinsic::nvvm_ull2d_rz:
 254     case Intrinsic::nvvm_ull2f_rz:
 255       return {Instruction::UIToFP};
 256
 257     // NVVM intrinsics that map to LLVM binary ops.
 258     case Intrinsic::nvvm_add_rn_d:
 259       return {Instruction::FAdd, FTZ_Any};
 260     case Intrinsic::nvvm_add_rn_f:
 261       return {Instruction::FAdd, FTZ_MustBeOff};
 262     case Intrinsic::nvvm_add_rn_ftz_f:
 263       return {Instruction::FAdd, FTZ_MustBeOn};
 264     case Intrinsic::nvvm_mul_rn_d:
 265       return {Instruction::FMul, FTZ_Any};
 266     case Intrinsic::nvvm_mul_rn_f:
 267       return {Instruction::FMul, FTZ_MustBeOff};
 268     case Intrinsic::nvvm_mul_rn_ftz_f:
 269       return {Instruction::FMul, FTZ_MustBeOn};
 270     case Intrinsic::nvvm_div_rn_d:
 271       return {Instruction::FDiv, FTZ_Any};
 272     case Intrinsic::nvvm_div_rn_f:
 273       return {Instruction::FDiv, FTZ_MustBeOff};
 274     case Intrinsic::nvvm_div_rn_ftz_f:
 275       return {Instruction::FDiv, FTZ_MustBeOn};
 276
 277     // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
 278     // need special handling.
 279     //
 280     // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
 281     // as well.
 282     case Intrinsic::nvvm_rcp_rn_d:
 283       return {SPC_Reciprocal, FTZ_Any};
 284     case Intrinsic::nvvm_rcp_rn_f:
 285       return {SPC_Reciprocal, FTZ_MustBeOff};
 286     case Intrinsic::nvvm_rcp_rn_ftz_f:
 287       return {SPC_Reciprocal, FTZ_MustBeOn};
 288
 289       // We do not currently simplify intrinsics that give an approximate
 290       // answer. These include:
 291       //
 292       //   - nvvm_cos_approx_{f,ftz_f}
 293       //   - nvvm_ex2_approx_{d,f,ftz_f}
 294       //   - nvvm_lg2_approx_{d,f,ftz_f}
 295       //   - nvvm_sin_approx_{f,ftz_f}
 296       //   - nvvm_sqrt_approx_{f,ftz_f}
 297       //   - nvvm_rsqrt_approx_{d,f,ftz_f}
 298       //   - nvvm_div_approx_{ftz_d,ftz_f,f}
 299       //   - nvvm_rcp_approx_ftz_d
 300       //
 301       // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
 302       // means that fastmath is enabled in the intrinsic.  Unfortunately only
 303       // binary operators (currently) have a fastmath bit in SelectionDAG, so
 304       // this information gets lost and we can't select on it.
 305       //
 306       // TODO: div and rcp are lowered to a binary op, so these we could in
 307       // theory lower them to "fast fdiv".
 308
 309     default:
 310       return {};
 311     }
 312   }();
 313
 314   // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
 315   // can bail out now.  (Notice that in the case that IID is not an NVVM
 316   // intrinsic, we don't have to look up any module metadata, as
 317   // FtzRequirementTy will be FTZ_Any.)
 318   if (Action.FtzRequirement != FTZ_Any) {
 319     StringRef Attr = II->getFunction()
 320                          ->getFnAttribute("denormal-fp-math-f32")
 321                          .getValueAsString();
 322     DenormalMode Mode = parseDenormalFPAttribute(Attr);
 323     bool FtzEnabled = Mode.Output != DenormalMode::IEEE;
 324
 325     if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
 326       return nullptr;
 327   }
 328
 329   // Simplify to target-generic intrinsic.
 330   if (Action.IID) {
 331     SmallVector<Value *, 4> Args(II->arg_operands());
 332     // All the target-generic intrinsics currently of interest to us have one
 333     // type argument, equal to that of the nvvm intrinsic's argument.
 334     Type *Tys[] = {II->getArgOperand(0)->getType()};
 335     return CallInst::Create(
 336         Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
 337   }
 338
 339   // Simplify to target-generic binary op.
 340   if (Action.BinaryOp)
 341     return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
 342                                   II->getArgOperand(1), II->getName());
 343
 344   // Simplify to target-generic cast op.
 345   if (Action.CastOp)
 346     return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
 347                             II->getName());
 348
 349   // All that's left are the special cases.
 350   if (!Action.Special)
 351     return nullptr;
 352
 353   switch (*Action.Special) {
 354   case SPC_Reciprocal:
 355     // Simplify reciprocal.
 356     return BinaryOperator::Create(
 357         Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
 358         II->getArgOperand(0), II->getName());
 359   }
 360   llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
 361 }
 362
 363 Optional<Instruction *>
 364 NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 365   if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
 366     return I;
 367   }
 368   return None;
 369 }
 370
 371 InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
 372     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
 373     TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
 374     TTI::OperandValueProperties Opd1PropInfo,
 375     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
 376     const Instruction *CxtI) {
 377   // Legalize the type.
 378   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 379
 380   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 381
 382   switch (ISD) {
 383   default:
 384     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
 385                                          Opd2Info,
 386                                          Opd1PropInfo, Opd2PropInfo);
 387   case ISD::ADD:
 388   case ISD::MUL:
 389   case ISD::XOR:
 390   case ISD::OR:
 391   case ISD::AND:
 392     // The machine code (SASS) simulates an i64 with two i32. Therefore, we
 393     // estimate that arithmetic operations on i64 are twice as expensive as
 394     // those on types that can fit into one machine register.
 395     if (LT.second.SimpleTy == MVT::i64)
 396       return 2 * LT.first;
 397     // Delegate other cases to the basic TTI.
 398     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
 399                                          Opd2Info,
 400                                          Opd1PropInfo, Opd2PropInfo);
 401   }
 402 }
 403
 404 void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 405                                            TTI::UnrollingPreferences &UP,
 406                                            OptimizationRemarkEmitter *ORE) {
 407   BaseT::getUnrollingPreferences(L, SE, UP, ORE);
 408
 409   // Enable partial unrolling and runtime unrolling, but reduce the
 410   // threshold.  This partially unrolls small loops which are often
 411   // unrolled by the PTX to SASS compiler and unrolling earlier can be
 412   // beneficial.
 413   UP.Partial = UP.Runtime = true;
 414   UP.PartialThreshold = UP.Threshold / 4;
 415 }
 416
 417 void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
 418                                          TTI::PeelingPreferences &PP) {
 419   BaseT::getPeelingPreferences(L, SE, PP);
 420 }