//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}
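// Worked example (illustrative numbers, not tied to any particular
// subtarget): with a 64-lane wavefront and a maximum flat workgroup size of
// 256, each workgroup needs ceil(256 / 64) = 4 waves. Requesting NWaves = 8
// on a CU with 4 EUs then allows (8 * 4) / 4 = 8 workgroups per CU, so
// 64 KiB of LDS leaves 65536 / 8 = 8192 bytes of LDS per workgroup.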
// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine, taking
// into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding
  // the size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  // MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}
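// Worked example (illustrative numbers, assuming getMaxWorkGroupsPerCU
// allows at least 4 groups): with Bytes = 16384 and 64 KiB of LDS,
// NumGroups = 65536 / 16384 = 4 workgroups fit. With a maximum flat
// workgroup size of 256 and a 64-lane wavefront, each group contributes
// ceil(256 / 64) = 4 waves, so MaxWaves = 4 * 4 = 16 per CU, i.e.
// ceil(16 / 4) = 4 waves per EU on a CU with 4 EUs, before clamping to
// getMaxWavesPerEU().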
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
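// For example, a kernel carrying the IR attribute
//   "amdgpu-flat-work-group-size"="128,256"
// requests between 128 and 256 lanes per workgroup; the pair is returned
// as-is unless it is inverted or falls outside the subtarget's limits, in
// which case the calling-convention default is used instead.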
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set default
  // minimum/maximum number of waves per execution unit to values implied by
  // requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by
  // requested minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
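// Worked example (assuming a hypothetical subtarget where
// getWavesPerEUForWorkGroup(256) returns 2 and getMaxWavesPerEU() returns
// 10): a request of (4, 8) is returned unchanged, while a request of (1, 8)
// is rejected in favor of the default (2, 10) because its minimum is below
// the value implied by a maximum flat workgroup size of 256.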
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}
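// The request typically comes from frontend-emitted IR such as
//   "amdgpu-waves-per-eu"="4,8"
// where, because only the first value is required (the trailing 'true'
// argument above), the maximum may be omitted to leave it unconstrained.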
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto *Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}
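// For example, an OpenCL kernel compiled with a fixed workgroup size carries
// metadata of the form
//   !reqd_work_group_size !{i32 64, i32 1, i32 1}
// for which Dim = 0 yields 64 while Dim = 1 and Dim = 2 yield 1.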
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}
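// For example, a call to llvm.amdgcn.workitem.id.x in a kernel with
// reqd_work_group_size {64, 1, 1} is annotated with the half-open range
// [0, 64), whereas a size query (r600.read.local.size.x) in the same kernel
// gets [64, 65), i.e. exactly 64.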
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}
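// A frontend can also narrow the segment explicitly, e.g. with
//   "amdgpu-implicitarg-num-bytes"="48"
// (illustrative value); the attribute takes precedence over the
// code-object-version default computed above.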
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      continue;

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
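// Worked example: for an AMDGPU_KERNEL signature (i32, double, i8) the
// running offset is alignTo(0, 4) + 4 = 4, then alignTo(4, 8) + 8 = 16,
// then alignTo(16, 1) + 1 = 17, so this returns 17 with MaxAlign = 8.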
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}
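// Continuing the example above (17 explicit bytes, and assuming an explicit
// kernarg offset of 0 and an 8-byte implicit-argument alignment): with a
// 256-byte implicit segment, TotalSize = alignTo(17, 8) + 256 = 280, which
// is already a multiple of 4, so the segment size is 280 bytes.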
AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
                                        std::numeric_limits<uint32_t>::max());
}
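// For example, "amdgpu-max-num-workgroups"="16,8,4" bounds a kernel's grid
// to at most 16 x 8 x 4 workgroups; absent the attribute, each dimension
// defaults to UINT32_MAX, i.e. effectively unbounded.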