//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

using namespace llvm;

namespace {

// Field offsets in hsa_kernel_dispatch_packet_t.
enum DispatchPackedOffsets {
  WORKGROUP_SIZE_X = 4,
  WORKGROUP_SIZE_Y = 6,
  WORKGROUP_SIZE_Z = 8,

  GRID_SIZE_X = 12,
  GRID_SIZE_Y = 16,
  GRID_SIZE_Z = 20
};

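// workgroup_size_{x,y,z} are u16 fields of the dispatch packet and
// grid_size_{x,y,z} are u32 fields, hence the 2- and 4-byte load widths
// matched below.
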
// Field offsets to implicit kernel argument pointer.
enum ImplicitArgOffsets {
  HIDDEN_BLOCK_COUNT_X = 0,
  HIDDEN_BLOCK_COUNT_Y = 4,
  HIDDEN_BLOCK_COUNT_Z = 8,

  HIDDEN_GROUP_SIZE_X = 12,
  HIDDEN_GROUP_SIZE_Y = 14,
  HIDDEN_GROUP_SIZE_Z = 16,

  HIDDEN_REMAINDER_X = 18,
  HIDDEN_REMAINDER_Y = 20,
  HIDDEN_REMAINDER_Z = 22,
};

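// In the v5 implicit argument layout the block counts are u32 values while
// the group sizes and remainders are u16 values, matching the 4- and 2-byte
// loads checked below.
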
class AMDGPULowerKernelAttributes : public ModulePass {
public:
  static char ID;

  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

  bool runOnModule(Module &M) override;

  StringRef getPassName() const override {
    return "AMDGPU Kernel Attributes";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
  }
};

Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
  auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
                                 : Intrinsic::amdgcn_dispatch_ptr;
  StringRef Name = Intrinsic::getName(IntrinsicId);
  return M.getFunction(Name);
}

} // end anonymous namespace

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
  Function *F = CI->getParent()->getParent();

  auto MD = F->getMetadata("reqd_work_group_size");
  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

  const bool HasUniformWorkGroupSize =
      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

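  // "uniform-work-group-size" is set by the frontend; OpenCL 1.2 requires the
  // global size to be a multiple of the workgroup size, so for OpenCL kernels
  // this attribute is commonly true.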
  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
    return false;

  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
  Value *Remainders[3] = {nullptr, nullptr, nullptr};
  Value *GridSizes[3] = {nullptr, nullptr, nullptr};

  const DataLayout &DL = F->getDataLayout();

  // We expect to see several GEP users, cast to the appropriate type and
  // loaded.
  for (User *U : CI->users()) {
    if (!U->hasOneUse())
      continue;

    int64_t Offset = 0;
    auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
    auto *BCI = dyn_cast<BitCastInst>(U);
    if (!Load && !BCI) {
      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
        continue;

      Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
      BCI = dyn_cast<BitCastInst>(*U->user_begin());
    }

    if (BCI) {
      if (!BCI->hasOneUse())
        continue;

      Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
    }

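    // With opaque pointers the load typically uses the GEP (or the base
    // pointer) directly; the bitcast chasing above mainly matters for older
    // typed-pointer IR.
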
    if (!Load || !Load->isSimple())
      continue;

    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

    // TODO: Handle merged loads.
    if (IsV5OrAbove) { // Base is ImplicitArgPtr.
      switch (Offset) {
      case HIDDEN_BLOCK_COUNT_X:
        if (LoadSize == 4)
          BlockCounts[0] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Y:
        if (LoadSize == 4)
          BlockCounts[1] = Load;
        break;
      case HIDDEN_BLOCK_COUNT_Z:
        if (LoadSize == 4)
          BlockCounts[2] = Load;
        break;
      case HIDDEN_GROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case HIDDEN_GROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case HIDDEN_REMAINDER_X:
        if (LoadSize == 2)
          Remainders[0] = Load;
        break;
      case HIDDEN_REMAINDER_Y:
        if (LoadSize == 2)
          Remainders[1] = Load;
        break;
      case HIDDEN_REMAINDER_Z:
        if (LoadSize == 2)
          Remainders[2] = Load;
        break;
      default:
        break;
      }
    } else { // Base is DispatchPtr.
      switch (Offset) {
      case WORKGROUP_SIZE_X:
        if (LoadSize == 2)
          GroupSizes[0] = Load;
        break;
      case WORKGROUP_SIZE_Y:
        if (LoadSize == 2)
          GroupSizes[1] = Load;
        break;
      case WORKGROUP_SIZE_Z:
        if (LoadSize == 2)
          GroupSizes[2] = Load;
        break;
      case GRID_SIZE_X:
        if (LoadSize == 4)
          GridSizes[0] = Load;
        break;
      case GRID_SIZE_Y:
        if (LoadSize == 4)
          GridSizes[1] = Load;
        break;
      case GRID_SIZE_Z:
        if (LoadSize == 4)
          GridSizes[2] = Load;
        break;
      default:
        break;
      }
    }
  }

  bool MadeChange = false;
  if (IsV5OrAbove && HasUniformWorkGroupSize) {
    // Under v5, __ockl_get_local_size returns the value computed by the
    // expression:
    //
    //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
    //
    // For functions with the attribute uniform-work-group-size=true, we can
    // evaluate workgroup_id < hidden_block_count as true, and thus
    // hidden_group_size is returned for __ockl_get_local_size.
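    //
    // As an illustrative sketch (value names invented here; the exact IR from
    // the device library may differ):
    //
    //   %cmp = icmp ult i32 %workgroup.id.x, %block.count.x
    //   %sel = select i1 %cmp, i16 %group.size.x, i16 %remainder.x
    //
    // Replacing %cmp with true lets later simplification fold %sel to the
    // group size load, and the remainder loads are zeroed out below.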
    for (int I = 0; I < 3; ++I) {
      Value *BlockCount = BlockCounts[I];
      if (!BlockCount)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *ICmp : BlockCount->users()) {
        ICmpInst::Predicate Pred;
        if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
          if (Pred != ICmpInst::ICMP_ULT)
            continue;

          ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
          MadeChange = true;
        }
      }
    }

    // All remainders should be 0 with uniform work group size.
    for (Value *Remainder : Remainders) {
      if (!Remainder)
        continue;

      Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
      MadeChange = true;
    }
  } else if (HasUniformWorkGroupSize) { // Pre-V5.
    // Pattern match the code used to handle partial workgroup dispatches in
    // the library implementation of get_local_size, so the entire function can
    // be constant folded with a known group size.
    //
    //   uint r = grid_size - group_id * group_size;
    //   get_local_size = (r < group_size) ? r : group_size;
    //
    // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
    // the grid_size is required to be a multiple of group_size. In this case:
    //
    //   grid_size - (group_id * group_size) < group_size
    //   grid_size < group_size + (group_id * group_size)
    //   (grid_size / group_size) < 1 + group_id
    //
    // grid_size / group_size is at least 1, so we can conclude the select
    // condition is false (except for group_id == 0, where the select result is
    // the same).
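    //
    // As an illustrative sketch in IR (names invented here; the matcher below
    // accepts both the umin intrinsic and the equivalent icmp+select form):
    //
    //   %zext = zext i16 %group.size.x to i32
    //   %mul  = mul i32 %group.id.x, %zext
    //   %sub  = sub i32 %grid.size.x, %mul
    //   %umin = call i32 @llvm.umin.i32(i32 %sub, i32 %zext)
    //
    // %umin is then replaced with %zext, or directly with the constant from
    // reqd_work_group_size when that metadata is present.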
    for (int I = 0; I < 3; ++I) {
      Value *GroupSize = GroupSizes[I];
      Value *GridSize = GridSizes[I];
      if (!GroupSize || !GridSize)
        continue;

      using namespace llvm::PatternMatch;
      auto GroupIDIntrin =
          I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
                 : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
                           : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

      for (User *U : GroupSize->users()) {
        auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
        if (!ZextGroupSize)
          continue;

        for (User *UMin : ZextGroupSize->users()) {
          if (match(UMin,
                    m_UMin(m_Sub(m_Specific(GridSize),
                                 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
                           m_Specific(ZextGroupSize)))) {
            if (HasReqdWorkGroupSize) {
              ConstantInt *KnownSize =
                  mdconst::extract<ConstantInt>(MD->getOperand(I));
              UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
                  KnownSize, UMin->getType(), false, DL));
            } else {
              UMin->replaceAllUsesWith(ZextGroupSize);
            }

            MadeChange = true;
          }
        }
      }
    }
  }

  // If reqd_work_group_size is set, we can replace the work group size with it.
  if (!HasReqdWorkGroupSize)
    return MadeChange;

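  // reqd_work_group_size is function metadata carrying three i32 operands,
  // e.g. (illustrative values): !reqd_work_group_size !{i32 64, i32 1, i32 1}.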
  for (int I = 0; I < 3; I++) {
    Value *GroupSize = GroupSizes[I];
    if (!GroupSize)
      continue;

    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
    GroupSize->replaceAllUsesWith(
        ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
    MadeChange = true;
  }

  return MadeChange;
}

// TODO: Move makeLIDRangeMetadata usage into here. Seems to not get
// TargetPassConfig for subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
  bool MadeChange = false;
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return false;

  SmallPtrSet<Instruction *, 4> HandledUses;
  for (auto *U : BasePtr->users()) {
    CallInst *CI = cast<CallInst>(U);
    if (HandledUses.insert(CI).second) {
      if (processUse(CI, IsV5OrAbove))
        MadeChange = true;
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                      "AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
                    "AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
  return new AMDGPULowerKernelAttributes();
}

PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool IsV5OrAbove =
      AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
  Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);

  if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
    return PreservedAnalyses::all();

  for (Instruction &I : instructions(F)) {
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      if (CI->getCalledFunction() == BasePtr)
        processUse(CI, IsV5OrAbove);
    }
  }

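  // Conservatively report everything preserved: only scalar values were
  // replaced with constants and the CFG is untouched (the legacy pass likewise
  // declares setPreservesAll()).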
  return PreservedAnalyses::all();
}