//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;

  bool addFeatureAttributes(Function &F);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}
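
// Iteratively walk a constant's operands with an explicit stack and a visited
// set, and report whether any constant addrspacecast expression in the tree
// requires the queue pointer.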
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}
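
// Propagate the attribute Name from Callee to Parent if the callee has it;
// returns true if the callee carried the attribute.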
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }

  return false;
}
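
// Copy the implicit-argument attributes a callee is already known to need up
// to its caller, and note separately whether the queue pointer is required.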
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}
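
// Scan every instruction in the function for intrinsic calls, calls to other
// functions, addrspacecasts, and constant-expression operands that imply an
// implicit kernel argument or the queue pointer, and record each requirement
// as a function attribute.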
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}
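
// Annotate every function defined in this SCC; declarations have no body to
// inspect.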
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  Module &M = SCC.getCallGraph().getModule();
  Triple TT(M.getTargetTriple());

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}
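
// The subtarget is queried per-function through the TargetMachine, so fetch
// it once from TargetPassConfig when the pass is initialized.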
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}