//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {
class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode *, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace
char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)
// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}
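
// Return true if this constant expression is an addrspacecast whose source is
// the local or private address space, i.e. a cast that needs the queue ptr.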
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}
// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}
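
// If the callee already carries the attribute Name, copy it onto the caller
// and report that it was present.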
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }

  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}
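
// Walk the call edges collected in NodeList and propagate the
// uniform-work-group-size attribute from each caller to its callees.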
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}
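
// Update the uniform-work-group-size attribute on a single caller/callee pair.
// A "true" value is only propagated when the caller is known to have it; any
// uncertainty (externally defined callee, missing caller attribute) degrades
// to "false". Returns true if an attribute was added.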
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
       Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes from most number of uses to least
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;

    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}
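
// Cache the TargetMachine from TargetPassConfig; the subtarget is later
// queried for flat address space and aperture register support.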
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}