//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//
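//
// A sketch of the effect (illustrative IR; the names are hypothetical): a
// kernel that calls @llvm.amdgcn.workitem.id.y ends up annotated roughly as
//
//   define amdgpu_kernel void @k() #0 {
//     %id = call i32 @llvm.amdgcn.workitem.id.y()
//     ret void
//   }
//   attributes #0 = { "amdgpu-work-item-id-y" }
//
// so later lowering knows the y workitem id must be available.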

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

static constexpr StringLiteral ImplicitAttrNames[] = {
    // X ids unnecessarily propagated to kernels.
    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode *, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
      const Constant *EntryC,
      SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
      bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
    bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}
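
// Example of the mapping above: amdgcn_workgroup_id_y maps to
// "amdgpu-work-group-id-y" with the out-parameters untouched, while trap and
// debugtrap map to "amdgpu-queue-ptr" and set IsQueuePtr, since trap lowering
// may need the queue ptr (see the gfx9+ TODO above).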

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }

  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : ImplicitAttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function.
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }

  // Check if the Caller has the attribute.
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true.
    if (Caller.getFnAttribute("uniform-work-group-size")
            .getValueAsString()
            .equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it.
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false on both functions.
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool HasIndirectCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
  CallingConv::ID CC = F.getCallingConv();
  bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

  // If this function's address is taken, conservatively add all attributes
  // corresponding to the implicit args.
  if (CallingConvSupportsAllImplicits &&
      F.hasAddressTaken(nullptr, true, true, true)) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // Note the occurrence of an indirect call.
        if (!Callee) {
          if (!CB->isInlineAsm()) {
            HasIndirectCall = true;
            HaveCall = true;
          }
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  // This pass cannot copy attributes from callees to callers if there is an
  // indirect call, and in such cases hasAddressTaken() would be false for
  // kernels and functions making an indirect call (if they are themselves
  // not indirectly called). We must tag all such kernels/functions with all
  // implicit attributes for correctness, e.g.:
  // 1. Kernel K1 makes an indirect call to function F1.
  //    Without detecting an indirect call in K1, this pass will not
  //    add all implicit args to K1 (which is incorrect).
  // 2. Kernel K1 makes a direct call to F1, which makes an indirect call to
  //    function F2.
  //    Without detecting the indirect call in F1 (whose hasAddressTaken() is
  //    false), the pass will not add all implicit args to F1 (which is
  //    essential for correctness).
  if (CallingConvSupportsAllImplicits && HasIndirectCall) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  return Changed;
}
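
// Worked example (hypothetical kernel): a kernel containing an alloca and a
// direct call to a non-intrinsic function would leave this function with
// "amdgpu-stack-objects", "amdgpu-calls", and any implicit-arg attributes
// copied from the callee by copyFeaturesToFunction().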

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes from most to least number of uses.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
      continue;
    // Add feature attributes.
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}
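
// This factory is how the legacy pass manager instantiates the pass; the
// AMDGPU target pass config is the usual client, roughly (a sketch, not the
// verbatim scheduling code):
//
//   addPass(createAMDGPUAnnotateKernelFeaturesPass());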