lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp

//===- AMDGPUAnnotateKernelFeatures.cpp -----------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file This pass adds target attributes to functions which use intrinsics
  10 /// which will impact calling convention lowering.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "AMDGPU.h"
  15 #include "AMDGPUSubtarget.h"
  16 #include "Utils/AMDGPUBaseInfo.h"
  17 #include "llvm/ADT/SmallPtrSet.h"
  18 #include "llvm/ADT/SmallVector.h"
  19 #include "llvm/ADT/StringRef.h"
  20 #include "llvm/ADT/Triple.h"
  21 #include "llvm/Analysis/CallGraph.h"
  22 #include "llvm/Analysis/CallGraphSCCPass.h"
  23 #include "llvm/CodeGen/TargetPassConfig.h"
  24 #include "llvm/IR/CallSite.h"
  25 #include "llvm/IR/Constant.h"
  26 #include "llvm/IR/Constants.h"
  27 #include "llvm/IR/Function.h"
  28 #include "llvm/IR/Instruction.h"
  29 #include "llvm/IR/Instructions.h"
  30 #include "llvm/IR/Intrinsics.h"
  31 #include "llvm/IR/Module.h"
  32 #include "llvm/IR/Type.h"
  33 #include "llvm/IR/Use.h"
  34 #include "llvm/Pass.h"
  35 #include "llvm/Support/Casting.h"
  36 #include "llvm/Support/ErrorHandling.h"
  37 #include "llvm/Target/TargetMachine.h"
  38
  39 #define DEBUG_TYPE "amdgpu-annotate-kernel-features"
  40
  41 using namespace llvm;
  42
  43 namespace {
  44
  45 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
  46 private:
  47   const TargetMachine *TM = nullptr;
  48   SmallVector<CallGraphNode*, 8> NodeList;
  49
  50   bool addFeatureAttributes(Function &F);
  51   bool processUniformWorkGroupAttribute();
  52   bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
  53
  54 public:
  55   static char ID;
  56
  57   AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
  58
  59   bool doInitialization(CallGraph &CG) override;
  60   bool runOnSCC(CallGraphSCC &SCC) override;
  61
  62   StringRef getPassName() const override {
  63     return "AMDGPU Annotate Kernel Features";
  64   }
  65
  66   void getAnalysisUsage(AnalysisUsage &AU) const override {
  67     AU.setPreservesAll();
  68     CallGraphSCCPass::getAnalysisUsage(AU);
  69   }
  70
  71   static bool visitConstantExpr(const ConstantExpr *CE);
  72   static bool visitConstantExprsRecursively(
  73     const Constant *EntryC,
  74     SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
  75 };
  76
  77 } // end anonymous namespace
  78
// Definition of the pass identification member that the constructor hands to
// CallGraphSCCPass.
char AMDGPUAnnotateKernelFeatures::ID = 0;

// Reference to the same ID object, defined under the llvm namespace.
char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

// Pass registration; DEBUG_TYPE ("amdgpu-annotate-kernel-features") is passed
// as the second macro argument.
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)
  85
  86
  87 // The queue ptr is only needed when casting to flat, not from it.
  88 static bool castRequiresQueuePtr(unsigned SrcAS) {
  89   return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
  90 }
  91
  92 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  93   return castRequiresQueuePtr(ASC->getSrcAddressSpace());
  94 }
  95
  96 bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  97   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
  98     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
  99     return castRequiresQueuePtr(SrcAS);
 100   }
 101
 102   return false;
 103 }
 104
 105 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
 106   const Constant *EntryC,
 107   SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
 108
 109   if (!ConstantExprVisited.insert(EntryC).second)
 110     return false;
 111
 112   SmallVector<const Constant *, 16> Stack;
 113   Stack.push_back(EntryC);
 114
 115   while (!Stack.empty()) {
 116     const Constant *C = Stack.pop_back_val();
 117
 118     // Check this constant expression.
 119     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
 120       if (visitConstantExpr(CE))
 121         return true;
 122     }
 123
 124     // Visit all sub-expressions.
 125     for (const Use &U : C->operands()) {
 126       const auto *OpC = dyn_cast<Constant>(U);
 127       if (!OpC)
 128         continue;
 129
 130       if (!ConstantExprVisited.insert(OpC).second)
 131         continue;
 132
 133       Stack.push_back(OpC);
 134     }
 135   }
 136
 137   return false;
 138 }
 139
 140 // We do not need to note the x workitem or workgroup id because they are always
 141 // initialized.
 142 //
 143 // TODO: We should not add the attributes if the known compile time workgroup
 144 // size is 1 for y/z.
 145 static StringRef intrinsicToAttrName(Intrinsic::ID ID,
 146                                      bool &NonKernelOnly,
 147                                      bool &IsQueuePtr) {
 148   switch (ID) {
 149   case Intrinsic::amdgcn_workitem_id_x:
 150     NonKernelOnly = true;
 151     return "amdgpu-work-item-id-x";
 152   case Intrinsic::amdgcn_workgroup_id_x:
 153     NonKernelOnly = true;
 154     return "amdgpu-work-group-id-x";
 155   case Intrinsic::amdgcn_workitem_id_y:
 156   case Intrinsic::r600_read_tidig_y:
 157     return "amdgpu-work-item-id-y";
 158   case Intrinsic::amdgcn_workitem_id_z:
 159   case Intrinsic::r600_read_tidig_z:
 160     return "amdgpu-work-item-id-z";
 161   case Intrinsic::amdgcn_workgroup_id_y:
 162   case Intrinsic::r600_read_tgid_y:
 163     return "amdgpu-work-group-id-y";
 164   case Intrinsic::amdgcn_workgroup_id_z:
 165   case Intrinsic::r600_read_tgid_z:
 166     return "amdgpu-work-group-id-z";
 167   case Intrinsic::amdgcn_dispatch_ptr:
 168     return "amdgpu-dispatch-ptr";
 169   case Intrinsic::amdgcn_dispatch_id:
 170     return "amdgpu-dispatch-id";
 171   case Intrinsic::amdgcn_kernarg_segment_ptr:
 172     return "amdgpu-kernarg-segment-ptr";
 173   case Intrinsic::amdgcn_implicitarg_ptr:
 174     return "amdgpu-implicitarg-ptr";
 175   case Intrinsic::amdgcn_queue_ptr:
 176   case Intrinsic::amdgcn_is_shared:
 177   case Intrinsic::amdgcn_is_private:
 178     // TODO: Does not require queue ptr on gfx9+
 179   case Intrinsic::trap:
 180   case Intrinsic::debugtrap:
 181     IsQueuePtr = true;
 182     return "amdgpu-queue-ptr";
 183   default:
 184     return "";
 185   }
 186 }
 187
 188 static bool handleAttr(Function &Parent, const Function &Callee,
 189                        StringRef Name) {
 190   if (Callee.hasFnAttribute(Name)) {
 191     Parent.addFnAttr(Name);
 192     return true;
 193   }
 194   return false;
 195 }
 196
 197 static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
 198                                    bool &NeedQueuePtr) {
 199   // X ids unnecessarily propagated to kernels.
 200   static constexpr StringLiteral AttrNames[] = {
 201       "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
 202       "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
 203       "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
 204       "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
 205       "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};
 206
 207   if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
 208     NeedQueuePtr = true;
 209
 210   for (StringRef AttrName : AttrNames)
 211     handleAttr(Parent, Callee, AttrName);
 212 }
 213
 214 bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
 215   bool Changed = false;
 216
 217   for (auto *Node : reverse(NodeList)) {
 218     Function *Caller = Node->getFunction();
 219
 220     for (auto I : *Node) {
 221       Function *Callee = std::get<1>(I)->getFunction();
 222       if (Callee)
 223         Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
 224     }
 225   }
 226
 227   return Changed;
 228 }
 229
 230 bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
 231        Function &Caller, Function &Callee) {
 232
 233   // Check for externally defined function
 234   if (!Callee.hasExactDefinition()) {
 235     Callee.addFnAttr("uniform-work-group-size", "false");
 236     if (!Caller.hasFnAttribute("uniform-work-group-size"))
 237       Caller.addFnAttr("uniform-work-group-size", "false");
 238
 239     return true;
 240   }
 241   // Check if the Caller has the attribute
 242   if (Caller.hasFnAttribute("uniform-work-group-size")) {
 243     // Check if the value of the attribute is true
 244     if (Caller.getFnAttribute("uniform-work-group-size")
 245         .getValueAsString().equals("true")) {
 246       // Propagate the attribute to the Callee, if it does not have it
 247       if (!Callee.hasFnAttribute("uniform-work-group-size")) {
 248         Callee.addFnAttr("uniform-work-group-size", "true");
 249         return true;
 250       }
 251     } else {
 252       Callee.addFnAttr("uniform-work-group-size", "false");
 253       return true;
 254     }
 255   } else {
 256     // If the attribute is absent, set it as false
 257     Caller.addFnAttr("uniform-work-group-size", "false");
 258     Callee.addFnAttr("uniform-work-group-size", "false");
 259     return true;
 260   }
 261   return false;
 262 }
 263
 264 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
 265   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
 266   bool HasFlat = ST.hasFlatAddressSpace();
 267   bool HasApertureRegs = ST.hasApertureRegs();
 268   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 269
 270   bool Changed = false;
 271   bool NeedQueuePtr = false;
 272   bool HaveCall = false;
 273   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
 274
 275   for (BasicBlock &BB : F) {
 276     for (Instruction &I : BB) {
 277       CallSite CS(&I);
 278       if (CS) {
 279         Function *Callee = CS.getCalledFunction();
 280
 281         // TODO: Do something with indirect calls.
 282         if (!Callee) {
 283           if (!CS.isInlineAsm())
 284             HaveCall = true;
 285           continue;
 286         }
 287
 288         Intrinsic::ID IID = Callee->getIntrinsicID();
 289         if (IID == Intrinsic::not_intrinsic) {
 290           HaveCall = true;
 291           copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
 292           Changed = true;
 293         } else {
 294           bool NonKernelOnly = false;
 295           StringRef AttrName = intrinsicToAttrName(IID,
 296                                                    NonKernelOnly, NeedQueuePtr);
 297           if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
 298             F.addFnAttr(AttrName);
 299             Changed = true;
 300           }
 301         }
 302       }
 303
 304       if (NeedQueuePtr || HasApertureRegs)
 305         continue;
 306
 307       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
 308         if (castRequiresQueuePtr(ASC)) {
 309           NeedQueuePtr = true;
 310           continue;
 311         }
 312       }
 313
 314       for (const Use &U : I.operands()) {
 315         const auto *OpC = dyn_cast<Constant>(U);
 316         if (!OpC)
 317           continue;
 318
 319         if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
 320           NeedQueuePtr = true;
 321           break;
 322         }
 323       }
 324     }
 325   }
 326
 327   if (NeedQueuePtr) {
 328     F.addFnAttr("amdgpu-queue-ptr");
 329     Changed = true;
 330   }
 331
 332   // TODO: We could refine this to captured pointers that could possibly be
 333   // accessed by flat instructions. For now this is mostly a poor way of
 334   // estimating whether there are calls before argument lowering.
 335   if (HasFlat && !IsFunc && HaveCall) {
 336     F.addFnAttr("amdgpu-flat-scratch");
 337     Changed = true;
 338   }
 339
 340   return Changed;
 341 }
 342
 343 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
 344   bool Changed = false;
 345
 346   for (CallGraphNode *I : SCC) {
 347     // Build a list of CallGraphNodes from most number of uses to least
 348     if (I->getNumReferences())
 349       NodeList.push_back(I);
 350     else {
 351       processUniformWorkGroupAttribute();
 352       NodeList.clear();
 353     }
 354
 355     Function *F = I->getFunction();
 356     // Add feature attributes
 357     if (!F || F->isDeclaration())
 358       continue;
 359     Changed |= addFeatureAttributes(*F);
 360   }
 361
 362   return Changed;
 363 }
 364
 365 bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
 366   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
 367   if (!TPC)
 368     report_fatal_error("TargetMachine is required");
 369
 370   TM = &TPC->getTM<TargetMachine>();
 371   return false;
 372 }
 373
 374 Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
 375   return new AMDGPUAnnotateKernelFeatures();
 376 }