clang/lib/CodeGen/CGCUDANV.cpp

   1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA
  10 // runtime library.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "CGCUDARuntime.h"
  15 #include "CGCXXABI.h"
  16 #include "CodeGenFunction.h"
  17 #include "CodeGenModule.h"
  18 #include "clang/AST/Decl.h"
  19 #include "clang/Basic/Cuda.h"
  20 #include "clang/CodeGen/CodeGenABITypes.h"
  21 #include "clang/CodeGen/ConstantInitBuilder.h"
  22 #include "llvm/Frontend/Offloading/Utility.h"
  23 #include "llvm/IR/BasicBlock.h"
  24 #include "llvm/IR/Constants.h"
  25 #include "llvm/IR/DerivedTypes.h"
  26 #include "llvm/IR/ReplaceConstant.h"
  27 #include "llvm/Support/Format.h"
  28 #include "llvm/Support/VirtualFileSystem.h"
  29
  30 using namespace clang;
  31 using namespace CodeGen;
  32
  33 namespace {
  34 constexpr unsigned CudaFatMagic = 0x466243b1;
  35 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
  36
  37 class CGNVCUDARuntime : public CGCUDARuntime {
  38
  39 private:
  40   llvm::IntegerType *IntTy, *SizeTy;
  41   llvm::Type *VoidTy;
  42   llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
  43
  44   /// Convenience reference to LLVM Context
  45   llvm::LLVMContext &Context;
  46   /// Convenience reference to the current module
  47   llvm::Module &TheModule;
  48   /// Keeps track of kernel launch stubs and handles emitted in this module
  49   struct KernelInfo {
  50     llvm::Function *Kernel; // stub function to help launch kernel
  51     const Decl *D;
  52   };
  53   llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  54   // Map a kernel mangled name to a symbol for identifying kernel in host code
  55   // For CUDA, the symbol for identifying the kernel is the same as the device
  56   // stub function. For HIP, they are different.
  57   llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  58   // Map a kernel handle to the kernel stub.
  59   llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  60   struct VarInfo {
  61     llvm::GlobalVariable *Var;
  62     const VarDecl *D;
  63     DeviceVarFlags Flags;
  64   };
  65   llvm::SmallVector<VarInfo, 16> DeviceVars;
  66   /// Keeps track of variable containing handle of GPU binary. Populated by
  67   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  68   /// ModuleDtorFunction()
  69   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  70   /// Whether we generate relocatable device code.
  71   bool RelocatableDeviceCode;
  72   /// Mangle context for device.
  73   std::unique_ptr<MangleContext> DeviceMC;
  74   /// Some zeros used for GEPs.
  75   llvm::Constant *Zeros[2];
  76
  77   llvm::FunctionCallee getSetupArgumentFn() const;
  78   llvm::FunctionCallee getLaunchFn() const;
  79
  80   llvm::FunctionType *getRegisterGlobalsFnTy() const;
  81   llvm::FunctionType *getCallbackFnTy() const;
  82   llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  83   std::string addPrefixToName(StringRef FuncName) const;
  84   std::string addUnderscoredPrefixToName(StringRef FuncName) const;
  85
  86   /// Creates a function to register all kernel stubs generated in this module.
  87   llvm::Function *makeRegisterGlobalsFn();
  88
  89   /// Helper function that generates a constant string and returns a pointer to
  90   /// the start of the string.  The result of this function can be used anywhere
  91   /// where the C code specifies const char*.
  92   llvm::Constant *makeConstantString(const std::string &Str,
  93                                      const std::string &Name = "") {
  94     auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
  95     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
  96                                                 ConstStr.getPointer(), Zeros);
  97   }
  98
  99   /// Helper function which generates an initialized constant array from Str,
 100   /// and optionally sets section name and alignment. AddNull specifies whether
 101   /// the array should nave NUL termination.
 102   llvm::Constant *makeConstantArray(StringRef Str,
 103                                     StringRef Name = "",
 104                                     StringRef SectionName = "",
 105                                     unsigned Alignment = 0,
 106                                     bool AddNull = false) {
 107     llvm::Constant *Value =
 108         llvm::ConstantDataArray::getString(Context, Str, AddNull);
 109     auto *GV = new llvm::GlobalVariable(
 110         TheModule, Value->getType(), /*isConstant=*/true,
 111         llvm::GlobalValue::PrivateLinkage, Value, Name);
 112     if (!SectionName.empty()) {
 113       GV->setSection(SectionName);
 114       // Mark the address as used which make sure that this section isn't
 115       // merged and we will really have it in the object file.
 116       GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
 117     }
 118     if (Alignment)
 119       GV->setAlignment(llvm::Align(Alignment));
 120     return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
 121   }
 122
 123   /// Helper function that generates an empty dummy function returning void.
 124   llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
 125     assert(FnTy->getReturnType()->isVoidTy() &&
 126            "Can only generate dummy functions returning void!");
 127     llvm::Function *DummyFunc = llvm::Function::Create(
 128         FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
 129
 130     llvm::BasicBlock *DummyBlock =
 131         llvm::BasicBlock::Create(Context, "", DummyFunc);
 132     CGBuilderTy FuncBuilder(CGM, Context);
 133     FuncBuilder.SetInsertPoint(DummyBlock);
 134     FuncBuilder.CreateRetVoid();
 135
 136     return DummyFunc;
 137   }
 138
 139   void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
 140   void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
 141   std::string getDeviceSideName(const NamedDecl *ND) override;
 142
 143   void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
 144                          bool Extern, bool Constant) {
 145     DeviceVars.push_back({&Var,
 146                           VD,
 147                           {DeviceVarFlags::Variable, Extern, Constant,
 148                            VD->hasAttr<HIPManagedAttr>(),
 149                            /*Normalized*/ false, 0}});
 150   }
 151   void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
 152                           bool Extern, int Type) {
 153     DeviceVars.push_back({&Var,
 154                           VD,
 155                           {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
 156                            /*Managed*/ false,
 157                            /*Normalized*/ false, Type}});
 158   }
 159   void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
 160                          bool Extern, int Type, bool Normalized) {
 161     DeviceVars.push_back({&Var,
 162                           VD,
 163                           {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
 164                            /*Managed*/ false, Normalized, Type}});
 165   }
 166
 167   /// Creates module constructor function
 168   llvm::Function *makeModuleCtorFunction();
 169   /// Creates module destructor function
 170   llvm::Function *makeModuleDtorFunction();
 171   /// Transform managed variables for device compilation.
 172   void transformManagedVars();
 173   /// Create offloading entries to register globals in RDC mode.
 174   void createOffloadingEntries();
 175
 176 public:
 177   CGNVCUDARuntime(CodeGenModule &CGM);
 178
 179   llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
 180   llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
 181     auto Loc = KernelStubs.find(Handle);
 182     assert(Loc != KernelStubs.end());
 183     return Loc->second;
 184   }
 185   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
 186   void handleVarRegistration(const VarDecl *VD,
 187                              llvm::GlobalVariable &Var) override;
 188   void
 189   internalizeDeviceSideVar(const VarDecl *D,
 190                            llvm::GlobalValue::LinkageTypes &Linkage) override;
 191
 192   llvm::Function *finalizeModule() override;
 193 };
 194
 195 } // end anonymous namespace
 196
 197 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
 198   if (CGM.getLangOpts().HIP)
 199     return ((Twine("hip") + Twine(FuncName)).str());
 200   return ((Twine("cuda") + Twine(FuncName)).str());
 201 }
 202 std::string
 203 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
 204   if (CGM.getLangOpts().HIP)
 205     return ((Twine("__hip") + Twine(FuncName)).str());
 206   return ((Twine("__cuda") + Twine(FuncName)).str());
 207 }
 208
 209 static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
 210   // If the host and device have different C++ ABIs, mark it as the device
 211   // mangle context so that the mangling needs to retrieve the additional
 212   // device lambda mangling number instead of the regular host one.
 213   if (CGM.getContext().getAuxTargetInfo() &&
 214       CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
 215       CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
 216     return std::unique_ptr<MangleContext>(
 217         CGM.getContext().createDeviceMangleContext(
 218             *CGM.getContext().getAuxTargetInfo()));
 219   }
 220
 221   return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
 222       CGM.getContext().getAuxTargetInfo()));
 223 }
 224
 225 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
 226     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
 227       TheModule(CGM.getModule()),
 228       RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
 229       DeviceMC(InitDeviceMC(CGM)) {
 230   IntTy = CGM.IntTy;
 231   SizeTy = CGM.SizeTy;
 232   VoidTy = CGM.VoidTy;
 233   Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
 234   Zeros[1] = Zeros[0];
 235
 236   CharPtrTy = CGM.UnqualPtrTy;
 237   VoidPtrTy = CGM.UnqualPtrTy;
 238   VoidPtrPtrTy = CGM.UnqualPtrTy;
 239 }
 240
 241 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
 242   // cudaError_t cudaSetupArgument(void *, size_t, size_t)
 243   llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
 244   return CGM.CreateRuntimeFunction(
 245       llvm::FunctionType::get(IntTy, Params, false),
 246       addPrefixToName("SetupArgument"));
 247 }
 248
 249 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
 250   if (CGM.getLangOpts().HIP) {
 251     // hipError_t hipLaunchByPtr(char *);
 252     return CGM.CreateRuntimeFunction(
 253         llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
 254   }
 255   // cudaError_t cudaLaunch(char *);
 256   return CGM.CreateRuntimeFunction(
 257       llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
 258 }
 259
 260 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
 261   return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
 262 }
 263
 264 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
 265   return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
 266 }
 267
 268 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
 269   llvm::Type *Params[] = {llvm::PointerType::getUnqual(Context), VoidPtrTy,
 270                           VoidPtrTy, llvm::PointerType::getUnqual(Context)};
 271   return llvm::FunctionType::get(VoidTy, Params, false);
 272 }
 273
 274 std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
 275   GlobalDecl GD;
 276   // D could be either a kernel or a variable.
 277   if (auto *FD = dyn_cast<FunctionDecl>(ND))
 278     GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
 279   else
 280     GD = GlobalDecl(ND);
 281   std::string DeviceSideName;
 282   MangleContext *MC;
 283   if (CGM.getLangOpts().CUDAIsDevice)
 284     MC = &CGM.getCXXABI().getMangleContext();
 285   else
 286     MC = DeviceMC.get();
 287   if (MC->shouldMangleDeclName(ND)) {
 288     SmallString<256> Buffer;
 289     llvm::raw_svector_ostream Out(Buffer);
 290     MC->mangleName(GD, Out);
 291     DeviceSideName = std::string(Out.str());
 292   } else
 293     DeviceSideName = std::string(ND->getIdentifier()->getName());
 294
 295   // Make unique name for device side static file-scope variable for HIP.
 296   if (CGM.getContext().shouldExternalize(ND) &&
 297       CGM.getLangOpts().GPURelocatableDeviceCode) {
 298     SmallString<256> Buffer;
 299     llvm::raw_svector_ostream Out(Buffer);
 300     Out << DeviceSideName;
 301     CGM.printPostfixForExternalizedDecl(Out, ND);
 302     DeviceSideName = std::string(Out.str());
 303   }
 304   return DeviceSideName;
 305 }
 306
 307 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
 308                                      FunctionArgList &Args) {
 309   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
 310   if (auto *GV =
 311           dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
 312     GV->setLinkage(CGF.CurFn->getLinkage());
 313     GV->setInitializer(CGF.CurFn);
 314   }
 315   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
 316                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
 317       (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
 318     emitDeviceStubBodyNew(CGF, Args);
 319   else
 320     emitDeviceStubBodyLegacy(CGF, Args);
 321 }
 322
 323 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
 324 // array and kernels are launched using cudaLaunchKernel().
 325 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
 326                                             FunctionArgList &Args) {
 327   // Build the shadow stack entry at the very start of the function.
 328
 329   // Calculate amount of space we will need for all arguments.  If we have no
 330   // args, allocate a single pointer so we still have a valid pointer to the
 331   // argument array that we can pass to runtime, even if it will be unused.
 332   Address KernelArgs = CGF.CreateTempAlloca(
 333       VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
 334       llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
 335   // Store pointers to the arguments in a locally allocated launch_args.
 336   for (unsigned i = 0; i < Args.size(); ++i) {
 337     llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
 338     llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
 339     CGF.Builder.CreateDefaultAlignedStore(
 340         VoidVarPtr,
 341         CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i));
 342   }
 343
 344   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
 345
 346   // Lookup cudaLaunchKernel/hipLaunchKernel function.
 347   // HIP kernel launching API name depends on -fgpu-default-stream option. For
 348   // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
 349   // it is hipLaunchKernel_spt.
 350   // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
 351   //                              void **args, size_t sharedMem,
 352   //                              cudaStream_t stream);
 353   // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
 354   //                                  dim3 blockDim, void **args,
 355   //                                  size_t sharedMem, hipStream_t stream);
 356   TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
 357   DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
 358   std::string KernelLaunchAPI = "LaunchKernel";
 359   if (CGF.getLangOpts().GPUDefaultStream ==
 360       LangOptions::GPUDefaultStreamKind::PerThread) {
 361     if (CGF.getLangOpts().HIP)
 362       KernelLaunchAPI = KernelLaunchAPI + "_spt";
 363     else if (CGF.getLangOpts().CUDA)
 364       KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
 365   }
 366   auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
 367   IdentifierInfo &cudaLaunchKernelII =
 368       CGM.getContext().Idents.get(LaunchKernelName);
 369   FunctionDecl *cudaLaunchKernelFD = nullptr;
 370   for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
 371     if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
 372       cudaLaunchKernelFD = FD;
 373   }
 374
 375   if (cudaLaunchKernelFD == nullptr) {
 376     CGM.Error(CGF.CurFuncDecl->getLocation(),
 377               "Can't find declaration for " + LaunchKernelName);
 378     return;
 379   }
 380   // Create temporary dim3 grid_dim, block_dim.
 381   ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
 382   QualType Dim3Ty = GridDimParam->getType();
 383   Address GridDim =
 384       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
 385   Address BlockDim =
 386       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
 387   Address ShmemSize =
 388       CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
 389   Address Stream =
 390       CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
 391   llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
 392       llvm::FunctionType::get(IntTy,
 393                               {/*gridDim=*/GridDim.getType(),
 394                                /*blockDim=*/BlockDim.getType(),
 395                                /*ShmemSize=*/ShmemSize.getType(),
 396                                /*Stream=*/Stream.getType()},
 397                               /*isVarArg=*/false),
 398       addUnderscoredPrefixToName("PopCallConfiguration"));
 399
 400   CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
 401                               {GridDim.getPointer(), BlockDim.getPointer(),
 402                                ShmemSize.getPointer(), Stream.getPointer()});
 403
 404   // Emit the call to cudaLaunch
 405   llvm::Value *Kernel = CGF.Builder.CreatePointerCast(
 406       KernelHandles[CGF.CurFn->getName()], VoidPtrTy);
 407   CallArgList LaunchKernelArgs;
 408   LaunchKernelArgs.add(RValue::get(Kernel),
 409                        cudaLaunchKernelFD->getParamDecl(0)->getType());
 410   LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
 411   LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
 412   LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
 413                        cudaLaunchKernelFD->getParamDecl(3)->getType());
 414   LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
 415                        cudaLaunchKernelFD->getParamDecl(4)->getType());
 416   LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
 417                        cudaLaunchKernelFD->getParamDecl(5)->getType());
 418
 419   QualType QT = cudaLaunchKernelFD->getType();
 420   QualType CQT = QT.getCanonicalType();
 421   llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
 422   llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
 423
 424   const CGFunctionInfo &FI =
 425       CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
 426   llvm::FunctionCallee cudaLaunchKernelFn =
 427       CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
 428   CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
 429                LaunchKernelArgs);
 430   CGF.EmitBranch(EndBlock);
 431
 432   CGF.EmitBlock(EndBlock);
 433 }
 434
 435 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
 436                                                FunctionArgList &Args) {
 437   // Emit a call to cudaSetupArgument for each arg in Args.
 438   llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
 439   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
 440   CharUnits Offset = CharUnits::Zero();
 441   for (const VarDecl *A : Args) {
 442     auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
 443     Offset = Offset.alignTo(TInfo.Align);
 444     llvm::Value *Args[] = {
 445         CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
 446                                       VoidPtrTy),
 447         llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
 448         llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
 449     };
 450     llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
 451     llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
 452     llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
 453     llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
 454     CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
 455     CGF.EmitBlock(NextBlock);
 456     Offset += TInfo.Width;
 457   }
 458
 459   // Emit the call to cudaLaunch
 460   llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
 461   llvm::Value *Arg = CGF.Builder.CreatePointerCast(
 462       KernelHandles[CGF.CurFn->getName()], CharPtrTy);
 463   CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
 464   CGF.EmitBranch(EndBlock);
 465
 466   CGF.EmitBlock(EndBlock);
 467 }
 468
 469 // Replace the original variable Var with the address loaded from variable
 470 // ManagedVar populated by HIP runtime.
 471 static void replaceManagedVar(llvm::GlobalVariable *Var,
 472                               llvm::GlobalVariable *ManagedVar) {
 473   SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
 474   for (auto &&VarUse : Var->uses()) {
 475     WorkList.push_back({VarUse.getUser()});
 476   }
 477   while (!WorkList.empty()) {
 478     auto &&WorkItem = WorkList.pop_back_val();
 479     auto *U = WorkItem.back();
 480     if (isa<llvm::ConstantExpr>(U)) {
 481       for (auto &&UU : U->uses()) {
 482         WorkItem.push_back(UU.getUser());
 483         WorkList.push_back(WorkItem);
 484         WorkItem.pop_back();
 485       }
 486       continue;
 487     }
 488     if (auto *I = dyn_cast<llvm::Instruction>(U)) {
 489       llvm::Value *OldV = Var;
 490       llvm::Instruction *NewV =
 491           new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
 492                              llvm::Align(Var->getAlignment()), I);
 493       WorkItem.pop_back();
 494       // Replace constant expressions directly or indirectly using the managed
 495       // variable with instructions.
 496       for (auto &&Op : WorkItem) {
 497         auto *CE = cast<llvm::ConstantExpr>(Op);
 498         auto *NewInst = CE->getAsInstruction(I);
 499         NewInst->replaceUsesOfWith(OldV, NewV);
 500         OldV = CE;
 501         NewV = NewInst;
 502       }
 503       I->replaceUsesOfWith(OldV, NewV);
 504     } else {
 505       llvm_unreachable("Invalid use of managed variable");
 506     }
 507   }
 508 }
 509
 510 /// Creates a function that sets up state on the host side for CUDA objects that
 511 /// have a presence on both the host and device sides. Specifically, registers
 512 /// the host side of kernel functions and device global variables with the CUDA
 513 /// runtime.
 514 /// \code
 515 /// void __cuda_register_globals(void** GpuBinaryHandle) {
 516 ///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
 517 ///    ...
 518 ///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
 519 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
 520 ///    ...
 521 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
 522 /// }
 523 /// \endcode
 524 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
 525   // No need to register anything
 526   if (EmittedKernels.empty() && DeviceVars.empty())
 527     return nullptr;
 528
 529   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
 530       getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
 531       addUnderscoredPrefixToName("_register_globals"), &TheModule);
 532   llvm::BasicBlock *EntryBB =
 533       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
 534   CGBuilderTy Builder(CGM, Context);
 535   Builder.SetInsertPoint(EntryBB);
 536
 537   // void __cudaRegisterFunction(void **, const char *, char *, const char *,
 538   //                             int, uint3*, uint3*, dim3*, dim3*, int*)
 539   llvm::Type *RegisterFuncParams[] = {
 540       VoidPtrPtrTy, CharPtrTy,
 541       CharPtrTy,    CharPtrTy,
 542       IntTy,        VoidPtrTy,
 543       VoidPtrTy,    VoidPtrTy,
 544       VoidPtrTy,    llvm::PointerType::getUnqual(Context)};
 545   llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
 546       llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
 547       addUnderscoredPrefixToName("RegisterFunction"));
 548
 549   // Extract GpuBinaryHandle passed as the first argument passed to
 550   // __cuda_register_globals() and generate __cudaRegisterFunction() call for
 551   // each emitted kernel.
 552   llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
 553   for (auto &&I : EmittedKernels) {
 554     llvm::Constant *KernelName =
 555         makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
 556     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
 557     llvm::Value *Args[] = {
 558         &GpuBinaryHandlePtr,
 559         KernelHandles[I.Kernel->getName()],
 560         KernelName,
 561         KernelName,
 562         llvm::ConstantInt::get(IntTy, -1),
 563         NullPtr,
 564         NullPtr,
 565         NullPtr,
 566         NullPtr,
 567         llvm::ConstantPointerNull::get(llvm::PointerType::getUnqual(Context))};
 568     Builder.CreateCall(RegisterFunc, Args);
 569   }
 570
 571   llvm::Type *VarSizeTy = IntTy;
 572   // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
 573   if (CGM.getLangOpts().HIP ||
 574       ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
 575     VarSizeTy = SizeTy;
 576
 577   // void __cudaRegisterVar(void **, char *, char *, const char *,
 578   //                        int, int, int, int)
 579   llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
 580                                      CharPtrTy,    IntTy,     VarSizeTy,
 581                                      IntTy,        IntTy};
 582   llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
 583       llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
 584       addUnderscoredPrefixToName("RegisterVar"));
 585   // void __hipRegisterManagedVar(void **, char *, char *, const char *,
 586   //                              size_t, unsigned)
 587   llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
 588                                             CharPtrTy,    VarSizeTy, IntTy};
 589   llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
 590       llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
 591       addUnderscoredPrefixToName("RegisterManagedVar"));
 592   // void __cudaRegisterSurface(void **, const struct surfaceReference *,
 593   //                            const void **, const char *, int, int);
 594   llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
 595       llvm::FunctionType::get(
 596           VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
 597           false),
 598       addUnderscoredPrefixToName("RegisterSurface"));
 599   // void __cudaRegisterTexture(void **, const struct textureReference *,
 600   //                            const void **, const char *, int, int, int)
 601   llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
 602       llvm::FunctionType::get(
 603           VoidTy,
 604           {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
 605           false),
 606       addUnderscoredPrefixToName("RegisterTexture"));
 607   for (auto &&Info : DeviceVars) {
 608     llvm::GlobalVariable *Var = Info.Var;
 609     assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
 610            "External variables should not show up here, except HIP managed "
 611            "variables");
 612     llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
 613     switch (Info.Flags.getKind()) {
 614     case DeviceVarFlags::Variable: {
 615       uint64_t VarSize =
 616           CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
 617       if (Info.Flags.isManaged()) {
 618         auto *ManagedVar = new llvm::GlobalVariable(
 619             CGM.getModule(), Var->getType(),
 620             /*isConstant=*/false, Var->getLinkage(),
 621             /*Init=*/Var->isDeclaration()
 622                 ? nullptr
 623                 : llvm::ConstantPointerNull::get(Var->getType()),
 624             /*Name=*/"", /*InsertBefore=*/nullptr,
 625             llvm::GlobalVariable::NotThreadLocal);
 626         ManagedVar->setDSOLocal(Var->isDSOLocal());
 627         ManagedVar->setVisibility(Var->getVisibility());
 628         ManagedVar->setExternallyInitialized(true);
 629         ManagedVar->takeName(Var);
 630         Var->setName(Twine(ManagedVar->getName() + ".managed"));
 631         replaceManagedVar(Var, ManagedVar);
 632         llvm::Value *Args[] = {
 633             &GpuBinaryHandlePtr,
 634             ManagedVar,
 635             Var,
 636             VarName,
 637             llvm::ConstantInt::get(VarSizeTy, VarSize),
 638             llvm::ConstantInt::get(IntTy, Var->getAlignment())};
 639         if (!Var->isDeclaration())
 640           Builder.CreateCall(RegisterManagedVar, Args);
 641       } else {
 642         llvm::Value *Args[] = {
 643             &GpuBinaryHandlePtr,
 644             Var,
 645             VarName,
 646             VarName,
 647             llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
 648             llvm::ConstantInt::get(VarSizeTy, VarSize),
 649             llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
 650             llvm::ConstantInt::get(IntTy, 0)};
 651         Builder.CreateCall(RegisterVar, Args);
 652       }
 653       break;
 654     }
 655     case DeviceVarFlags::Surface:
 656       Builder.CreateCall(
 657           RegisterSurf,
 658           {&GpuBinaryHandlePtr, Var, VarName, VarName,
 659            llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
 660            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
 661       break;
 662     case DeviceVarFlags::Texture:
 663       Builder.CreateCall(
 664           RegisterTex,
 665           {&GpuBinaryHandlePtr, Var, VarName, VarName,
 666            llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
 667            llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
 668            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
 669       break;
 670     }
 671   }
 672
 673   Builder.CreateRetVoid();
 674   return RegisterKernelsFunc;
 675 }
 676
/// Creates a global constructor function for the module.
///
/// Returns nullptr without emitting a constructor when:
///  - no GPU binary file name was given and this is not a HIP compilation;
///  - this is a HIP or non-RDC CUDA compilation and no kernels or device
///    variables were emitted;
///  - the GPU binary file cannot be read (a diagnostic is reported).
///
/// For CUDA without relocatable device code:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
///     __cudaRegisterFatBinaryEnd(Handle); // only if the SDK version needs it
///     atexit(__cuda_module_dtor);
/// }
/// \endcode
///
/// For CUDA with relocatable device code:
/// \code
/// void __cuda_module_ctor() {
///     __cudaRegisterLinkedBinary<ModuleID>(__cuda_register_globals,
///                                          &FatbinWrapper, ModuleID,
///                                          dummy_callback);
/// }
/// \endcode
///
/// For HIP (note that globals are registered after the handle check, not
/// inside it):
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
///     }
///     __hip_register_globals(__hip_gpubin_handle); // if there are any
///     atexit(__hip_module_dtor);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  // HIP and non-RDC CUDA need no constructor when nothing was emitted that
  // would have to be registered.
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  // The constructor is internal to this module; everything below emits into
  // it through CtorBuilder.
  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  // Section names, module ID naming, fat binary data and magic number all
  // differ between HIP and CUDA; they are selected in the branches below.
  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through link script.
      FatBinStr = new llvm::GlobalVariable(
        CGM.getModule(), CGM.Int8Ty,
        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
        "__hip_fatbin", nullptr,
        llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    // CudaGpuBinary is non-null here: the non-HIP case returned early above
    // when no GPU binary file name was given, and a failed read also returned.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    // The handle is internal when the fat binary was embedded from a file;
    // otherwise it is linkonce, placed in a comdat named after the variable
    // and given hidden visibility (see below).
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
        llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable is part
    // of HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
      GpuBinaryHandle->setComdat(
          CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, VoidPtrPtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      // entry: branch to "if" only when the handle is still null.
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      // exit: reached whether or not this constructor performed the
      // registration, so the handle is reloaded from the global.
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    // The ID is the module ID prefix followed by the wrapper's GUID in hex.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    // In RDC mode a dummy was created above if there was nothing to register,
    // so the callback must be non-null here.
    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}
 943
 944 /// Creates a global destructor function that unregisters the GPU code blob
 945 /// registered by constructor.
 946 ///
 947 /// For CUDA:
 948 /// \code
 949 /// void __cuda_module_dtor() {
 950 ///     __cudaUnregisterFatBinary(Handle);
 951 /// }
 952 /// \endcode
 953 ///
 954 /// For HIP:
 955 /// \code
 956 /// void __hip_module_dtor() {
 957 ///     if (__hip_gpubin_handle) {
 958 ///         __hipUnregisterFatBinary(__hip_gpubin_handle);
 959 ///         __hip_gpubin_handle = 0;
 960 ///     }
 961 /// }
 962 /// \endcode
 963 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
 964   // No need for destructor if we don't have a handle to unregister.
 965   if (!GpuBinaryHandle)
 966     return nullptr;
 967
 968   // void __cudaUnregisterFatBinary(void ** handle);
 969   llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
 970       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
 971       addUnderscoredPrefixToName("UnregisterFatBinary"));
 972
 973   llvm::Function *ModuleDtorFunc = llvm::Function::Create(
 974       llvm::FunctionType::get(VoidTy, false),
 975       llvm::GlobalValue::InternalLinkage,
 976       addUnderscoredPrefixToName("_module_dtor"), &TheModule);
 977
 978   llvm::BasicBlock *DtorEntryBB =
 979       llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
 980   CGBuilderTy DtorBuilder(CGM, Context);
 981   DtorBuilder.SetInsertPoint(DtorEntryBB);
 982
 983   Address GpuBinaryAddr(
 984       GpuBinaryHandle, GpuBinaryHandle->getValueType(),
 985       CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
 986   auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
 987   // There is only one HIP fat binary per linked module, however there are
 988   // multiple destructor functions. Make sure the fat binary is unregistered
 989   // only once.
 990   if (CGM.getLangOpts().HIP) {
 991     llvm::BasicBlock *IfBlock =
 992         llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
 993     llvm::BasicBlock *ExitBlock =
 994         llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
 995     llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
 996     llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
 997     DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
 998
 999     DtorBuilder.SetInsertPoint(IfBlock);
1000     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1001     DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
1002     DtorBuilder.CreateBr(ExitBlock);
1003
1004     DtorBuilder.SetInsertPoint(ExitBlock);
1005   } else {
1006     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1007   }
1008   DtorBuilder.CreateRetVoid();
1009   return ModuleDtorFunc;
1010 }
1011
1012 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
1013   return new CGNVCUDARuntime(CGM);
1014 }
1015
1016 void CGNVCUDARuntime::internalizeDeviceSideVar(
1017     const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
1018   // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
1019   // global variables become internal definitions. These have to be internal in
1020   // order to prevent name conflicts with global host variables with the same
1021   // name in a different TUs.
1022   //
1023   // For -fgpu-rdc, the shadow variables should not be internalized because
1024   // they may be accessed by different TU.
1025   if (CGM.getLangOpts().GPURelocatableDeviceCode)
1026     return;
1027
1028   // __shared__ variables are odd. Shadows do get created, but
1029   // they are not registered with the CUDA runtime, so they
1030   // can't really be used to access their device-side
1031   // counterparts. It's not clear yet whether it's nvcc's bug or
1032   // a feature, but we've got to do the same for compatibility.
1033   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
1034       D->hasAttr<CUDASharedAttr>() ||
1035       D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1036       D->getType()->isCUDADeviceBuiltinTextureType()) {
1037     Linkage = llvm::GlobalValue::InternalLinkage;
1038   }
1039 }
1040
1041 void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
1042                                             llvm::GlobalVariable &GV) {
1043   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
1044     // Shadow variables and their properties must be registered with CUDA
1045     // runtime. Skip Extern global variables, which will be registered in
1046     // the TU where they are defined.
1047     //
1048     // Don't register a C++17 inline variable. The local symbol can be
1049     // discarded and referencing a discarded local symbol from outside the
1050     // comdat (__cuda_register_globals) is disallowed by the ELF spec.
1051     //
1052     // HIP managed variables need to be always recorded in device and host
1053     // compilations for transformation.
1054     //
1055     // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
1056     // added to llvm.compiler-used, therefore they are safe to be registered.
1057     if ((!D->hasExternalStorage() && !D->isInline()) ||
1058         CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
1059         D->hasAttr<HIPManagedAttr>()) {
1060       registerDeviceVar(D, GV, !D->hasDefinition(),
1061                         D->hasAttr<CUDAConstantAttr>());
1062     }
1063   } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1064              D->getType()->isCUDADeviceBuiltinTextureType()) {
1065     // Builtin surfaces and textures and their template arguments are
1066     // also registered with CUDA runtime.
1067     const auto *TD = cast<ClassTemplateSpecializationDecl>(
1068         D->getType()->castAs<RecordType>()->getDecl());
1069     const TemplateArgumentList &Args = TD->getTemplateArgs();
1070     if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1071       assert(Args.size() == 2 &&
1072              "Unexpected number of template arguments of CUDA device "
1073              "builtin surface type.");
1074       auto SurfType = Args[1].getAsIntegral();
1075       if (!D->hasExternalStorage())
1076         registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
1077     } else {
1078       assert(Args.size() == 3 &&
1079              "Unexpected number of template arguments of CUDA device "
1080              "builtin texture type.");
1081       auto TexType = Args[1].getAsIntegral();
1082       auto Normalized = Args[2].getAsIntegral();
1083       if (!D->hasExternalStorage())
1084         registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
1085                           Normalized.getZExtValue());
1086     }
1087   }
1088 }
1089
1090 // Transform managed variables to pointers to managed variables in device code.
1091 // Each use of the original managed variable is replaced by a load from the
1092 // transformed managed variable. The transformed managed variable contains
1093 // the address of managed memory which will be allocated by the runtime.
1094 void CGNVCUDARuntime::transformManagedVars() {
1095   for (auto &&Info : DeviceVars) {
1096     llvm::GlobalVariable *Var = Info.Var;
1097     if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1098         Info.Flags.isManaged()) {
1099       auto *ManagedVar = new llvm::GlobalVariable(
1100           CGM.getModule(), Var->getType(),
1101           /*isConstant=*/false, Var->getLinkage(),
1102           /*Init=*/Var->isDeclaration()
1103               ? nullptr
1104               : llvm::ConstantPointerNull::get(Var->getType()),
1105           /*Name=*/"", /*InsertBefore=*/nullptr,
1106           llvm::GlobalVariable::NotThreadLocal,
1107           CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
1108       ManagedVar->setDSOLocal(Var->isDSOLocal());
1109       ManagedVar->setVisibility(Var->getVisibility());
1110       ManagedVar->setExternallyInitialized(true);
1111       replaceManagedVar(Var, ManagedVar);
1112       ManagedVar->takeName(Var);
1113       Var->setName(Twine(ManagedVar->getName()) + ".managed");
1114       // Keep managed variables even if they are not used in device code since
1115       // they need to be allocated by the runtime.
1116       if (!Var->isDeclaration()) {
1117         assert(!ManagedVar->isDeclaration());
1118         CGM.addCompilerUsedGlobal(Var);
1119         CGM.addCompilerUsedGlobal(ManagedVar);
1120       }
1121     }
1122   }
1123 }
1124
1125 // Creates offloading entries for all the kernels and globals that must be
1126 // registered. The linker will provide a pointer to this section so we can
1127 // register the symbols with the linked device image.
1128 void CGNVCUDARuntime::createOffloadingEntries() {
1129   StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
1130                                             : "cuda_offloading_entries";
1131   llvm::Module &M = CGM.getModule();
1132   for (KernelInfo &I : EmittedKernels)
1133     llvm::offloading::emitOffloadingEntry(
1134         M, KernelHandles[I.Kernel->getName()],
1135         getDeviceSideName(cast<NamedDecl>(I.D)), 0,
1136         DeviceVarFlags::OffloadGlobalEntry, Section);
1137
1138   for (VarInfo &I : DeviceVars) {
1139     uint64_t VarSize =
1140         CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
1141     if (I.Flags.getKind() == DeviceVarFlags::Variable) {
1142       llvm::offloading::emitOffloadingEntry(
1143           M, I.Var, getDeviceSideName(I.D), VarSize,
1144           I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry
1145                               : DeviceVarFlags::OffloadGlobalEntry,
1146           Section);
1147     } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1148       llvm::offloading::emitOffloadingEntry(
1149           M, I.Var, getDeviceSideName(I.D), VarSize,
1150           DeviceVarFlags::OffloadGlobalSurfaceEntry, Section);
1151     } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1152       llvm::offloading::emitOffloadingEntry(
1153           M, I.Var, getDeviceSideName(I.D), VarSize,
1154           DeviceVarFlags::OffloadGlobalTextureEntry, Section);
1155     }
1156   }
1157 }
1158
1159 // Returns module constructor to be added.
1160 llvm::Function *CGNVCUDARuntime::finalizeModule() {
1161   if (CGM.getLangOpts().CUDAIsDevice) {
1162     transformManagedVars();
1163
1164     // Mark ODR-used device variables as compiler used to prevent it from being
1165     // eliminated by optimization. This is necessary for device variables
1166     // ODR-used by host functions. Sema correctly marks them as ODR-used no
1167     // matter whether they are ODR-used by device or host functions.
1168     //
1169     // We do not need to do this if the variable has used attribute since it
1170     // has already been added.
1171     //
1172     // Static device variables have been externalized at this point, therefore
1173     // variables with LLVM private or internal linkage need not be added.
1174     for (auto &&Info : DeviceVars) {
1175       auto Kind = Info.Flags.getKind();
1176       if (!Info.Var->isDeclaration() &&
1177           !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1178           (Kind == DeviceVarFlags::Variable ||
1179            Kind == DeviceVarFlags::Surface ||
1180            Kind == DeviceVarFlags::Texture) &&
1181           Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1182         CGM.addCompilerUsedGlobal(Info.Var);
1183       }
1184     }
1185     return nullptr;
1186   }
1187   if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1188     createOffloadingEntries();
1189   else
1190     return makeModuleCtorFunction();
1191
1192   return nullptr;
1193 }
1194
1195 llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1196                                                     GlobalDecl GD) {
1197   auto Loc = KernelHandles.find(F->getName());
1198   if (Loc != KernelHandles.end()) {
1199     auto OldHandle = Loc->second;
1200     if (KernelStubs[OldHandle] == F)
1201       return OldHandle;
1202
1203     // We've found the function name, but F itself has changed, so we need to
1204     // update the references.
1205     if (CGM.getLangOpts().HIP) {
1206       // For HIP compilation the handle itself does not change, so we only need
1207       // to update the Stub value.
1208       KernelStubs[OldHandle] = F;
1209       return OldHandle;
1210     }
1211     // For non-HIP compilation, erase the old Stub and fall-through to creating
1212     // new entries.
1213     KernelStubs.erase(OldHandle);
1214   }
1215
1216   if (!CGM.getLangOpts().HIP) {
1217     KernelHandles[F->getName()] = F;
1218     KernelStubs[F] = F;
1219     return F;
1220   }
1221
1222   auto *Var = new llvm::GlobalVariable(
1223       TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
1224       /*Initializer=*/nullptr,
1225       CGM.getMangledName(
1226           GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
1227   Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1228   Var->setDSOLocal(F->isDSOLocal());
1229   Var->setVisibility(F->getVisibility());
1230   auto *FD = cast<FunctionDecl>(GD.getDecl());
1231   auto *FT = FD->getPrimaryTemplate();
1232   if (!FT || FT->isThisDeclarationADefinition())
1233     CGM.maybeSetTrivialComdat(*FD, *Var);
1234   KernelHandles[F->getName()] = Var;
1235   KernelStubs[Var] = F;
1236   return Var;
1237 }