//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }
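
  // Illustrative sketch (not part of the original source): for a HIP kernel
  // such as `__global__ void k(int *p)`, the generic pointer parameter `ptr`
  // (the default address space) is rewritten by this hook into a global
  // pointer, `ptr addrspace(1)`, in the kernel's IR signature.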

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
  const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
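
// Illustrative counts under the rule above (sketch, not from the original
// source): float4 -> 4 registers; half4 -> 2 registers, since 16-bit elements
// are packed two per register; struct { int a; char b; } -> 2 registers,
// because each field is rounded up to a 32-bit register independently.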

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
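
// Illustrative outcomes (sketch, not from the original source): a
// `struct { short a; char b; }` (4 bytes) is returned directly as an i32; a
// `struct { float v[3]; }` (12 bytes, 3 registers) is returned directly with
// its natural type; an aggregate needing more than 16 registers falls through
// to the default ABI and is returned indirectly through an sret pointer.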

/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}
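
// Illustrative outcomes (sketch, not from the original source): for a HIP
// kernel, a scalar `int *` parameter is lowered directly as a global pointer
// (`ptr addrspace(1)`); for C++ or OpenMP target offloading, a plain aggregate
// parameter is lowered indirectly with `byref` in the constant address space,
// so the kernel reads it straight out of the kernarg segment; OpenCL aggregates
// stay direct so Clover sees an unflattened struct.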

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}
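
// Illustrative accounting (sketch, not from the original source): with the
// 16-register budget, passing four `struct { float v[4]; }` arguments (4
// registers each) uses the whole budget and each is passed directly; a fifth
// such argument no longer fits and is passed by reference in the private
// address space instead.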

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
} // namespace

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
}
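
// Illustrative result (sketch, not from the original source): an OpenCL
// kernel declared without any work-group-size attributes ends up with
// "amdgpu-flat-work-group-size"="1,256" on its llvm::Function, while
// `__attribute__((amdgpu_num_vgpr(64)))` additionally yields
// "amdgpu-num-vgpr"="64".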

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "llvm.amdgcn.abi.version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}
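
// Illustrative IR (sketch, not from the original source), assuming code
// object version 500:
//   @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr
//       addrspace(4) constant i32 500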

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is cast to a
// pointer in local or private address space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}
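
// Illustrative constant (sketch, not from the original source): a null
// `__private int *`, whose target null value is a nonzero sentinel rather
// than 0, is emitted as
//   addrspacecast (ptr null to ptr addrspace(5))
// instead of `ptr addrspace(5) null`.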

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
    Name = "agent";
    break;
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}
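
// Illustrative mapping (sketch, not from the original source): a HIP
// workgroup-scope atomic with acquire ordering uses the "workgroup-one-as"
// sync scope, while the same atomic with seq_cst ordering uses plain
// "workgroup".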

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}
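
// Illustrative shape of the generated wrapper (sketch, not from the original
// source), assuming an invoke function named `@__block_invoke` with block
// literal type `%block.ty`:
//   define internal amdgpu_kernel void @__block_invoke_kernel(%block.ty %lit) {
//   entry:
//     %tmp = alloca %block.ty, addrspace(5)
//     store %block.ty %lit, ptr addrspace(5) %tmp
//     call void @__block_invoke(<%tmp cast to the invoke's pointer type>)
//     ret void
//   }
// together with the kernel_arg_* metadata and the "enqueued-block" attribute
// set above.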

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
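
// Illustrative result (sketch, not from the original source):
// `__attribute__((amdgpu_flat_work_group_size(64, 256)))` produces
// "amdgpu-flat-work-group-size"="64,256", and an OpenCL
// `reqd_work_group_size(8, 8, 1)` with no flat-work-group-size attribute
// produces "amdgpu-flat-work-group-size"="64,64".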

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}