//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
9 #include "ABIInfoImpl.h"
10 #include "TargetInfo.h"
11 #include "clang/Basic/TargetOptions.h"
12 #include "llvm/Support/AMDGPUAddrSpace.h"
14 using namespace clang
;
15 using namespace clang::CodeGen
;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }
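
  // For example, a HIP kernel parameter declared as 'int *' is converted to a
  // generic 'ptr' (the target address space for LangAS::Default); the coercion
  // above rewrites it to 'ptr addrspace(1)', the global address space that
  // LangAS::cuda_device maps to on amdgcn.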

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
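
// Illustrative counts under the scheme above: a <4 x half> vector packs two
// elements per 32-bit register and needs 2 registers, a <3 x float> vector
// needs 3 (despite its padded 128-bit in-memory size), and a struct of an int
// and a float needs 2.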

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
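
// As a concrete example of the packing above, a 6-byte aggregate (48 bits) is
// returned directly as [2 x i32], while a 3-byte aggregate is returned as i32.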

/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}
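
// Illustrative behaviour of the register accounting: with the 16-register
// budget, a by-value struct of five ints consumes 5 registers; once the
// remaining budget cannot cover an aggregate, it is instead passed indirectly
// (byref) in the private address space.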

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
} // namespace

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}
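
// For reference, a kernel carrying __attribute__((amdgpu_max_num_work_groups(16)))
// ends up with the IR function attribute "amdgpu-max-num-workgroups"="16,1,1",
// since the unspecified Y and Z dimensions default to 1.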

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}
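
// With code object version 5 selected, the module ends up containing roughly:
//   @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500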

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is casted to a
// pointer in local or private address space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  // OpenCL assumes by default that atomic scopes are per-address space for
  // non-sequentially consistent operations.
  if (Scope >= SyncScope::OpenCLWorkGroup &&
      Scope <= SyncScope::OpenCLSubGroup &&
      Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();
    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}
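
// For instance, an OpenCL work_group-scope atomic with relaxed ordering maps
// to the "workgroup-one-as" sync scope, while the same atomic with seq_cst
// ordering maps to plain "workgroup".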

void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
    const AtomicExpr *AE) const {
  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);

  // OpenCL and old style HIP atomics consider atomics targeting thread private
  // memory to be undefined.
  //
  // TODO: This is probably undefined for atomic load/store, but there's not
  // much direct codegen benefit to knowing this.
  if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
       (CmpX &&
        CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
      AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
    llvm::MDNode *ASRange = MDHelper.createRange(
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
    AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
  }

  if (!RMW || !CGF.getTarget().allowAMDGPUUnsafeFPAtomics())
    return;

  // TODO: Introduce new, more controlled options that also work for integers,
  // and deprecate allowAMDGPUUnsafeFPAtomics.
  llvm::AtomicRMWInst::BinOp RMWOp = RMW->getOperation();
  if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) {
    llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
    RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);

    if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW->getType()->isFloatTy())
      RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
  }
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}