//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) : DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

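// A rough illustration of the accounting above (a sketch, assuming the 32-bit
// register granularity used throughout this file):
//   <4 x float>                  -> 4 elements x 1 register        = 4 registers
//   <4 x half>                   -> packed two per register        = 2 registers
//   struct { double D; int I; }  -> 2 + 1 via the recursive walk   = 3 registers
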
void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or a pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

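// Illustrative outcomes of the return classification above (a sketch, not an
// exhaustive list): an empty struct is ignored; struct { float X, Y; } (64
// bits) is returned directly as [2 x i32]; an aggregate estimated at more than
// MaxNumRegsForArgsRet registers falls through to the default (indirect)
// lowering.
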
/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.

  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing
  // a new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

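// A sketch of what the kernel-argument lowering above produces (illustrative,
// assuming the usual amdgcn address-space numbering): under HIP, a scalar
// `int *` kernel parameter is coerced from the generic pointer type to
// `ptr addrspace(1)` (global); for non-OpenCL languages an unmodified
// aggregate parameter is instead passed byref in the constant address space,
// i.e. `ptr addrspace(4)`.
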
ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or a pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

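// A sketch of the register accounting above (illustrative): starting from the
// shared budget of MaxNumRegsForArgsRet (16), a struct of three ints costs 3
// registers and is passed directly, a variadic argument is always passed
// directly without flattening, and once the estimate is exhausted aggregates
// are passed byref in the private address space instead.
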
class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
} // namespace

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified.
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}

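// Illustrative attribute strings produced above (not exhaustive): an OpenCL
// kernel with no explicit launch-bounds attributes gets
// "amdgpu-flat-work-group-size"="1,256" (a HIP kernel gets
// "1,<gpu-max-threads-per-block>"), and a max-num-workgroups attribute with
// X=4, Y=2 and Z omitted yields "amdgpu-max-num-workgroups"="4,2,1".
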
/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV &&
      !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0, which results in
// incorrectly transformed IR. Therefore, instead of emitting null pointers in
// the private and local address spaces, a null pointer in the generic address
// space is emitted and then cast to a pointer in the local or private address
// space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  // OpenCL assumes by default that atomic scopes are per-address space for
  // non-sequentially consistent operations.
  if (Scope >= SyncScope::OpenCLWorkGroup &&
      Scope <= SyncScope::OpenCLSubGroup &&
      Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

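// Illustrative scope names resulting from the mapping above: HIP workgroup
// scope becomes "workgroup"; an OpenCL work-group scope with a relaxed
// (non seq_cst) ordering becomes "workgroup-one-as"; system scope maps to the
// default (empty) sync scope string.
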
void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
    const AtomicExpr *AE) const {
  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);

  // OpenCL and old style HIP atomics consider atomics targeting thread private
  // memory to be undefined.
  //
  // TODO: This is probably undefined for atomic load/store, but there's not
  // much direct codegen benefit to knowing this.
  if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
       (CmpX &&
        CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
      AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
    llvm::MDNode *ASRange = MDHelper.createRange(
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
    AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
  }

  if (!RMW || !CGF.getTarget().allowAMDGPUUnsafeFPAtomics())
    return;

  // TODO: Introduce new, more controlled options that also work for integers,
  // and deprecate allowAMDGPUUnsafeFPAtomics.
  llvm::AtomicRMWInst::BinOp RMWOp = RMW->getOperation();
  if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) {
    llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
    RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);

    if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW->getType()->isFloatTy())
      RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
  }
}

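// A sketch of the metadata emitted above (illustrative): when unsafe FP
// atomics are allowed, an `atomicrmw fadd` is tagged with
// !amdgpu.no.fine.grained.memory, and additionally with
// !amdgpu.ignore.denormal.mode when it operates on `float`; flat-pointer
// atomics whose language semantics make thread-private atomics undefined are
// tagged with a !noalias.addrspace range asserting they do not touch the
// private address space.
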
bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on the stack, stores the block literal
/// to it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *Call = Builder.CreateCall(Invoke, Args);
  Call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

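// A sketch of the wrapper generated above (illustrative): given a block invoke
// function `void @foo_block_invoke(ptr %block, ptr addrspace(3) %lp)`, the
// result is an internal amdgpu_kernel named "foo_block_invoke_kernel" that
// takes the block literal struct by value plus the remaining invoke
// parameters, allocas the struct, stores its first argument into it, and calls
// the invoke function through a pointer to that alloca.
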
void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

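// An illustrative result of the lowering above: reqd_work_group_size(8, 8, 1)
// with no explicit flat-work-group-size attribute gives Min = Max = 64 and the
// IR attribute "amdgpu-flat-work-group-size"="64,64".
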
void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}