//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
9 #include "ABIInfoImpl.h"
10 #include "TargetInfo.h"
11 #include "clang/Basic/TargetOptions.h"
12 #include "llvm/Support/AMDGPUAddrSpace.h"
14 using namespace clang
;
15 using namespace clang::CodeGen
;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }
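
  // For example, a HIP kernel parameter declared as 'int *' is converted to a
  // generic 'ptr' (the target address space for LangAS::Default); the coercion
  // above rewrites it to 'ptr addrspace(1)', the global address space that
  // LangAS::cuda_device maps to on amdgcn.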

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
    const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
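
// Illustrative counts under the scheme above: a <4 x half> vector packs two
// elements per 32-bit register and needs 2 registers, a <3 x float> vector
// needs 3 (despite its padded 128-bit in-memory size), and a struct of an int
// and a float needs 2.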

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
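
// As a concrete example of the packing above, a 6-byte aggregate (48 bits) is
// returned directly as [2 x i32], while a 3-byte aggregate is returned as i32.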

/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}
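
// Illustrative behaviour of the register accounting: with the 16-register
// budget, a by-value struct of five ints consumes 5 registers; once the
// remaining budget cannot cover an aggregate, it is instead passed indirectly
// (byref) in the private address space.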

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
} // namespace

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}
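
// For reference, a kernel carrying __attribute__((amdgpu_max_num_work_groups(16)))
// ends up with the IR function attribute "amdgpu-max-num-workgroups"="16,1,1",
// since the unspecified Y and Z dimensions default to 1.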

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}
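
// With code object version 5 selected, the module ends up containing roughly:
//   @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500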

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is casted to a
// pointer in local or private address space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  // OpenCL assumes by default that atomic scopes are per-address space for
  // non-sequentially consistent operations.
  if (Scope >= SyncScope::OpenCLWorkGroup &&
      Scope <= SyncScope::OpenCLSubGroup &&
      Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();
    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}
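
// For instance, an OpenCL work_group-scope atomic with relaxed ordering maps
// to the "workgroup-one-as" sync scope, while the same atomic with seq_cst
// ordering maps to plain "workgroup".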

void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
    const AtomicExpr *AE) const {
  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);

  // OpenCL and old style HIP atomics consider atomics targeting thread private
  // memory to be undefined.
  //
  // TODO: This is probably undefined for atomic load/store, but there's not
  // much direct codegen benefit to knowing this.
  if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
       (CmpX &&
        CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
      AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
    llvm::MDNode *ASRange = MDHelper.createRange(
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
    AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
  }

  if (!RMW || !CGF.getTarget().allowAMDGPUUnsafeFPAtomics())
    return;

  // TODO: Introduce new, more controlled options that also work for integers,
  // and deprecate allowAMDGPUUnsafeFPAtomics.
  llvm::AtomicRMWInst::BinOp RMWOp = RMW->getOperation();
  if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) {
    llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
    RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);

    if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW->getType()->isFloatTy())
      RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
  }
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}