//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
  const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
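
// Worked examples of the estimate above (illustrative only):
//   float4                     -> 4 registers (four 32-bit elements)
//   half4                      -> 2 registers (16-bit elements packed two per register)
//   struct { int a; short b; } -> 2 registers (one per field, each rounded up)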

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
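
// Illustrative coercions produced by the size checks above, assuming the
// usual C record layout on this target:
//   struct { char a, b; }      (16 bits) -> returned directly as i16
//   struct { float x; }        single element -> returned as float
//   struct { int a; short b; } (64 bits with tail padding) -> [2 x i32]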

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to the global address space when using byref. This would require
  // implementing a new kind of coercion of the in-memory type for indirect
  // arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}
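
// Rough shape of the resulting kernel signatures (illustrative): for a HIP
// kernel taking 'struct S { int a; float b; }' by value, the aggregate is
// passed as 'ptr addrspace(4) byref(%struct.S)', while a scalar 'int *'
// parameter is passed directly as 'ptr addrspace(1)' via the coercion above.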

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // the function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}
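
// Example of the register budget above (illustrative): NumRegsLeft starts at
// MaxNumRegsForArgsRet (16) in computeInfo and is decremented per argument.
// Once the budget is exhausted, an aggregate that no longer fits is passed
// indirectly (byref) in the private address space rather than directly.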

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T,
                                 QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
} // namespace

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
}
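
// Resulting IR attributes, for illustration: an OpenCL kernel with no explicit
// attributes gets "amdgpu-flat-work-group-size"="1,256"; a HIP kernel gets
// "1,<n>" with <n> taken from --gpu-max-threads-per-block. Explicit
// reqd_work_group_size / amdgpu_flat_work_group_size attributes take
// precedence via handleAMDGPUFlatWorkGroupSizeAttr above.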

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "llvm.amdgcn.abi.version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV &&
      !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}
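
// The emitted variable looks roughly like
//   @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr
//       addrspace(4) constant i32 <CodeObjectVersion>
// which lets the device libraries branch on the ABI version while IPSCCP
// folds the load away when the version is known.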

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0, which results in
// incorrectly transformed IR. Therefore, instead of emitting null pointers in
// the private and local address spaces, a null pointer in the generic address
// space is emitted and then cast to a pointer in the local or private address
// space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}
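
// For example, a null pointer in an address space whose null value is not 0
// (such as the AMDGPU private address space) is emitted as
//   addrspacecast (ptr null to ptr addrspace(5))
// rather than 'ptr addrspace(5) null', which LLVM would treat as bit pattern 0.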

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}
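
// For example, a file-scope constant with a constant initializer may be
// promoted to the target's constant address space (addrspace(4) on amdgcn),
// while other globals default to the global address space (addrspace(1)).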

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
    Name = "agent";
    break;
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}
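
// Resulting scope strings, for illustration: a workgroup-scope atomic with
// seq_cst ordering maps to the "workgroup" sync scope, the same atomic with a
// weaker ordering maps to "workgroup-one-as", and a system-scope atomic with
// a weaker ordering maps to "one-as".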

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates a struct of the same type on the stack, stores the block literal
/// to it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}
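
// Sketch of the generated wrapper (names are illustrative): for a block whose
// invoke function is @__foo_block_invoke, this builds roughly
//   define internal amdgpu_kernel void @__foo_block_invoke_kernel(
//       <block literal struct> %block_literal, ptr addrspace(3) %local_arg1, ...)
// which spills the literal to an alloca, forwards its address plus the
// remaining arguments to the invoke function, and carries the
// "enqueued-block" attribute and the kernel argument metadata set above.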

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}