//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }
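
  // Illustrative sketch (not part of the original source): for a HIP kernel
  // such as `__global__ void k(int *p)`, the generic pointer parameter `ptr`
  // (the default address space) is rewritten by this hook into a global
  // pointer, `ptr addrspace(1)`, in the kernel's IR signature.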

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
  const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}
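
// Illustrative counts under the rule above (sketch, not from the original
// source): float4 -> 4 registers; half4 -> 2 registers, since 16-bit elements
// are packed two per register; struct { int a; char b; } -> 2 registers,
// because each field is rounded up to a 32-bit register independently.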

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}
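
// Illustrative outcomes (sketch, not from the original source): a
// `struct { short a; char b; }` (4 bytes) is returned directly as an i32; a
// `struct { float v[3]; }` (12 bytes, 3 registers) is returned directly with
// its natural type; an aggregate needing more than 16 registers falls through
// to the default ABI and is returned indirectly through an sret pointer.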

/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}
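
// Illustrative outcomes (sketch, not from the original source): for a HIP
// kernel, a scalar `int *` parameter is lowered directly as a global pointer
// (`ptr addrspace(1)`); for C++ or OpenMP target offloading, a plain aggregate
// parameter is lowered indirectly with `byref` in the constant address space,
// so the kernel reads it straight out of the kernarg segment; OpenCL aggregates
// stay direct so Clover sees an unflattened struct.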

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into single VGPR or pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}
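
// Illustrative accounting (sketch, not from the original source): with the
// 16-register budget, passing four `struct { float v[4]; }` arguments (4
// registers each) uses the whole budget and each is passed directly; a fifth
// such argument no longer fits and is passed by reference in the private
// address space instead.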

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
} // namespace

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
}
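
// Illustrative result (sketch, not from the original source): an OpenCL
// kernel declared without any work-group-size attributes ends up with
// "amdgpu-flat-work-group-size"="1,256" on its llvm::Function, while
// `__attribute__((amdgpu_num_vgpr(64)))` additionally yields
// "amdgpu-num-vgpr"="64".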

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "llvm.amdgcn.abi.version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}
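
// Illustrative IR (sketch, not from the original source), assuming code
// object version 500:
//   @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr
//       addrspace(4) constant i32 500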

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in generic address space is emitted which is cast to a
// pointer in local or private address space.
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}
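
// Illustrative constant (sketch, not from the original source): a null
// `__private int *`, whose target null value is a nonzero sentinel rather
// than 0, is emitted as
//   addrspacecast (ptr null to ptr addrspace(5))
// instead of `ptr addrspace(5) null`.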

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
    Name = "agent";
    break;
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}
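
// Illustrative mapping (sketch, not from the original source): a HIP
// workgroup-scope atomic with acquire ordering uses the "workgroup-one-as"
// sync scope, while the same atomic with seq_cst ordering uses plain
// "workgroup".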

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}
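
// Illustrative shape of the generated wrapper (sketch, not from the original
// source), assuming an invoke function named `@__block_invoke` with block
// literal type `%block.ty`:
//   define internal amdgpu_kernel void @__block_invoke_kernel(%block.ty %lit) {
//   entry:
//     %tmp = alloca %block.ty, addrspace(5)
//     store %block.ty %lit, ptr addrspace(5) %tmp
//     call void @__block_invoke(<%tmp cast to the invoke's pointer type>)
//     ret void
//   }
// together with the kernel_arg_* metadata and the "enqueued-block" attribute
// set above.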

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}
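
// Illustrative result (sketch, not from the original source):
// `__attribute__((amdgpu_flat_work_group_size(64, 256)))` produces
// "amdgpu-flat-work-group-size"="64,256", and an OpenCL
// `reqd_work_group_size(8, 8, 1)` with no flat-work-group-size attribute
// produces "amdgpu-flat-work-group-size"="64,64".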

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}