1 //=== AMDGPUPrintfRuntimeBinding.cpp - OpenCL printf implementation -------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 // This pass binds printfs to a kernel arg pointer that will be bound to a buffer
11 // later by the runtime.
13 // This pass traverses the functions in the module and converts
14 // each call to printf to a sequence of operations that
15 // store the following into the printf buffer:
16 // - format string (passed as a module's metadata unique ID)
17 // - bitwise copies of printf arguments
18 // The backend passes will need to store metadata in the kernel
19 //===----------------------------------------------------------------------===//
22 #include "llvm/Analysis/InstructionSimplify.h"
23 #include "llvm/Analysis/TargetLibraryInfo.h"
24 #include "llvm/IR/Dominators.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/InitializePasses.h"
28 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
32 #define DEBUG_TYPE "printfToRuntime"
// Module pass (derives from ModulePass) for the printf lowering. The lowering
// itself lives in AMDGPUPrintfRuntimeBindingImpl, which runOnModule() invokes.
36 class AMDGPUPrintfRuntimeBinding final
: public ModulePass
{
// Constructor; defined out of line in this file.
41 explicit AMDGPUPrintfRuntimeBinding();
// Pass entry point; defined out of line in this file.
44 bool runOnModule(Module
&M
) override
;
// Declares the analyses this pass requires: TargetLibraryInfoWrapperPass and
// DominatorTreeWrapperPass.
46 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
47 AU
.addRequired
<TargetLibraryInfoWrapperPass
>();
48 AU
.addRequired
<DominatorTreeWrapperPass
>();
// Holds the printf lowering logic. It is constructed with two callbacks that
// return the DominatorTree and the TargetLibraryInfo for a given Function.
52 class AMDGPUPrintfRuntimeBindingImpl
{
// Stores the two analysis getter callbacks in the members of the same name.
54 AMDGPUPrintfRuntimeBindingImpl(
55 function_ref
<const DominatorTree
&(Function
&)> GetDT
,
56 function_ref
<const TargetLibraryInfo
&(Function
&)> GetTLI
)
57 : GetDT(GetDT
), GetTLI(GetTLI
) {}
// Collects conversion specifier characters found in the format string into
// OpConvSpecifiers; defined out of line in this file.
61 void getConversionSpecifiers(SmallVectorImpl
<char> &OpConvSpecifiers
,
62 StringRef fmt
, size_t num_ops
) const;
// Decides from the specifier character and operand type whether an operand
// is handled as a string; defined out of line in this file.
64 bool shouldPrintAsStr(char Specifier
, Type
*OpType
) const;
// Rewrites the calls collected in Printfs; defined out of line in this file.
65 bool lowerPrintfForGpu(Module
&M
);
// Forwards I to SimplifyInstruction with a query built from *TD, TLI and DT.
67 Value
*simplify(Instruction
*I
, const TargetLibraryInfo
*TLI
,
68 const DominatorTree
*DT
) {
69 return SimplifyInstruction(I
, {*TD
, TLI
, DT
});
// Analysis getter callbacks supplied through the constructor.
73 function_ref
<const DominatorTree
&(Function
&)> GetDT
;
74 function_ref
<const TargetLibraryInfo
&(Function
&)> GetTLI
;
// Call sites to lower; filled in run() and iterated by lowerPrintfForGpu().
75 SmallVector
<CallInst
*, 32> Printfs
;
// Definition of the pass's static ID member, initialised to 0.
79 char AMDGPUPrintfRuntimeBinding::ID
= 0;
// Pass registration under the name "amdgpu-printf-runtime-binding", listing
// TargetLibraryInfoWrapperPass and DominatorTreeWrapperPass as dependencies.
81 INITIALIZE_PASS_BEGIN(AMDGPUPrintfRuntimeBinding
,
82 "amdgpu-printf-runtime-binding", "AMDGPU Printf lowering",
84 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass
)
85 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass
)
86 INITIALIZE_PASS_END(AMDGPUPrintfRuntimeBinding
, "amdgpu-printf-runtime-binding",
87 "AMDGPU Printf lowering", false, false)
// Reference in namespace llvm that aliases the pass ID defined above.
89 char &llvm::AMDGPUPrintfRuntimeBindingID
= AMDGPUPrintfRuntimeBinding::ID
;
// Factory function: returns a pointer to an AMDGPUPrintfRuntimeBinding created
// with `new`. The caller receives the raw pointer.
92 ModulePass
*createAMDGPUPrintfRuntimeBinding() {
93 return new AMDGPUPrintfRuntimeBinding();
// Constructs the ModulePass base with this pass's ID and runs the pass
// initialisation function against the registry returned by
// PassRegistry::getPassRegistry().
97 AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding() : ModulePass(ID
) {
98 initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry());
// Scans Fmt for characters contained in ConvSpecifiers and appends specifier
// characters to OpConvSpecifiers.
//
// For every hit, the text between the previous hit and the current one is
// searched backwards for '%', and any run of '%' characters directly before
// it is walked.
//
// NOTE(review): several statements of this function are missing from this
// listing (the updates of ArgDump and any use of NumOps), so whether the
// push_back below is conditional cannot be confirmed from here.
101 void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers(
102 SmallVectorImpl
<char> &OpConvSpecifiers
, StringRef Fmt
,
103 size_t NumOps
) const {
104 // Not all format characters are collected.
105 // At this time the format characters of interest
106 // are %p and %s, which are used to know if we
107 // are either storing a literal string or a
108 // pointer to the printf buffer.
109 static const char ConvSpecifiers
[] = "cdieEfgGaosuxXp";
110 size_t CurFmtSpecifierIdx
= 0;
111 size_t PrevFmtSpecifierIdx
= 0;
// Visit each position in Fmt that holds one of the ConvSpecifiers characters.
113 while ((CurFmtSpecifierIdx
= Fmt
.find_first_of(
114 ConvSpecifiers
, CurFmtSpecifierIdx
)) != StringRef::npos
) {
115 bool ArgDump
= false;
// CurFmt is the slice from the previous hit up to (not including) this one.
116 StringRef CurFmt
= Fmt
.substr(PrevFmtSpecifierIdx
,
117 CurFmtSpecifierIdx
- PrevFmtSpecifierIdx
);
118 size_t pTag
= CurFmt
.find_last_of("%");
119 if (pTag
!= StringRef::npos
) {
// Walk backwards over consecutive '%' characters preceding pTag.
121 while (pTag
&& CurFmt
[--pTag
] == '%') {
127 OpConvSpecifiers
.push_back(Fmt
[CurFmtSpecifierIdx
]);
// Continue the search just past the current hit.
129 PrevFmtSpecifierIdx
= ++CurFmtSpecifierIdx
;
// Tests whether an operand is to be handled as a string.
//
// The checks visible here are, in order: Specifier is 's'; OpType is a
// PointerType whose address space is AMDGPUAS::CONSTANT_ADDRESS; the pointer's
// contained type 0 has type ID IntegerTyID. The final result is whether that
// integer type is 8 bits wide.
//
// NOTE(review): the statements taken when a check fails are missing from this
// listing — presumably early `return false`; confirm against the full file.
133 bool AMDGPUPrintfRuntimeBindingImpl::shouldPrintAsStr(char Specifier
,
134 Type
*OpType
) const {
135 if (Specifier
!= 's')
137 const PointerType
*PT
= dyn_cast
<PointerType
>(OpType
);
138 if (!PT
|| PT
->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS
)
140 Type
*ElemType
= PT
->getContainedType(0);
141 if (ElemType
->getTypeID() != Type::IntegerTyID
)
143 IntegerType
*ElemIType
= cast
<IntegerType
>(ElemType
);
144 return ElemIType
->getBitWidth() == 8;
// Rewrites every call collected in Printfs.
//
// Steps visible in this listing, per call:
//  - derive the per-argument conversion specifiers from the format string
//    via getConversionSpecifiers();
//  - compute each argument's size in the buffer and append it to a metadata
//    string that is added to the "llvm.printf.fmts" named metadata;
//  - insert a call to "__printf_alloc" and compare the buffer pointer `pcall`
//    against null;
//  - under that check, store the printf id and then the argument values into
//    the buffer, advancing the buffer pointer with GEPs.
// After the loop the original calls are erased.
//
// NOTE(review): many lines of this function are missing from this listing
// (scope closers and a number of statements), so the comments here describe
// only what the visible lines show.
147 bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module
&M
) {
148 LLVMContext
&Ctx
= M
.getContext();
149 IRBuilder
<> Builder(Ctx
);
150 Type
*I32Ty
= Type::getInt32Ty(Ctx
);
152 // NB: It is important for this string size to be divisible by 4
153 const char NonLiteralStr
[4] = "???";
155 for (auto CI
: Printfs
) {
156 unsigned NumOps
= CI
->getNumArgOperands();
158 SmallString
<16> OpConvSpecifiers
;
159 Value
*Op
= CI
->getArgOperand(0);
// If the format operand is a load, look through it to a value stored to the
// same pointer.
161 if (auto LI
= dyn_cast
<LoadInst
>(Op
)) {
162 Op
= LI
->getPointerOperand();
163 for (auto Use
: Op
->users()) {
164 if (auto SI
= dyn_cast
<StoreInst
>(Use
)) {
165 Op
= SI
->getValueOperand();
171 if (auto I
= dyn_cast
<Instruction
>(Op
)) {
172 Value
*Op_simplified
=
173 simplify(I
, &GetTLI(*I
->getFunction()), &GetDT(*I
->getFunction()));
178 ConstantExpr
*ConstExpr
= dyn_cast
<ConstantExpr
>(Op
);
181 GlobalVariable
*GVar
= dyn_cast
<GlobalVariable
>(ConstExpr
->getOperand(0));
183 StringRef
Str("unknown");
184 if (GVar
&& GVar
->hasInitializer()) {
185 auto *Init
= GVar
->getInitializer();
186 if (auto *CA
= dyn_cast
<ConstantDataArray
>(Init
)) {
188 Str
= CA
->getAsCString();
189 } else if (isa
<ConstantAggregateZero
>(Init
)) {
193 // we need this call to ascertain
194 // that we are printing a string
195 // or a pointer. It takes out the
196 // specifiers and fills up the first
198 getConversionSpecifiers(OpConvSpecifiers
, Str
, NumOps
- 1);
200 // Add metadata for the string
201 std::string AStreamHolder
;
202 raw_string_ostream
Sizes(AStreamHolder
);
203 int Sum
= DWORD_ALIGN
;
204 Sizes
<< CI
->getNumArgOperands() - 1;
// First pass over the arguments: compute each argument's size in the buffer
// and append it to the Sizes metadata string.
206 for (unsigned ArgCount
= 1; ArgCount
< CI
->getNumArgOperands() &&
207 ArgCount
<= OpConvSpecifiers
.size();
209 Value
*Arg
= CI
->getArgOperand(ArgCount
);
210 Type
*ArgType
= Arg
->getType();
211 unsigned ArgSize
= TD
->getTypeAllocSizeInBits(ArgType
);
212 ArgSize
= ArgSize
/ 8;
214 // ArgSize by design should be a multiple of DWORD_ALIGN,
215 // expand the arguments that do not follow this rule.
217 if (ArgSize
% DWORD_ALIGN
!= 0) {
218 llvm::Type
*ResType
= llvm::Type::getInt32Ty(Ctx
);
219 auto *LLVMVecType
= llvm::dyn_cast
<llvm::FixedVectorType
>(ArgType
);
220 int NumElem
= LLVMVecType
? LLVMVecType
->getNumElements() : 1;
221 if (LLVMVecType
&& NumElem
> 1)
222 ResType
= llvm::FixedVectorType::get(ResType
, NumElem
);
223 Builder
.SetInsertPoint(CI
);
224 Builder
.SetCurrentDebugLocation(CI
->getDebugLoc());
225 if (OpConvSpecifiers
[ArgCount
- 1] == 'x' ||
226 OpConvSpecifiers
[ArgCount
- 1] == 'X' ||
227 OpConvSpecifiers
[ArgCount
- 1] == 'u' ||
228 OpConvSpecifiers
[ArgCount
- 1] == 'o')
229 Arg
= Builder
.CreateZExt(Arg
, ResType
);
231 Arg
= Builder
.CreateSExt(Arg
, ResType
);
232 ArgType
= Arg
->getType();
233 ArgSize
= TD
->getTypeAllocSizeInBits(ArgType
);
234 ArgSize
= ArgSize
/ 8;
235 CI
->setOperand(ArgCount
, Arg
);
237 if (OpConvSpecifiers
[ArgCount
- 1] == 'f') {
238 ConstantFP
*FpCons
= dyn_cast
<ConstantFP
>(Arg
);
242 FPExtInst
*FpExt
= dyn_cast
<FPExtInst
>(Arg
);
243 if (FpExt
&& FpExt
->getType()->isDoubleTy() &&
244 FpExt
->getOperand(0)->getType()->isFloatTy())
248 if (shouldPrintAsStr(OpConvSpecifiers
[ArgCount
- 1], ArgType
)) {
249 if (auto *ConstExpr
= dyn_cast
<ConstantExpr
>(Arg
)) {
250 auto *GV
= dyn_cast
<GlobalVariable
>(ConstExpr
->getOperand(0));
251 if (GV
&& GV
->hasInitializer()) {
252 Constant
*Init
= GV
->getInitializer();
253 bool IsZeroValue
= Init
->isZeroValue();
254 auto *CA
= dyn_cast
<ConstantDataArray
>(Init
);
255 if (IsZeroValue
|| (CA
&& CA
->isString())) {
257 IsZeroValue
? 1 : (strlen(CA
->getAsCString().data()) + 1);
258 size_t Rem
= SizeStr
% DWORD_ALIGN
;
260 LLVM_DEBUG(dbgs() << "Printf string original size = " << SizeStr
263 NSizeStr
= SizeStr
+ (DWORD_ALIGN
- Rem
);
270 ArgSize
= sizeof(NonLiteralStr
);
273 ArgSize
= sizeof(NonLiteralStr
);
276 LLVM_DEBUG(dbgs() << "Printf ArgSize (in buffer) = " << ArgSize
277 << " for type: " << *ArgType
<< '\n');
278 Sizes
<< ArgSize
<< ':';
281 LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str
.str()
283 for (size_t I
= 0; I
< Str
.size(); ++I
) {
284 // Rest of the C escape sequences (e.g. \') are handled correctly
306 // ':' cannot be scanned by Flex, as it is defined as a delimiter
307 // Replace it with its octal representation \72
316 // Insert the printf_alloc call
317 Builder
.SetInsertPoint(CI
);
318 Builder
.SetCurrentDebugLocation(CI
->getDebugLoc());
320 AttributeList Attr
= AttributeList::get(Ctx
, AttributeList::FunctionIndex
,
321 Attribute::NoUnwind
);
323 Type
*SizetTy
= Type::getInt32Ty(Ctx
);
325 Type
*Tys_alloc
[1] = {SizetTy
};
326 Type
*I8Ty
= Type::getInt8Ty(Ctx
);
327 Type
*I8Ptr
= PointerType::get(I8Ty
, 1);
328 FunctionType
*FTy_alloc
= FunctionType::get(I8Ptr
, Tys_alloc
, false);
329 FunctionCallee PrintfAllocFn
=
330 M
.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc
, Attr
);
332 LLVM_DEBUG(dbgs() << "Printf metadata = " << Sizes
.str() << '\n');
333 std::string fmtstr
= itostr(++UniqID
) + ":" + Sizes
.str().c_str();
334 MDString
*fmtStrArray
= MDString::get(Ctx
, fmtstr
);
336 // Instead of creating global variables, the
337 // printf format strings are extracted
338 // and passed as metadata. This avoids
339 // polluting llvm's symbol tables in this module.
340 // Metadata is going to be extracted
341 // by the backend passes and inserted
342 // into the OpenCL binary as appropriate.
343 StringRef
amd("llvm.printf.fmts");
344 NamedMDNode
*metaD
= M
.getOrInsertNamedMetadata(amd
);
345 MDNode
*myMD
= MDNode::get(Ctx
, fmtStrArray
);
346 metaD
->addOperand(myMD
);
347 Value
*sumC
= ConstantInt::get(SizetTy
, Sum
, false);
348 SmallVector
<Value
*, 1> alloc_args
;
349 alloc_args
.push_back(sumC
);
351 CallInst::Create(PrintfAllocFn
, alloc_args
, "printf_alloc_fn", CI
);
354 // Insert code to split basicblock with a
355 // piece of hammock code.
356 // basicblock splits after buffer overflow check
358 ConstantPointerNull
*zeroIntPtr
=
359 ConstantPointerNull::get(PointerType::get(I8Ty
, 1));
360 auto *cmp
= cast
<ICmpInst
>(Builder
.CreateICmpNE(pcall
, zeroIntPtr
, ""));
361 if (!CI
->use_empty()) {
363 Builder
.CreateSExt(Builder
.CreateNot(cmp
), I32Ty
, "printf_res");
364 CI
->replaceAllUsesWith(result
);
366 SplitBlock(CI
->getParent(), cmp
);
368 SplitBlockAndInsertIfThen(cmp
, cmp
->getNextNode(), false);
370 Builder
.SetInsertPoint(Brnch
);
372 // store unique printf id in the buffer
374 GetElementPtrInst
*BufferIdx
= GetElementPtrInst::Create(
375 I8Ty
, pcall
, ConstantInt::get(Ctx
, APInt(32, 0)), "PrintBuffID",
378 Type
*idPointer
= PointerType::get(I32Ty
, AMDGPUAS::GLOBAL_ADDRESS
);
380 new BitCastInst(BufferIdx
, idPointer
, "PrintBuffIdCast", Brnch
);
382 new StoreInst(ConstantInt::get(I32Ty
, UniqID
), id_gep_cast
, Brnch
);
384 // 1st 4 bytes hold the printf_id
385 // the following GEP is the buffer pointer
386 BufferIdx
= GetElementPtrInst::Create(
387 I8Ty
, pcall
, ConstantInt::get(Ctx
, APInt(32, 4)), "PrintBuffGep",
390 Type
*Int32Ty
= Type::getInt32Ty(Ctx
);
391 Type
*Int64Ty
= Type::getInt64Ty(Ctx
);
// Second pass over the arguments: build the values to store (WhatToStore)
// and emit the stores into the buffer.
392 for (unsigned ArgCount
= 1; ArgCount
< CI
->getNumArgOperands() &&
393 ArgCount
<= OpConvSpecifiers
.size();
395 Value
*Arg
= CI
->getArgOperand(ArgCount
);
396 Type
*ArgType
= Arg
->getType();
397 SmallVector
<Value
*, 32> WhatToStore
;
398 if (ArgType
->isFPOrFPVectorTy() && !isa
<VectorType
>(ArgType
)) {
399 Type
*IType
= (ArgType
->isFloatTy()) ? Int32Ty
: Int64Ty
;
400 if (OpConvSpecifiers
[ArgCount
- 1] == 'f') {
401 if (auto *FpCons
= dyn_cast
<ConstantFP
>(Arg
)) {
402 APFloat
Val(FpCons
->getValueAPF());
404 Val
.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven
,
406 Arg
= ConstantFP::get(Ctx
, Val
);
408 } else if (auto *FpExt
= dyn_cast
<FPExtInst
>(Arg
)) {
409 if (FpExt
->getType()->isDoubleTy() &&
410 FpExt
->getOperand(0)->getType()->isFloatTy()) {
411 Arg
= FpExt
->getOperand(0);
416 Arg
= new BitCastInst(Arg
, IType
, "PrintArgFP", Brnch
);
417 WhatToStore
.push_back(Arg
);
418 } else if (ArgType
->getTypeID() == Type::PointerTyID
) {
419 if (shouldPrintAsStr(OpConvSpecifiers
[ArgCount
- 1], ArgType
)) {
420 const char *S
= NonLiteralStr
;
421 if (auto *ConstExpr
= dyn_cast
<ConstantExpr
>(Arg
)) {
422 auto *GV
= dyn_cast
<GlobalVariable
>(ConstExpr
->getOperand(0));
423 if (GV
&& GV
->hasInitializer()) {
424 Constant
*Init
= GV
->getInitializer();
425 bool IsZeroValue
= Init
->isZeroValue();
426 auto *CA
= dyn_cast
<ConstantDataArray
>(Init
);
427 if (IsZeroValue
|| (CA
&& CA
->isString())) {
428 S
= IsZeroValue
? "" : CA
->getAsCString().data();
432 size_t SizeStr
= strlen(S
) + 1;
433 size_t Rem
= SizeStr
% DWORD_ALIGN
;
436 NSizeStr
= SizeStr
+ (DWORD_ALIGN
- Rem
);
441 char *MyNewStr
= new char[NSizeStr
]();
443 int NumInts
= NSizeStr
/ 4;
446 int ANum
= *(int *)(MyNewStr
+ CharC
);
449 Value
*ANumV
= ConstantInt::get(Int32Ty
, ANum
, false);
450 WhatToStore
.push_back(ANumV
);
454 // Empty string, give a hint to RT that it is not NULL
455 Value
*ANumV
= ConstantInt::get(Int32Ty
, 0xFFFFFF00, false);
456 WhatToStore
.push_back(ANumV
);
459 uint64_t Size
= TD
->getTypeAllocSizeInBits(ArgType
);
460 assert((Size
== 32 || Size
== 64) && "unsupported size");
461 Type
*DstType
= (Size
== 32) ? Int32Ty
: Int64Ty
;
462 Arg
= new PtrToIntInst(Arg
, DstType
, "PrintArgPtr", Brnch
);
463 WhatToStore
.push_back(Arg
);
465 } else if (isa
<FixedVectorType
>(ArgType
)) {
467 uint32_t EleCount
= cast
<FixedVectorType
>(ArgType
)->getNumElements();
468 uint32_t EleSize
= ArgType
->getScalarSizeInBits();
469 uint32_t TotalSize
= EleCount
* EleSize
;
471 ShuffleVectorInst
*Shuffle
=
472 new ShuffleVectorInst(Arg
, Arg
, ArrayRef
<int>{0, 1, 2, 2});
473 Shuffle
->insertBefore(Brnch
);
475 ArgType
= Arg
->getType();
476 TotalSize
+= EleSize
;
480 EleCount
= TotalSize
/ 64;
481 IType
= Type::getInt64Ty(ArgType
->getContext());
485 EleCount
= TotalSize
/ 64;
486 IType
= Type::getInt64Ty(ArgType
->getContext());
487 } else if (EleCount
>= 3) {
489 IType
= Type::getInt32Ty(ArgType
->getContext());
492 IType
= Type::getInt16Ty(ArgType
->getContext());
497 EleCount
= TotalSize
/ 64;
498 IType
= Type::getInt64Ty(ArgType
->getContext());
501 IType
= Type::getInt32Ty(ArgType
->getContext());
506 IType
= FixedVectorType::get(IType
, EleCount
);
508 Arg
= new BitCastInst(Arg
, IType
, "PrintArgVect", Brnch
);
509 WhatToStore
.push_back(Arg
);
511 WhatToStore
.push_back(Arg
);
513 for (unsigned I
= 0, E
= WhatToStore
.size(); I
!= E
; ++I
) {
514 Value
*TheBtCast
= WhatToStore
[I
];
516 TD
->getTypeAllocSizeInBits(TheBtCast
->getType()) / 8;
517 SmallVector
<Value
*, 1> BuffOffset
;
518 BuffOffset
.push_back(ConstantInt::get(I32Ty
, ArgSize
));
520 Type
*ArgPointer
= PointerType::get(TheBtCast
->getType(), 1);
522 new BitCastInst(BufferIdx
, ArgPointer
, "PrintBuffPtrCast", Brnch
);
523 StoreInst
*StBuff
= new StoreInst(TheBtCast
, CastedGEP
, Brnch
);
524 LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n"
527 if (I
+ 1 == E
&& ArgCount
+ 1 == CI
->getNumArgOperands())
529 BufferIdx
= GetElementPtrInst::Create(I8Ty
, BufferIdx
, BuffOffset
,
530 "PrintBuffNextPtr", Brnch
);
531 LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
532 << *BufferIdx
<< '\n');
538 // erase the printf calls
539 for (auto CI
: Printfs
)
540 CI
->eraseFromParent();
// Entry point of the implementation object.
//
// Steps visible in this listing:
//  - test whether the module's target architecture is Triple::r600;
//  - look up the function named "printf" and record in Printfs every CallInst
//    use in which printf is the callee;
//  - if a function named "__ockl_hostcall_internal" exists, emit an error on
//    its CallInst users;
//  - store the module's DataLayout in TD and return the result of
//    lowerPrintfForGpu(M).
//
// NOTE(review): the statements taken on the r600 check and any null check on
// PrintfFunction are missing from this listing — presumably early returns;
// confirm against the full file.
546 bool AMDGPUPrintfRuntimeBindingImpl::run(Module
&M
) {
547 Triple
TT(M
.getTargetTriple());
548 if (TT
.getArch() == Triple::r600
)
551 auto PrintfFunction
= M
.getFunction("printf");
// Collect the call sites whose callee operand is this use of printf.
555 for (auto &U
: PrintfFunction
->uses()) {
556 if (auto *CI
= dyn_cast
<CallInst
>(U
.getUser())) {
557 if (CI
->isCallee(&U
))
558 Printfs
.push_back(CI
);
565 if (auto HostcallFunction
= M
.getFunction("__ockl_hostcall_internal")) {
566 for (auto &U
: HostcallFunction
->uses()) {
567 if (auto *CI
= dyn_cast
<CallInst
>(U
.getUser())) {
568 M
.getContext().emitError(
569 CI
, "Cannot use both printf and hostcall in the same module");
574 TD
= &M
.getDataLayout();
576 return lowerPrintfForGpu(M
);
// ModulePass entry point. Builds two callbacks that fetch, for a given
// Function, the DominatorTree (from DominatorTreeWrapperPass) and the
// TargetLibraryInfo (from TargetLibraryInfoWrapperPass) through getAnalysis,
// then returns the result of AMDGPUPrintfRuntimeBindingImpl::run(M).
579 bool AMDGPUPrintfRuntimeBinding::runOnModule(Module
&M
) {
580 auto GetDT
= [this](Function
&F
) -> DominatorTree
& {
581 return this->getAnalysis
<DominatorTreeWrapperPass
>(F
).getDomTree();
583 auto GetTLI
= [this](Function
&F
) -> TargetLibraryInfo
& {
584 return this->getAnalysis
<TargetLibraryInfoWrapperPass
>().getTLI(F
);
587 return AMDGPUPrintfRuntimeBindingImpl(GetDT
, GetTLI
).run(M
);
// Entry point taking a ModuleAnalysisManager. Obtains the
// FunctionAnalysisManager through FunctionAnalysisManagerModuleProxy, builds
// callbacks returning DominatorTreeAnalysis and TargetLibraryAnalysis results
// for a Function, and runs AMDGPUPrintfRuntimeBindingImpl on the module.
// Returns PreservedAnalyses::none() when the implementation reports a change
// and PreservedAnalyses::all() otherwise.
591 AMDGPUPrintfRuntimeBindingPass::run(Module
&M
, ModuleAnalysisManager
&AM
) {
592 FunctionAnalysisManager
&FAM
=
593 AM
.getResult
<FunctionAnalysisManagerModuleProxy
>(M
).getManager();
594 auto GetDT
= [&FAM
](Function
&F
) -> DominatorTree
& {
595 return FAM
.getResult
<DominatorTreeAnalysis
>(F
);
597 auto GetTLI
= [&FAM
](Function
&F
) -> TargetLibraryInfo
& {
598 return FAM
.getResult
<TargetLibraryAnalysis
>(F
);
600 bool Changed
= AMDGPUPrintfRuntimeBindingImpl(GetDT
, GetTLI
).run(M
);
601 return Changed
? PreservedAnalyses::none() : PreservedAnalyses::all();