//===- AMDGPULibCalls.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include <cmath>

#define DEBUG_TYPE "amdgpu-simplifylib"
using namespace llvm;
using namespace llvm::PatternMatch;
static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(false),
  cl::Hidden);

static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);
#define MATH_PI      numbers::pi
#define MATH_E       numbers::e
#define MATH_SQRT2   numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2
namespace llvm {

class AMDGPULibCalls {
private:
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;

  using FuncInfo = llvm::AMDGPULibFunc;

  bool UnsafeFPMath = false;

  // -fuse-native.
  bool AllNative = false;

  bool useNativeFunc(const StringRef F) const;
  // Return a pointer (pointer expr) to the function if function definition with
  // "FuncName" exists. It may create a new function prototype in pre-link mode.
  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
  bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
  /* Specialized optimizations */

  // pow/powr/pown
  bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // rootn
  bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // -fuse-native for sincos
  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
  // Evaluate calls if the calls' arguments are constants.
  bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
                              Constant *copr0, Constant *copr1);
  bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
  /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
  /// of cos, sincos call).
  std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
                                                     FastMathFlags FMF,
                                                     IRBuilder<> &B,
                                                     FunctionCallee Fsincos);
  bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                            const FuncInfo &FInfo);
  // Get a scalar native builtin single argument FP function
  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
  /// Substitute a call to a known libcall with an intrinsic call. If \p
  /// AllowMinSize is true, allow the replacement in a minsize function.
  bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                         bool AllowMinSizeF32 = false,
                                         bool AllowF64 = false,
                                         bool AllowStrictFP = false);
  void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                         Intrinsic::ID IntrID);

  bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                            Intrinsic::ID IntrID,
                                            bool AllowMinSizeF32 = false,
                                            bool AllowF64 = false,
                                            bool AllowStrictFP = false);
  bool isUnsafeMath(const FPMathOperator *FPOp) const;
  bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

  bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;
  static void replaceCall(Instruction *I, Value *With) {
    I->replaceAllUsesWith(With);
    I->eraseFromParent();
  }

  static void replaceCall(FPMathOperator *I, Value *With) {
    replaceCall(cast<Instruction>(I), With);
  }
public:
  AMDGPULibCalls() = default;

  bool fold(CallInst *CI);

  void initFunction(Function &F, FunctionAnalysisManager &FAM);
  void initNativeFuncs();

  // Replace a normal math function call with that native version
  bool useNative(CallInst *CI);
};

} // end namespace llvm
template <typename IRB>
static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
                              const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, Arg, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}
template <typename IRB>
static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
                               Value *Arg2, const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}
static FunctionType *getPownType(FunctionType *FT) {
  Type *PowNExpTy = Type::getInt32Ty(FT->getContext());
  if (VectorType *VecTy = dyn_cast<VectorType>(FT->getReturnType()))
    PowNExpTy = VectorType::get(PowNExpTy, VecTy->getElementCount());

  return FunctionType::get(FT->getReturnType(),
                           {FT->getParamType(0), PowNExpTy}, false);
}
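
// For example, for the type "<2 x float> (<2 x float>, <2 x float>)" this
// returns "<2 x float> (<2 x float>, <2 x i32>)", the matching pown signature.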
// Data structures for table-driven optimizations.
// FuncTbl works for both f32 and f64 functions with 1 input argument.
struct TableEntry {
  double result;
  double input;
};

/* a list of {result, input} */
static const TableEntry tbl_acos[] = {
    {MATH_PI / 2.0, 0.0},
    {MATH_PI / 2.0, -0.0},
    // ...
};
static const TableEntry tbl_acosh[] = {
    // ...
};
static const TableEntry tbl_acospi[] = {
    // ...
};
static const TableEntry tbl_asin[] = {
    // ...
    {MATH_PI / 2.0, 1.0},
    {-MATH_PI / 2.0, -1.0},
};
static const TableEntry tbl_asinh[] = {
    // ...
};
static const TableEntry tbl_asinpi[] = {
    // ...
};
static const TableEntry tbl_atan[] = {
    // ...
    {MATH_PI / 4.0, 1.0},
    {-MATH_PI / 4.0, -1.0},
};
static const TableEntry tbl_atanh[] = {
    // ...
};
static const TableEntry tbl_atanpi[] = {
    // ...
};
static const TableEntry tbl_cbrt[] = {
    // ...
};
static const TableEntry tbl_cos[] = {
    // ...
};
static const TableEntry tbl_cosh[] = {
    // ...
};
static const TableEntry tbl_cospi[] = {
    // ...
};
static const TableEntry tbl_erfc[] = {
    // ...
};
static const TableEntry tbl_erf[] = {
    // ...
};
static const TableEntry tbl_exp[] = {
    // ...
};
static const TableEntry tbl_exp2[] = {
    // ...
};
static const TableEntry tbl_exp10[] = {
    // ...
};
static const TableEntry tbl_expm1[] = {
    // ...
};
static const TableEntry tbl_log[] = {
    // ...
};
static const TableEntry tbl_log2[] = {
    // ...
};
static const TableEntry tbl_log10[] = {
    // ...
};
static const TableEntry tbl_rsqrt[] = {
    // ...
};
static const TableEntry tbl_sin[] = {
    // ...
};
static const TableEntry tbl_sinh[] = {
    // ...
};
static const TableEntry tbl_sinpi[] = {
    // ...
};
static const TableEntry tbl_sqrt[] = {
    // ...
};
static const TableEntry tbl_tan[] = {
    // ...
};
static const TableEntry tbl_tanh[] = {
    // ...
};
static const TableEntry tbl_tanpi[] = {
    // ...
};
static const TableEntry tbl_tgamma[] = {
    // ...
};
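
// For example, TDOFold() below uses tbl_acos to rewrite a call acos(0.0f)
// directly into the constant pi/2 without calling the library function.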
static bool HasNative(AMDGPULibFunc::EFuncId id) {
  switch(id) {
  case AMDGPULibFunc::EI_DIVIDE:
  case AMDGPULibFunc::EI_COS:
  case AMDGPULibFunc::EI_EXP:
  case AMDGPULibFunc::EI_EXP2:
  case AMDGPULibFunc::EI_EXP10:
  case AMDGPULibFunc::EI_LOG:
  case AMDGPULibFunc::EI_LOG2:
  case AMDGPULibFunc::EI_LOG10:
  case AMDGPULibFunc::EI_POWR:
  case AMDGPULibFunc::EI_RECIP:
  case AMDGPULibFunc::EI_RSQRT:
  case AMDGPULibFunc::EI_SIN:
  case AMDGPULibFunc::EI_SINCOS:
  case AMDGPULibFunc::EI_SQRT:
  case AMDGPULibFunc::EI_TAN:
    return true;
  default:;
  }
  return false;
}
using TableRef = ArrayRef<TableEntry>;

static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
  switch (id) {
  case AMDGPULibFunc::EI_ACOS:   return TableRef(tbl_acos);
  case AMDGPULibFunc::EI_ACOSH:  return TableRef(tbl_acosh);
  case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi);
  case AMDGPULibFunc::EI_ASIN:   return TableRef(tbl_asin);
  case AMDGPULibFunc::EI_ASINH:  return TableRef(tbl_asinh);
  case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi);
  case AMDGPULibFunc::EI_ATAN:   return TableRef(tbl_atan);
  case AMDGPULibFunc::EI_ATANH:  return TableRef(tbl_atanh);
  case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi);
  case AMDGPULibFunc::EI_CBRT:   return TableRef(tbl_cbrt);
  case AMDGPULibFunc::EI_NCOS:
  case AMDGPULibFunc::EI_COS:    return TableRef(tbl_cos);
  case AMDGPULibFunc::EI_COSH:   return TableRef(tbl_cosh);
  case AMDGPULibFunc::EI_COSPI:  return TableRef(tbl_cospi);
  case AMDGPULibFunc::EI_ERFC:   return TableRef(tbl_erfc);
  case AMDGPULibFunc::EI_ERF:    return TableRef(tbl_erf);
  case AMDGPULibFunc::EI_EXP:    return TableRef(tbl_exp);
  case AMDGPULibFunc::EI_NEXP2:
  case AMDGPULibFunc::EI_EXP2:   return TableRef(tbl_exp2);
  case AMDGPULibFunc::EI_EXP10:  return TableRef(tbl_exp10);
  case AMDGPULibFunc::EI_EXPM1:  return TableRef(tbl_expm1);
  case AMDGPULibFunc::EI_LOG:    return TableRef(tbl_log);
  case AMDGPULibFunc::EI_NLOG2:
  case AMDGPULibFunc::EI_LOG2:   return TableRef(tbl_log2);
  case AMDGPULibFunc::EI_LOG10:  return TableRef(tbl_log10);
  case AMDGPULibFunc::EI_NRSQRT:
  case AMDGPULibFunc::EI_RSQRT:  return TableRef(tbl_rsqrt);
  case AMDGPULibFunc::EI_NSIN:
  case AMDGPULibFunc::EI_SIN:    return TableRef(tbl_sin);
  case AMDGPULibFunc::EI_SINH:   return TableRef(tbl_sinh);
  case AMDGPULibFunc::EI_SINPI:  return TableRef(tbl_sinpi);
  case AMDGPULibFunc::EI_NSQRT:
  case AMDGPULibFunc::EI_SQRT:   return TableRef(tbl_sqrt);
  case AMDGPULibFunc::EI_TAN:    return TableRef(tbl_tan);
  case AMDGPULibFunc::EI_TANH:   return TableRef(tbl_tanh);
  case AMDGPULibFunc::EI_TANPI:  return TableRef(tbl_tanpi);
  case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma);
  default:;
  }
  return TableRef();
}
static inline int getVecSize(const AMDGPULibFunc &FInfo) {
  return FInfo.getLeads()[0].VectorSize;
}

static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc &FInfo) {
  return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
}
FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
  // If we are doing PreLinkOpt, the function is external. So it is safe to
  // use getOrInsertFunction() at this stage.

  return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
                       : AMDGPULibFunc::getFunction(M, fInfo);
}
bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
                                       FuncInfo &FInfo) {
  return AMDGPULibFunc::parse(FMangledName, FInfo);
}
bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const {
  return UnsafeFPMath || FPOp->isFast();
}

bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
  return UnsafeFPMath ||
         (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
}

bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
    const FPMathOperator *FPOp) const {
  // TODO: Refine to approxFunc or contract
  return isUnsafeMath(FPOp);
}
void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
  UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
  AC = &FAM.getResult<AssumptionAnalysis>(F);
  TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
  DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
}
bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
  return AllNative || llvm::is_contained(UseNative, F);
}

void AMDGPULibCalls::initNativeFuncs() {
  AllNative = useNativeFunc("all") ||
              (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
               UseNative.begin()->empty());
}
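
// sincosUseNative() splits a sincos call into separate native sin and cos
// calls when both are enabled, e.g. (illustrative): sincos(x, pcos) becomes
// native_sin(x), with native_cos(x) stored through the cos pointer pcos.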
bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  bool native_sin = useNativeFunc("sin");
  bool native_cos = useNativeFunc("cos");

  if (native_sin && native_cos) {
    Module *M = aCI->getModule();
    Value *opr0 = aCI->getArgOperand(0);

    AMDGPULibFunc nf; // native function
    nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
    nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_SIN);
    FunctionCallee sinExpr = getFunction(M, nf);

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_COS);
    FunctionCallee cosExpr = getFunction(M, nf);
    if (sinExpr && cosExpr) {
      Value *sinval =
          CallInst::Create(sinExpr, opr0, "splitsin", aCI->getIterator());
      Value *cosval =
          CallInst::Create(cosExpr, opr0, "splitcos", aCI->getIterator());
      new StoreInst(cosval, aCI->getArgOperand(1), aCI->getIterator());

      DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                          << " with native version of sin/cos");

      replaceCall(aCI, sinval);
      return true;
    }
  }
  return false;
}
bool AMDGPULibCalls::useNative(CallInst *aCI) {
  Function *Callee = aCI->getCalledFunction();
  if (!Callee || aCI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
      getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
      !(AllNative || useNativeFunc(FInfo.getName()))) {
    return false;
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
  FunctionCallee F = getFunction(aCI->getModule(), FInfo);
  if (!F)
    return false;

  aCI->setCalledFunction(F);
  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
  return true;
}
// Clang emits calls of __read_pipe_2 or __read_pipe_4 for the OpenCL read_pipe
// builtin, with appended type size and alignment arguments, where 2 or 4
// indicates the original number of arguments. The library has optimized
// versions of __read_pipe_2/__read_pipe_4 when the type size and alignment are
// the same power-of-2 value. This function transforms __read_pipe_2 to
// __read_pipe_2_N for such cases, where N is the size in bytes of the type
// (N = 1, 2, 4, 8, ..., 128). The same applies to __read_pipe_4, write_pipe_2,
// and write_pipe_4.
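// For example (illustrative IR, with size == alignment == 4):
//   %r = call i32 @__read_pipe_2(ptr addrspace(1) %p, ptr %dst, i32 4, i32 4)
// becomes
//   %r = call i32 @__read_pipe_2_4(ptr addrspace(1) %p, ptr %dst)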
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                                          const FuncInfo &FInfo) {
  auto *Callee = CI->getCalledFunction();
  if (!Callee->isDeclaration())
    return false;

  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
  auto *M = Callee->getParent();
  std::string Name = std::string(Callee->getName());
  auto NumArg = CI->arg_size();
  if (NumArg != 4 && NumArg != 6)
    return false;

  ConstantInt *PacketSize =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2));
  ConstantInt *PacketAlign =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1));
  if (!PacketSize || !PacketAlign)
    return false;

  unsigned Size = PacketSize->getZExtValue();
  Align Alignment = PacketAlign->getAlignValue();
  if (Alignment != Size)
    return false;

  unsigned PtrArgLoc = CI->arg_size() - 3;
  Value *PtrArg = CI->getArgOperand(PtrArgLoc);
  Type *PtrTy = PtrArg->getType();

  SmallVector<llvm::Type *, 6> ArgTys;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    ArgTys.push_back(CI->getArgOperand(I)->getType());
  ArgTys.push_back(PtrTy);

  Name = Name + "_" + std::to_string(Size);
  auto *FTy = FunctionType::get(Callee->getReturnType(),
                                ArrayRef<Type *>(ArgTys), false);
  AMDGPULibFunc NewLibFunc(Name, FTy);
  FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
  if (!F)
    return false;

  SmallVector<Value *, 6> Args;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    Args.push_back(CI->getArgOperand(I));
  Args.push_back(PtrArg);

  auto *NCI = B.CreateCall(F, Args);
  NCI->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(NCI);
  CI->dropAllReferences();
  CI->eraseFromParent();

  return true;
}
static bool isKnownIntegral(const Value *V, const DataLayout &DL,
                            FastMathFlags FMF) {
  if (isa<PoisonValue>(V))
    return true;
  if (isa<UndefValue>(V))
    return false;

  if (const ConstantFP *CF = dyn_cast<ConstantFP>(V))
    return CF->getValueAPF().isInteger();

  auto *VFVTy = dyn_cast<FixedVectorType>(V->getType());
  const Constant *CV = dyn_cast<Constant>(V);
  if (VFVTy && CV) {
    unsigned NumElts = VFVTy->getNumElements();
    for (unsigned i = 0; i != NumElts; ++i) {
      Constant *Elt = CV->getAggregateElement(i);
      if (!Elt)
        return false;
      if (isa<PoisonValue>(Elt))
        continue;

      const ConstantFP *CFP = dyn_cast<ConstantFP>(Elt);
      if (!CFP || !CFP->getValue().isInteger())
        return false;
    }

    return true;
  }

  const Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::SIToFP:
  case Instruction::UIToFP:
    // TODO: Could check nofpclass(inf) on incoming argument
    if (FMF.noInfs())
      return true;

    // Need to check int size cannot produce infinity, which computeKnownFPClass
    // knows how to do already.
    return isKnownNeverInfinity(I, /*Depth=*/0, SimplifyQuery(DL));
  case Instruction::Call: {
    const CallInst *CI = cast<CallInst>(I);
    switch (CI->getIntrinsicID()) {
    case Intrinsic::trunc:
    case Intrinsic::floor:
    case Intrinsic::ceil:
    case Intrinsic::rint:
    case Intrinsic::nearbyint:
    case Intrinsic::round:
    case Intrinsic::roundeven:
      return (FMF.noInfs() && FMF.noNaNs()) ||
             isKnownNeverInfOrNaN(I, /*Depth=*/0, SimplifyQuery(DL));
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  return false;
}
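
// For example, isKnownIntegral() returns true for the literal 4.0, for
// sitofp/uitofp results that cannot round to infinity, and for values produced
// by trunc/floor/ceil/rint/nearbyint/round/roundeven when nnan+ninf flags (or
// a computeKnownFPClass query) rule out inf/nan results.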
// This function returns false if no change; otherwise it returns true.
bool AMDGPULibCalls::fold(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  // Ignore indirect calls.
  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo))
    return false;

  // Further check the number of arguments to see if they match.
  // TODO: Check calling convention matches too
  if (!FInfo.isCompatibleSignature(CI->getFunctionType()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');
  if (TDOFold(CI, FInfo))
    return true;

  IRBuilder<> B(CI);
  if (CI->isStrictFP())
    B.setIsFPConstrained(true);
  if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
    // Under unsafe-math, evaluate calls if possible.
    // According to Brian Sumner, we can do this for all f32 function calls
    // using host's double function calls.
    if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(CI, FInfo))
      return true;

    // Copy fast flags from the original call.
    FastMathFlags FMF = FPOp->getFastMathFlags();
    B.setFastMathFlags(FMF);

    // Specialized optimizations for each function call.
    //
    // TODO: Handle native functions
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_EXP:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_EXP2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG10:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log10,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_FMIN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::minnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMAX:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::maxnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMA:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fma, true,
                                                  true);
    case AMDGPULibFunc::EI_MAD:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fmuladd,
                                                  true, true);
    case AMDGPULibFunc::EI_FABS:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fabs, true,
                                                  true, true);
    case AMDGPULibFunc::EI_COPYSIGN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::copysign,
                                                  true, true, true);
    case AMDGPULibFunc::EI_FLOOR:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::floor, true,
                                                  true);
    case AMDGPULibFunc::EI_CEIL:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::ceil, true,
                                                  true);
    case AMDGPULibFunc::EI_TRUNC:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::trunc, true,
                                                  true);
    case AMDGPULibFunc::EI_RINT:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::rint, true,
                                                  true);
    case AMDGPULibFunc::EI_ROUND:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::round, true,
                                                  true);
: {
746 if (!shouldReplaceLibcallWithIntrinsic(CI
, true, true))
749 Value
*Arg1
= CI
->getArgOperand(1);
750 if (VectorType
*VecTy
= dyn_cast
<VectorType
>(CI
->getType());
751 VecTy
&& !isa
<VectorType
>(Arg1
->getType())) {
752 Value
*SplatArg1
= B
.CreateVectorSplat(VecTy
->getElementCount(), Arg1
);
753 CI
->setArgOperand(1, SplatArg1
);
756 CI
->setCalledFunction(Intrinsic::getDeclaration(
757 CI
->getModule(), Intrinsic::ldexp
,
758 {CI
->getType(), CI
->getArgOperand(1)->getType()}));
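    // The EI_POW case below strength-reduces generic pow when the operands
    // allow it, e.g. (illustrative):
    //   pow(x, y)   -> powr(x, y)  when x is known not to be less than -0.0
    //   pow(x, 4.0) -> pown(x, 4)  when the exponent is known integral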
    case AMDGPULibFunc::EI_POW: {
      Module *M = Callee->getParent();
      AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
      FunctionCallee PowrFunc = getFunction(M, PowrInfo);
      CallInst *Call = cast<CallInst>(FPOp);

      // pow(x, y) -> powr(x, y) for x >= -0.0
      // TODO: Account for flags on current call
      if (PowrFunc &&
          cannotBeOrderedLessThanZero(
              FPOp->getOperand(0), /*Depth=*/0,
              SimplifyQuery(M->getDataLayout(), TLInfo, DT, AC, Call))) {
        Call->setCalledFunction(PowrFunc);
        return fold_pow(FPOp, B, PowrInfo) || true;
      }

      // pow(x, y) -> pown(x, y) for known integral y
      if (isKnownIntegral(FPOp->getOperand(1), M->getDataLayout(),
                          FPOp->getFastMathFlags())) {
        FunctionType *PownType = getPownType(CI->getFunctionType());
        AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
        FunctionCallee PownFunc = getFunction(M, PownInfo);
        if (PownFunc) {
          // TODO: If the incoming integral value is an sitofp/uitofp, it won't
          // fold out without a known range. We can probably take the source
          // value.
          Value *CastedArg =
              B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1));
          // Have to drop any nofpclass attributes on the original call site.
          Call->removeParamAttrs(
              1, AttributeFuncs::typeIncompatible(CastedArg->getType()));
          Call->setCalledFunction(PownFunc);
          Call->setArgOperand(1, CastedArg);
          return fold_pow(FPOp, B, PownInfo) || true;
        }
      }

      return fold_pow(FPOp, B, FInfo);
    }
    case AMDGPULibFunc::EI_POWR:
    case AMDGPULibFunc::EI_POWN:
      return fold_pow(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_ROOTN:
      return fold_rootn(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_SQRT:
      // TODO: Allow with strictfp + constrained intrinsic
      return tryReplaceLibcallWithSimpleIntrinsic(
          B, CI, Intrinsic::sqrt, true, true, /*AllowStrictFP=*/false);
    case AMDGPULibFunc::EI_COS:
    case AMDGPULibFunc::EI_SIN:
      return fold_sincos(FPOp, B, FInfo);
    default:
      break;
    }
  } else {
    // Specialized optimizations for each function call
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_READ_PIPE_2:
    case AMDGPULibFunc::EI_READ_PIPE_4:
    case AMDGPULibFunc::EI_WRITE_PIPE_2:
    case AMDGPULibFunc::EI_WRITE_PIPE_4:
      return fold_read_write_pipe(CI, B, FInfo);
    default:
      break;
    }
  }

  return false;
}
bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
  // Table-Driven optimization
  const TableRef tr = getOptTable(FInfo.getId());
  if (tr.empty())
    return false;

  int const sz = (int)tr.size();
  Value *opr0 = CI->getArgOperand(0);

  if (getVecSize(FInfo) > 1) {
    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
      SmallVector<double, 0> DVal;
      for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
        ConstantFP *eltval = dyn_cast<ConstantFP>(
                               CV->getElementAsConstant((unsigned)eltNo));
        assert(eltval && "Non-FP arguments in math function!");
        bool found = false;
        for (int i = 0; i < sz; ++i) {
          if (eltval->isExactlyValue(tr[i].input)) {
            DVal.push_back(tr[i].result);
            found = true;
            break;
          }
        }
        if (!found) {
          // These vector constants are not handled yet.
          return false;
        }
      }
      LLVMContext &context = CI->getParent()->getParent()->getContext();
      Constant *nval;
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back((float)D);
        ArrayRef<float> tmp(FVal);
        nval = ConstantDataVector::get(context, tmp);
      } else { // F64
        ArrayRef<double> tmp(DVal);
        nval = ConstantDataVector::get(context, tmp);
      }
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
      replaceCall(CI, nval);
      return true;
    }
  } else {
    // Scalar version
    if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
      for (int i = 0; i < sz; ++i) {
        if (CF->isExactlyValue(tr[i].input)) {
          Value *nval = ConstantFP::get(CF->getType(), tr[i].result);
          LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
          replaceCall(CI, nval);
          return true;
        }
      }
    }
  }

  return false;
}
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
  return ::log2(V);
#else
  return log(V) / numbers::ln2;
#endif
}
bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
                              const FuncInfo &FInfo) {
  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
         "fold_pow: encounter a wrong function call");

  Module *M = B.GetInsertBlock()->getModule();
  Type *eltType = FPOp->getType()->getScalarType();
  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  const APFloat *CF = nullptr;
  const APInt *CINT = nullptr;
  if (!match(opr1, m_APFloatAllowPoison(CF)))
    match(opr1, m_APIntAllowPoison(CINT));

  // 0x1111111 means that we don't do anything for this call.
  int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);
  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
    // pow/powr/pown(x, 0) == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    replaceCall(FPOp, cnval);
    return true;
  }
  if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
    // pow/powr/pown(x, 1.0) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
    replaceCall(FPOp, opr0);
    return true;
  }
  if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
    // pow/powr/pown(x, 2.0) = x*x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
                      << *opr0 << "\n");
    Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
    replaceCall(FPOp, nval);
    return true;
  }
  if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
    // pow/powr/pown(x, -1.0) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
    replaceCall(FPOp, nval);
    return true;
  }
  if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
    // pow[r](x, [-]0.5) = sqrt(x)
    bool issqrt = CF->isExactlyValue(0.5);
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
                                                : AMDGPULibFunc::EI_RSQRT,
                                         FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
                        << '(' << *opr0 << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0,
                                 issqrt ? "__pow2sqrt" : "__pow2rsqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  }

  if (!isUnsafeFiniteOnlyMath(FPOp))
    return false;
  // Unsafe Math optimization

  // Remember that ci_opr1 is set if opr1 is integral
  if (CF) {
    double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
                      ? (double)CF->convertToFloat()
                      : CF->convertToDouble();
    int ival = (int)dval;
    if ((double)ival == dval) {
      ci_opr1 = ival;
    } else
      ci_opr1 = 0x11111111;
  }
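
  // The expansion below evaluates the power by binary squaring-and-multiply;
  // e.g. for pow(x, 11): with x2 = x*x, x4 = x2*x2, x8 = x4*x4, the set bits
  // of 11 (0b1011) select the product x * x2 * x8.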
  // pow/powr/pown(x, c) = [1/](x*x*..x); where
  //   trunc(c) == c && the number of x == c && |c| <= 12
  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
  if (abs_opr1 <= 12) {
    Constant *cnval;
    Value *nval;
    if (abs_opr1 == 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = cnval;
    } else {
      Value *valx2 = nullptr;
      nval = nullptr;
      while (abs_opr1 > 0) {
        valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
        if (abs_opr1 & 1) {
          nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
        }
        abs_opr1 >>= 1;
      }
    }

    if (ci_opr1 < 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = B.CreateFDiv(cnval, nval, "__1powprod");
    }
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
                      << ")\n");
    replaceCall(FPOp, nval);
    return true;
  }
  // If we should use the generic intrinsic instead of emitting a libcall
  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();

  // powr ---> exp2(y * log2(x))
  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
  FunctionCallee ExpExpr;
  if (ShouldUseIntrinsic)
    ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()});
  else {
    ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
    if (!ExpExpr)
      return false;
  }
  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;
  Constant *cnval = nullptr;
  if (getVecSize(FInfo) == 1) {
    CF = nullptr;
    match(opr0, m_APFloatAllowPoison(CF));

    if (CF) {
      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
                     ? (double)CF->convertToFloat()
                     : CF->convertToDouble();

      V = log2(std::abs(V));
      cnval = ConstantFP::get(eltType, V);
      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
                     CF->isNegative();
    } else {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    }
  } else {
    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);

    if (!CDV) {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    } else {
      assert((int)CDV->getNumElements() == getVecSize(FInfo) &&
             "Wrong vector size detected");

      SmallVector<double, 0> DVal;
      for (int i = 0; i < getVecSize(FInfo); ++i) {
        double V = CDV->getElementAsAPFloat(i).convertToDouble();
        if (V < 0.0)
          needcopysign = true;
        V = log2(std::abs(V));
        DVal.push_back(V);
      }
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back((float)D);
        ArrayRef<float> tmp(FVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      } else {
        ArrayRef<double> tmp(DVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      }
    }
  }
  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
    // We cannot handle corner cases for a general pow() function, give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags()))
      return false;
  }

  Value *nval;
  if (needabs) {
    nval = B.CreateUnaryIntrinsic(Intrinsic::fabs, opr0, nullptr, "__fabs");
  } else {
    nval = cnval ? cnval : opr0;
  }

  if (needlog) {
    FunctionCallee LogExpr;
    if (ShouldUseIntrinsic) {
      LogExpr =
          Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()});
    } else {
      LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
      if (!LogExpr)
        return false;
    }

    nval = CreateCallEx(B, LogExpr, nval, "__log2");
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
    // convert int(32) to fp(f32 or f64)
    opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
  }

  nval = B.CreateFMul(opr1, nval, "__ylogx");
  nval = CreateCallEx(B, ExpExpr, nval, "__exp2");
  if (needcopysign) {
    Type *nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
    Type *nTy = FPOp->getType()->getWithNewType(nTyS);
    unsigned size = nTy->getScalarSizeInBits();
    Value *opr_n = FPOp->getOperand(1);
    if (opr_n->getType()->getScalarType()->isIntegerTy())
      opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
    else
      opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");

    Value *sign = B.CreateShl(opr_n, size - 1, "__yeven");
    sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
    nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
    nval = B.CreateBitCast(nval, opr0->getType());
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
  replaceCall(FPOp, nval);

  return true;
}
bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  const APInt *CINT = nullptr;
  if (!match(opr1, m_APIntAllowPoison(CINT)))
    return false;

  Function *Parent = B.GetInsertBlock()->getParent();

  int ci_opr1 = (int)CINT->getSExtValue();
  if (ci_opr1 == 1 && !Parent->hasFnAttribute(Attribute::StrictFP)) {
    // rootn(x, 1) = x
    //
    // TODO: Insert constrained canonicalize for strictfp case.
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << '\n');
    replaceCall(FPOp, opr0);
    return true;
  }

  Module *M = B.GetInsertBlock()->getModule();

  CallInst *CI = cast<CallInst>(FPOp);
  if (ci_opr1 == 2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, 2) = sqrt(x)
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 << ")\n");

    CallInst *NewCall = B.CreateUnaryIntrinsic(Intrinsic::sqrt, opr0, CI);
    NewCall->takeName(CI);

    // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
    // metadata.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));
    NewCall->setMetadata(LLVMContext::MD_fpmath, FPMD);

    replaceCall(CI, NewCall);
    return true;
  }

  if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2cbrt");
      replaceCall(FPOp, nval);
      return true;
    }
  } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n");
    Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), opr0,
                               "__rootn2div");
    replaceCall(FPOp, nval);
    return true;
  }

  if (ci_opr1 == -2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, -2) = rsqrt(x)

    // The original rootn had looser ulp requirements than the resultant sqrt
    // and fdiv.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));

    // TODO: Could handle strictfp but need to fix strict sqrt emission
    FastMathFlags FMF = FPOp->getFastMathFlags();
    FMF.setAllowContract(true);

    CallInst *Sqrt = B.CreateUnaryIntrinsic(Intrinsic::sqrt, opr0, CI);
    Instruction *RSqrt = cast<Instruction>(
        B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), Sqrt));
    Sqrt->setFastMathFlags(FMF);
    RSqrt->setFastMathFlags(FMF);
    RSqrt->setMetadata(LLVMContext::MD_fpmath, FPMD);

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
                      << ")\n");

    replaceCall(CI, RSqrt);
    return true;
  }

  return false;
}
// Get a scalar native builtin single argument FP function
FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
                                                 const FuncInfo &FInfo) {
  if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
    return nullptr;
  FuncInfo nf = FInfo;
  nf.setPrefix(AMDGPULibFunc::NATIVE);
  return getFunction(M, nf);
}
// Some library calls are just wrappers around llvm intrinsics, but compiled
// conservatively. Preserve the flags from the original call site by
// substituting them with direct calls with all the flags.
bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                                       bool AllowMinSizeF32,
                                                       bool AllowF64,
                                                       bool AllowStrictFP) {
  Type *FltTy = CI->getType()->getScalarType();
  const bool IsF32 = FltTy->isFloatTy();

  // f64 intrinsics aren't implemented for most operations.
  if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
    return false;

  // We're implicitly inlining by replacing the libcall with the intrinsic, so
  // don't do it for noinline call sites.
  if (CI->isNoInline())
    return false;

  const Function *ParentF = CI->getFunction();
  // TODO: Handle strictfp
  if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP))
    return false;

  if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
    return false;

  return true;
}
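
// replaceLibCallWithSimpleIntrinsic() below also splats a scalar operand of a
// two-argument call to match a vector operand, e.g. (illustrative):
//   fmin(<2 x float> %x, float %y) -> llvm.minnum.v2f32(%x, splat %y)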
void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
                                                       CallInst *CI,
                                                       Intrinsic::ID IntrID) {
  if (CI->arg_size() == 2) {
    Value *Arg0 = CI->getArgOperand(0);
    Value *Arg1 = CI->getArgOperand(1);
    VectorType *Arg0VecTy = dyn_cast<VectorType>(Arg0->getType());
    VectorType *Arg1VecTy = dyn_cast<VectorType>(Arg1->getType());
    if (Arg0VecTy && !Arg1VecTy) {
      Value *SplatRHS = B.CreateVectorSplat(Arg0VecTy->getElementCount(), Arg1);
      CI->setArgOperand(1, SplatRHS);
    } else if (!Arg0VecTy && Arg1VecTy) {
      Value *SplatLHS = B.CreateVectorSplat(Arg1VecTy->getElementCount(), Arg0);
      CI->setArgOperand(0, SplatLHS);
    }
  }

  CI->setCalledFunction(
      Intrinsic::getDeclaration(CI->getModule(), IntrID, {CI->getType()}));
}
bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
    IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
    bool AllowF64, bool AllowStrictFP) {
  if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
                                         AllowStrictFP))
    return false;
  replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
  return true;
}
std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  DebugLoc DL = B.getCurrentDebugLocation();
  Function *F = B.GetInsertBlock()->getParent();
  B.SetInsertPointPastAllocas(F);

  AllocaInst *Alloc = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_");

  if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
    // If the argument is an instruction, it must dominate all uses so put our
    // sincos call there. Otherwise, right after the allocas works well enough
    // if it's an argument or constant.

    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(DL);
  }

  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);

  // The allocaInst allocates the memory in private address space. This needs
  // to be addrspacecasted to point to the address space of the cos pointer
  // type. In OpenCL 2.0 this is generic, while in 1.2 that is private.
  Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy);

  CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc);

  // TODO: Is it worth trying to preserve the location for the cos calls for
  // the load?

  LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
  return {SinCos, LoadCos, SinCos};
}
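
// The rewrite below merges a sin(x)/cos(x) pair into one sincos call; the
// resulting IR has this shape (function names illustrative, not the actual
// OpenCL mangling):
//   %s = call float @sin(float %x)
//   %c = call float @cos(float %x)
// ==>
//   %p = alloca float, addrspace(5)
//   %s = call float @sincos(float %x, ptr addrspace(5) %p)
//   %c = load float, ptr addrspace(5) %p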
// fold sin, cos -> sincos.
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                 const FuncInfo &fInfo) {
  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
         fInfo.getId() == AMDGPULibFunc::EI_COS);

  if ((getArgType(fInfo) != AMDGPULibFunc::F32 &&
       getArgType(fInfo) != AMDGPULibFunc::F64) ||
      fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;

  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;

  Value *CArgVal = FPOp->getOperand(0);
  CallInst *CI = cast<CallInst>(FPOp);

  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);

  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);

  FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;
  SmallVector<CallInst *> SinCalls;
  SmallVector<CallInst *> CosCalls;
  SmallVector<CallInst *> SinCosCalls;
  FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                       fInfo);
  const std::string PairName = PartnerInfo.mangle();

  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath);

  SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};

  for (User *U : CArgVal->users()) {
    CallInst *XI = dyn_cast<CallInst>(U);
    if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
      continue;

    Function *UCallee = XI->getCalledFunction();
    if (!UCallee)
      continue;

    bool Handled = true;

    if (UCallee->getName() == SinName)
      SinCalls.push_back(XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(XI);
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(XI);
    else
      Handled = false;

    if (Handled) {
      MergeDbgLocs.push_back(XI->getDebugLoc());
      auto *OtherOp = cast<FPMathOperator>(XI);
      FMF &= OtherOp->getFastMathFlags();
      FPMath = MDNode::getMostGenericFPMath(
          FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
    }
  }

  if (SinCalls.empty() || CosCalls.empty())
    return false;

  B.setFastMathFlags(FMF);
  B.setDefaultFPMathTag(FPMath);
  DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs);
  B.SetCurrentDebugLocation(DbgLoc);

  auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos);

  auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
    for (CallInst *C : Calls)
      C->replaceAllUsesWith(Res);

    // Leave the other dead instructions to avoid clobbering iterators.
  };

  replaceTrigInsts(SinCalls, Sin);
  replaceTrigInsts(CosCalls, Cos);
  replaceTrigInsts(SinCosCalls, SinCos);

  // It's safe to delete the original now.
  CI->eraseFromParent();
  return true;
}
bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
                                            double &Res1, Constant *copr0,
                                            Constant *copr1) {
  // By default, opr0/opr1 hold values of float/double type.
  // If they are not float/double, each function has to handle its
  // operand separately.
  double opr0 = 0.0, opr1 = 0.0;
  ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
  ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
  if (fpopr0) {
    opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr0->getValueAPF().convertToDouble()
               : (double)fpopr0->getValueAPF().convertToFloat();
  }

  if (fpopr1) {
    opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr1->getValueAPF().convertToDouble()
               : (double)fpopr1->getValueAPF().convertToFloat();
  }

  switch (FInfo.getId()) {
  default: return false;
  case AMDGPULibFunc::EI_ACOS:
    Res0 = acos(opr0);
    return true;

  case AMDGPULibFunc::EI_ACOSH:
    // acosh(x) == log(x + sqrt(x*x - 1))
    Res0 = log(opr0 + sqrt(opr0*opr0 - 1.0));
    return true;

  case AMDGPULibFunc::EI_ACOSPI:
    Res0 = acos(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ASIN:
    Res0 = asin(opr0);
    return true;

  case AMDGPULibFunc::EI_ASINH:
    // asinh(x) == log(x + sqrt(x*x + 1))
    Res0 = log(opr0 + sqrt(opr0*opr0 + 1.0));
    return true;

  case AMDGPULibFunc::EI_ASINPI:
    Res0 = asin(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ATAN:
    Res0 = atan(opr0);
    return true;

  case AMDGPULibFunc::EI_ATANH:
    // atanh(x) == (log(x+1) - log(x-1))/2;
    Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0))/2.0;
    return true;

  case AMDGPULibFunc::EI_ATANPI:
    Res0 = atan(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_CBRT:
    Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0/3.0) : pow(opr0, 1.0/3.0);
    return true;

  case AMDGPULibFunc::EI_COS:
    Res0 = cos(opr0);
    return true;

  case AMDGPULibFunc::EI_COSH:
    Res0 = cosh(opr0);
    return true;

  case AMDGPULibFunc::EI_COSPI:
    Res0 = cos(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_EXP:
    Res0 = exp(opr0);
    return true;

  case AMDGPULibFunc::EI_EXP2:
    Res0 = pow(2.0, opr0);
    return true;

  case AMDGPULibFunc::EI_EXP10:
    Res0 = pow(10.0, opr0);
    return true;

  case AMDGPULibFunc::EI_LOG:
    Res0 = log(opr0);
    return true;

  case AMDGPULibFunc::EI_LOG2:
    Res0 = log(opr0) / log(2.0);
    return true;

  case AMDGPULibFunc::EI_LOG10:
    Res0 = log(opr0) / log(10.0);
    return true;

  case AMDGPULibFunc::EI_RSQRT:
    Res0 = 1.0 / sqrt(opr0);
    return true;

  case AMDGPULibFunc::EI_SIN:
    Res0 = sin(opr0);
    return true;

  case AMDGPULibFunc::EI_SINH:
    Res0 = sinh(opr0);
    return true;

  case AMDGPULibFunc::EI_SINPI:
    Res0 = sin(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_TAN:
    Res0 = tan(opr0);
    return true;

  case AMDGPULibFunc::EI_TANH:
    Res0 = tanh(opr0);
    return true;

  case AMDGPULibFunc::EI_TANPI:
    Res0 = tan(MATH_PI * opr0);
    return true;

  // two-arg functions
  case AMDGPULibFunc::EI_POW:
  case AMDGPULibFunc::EI_POWR:
    Res0 = pow(opr0, opr1);
    return true;

  case AMDGPULibFunc::EI_POWN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, val);
      return true;
    }
    return false;
  }

  case AMDGPULibFunc::EI_ROOTN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, 1.0 / val);
      return true;
    }
    return false;
  }

  // with ptr arg
  case AMDGPULibFunc::EI_SINCOS:
    Res0 = sin(opr0);
    Res1 = cos(opr0);
    return true;
  }

  return false;
}
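
// For example (illustrative), under unsafe math evaluateCall() below folds
// exp2(3.0f) to 8.0f by computing pow(2.0, 3.0) on the host in double
// precision and converting the result back to the call's float type.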
bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
  int numArgs = (int)aCI->arg_size();
  if (numArgs > 3)
    return false;

  Constant *copr0 = nullptr;
  Constant *copr1 = nullptr;
  if (numArgs > 0) {
    if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
      return false;
  }

  if (numArgs > 1) {
    if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {
      if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
        return false;
    }
  }

  // At this point, all arguments to aCI are constants.
  // The max vector size is 16, and sincos will generate two results.
  double DVal0[16], DVal1[16];
  int FuncVecSize = getVecSize(FInfo);
  bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
  if (FuncVecSize == 1) {
    if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1)) {
      return false;
    }
  } else {
    ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
    ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
    for (int i = 0; i < FuncVecSize; ++i) {
      Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
      Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
      if (!evaluateScalarMathFunc(FInfo, DVal0[i], DVal1[i], celt0, celt1)) {
        return false;
      }
    }
  }

  LLVMContext &context = aCI->getContext();
  Constant *nval0, *nval1;
  if (FuncVecSize == 1) {
    nval0 = ConstantFP::get(aCI->getType(), DVal0[0]);
    if (hasTwoResults)
      nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
  } else {
    if (getArgType(FInfo) == AMDGPULibFunc::F32) {
      SmallVector<float, 0> FVal0, FVal1;
      for (int i = 0; i < FuncVecSize; ++i)
        FVal0.push_back((float)DVal0[i]);
      ArrayRef<float> tmp0(FVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        for (int i = 0; i < FuncVecSize; ++i)
          FVal1.push_back((float)DVal1[i]);
        ArrayRef<float> tmp1(FVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    } else {
      ArrayRef<double> tmp0(DVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        ArrayRef<double> tmp1(DVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    }
  }

  if (hasTwoResults) {
    // sincos
    assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
           "math function with ptr arg not supported yet");
    new StoreInst(nval1, aCI->getArgOperand(1), aCI->getIterator());
  }

  replaceCall(aCI, nval0);
  return true;
}
PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
  AMDGPULibCalls Simplifier;
  Simplifier.initNativeFuncs();
  Simplifier.initFunction(F, AM);

  bool Changed = false;

  LLVM_DEBUG(dbgs() << "AMDIC: process function ";
             F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);

  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;

      if (CI) {
        if (Simplifier.fold(CI))
          Changed = true;
      }
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
  if (UseNative.empty())
    return PreservedAnalyses::all();

  AMDGPULibCalls Simplifier;
  Simplifier.initNativeFuncs();
  Simplifier.initFunction(F, AM);

  bool Changed = false;
  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;
      if (CI && Simplifier.useNative(CI))
        Changed = true;
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}