//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does miscellaneous AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"
using namespace llvm;
using namespace llvm::PatternMatch;
static cl::opt<bool> WidenLoads(
    "amdgpu-codegenprepare-widen-constant-loads",
    cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(false));

static cl::opt<bool> Widen16BitOps(
    "amdgpu-codegenprepare-widen-16-bit-ops",
    cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(true));

static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),
                   cl::ReallyHidden, cl::init(true));

static cl::opt<bool>
    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),
                        cl::ReallyHidden, cl::init(false));

static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

static cl::opt<bool> UseMul24Intrin(
    "amdgpu-codegenprepare-mul24",
    cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
    "amdgpu-codegenprepare-expand-div64",
    cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
    "amdgpu-codegenprepare-disable-idiv-expansion",
    cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
    "amdgpu-codegenprepare-disable-fdiv-expansion",
    cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(false));
namespace {

class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  const GCNSubtarget *ST = nullptr;
  const AMDGPUTargetMachine *TM = nullptr;
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;
  UniformityInfo *UA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;
  bool HasFP32DenormalFlush = false;
  bool FlowChanged = false;
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;

  DenseMap<const PHINode *, bool> BreakPhiNodesCache;
  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;

    LLVMContext &Ctx = Mod->getContext();
    SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt,
                                        {Type::getFloatTy(Ctx)});
    return SqrtF32;
  }

  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;

    LLVMContext &Ctx = Mod->getContext();
    LdexpF32 = Intrinsic::getDeclaration(
        Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
    return LdexpF32;
  }

  bool canBreakPHINode(const PHINode &I);
  /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass.
  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                   const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
                                     DT);
  }

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with the equivalent 32 bit binary operation, and
  /// truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
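  //
  // Illustrative example of the rewrite described above (hypothetical IR, not
  // taken from a test): a uniform
  //   %r = add i16 %a, %b
  // is rewritten roughly as
  //   %a32 = zext i16 %a to i32
  //   %b32 = zext i16 %b to i32
  //   %r32 = add i32 %a32, %b32
  //   %r   = trunc i32 %r32 to i16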
  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with a 32 bit 'select' operation, and truncating
  /// the result of the 32 bit 'select' operation back to \p I's original type.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
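  //
  // Illustrative example of the bitreverse promotion (hypothetical IR):
  //   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  // becomes roughly
  //   %x32 = zext i16 %x to i32
  //   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
  //   %s   = lshr i32 %r32, 16        ; 32 minus the 16-bit element width
  //   %r   = trunc i32 %s to i16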
  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original size will not change the value.
  unsigned numBitsUnsigned(Value *Op) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// a signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
  /// SelectionDAG has an issue where an and asserting the bits are known
  bool replaceMulWithMul24(BinaryOperator &I) const;
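  //
  // Illustrative example (hypothetical IR): if both operands of
  //   %r = mul i32 %a, %b
  // are known to fit in 24 bits, the multiply can be replaced with
  //   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
  // (or the signed @llvm.amdgcn.mul.i24 form), which can map to the 24-bit
  // hardware multiply (v_mul_u32_u24 / v_mul_i32_i24).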
  /// Perform the same function as the equivalently named function in
  /// DAGCombiner. Since we expand some divisions here, we need to perform this
  /// before the expansion obscures the pattern.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;
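  //
  // Illustrative example of the fold (hypothetical IR):
  //   %s = select i1 %c, i32 4, i32 8
  //   %d = udiv i32 16, %s
  // becomes
  //   %d = select i1 %c, i32 4, i32 2
  // because both arms can be constant folded against the other constant
  // operand of the binary operation.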
  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  int getDivNumBits(BinaryOperator &I,
                    Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value *expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value *expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a uniform, small-type load from constant memory to a full
  /// 32 bits and then truncate the result, to allow a scalar load instead of a
  /// vector load.
  bool canWidenScalarExtLoad(LoadInst &I) const;
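  //
  // Illustrative example (hypothetical IR): a uniform
  //   %v = load i8, ptr addrspace(4) %p, align 4
  // can be widened to
  //   %w = load i32, ptr addrspace(4) %p, align 4
  //   %v = trunc i32 %w to i8
  // so the backend can use a scalar (SMEM) load instead of a vector load.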
  Value *matchFractPat(IntrinsicInst &I);
  Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);

  bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
                          FastMathFlags SqrtFMF) const;

  Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                         const Instruction *CtxI) const;

  Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags FMF, const Instruction *CtxI) const;
  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;

  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDiv,
                          float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                      FastMathFlags FMF) const;
  Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                          FastMathFlags FMF) const;

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);
  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
  bool visitMinNum(IntrinsicInst &I);
  bool visitSqrt(IntrinsicInst &I);
  bool run(Function &F);
};
class AMDGPUCodeGenPrepare : public FunctionPass {
  AMDGPUCodeGenPrepareImpl Impl;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {
    initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();

    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override;
  bool doInitialization(Module &M) override;
  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};

} // end anonymous namespace
bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

  Function::iterator NextBB;
  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
    BasicBlock *BB = &*FI;
    NextBB = std::next(FI);

    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
         I = Next) {
      Next = std::next(I);

      MadeChange |= visit(*I);

      if (Next != E) { // Control flow changed
        BasicBlock *NextInstBB = Next->getParent();
        if (NextInstBB != BB) {
unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
}
bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
  if (!Widen16BitOps)
    return false;

  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
  return Ty->isFloatTy() || Ty->isDoubleTy() ||
         (Ty->isHalfTy() && ST->has16BitInsts());
}
// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
}
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }

  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
  return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
}

unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
  return ComputeMaxSignificantBits(Op, *DL, 0, AC);
}
static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  auto *VT = dyn_cast<FixedVectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder, Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (!Ty->isVectorTy()) {
    assert(Values.size() == 1);
    return Values[0];
  }

  Value *NewVal = PoisonValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (UA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
    IsSigned = false;
  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {
    IsSigned = true;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);
    ResultVals.push_back(Result);
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  I.eraseFromParent();

  return true;
}
// Find a select instruction, which may have been cast. This is mostly to deal
// with cases where i16 selects were promoted here to i32.
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
  Cast = nullptr;
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
    return Sel;

  if ((Cast = dyn_cast<CastInst>(V))) {
    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
      return Sel;
  }

  return nullptr;
}
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;
  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  Constant *FoldedT = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
  if (!FoldedT || isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
  if (!FoldedF || isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  BO.eraseFromParent();
  if (CastOp)
    CastOp->eraseFromParent();
  Sel->eraseFromParent();
  return true;
}
std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
                                          Value *Src) const {
  Type *Ty = Src->getType();
  Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
                                         {Ty, Builder.getInt32Ty()}, Src);
  Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});

  // Bypass the bug workaround for the exponent result since it doesn't matter.
  // TODO: Does the bug workaround even really need to consider the exponent
  // result? It's unspecified by the spec.
  Value *FrexpExp =
      ST->hasFractBug()
          ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                    {Builder.getInt32Ty(), Ty}, Src)
          : Builder.CreateExtractValue(Frexp, {1});
  return {FrexpMant, FrexpExp};
}
/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
                                                 Value *Src,
                                                 bool IsNegative) const {
  // Same as for 1.0, but expand the sign out of the constant.
  // -1.0 / x -> rcp (fneg x)
  if (IsNegative)
    Src = Builder.CreateFNeg(Src);

  // The rcp instruction doesn't support denormals, so scale the input
  // out of the denormal range and convert at the end.
  //
  // Expand as 2^-n * (1.0 / (x * 2^n))
  //
  // TODO: Skip scaling if input is known never denormal and the input
  // range won't underflow to denormal. The hard part is knowing the
  // result. We need a range check, the result could be denormal for
  // 0x1p+126 < den <= 0x1p+127.
  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
  Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
  Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
  return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
}
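
// Illustrative shape of the sequence emitted by emitRcpIEEE1ULP above
// (hypothetical names):
//   { %mant, %exp } = frexp(%x)   ; %x == %mant * 2^%exp, 0.5 <= |%mant| < 1
//   %rcp = amdgcn.rcp(%mant)      ; rcp of a value outside the denormal range
//   %res = ldexp(%rcp, -%exp)     ; 2^-exp * (1 / mant) == 1 / %x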
/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
                                              Value *RHS,
                                              FastMathFlags FMF) const {
  // If we have to work around the fract/frexp bug, we're worse off than using
  // the fdiv.fast expansion. The full safe expansion is faster if we have fast
  // FMA.
  if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
      (!FMF.noNaNs() || !FMF.noInfs()))
    return nullptr;

  // We're scaling the LHS to avoid a denormal input, and scale the denominator
  // to avoid large values underflowing the result.
  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);

  Value *Rcp =
      Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
  Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);

  // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
  // result.
  Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
  return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
}
/// Emit a sqrt that handles denormals and is accurate to 2ulp.
Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
                                                  Value *Src,
                                                  FastMathFlags FMF) const {
  Type *Ty = Src->getType();
  APFloat SmallestNormal =
      APFloat::getSmallestNormalized(Ty->getFltSemantics());
  Value *NeedScale =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));

  ConstantInt *Zero = Builder.getInt32(0);
  Value *InputScaleFactor =
      Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);

  Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});

  Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);

  Value *OutputScaleFactor =
      Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
  return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
}
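
// Note on the scale factors above: a possibly-denormal input is first scaled
// by 2^32 into the normal range, and since sqrt(x * 2^32) == sqrt(x) * 2^16,
// the result is rescaled with ldexp(..., -16) to recover sqrt(x).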
/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                              bool IsNegative) {
  // bool need_scale = x < 0x1p-126f;
  // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
  // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
  // rsq(x * input_scale) * output_scale;
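  //
  // Rationale: rsq(x * 2^24) == rsq(x) * 2^-12, so multiplying the result by
  // 2^12 (negated for the -1.0/sqrt(x) case) undoes the input scaling while
  // keeping the rsq input out of the denormal range.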
  Type *Ty = Src->getType();
  APFloat SmallestNormal =
      APFloat::getSmallestNormalized(Ty->getFltSemantics());
  Value *NeedScale =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
  Constant *One = ConstantFP::get(Ty, 1.0);
  Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
  Constant *OutputScale =
      ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);

  Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);

  Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
  Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
  Value *OutputScaleFactor = Builder.CreateSelect(
      NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);

  return Builder.CreateFMul(Rsq, OutputScaleFactor);
}
bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
                                                  FastMathFlags DivFMF,
                                                  FastMathFlags SqrtFMF) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
    return false;

  // v_rsq_f32 gives 1ulp
  return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
         SqrtOp->getFPAccuracy() >= 1.0f;
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
  if (!CLHS)
    return nullptr;

  assert(Den->getType()->isFloatTy());

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
    // Add in the sqrt flags.
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(DivFMF | SqrtFMF);

    if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
        canIgnoreDenormalInput(Den, CtxI)) {
      Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
      // -1.0 / sqrt(x) -> fneg(rsq(x))
      return IsNegative ? Builder.CreateFNeg(Result) : Result;
    }

    return emitRsqIEEE1ULP(Builder, Den, IsNegative);
  }

  return nullptr;
}
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//        allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need to provide ULP 1.0.
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(Src);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation, have a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.
        //
        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        //       insert the rsq intrinsic here.
        return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
      }

      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  }

  if (FMF.allowReciprocal()) {
    // x / y -> x * (1.0 / y)

    // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
    // will never underflow.
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
      return Builder.CreateFMul(Num, Recip);
    }

    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
    return Builder.CreateFMul(Num, Recip);
  }

  return nullptr;
}
// Optimize with fdiv.fast:
//
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
    IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
  // fdiv.fast can achieve 2.5 ULP accuracy.
  if (ReqdAccuracy < 2.5f)
    return nullptr;

  // Only have fdiv.fast for f32.
  assert(Den->getType()->isFloatTy());

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
      NumIsOne = true;
  }

  // fdiv does not support denormals, but 1.0/x is always fine to use.
  //
  // TODO: This works for any value with a specific known exponent range, don't
  // just limit to constant 1.
  if (!HasFP32DenormalFlush && !NumIsOne)
    return nullptr;

  return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
}
Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
    FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
    float ReqdDivAccuracy) const {
  if (RsqOp) {
    Value *Rsq =
        optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
    if (Rsq)
      return Rsq;
  }

  Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
  if (Rcp)
    return Rcp;

  // In the basic case fdiv_fast has the same instruction count as the frexp div
  // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
  // potentially be fused into a user. Also, materialization of the constants
  // can be reused for multiple instances.
  Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
  if (FDivFast)
    return FDivFast;

  return emitFrexpDiv(Builder, Num, Den, DivFMF);
}
// Optimization is performed based on fpmath, fast math flags, and denormal
// handling to lower fdiv with either rcp or fdiv.fast.
//
// With rcp:
//   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//   allowed with unsafe-fp-math or afn.
//
//   a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
//
// With fdiv.fast:
//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
//   1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  Type *Ty = FDiv.getType()->getScalarType();
  if (!Ty->isFloatTy())
    return false;

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen. f16 is good enough to always use.

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  FastMathFlags SqrtFMF;

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(0);
  }

  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
  //
  // Defer to codegen to handle this.
  //
  // TODO: Decide on an interpretation for interactions between afn + arcp +
  // !fpmath, and make it consistent between here and codegen. For now, defer
  // expansion of afn to codegen. The current interpretation is so aggressive we
  // don't need any pre-consideration here when we have better information. A
  // more conservative interpretation could use handling here.
  const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    return false;

  // Defer the correct implementations to codegen.
  if (ReqdAccuracy < 1.0f)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, NumVals, Num);
  extractValues(Builder, DenVals, Den);
  if (RsqOp)
    extractValues(Builder, RsqDenVals, RsqOp);

  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.
      //
      // This has the unfortunate side effect of sometimes scalarizing when
      // we're not going to do anything.
      NewElt = Builder.CreateFDiv(NumElt, DenElt);
      if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
        NewEltInst->copyMetadata(FDiv);
    }

    ResultVals[I] = NewElt;
  }

  Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);

  FDiv.replaceAllUsesWith(NewVal);
  NewVal->takeName(&FDiv);
  RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);

  return true;
}
static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsBool();
}
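
// Emit a 32 x 32 -> 64 bit multiply of LHS and RHS and return the low and
// high 32-bit halves of the product. getMulHu() below returns only the high
// half, i.e. the equivalent of a umulh operation.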
static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder,
                                            Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::pair(Lo, Hi);
}

static Value *getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}
/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if the
/// first one is insufficient. Returns -1 on failure.
int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                            Value *Den, unsigned AtLeast,
                                            bool IsSigned) const {
  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < AtLeast)
    return -1;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < AtLeast)
    return -1;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
  if (IsSigned)
    ++DivBits;
  return DivBits;
}
// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
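// (An IEEE f32 mantissa provides 24 bits of precision -- 23 stored bits plus
// the implicit leading one -- so integers of up to 24 bits are represented
// exactly, which is what the expansion below relies on.)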
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den, bool IsDiv,
                                                bool IsSigned) const {
  unsigned SSBits = Num->getType()->getScalarSizeInBits();
  // If Num bits <= 24, assume 0 signbits.
  unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
  int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
  if (DivBits == -1)
    return nullptr;
  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}

Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
                                                Builder.getFloatTy());
  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  auto FMAD = !ST->hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask
        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}
// Try to recognize special cases for which the DAG will emit special, better
// expansions than the general expansion we do here.
//
// TODO: It would be better to just directly handle those optimizations here.
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
                                                         Value *Num,
                                                         Value *Den) const {
  if (Constant *C = dyn_cast<Constant>(Den)) {
    // Arbitrary constants get a better expansion as long as a wider mulhi is
    // legal.
    if (C->getType()->getScalarSizeInBits() <= 32)
      return true;

    // TODO: Sdiv check for not exact for some reason.

    // If there's no wider mulhi, there's only a better expansion for powers of
    // two.
    // TODO: Should really know for each vector element.
    if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
      return true;

    return false;
  }

  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
    // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
                               0, AC, &I, DT))
      return true;
  }

  return false;
}
*getSign32(Value
*V
, IRBuilder
<> &Builder
, const DataLayout
*DL
) {
1363 // Check whether the sign can be determined statically.
1364 KnownBits Known
= computeKnownBits(V
, *DL
);
1365 if (Known
.isNegative())
1366 return Constant::getAllOnesValue(V
->getType());
1367 if (Known
.isNonNegative())
1368 return Constant::getNullValue(V
->getType());
1369 return Builder
.CreateAShr(V
, Builder
.getInt32(31));
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr;  // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() != 32) {
    if (IsSigned) {
      X = Builder.CreateSExtOrTrunc(X, I32Ty);
      Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
    } else {
      X = Builder.CreateZExtOrTrunc(X, I32Ty);
      Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);

    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // are rounded up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   ...
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(Zero, Y);
  Value *NegYZ = Builder.CreateMul(NegY, Z);
  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, X, Z);
  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(R, Y);
  if (IsDiv)
    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(R, Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  else
    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
    Res = Builder.CreateSExtOrTrunc(Res, Ty);
  } else {
    Res = Builder.CreateZExtOrTrunc(Res, Ty);
  }
  return Res;
}
Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den) const {
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
    return nullptr;  // Keep it for later optimization.

  Instruction::BinaryOps Opc = I.getOpcode();

  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits == -1)
    return nullptr;

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
                                  IsDiv, IsSigned);
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);
  }

  if (Narrowed) {
    return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
                      Builder.CreateZExt(Narrowed, Num->getType());
  }

  return nullptr;
}
void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  // Do the general expansion.
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
    expandDivisionUpTo64Bits(&I);
    return;
  }

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
    expandRemainderUpTo64Bits(&I);
    return;
  }

  llvm_unreachable("not a division");
}
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      NewDiv = PoisonValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
          }
        }

        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      FlowChanged = true;
      Changed = true;
    }
  }

  return Changed;
}
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
            ConstantAsMetadata::get(
                ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
            // Don't make assumptions about the high bits.
            ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))};

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}
&I
) {
1678 bool Changed
= false;
1680 if (ST
->has16BitInsts() && needsPromotionToI32(I
.getOperand(0)->getType()) &&
1682 Changed
|= promoteUniformOpToI32(I
);
1687 bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst
&I
) {
1688 Value
*Cond
= I
.getCondition();
1689 Value
*TrueVal
= I
.getTrueValue();
1690 Value
*FalseVal
= I
.getFalseValue();
1692 FCmpInst::Predicate Pred
;
1694 if (ST
->has16BitInsts() && needsPromotionToI32(I
.getType())) {
1695 if (UA
->isUniform(&I
))
1696 return promoteUniformOpToI32(I
);
1700 // Match fract pattern with nan check.
1701 if (!match(Cond
, m_FCmp(Pred
, m_Value(CmpVal
), m_NonNaN())))
1704 FPMathOperator
*FPOp
= dyn_cast
<FPMathOperator
>(&I
);
1708 IRBuilder
<> Builder(&I
);
1709 Builder
.setFastMathFlags(FPOp
->getFastMathFlags());
1711 auto *IITrue
= dyn_cast
<IntrinsicInst
>(TrueVal
);
1712 auto *IIFalse
= dyn_cast
<IntrinsicInst
>(FalseVal
);
1714 Value
*Fract
= nullptr;
1715 if (Pred
== FCmpInst::FCMP_UNO
&& TrueVal
== CmpVal
&& IIFalse
&&
1716 CmpVal
== matchFractPat(*IIFalse
)) {
1717 // isnan(x) ? x : fract(x)
1718 Fract
= applyFractPat(Builder
, CmpVal
);
1719 } else if (Pred
== FCmpInst::FCMP_ORD
&& FalseVal
== CmpVal
&& IITrue
&&
1720 CmpVal
== matchFractPat(*IITrue
)) {
1721 // !isnan(x) ? fract(x) : x
1722 Fract
= applyFractPat(Builder
, CmpVal
);
1726 Fract
->takeName(&I
);
1727 I
.replaceAllUsesWith(Fract
);
1728 RecursivelyDeleteTriviallyDeadInstructions(&I
, TLInfo
);
static bool areInSameBB(const Value *A, const Value *B) {
  const auto *IA = dyn_cast<Instruction>(A);
  const auto *IB = dyn_cast<Instruction>(B);
  return IA && IB && IA->getParent() == IB->getParent();
}
1739 // is likely to be folded away by the DAG combiner.
1740 static bool isInterestingPHIIncomingValue(const Value
*V
) {
1741 const auto *FVT
= dyn_cast
<FixedVectorType
>(V
->getType());
1745 const Value
*CurVal
= V
;
1747 // Check for insertelements, keeping track of the elements covered.
1748 BitVector
EltsCovered(FVT
->getNumElements());
1749 while (const auto *IE
= dyn_cast
<InsertElementInst
>(CurVal
)) {
1750 const auto *Idx
= dyn_cast
<ConstantInt
>(IE
->getOperand(2));
1752 // Non constant index/out of bounds index -> folding is unlikely.
1753 // The latter is more of a sanity check because canonical IR should just
1754 // have replaced those with poison.
1755 if (!Idx
|| Idx
->getZExtValue() >= FVT
->getNumElements())
1758 const auto *VecSrc
= IE
->getOperand(0);
1760 // If the vector source is another instruction, it must be in the same basic
1761 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1762 // unlikely to be able to do anything interesting here.
1763 if (isa
<Instruction
>(VecSrc
) && !areInSameBB(VecSrc
, IE
))
1767 EltsCovered
.set(Idx
->getZExtValue());
1769 // All elements covered.
1770 if (EltsCovered
.all())
1774 // We either didn't find a single insertelement, or the insertelement chain
1775 // ended before all elements were covered. Check for other interesting values.
1777 // Constants are always interesting because we can just constant fold the
1779 if (isa
<Constant
>(CurVal
))
1782 // shufflevector is likely to be profitable if either operand is a constant,
1783 // or if either source is in the same block.
1784 // This is because shufflevector is most often lowered as a series of
1785 // insert/extract elements anyway.
1786 if (const auto *SV
= dyn_cast
<ShuffleVectorInst
>(CurVal
)) {
1787 return isa
<Constant
>(SV
->getOperand(1)) ||
1788 areInSameBB(SV
, SV
->getOperand(0)) ||
1789 areInSameBB(SV
, SV
->getOperand(1));
static void collectPHINodes(const PHINode &I,
                            SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
  const auto [It, Inserted] = SeenPHIs.insert(&I);
  if (!Inserted)
    return;

  for (const Value *Inc : I.incoming_values()) {
    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
      collectPHINodes(*PhiInc, SeenPHIs);
  }

  for (const User *U : I.users()) {
    if (const auto *PhiU = dyn_cast<PHINode>(U))
      collectPHINodes(*PhiU, SeenPHIs);
  }
}
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  // Check in the cache first.
  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())
    return It->second;

  // We consider PHI nodes as part of "chains", so given a PHI node I, we
  // recursively consider all its users and incoming values that are also PHI
  // nodes. We then make a decision about all of those PHIs at once. Either they
  // all get broken up, or none of them do. That way, we avoid cases where a
  // single PHI is/is not broken and we end up reforming/exploding a vector
  // multiple times, or even worse, doing it in a loop.
  SmallPtrSet<const PHINode *, 8> WorkList;
  collectPHINodes(I, WorkList);

#ifndef NDEBUG
  // Check that none of the PHI nodes in the worklist are in the map. If some of
  // them are, it means we're not good enough at collecting related PHIs.
  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);
  }
#endif

  // To consider a PHI profitable to break, we need to see some interesting
  // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
  // must have one to consider all PHIs breakable.
  //
  // This threshold has been determined through performance testing.
  //
  // Note that the computation below is equivalent to
  //
  //    (unsigned)ceil((K / 3.0) * 2)
  //
  // It's simply written this way to avoid mixing integral/FP arithmetic.
  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
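  // (For example, a worklist of 4 PHIs gives alignTo(8, 3) / 3 == 3, i.e. at
  // least 3 of the 4 must have an interesting incoming value.)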
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {
    // Don't break PHIs that have no interesting incoming values. That is, where
    // there is no clear opportunity to fold the "extractelement" instructions
    // we would add.
    //
    // Note: IC does not run after this pass, so we're only interested in the
    // foldings that the DAG combiner can do.
    if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
      if (++NumBreakablePHIs >= Threshold) {
        CanBreak = true;
        break;
      }
    }
  }

  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;

  return CanBreak;
}
/// Helper class for "break large PHIs" (visitPHINode).
///
/// This represents a slice of a PHI's incoming value, which is made up of:
///   - The type of the slice (Ty)
///   - The index in the incoming value's vector where the slice starts (Idx)
///   - The number of elements in the slice (NumElts).
/// It also keeps track of the NewPHI node inserted for this particular slice.
///
/// Slice examples:
///   <4 x i64> -> Split into four i64 slices.
///     -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
///   <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
///     -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
class VectorSlice {
public:
  VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
      : Ty(Ty), Idx(Idx), NumElts(NumElts) {}

  Type *Ty = nullptr;
  unsigned Idx = 0;
  unsigned NumElts = 0;
  PHINode *NewPHI = nullptr;

  /// Slice \p Inc according to the information contained within this slice.
  /// This is cached, so if called multiple times for the same \p BB & \p Inc
  /// pair, it returns the same Sliced value as well.
  ///
  /// Note this *intentionally* does not return the same value for, say,
  /// [%bb.0, %0] & [%bb.1, %0] as:
  ///   - It could cause issues with dominance (e.g. if bb.1 is seen first, then
  ///     the value in bb.1 may not be reachable from bb.0 if it's its
  ///     predecessor.)
  ///   - We also want to make our extract instructions as local as possible so
  ///     the DAG has better chances of folding them out. Duplicating them like
  ///     that is beneficial in that regard.
  ///
  /// This is both a minor optimization to avoid creating duplicate
  /// instructions and a requirement for correctness. It is not forbidden for a
  /// PHI node to have the same [BB, Val] pair multiple times. If we returned a
  /// new value each time, those previously identical pairs would all have
  /// different incoming values (from the same block) and it'd cause a "PHI
  /// node has multiple entries for the same basic block with different incoming
  /// values!" verifier error.
  Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
    Value *&Res = SlicedVals[{BB, Inc}];
    if (Res)
      return Res;

    IRBuilder<> B(BB->getTerminator());
    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
      B.SetCurrentDebugLocation(IncInst->getDebugLoc());
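    // (With NumElts > 1, the mask below selects the contiguous slice; e.g. a
    // 2-element slice starting at Idx == 2 uses the shuffle mask <2, 3>.)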
    if (NumElts > 1) {
      SmallVector<int, 4> Mask;
      for (unsigned K = Idx; K < (Idx + NumElts); ++K)
        Mask.push_back(K);
      Res = B.CreateShuffleVector(Inc, Mask, NewValName);
    } else
      Res = B.CreateExtractElement(Inc, Idx, NewValName);

    return Res;
  }

private:
  SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
};
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Break up fixed-vector PHIs into smaller pieces.
  // Default threshold is 32, so it breaks up any vector that's >32 bits into
  // its elements, or into 32-bit pieces (for 8/16 bit elts).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
  if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT || FVT->getNumElements() == 1 ||
      DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
    return false;

  if (!ForceBreakLargePHIs && !canBreakPHINode(I))
    return false;

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16 bit types, don't scalarize fully but break the vector up into
    // as many 32-bit slices as we can, and scalarize the tail.
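    // (e.g. a <6 x i8> PHI becomes one <4 x i8> slice followed by two scalar i8
    // slices for the tail elements.)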
    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.emplace_back(SubVecTy, Idx, SubVecSize);
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(EltTy, Idx, 1);
  }

  assert(Slices.size() > 1);

  // Create one PHI per vector piece. The "VectorSlice" class takes care of
  // creating the necessary instructions to extract the relevant slices of each
  // incoming value.
  IRBuilder<> B(I.getParent());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // We need to reset the builder on each iteration, because getSlicedVal may
    // have inserted something into I's BB.
    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

    for (const auto &[Idx, BB] : enumerate(I.blocks())) {
      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),
                            BB);
    }
  }

  // And replace this PHI with a vector built from all of the new slice PHIs.
  Value *Vec = PoisonValue::get(FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
    if (S.NumElts > 1)
      Vec =
          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
  return true;
}
/// \param V Value to check
/// \param DL DataLayout
/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
/// \param AS Target Address Space
/// \return true if \p V cannot be the null value of \p AS, false otherwise.
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
                                const AMDGPUTargetMachine &TM, unsigned AS) {
  // Pointer cannot be null if it's a block address, GV or alloca.
  // NOTE: We don't support extern_weak, but if we did, we'd need to check for
  // it as the symbol could be null in such cases.
  if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
    return true;

  // Check nonnull arguments.
  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
    return true;

  // getUnderlyingObject may have looked through another addrspacecast, although
  // the optimizable situations have most likely been folded out by now.
  if (AS != cast<PointerType>(V->getType())->getAddressSpace())
    return false;

  // TODO: Calls that return nonnull?

  // For all other things, use KnownBits.
  // We either use 0 or all bits set to indicate null, so check whether the
  // value can be zero or all ones.
  //
  // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
  // address spaces have non-zero null values.
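  // (On AMDGPU, flat/global/constant pointers use 0 as null, while the private
  // and local address spaces use an all-ones pattern, so both encodings have
  // to be handled below.)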
  auto SrcPtrKB = computeKnownBits(V, DL);
  const auto NullVal = TM.getNullPointerValue(AS);

  assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
  assert((NullVal == 0 || NullVal == -1) &&
         "don't know how to check for this null value!");
  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
}
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  // The intrinsic doesn't support vectors. It also seems difficult to prove
  // that a vector cannot contain any null elements, so it's unclear whether
  // vectors are worth supporting.
  if (I.getType()->isVectorTy())
    return false;

  // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
  // This is only worthwhile for casts between priv/local and flat.
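  // (The nonnull form lets the backend drop the null-pointer check that a
  // plain flat<->private/local addrspacecast lowering otherwise has to
  // perform.)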
  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
                DstAS == AMDGPUAS::PRIVATE_ADDRESS);
  else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
                SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
  if (!CanLower)
    return false;

  SmallVector<const Value *, 4> WorkList;
  getUnderlyingObjects(I.getOperand(0), WorkList);
  if (!all_of(WorkList, [&](const Value *V) {
        return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
      }))
    return false;

  IRBuilder<> B(&I);
  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  I.eraseFromParent();
  return true;
}
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  case Intrinsic::minnum:
    return visitMinNum(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}
/// Match the non-nan fract pattern:
///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
///
/// This only applies if fract is a useful instruction for the subtarget. It
/// does not account for the nan handling; the instruction has a nan check on
/// the input value.
Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
  if (ST->hasFractBug())
    return nullptr;

  if (I.getIntrinsicID() != Intrinsic::minnum)
    return nullptr;

  Type *Ty = I.getType();
  if (!isLegalFloatingTy(Ty->getScalarType()))
    return nullptr;

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  const APFloat *C;
  if (!match(Arg1, m_APFloat(C)))
    return nullptr;

  APFloat One(1.0);
  bool LosesInfo;
  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

  // Match nextafter(1.0, -1)
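  // (nextafter(1.0, -1) is the largest representable value strictly below 1.0
  // in C's float semantics.)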
  One.next(true);
  if (One != *C)
    return nullptr;

  Value *FloorSrc;
  if (match(Arg0, m_FSub(m_Value(FloorSrc),
                         m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
    return FloorSrc;
  return nullptr;
}
Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
                                               Value *FractArg) {
  SmallVector<Value *, 4> FractVals;
  extractValues(Builder, FractVals, FractArg);

  SmallVector<Value *, 4> ResultVals(FractVals.size());

  Type *Ty = FractArg->getType()->getScalarType();
  for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
    ResultVals[I] =
        Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
  }

  return insertValues(Builder, FractArg->getType(), ResultVals);
}
bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
  Value *FractArg = matchFractPat(I);
  if (!FractArg)
    return false;

  // Match the pattern for the fract intrinsic in contexts where the nan check
  // has been optimized out (and hope the knowledge that the source can't be
  // nan wasn't lost).
  if (!I.hasNoNaNs() &&
      !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(*DL, TLInfo)))
    return false;

  IRBuilder<> Builder(&I);
  FastMathFlags FMF = I.getFastMathFlags();
  FMF.setNoNaNs();
  Builder.setFastMathFlags(FMF);

  Value *Fract = applyFractPat(Builder, FractArg);
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);

  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}
static bool isOneOrNegOne(const Value *Val) {
  const APFloat *C;
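  // getExactLog2Abs() returns 0 exactly when |C| == 1.0.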
  return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
}
// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  Type *Ty = Sqrt.getType()->getScalarType();
  if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
  FastMathFlags SqrtFMF = FPOp->getFastMathFlags();

  // We're trying to handle the fast-but-not-that-fast case only. The lowering
  // of fast llvm.sqrt will give the raw instruction anyway.
  if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
    return false;

  const float ReqdAccuracy = FPOp->getFPAccuracy();
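  // (getFPAccuracy() reads the call's !fpmath metadata; e.g. "!fpmath !{float
  // 2.5}" permits up to 2.5 ulps of error.)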
  // Defer correctly rounded expansion to codegen.
  if (ReqdAccuracy < 1.0f)
    return false;

  // FIXME: This is an ugly hack for this pass using forward iteration instead
  // of reverse. If it worked like a normal combiner, the rsq would form before
  // we saw a sqrt call.
  auto *FDiv =
      dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
      FDiv->getFPAccuracy() >= 1.0f &&
      canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
      // TODO: We should also handle the arcp case for the fdiv with a non-1
      // numerator.
      isOneOrNegOne(FDiv->getOperand(0)))
    return false;

  Value *SrcVal = Sqrt.getOperand(0);
  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  // The raw instruction is 1 ulp, but the correction for denormal handling
  // brings it to 2 ulps.
  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
    return false;

  IRBuilder<> Builder(&Sqrt);
  SmallVector<Value *, 4> SrcVals;
  extractValues(Builder, SrcVals, SrcVal);

  SmallVector<Value *, 4> ResultVals(SrcVals.size());
  for (int I = 0, E = SrcVals.size(); I != E; ++I) {
    if (CanTreatAsDAZ)
      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
    else
      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
  }

  Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
  NewSqrt->takeName(&Sqrt);
  Sqrt.replaceAllUsesWith(NewSqrt);
  Sqrt.eraseFromParent();
  return true;
}
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Impl.Mod = &M;
  Impl.DL = &Impl.Mod->getDataLayout();
  Impl.SqrtF32 = nullptr;
  Impl.LdexpF32 = nullptr;
  return false;
}
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  Impl.TM = &TM;
  Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
  SIModeRegisterDefaults Mode(F, *Impl.ST);
  Impl.HasFP32DenormalFlush =
      Mode.FP32Denormals == DenormalMode::getPreserveSign();
  return Impl.run(F);
}
PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
                                                FunctionAnalysisManager &FAM) {
  AMDGPUCodeGenPrepareImpl Impl;
  Impl.Mod = F.getParent();
  Impl.DL = &Impl.Mod->getDataLayout();
  Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
  Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
  Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
  Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
  SIModeRegisterDefaults Mode(F, *Impl.ST);
  Impl.HasFP32DenormalFlush =
      Mode.FP32Denormals == DenormalMode::getPreserveSign();
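  // Only claim that CFG analyses are preserved when no transform introduced
  // new control flow (FlowChanged stays false in that case).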
  PreservedAnalyses PA = PreservedAnalyses::none();
  if (!Impl.FlowChanged)
    PA.preserveSet<CFGAnalyses>();
  return Impl.run(F) ? PA : PreservedAnalyses::all();
}
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}