//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<bool> WidenLoads(
    "amdgpu-codegenprepare-widen-constant-loads",
    cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),

static cl::opt<bool> Widen16BitOps(
    "amdgpu-codegenprepare-widen-16-bit-ops",
    cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),

static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),
                   cl::ReallyHidden, cl::init(true));

static cl::opt<bool>
    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),
                        cl::ReallyHidden, cl::init(false));

static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

static cl::opt<bool> UseMul24Intrin(
    "amdgpu-codegenprepare-mul24",
    cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
    "amdgpu-codegenprepare-expand-div64",
    cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
    "amdgpu-codegenprepare-disable-idiv-expansion",
    cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
    "amdgpu-codegenprepare-disable-fdiv-expansion",
    cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),

namespace {

class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  const GCNSubtarget *ST = nullptr;
  const AMDGPUTargetMachine *TM = nullptr;
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;
  UniformityInfo *UA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;
  bool HasFP32DenormalFlush = false;
  bool FlowChanged = false;
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;

  DenseMap<const PHINode *, bool> BreakPhiNodesCache;

  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;

    LLVMContext &Ctx = Mod->getContext();
    SqrtF32 = Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_sqrt,
                                                {Type::getFloatTy(Ctx)});
    return SqrtF32;
  }

  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;

    LLVMContext &Ctx = Mod->getContext();
    LdexpF32 = Intrinsic::getOrInsertDeclaration(
        Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
    return LdexpF32;
  }

  bool canBreakPHINode(const PHINode &I);

  /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.
  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass
  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                   const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
                                     DT);
  }

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
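  //
  // Illustrative sketch (not from the original source): a uniform
  //   %r = add i16 %a, %b
  // is conceptually rewritten to
  //   %a32 = sext i16 %a to i32
  //   %b32 = sext i16 %b to i32
  //   %r32 = add i32 %a32, %b32
  //   %r   = trunc i32 %r32 to i16
  // (zext is used instead of sext for unsigned operations).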
  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
  /// result of 32 bit 'select' operation back to \p I's original type.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
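  //
  // Illustrative sketch (not from the original source): a uniform
  //   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  // is conceptually rewritten to
  //   %x32 = zext i16 %x to i32
  //   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
  //   %s   = lshr i32 %r32, 16   ; 32 minus the original 16-bit width
  //   %r   = trunc i32 %s to i16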
  /// \returns The minimum number of bits needed to store the value of \Op as an
  /// unsigned integer. Truncating to this size and then zero-extending to
  /// the original will not change the value.
  unsigned numBitsUnsigned(Value *Op) const;

  /// \returns The minimum number of bits needed to store the value of \Op as a
  /// signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
  /// SelectionDAG has an issue where an and asserting the bits are known
  bool replaceMulWithMul24(BinaryOperator &I) const;
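  //
  // Illustrative sketch (not from the original source): when both operands of
  //   %m = mul i32 %a, %b
  // are known to fit in 24 bits, the multiply can be rewritten as
  //   %m = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
  // (or @llvm.amdgcn.mul.i24 for the signed case).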
  /// Perform same function as equivalently named function in DAGCombiner. Since
  /// we expand some divisions here, we need to perform this before obscuring.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  int getDivNumBits(BinaryOperator &I,
                    Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value *expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value *expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a scalar load for uniform, small type loads from constant
  /// memory to a full 32 bits and then truncate the result to allow a scalar
  /// load instead of a vector load.
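  //
  // Illustrative sketch (not from the original source):
  //   %v = load i8, ptr addrspace(4) %p, align 4
  // becomes, conceptually,
  //   %w = load i32, ptr addrspace(4) %p, align 4
  //   %t = trunc i32 %w to i8
  // so the backend can select a scalar load instead of a vector load.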
  bool canWidenScalarExtLoad(LoadInst &I) const;

  Value *matchFractPat(IntrinsicInst &I);
  Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);

  bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
                          FastMathFlags SqrtFMF) const;

  Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                         const Instruction *CtxI) const;

  Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags FMF, const Instruction *CtxI) const;
  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;

  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDiv,
                          float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                      FastMathFlags FMF) const;
  Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                          FastMathFlags FMF) const;

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);
  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
  bool visitMinNum(IntrinsicInst &I);
  bool visitSqrt(IntrinsicInst &I);
  bool run(Function &F);
};

class AMDGPUCodeGenPrepare : public FunctionPass {
private:
  AMDGPUCodeGenPrepareImpl Impl;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {
    initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();

    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override;
  bool doInitialization(Module &M) override;
  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};

} // end anonymous namespace

bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

  Function::iterator NextBB;
  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
    BasicBlock *BB = &*FI;
    NextBB = std::next(FI);

    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
         I = Next) {
      Next = std::next(I);

      MadeChange |= visit(*I);

      if (Next != E) { // Control flow changed
        BasicBlock *NextInstBB = Next->getParent();
        if (NextInstBB != BB) {
          BB = NextInstBB;
          E = BB->end();
          FE = F.end();
        }
      }
    }
  }
  return MadeChange;
}

unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
}

bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
  return Ty->isFloatTy() || Ty->isDoubleTy() ||
         (Ty->isHalfTy() && ST->has16BitInsts());
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
}

bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }

  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes =
      Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
  return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
}

unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
  return ComputeMaxSignificantBits(Op, *DL, 0, AC);
}

static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  auto *VT = dyn_cast<FixedVectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (!Ty->isVectorTy()) {
    assert(Values.size() == 1);
    return Values[0];
  }

  Value *NewVal = PoisonValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}

bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (UA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
    IsSigned = false;
  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {
    IsSigned = true;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);
    ResultVals.push_back(Result);
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  I.eraseFromParent();

  return true;
}

// Find a select instruction, which may have been casted. This is mostly to deal
// with cases where i16 selects were promoted here to i32.
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
  Cast = nullptr;
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
    return Sel;

  if ((Cast = dyn_cast<CastInst>(V))) {
    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
      return Sel;
  }

  return nullptr;
}

bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;
  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
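  //
  // Illustrative sketch (not from the original source):
  //   %s = select i1 %c, i32 3, i32 5
  //   %r = add i32 %s, 7
  // can be folded to
  //   %r = select i1 %c, i32 10, i32 12
  // provided %s has no other users.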
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  Constant *FoldedT = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
  if (!FoldedT || isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
  if (!FoldedF || isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  BO.eraseFromParent();
  if (CastOp)
    CastOp->eraseFromParent();
  Sel->eraseFromParent();
  return true;
}

std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
                                          Value *Src) const {
  Type *Ty = Src->getType();
  Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
                                         {Ty, Builder.getInt32Ty()}, Src);
  Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});

  // Bypass the bug workaround for the exponent result since it doesn't matter.
  // TODO: Does the bug workaround even really need to consider the exponent
  // result? It's unspecified by the spec.
  Value *FrexpExp =
      ST->hasFractBug()
          ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
                                    {Builder.getInt32Ty(), Ty}, Src)
          : Builder.CreateExtractValue(Frexp, {1});
  return {FrexpMant, FrexpExp};
}

/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
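//
// Illustrative sketch (not from the original source), in pseudo-IR:
//   { %mant, %exp } = frexp(%src)     ; %mant is scaled out of the denormal range
//   %rcp = llvm.amdgcn.rcp(%mant)
//   %res = llvm.ldexp(%rcp, -%exp)    ; undo the scaling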
Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
                                                 Value *Src,
                                                 bool IsNegative) const {
  // Same as for 1.0, but expand the sign out of the constant.
  // -1.0 / x -> rcp (fneg x)
  if (IsNegative)
    Src = Builder.CreateFNeg(Src);

  // The rcp instruction doesn't support denormals, so scale the input
  // out of the denormal range and convert at the end.
  //
  // Expand as 2^-n * (1.0 / (x * 2^n))
  //
  // TODO: Skip scaling if input is known never denormal and the input
  // range won't underflow to denormal. The hard part is knowing the
  // result. We need a range check, the result could be denormal for
  // 0x1p+126 < den <= 0x1p+127.
  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
  Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
  Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
  return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
}

/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
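//
// Illustrative sketch (not from the original source), in pseudo-IR:
//   { %mant_b, %exp_b } = frexp(%b)
//   %rcp = llvm.amdgcn.rcp(%mant_b)
//   { %mant_a, %exp_a } = frexp(%a)
//   %mul = fmul %mant_a, %rcp
//   %res = llvm.ldexp(%mul, %exp_a - %exp_b)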
Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
                                              Value *RHS,
                                              FastMathFlags FMF) const {
  // If we have to work around the fract/frexp bug, we're worse off than
  // using the fdiv.fast expansion. The full safe expansion is faster if we have
  // fast FMA.
  if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
      (!FMF.noNaNs() || !FMF.noInfs()))
    return nullptr;

  // We're scaling the LHS to avoid a denormal input, and scale the denominator
  // to avoid large values underflowing the result.
  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);

  Value *Rcp =
      Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
  Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);

  // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
  // result.
  Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
  return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
}

/// Emit a sqrt that handles denormals and is accurate to 2ulp.
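//
// Illustrative note (not from the original source): inputs below the smallest
// normal are scaled by 2^32 before the sqrt and the result is rescaled by
// 2^-16, since sqrt(x * 2^32) == sqrt(x) * 2^16.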
Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
                                                  Value *Src,
                                                  FastMathFlags FMF) const {
  Type *Ty = Src->getType();
  APFloat SmallestNormal =
      APFloat::getSmallestNormalized(Ty->getFltSemantics());
  Value *NeedScale =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));

  ConstantInt *Zero = Builder.getInt32(0);
  Value *InputScaleFactor =
      Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);

  Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});

  Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);

  Value *OutputScaleFactor =
      Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
  return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
}

/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                              bool IsNegative) {
  // bool need_scale = x < 0x1p-126f;
  // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
  // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
  // rsq(x * input_scale) * output_scale;

  Type *Ty = Src->getType();
  APFloat SmallestNormal =
      APFloat::getSmallestNormalized(Ty->getFltSemantics());
  Value *NeedScale =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
  Constant *One = ConstantFP::get(Ty, 1.0);
  Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
  Constant *OutputScale =
      ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);

  Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);

  Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
  Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
  Value *OutputScaleFactor = Builder.CreateSelect(
      NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);

  return Builder.CreateFMul(Rsq, OutputScaleFactor);
}

bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
                                                  FastMathFlags DivFMF,
                                                  FastMathFlags SqrtFMF) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
    return false;

  // v_rsq_f32 gives 1ulp
  return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
         SqrtOp->getFPAccuracy() >= 1.0f;
}

Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
  if (!CLHS)
    return nullptr;

  assert(Den->getType()->isFloatTy());

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
    // Add in the sqrt flags.
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(DivFMF | SqrtFMF);

    if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
        canIgnoreDenormalInput(Den, CtxI)) {
      Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
      // -1.0 / sqrt(x) -> fneg(rsq(x))
      return IsNegative ? Builder.CreateFNeg(Result) : Result;
    }

    return emitRsqIEEE1ULP(Builder, Den, IsNegative);
  }

  return nullptr;
}

// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
// allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
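//
// Illustrative sketch (not from the original source):
//   %r = fdiv afn float 1.0, %x
// can become
//   %r = call float @llvm.amdgcn.rcp.f32(float %x)
// and, with arcp, %a / %b can become %a * rcp(%b).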
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(Src);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.
        //
        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        // insert rsq intrinsic here.
        return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
      }

      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  }

  if (FMF.allowReciprocal()) {
    // x / y -> x * (1.0 / y)
    //
    // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
    // will never underflow.
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
      return Builder.CreateFMul(Num, Recip);
    }

    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
    return Builder.CreateFMul(Num, Recip);
  }

  return nullptr;
}

// optimize with fdiv.fast:
//
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
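//
// Illustrative sketch (not from the original source):
//   %r = fdiv float %a, %b, !fpmath !0    ; !0 = !{float 2.5}
// can become
//   %r = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)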
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
    IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
  // fdiv.fast can achieve 2.5 ULP accuracy.
  if (ReqdAccuracy < 2.5f)
    return nullptr;

  // Only have fdiv.fast for f32.
  assert(Den->getType()->isFloatTy());

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
      NumIsOne = true;
  }

  // fdiv does not support denormals. But 1.0/x is always fine to use it.
  //
  // TODO: This works for any value with a specific known exponent range, don't
  // just limit to constant 1.
  if (!HasFP32DenormalFlush && !NumIsOne)
    return nullptr;

  return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
}

Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
    FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
    float ReqdDivAccuracy) const {
  if (RsqOp) {
    Value *Rsq =
        optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
    if (Rsq)
      return Rsq;
  }

  Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
  if (Rcp)
    return Rcp;

  // In the basic case fdiv_fast has the same instruction count as the frexp div
  // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
  // potentially be fused into a user. Also, materialization of the constants
  // can be reused for multiple instances.
  Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
  if (FDivFast)
    return FDivFast;

  return emitFrexpDiv(Builder, Num, Den, DivFMF);
}

// Optimization is performed based on fpmath, fast math flags as well as
// denormals to optimize fdiv with either rcp or fdiv.fast.
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
// allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
//
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases that both are legal.
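//
// Illustrative sketch (not from the original source): an fdiv carrying
// !fpmath !{float 2.5} with flushed f32 denormals may be expanded here via
// fdiv.fast, while a plain
//   %r = fdiv float %a, %b
// with no !fpmath metadata is left for codegen to expand correctly.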
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  Type *Ty = FDiv.getType()->getScalarType();
  if (!Ty->isFloatTy())
    return false;

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen. f16 is good enough to always use.

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  FastMathFlags SqrtFMF;

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(0);
  }

  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
  //
  // Defer to codegen to handle this.
  //
  // TODO: Decide on an interpretation for interactions between afn + arcp +
  // !fpmath, and make it consistent between here and codegen. For now, defer
  // expansion of afn to codegen. The current interpretation is so aggressive we
  // don't need any pre-consideration here when we have better information. A
  // more conservative interpretation could use handling here.
  const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    return false;

  // Defer the correct implementations to codegen.
  if (ReqdAccuracy < 1.0f)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, NumVals, Num);
  extractValues(Builder, DenVals, Den);

  if (RsqOp)
    extractValues(Builder, RsqDenVals, RsqOp);

  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.
      //
      // This has the unfortunate side effect of sometimes scalarizing when
      // we're not going to do anything.
      NewElt = Builder.CreateFDiv(NumElt, DenElt);
      if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
        NewEltInst->copyMetadata(FDiv);
    }

    ResultVals[I] = NewElt;
  }

  Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);

  FDiv.replaceAllUsesWith(NewVal);
  NewVal->takeName(&FDiv);
  RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsBool();
}

static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder,
                                            Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::pair(Lo, Hi);
}

static Value *getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}

/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if the
/// first one is insufficient. Returns -1 on failure.
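//
// Illustrative note (not from the original source): for an i64 sdiv whose
// operands are both known to have at least 41 sign bits, the division only
// needs 64 - 41 (+1 for the sign) = 24 bits, so the 24-bit expansion applies.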
int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                            Value *Den, unsigned AtLeast,
                                            bool IsSigned) const {
  const DataLayout &DL = Mod->getDataLayout();
  if (IsSigned) {
    unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
    if (LHSSignBits < AtLeast)
      return -1;

    unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
    if (RHSSignBits < AtLeast)
      return -1;

    unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
    return DivBits + 1;
  }

  KnownBits Known = computeKnownBits(Num, DL, 0, AC, &I);
  // We know all bits are used for division for Num or Den in range
  // (SignedMax, UnsignedMax]
  if (Known.isNegative() || !Known.isNonNegative())
    return -1;
  unsigned LHSSignBits = Known.countMinLeadingZeros();

  Known = computeKnownBits(Den, DL, 0, AC, &I);
  if (Known.isNegative() || !Known.isNonNegative())
    return -1;
  unsigned RHSSignBits = Known.countMinLeadingZeros();

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
  return DivBits;
}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
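//
// Illustrative sketch (not from the original source): the quotient is
// approximated in f32 roughly as
//   fq = trunc((float)num * rcp((float)den))
// and then corrected by comparing the remainder mad(-fq, fb, fa) against fb.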
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den, bool IsDiv,
                                                bool IsSigned) const {
  unsigned SSBits = Num->getType()->getScalarSizeInBits();
  // If Num bits <= 24, assume 0 signbits.
  unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
  int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
  if (DivBits == -1)
    return nullptr;
  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}

Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib, (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
                                       Builder.getFloatTy(), {FB});
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  auto FMAD = !ST->hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask
          = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}

// Try to recognize special cases where the DAG will emit special, better
// expansions than the general expansion we do here.
//
// TODO: It would be better to just directly handle those optimizations here.
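//
// Illustrative note (not from the original source):
//   %q = udiv i32 %x, 16
// is left untouched here because the DAG lowers a divide by a power of two
// to a simple shift.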
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
                                                         Value *Num,
                                                         Value *Den) const {
  if (Constant *C = dyn_cast<Constant>(Den)) {
    // Arbitrary constants get a better expansion as long as a wider mulhi is
    // legal.
    if (C->getType()->getScalarSizeInBits() <= 32)
      return true;

    // TODO: Sdiv check for not exact for some reason.

    // If there's no wider mulhi, there's only a better expansion for powers of
    // two.
    // TODO: Should really know for each vector element.
    if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
      return true;

    return false;
  }

  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
    // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
                               0, AC, &I, DT))
      return true;
  }

  return false;
}

static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
  // Check whether the sign can be determined statically.
  KnownBits Known = computeKnownBits(V, *DL);
  if (Known.isNegative())
    return Constant::getAllOnesValue(V->getType());
  if (Known.isNonNegative())
    return Constant::getNullValue(V->getType());
  return Builder.CreateAShr(V, Builder.getInt32(31));
}

Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr;  // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() != 32) {
    if (IsSigned) {
      X = Builder.CreateSExtOrTrunc(X, I32Ty);
      Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
    } else {
      X = Builder.CreateZExtOrTrunc(X, I32Ty);
      Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);

    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   ...
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
  Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(Zero, Y);
  Value *NegYZ = Builder.CreateMul(NegY, Z);
  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, X, Z);
  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(R, Y);
  if (IsDiv)
    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(R, Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  else
    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
    Res = Builder.CreateSExtOrTrunc(Res, Ty);
  } else {
    Res = Builder.CreateZExtOrTrunc(Res, Ty);
  }
  return Res;
}

Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den) const {
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
    return nullptr;  // Keep it for later optimization.

  Instruction::BinaryOps Opc = I.getOpcode();

  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits == -1)
    return nullptr;

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
                                  IsDiv, IsSigned);
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);
  }

  if (Narrowed) {
    return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
                      Builder.CreateZExt(Narrowed, Num->getType());
  }

  return nullptr;
}

void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  // Do the general expansion.
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
    expandDivisionUpTo64Bits(&I);
    return;
  }

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
    expandRemainderUpTo64Bits(&I);
    return;
  }

  llvm_unreachable("not a division");
}

bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      NewDiv = PoisonValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
          }
        }

        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      FlowChanged = true;
      Changed = true;
    }
  }

  return Changed;
}

bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
            ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
            // Don't make assumptions about the high bits.
            ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      UA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  Value *Cond = I.getCondition();
  Value *TrueVal = I.getTrueValue();
  Value *FalseVal = I.getFalseValue();
  Value *CmpVal;
  FCmpInst::Predicate Pred;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
    if (UA->isUniform(&I))
      return promoteUniformOpToI32(I);
    return false;
  }

  // Match fract pattern with nan check.
  if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
    return false;

  FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
  if (!FPOp)
    return false;

  IRBuilder<> Builder(&I);
  Builder.setFastMathFlags(FPOp->getFastMathFlags());

  auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
  auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);

  Value *Fract = nullptr;
  if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
      CmpVal == matchFractPat(*IIFalse)) {
    // isnan(x) ? x : fract(x)
    Fract = applyFractPat(Builder, CmpVal);
  } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
             CmpVal == matchFractPat(*IITrue)) {
    // !isnan(x) ? fract(x) : x
    Fract = applyFractPat(Builder, CmpVal);
  } else
    return false;

  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);
  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}

static bool areInSameBB(const Value *A, const Value *B) {
  const auto *IA = dyn_cast<Instruction>(A);
  const auto *IB = dyn_cast<Instruction>(B);
  return IA && IB && IA->getParent() == IB->getParent();
}

// Helper for breaking large PHIs that returns true when an extractelement on V
// is likely to be folded away by the DAG combiner.
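//
// Illustrative sketch (not from the original source): an incoming value built
// from a chain of insertelement instructions with constant indices, e.g.
//   %v0 = insertelement <4 x float> poison, float %a, i64 0
//   %v1 = insertelement <4 x float> %v0, float %b, i64 1
// is "interesting" because extractelements of it can be folded away.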
static bool isInterestingPHIIncomingValue(const Value *V) {
  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
  if (!FVT)
    return false;

  const Value *CurVal = V;

  // Check for insertelements, keeping track of the elements covered.
  BitVector EltsCovered(FVT->getNumElements());
  while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
    const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));

    // Non constant index/out of bounds index -> folding is unlikely.
    // The latter is more of a sanity check because canonical IR should just
    // have replaced those with poison.
    if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
      return false;

    const auto *VecSrc = IE->getOperand(0);

    // If the vector source is another instruction, it must be in the same basic
    // block. Otherwise, the DAGCombiner won't see the whole thing and is
    // unlikely to be able to do anything interesting here.
    if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
      return false;

    CurVal = VecSrc;
    EltsCovered.set(Idx->getZExtValue());

    // All elements covered.
    if (EltsCovered.all())
      return true;
  }

  // We either didn't find a single insertelement, or the insertelement chain
  // ended before all elements were covered. Check for other interesting values.

  // Constants are always interesting because we can just constant fold the
  // extractelements.
  if (isa<Constant>(CurVal))
    return true;

  // shufflevector is likely to be profitable if either operand is a constant,
  // or if either source is in the same block.
  // This is because shufflevector is most often lowered as a series of
  // insert/extract elements anyway.
  if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
    return isa<Constant>(SV->getOperand(1)) ||
           areInSameBB(SV, SV->getOperand(0)) ||
           areInSameBB(SV, SV->getOperand(1));
  }

  return false;
}

static void collectPHINodes(const PHINode &I,
                            SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
  const auto [It, Inserted] = SeenPHIs.insert(&I);
  if (!Inserted)
    return;

  for (const Value *Inc : I.incoming_values()) {
    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
      collectPHINodes(*PhiInc, SeenPHIs);
  }

  for (const User *U : I.users()) {
    if (const auto *PhiU = dyn_cast<PHINode>(U))
      collectPHINodes(*PhiU, SeenPHIs);
  }
}

bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  // Check in the cache first.
  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())
    return It->second;

  // We consider PHI nodes as part of "chains", so given a PHI node I, we
  // recursively consider all its users and incoming values that are also PHI
  // nodes. We then make a decision about all of those PHIs at once. Either they
  // all get broken up, or none of them do. That way, we avoid cases where a
  // single PHI is/is not broken and we end up reforming/exploding a vector
  // multiple times, or even worse, doing it in a loop.
  SmallPtrSet<const PHINode *, 8> WorkList;
  collectPHINodes(I, WorkList);

#ifndef NDEBUG
  // Check that none of the PHI nodes in the worklist are in the map. If some of
  // them are, it means we're not good enough at collecting related PHIs.
  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);
  }
#endif

  // To consider a PHI profitable to break, we need to see some interesting
  // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
  // must have one to consider all PHIs breakable.
  //
  // This threshold has been determined through performance testing.
  //
  // Note that the computation below is equivalent to
  //    (unsigned)ceil((K / 3.0) * 2)
  // It's simply written this way to avoid mixing integral/FP arithmetic.
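  //
  // For example, a worklist of 4 PHIs gives a threshold of
  // alignTo(8, 3) / 3 == 3 (== ceil(8 / 3.0)); a worklist of 5 gives 4.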
  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {
    // Don't break PHIs that have no interesting incoming values. That is, where
    // there is no clear opportunity to fold the "extractelement" instructions
    // we would add.
    //
    // Note: IC does not run after this pass, so we're only interested in the
    // foldings that the DAG combiner can do.
    if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
      if (++NumBreakablePHIs >= Threshold) {
        CanBreak = true;
        break;
      }
    }
  }

  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;

  return CanBreak;
}

/// Helper class for "break large PHIs" (visitPHINode).
///
/// This represents a slice of a PHI's incoming value, which is made up of:
///   - The type of the slice (Ty)
///   - The index in the incoming value's vector where the slice starts (Idx)
///   - The number of elements in the slice (NumElts).
/// It also keeps track of the NewPHI node inserted for this particular slice.
///
/// Slice examples:
///   <4 x i64> -> Split into four i64 slices.
///     -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
///   <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
///     -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
class VectorSlice {
public:
  VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
      : Ty(Ty), Idx(Idx), NumElts(NumElts) {}

  Type *Ty = nullptr;
  unsigned Idx = 0;
  unsigned NumElts = 0;
  PHINode *NewPHI = nullptr;

  /// Slice \p Inc according to the information contained within this slice.
  /// This is cached, so if called multiple times for the same \p BB & \p Inc
  /// pair, it returns the same Sliced value as well.
  ///
  /// Note this *intentionally* does not return the same value for, say,
  /// [%bb.0, %0] & [%bb.1, %0] as:
  ///   - It could cause issues with dominance (e.g. if bb.1 is seen first, then
  ///     the value in bb.1 may not be reachable from bb.0 if it's its
  ///     predecessor.)
  ///   - We also want to make our extract instructions as local as possible so
  ///     the DAG has better chances of folding them out. Duplicating them like
  ///     that is beneficial in that regard.
  ///
  /// This is both a minor optimization to avoid creating duplicate
  /// instructions, but also a requirement for correctness. It is not forbidden
  /// for a PHI node to have the same [BB, Val] pair multiple times. If we
  /// returned a new value each time, those previously identical pairs would all
  /// have different incoming values (from the same block) and it'd cause a "PHI
  /// node has multiple entries for the same basic block with different incoming
  /// values!" verifier error.
  Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
    Value *&Res = SlicedVals[{BB, Inc}];
    if (Res)
      return Res;

    IRBuilder<> B(BB->getTerminator());
    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
      B.SetCurrentDebugLocation(IncInst->getDebugLoc());

    if (NumElts > 1) {
      SmallVector<int, 4> Mask;
      for (unsigned K = Idx; K < (Idx + NumElts); ++K)
        Mask.push_back(K);
      Res = B.CreateShuffleVector(Inc, Mask, NewValName);
    } else
      Res = B.CreateExtractElement(Inc, Idx, NewValName);

    return Res;
  }

private:
  SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
};

bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Break-up fixed-vector PHIs into smaller pieces.
  // Default threshold is 32, so it breaks up any vector that's >32 bits into
  // its elements, or into 32-bit pieces (for 8/16 bit elts).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
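  //
  // As a sketch of the transformation: with the default 32-bit threshold, a
  // <8 x i16> PHI becomes four <2 x i16> PHIs. Each incoming value is sliced
  // with shufflevector/extractelement in its predecessor block, and the
  // original vector is rebuilt after the new PHIs with
  // llvm.vector.insert/insertelement before replacing all uses.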
  if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT || FVT->getNumElements() == 1 ||
      DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
    return false;

  if (!ForceBreakLargePHIs && !canBreakPHINode(I))
    return false;

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16 bits type, don't scalarize fully but break it up into as many
    // 32-bit slices as we can, and scalarize the tail.
    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.emplace_back(SubVecTy, Idx, SubVecSize);
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(EltTy, Idx, 1);
  }

  assert(Slices.size() > 1);

  // Create one PHI per vector piece. The "VectorSlice" class takes care of
  // creating the necessary instruction to extract the relevant slices of each
  // incoming value.
  IRBuilder<> B(I.getParent());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // We need to reset the build on each iteration, because getSlicedVal may
    // have inserted something into I's BB.
    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

    for (const auto &[Idx, BB] : enumerate(I.blocks())) {
      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),
                            BB);
    }
  }

  // And replace this PHI with a vector of all the previous PHI values.
  Value *Vec = PoisonValue::get(FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
    if (S.NumElts > 1)
      Vec =
          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
  return true;
}

/// \param V Value to check
/// \param DL DataLayout
/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
/// \param AS Target Address Space
/// \return true if \p V cannot be the null value of \p AS, false otherwise.
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
                                const AMDGPUTargetMachine &TM, unsigned AS) {
  // Pointer cannot be null if it's a block address, GV or alloca.
  // NOTE: We don't support extern_weak, but if we did, we'd need to check for
  // it as the symbol could be null in such cases.
  if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
    return true;

  // Check nonnull arguments.
  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
    return true;

  // getUnderlyingObject may have looked through another addrspacecast, although
  // the optimizable situations most likely folded out by now.
  if (AS != cast<PointerType>(V->getType())->getAddressSpace())
    return false;

  // TODO: Calls that return nonnull?

  // For all other things, use KnownBits.
  // We either use 0 or all bits set to indicate null, so check whether the
  // value can be zero or all ones.
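  // (On AMDGPU, address spaces such as local and private use -1 as their null
  // value rather than 0; getNullPointerValue below reports which convention
  // applies for \p AS.)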
  //
  // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
  // address spaces have non-zero null values.
  auto SrcPtrKB = computeKnownBits(V, DL);
  const auto NullVal = TM.getNullPointerValue(AS);

  assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
  assert((NullVal == 0 || NullVal == -1) &&
         "don't know how to check for this null value!");
  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
}
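
// For example (illustrative), a private-to-flat cast of an alloca:
//   %p = alloca i32, align 4, addrspace(5)
//   %f = addrspacecast ptr addrspace(5) %p to ptr
// can use llvm.amdgcn.addrspacecast.nonnull, because an alloca is never the
// null pointer of its address space.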
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  // Intrinsic doesn't support vectors, also it seems that it's often difficult
  // to prove that a vector cannot have any nulls in it so it's unclear if it's
  // worth supporting.
  if (I.getType()->isVectorTy())
    return false;

  // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
  // This is only worthwhile for casts from/to priv/local to flat.
  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
                DstAS == AMDGPUAS::PRIVATE_ADDRESS);
  else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
                SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
  if (!CanLower)
    return false;

  SmallVector<const Value *, 4> WorkList;
  getUnderlyingObjects(I.getOperand(0), WorkList);
  if (!all_of(WorkList, [&](const Value *V) {
        return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
      }))
    return false;

  IRBuilder<> B(&I);
  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  I.eraseFromParent();
  return true;
}

bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  case Intrinsic::minnum:
    return visitMinNum(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

/// Match non-nan fract pattern.
///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
///
/// If fract is a useful instruction for the subtarget. Does not account for the
/// nan handling; the instruction has a nan check on the input value.
Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
  if (ST->hasFractBug())
    return nullptr;

  if (I.getIntrinsicID() != Intrinsic::minnum)
    return nullptr;

  Type *Ty = I.getType();
  if (!isLegalFloatingTy(Ty->getScalarType()))
    return nullptr;

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  const APFloat *C;
  if (!match(Arg1, m_APFloat(C)))
    return nullptr;

  APFloat One(1.0);
  bool LosesInfo;
  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

  // Match nextafter(1.0, -1)
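  // (for f32 that is 0x1.fffffep-1, the largest float strictly below 1.0)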
  One.next(true);
  if (One != *C)
    return nullptr;

  Value *FloorSrc;
  if (match(Arg0, m_FSub(m_Value(FloorSrc),
                         m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
    return FloorSrc;
  return nullptr;
}
2172 Value
*AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder
<> &Builder
,
2174 SmallVector
<Value
*, 4> FractVals
;
2175 extractValues(Builder
, FractVals
, FractArg
);
2177 SmallVector
<Value
*, 4> ResultVals(FractVals
.size());
2179 Type
*Ty
= FractArg
->getType()->getScalarType();
2180 for (unsigned I
= 0, E
= FractVals
.size(); I
!= E
; ++I
) {
2182 Builder
.CreateIntrinsic(Intrinsic::amdgcn_fract
, {Ty
}, {FractVals
[I
]});
2185 return insertValues(Builder
, FractArg
->getType(), ResultVals
);

bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
  Value *FractArg = matchFractPat(I);
  if (!FractArg)
    return false;

  // Match pattern for fract intrinsic in contexts where the nan check has been
  // optimized out (and hope the knowledge the source can't be nan wasn't lost).
  if (!I.hasNoNaNs() &&
      !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(*DL, TLInfo)))
    return false;

  IRBuilder<> Builder(&I);
  FastMathFlags FMF = I.getFastMathFlags();
  FMF.setNoNaNs();
  Builder.setFastMathFlags(FMF);

  Value *Fract = applyFractPat(Builder, FractArg);
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);

  RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
  return true;
}

static bool isOneOrNegOne(const Value *Val) {
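  // getExactLog2Abs() == 0 holds exactly when |Val| is 1.0, i.e. Val is
  // 1.0 or -1.0.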
  const APFloat *C;
  return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
}

// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  Type *Ty = Sqrt.getType()->getScalarType();
  if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
  FastMathFlags SqrtFMF = FPOp->getFastMathFlags();

  // We're trying to handle the fast-but-not-that-fast case only. The lowering
  // of fast llvm.sqrt will give the raw instruction anyway.
  if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
    return false;

  const float ReqdAccuracy = FPOp->getFPAccuracy();

  // Defer correctly rounded expansion to codegen.
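  // A call without !fpmath metadata reports an accuracy of 0.0 and is left
  // alone here; e.g. a call tagged with !fpmath !{float 2.5} can be expanded
  // below.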
  if (ReqdAccuracy < 1.0f)
    return false;

  // FIXME: This is an ugly hack for this pass using forward iteration instead
  // of reverse. If it worked like a normal combiner, the rsq would form before
  // we saw a sqrt call.
  auto *FDiv =
      dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
      FDiv->getFPAccuracy() >= 1.0f &&
      canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
      // TODO: We should also handle the arcp case for the fdiv with non-1 value
      isOneOrNegOne(FDiv->getOperand(0)))
    return false;

  Value *SrcVal = Sqrt.getOperand(0);
  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  // The raw instruction is 1 ulp, but the correction for denormal handling
  // brings it to 2.
  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
    return false;

  IRBuilder<> Builder(&Sqrt);
  SmallVector<Value *, 4> SrcVals;
  extractValues(Builder, SrcVals, SrcVal);

  SmallVector<Value *, 4> ResultVals(SrcVals.size());
  for (int I = 0, E = SrcVals.size(); I != E; ++I) {
    if (CanTreatAsDAZ)
      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
    else
      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
  }

  Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
  NewSqrt->takeName(&Sqrt);
  Sqrt.replaceAllUsesWith(NewSqrt);
  Sqrt.eraseFromParent();
  return true;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Impl.Mod = &M;
  Impl.DL = &Impl.Mod->getDataLayout();
  Impl.SqrtF32 = nullptr;
  Impl.LdexpF32 = nullptr;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  Impl.TM = &TM;
  Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
  SIModeRegisterDefaults Mode(F, *Impl.ST);
  Impl.HasFP32DenormalFlush =
      Mode.FP32Denormals == DenormalMode::getPreserveSign();
  return Impl.run(F);
}

PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
                                                FunctionAnalysisManager &FAM) {
  AMDGPUCodeGenPrepareImpl Impl;
  Impl.Mod = F.getParent();
  Impl.DL = &Impl.Mod->getDataLayout();
  Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
  Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
  Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
  Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
  Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
  Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
  Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
  SIModeRegisterDefaults Mode(F, *Impl.ST);
  Impl.HasFP32DenormalFlush =
      Mode.FP32Denormals == DenormalMode::getPreserveSign();
  PreservedAnalyses PA = PreservedAnalyses::none();
  if (!Impl.FlowChanged)
    PA.preserveSet<CFGAnalyses>();
  return Impl.run(F) ? PA : PreservedAnalyses::all();
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}