[AMDGPU] prevent shrinking udiv/urem if either operand is in (SignedMax,UnsignedMax...
[llvm-project.git] llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
blob a6cef526499a66d8756fe35d42a6e2c4a5ecaccf
1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass does misc. AMDGPU optimizations on IR before instruction
11 /// selection.
13 //===----------------------------------------------------------------------===//
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIModeRegisterDefaults.h"
18 #include "llvm/Analysis/AssumptionCache.h"
19 #include "llvm/Analysis/ConstantFolding.h"
20 #include "llvm/Analysis/TargetLibraryInfo.h"
21 #include "llvm/Analysis/UniformityAnalysis.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/CodeGen/TargetPassConfig.h"
24 #include "llvm/IR/Dominators.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/IR/InstVisitor.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/PatternMatch.h"
29 #include "llvm/InitializePasses.h"
30 #include "llvm/Pass.h"
31 #include "llvm/Support/KnownBits.h"
32 #include "llvm/Transforms/Utils/IntegerDivision.h"
33 #include "llvm/Transforms/Utils/Local.h"
35 #define DEBUG_TYPE "amdgpu-codegenprepare"
37 using namespace llvm;
38 using namespace llvm::PatternMatch;
40 namespace {
42 static cl::opt<bool> WidenLoads(
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
45 cl::ReallyHidden,
46 cl::init(false));
48 static cl::opt<bool> Widen16BitOps(
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
51 cl::ReallyHidden,
52 cl::init(true));
54 static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
57 cl::ReallyHidden, cl::init(true));
59 static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
63 cl::ReallyHidden, cl::init(false));
65 static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68 cl::ReallyHidden, cl::init(32));
70 static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::ReallyHidden,
74 cl::init(true));
76 // Legalize 64-bit division by using the generic IR expansion.
77 static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::ReallyHidden,
81 cl::init(false));
83 // Leave all division operations as they are. This supersedes ExpandDiv64InIR
84 // and is used for testing the legalizer.
85 static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::ReallyHidden,
89 cl::init(false));
91 // Disable processing of fdiv so we can better test the backend implementations.
92 static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::ReallyHidden,
96 cl::init(false));
98 class AMDGPUCodeGenPrepareImpl
99 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
100 public:
101 const GCNSubtarget *ST = nullptr;
102 const AMDGPUTargetMachine *TM = nullptr;
103 const TargetLibraryInfo *TLInfo = nullptr;
104 AssumptionCache *AC = nullptr;
105 DominatorTree *DT = nullptr;
106 UniformityInfo *UA = nullptr;
107 Module *Mod = nullptr;
108 const DataLayout *DL = nullptr;
109 bool HasUnsafeFPMath = false;
110 bool HasFP32DenormalFlush = false;
111 bool FlowChanged = false;
112 mutable Function *SqrtF32 = nullptr;
113 mutable Function *LdexpF32 = nullptr;
115 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
117 Function *getSqrtF32() const {
118 if (SqrtF32)
119 return SqrtF32;
121 LLVMContext &Ctx = Mod->getContext();
122 SqrtF32 = Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_sqrt,
123 {Type::getFloatTy(Ctx)});
124 return SqrtF32;
127 Function *getLdexpF32() const {
128 if (LdexpF32)
129 return LdexpF32;
131 LLVMContext &Ctx = Mod->getContext();
132 LdexpF32 = Intrinsic::getOrInsertDeclaration(
133 Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
134 return LdexpF32;
137 bool canBreakPHINode(const PHINode &I);
139 /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
140 /// binary operation \p V.
142 /// \returns Binary operation \p V.
143 /// \returns \p T's base element bit width.
144 unsigned getBaseElementBitWidth(const Type *T) const;
146 /// \returns Equivalent 32 bit integer type for given type \p T. For example,
147 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
148 /// is returned.
149 Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
151 /// \returns True if binary operation \p I is a signed binary operation, false
152 /// otherwise.
153 bool isSigned(const BinaryOperator &I) const;
155 /// \returns True if the condition of 'select' operation \p I comes from a
156 /// signed 'icmp' operation, false otherwise.
157 bool isSigned(const SelectInst &I) const;
159 /// \returns True if type \p T needs to be promoted to 32 bit integer type,
160 /// false otherwise.
161 bool needsPromotionToI32(const Type *T) const;
163 /// Return true if \p T is a legal scalar floating point type.
164 bool isLegalFloatingTy(const Type *T) const;
166 /// Wrapper to pass all the arguments to computeKnownFPClass
167 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
168 const Instruction *CtxI) const {
169 return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
170 DT);
173 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
174 return HasFP32DenormalFlush ||
175 computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
178 /// Promotes uniform binary operation \p I to equivalent 32 bit binary
179 /// operation.
181 /// \details \p I's base element bit width must be greater than 1 and less
182 /// than or equal to 16. Promotion is done by sign or zero extending operands to
183 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
184 /// truncating the result of 32 bit binary operation back to \p I's original
185 /// type. Division operation is not promoted.
187 /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
188 /// false otherwise.
189 bool promoteUniformOpToI32(BinaryOperator &I) const;
191 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
193 /// \details \p I's base element bit width must be greater than 1 and less
194 /// than or equal to 16. Promotion is done by sign or zero extending operands to
195 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
197 /// \returns True.
198 bool promoteUniformOpToI32(ICmpInst &I) const;
200 /// Promotes uniform 'select' operation \p I to 32 bit 'select'
201 /// operation.
203 /// \details \p I's base element bit width must be greater than 1 and less
204 /// than or equal to 16. Promotion is done by sign or zero extending operands to
205 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
206 /// result of 32 bit 'select' operation back to \p I's original type.
208 /// \returns True.
209 bool promoteUniformOpToI32(SelectInst &I) const;
211 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
212 /// intrinsic.
214 /// \details \p I's base element bit width must be greater than 1 and less
215 /// than or equal to 16. Promotion is done by zero extending the operand to 32
216 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
217 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
218 /// shift amount is 32 minus \p I's base element bit width), and truncating
219 /// the result of the shift operation back to \p I's original type.
221 /// \returns True.
222 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
224 /// \returns The minimum number of bits needed to store the value of \p Op as an
225 /// unsigned integer. Truncating to this size and then zero-extending to
226 /// the original will not change the value.
227 unsigned numBitsUnsigned(Value *Op) const;
229 /// \returns The minimum number of bits needed to store the value of \p Op as a
230 /// signed integer. Truncating to this size and then sign-extending to
231 /// the original size will not change the value.
232 unsigned numBitsSigned(Value *Op) const;
234 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
235 /// SelectionDAG has an issue where an and asserting the bits are known
236 bool replaceMulWithMul24(BinaryOperator &I) const;
238 /// Perform the same function as the equivalently named function in DAGCombiner. Since
239 /// we expand some divisions here, we need to perform this before obscuring.
240 bool foldBinOpIntoSelect(BinaryOperator &I) const;
242 bool divHasSpecialOptimization(BinaryOperator &I,
243 Value *Num, Value *Den) const;
244 int getDivNumBits(BinaryOperator &I,
245 Value *Num, Value *Den,
246 unsigned AtLeast, bool Signed) const;
248 /// Expands 24 bit div or rem.
249 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
250 Value *Num, Value *Den,
251 bool IsDiv, bool IsSigned) const;
253 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
254 Value *Num, Value *Den, unsigned NumBits,
255 bool IsDiv, bool IsSigned) const;
257 /// Expands 32 bit div or rem.
258 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
259 Value *Num, Value *Den) const;
261 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
262 Value *Num, Value *Den) const;
263 void expandDivRem64(BinaryOperator &I) const;
265 /// Widen a scalar load.
267 /// \details Widen a scalar load of a uniform, small type from constant
268 /// memory to a full 32 bits, then truncate the loaded value, to allow a
269 /// scalar load instead of a vector load.
271 /// \returns True.
273 bool canWidenScalarExtLoad(LoadInst &I) const;
275 Value *matchFractPat(IntrinsicInst &I);
276 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
278 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
279 FastMathFlags SqrtFMF) const;
281 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
282 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
283 const Instruction *CtxI) const;
285 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
286 FastMathFlags FMF, const Instruction *CtxI) const;
287 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
288 float ReqdAccuracy) const;
290 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
291 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
292 Value *RsqOp, const Instruction *FDiv,
293 float ReqdAccuracy) const;
295 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
296 Value *Src) const;
298 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
299 bool IsNegative) const;
300 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
301 FastMathFlags FMF) const;
302 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
303 FastMathFlags FMF) const;
305 public:
306 bool visitFDiv(BinaryOperator &I);
308 bool visitInstruction(Instruction &I) { return false; }
309 bool visitBinaryOperator(BinaryOperator &I);
310 bool visitLoadInst(LoadInst &I);
311 bool visitICmpInst(ICmpInst &I);
312 bool visitSelectInst(SelectInst &I);
313 bool visitPHINode(PHINode &I);
314 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
316 bool visitIntrinsicInst(IntrinsicInst &I);
317 bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
318 bool visitMinNum(IntrinsicInst &I);
319 bool visitSqrt(IntrinsicInst &I);
320 bool run(Function &F);
323 class AMDGPUCodeGenPrepare : public FunctionPass {
324 private:
325 AMDGPUCodeGenPrepareImpl Impl;
327 public:
328 static char ID;
329 AMDGPUCodeGenPrepare() : FunctionPass(ID) {
330 initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
332 void getAnalysisUsage(AnalysisUsage &AU) const override {
333 AU.addRequired<AssumptionCacheTracker>();
334 AU.addRequired<UniformityInfoWrapperPass>();
335 AU.addRequired<TargetLibraryInfoWrapperPass>();
337 // FIXME: Division expansion needs to preserve the dominator tree.
338 if (!ExpandDiv64InIR)
339 AU.setPreservesAll();
341 bool runOnFunction(Function &F) override;
342 bool doInitialization(Module &M) override;
343 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
346 } // end anonymous namespace
348 bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
349 BreakPhiNodesCache.clear();
350 bool MadeChange = false;
352 Function::iterator NextBB;
353 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
354 BasicBlock *BB = &*FI;
355 NextBB = std::next(FI);
357 BasicBlock::iterator Next;
358 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
359 I = Next) {
360 Next = std::next(I);
362 MadeChange |= visit(*I);
364 if (Next != E) { // Control flow changed
365 BasicBlock *NextInstBB = Next->getParent();
366 if (NextInstBB != BB) {
367 BB = NextInstBB;
368 E = BB->end();
369 FE = F.end();
374 return MadeChange;
377 unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
378 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
380 if (T->isIntegerTy())
381 return T->getIntegerBitWidth();
382 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
385 Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
386 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
388 if (T->isIntegerTy())
389 return B.getInt32Ty();
390 return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
393 bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
394 return I.getOpcode() == Instruction::AShr ||
395 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
398 bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
399 return isa<ICmpInst>(I.getOperand(0)) ?
400 cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
403 bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
404 if (!Widen16BitOps)
405 return false;
407 const IntegerType *IntTy = dyn_cast<IntegerType>(T);
408 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
409 return true;
411 if (const VectorType *VT = dyn_cast<VectorType>(T)) {
412 // TODO: The set of packed operations is more limited, so may want to
413 // promote some anyway.
414 if (ST->hasVOP3PInsts())
415 return false;
417 return needsPromotionToI32(VT->getElementType());
420 return false;
423 bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
424 return Ty->isFloatTy() || Ty->isDoubleTy() ||
425 (Ty->isHalfTy() && ST->has16BitInsts());
428 // Return true if the op promoted to i32 should have nsw set.
429 static bool promotedOpIsNSW(const Instruction &I) {
430 switch (I.getOpcode()) {
431 case Instruction::Shl:
432 case Instruction::Add:
433 case Instruction::Sub:
434 return true;
435 case Instruction::Mul:
436 return I.hasNoUnsignedWrap();
437 default:
438 return false;
442 // Return true if the op promoted to i32 should have nuw set.
443 static bool promotedOpIsNUW(const Instruction &I) {
444 switch (I.getOpcode()) {
445 case Instruction::Shl:
446 case Instruction::Add:
447 case Instruction::Mul:
448 return true;
449 case Instruction::Sub:
450 return I.hasNoUnsignedWrap();
451 default:
452 return false;
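// For illustration: the ops promoted here see zero-extended 16-bit (or
// narrower) operands, so both inputs are below 2^16. An i32 add of such values
// stays below 2^17 and cannot wrap in either sense, while 1 - 2 still wraps
// unsigned (hence sub only keeps nuw if the original had it), and
// 0xFFFF * 0xFFFF = 0xFFFE0001 exceeds INT32_MAX (hence mul only keeps nsw
// when the original was nuw, since the true product then fits in 16 bits).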
456 bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
457 Type *Ty = I.getType();
458 const DataLayout &DL = Mod->getDataLayout();
459 int TySize = DL.getTypeSizeInBits(Ty);
460 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
462 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
465 bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
466 assert(needsPromotionToI32(I.getType()) &&
467 "I does not need promotion to i32");
469 if (I.getOpcode() == Instruction::SDiv ||
470 I.getOpcode() == Instruction::UDiv ||
471 I.getOpcode() == Instruction::SRem ||
472 I.getOpcode() == Instruction::URem)
473 return false;
475 IRBuilder<> Builder(&I);
476 Builder.SetCurrentDebugLocation(I.getDebugLoc());
478 Type *I32Ty = getI32Ty(Builder, I.getType());
479 Value *ExtOp0 = nullptr;
480 Value *ExtOp1 = nullptr;
481 Value *ExtRes = nullptr;
482 Value *TruncRes = nullptr;
484 if (isSigned(I)) {
485 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
486 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
487 } else {
488 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
489 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
492 ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
493 if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
494 if (promotedOpIsNSW(cast<Instruction>(I)))
495 Inst->setHasNoSignedWrap();
497 if (promotedOpIsNUW(cast<Instruction>(I)))
498 Inst->setHasNoUnsignedWrap();
500 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
501 Inst->setIsExact(ExactOp->isExact());
504 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
506 I.replaceAllUsesWith(TruncRes);
507 I.eraseFromParent();
509 return true;
512 bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
513 assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
514 "I does not need promotion to i32");
516 IRBuilder<> Builder(&I);
517 Builder.SetCurrentDebugLocation(I.getDebugLoc());
519 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
520 Value *ExtOp0 = nullptr;
521 Value *ExtOp1 = nullptr;
522 Value *NewICmp = nullptr;
524 if (I.isSigned()) {
525 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
526 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
527 } else {
528 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
529 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
531 NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
533 I.replaceAllUsesWith(NewICmp);
534 I.eraseFromParent();
536 return true;
539 bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
540 assert(needsPromotionToI32(I.getType()) &&
541 "I does not need promotion to i32");
543 IRBuilder<> Builder(&I);
544 Builder.SetCurrentDebugLocation(I.getDebugLoc());
546 Type *I32Ty = getI32Ty(Builder, I.getType());
547 Value *ExtOp1 = nullptr;
548 Value *ExtOp2 = nullptr;
549 Value *ExtRes = nullptr;
550 Value *TruncRes = nullptr;
552 if (isSigned(I)) {
553 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
554 ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
555 } else {
556 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
557 ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
559 ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
560 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
562 I.replaceAllUsesWith(TruncRes);
563 I.eraseFromParent();
565 return true;
568 bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
569 IntrinsicInst &I) const {
570 assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
571 "I must be bitreverse intrinsic");
572 assert(needsPromotionToI32(I.getType()) &&
573 "I does not need promotion to i32");
575 IRBuilder<> Builder(&I);
576 Builder.SetCurrentDebugLocation(I.getDebugLoc());
578 Type *I32Ty = getI32Ty(Builder, I.getType());
579 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
580 Value *ExtRes =
581 Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
582 Value *LShrOp =
583 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
584 Value *TruncRes =
585 Builder.CreateTrunc(LShrOp, I.getType());
587 I.replaceAllUsesWith(TruncRes);
588 I.eraseFromParent();
590 return true;
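// Worked example of the promotion above: for i16 input 0x0001, the zext gives
// i32 0x00000001, the i32 bitreverse yields 0x80000000, the lshr by
// 32 - 16 = 16 gives 0x00008000, and the trunc back to i16 produces 0x8000,
// matching a native i16 bitreverse.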
593 unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
594 return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
597 unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
598 return ComputeMaxSignificantBits(Op, *DL, 0, AC);
601 static void extractValues(IRBuilder<> &Builder,
602 SmallVectorImpl<Value *> &Values, Value *V) {
603 auto *VT = dyn_cast<FixedVectorType>(V->getType());
604 if (!VT) {
605 Values.push_back(V);
606 return;
609 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
610 Values.push_back(Builder.CreateExtractElement(V, I));
613 static Value *insertValues(IRBuilder<> &Builder,
614 Type *Ty,
615 SmallVectorImpl<Value *> &Values) {
616 if (!Ty->isVectorTy()) {
617 assert(Values.size() == 1);
618 return Values[0];
621 Value *NewVal = PoisonValue::get(Ty);
622 for (int I = 0, E = Values.size(); I != E; ++I)
623 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
625 return NewVal;
628 bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
629 if (I.getOpcode() != Instruction::Mul)
630 return false;
632 Type *Ty = I.getType();
633 unsigned Size = Ty->getScalarSizeInBits();
634 if (Size <= 16 && ST->has16BitInsts())
635 return false;
637 // Prefer scalar if this could be s_mul_i32
638 if (UA->isUniform(&I))
639 return false;
641 Value *LHS = I.getOperand(0);
642 Value *RHS = I.getOperand(1);
643 IRBuilder<> Builder(&I);
644 Builder.SetCurrentDebugLocation(I.getDebugLoc());
646 unsigned LHSBits = 0, RHSBits = 0;
647 bool IsSigned = false;
649 if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
650 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
651 IsSigned = false;
653 } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
654 (RHSBits = numBitsSigned(RHS)) <= 24) {
655 IsSigned = true;
657 } else
658 return false;
660 SmallVector<Value *, 4> LHSVals;
661 SmallVector<Value *, 4> RHSVals;
662 SmallVector<Value *, 4> ResultVals;
663 extractValues(Builder, LHSVals, LHS);
664 extractValues(Builder, RHSVals, RHS);
666 IntegerType *I32Ty = Builder.getInt32Ty();
667 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
668 Type *DstTy = LHSVals[0]->getType();
670 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
671 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
672 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
673 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
674 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
675 Intrinsic::ID ID =
676 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
677 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
678 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
679 : Builder.CreateZExtOrTrunc(Result, DstTy);
680 ResultVals.push_back(Result);
683 Value *NewVal = insertValues(Builder, Ty, ResultVals);
684 NewVal->takeName(&I);
685 I.replaceAllUsesWith(NewVal);
686 I.eraseFromParent();
688 return true;
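// Sketch of the rewrite performed above for a divergent i32 multiply whose
// operands are both known to fit in 24 unsigned bits:
//
//   %m = mul i32 %a, %b
//     -->
//   %m = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
//
// 64-bit multiplies of 24-bit inputs use the same intrinsics with an i64
// result type (IntrinTy), and vector multiplies are scalarized first.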
691 // Find a select instruction, which may have been casted. This is mostly to deal
692 // with cases where i16 selects were promoted here to i32.
693 static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
694 Cast = nullptr;
695 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
696 return Sel;
698 if ((Cast = dyn_cast<CastInst>(V))) {
699 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
700 return Sel;
703 return nullptr;
706 bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
707 // Don't do this unless the old select is going away. We want to eliminate the
708 // binary operator, not replace a binop with a select.
709 int SelOpNo = 0;
711 CastInst *CastOp;
713 // TODO: Should probably try to handle some cases with multiple
714 // users. Duplicating the select may be profitable for division.
715 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
716 if (!Sel || !Sel->hasOneUse()) {
717 SelOpNo = 1;
718 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
721 if (!Sel || !Sel->hasOneUse())
722 return false;
724 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
725 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
726 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
727 if (!CBO || !CT || !CF)
728 return false;
730 if (CastOp) {
731 if (!CastOp->hasOneUse())
732 return false;
733 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
734 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
737 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
738 // need to handle divisions here.
739 Constant *FoldedT = SelOpNo ?
740 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
741 ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
742 if (!FoldedT || isa<ConstantExpr>(FoldedT))
743 return false;
745 Constant *FoldedF = SelOpNo ?
746 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
747 ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
748 if (!FoldedF || isa<ConstantExpr>(FoldedF))
749 return false;
751 IRBuilder<> Builder(&BO);
752 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
753 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
754 Builder.setFastMathFlags(FPOp->getFastMathFlags());
756 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
757 FoldedT, FoldedF);
758 NewSelect->takeName(&BO);
759 BO.replaceAllUsesWith(NewSelect);
760 BO.eraseFromParent();
761 if (CastOp)
762 CastOp->eraseFromParent();
763 Sel->eraseFromParent();
764 return true;
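// Example of the fold above with a single-use select feeding a division:
//
//   %s = select i1 %c, i32 16, i32 4
//   %d = udiv i32 128, %s
//     -->
//   %d = select i1 %c, i32 8, i32 32
//
// Both arms fold with the constant operand, so no divide remains to be
// expanded later in this pass.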
767 std::pair<Value *, Value *>
768 AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
769 Value *Src) const {
770 Type *Ty = Src->getType();
771 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
772 {Ty, Builder.getInt32Ty()}, Src);
773 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
775 // Bypass the bug workaround for the exponent result since it doesn't matter.
776 // TODO: Does the bug workaround even really need to consider the exponent
777 // result? It's unspecified by the spec.
779 Value *FrexpExp =
780 ST->hasFractBug()
781 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
782 {Builder.getInt32Ty(), Ty}, Src)
783 : Builder.CreateExtractValue(Frexp, {1});
784 return {FrexpMant, FrexpExp};
787 /// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
788 Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
789 Value *Src,
790 bool IsNegative) const {
791 // Same as for 1.0, but expand the sign out of the constant.
792 // -1.0 / x -> rcp (fneg x)
793 if (IsNegative)
794 Src = Builder.CreateFNeg(Src);
796 // The rcp instruction doesn't support denormals, so scale the input
797 // out of the denormal range and convert at the end.
799 // Expand as 2^-n * (1.0 / (x * 2^n))
801 // TODO: Skip scaling if input is known never denormal and the input
802 // range won't underflow to denormal. The hard part is knowing the
803 // result. We need a range check, the result could be denormal for
804 // 0x1p+126 < den <= 0x1p+127.
805 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
806 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
807 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
808 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
811 /// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
812 Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
813 Value *RHS,
814 FastMathFlags FMF) const {
815 // If we have to work around the fract/frexp bug, we're worse off than
816 // using the fdiv.fast expansion. The full safe expansion is faster if we have
817 // fast FMA.
818 if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
819 (!FMF.noNaNs() || !FMF.noInfs()))
820 return nullptr;
822 // We're scaling the LHS to avoid a denormal input, and scaling the denominator
823 // to avoid large values underflowing the result.
824 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
826 Value *Rcp =
827 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
829 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
830 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
832 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
833 // result.
834 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
835 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
838 /// Emit a sqrt that handles denormals and is accurate to 2ulp.
839 Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
840 Value *Src,
841 FastMathFlags FMF) const {
842 Type *Ty = Src->getType();
843 APFloat SmallestNormal =
844 APFloat::getSmallestNormalized(Ty->getFltSemantics());
845 Value *NeedScale =
846 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
848 ConstantInt *Zero = Builder.getInt32(0);
849 Value *InputScaleFactor =
850 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
852 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
854 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
856 Value *OutputScaleFactor =
857 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
858 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
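// The asymmetric scale factors above follow from sqrt halving the exponent:
// sqrt(x * 2^32) == sqrt(x) * 2^16, so scaling the input up by 2^32 requires
// scaling the result back down with ldexp(..., -16).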
861 /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
862 static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
863 bool IsNegative) {
864 // bool need_scale = x < 0x1p-126f;
865 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
866 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
867 // rsq(x * input_scale) * output_scale;
869 Type *Ty = Src->getType();
870 APFloat SmallestNormal =
871 APFloat::getSmallestNormalized(Ty->getFltSemantics());
872 Value *NeedScale =
873 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
874 Constant *One = ConstantFP::get(Ty, 1.0);
875 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
876 Constant *OutputScale =
877 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
879 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
881 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
882 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
883 Value *OutputScaleFactor = Builder.CreateSelect(
884 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
886 return Builder.CreateFMul(Rsq, OutputScaleFactor);
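// The constants above work the same way for rsq: rsq(x * 2^24) == rsq(x) *
// 2^-12, so the result is multiplied by 2^12 (negated to fold in the sign for
// the -1.0 numerator case) to undo the input scaling.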
889 bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
890 FastMathFlags DivFMF,
891 FastMathFlags SqrtFMF) const {
892 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
893 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
894 return false;
896 // v_rsq_f32 gives 1ulp
897 return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
898 SqrtOp->getFPAccuracy() >= 1.0f;
901 Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
902 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
903 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
904 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
905 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
907 // rsq_f16 is accurate to 0.51 ulp.
908 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
909 // rsq_f64 is never accurate.
910 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
911 if (!CLHS)
912 return nullptr;
914 assert(Den->getType()->isFloatTy());
916 bool IsNegative = false;
918 // TODO: Handle other numerator values with arcp.
919 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
920 // Add in the sqrt flags.
921 IRBuilder<>::FastMathFlagGuard Guard(Builder);
922 Builder.setFastMathFlags(DivFMF | SqrtFMF);
924 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
925 canIgnoreDenormalInput(Den, CtxI)) {
926 Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
927 // -1.0 / sqrt(x) -> fneg(rsq(x))
928 return IsNegative ? Builder.CreateFNeg(Result) : Result;
931 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
934 return nullptr;
937 // Optimize fdiv with rcp:
939 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
940 // allowed with unsafe-fp-math or afn.
942 // a/b -> a*rcp(b) when arcp is allowed, and we only need to provide 1.0 ULP
943 Value *
944 AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
945 Value *Den, FastMathFlags FMF,
946 const Instruction *CtxI) const {
947 // rcp_f16 is accurate to 0.51 ulp.
948 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
949 // rcp_f64 is never accurate.
950 assert(Den->getType()->isFloatTy());
952 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
953 bool IsNegative = false;
954 if (CLHS->isExactlyValue(1.0) ||
955 (IsNegative = CLHS->isExactlyValue(-1.0))) {
956 Value *Src = Den;
958 if (HasFP32DenormalFlush || FMF.approxFunc()) {
959 // -1.0 / x -> 1.0 / fneg(x)
960 if (IsNegative)
961 Src = Builder.CreateFNeg(Src);
963 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
964 // the CI documentation have a worst case error of 1 ulp.
965 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
966 // to use it as long as we aren't trying to use denormals.
968 // v_rcp_f16 and v_rsq_f16 DO support denormals.
970 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
971 // insert rsq intrinsic here.
973 // 1.0 / x -> rcp(x)
974 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
977 // TODO: If the input isn't denormal, and we know the input exponent isn't
978 // big enough to introduce a denormal we can avoid the scaling.
979 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
983 if (FMF.allowReciprocal()) {
984 // x / y -> x * (1.0 / y)
986 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
987 // will never underflow.
988 if (HasFP32DenormalFlush || FMF.approxFunc()) {
989 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
990 return Builder.CreateFMul(Num, Recip);
993 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
994 return Builder.CreateFMul(Num, Recip);
997 return nullptr;
1000 // optimize with fdiv.fast:
1002 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1004 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1006 // NOTE: optimizeWithRcp should be tried first because rcp is the preference.
1007 Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
1008 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1009 // fdiv.fast can achieve 2.5 ULP accuracy.
1010 if (ReqdAccuracy < 2.5f)
1011 return nullptr;
1013 // Only have fdiv.fast for f32.
1014 assert(Den->getType()->isFloatTy());
1016 bool NumIsOne = false;
1017 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
1018 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
1019 NumIsOne = true;
1022 // fdiv.fast does not support denormals, but 1.0/x is always fine to use.
1024 // TODO: This works for any value with a specific known exponent range, don't
1025 // just limit to constant 1.
1026 if (!HasFP32DenormalFlush && !NumIsOne)
1027 return nullptr;
1029 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
1032 Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
1033 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
1034 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
1035 float ReqdDivAccuracy) const {
1036 if (RsqOp) {
1037 Value *Rsq =
1038 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
1039 if (Rsq)
1040 return Rsq;
1043 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
1044 if (Rcp)
1045 return Rcp;
1047 // In the basic case fdiv_fast has the same instruction count as the frexp div
1048 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
1049 // potentially be fused into a user. Also, materialization of the constants
1050 // can be reused for multiple instances.
1051 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
1052 if (FDivFast)
1053 return FDivFast;
1055 return emitFrexpDiv(Builder, Num, Den, DivFMF);
1058 // Optimization is performed based on fpmath, fast math flags, as well as
1059 // denormals, to optimize fdiv with either rcp or fdiv.fast.
1061 // With rcp:
1062 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1063 // allowed with unsafe-fp-math or afn.
1065 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1067 // With fdiv.fast:
1068 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1070 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1072 // NOTE: rcp is the preference in cases that both are legal.
1073 bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
1074 if (DisableFDivExpand)
1075 return false;
1077 Type *Ty = FDiv.getType()->getScalarType();
1078 if (!Ty->isFloatTy())
1079 return false;
1081 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
1082 // expansion around them in codegen. f16 is good enough to always use.
1084 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
1085 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
1086 const float ReqdAccuracy = FPOp->getFPAccuracy();
1088 FastMathFlags SqrtFMF;
1090 Value *Num = FDiv.getOperand(0);
1091 Value *Den = FDiv.getOperand(1);
1093 Value *RsqOp = nullptr;
1094 auto *DenII = dyn_cast<IntrinsicInst>(Den);
1095 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
1096 DenII->hasOneUse()) {
1097 const auto *SqrtOp = cast<FPMathOperator>(DenII);
1098 SqrtFMF = SqrtOp->getFastMathFlags();
1099 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
1100 RsqOp = SqrtOp->getOperand(0);
1103 // Inaccurate rcp is allowed with unsafe-fp-math or afn.
1105 // Defer to codegen to handle this.
1107 // TODO: Decide on an interpretation for interactions between afn + arcp +
1108 // !fpmath, and make it consistent between here and codegen. For now, defer
1109 // expansion of afn to codegen. The current interpretation is so aggressive we
1110 // don't need any pre-consideration here when we have better information. A
1111 // more conservative interpretation could use handling here.
1112 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
1113 if (!RsqOp && AllowInaccurateRcp)
1114 return false;
1116 // Defer the correct implementations to codegen.
1117 if (ReqdAccuracy < 1.0f)
1118 return false;
1120 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
1121 Builder.setFastMathFlags(DivFMF);
1122 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
1124 SmallVector<Value *, 4> NumVals;
1125 SmallVector<Value *, 4> DenVals;
1126 SmallVector<Value *, 4> RsqDenVals;
1127 extractValues(Builder, NumVals, Num);
1128 extractValues(Builder, DenVals, Den);
1130 if (RsqOp)
1131 extractValues(Builder, RsqDenVals, RsqOp);
1133 SmallVector<Value *, 4> ResultVals(NumVals.size());
1134 for (int I = 0, E = NumVals.size(); I != E; ++I) {
1135 Value *NumElt = NumVals[I];
1136 Value *DenElt = DenVals[I];
1137 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
1139 Value *NewElt =
1140 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
1141 cast<Instruction>(FPOp), ReqdAccuracy);
1142 if (!NewElt) {
1143 // Keep the original, but scalarized.
1145 // This has the unfortunate side effect of sometimes scalarizing when
1146 // we're not going to do anything.
1147 NewElt = Builder.CreateFDiv(NumElt, DenElt);
1148 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
1149 NewEltInst->copyMetadata(FDiv);
1152 ResultVals[I] = NewElt;
1155 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
1157 if (NewVal) {
1158 FDiv.replaceAllUsesWith(NewVal);
1159 NewVal->takeName(&FDiv);
1160 RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);
1163 return true;
1166 static bool hasUnsafeFPMath(const Function &F) {
1167 Attribute Attr = F.getFnAttribute("unsafe-fp-math");
1168 return Attr.getValueAsBool();
1171 static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1172 Value *LHS, Value *RHS) {
1173 Type *I32Ty = Builder.getInt32Ty();
1174 Type *I64Ty = Builder.getInt64Ty();
1176 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1177 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1178 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1179 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1180 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1181 Hi = Builder.CreateTrunc(Hi, I32Ty);
1182 return std::pair(Lo, Hi);
1185 static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1186 return getMul64(Builder, LHS, RHS).second;
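// For example, getMul64(0xFFFFFFFF, 0xFFFFFFFF) forms the full 64-bit product
// 0xFFFFFFFE00000001 and returns {0x00000001, 0xFFFFFFFE}; getMulHu returns
// only the high half, which is the umulh used by the division expansion below.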
1189 /// Figure out how many bits are really needed for this division. \p AtLeast is
1190 /// an optimization hint to bypass the second ComputeNumSignBits call if the
1191 /// first one is insufficient. Returns -1 on failure.
1192 int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1193 Value *Den, unsigned AtLeast,
1194 bool IsSigned) const {
1195 const DataLayout &DL = Mod->getDataLayout();
1196 if (IsSigned) {
1197 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
1198 if (LHSSignBits < AtLeast)
1199 return -1;
1201 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
1202 if (RHSSignBits < AtLeast)
1203 return -1;
1205 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1206 unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
1207 return DivBits + 1;
1208 } else {
1209 KnownBits Known = computeKnownBits(Num, DL, 0, AC, &I);
1210 // We know all bits are used for the division if Num or Den could be in the
1211 // range (SignedMax, UnsignedMax], i.e. if the sign bit is not known to be clear.
1212 if (Known.isNegative() || !Known.isNonNegative())
1213 return -1;
1214 unsigned LHSSignBits = Known.countMinLeadingZeros();
1216 Known = computeKnownBits(Den, DL, 0, AC, &I);
1217 if (Known.isNegative() || !Known.isNonNegative())
1218 return -1;
1219 unsigned RHSSignBits = Known.countMinLeadingZeros();
1221 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1222 unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
1223 return DivBits;
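// Example of the unsigned bail-out above (the case this change guards
// against): for
//
//   %d = udiv i64 %x, %y
//
// where %y is not known to be <= INT64_MAX, i.e. it may lie in
// (SignedMax, UnsignedMax], the sign bit is not known to be clear, so -1 is
// returned and shrinkDivRem64 leaves the 64-bit division alone.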
1227 // The fractional part of a float is enough to accurately represent up to
1228 // a 24-bit signed integer.
1229 Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1230 BinaryOperator &I, Value *Num,
1231 Value *Den, bool IsDiv,
1232 bool IsSigned) const {
1233 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1234 // If Num bits <= 24, assume 0 signbits.
1235 unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
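// For example, a signed i32 division requires at least 32 - 24 + 1 = 9 known
// sign bits in both operands, i.e. values representable in 24 signed bits,
// before the float-based expansion is attempted.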
1236 int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
1237 if (DivBits == -1)
1238 return nullptr;
1239 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1242 Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1243 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1244 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1245 Type *I32Ty = Builder.getInt32Ty();
1246 Num = Builder.CreateTrunc(Num, I32Ty);
1247 Den = Builder.CreateTrunc(Den, I32Ty);
1249 Type *F32Ty = Builder.getFloatTy();
1250 ConstantInt *One = Builder.getInt32(1);
1251 Value *JQ = One;
1253 if (IsSigned) {
1254 // char|short jq = ia ^ ib;
1255 JQ = Builder.CreateXor(Num, Den);
1257 // jq = jq >> (bitsize - 2)
1258 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1260 // jq = jq | 0x1
1261 JQ = Builder.CreateOr(JQ, One);
1264 // int ia = (int)LHS;
1265 Value *IA = Num;
1267 // int ib = (int)RHS;
1268 Value *IB = Den;
1270 // float fa = (float)ia;
1271 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1272 : Builder.CreateUIToFP(IA, F32Ty);
1274 // float fb = (float)ib;
1275 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1276 : Builder.CreateUIToFP(IB,F32Ty);
1278 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
1279 Builder.getFloatTy(), {FB});
1280 Value *FQM = Builder.CreateFMul(FA, RCP);
1282 // fq = trunc(fqm);
1283 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1284 FQ->copyFastMathFlags(Builder.getFastMathFlags());
1286 // float fqneg = -fq;
1287 Value *FQNeg = Builder.CreateFNeg(FQ);
1289 // float fr = mad(fqneg, fb, fa);
1290 auto FMAD = !ST->hasMadMacF32Insts()
1291 ? Intrinsic::fma
1292 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1293 Value *FR = Builder.CreateIntrinsic(FMAD,
1294 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1296 // int iq = (int)fq;
1297 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1298 : Builder.CreateFPToUI(FQ, I32Ty);
1300 // fr = fabs(fr);
1301 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
1303 // fb = fabs(fb);
1304 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
1306 // int cv = fr >= fb;
1307 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1309 // jq = (cv ? jq : 0);
1310 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1312 // dst = iq + jq;
1313 Value *Div = Builder.CreateAdd(IQ, JQ);
1315 Value *Res = Div;
1316 if (!IsDiv) {
1317 // Rem needs compensation; it's easier to recompute it
1318 Value *Rem = Builder.CreateMul(Div, Den);
1319 Res = Builder.CreateSub(Num, Rem);
1322 if (DivBits != 0 && DivBits < 32) {
1323 // Extend in register from the number of bits this divide really is.
1324 if (IsSigned) {
1325 int InRegBits = 32 - DivBits;
1327 Res = Builder.CreateShl(Res, InRegBits);
1328 Res = Builder.CreateAShr(Res, InRegBits);
1329 } else {
1330 ConstantInt *TruncMask
1331 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1332 Res = Builder.CreateAnd(Res, TruncMask);
1336 return Res;
1339 // Try to recognize special cases for which the DAG will emit better
1340 // expansions than the general expansion we do here.
1342 // TODO: It would be better to just directly handle those optimizations here.
1343 bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1344 Value *Num,
1345 Value *Den) const {
1346 if (Constant *C = dyn_cast<Constant>(Den)) {
1347 // Arbitrary constants get a better expansion as long as a wider mulhi is
1348 // legal.
1349 if (C->getType()->getScalarSizeInBits() <= 32)
1350 return true;
1352 // TODO: Sdiv check for not exact for some reason.
1354 // If there's no wider mulhi, there's only a better expansion for powers of
1355 // two.
1356 // TODO: Should really know for each vector element.
1357 if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
1358 return true;
1360 return false;
1363 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1364 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1365 if (BinOpDen->getOpcode() == Instruction::Shl &&
1366 isa<Constant>(BinOpDen->getOperand(0)) &&
1367 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
1368 0, AC, &I, DT)) {
1369 return true;
1373 return false;
1376 static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1377 // Check whether the sign can be determined statically.
1378 KnownBits Known = computeKnownBits(V, *DL);
1379 if (Known.isNegative())
1380 return Constant::getAllOnesValue(V->getType());
1381 if (Known.isNonNegative())
1382 return Constant::getNullValue(V->getType());
1383 return Builder.CreateAShr(V, Builder.getInt32(31));
1386 Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1387 BinaryOperator &I, Value *X,
1388 Value *Y) const {
1389 Instruction::BinaryOps Opc = I.getOpcode();
1390 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1391 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1393 FastMathFlags FMF;
1394 FMF.setFast();
1395 Builder.setFastMathFlags(FMF);
1397 if (divHasSpecialOptimization(I, X, Y))
1398 return nullptr; // Keep it for later optimization.
1400 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1401 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1403 Type *Ty = X->getType();
1404 Type *I32Ty = Builder.getInt32Ty();
1405 Type *F32Ty = Builder.getFloatTy();
1407 if (Ty->getScalarSizeInBits() != 32) {
1408 if (IsSigned) {
1409 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1410 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1411 } else {
1412 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1413 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1417 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1418 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1419 Builder.CreateZExtOrTrunc(Res, Ty);
1422 ConstantInt *Zero = Builder.getInt32(0);
1423 ConstantInt *One = Builder.getInt32(1);
1425 Value *Sign = nullptr;
1426 if (IsSigned) {
1427 Value *SignX = getSign32(X, Builder, DL);
1428 Value *SignY = getSign32(Y, Builder, DL);
1429 // Remainder sign is the same as LHS
1430 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1432 X = Builder.CreateAdd(X, SignX);
1433 Y = Builder.CreateAdd(Y, SignY);
1435 X = Builder.CreateXor(X, SignX);
1436 Y = Builder.CreateXor(Y, SignY);
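// The add/xor pairs above compute |X| and |Y| branchlessly: with SignX being
// 0 or -1, (X + SignX) ^ SignX negates X exactly when SignX is -1. For
// example, X = -5 gives SignX = -1, X + SignX = -6, and -6 ^ -1 = 5.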
1439 // The algorithm here is based on ideas from "Software Integer Division", Tom
1440 // Rodeheffer, August 2008.
1442 // unsigned udiv(unsigned x, unsigned y) {
1443 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1444 // // that this is a lower bound on inv(y), even if some of the calculations
1445 // // round up.
1446 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1448 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1449 // // Empirically this is guaranteed to give a "two-y" lower bound on
1450 // // inv(y).
1451 // z += umulh(z, -y * z);
1453 // // Quotient/remainder estimate.
1454 // unsigned q = umulh(x, z);
1455 // unsigned r = x - q * y;
1457 // // Two rounds of quotient/remainder refinement.
1458 // if (r >= y) {
1459 // ++q;
1460 // r -= y;
1461 // }
1462 // if (r >= y) {
1463 // ++q;
1464 // r -= y;
1465 // }
1467 // return q;
1468 // }
1470 // Initial estimate of inv(y).
1471 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1472 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1473 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1474 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1475 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1477 // One round of UNR.
1478 Value *NegY = Builder.CreateSub(Zero, Y);
1479 Value *NegYZ = Builder.CreateMul(NegY, Z);
1480 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1482 // Quotient/remainder estimate.
1483 Value *Q = getMulHu(Builder, X, Z);
1484 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1486 // First quotient/remainder refinement.
1487 Value *Cond = Builder.CreateICmpUGE(R, Y);
1488 if (IsDiv)
1489 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1490 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1492 // Second quotient/remainder refinement.
1493 Cond = Builder.CreateICmpUGE(R, Y);
1494 Value *Res;
1495 if (IsDiv)
1496 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1497 else
1498 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1500 if (IsSigned) {
1501 Res = Builder.CreateXor(Res, Sign);
1502 Res = Builder.CreateSub(Res, Sign);
1503 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1504 } else {
1505 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1507 return Res;
1510 Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1511 BinaryOperator &I, Value *Num,
1512 Value *Den) const {
1513 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1514 return nullptr; // Keep it for later optimization.
1516 Instruction::BinaryOps Opc = I.getOpcode();
1518 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1519 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1521 int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1522 if (NumDivBits == -1)
1523 return nullptr;
1525 Value *Narrowed = nullptr;
1526 if (NumDivBits <= 24) {
1527 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1528 IsDiv, IsSigned);
1529 } else if (NumDivBits <= 32) {
1530 Narrowed = expandDivRem32(Builder, I, Num, Den);
1533 if (Narrowed) {
1534 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1535 Builder.CreateZExt(Narrowed, Num->getType());
1538 return nullptr;
1541 void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1542 Instruction::BinaryOps Opc = I.getOpcode();
1543 // Do the general expansion.
1544 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1545 expandDivisionUpTo64Bits(&I);
1546 return;
1549 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1550 expandRemainderUpTo64Bits(&I);
1551 return;
1554 llvm_unreachable("not a division");
1557 bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1558 if (foldBinOpIntoSelect(I))
1559 return true;
1561 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1562 UA->isUniform(&I) && promoteUniformOpToI32(I))
1563 return true;
1565 if (UseMul24Intrin && replaceMulWithMul24(I))
1566 return true;
1568 bool Changed = false;
1569 Instruction::BinaryOps Opc = I.getOpcode();
1570 Type *Ty = I.getType();
1571 Value *NewDiv = nullptr;
1572 unsigned ScalarSize = Ty->getScalarSizeInBits();
1574 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1576 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1577 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1578 ScalarSize <= 64 &&
1579 !DisableIDivExpand) {
1580 Value *Num = I.getOperand(0);
1581 Value *Den = I.getOperand(1);
1582 IRBuilder<> Builder(&I);
1583 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1585 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1586 NewDiv = PoisonValue::get(VT);
1588 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1589 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1590 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1592 Value *NewElt;
1593 if (ScalarSize <= 32) {
1594 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1595 if (!NewElt)
1596 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1597 } else {
1598 // See if this 64-bit division can be shrunk to 32/24-bits before
1599 // producing the general expansion.
1600 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1601 if (!NewElt) {
1602 // The general 64-bit expansion introduces control flow and doesn't
1603 // return the new value. Just insert a scalar copy and defer
1604 // expanding it.
1605 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1606 Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1610 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1611 NewEltI->copyIRFlags(&I);
1613 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1615 } else {
1616 if (ScalarSize <= 32)
1617 NewDiv = expandDivRem32(Builder, I, Num, Den);
1618 else {
1619 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1620 if (!NewDiv)
1621 Div64ToExpand.push_back(&I);
1625 if (NewDiv) {
1626 I.replaceAllUsesWith(NewDiv);
1627 I.eraseFromParent();
1628 Changed = true;
1632 if (ExpandDiv64InIR) {
1633 // TODO: We get much worse code in specially handled constant cases.
1634 for (BinaryOperator *Div : Div64ToExpand) {
1635 expandDivRem64(*Div);
1636 FlowChanged = true;
1637 Changed = true;
1641 return Changed;
1644 bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1645 if (!WidenLoads)
1646 return false;
1648 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1649 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1650 canWidenScalarExtLoad(I)) {
1651 IRBuilder<> Builder(&I);
1652 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1654 Type *I32Ty = Builder.getInt32Ty();
1655 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1656 WidenLoad->copyMetadata(I);
1658 // If we have range metadata, we need to convert the type, and not make
1659 // assumptions about the high bits.
1660 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1661 ConstantInt *Lower =
1662 mdconst::extract<ConstantInt>(Range->getOperand(0));
1664 if (Lower->isNullValue()) {
1665 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1666 } else {
1667 Metadata *LowAndHigh[] = {
1668 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1669 // Don't make assumptions about the high bits.
1670 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1671 };
1673 WidenLoad->setMetadata(LLVMContext::MD_range,
1674 MDNode::get(Mod->getContext(), LowAndHigh));
1675 }
1676 }
1678 int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
1679 Type *IntNTy = Builder.getIntNTy(TySize);
1680 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1681 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1682 I.replaceAllUsesWith(ValOrig);
1683 I.eraseFromParent();
1684 return true;
1685 }
1687 return false;
1688 }
1690 bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1691 bool Changed = false;
1693 if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1694 UA->isUniform(&I))
1695 Changed |= promoteUniformOpToI32(I);
1697 return Changed;
1698 }
1700 bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1701 Value *Cond = I.getCondition();
1702 Value *TrueVal = I.getTrueValue();
1703 Value *FalseVal = I.getFalseValue();
1704 Value *CmpVal;
1705 FCmpInst::Predicate Pred;
1707 if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
1708 if (UA->isUniform(&I))
1709 return promoteUniformOpToI32(I);
1710 return false;
1711 }
1713 // Match fract pattern with nan check.
1714 if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
1715 return false;
1717 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1718 if (!FPOp)
1719 return false;
1721 IRBuilder<> Builder(&I);
1722 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1724 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1725 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1727 Value *Fract = nullptr;
1728 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1729 CmpVal == matchFractPat(*IIFalse)) {
1730 // isnan(x) ? x : fract(x)
1731 Fract = applyFractPat(Builder, CmpVal);
1732 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1733 CmpVal == matchFractPat(*IITrue)) {
1734 // !isnan(x) ? fract(x) : x
1735 Fract = applyFractPat(Builder, CmpVal);
1736 } else
1737 return false;
1739 Fract->takeName(&I);
1740 I.replaceAllUsesWith(Fract);
1741 RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
1742 return true;
1743 }
1745 static bool areInSameBB(const Value *A, const Value *B) {
1746 const auto *IA = dyn_cast<Instruction>(A);
1747 const auto *IB = dyn_cast<Instruction>(B);
1748 return IA && IB && IA->getParent() == IB->getParent();
1749 }
1751 // Helper for breaking large PHIs that returns true when an extractelement on V
1752 // is likely to be folded away by the DAG combiner.
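// Illustrative example (assumed IR, for exposition only): an incoming value
// defined in the same block as
//   %v0 = insertelement <2 x i32> poison, i32 %a, i64 0
//   %v1 = insertelement <2 x i32> %v0, i32 %b, i64 1
// covers every element, so extractelements fed from the broken-up PHI can fold
// directly back to %a and %b.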
1753 static bool isInterestingPHIIncomingValue(const Value *V) {
1754 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1755 if (!FVT)
1756 return false;
1758 const Value *CurVal = V;
1760 // Check for insertelements, keeping track of the elements covered.
1761 BitVector EltsCovered(FVT->getNumElements());
1762 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1763 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1765 // Non-constant or out-of-bounds index -> folding is unlikely.
1766 // The latter is more of a sanity check because canonical IR should just
1767 // have replaced those with poison.
1768 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1769 return false;
1771 const auto *VecSrc = IE->getOperand(0);
1773 // If the vector source is another instruction, it must be in the same basic
1774 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1775 // unlikely to be able to do anything interesting here.
1776 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1777 return false;
1779 CurVal = VecSrc;
1780 EltsCovered.set(Idx->getZExtValue());
1782 // All elements covered.
1783 if (EltsCovered.all())
1784 return true;
1785 }
1787 // We either didn't find a single insertelement, or the insertelement chain
1788 // ended before all elements were covered. Check for other interesting values.
1790 // Constants are always interesting because we can just constant fold the
1791 // extractelements.
1792 if (isa<Constant>(CurVal))
1793 return true;
1795 // shufflevector is likely to be profitable if either operand is a constant,
1796 // or if either source is in the same block.
1797 // This is because shufflevector is most often lowered as a series of
1798 // insert/extract elements anyway.
1799 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1800 return isa<Constant>(SV->getOperand(1)) ||
1801 areInSameBB(SV, SV->getOperand(0)) ||
1802 areInSameBB(SV, SV->getOperand(1));
1803 }
1805 return false;
1806 }
1808 static void collectPHINodes(const PHINode &I,
1809 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1810 const auto [It, Inserted] = SeenPHIs.insert(&I);
1811 if (!Inserted)
1812 return;
1814 for (const Value *Inc : I.incoming_values()) {
1815 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1816 collectPHINodes(*PhiInc, SeenPHIs);
1817 }
1819 for (const User *U : I.users()) {
1820 if (const auto *PhiU = dyn_cast<PHINode>(U))
1821 collectPHINodes(*PhiU, SeenPHIs);
1822 }
1823 }
1825 bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1826 // Check in the cache first.
1827 if (const auto It = BreakPhiNodesCache.find(&I);
1828 It != BreakPhiNodesCache.end())
1829 return It->second;
1831 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1832 // recursively consider all its users and incoming values that are also PHI
1833 // nodes. We then make a decision about all of those PHIs at once. Either they
1834 // all get broken up, or none of them do. That way, we avoid cases where a
1835 // single PHI is/is not broken and we end up reforming/exploding a vector
1836 // multiple times, or even worse, doing it in a loop.
1837 SmallPtrSet<const PHINode *, 8> WorkList;
1838 collectPHINodes(I, WorkList);
1840 #ifndef NDEBUG
1841 // Check that none of the PHI nodes in the worklist are in the map. If some of
1842 // them are, it means we're not good enough at collecting related PHIs.
1843 for (const PHINode *WLP : WorkList) {
1844 assert(BreakPhiNodesCache.count(WLP) == 0);
1845 }
1846 #endif
1848 // To consider a PHI profitable to break, we need to see some interesting
1849 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1850 // must have one to consider all PHIs breakable.
1852 // This threshold has been determined through performance testing.
1854 // Note that the computation below is equivalent to
1856 // (unsigned)ceil((K / 3.0) * 2)
1858 // It's simply written this way to avoid mixing integral/FP arithmetic.
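//
// For example (illustrative): with K = 5 PHIs in the worklist,
// alignTo(5 * 2, 3) / 3 == 12 / 3 == 4, which matches ceil((5 / 3.0) * 2) == 4.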
1859 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1860 unsigned NumBreakablePHIs = 0;
1861 bool CanBreak = false;
1862 for (const PHINode *Cur : WorkList) {
1863 // Don't break PHIs that have no interesting incoming values. That is, where
1864 // there is no clear opportunity to fold the "extractelement" instructions
1865 // we would add.
1867 // Note: IC does not run after this pass, so we're only interested in the
1868 // foldings that the DAG combiner can do.
1869 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1870 if (++NumBreakablePHIs >= Threshold) {
1871 CanBreak = true;
1872 break;
1873 }
1874 }
1875 }
1877 for (const PHINode *Cur : WorkList)
1878 BreakPhiNodesCache[Cur] = CanBreak;
1880 return CanBreak;
1881 }
1883 /// Helper class for "break large PHIs" (visitPHINode).
1885 /// This represents a slice of a PHI's incoming value, which is made up of:
1886 /// - The type of the slice (Ty)
1887 /// - The index in the incoming value's vector where the slice starts (Idx)
1888 /// - The number of elements in the slice (NumElts).
1889 /// It also keeps track of the NewPHI node inserted for this particular slice.
1891 /// Slice examples:
1892 /// <4 x i64> -> Split into four i64 slices.
1893 /// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1894 /// <5 x i16> -> Split into two <2 x i16> slices + an i16 tail.
1895 /// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1896 class VectorSlice {
1897 public:
1898 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1899 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1901 Type *Ty = nullptr;
1902 unsigned Idx = 0;
1903 unsigned NumElts = 0;
1904 PHINode *NewPHI = nullptr;
1906 /// Slice \p Inc according to the information contained within this slice.
1907 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1908 /// pair, it returns the same Sliced value as well.
1910 /// Note this *intentionally* does not return the same value for, say,
1911 /// [%bb.0, %0] & [%bb.1, %0] as:
1912 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1913 /// the value in bb.1 may not be reachable from bb.0 if it's its
1914 /// predecessor.)
1915 /// - We also want to make our extract instructions as local as possible so
1916 /// the DAG has better chances of folding them out. Duplicating them like
1917 /// that is beneficial in that regard.
1919 /// This is both a minor optimization to avoid creating duplicate
1920 /// instructions and a requirement for correctness. It is not forbidden
1921 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1922 /// returned a new value each time, those previously identical pairs would all
1923 /// have different incoming values (from the same block) and it'd cause a "PHI
1924 /// node has multiple entries for the same basic block with different incoming
1925 /// values!" verifier error.
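///
/// A hypothetical case where this matters: a predecessor reaching the PHI
/// through several switch cases contributes the same [BB, Val] pair more than
/// once, e.g. "phi <2 x i32> [ %v, %bb ], [ %v, %bb ]", and both entries must
/// then receive the identical sliced value.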
1926 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1927 Value *&Res = SlicedVals[{BB, Inc}];
1928 if (Res)
1929 return Res;
1931 IRBuilder<> B(BB->getTerminator());
1932 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1933 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1935 if (NumElts > 1) {
1936 SmallVector<int, 4> Mask;
1937 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1938 Mask.push_back(K);
1939 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1940 } else
1941 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1943 return Res;
1944 }
1946 private:
1947 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1948 };
1950 bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1951 // Break-up fixed-vector PHIs into smaller pieces.
1952 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1953 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1955 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1956 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1957 // With large, odd-sized PHIs we may end up needing many `build_vector`
1958 // operations with most elements being "undef". This inhibits a lot of
1959 // optimization opportunities and can result in unreasonably high register
1960 // pressure and the inevitable stack spilling.
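//
// For instance (illustrative only): with the default threshold a <4 x i64> PHI
// becomes four scalar i64 PHIs plus the inserts that rebuild the vector, while
// a <6 x i16> PHI becomes three <2 x i16> PHIs.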
1961 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1962 return false;
1964 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1965 if (!FVT || FVT->getNumElements() == 1 ||
1966 DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1967 return false;
1969 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1970 return false;
1972 std::vector<VectorSlice> Slices;
1974 Type *EltTy = FVT->getElementType();
1976 unsigned Idx = 0;
1977 // For 8/16-bit element types, don't scalarize fully but break the vector into as many
1978 // 32-bit slices as we can, and scalarize the tail.
1979 const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
1980 const unsigned NumElts = FVT->getNumElements();
1981 if (EltSize == 8 || EltSize == 16) {
1982 const unsigned SubVecSize = (32 / EltSize);
1983 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1984 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1985 Idx += SubVecSize)
1986 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1987 }
1989 // Scalarize all remaining elements.
1990 for (; Idx < NumElts; ++Idx)
1991 Slices.emplace_back(EltTy, Idx, 1);
1994 assert(Slices.size() > 1);
1996 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1997 // creating the necessary instruction to extract the relevant slices of each
1998 // incoming value.
1999 IRBuilder<> B(I.getParent());
2000 B.SetCurrentDebugLocation(I.getDebugLoc());
2002 unsigned IncNameSuffix = 0;
2003 for (VectorSlice &S : Slices) {
2004 // We need to reset the builder's insert point on each iteration, because
2005 // getSlicedVal may have inserted something into I's BB.
2006 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
2007 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
2009 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
2010 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
2011 "largephi.extractslice" +
2012 std::to_string(IncNameSuffix++)),
2013 BB);
2014 }
2015 }
2017 // And replace this PHI with a vector of all the previous PHI values.
2018 Value *Vec = PoisonValue::get(FVT);
2019 unsigned NameSuffix = 0;
2020 for (VectorSlice &S : Slices) {
2021 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
2022 if (S.NumElts > 1)
2023 Vec =
2024 B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
2025 else
2026 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
2027 }
2029 I.replaceAllUsesWith(Vec);
2030 I.eraseFromParent();
2031 return true;
2032 }
2034 /// \param V Value to check
2035 /// \param DL DataLayout
2036 /// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
2037 /// \param AS Target Address Space
2038 /// \return true if \p V cannot be the null value of \p AS, false otherwise.
2039 static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
2040 const AMDGPUTargetMachine &TM, unsigned AS) {
2041 // Pointer cannot be null if it's a block address, GV or alloca.
2042 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
2043 // it as the symbol could be null in such cases.
2044 if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
2045 return true;
2047 // Check nonnull arguments.
2048 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
2049 return true;
2051 // getUnderlyingObject may have looked through another addrspacecast, although
2052 // such optimizable situations have most likely been folded out by now.
2053 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
2054 return false;
2056 // TODO: Calls that return nonnull?
2058 // For all other things, use KnownBits.
2059 // We either use 0 or all bits set to indicate null, so check whether the
2060 // value can be zero or all ones.
2062 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2063 // address spaces have non-zero null values.
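// Sketch of the intent, assuming the usual AMDGPU null encodings: private and
// local null is all-ones (-1) while other address spaces use 0, so "known
// non-null" means the known bits rule out all-ones in the former case and
// zero in the latter.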
2064 auto SrcPtrKB = computeKnownBits(V, DL);
2065 const auto NullVal = TM.getNullPointerValue(AS);
2067 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2068 assert((NullVal == 0 || NullVal == -1) &&
2069 "don't know how to check for this null value!");
2070 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2071 }
2073 bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2074 // The intrinsic doesn't support vectors. It also seems difficult to prove that
2075 // a vector cannot contain any null pointers, so it's unclear whether this is
2076 // worth supporting.
2077 if (I.getType()->isVectorTy())
2078 return false;
2080 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
2081 // This is only worthwhile for casts between flat and private/local.
2082 const unsigned SrcAS = I.getSrcAddressSpace();
2083 const unsigned DstAS = I.getDestAddressSpace();
2085 bool CanLower = false;
2086 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2087 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2088 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2089 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2090 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2091 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2092 if (!CanLower)
2093 return false;
2095 SmallVector<const Value *, 4> WorkList;
2096 getUnderlyingObjects(I.getOperand(0), WorkList);
2097 if (!all_of(WorkList, [&](const Value *V) {
2098 return isPtrKnownNeverNull(V, *DL, *TM, SrcAS);
2099 }))
2100 return false;
2102 IRBuilder<> B(&I);
2103 auto *Intrin = B.CreateIntrinsic(
2104 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2105 I.replaceAllUsesWith(Intrin);
2106 I.eraseFromParent();
2107 return true;
2108 }
2110 bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2111 switch (I.getIntrinsicID()) {
2112 case Intrinsic::bitreverse:
2113 return visitBitreverseIntrinsicInst(I);
2114 case Intrinsic::minnum:
2115 return visitMinNum(I);
2116 case Intrinsic::sqrt:
2117 return visitSqrt(I);
2118 default:
2119 return false;
2120 }
2121 }
2123 bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2124 bool Changed = false;
2126 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
2127 UA->isUniform(&I))
2128 Changed |= promoteUniformBitreverseToI32(I);
2130 return Changed;
2131 }
2133 /// Match the non-nan fract pattern:
2134 ///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2135 ///
2136 /// Only matches if fract is a useful instruction for the subtarget. Does not
2137 /// account for the nan handling; the instruction has a nan check on the input value.
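///
/// A sketch of IR this is expected to match (f32 case; the constant is the
/// float value just below 1.0):
///   %floor = call float @llvm.floor.f32(float %x)
///   %sub = fsub float %x, %floor
///   %min = call float @llvm.minnum.f32(float %sub, float 0x3FEFFFFFE0000000)
/// Here the matcher would return %x.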
2138 Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2139 if (ST->hasFractBug())
2140 return nullptr;
2142 if (I.getIntrinsicID() != Intrinsic::minnum)
2143 return nullptr;
2145 Type *Ty = I.getType();
2146 if (!isLegalFloatingTy(Ty->getScalarType()))
2147 return nullptr;
2149 Value *Arg0 = I.getArgOperand(0);
2150 Value *Arg1 = I.getArgOperand(1);
2152 const APFloat *C;
2153 if (!match(Arg1, m_APFloat(C)))
2154 return nullptr;
2156 APFloat One(1.0);
2157 bool LosesInfo;
2158 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
2160 // Match nextafter(1.0, -1)
2161 One.next(true);
2162 if (One != *C)
2163 return nullptr;
2165 Value *FloorSrc;
2166 if (match(Arg0, m_FSub(m_Value(FloorSrc),
2167 m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
2168 return FloorSrc;
2169 return nullptr;
2170 }
2172 Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2173 Value *FractArg) {
2174 SmallVector<Value *, 4> FractVals;
2175 extractValues(Builder, FractVals, FractArg);
2177 SmallVector<Value *, 4> ResultVals(FractVals.size());
2179 Type *Ty = FractArg->getType()->getScalarType();
2180 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2181 ResultVals[I] =
2182 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2183 }
2185 return insertValues(Builder, FractArg->getType(), ResultVals);
2186 }
2188 bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
2189 Value *FractArg = matchFractPat(I);
2190 if (!FractArg)
2191 return false;
2193 // Match pattern for fract intrinsic in contexts where the nan check has been
2194 // optimized out (and hope the knowledge the source can't be nan wasn't lost).
2195 if (!I.hasNoNaNs() &&
2196 !isKnownNeverNaN(FractArg, /*Depth=*/0, SimplifyQuery(*DL, TLInfo)))
2197 return false;
2199 IRBuilder<> Builder(&I);
2200 FastMathFlags FMF = I.getFastMathFlags();
2201 FMF.setNoNaNs();
2202 Builder.setFastMathFlags(FMF);
2204 Value *Fract = applyFractPat(Builder, FractArg);
2205 Fract->takeName(&I);
2206 I.replaceAllUsesWith(Fract);
2208 RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
2209 return true;
2210 }
2212 static bool isOneOrNegOne(const Value *Val) {
2213 const APFloat *C;
2214 return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
2215 }
2217 // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2218 bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2219 Type *Ty = Sqrt.getType()->getScalarType();
2220 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
2221 return false;
2223 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2224 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2226 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2227 // of fast llvm.sqrt will give the raw instruction anyway.
2228 if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
2229 return false;
2231 const float ReqdAccuracy = FPOp->getFPAccuracy();
2233 // Defer correctly rounded expansion to codegen.
2234 if (ReqdAccuracy < 1.0f)
2235 return false;
2237 // FIXME: This is an ugly hack for this pass using forward iteration instead
2238 // of reverse. If it worked like a normal combiner, the rsq would form before
2239 // we saw a sqrt call.
2240 auto *FDiv =
2241 dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
2242 if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
2243 FDiv->getFPAccuracy() >= 1.0f &&
2244 canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
2245 // TODO: We should also handle the arcp case for the fdiv with non-1 value
2246 isOneOrNegOne(FDiv->getOperand(0)))
2247 return false;
2249 Value *SrcVal = Sqrt.getOperand(0);
2250 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2252 // The raw instruction is 1 ulp, but the correction for denormal handling
2253 // brings it to 2.
2254 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2255 return false;
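// Illustrative examples: a sqrt annotated !fpmath !{float 2.0} can take the
// 2 ulp expansion below even when denormals must be honored, while
// !fpmath !{float 1.0} only qualifies when the input can be treated as DAZ.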
2257 IRBuilder<> Builder(&Sqrt);
2258 SmallVector<Value *, 4> SrcVals;
2259 extractValues(Builder, SrcVals, SrcVal);
2261 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2262 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2263 if (CanTreatAsDAZ)
2264 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2265 else
2266 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2267 }
2269 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2270 NewSqrt->takeName(&Sqrt);
2271 Sqrt.replaceAllUsesWith(NewSqrt);
2272 Sqrt.eraseFromParent();
2273 return true;
2274 }
2276 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
2277 Impl.Mod = &M;
2278 Impl.DL = &Impl.Mod->getDataLayout();
2279 Impl.SqrtF32 = nullptr;
2280 Impl.LdexpF32 = nullptr;
2281 return false;
2282 }
2284 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2285 if (skipFunction(F))
2286 return false;
2288 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2289 if (!TPC)
2290 return false;
2292 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2293 Impl.TM = &TM;
2294 Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2295 Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
2296 Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2297 Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2298 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2299 Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
2300 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2301 SIModeRegisterDefaults Mode(F, *Impl.ST);
2302 Impl.HasFP32DenormalFlush =
2303 Mode.FP32Denormals == DenormalMode::getPreserveSign();
2304 return Impl.run(F);
2305 }
2307 PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2308 FunctionAnalysisManager &FAM) {
2309 AMDGPUCodeGenPrepareImpl Impl;
2310 Impl.Mod = F.getParent();
2311 Impl.DL = &Impl.Mod->getDataLayout();
2312 Impl.TM = static_cast<const AMDGPUTargetMachine *>(&TM);
2313 Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
2314 Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
2315 Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
2316 Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
2317 Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2318 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2319 SIModeRegisterDefaults Mode(F, *Impl.ST);
2320 Impl.HasFP32DenormalFlush =
2321 Mode.FP32Denormals == DenormalMode::getPreserveSign();
2322 PreservedAnalyses PA = PreservedAnalyses::none();
2323 if (!Impl.FlowChanged)
2324 PA.preserveSet<CFGAnalyses>();
2325 return Impl.run(F) ? PA : PreservedAnalyses::all();
2326 }
2328 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2329 "AMDGPU IR optimizations", false, false)
2330 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2331 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2332 INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2333 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2334 false, false)
2336 char AMDGPUCodeGenPrepare::ID = 0;
2338 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2339 return new AMDGPUCodeGenPrepare();
2340 }