//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

static cl::opt<bool> Widen16BitOps(
  "amdgpu-codegenprepare-widen-16-bit-ops",
  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
  "amdgpu-codegenprepare-expand-div64",
  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
  "amdgpu-codegenprepare-disable-idiv-expansion",
  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));
class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;
  bool HasFP32Denormals = false;
  /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;
  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;
  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;
  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
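  ///
  /// For illustration, a uniform i16 add such as
  ///   %r = add i16 %a, %b
  /// is rewritten to roughly
  ///   %a32 = zext i16 %a to i32
  ///   %b32 = zext i16 %b to i32
  ///   %r32 = add i32 %a32, %b32
  ///   %r   = trunc i32 %r32 to i16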
  bool promoteUniformOpToI32(BinaryOperator &I) const;
  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;
  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
  /// result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;
  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
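  ///
  /// For illustration, a uniform
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  /// becomes roughly
  ///   %e = zext i16 %x to i32
  ///   %b = call i32 @llvm.bitreverse.i32(i32 %e)
  ///   %s = lshr i32 %b, 16
  ///   %r = trunc i32 %s to i16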
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
  bool isI24(Value *V, unsigned ScalarSize) const;
  bool isU24(Value *V, unsigned ScalarSize) const;
  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  /// SelectionDAG has an issue where an and asserting the bits are known
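  ///
  /// For illustration, when both operands are known to fit in 24 bits, a
  /// divergent
  ///   %r = mul i32 %a, %b
  /// becomes roughly
  ///   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)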
  bool replaceMulWithMul24(BinaryOperator &I) const;
  /// Perform same function as equivalently named function in DAGCombiner. Since
  /// we expand some divisions here, we need to perform this before obscuring.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;
  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  int getDivNumBits(BinaryOperator &I,
                    Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;
  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;
  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;
  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;
  /// Widen a scalar load.
  ///
  /// \details \p Widen scalar load for uniform, small type loads from constant
  //  memory / to a full 32-bits and then truncate the input to allow a scalar
  //  load instead of a vector load.
  ///
  /// \returns True.
  bool canWidenScalarExtLoad(LoadInst &I) const;
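  ///
  /// For illustration, a uniform, naturally aligned
  ///   %v = load i8, i8 addrspace(4)* %p, align 4
  /// is widened to roughly
  ///   %w = load i32, i32 addrspace(4)* %pc, align 4
  ///   %v = trunc i32 %w to i8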
public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
  bool visitFDiv(BinaryOperator &I);
  bool visitXor(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();

    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }
};

} // end anonymous namespace
unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}
Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
}
bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}
bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (!Widen16BitOps)
    return false;

  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}
// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }

  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
                                               unsigned ScalarSize) const {
  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
  return ScalarSize - Known.countMinLeadingZeros();
}

unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
                                             unsigned ScalarSize) const {
  // In order for this to be a signed 24-bit value, bit 23, must
  // be a sign bit.
  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
}
bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
  return ScalarSize >= 24 && // Types less than 24-bit should be treated
                             // as unsigned 24-bit values.
    numBitsSigned(V, ScalarSize) < 24;
}

bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
  return numBitsUnsigned(V, ScalarSize) <= 24;
}
static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  auto *VT = dyn_cast<FixedVectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (Values.size() == 1)
    return Values[0];

  Value *NewVal = UndefValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}
bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (DA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

  // TODO: Should this try to match mulhi24?
  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_u24;
  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_i24;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS, *RHS;
    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});

    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    }
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  I.eraseFromParent();

  return true;
}
// Find a select instruction, which may have been casted. This is mostly to deal
// with cases where i16 selects were promoted here to i32.
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
  Cast = nullptr;
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
    return Sel;

  if ((Cast = dyn_cast<CastInst>(V))) {
    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
      return Sel;
  }

  return nullptr;
}
bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  Constant *FoldedT = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
  if (isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF = SelOpNo ?
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
    ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
  if (isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  BO.eraseFromParent();
  if (CastOp)
    CastOp->eraseFromParent();
  Sel->eraseFromParent();
  return true;
}
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//               allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
                              bool RcpIsAccurate, IRBuilder<> &Builder,
                              Module *Mod) {

  if (!AllowInaccurateRcp && !RcpIsAccurate)
    return nullptr;

  Type *Ty = Den->getType();
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    if (AllowInaccurateRcp || RcpIsAccurate) {
      if (CLHS->isExactlyValue(1.0)) {
        Function *Decl = Intrinsic::getDeclaration(
          Mod, Intrinsic::amdgcn_rcp, Ty);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
        // use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        //       insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateCall(Decl, { Den });
      }

      // Same as for 1.0, but expand the sign out of the constant.
      if (CLHS->isExactlyValue(-1.0)) {
        Function *Decl = Intrinsic::getDeclaration(
          Mod, Intrinsic::amdgcn_rcp, Ty);

        // -1.0 / x -> rcp (fneg x)
        Value *FNeg = Builder.CreateFNeg(Den);
        return Builder.CreateCall(Decl, { FNeg });
      }
    }
  }

  if (AllowInaccurateRcp) {
    Function *Decl = Intrinsic::getDeclaration(
      Mod, Intrinsic::amdgcn_rcp, Ty);

    // Turn into multiply by the reciprocal.
    // x / y -> x * (1.0 / y)
    Value *Recip = Builder.CreateCall(Decl, { Den });
    return Builder.CreateFMul(Num, Recip);
  }
  return nullptr;
}
// optimize with fdiv.fast:
//
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
//
// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
                                   bool HasDenormals, IRBuilder<> &Builder,
                                   Module *Mod) {
  // fdiv.fast can achieve 2.5 ULP accuracy.
  if (ReqdAccuracy < 2.5f)
    return nullptr;

  // Only have fdiv.fast for f32.
  Type *Ty = Den->getType();
  if (!Ty->isFloatTy())
    return nullptr;

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
      NumIsOne = true;
  }

  // fdiv does not support denormals. But 1.0/x is always fine to use it.
  if (HasDenormals && !NumIsOne)
    return nullptr;

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
  return Builder.CreateCall(Decl, { Num, Den });
}
// Optimization is performed based on fpmath, fast math flags as well as
// denormals to optimize fdiv with either rcp or fdiv.fast.
//
// With rcp:
//   1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//                 allowed with unsafe-fp-math or afn.
//
//   a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
//
// With fdiv.fast:
//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
//   1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {

  Type *Ty = FDiv.getType()->getScalarType();

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen.
  if (Ty->isDoubleTy())
    return false;

  // No intrinsic for fdiv16 if target does not support f16.
  if (Ty->isHalfTy() && !ST->has16BitInsts())
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();

  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
            (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;
  if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      // Try rcp first.
      Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
                                      RcpIsAccurate, Builder, Mod);
      if (!NewElt) // Try fdiv.fast.
        NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
                                      HasFP32Denormals, Builder, Mod);
      if (!NewElt) // Keep the original.
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else { // Scalar FDiv.
    // Try rcp first.
    NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
                              Builder, Mod);
    if (!NewFDiv) { // Try fdiv.fast.
      NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
                                     Builder, Mod);
    }
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}
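// Fold (xor (llvm.amdgcn.class x, mask), -1) into (llvm.amdgcn.class x,
// mask ^ 0x3ff) when the class intrinsic has no other uses, i.e. invert the
// 10-bit test mask instead of negating the result.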
bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
  // Match the Xor instruction, its type and its operands
  IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
  ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
  if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
    return visitBinaryOperator(I);

  // Check if the Call is an intrinsic instruction to amdgcn_class intrinsic
  // which has only one use
  if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
      !IntrinsicCall->hasOneUse())
    return visitBinaryOperator(I);

  // "Not" the second argument of the intrinsic call
  ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
  if (!Arg)
    return visitBinaryOperator(I);

  IntrinsicCall->setOperand(
      1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
  I.replaceAllUsesWith(IntrinsicCall);
  I.eraseFromParent();
  return true;
}
static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsBool();
}
static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}
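
// In other words, getMulHu is a 32-bit umulh: it returns the upper half of
// the unsigned 64-bit product computed by getMul64.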
/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if the
/// first one is insufficient. Returns -1 on failure.
int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
                                        Value *Num, Value *Den,
                                        unsigned AtLeast, bool IsSigned) const {
  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < AtLeast)
    return -1;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < AtLeast)
    return -1;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
  if (IsSigned)
    ++DivBits;
  return DivBits;
}
// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
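// (A float carries a 24-bit significand, so integer values of magnitude up to
// 2^24 are represented exactly.)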
Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
  if (DivBits == -1)
    return nullptr;
  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}
Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
                                                BinaryOperator &I,
                                                Value *Num, Value *Den,
                                                unsigned DivBits,
                                                bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
                                                Builder.getFloatTy());
  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  auto FMAD = !ST->hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask
        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}
// Try to recognize special cases the DAG will emit special, better expansions
// than the general expansion we do here.
//
// TODO: It would be better to just directly handle those optimizations here.
bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
  BinaryOperator &I, Value *Num, Value *Den) const {
  if (Constant *C = dyn_cast<Constant>(Den)) {
    // Arbitrary constants get a better expansion as long as a wider mulhi is
    // legal.
    if (C->getType()->getScalarSizeInBits() <= 32)
      return true;

    // TODO: Sdiv check for not exact for some reason.

    // If there's no wider mulhi, there's only a better expansion for powers of
    // two.
    // TODO: Should really know for each vector element.
    if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
      return true;

    return false;
  }

  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
    // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
                               0, AC, &I, DT)) {
      return true;
    }
  }

  return false;
}
*getSign32(Value
*V
, IRBuilder
<> &Builder
, const DataLayout
*DL
) {
1031 // Check whether the sign can be determined statically.
1032 KnownBits Known
= computeKnownBits(V
, *DL
);
1033 if (Known
.isNegative())
1034 return Constant::getAllOnesValue(V
->getType());
1035 if (Known
.isNonNegative())
1036 return Constant::getNullValue(V
->getType());
1037 return Builder
.CreateAShr(V
, Builder
.getInt32(31));
Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I, Value *X,
                                            Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr;  // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      X = Builder.CreateSExt(X, I32Ty);
      Y = Builder.CreateSExt(Y, I32Ty);
    } else {
      X = Builder.CreateZExt(X, I32Ty);
      Y = Builder.CreateZExt(Y, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);

    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //
  //   return q;
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
  Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(Zero, Y);
  Value *NegYZ = Builder.CreateMul(NegY, Z);
  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, X, Z);
  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(R, Y);
  if (IsDiv)
    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(R, Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  else
    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}
*AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder
<> &Builder
,
1167 Value
*Num
, Value
*Den
) const {
1168 if (!ExpandDiv64InIR
&& divHasSpecialOptimization(I
, Num
, Den
))
1169 return nullptr; // Keep it for later optimization.
1171 Instruction::BinaryOps Opc
= I
.getOpcode();
1173 bool IsDiv
= Opc
== Instruction::SDiv
|| Opc
== Instruction::UDiv
;
1174 bool IsSigned
= Opc
== Instruction::SDiv
|| Opc
== Instruction::SRem
;
1176 int NumDivBits
= getDivNumBits(I
, Num
, Den
, 32, IsSigned
);
1177 if (NumDivBits
== -1)
1180 Value
*Narrowed
= nullptr;
1181 if (NumDivBits
<= 24) {
1182 Narrowed
= expandDivRem24Impl(Builder
, I
, Num
, Den
, NumDivBits
,
1184 } else if (NumDivBits
<= 32) {
1185 Narrowed
= expandDivRem32(Builder
, I
, Num
, Den
);
1189 return IsSigned
? Builder
.CreateSExt(Narrowed
, Num
->getType()) :
1190 Builder
.CreateZExt(Narrowed
, Num
->getType());
void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  // Do the general expansion.
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
    expandDivisionUpTo64Bits(&I);
    return;
  }

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
    expandRemainderUpTo64Bits(&I);
    return;
  }

  llvm_unreachable("not a division");
}
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
          }
        }

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      Changed = true;
    }
  }

  return Changed;
}
&I
) {
1299 if ((I
.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS
||
1300 I
.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
) &&
1301 canWidenScalarExtLoad(I
)) {
1302 IRBuilder
<> Builder(&I
);
1303 Builder
.SetCurrentDebugLocation(I
.getDebugLoc());
1305 Type
*I32Ty
= Builder
.getInt32Ty();
1306 Type
*PT
= PointerType::get(I32Ty
, I
.getPointerAddressSpace());
1307 Value
*BitCast
= Builder
.CreateBitCast(I
.getPointerOperand(), PT
);
1308 LoadInst
*WidenLoad
= Builder
.CreateLoad(I32Ty
, BitCast
);
1309 WidenLoad
->copyMetadata(I
);
1311 // If we have range metadata, we need to convert the type, and not make
1312 // assumptions about the high bits.
1313 if (auto *Range
= WidenLoad
->getMetadata(LLVMContext::MD_range
)) {
1314 ConstantInt
*Lower
=
1315 mdconst::extract
<ConstantInt
>(Range
->getOperand(0));
1317 if (Lower
->getValue().isNullValue()) {
1318 WidenLoad
->setMetadata(LLVMContext::MD_range
, nullptr);
1320 Metadata
*LowAndHigh
[] = {
1321 ConstantAsMetadata::get(ConstantInt::get(I32Ty
, Lower
->getValue().zext(32))),
1322 // Don't make assumptions about the high bits.
1323 ConstantAsMetadata::get(ConstantInt::get(I32Ty
, 0))
1326 WidenLoad
->setMetadata(LLVMContext::MD_range
,
1327 MDNode::get(Mod
->getContext(), LowAndHigh
));
1331 int TySize
= Mod
->getDataLayout().getTypeSizeInBits(I
.getType());
1332 Type
*IntNTy
= Builder
.getIntNTy(TySize
);
1333 Value
*ValTrunc
= Builder
.CreateTrunc(WidenLoad
, IntNTy
);
1334 Value
*ValOrig
= Builder
.CreateBitCast(ValTrunc
, I
.getType());
1335 I
.replaceAllUsesWith(ValOrig
);
1336 I
.eraseFromParent();
1343 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst
&I
) {
1344 bool Changed
= false;
1346 if (ST
->has16BitInsts() && needsPromotionToI32(I
.getOperand(0)->getType()) &&
1348 Changed
|= promoteUniformOpToI32(I
);
1353 bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst
&I
) {
1354 bool Changed
= false;
1356 if (ST
->has16BitInsts() && needsPromotionToI32(I
.getType()) &&
1358 Changed
|= promoteUniformOpToI32(I
);
1363 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst
&I
) {
1364 switch (I
.getIntrinsicID()) {
1365 case Intrinsic::bitreverse
:
1366 return visitBitreverseIntrinsicInst(I
);
1372 bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst
&I
) {
1373 bool Changed
= false;
1375 if (ST
->has16BitInsts() && needsPromotionToI32(I
.getType()) &&
1377 Changed
|= promoteUniformBitreverseToI32(I
);
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTWP ? &DTWP->getDomTree() : nullptr;

  HasUnsafeFPMath = hasUnsafeFPMath(F);

  AMDGPU::SIModeRegisterDefaults Mode(F);
  HasFP32Denormals = Mode.allFP32Denormals();

  bool MadeChange = false;

  Function::iterator NextBB;
  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
    BasicBlock *BB = &*FI;
    NextBB = std::next(FI);

    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
      Next = std::next(I);

      MadeChange |= visit(*I);

      if (Next != E) { // Control flow changed
        BasicBlock *NextInstBB = Next->getParent();
        if (NextInstBB != BB) {
          BB = NextInstBB;
          E = BB->end();
          FE = F.end();
        }
      }
    }
  }

  return MadeChange;
}
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)
char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}