//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include <cassert>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-codegenprepare"

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));
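
// As a cl::opt<bool>, this flag can be toggled on the tool command line, e.g.
// -amdgpu-codegenprepare-widen-constant-loads=0 disables the load widening
// performed in visitLoadInst below.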

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to an equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to a 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to a 32 bit 'select' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to a 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  /// Widen a scalar load.
  ///
  /// \details Widens a uniform, sub-dword load from constant memory to a full
  /// 32 bits and then truncates the result, so that a scalar load can be used
  /// instead of a vector load. This predicate checks whether load \p I is a
  /// candidate for that widening.
  ///
  /// \returns True if \p I can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}
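
// Illustrative example (value names are arbitrary): a uniform 16 bit add such
// as
//   %r = add i16 %a, %b
// is rewritten by promoteUniformOpToI32 below into
//   %ext0 = zext i16 %a to i32
//   %ext1 = zext i16 %b to i32
//   %op   = add i32 %ext0, %ext1
//   %res  = trunc i32 %op to i16
// (sign extension is used instead for signed operations such as ashr).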
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
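
// Illustrative example (value names are arbitrary): a uniform
//   %r = call i16 @llvm.bitreverse.i16(i16 %a)
// is rewritten by promoteUniformBitreverseToI32 below into
//   %ext  = zext i16 %a to i32
//   %brev = call i32 @llvm.bitreverse.i32(i32 %ext)
//   %lshr = lshr i32 %brev, 16
//   %res  = trunc i32 %lshr to i16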
bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
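// Illustrative example (value names are arbitrary): given 2.5 ulp !fpmath
// metadata and no unsafe-math flags,
//   %d = fdiv float %x, %y, !fpmath !0    ; !0 = !{float 2.5}
// is replaced below with
//   %fast = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)
// which takes over all uses of %d, unless shouldKeepFDivF32 above decides the
// plain fdiv should be kept.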
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                                      FMF.allowReciprocal();

  // With UnsafeDiv the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}
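
// getMul64 below computes the full 64 bit product of two i32 values and
// returns it as a {low, high} pair of i32 halves. For example, multiplying
// 0xFFFFFFFF by 0xFFFFFFFF gives 0xFFFFFFFE00000001, i.e. the pair
// {0x00000001, 0xFFFFFFFE}.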
static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
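// Requiring at least 9 redundant sign bits on each operand (checked below via
// ComputeNumSignBits) keeps the values within 24 bits, which the 24-bit
// significand of an f32 can represent exactly.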
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM });
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      { FQNeg, FB, FA }, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}
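
// expandDivRem32 below expands a 32 bit udiv/urem with the classic scaled
// reciprocal (URECIP) sequence: estimate 2^32/Den in floating point, refine
// the estimate using its computed rounding error, take the high half of the
// product with Num as the quotient, and then apply at most one +/-1 quotient
// (or +/-Den remainder) correction. Signed cases are handled by operating on
// magnitudes and re-applying the sign at the end.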
Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}
&I
) {
856 bool Changed
= false;
858 if (ST
->has16BitInsts() && needsPromotionToI32(I
.getType()) &&
860 Changed
|= promoteUniformOpToI32(I
);
865 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst
&I
) {
866 switch (I
.getIntrinsicID()) {
867 case Intrinsic::bitreverse
:
868 return visitBitreverseIntrinsicInst(I
);
874 bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst
&I
) {
875 bool Changed
= false;
877 if (ST
->has16BitInsts() && needsPromotionToI32(I
.getType()) &&
879 Changed
|= promoteUniformBitreverseToI32(I
);

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}