//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a uniform, small-type load from constant memory to a full
  /// 32 bits and then truncate the result, so a scalar load can be used
  /// instead of a vector load.
  ///
  /// \returns True if the load can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }

  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

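// Decide whether a particular f32 fdiv (or one element of a vector fdiv)
// should be kept as a plain fdiv instead of being replaced with the
// amdgcn.fdiv.fast intrinsic in visitFDiv below.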
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

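// Zero-extend both 32-bit operands to 64 bits, multiply, and return the low
// and high 32-bit halves of the product as a (Lo, Hi) pair.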
static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

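// \returns the high 32 bits of the zero-extended 64-bit product, i.e. mulhu.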
static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation; it's easier to recompute it.
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}

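// Expand a 32-bit (or narrower) integer division or remainder into IR built
// around a scaled f32 reciprocal estimate of the denominator: the estimate is
// corrected by its own rounding error, a candidate quotient is formed with
// mulhu, and the quotient or remainder is then adjusted up or down by one.
// Inputs that fit in 24 bits take the cheaper expandDivRem24 path instead.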
Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

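// Promote uniform sub-32-bit binary operations to i32 when 16-bit instructions
// are available, and expand integer division/remainder of 32 bits or less
// inline via expandDivRem32.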
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}

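// Widen uniform, sub-dword loads from the constant address spaces to full
// 32-bit loads so they can be selected as scalar loads, then truncate the
// result back to the original type.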
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}