//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
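
  // Illustrative sketch (example values, not taken from a test): a uniform
  // i16 add such as
  //   %r = add i16 %a, %b
  // is rewritten roughly as
  //   %a32 = zext i16 %a to i32
  //   %b32 = zext i16 %b to i32
  //   %r32 = add nuw nsw i32 %a32, %b32
  //   %r   = trunc i32 %r32 to i16
  // (operands of signed operations such as ashr are sign-extended instead).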

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
  /// result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
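
  // Illustrative sketch (example values, not taken from a test): a uniform
  //   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  // becomes roughly
  //   %x32 = zext i16 %x to i32
  //   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
  //   %s   = lshr i32 %r32, 16        ; 32 - base element bit width
  //   %r   = trunc i32 %s to i16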

  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
  bool isI24(Value *V, unsigned ScalarSize) const;
  bool isU24(Value *V, unsigned ScalarSize) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  /// SelectionDAG has an issue where an and asserting the bits are known
  bool replaceMulWithMul24(BinaryOperator &I) const;
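
  // Illustrative sketch (example values, not taken from a test): with both
  // operands known to fit in 24 bits unsigned, a divergent
  //   %r = mul i32 %a, %b
  // becomes
  //   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
  // Vector multiplies are scalarized first and reassembled afterwards.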

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  /// Widen a scalar load.
  ///
  /// \details Widen a uniform, small-type load from constant memory to a full
  /// 32 bits and then truncate the result, to allow a scalar load instead of a
  /// vector load.
  ///
  /// \returns True.
  bool canWidenScalarExtLoad(LoadInst &I) const;
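
  // Illustrative sketch (example values, not taken from a test): a uniform,
  // naturally aligned sub-dword load from constant memory (addrspace(4)) such
  // as
  //   %v = load i8, i8 addrspace(4)* %p, align 4
  // is widened to
  //   %p32 = bitcast i8 addrspace(4)* %p to i32 addrspace(4)*
  //   %w   = load i32, i32 addrspace(4)* %p32, align 4
  //   %v   = trunc i32 %w to i8
  // so that a scalar load (e.g. s_load_dword) can be selected.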

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
                                               unsigned ScalarSize) const {
  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
  return ScalarSize - Known.countMinLeadingZeros();
}

unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
                                             unsigned ScalarSize) const {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
}

bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
  return ScalarSize >= 24 && // Types less than 24-bit should be treated
                             // as unsigned 24-bit values.
         numBitsSigned(V, ScalarSize) < 24;
}

bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
  return numBitsUnsigned(V, ScalarSize) <= 24;
}
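
// Worked example (illustrative only, not from a test): for
//   %v = and i32 %x, 16777215
// known bits give 8 leading zero bits, so numBitsUnsigned(%v, 32) == 24 and
// isU24 holds; isI24 additionally requires at least 9 sign bits, i.e. a value
// that fits in 24 bits when interpreted as signed.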

static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  VectorType *VT = dyn_cast<VectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (Values.size() == 1)
    return Values[0];

  Value *NewVal = UndefValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}

bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (DA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

  // TODO: Should this try to match mulhi24?
  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_u24;
  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_i24;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});

    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    }
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
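//
// Illustrative sketch (example values, not taken from a test): a 2.5 ulp
// f32 divide
//   %d = fdiv float %x, %y, !fpmath !0    ; !0 = !{float 2.500000e+00}
// is replaced, when denormals are off and the operands do not justify keeping
// the plain fdiv, with
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)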

bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}
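
// Note: getMul64 returns {lo, hi} of the 64-bit product of two 32-bit values,
// so getMulHu is the IR equivalent of a mulhu (high half of the unsigned
// multiply); the 32-bit division expansion below uses it to refine the
// reciprocal estimate and to form the quotient.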

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}

Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}