//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "R600Subtarget.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
26 #define DEBUG_TYPE "AMDGPUtti"

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
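//
// For example, fmed3(1.0, 3.0, 2.0): Max3 is 3.0 and equals Src1, so the
// result is maxnum(1.0, 2.0) = 2.0, the median of the three inputs.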
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the index down to a half, we do not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

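// Rewrite image intrinsics so that address operands that are provably
// representable in 16 bits use the 16-bit (A16/G16) overloads.
// Illustrative example: an image sample whose float coordinates are all
// fpext from half values is re-declared with half-typed coordinate arguments
// and is fed the original half values directly.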
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

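    // For example, llvm.amdgcn.class.f32(float %x, i32 3), i.e. a mask of
    // S_NAN | Q_NAN, is just an "is NaN" test and becomes the fcmp below.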
    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
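    // For example, llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 16) extracts bits
    // [8, 23] of %x and becomes (lshr (shl %x, 8), 16) via the code below.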
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
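    // For example, fmed3(nan, %x, %y) folds to minnum(%x, %y) below.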
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
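        // For example, llvm.amdgcn.icmp(i32 7, i32 7, eq) with a 64-bit result
        // type becomes: call i64 @llvm.read_register.i64(metadata !"exec").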
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
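        // For example, i32 @llvm.amdgcn.ballot.i32(i1 true) becomes
        //   call i32 @llvm.read_register.i32(metadata !"exec_lo").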
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
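    // For example, llvm.amdgcn.fmul.legacy(float %x, float 0.0) folds to +0.0
    // below even if %x is NaN or infinity.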
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
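///
/// For example (illustrative), a <4 x float> buffer load of which only the
/// first two elements are used is shrunk to a <2 x float> load, and the result
/// is shuffled back to the original <4 x float> type.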
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

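    // For example (illustrative), if only elements 2 and 3 of a <4 x float>
    // load are demanded, the two leading components are dropped and the byte
    // offset below is increased by 2 * sizeof(float) = 8.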
    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

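    // For example (illustrative), a load with dmask = 0xf of which only the
    // first result component is demanded ends up with NewDMaskVal = 0x1.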
    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}