1 //===- AMDGPInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
15 //===----------------------------------------------------------------------===//
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
26 using namespace llvm::PatternMatch
;
28 #define DEBUG_TYPE "AMDGPUtti"
32 struct AMDGPUImageDMaskIntrinsic
{
36 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37 #include "InstCombineTables.inc"
39 } // end anonymous namespace
41 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
43 // A single NaN input is folded to minnum, so we rely on that folding for
45 static APFloat
fmed3AMDGCN(const APFloat
&Src0
, const APFloat
&Src1
,
46 const APFloat
&Src2
) {
47 APFloat Max3
= maxnum(maxnum(Src0
, Src1
), Src2
);
49 APFloat::cmpResult Cmp0
= Max3
.compare(Src0
);
50 assert(Cmp0
!= APFloat::cmpUnordered
&& "nans handled separately");
51 if (Cmp0
== APFloat::cmpEqual
)
52 return maxnum(Src1
, Src2
);
54 APFloat::cmpResult Cmp1
= Max3
.compare(Src1
);
55 assert(Cmp1
!= APFloat::cmpUnordered
&& "nans handled separately");
56 if (Cmp1
== APFloat::cmpEqual
)
57 return maxnum(Src0
, Src2
);
59 return maxnum(Src0
, Src1
);
62 // Check if a value can be converted to a 16-bit value without losing
64 // The value is expected to be either a float (IsFloat = true) or an unsigned
65 // integer (IsFloat = false).
66 static bool canSafelyConvertTo16Bit(Value
&V
, bool IsFloat
) {
67 Type
*VTy
= V
.getType();
68 if (VTy
->isHalfTy() || VTy
->isIntegerTy(16)) {
69 // The value is already 16-bit, so we don't want to convert to 16-bit again!
73 if (ConstantFP
*ConstFloat
= dyn_cast
<ConstantFP
>(&V
)) {
74 // We need to check that if we cast the index down to a half, we do not
76 APFloat
FloatValue(ConstFloat
->getValueAPF());
77 bool LosesInfo
= true;
78 FloatValue
.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero
,
83 if (ConstantInt
*ConstInt
= dyn_cast
<ConstantInt
>(&V
)) {
84 // We need to check that if we cast the index down to an i16, we do not
86 APInt
IntValue(ConstInt
->getValue());
87 return IntValue
.getActiveBits() <= 16;
92 bool IsExt
= IsFloat
? match(&V
, m_FPExt(PatternMatch::m_Value(CastSrc
)))
93 : match(&V
, m_ZExt(PatternMatch::m_Value(CastSrc
)));
95 Type
*CastSrcTy
= CastSrc
->getType();
96 if (CastSrcTy
->isHalfTy() || CastSrcTy
->isIntegerTy(16))
103 // Convert a value to 16-bit.
104 static Value
*convertTo16Bit(Value
&V
, InstCombiner::BuilderTy
&Builder
) {
105 Type
*VTy
= V
.getType();
106 if (isa
<FPExtInst
>(&V
) || isa
<SExtInst
>(&V
) || isa
<ZExtInst
>(&V
))
107 return cast
<Instruction
>(&V
)->getOperand(0);
108 if (VTy
->isIntegerTy())
109 return Builder
.CreateIntCast(&V
, Type::getInt16Ty(V
.getContext()), false);
110 if (VTy
->isFloatingPointTy())
111 return Builder
.CreateFPCast(&V
, Type::getHalfTy(V
.getContext()));
113 llvm_unreachable("Should never be called!");
116 /// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
117 /// modified arguments (based on OldIntr) and replaces InstToReplace with
118 /// this newly created intrinsic call.
119 static std::optional
<Instruction
*> modifyIntrinsicCall(
120 IntrinsicInst
&OldIntr
, Instruction
&InstToReplace
, unsigned NewIntr
,
122 std::function
<void(SmallVectorImpl
<Value
*> &, SmallVectorImpl
<Type
*> &)>
124 SmallVector
<Type
*, 4> ArgTys
;
125 if (!Intrinsic::getIntrinsicSignature(OldIntr
.getCalledFunction(), ArgTys
))
128 SmallVector
<Value
*, 8> Args(OldIntr
.args());
130 // Modify arguments and types
133 Function
*I
= Intrinsic::getDeclaration(OldIntr
.getModule(), NewIntr
, ArgTys
);
135 CallInst
*NewCall
= IC
.Builder
.CreateCall(I
, Args
);
136 NewCall
->takeName(&OldIntr
);
137 NewCall
->copyMetadata(OldIntr
);
138 if (isa
<FPMathOperator
>(NewCall
))
139 NewCall
->copyFastMathFlags(&OldIntr
);
141 // Erase and replace uses
142 if (!InstToReplace
.getType()->isVoidTy())
143 IC
.replaceInstUsesWith(InstToReplace
, NewCall
);
145 bool RemoveOldIntr
= &OldIntr
!= &InstToReplace
;
147 auto RetValue
= IC
.eraseInstFromFunction(InstToReplace
);
149 IC
.eraseInstFromFunction(OldIntr
);
154 static std::optional
<Instruction
*>
155 simplifyAMDGCNImageIntrinsic(const GCNSubtarget
*ST
,
156 const AMDGPU::ImageDimIntrinsicInfo
*ImageDimIntr
,
157 IntrinsicInst
&II
, InstCombiner
&IC
) {
158 // Optimize _L to _LZ when _L is zero
159 if (const auto *LZMappingInfo
=
160 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr
->BaseOpcode
)) {
161 if (auto *ConstantLod
=
162 dyn_cast
<ConstantFP
>(II
.getOperand(ImageDimIntr
->LodIndex
))) {
163 if (ConstantLod
->isZero() || ConstantLod
->isNegative()) {
164 const AMDGPU::ImageDimIntrinsicInfo
*NewImageDimIntr
=
165 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo
->LZ
,
167 return modifyIntrinsicCall(
168 II
, II
, NewImageDimIntr
->Intr
, IC
, [&](auto &Args
, auto &ArgTys
) {
169 Args
.erase(Args
.begin() + ImageDimIntr
->LodIndex
);
175 // Optimize _mip away, when 'lod' is zero
176 if (const auto *MIPMappingInfo
=
177 AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr
->BaseOpcode
)) {
178 if (auto *ConstantMip
=
179 dyn_cast
<ConstantInt
>(II
.getOperand(ImageDimIntr
->MipIndex
))) {
180 if (ConstantMip
->isZero()) {
181 const AMDGPU::ImageDimIntrinsicInfo
*NewImageDimIntr
=
182 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo
->NONMIP
,
184 return modifyIntrinsicCall(
185 II
, II
, NewImageDimIntr
->Intr
, IC
, [&](auto &Args
, auto &ArgTys
) {
186 Args
.erase(Args
.begin() + ImageDimIntr
->MipIndex
);
192 // Optimize _bias away when 'bias' is zero
193 if (const auto *BiasMappingInfo
=
194 AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr
->BaseOpcode
)) {
195 if (auto *ConstantBias
=
196 dyn_cast
<ConstantFP
>(II
.getOperand(ImageDimIntr
->BiasIndex
))) {
197 if (ConstantBias
->isZero()) {
198 const AMDGPU::ImageDimIntrinsicInfo
*NewImageDimIntr
=
199 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo
->NoBias
,
201 return modifyIntrinsicCall(
202 II
, II
, NewImageDimIntr
->Intr
, IC
, [&](auto &Args
, auto &ArgTys
) {
203 Args
.erase(Args
.begin() + ImageDimIntr
->BiasIndex
);
204 ArgTys
.erase(ArgTys
.begin() + ImageDimIntr
->BiasTyArg
);
210 // Optimize _offset away when 'offset' is zero
211 if (const auto *OffsetMappingInfo
=
212 AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr
->BaseOpcode
)) {
213 if (auto *ConstantOffset
=
214 dyn_cast
<ConstantInt
>(II
.getOperand(ImageDimIntr
->OffsetIndex
))) {
215 if (ConstantOffset
->isZero()) {
216 const AMDGPU::ImageDimIntrinsicInfo
*NewImageDimIntr
=
217 AMDGPU::getImageDimIntrinsicByBaseOpcode(
218 OffsetMappingInfo
->NoOffset
, ImageDimIntr
->Dim
);
219 return modifyIntrinsicCall(
220 II
, II
, NewImageDimIntr
->Intr
, IC
, [&](auto &Args
, auto &ArgTys
) {
221 Args
.erase(Args
.begin() + ImageDimIntr
->OffsetIndex
);
228 if (ST
->hasD16Images()) {
230 const AMDGPU::MIMGBaseOpcodeInfo
*BaseOpcode
=
231 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr
->BaseOpcode
);
233 if (BaseOpcode
->HasD16
) {
235 // If the only use of image intrinsic is a fptrunc (with conversion to
236 // half) then both fptrunc and image intrinsic will be replaced with image
237 // intrinsic with D16 flag.
238 if (II
.hasOneUse()) {
239 Instruction
*User
= II
.user_back();
241 if (User
->getOpcode() == Instruction::FPTrunc
&&
242 User
->getType()->getScalarType()->isHalfTy()) {
244 return modifyIntrinsicCall(II
, *User
, ImageDimIntr
->Intr
, IC
,
245 [&](auto &Args
, auto &ArgTys
) {
246 // Change return type of image intrinsic.
247 // Set it to return type of fptrunc.
248 ArgTys
[0] = User
->getType();
255 // Try to use A16 or G16
256 if (!ST
->hasA16() && !ST
->hasG16())
259 // Address is interpreted as float if the instruction has a sampler or as
260 // unsigned int if there is no sampler.
262 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr
->BaseOpcode
)->Sampler
;
263 bool FloatCoord
= false;
264 // true means derivatives can be converted to 16 bit, coordinates not
265 bool OnlyDerivatives
= false;
267 for (unsigned OperandIndex
= ImageDimIntr
->GradientStart
;
268 OperandIndex
< ImageDimIntr
->VAddrEnd
; OperandIndex
++) {
269 Value
*Coord
= II
.getOperand(OperandIndex
);
270 // If the values are not derived from 16-bit values, we cannot optimize.
271 if (!canSafelyConvertTo16Bit(*Coord
, HasSampler
)) {
272 if (OperandIndex
< ImageDimIntr
->CoordStart
||
273 ImageDimIntr
->GradientStart
== ImageDimIntr
->CoordStart
) {
276 // All gradients can be converted, so convert only them
277 OnlyDerivatives
= true;
281 assert(OperandIndex
== ImageDimIntr
->GradientStart
||
282 FloatCoord
== Coord
->getType()->isFloatingPointTy());
283 FloatCoord
= Coord
->getType()->isFloatingPointTy();
286 if (!OnlyDerivatives
&& !ST
->hasA16())
287 OnlyDerivatives
= true; // Only supports G16
289 // Check if there is a bias parameter and if it can be converted to f16
290 if (!OnlyDerivatives
&& ImageDimIntr
->NumBiasArgs
!= 0) {
291 Value
*Bias
= II
.getOperand(ImageDimIntr
->BiasIndex
);
293 "Only image instructions with a sampler can have a bias");
294 if (!canSafelyConvertTo16Bit(*Bias
, HasSampler
))
295 OnlyDerivatives
= true;
298 if (OnlyDerivatives
&& (!ST
->hasG16() || ImageDimIntr
->GradientStart
==
299 ImageDimIntr
->CoordStart
))
302 Type
*CoordType
= FloatCoord
? Type::getHalfTy(II
.getContext())
303 : Type::getInt16Ty(II
.getContext());
305 return modifyIntrinsicCall(
306 II
, II
, II
.getIntrinsicID(), IC
, [&](auto &Args
, auto &ArgTys
) {
307 ArgTys
[ImageDimIntr
->GradientTyArg
] = CoordType
;
308 if (!OnlyDerivatives
) {
309 ArgTys
[ImageDimIntr
->CoordTyArg
] = CoordType
;
311 // Change the bias type
312 if (ImageDimIntr
->NumBiasArgs
!= 0)
313 ArgTys
[ImageDimIntr
->BiasTyArg
] = Type::getHalfTy(II
.getContext());
317 OnlyDerivatives
? ImageDimIntr
->CoordStart
: ImageDimIntr
->VAddrEnd
;
318 for (unsigned OperandIndex
= ImageDimIntr
->GradientStart
;
319 OperandIndex
< EndIndex
; OperandIndex
++) {
321 convertTo16Bit(*II
.getOperand(OperandIndex
), IC
.Builder
);
325 if (!OnlyDerivatives
&& ImageDimIntr
->NumBiasArgs
!= 0) {
326 Value
*Bias
= II
.getOperand(ImageDimIntr
->BiasIndex
);
327 Args
[ImageDimIntr
->BiasIndex
] = convertTo16Bit(*Bias
, IC
.Builder
);
332 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction
&I
,
333 const Value
*Op0
, const Value
*Op1
,
334 InstCombiner
&IC
) const {
335 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
336 // infinity, gives +0.0. If we can prove we don't have one of the special
337 // cases then we can use a normal multiply instead.
338 // TODO: Create and use isKnownFiniteNonZero instead of just matching
340 if (match(Op0
, PatternMatch::m_FiniteNonZero()) ||
341 match(Op1
, PatternMatch::m_FiniteNonZero())) {
342 // One operand is not zero or infinity or NaN.
346 SimplifyQuery SQ
= IC
.getSimplifyQuery().getWithInstruction(&I
);
347 if (isKnownNeverInfOrNaN(Op0
, /*Depth=*/0, SQ
) &&
348 isKnownNeverInfOrNaN(Op1
, /*Depth=*/0, SQ
)) {
349 // Neither operand is infinity or NaN.
355 /// Match an fpext from half to float, or a constant we can convert.
356 static bool matchFPExtFromF16(Value
*Arg
, Value
*&FPExtSrc
) {
357 if (match(Arg
, m_OneUse(m_FPExt(m_Value(FPExtSrc
)))))
358 return FPExtSrc
->getType()->isHalfTy();
361 if (match(Arg
, m_ConstantFP(CFP
))) {
363 APFloat
Val(CFP
->getValueAPF());
364 Val
.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven
, &LosesInfo
);
368 FPExtSrc
= ConstantFP::get(Type::getHalfTy(Arg
->getContext()), Val
);
375 // Trim all zero components from the end of the vector \p UseV and return
376 // an appropriate bitset with known elements.
377 static APInt
trimTrailingZerosInVector(InstCombiner
&IC
, Value
*UseV
,
379 auto *VTy
= cast
<FixedVectorType
>(UseV
->getType());
380 unsigned VWidth
= VTy
->getNumElements();
381 APInt DemandedElts
= APInt::getAllOnes(VWidth
);
383 for (int i
= VWidth
- 1; i
> 0; --i
) {
384 auto *Elt
= findScalarElement(UseV
, i
);
388 if (auto *ConstElt
= dyn_cast
<Constant
>(Elt
)) {
389 if (!ConstElt
->isNullValue() && !isa
<UndefValue
>(Elt
))
395 DemandedElts
.clearBit(i
);
401 // Trim elements of the end of the vector \p V, if they are
402 // equal to the first element of the vector.
403 static APInt
defaultComponentBroadcast(Value
*V
) {
404 auto *VTy
= cast
<FixedVectorType
>(V
->getType());
405 unsigned VWidth
= VTy
->getNumElements();
406 APInt DemandedElts
= APInt::getAllOnes(VWidth
);
407 Value
*FirstComponent
= findScalarElement(V
, 0);
409 SmallVector
<int> ShuffleMask
;
410 if (auto *SVI
= dyn_cast
<ShuffleVectorInst
>(V
))
411 SVI
->getShuffleMask(ShuffleMask
);
413 for (int I
= VWidth
- 1; I
> 0; --I
) {
414 if (ShuffleMask
.empty()) {
415 auto *Elt
= findScalarElement(V
, I
);
416 if (!Elt
|| (Elt
!= FirstComponent
&& !isa
<UndefValue
>(Elt
)))
419 // Detect identical elements in the shufflevector result, even though
420 // findScalarElement cannot tell us what that element is.
421 if (ShuffleMask
[I
] != ShuffleMask
[0] && ShuffleMask
[I
] != PoisonMaskElem
)
424 DemandedElts
.clearBit(I
);
430 static Value
*simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner
&IC
,
436 /// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
437 static bool canContractSqrtToRsq(const FPMathOperator
*SqrtOp
) {
438 return (SqrtOp
->getType()->isFloatTy() &&
439 (SqrtOp
->hasApproxFunc() || SqrtOp
->getFPAccuracy() >= 1.0f
)) ||
440 SqrtOp
->getType()->isHalfTy();
443 std::optional
<Instruction
*>
444 GCNTTIImpl::instCombineIntrinsic(InstCombiner
&IC
, IntrinsicInst
&II
) const {
445 Intrinsic::ID IID
= II
.getIntrinsicID();
447 case Intrinsic::amdgcn_rcp
: {
448 Value
*Src
= II
.getArgOperand(0);
450 // TODO: Move to ConstantFolding/InstSimplify?
451 if (isa
<UndefValue
>(Src
)) {
452 Type
*Ty
= II
.getType();
453 auto *QNaN
= ConstantFP::get(Ty
, APFloat::getQNaN(Ty
->getFltSemantics()));
454 return IC
.replaceInstUsesWith(II
, QNaN
);
460 if (const ConstantFP
*C
= dyn_cast
<ConstantFP
>(Src
)) {
461 const APFloat
&ArgVal
= C
->getValueAPF();
462 APFloat
Val(ArgVal
.getSemantics(), 1);
463 Val
.divide(ArgVal
, APFloat::rmNearestTiesToEven
);
465 // This is more precise than the instruction may give.
467 // TODO: The instruction always flushes denormal results (except for f16),
469 return IC
.replaceInstUsesWith(II
, ConstantFP::get(II
.getContext(), Val
));
472 FastMathFlags FMF
= cast
<FPMathOperator
>(II
).getFastMathFlags();
473 if (!FMF
.allowContract())
475 auto *SrcCI
= dyn_cast
<IntrinsicInst
>(Src
);
479 auto IID
= SrcCI
->getIntrinsicID();
480 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
482 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
484 if (IID
== Intrinsic::amdgcn_sqrt
|| IID
== Intrinsic::sqrt
) {
485 const FPMathOperator
*SqrtOp
= cast
<FPMathOperator
>(SrcCI
);
486 FastMathFlags InnerFMF
= SqrtOp
->getFastMathFlags();
487 if (!InnerFMF
.allowContract() || !SrcCI
->hasOneUse())
490 if (IID
== Intrinsic::sqrt
&& !canContractSqrtToRsq(SqrtOp
))
493 Function
*NewDecl
= Intrinsic::getDeclaration(
494 SrcCI
->getModule(), Intrinsic::amdgcn_rsq
, {SrcCI
->getType()});
497 II
.setFastMathFlags(InnerFMF
);
499 II
.setCalledFunction(NewDecl
);
500 return IC
.replaceOperand(II
, 0, SrcCI
->getArgOperand(0));
505 case Intrinsic::amdgcn_sqrt
:
506 case Intrinsic::amdgcn_rsq
: {
507 Value
*Src
= II
.getArgOperand(0);
509 // TODO: Move to ConstantFolding/InstSimplify?
510 if (isa
<UndefValue
>(Src
)) {
511 Type
*Ty
= II
.getType();
512 auto *QNaN
= ConstantFP::get(Ty
, APFloat::getQNaN(Ty
->getFltSemantics()));
513 return IC
.replaceInstUsesWith(II
, QNaN
);
516 // f16 amdgcn.sqrt is identical to regular sqrt.
517 if (IID
== Intrinsic::amdgcn_sqrt
&& Src
->getType()->isHalfTy()) {
518 Function
*NewDecl
= Intrinsic::getDeclaration(
519 II
.getModule(), Intrinsic::sqrt
, {II
.getType()});
520 II
.setCalledFunction(NewDecl
);
526 case Intrinsic::amdgcn_log
:
527 case Intrinsic::amdgcn_exp2
: {
528 const bool IsLog
= IID
== Intrinsic::amdgcn_log
;
529 const bool IsExp
= IID
== Intrinsic::amdgcn_exp2
;
530 Value
*Src
= II
.getArgOperand(0);
531 Type
*Ty
= II
.getType();
533 if (isa
<PoisonValue
>(Src
))
534 return IC
.replaceInstUsesWith(II
, Src
);
536 if (IC
.getSimplifyQuery().isUndefValue(Src
))
537 return IC
.replaceInstUsesWith(II
, ConstantFP::getNaN(Ty
));
539 if (ConstantFP
*C
= dyn_cast
<ConstantFP
>(Src
)) {
540 if (C
->isInfinity()) {
541 // exp2(+inf) -> +inf
542 // log2(+inf) -> +inf
543 if (!C
->isNegative())
544 return IC
.replaceInstUsesWith(II
, C
);
547 if (IsExp
&& C
->isNegative())
548 return IC
.replaceInstUsesWith(II
, ConstantFP::getZero(Ty
));
555 Constant
*Quieted
= ConstantFP::get(Ty
, C
->getValue().makeQuiet());
556 return IC
.replaceInstUsesWith(II
, Quieted
);
559 // f32 instruction doesn't handle denormals, f16 does.
560 if (C
->isZero() || (C
->getValue().isDenormal() && Ty
->isFloatTy())) {
561 Constant
*FoldedValue
= IsLog
? ConstantFP::getInfinity(Ty
, true)
562 : ConstantFP::get(Ty
, 1.0);
563 return IC
.replaceInstUsesWith(II
, FoldedValue
);
566 if (IsLog
&& C
->isNegative())
567 return IC
.replaceInstUsesWith(II
, ConstantFP::getNaN(Ty
));
569 // TODO: Full constant folding matching hardware behavior.
574 case Intrinsic::amdgcn_frexp_mant
:
575 case Intrinsic::amdgcn_frexp_exp
: {
576 Value
*Src
= II
.getArgOperand(0);
577 if (const ConstantFP
*C
= dyn_cast
<ConstantFP
>(Src
)) {
579 APFloat Significand
=
580 frexp(C
->getValueAPF(), Exp
, APFloat::rmNearestTiesToEven
);
582 if (IID
== Intrinsic::amdgcn_frexp_mant
) {
583 return IC
.replaceInstUsesWith(
584 II
, ConstantFP::get(II
.getContext(), Significand
));
587 // Match instruction special case behavior.
588 if (Exp
== APFloat::IEK_NaN
|| Exp
== APFloat::IEK_Inf
)
591 return IC
.replaceInstUsesWith(II
, ConstantInt::get(II
.getType(), Exp
));
594 if (isa
<UndefValue
>(Src
)) {
595 return IC
.replaceInstUsesWith(II
, UndefValue::get(II
.getType()));
600 case Intrinsic::amdgcn_class
: {
601 Value
*Src0
= II
.getArgOperand(0);
602 Value
*Src1
= II
.getArgOperand(1);
603 const ConstantInt
*CMask
= dyn_cast
<ConstantInt
>(Src1
);
605 II
.setCalledOperand(Intrinsic::getDeclaration(
606 II
.getModule(), Intrinsic::is_fpclass
, Src0
->getType()));
608 // Clamp any excess bits, as they're illegal for the generic intrinsic.
609 II
.setArgOperand(1, ConstantInt::get(Src1
->getType(),
610 CMask
->getZExtValue() & fcAllFlags
));
615 if (isa
<PoisonValue
>(Src0
) || isa
<PoisonValue
>(Src1
))
616 return IC
.replaceInstUsesWith(II
, PoisonValue::get(II
.getType()));
618 // llvm.amdgcn.class(_, undef) -> false
619 if (IC
.getSimplifyQuery().isUndefValue(Src1
))
620 return IC
.replaceInstUsesWith(II
, ConstantInt::get(II
.getType(), false));
622 // llvm.amdgcn.class(undef, mask) -> mask != 0
623 if (IC
.getSimplifyQuery().isUndefValue(Src0
)) {
624 Value
*CmpMask
= IC
.Builder
.CreateICmpNE(
625 Src1
, ConstantInt::getNullValue(Src1
->getType()));
626 return IC
.replaceInstUsesWith(II
, CmpMask
);
630 case Intrinsic::amdgcn_cvt_pkrtz
: {
631 Value
*Src0
= II
.getArgOperand(0);
632 Value
*Src1
= II
.getArgOperand(1);
633 if (const ConstantFP
*C0
= dyn_cast
<ConstantFP
>(Src0
)) {
634 if (const ConstantFP
*C1
= dyn_cast
<ConstantFP
>(Src1
)) {
635 const fltSemantics
&HalfSem
=
636 II
.getType()->getScalarType()->getFltSemantics();
638 APFloat Val0
= C0
->getValueAPF();
639 APFloat Val1
= C1
->getValueAPF();
640 Val0
.convert(HalfSem
, APFloat::rmTowardZero
, &LosesInfo
);
641 Val1
.convert(HalfSem
, APFloat::rmTowardZero
, &LosesInfo
);
644 ConstantVector::get({ConstantFP::get(II
.getContext(), Val0
),
645 ConstantFP::get(II
.getContext(), Val1
)});
646 return IC
.replaceInstUsesWith(II
, Folded
);
650 if (isa
<UndefValue
>(Src0
) && isa
<UndefValue
>(Src1
)) {
651 return IC
.replaceInstUsesWith(II
, UndefValue::get(II
.getType()));
656 case Intrinsic::amdgcn_cvt_pknorm_i16
:
657 case Intrinsic::amdgcn_cvt_pknorm_u16
:
658 case Intrinsic::amdgcn_cvt_pk_i16
:
659 case Intrinsic::amdgcn_cvt_pk_u16
: {
660 Value
*Src0
= II
.getArgOperand(0);
661 Value
*Src1
= II
.getArgOperand(1);
663 if (isa
<UndefValue
>(Src0
) && isa
<UndefValue
>(Src1
)) {
664 return IC
.replaceInstUsesWith(II
, UndefValue::get(II
.getType()));
669 case Intrinsic::amdgcn_ubfe
:
670 case Intrinsic::amdgcn_sbfe
: {
671 // Decompose simple cases into standard shifts.
672 Value
*Src
= II
.getArgOperand(0);
673 if (isa
<UndefValue
>(Src
)) {
674 return IC
.replaceInstUsesWith(II
, Src
);
678 Type
*Ty
= II
.getType();
679 unsigned IntSize
= Ty
->getIntegerBitWidth();
681 ConstantInt
*CWidth
= dyn_cast
<ConstantInt
>(II
.getArgOperand(2));
683 Width
= CWidth
->getZExtValue();
684 if ((Width
& (IntSize
- 1)) == 0) {
685 return IC
.replaceInstUsesWith(II
, ConstantInt::getNullValue(Ty
));
688 // Hardware ignores high bits, so remove those.
689 if (Width
>= IntSize
) {
690 return IC
.replaceOperand(
691 II
, 2, ConstantInt::get(CWidth
->getType(), Width
& (IntSize
- 1)));
696 ConstantInt
*COffset
= dyn_cast
<ConstantInt
>(II
.getArgOperand(1));
698 Offset
= COffset
->getZExtValue();
699 if (Offset
>= IntSize
) {
700 return IC
.replaceOperand(
702 ConstantInt::get(COffset
->getType(), Offset
& (IntSize
- 1)));
706 bool Signed
= IID
== Intrinsic::amdgcn_sbfe
;
708 if (!CWidth
|| !COffset
)
711 // The case of Width == 0 is handled above, which makes this transformation
712 // safe. If Width == 0, then the ashr and lshr instructions become poison
713 // value since the shift amount would be equal to the bit size.
716 // TODO: This allows folding to undef when the hardware has specific
718 if (Offset
+ Width
< IntSize
) {
719 Value
*Shl
= IC
.Builder
.CreateShl(Src
, IntSize
- Offset
- Width
);
720 Value
*RightShift
= Signed
? IC
.Builder
.CreateAShr(Shl
, IntSize
- Width
)
721 : IC
.Builder
.CreateLShr(Shl
, IntSize
- Width
);
722 RightShift
->takeName(&II
);
723 return IC
.replaceInstUsesWith(II
, RightShift
);
726 Value
*RightShift
= Signed
? IC
.Builder
.CreateAShr(Src
, Offset
)
727 : IC
.Builder
.CreateLShr(Src
, Offset
);
729 RightShift
->takeName(&II
);
730 return IC
.replaceInstUsesWith(II
, RightShift
);
732 case Intrinsic::amdgcn_exp
:
733 case Intrinsic::amdgcn_exp_row
:
734 case Intrinsic::amdgcn_exp_compr
: {
735 ConstantInt
*En
= cast
<ConstantInt
>(II
.getArgOperand(1));
736 unsigned EnBits
= En
->getZExtValue();
738 break; // All inputs enabled.
740 bool IsCompr
= IID
== Intrinsic::amdgcn_exp_compr
;
741 bool Changed
= false;
742 for (int I
= 0; I
< (IsCompr
? 2 : 4); ++I
) {
743 if ((!IsCompr
&& (EnBits
& (1 << I
)) == 0) ||
744 (IsCompr
&& ((EnBits
& (0x3 << (2 * I
))) == 0))) {
745 Value
*Src
= II
.getArgOperand(I
+ 2);
746 if (!isa
<UndefValue
>(Src
)) {
747 IC
.replaceOperand(II
, I
+ 2, UndefValue::get(Src
->getType()));
759 case Intrinsic::amdgcn_fmed3
: {
760 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
763 Value
*Src0
= II
.getArgOperand(0);
764 Value
*Src1
= II
.getArgOperand(1);
765 Value
*Src2
= II
.getArgOperand(2);
767 // Checking for NaN before canonicalization provides better fidelity when
768 // mapping other operations onto fmed3 since the order of operands is
771 if (match(Src0
, PatternMatch::m_NaN()) || isa
<UndefValue
>(Src0
)) {
772 V
= IC
.Builder
.CreateMinNum(Src1
, Src2
);
773 } else if (match(Src1
, PatternMatch::m_NaN()) || isa
<UndefValue
>(Src1
)) {
774 V
= IC
.Builder
.CreateMinNum(Src0
, Src2
);
775 } else if (match(Src2
, PatternMatch::m_NaN()) || isa
<UndefValue
>(Src2
)) {
776 V
= IC
.Builder
.CreateMaxNum(Src0
, Src1
);
780 if (auto *CI
= dyn_cast
<CallInst
>(V
)) {
781 CI
->copyFastMathFlags(&II
);
784 return IC
.replaceInstUsesWith(II
, V
);
788 // Canonicalize constants to RHS operands.
790 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
791 if (isa
<Constant
>(Src0
) && !isa
<Constant
>(Src1
)) {
792 std::swap(Src0
, Src1
);
796 if (isa
<Constant
>(Src1
) && !isa
<Constant
>(Src2
)) {
797 std::swap(Src1
, Src2
);
801 if (isa
<Constant
>(Src0
) && !isa
<Constant
>(Src1
)) {
802 std::swap(Src0
, Src1
);
807 II
.setArgOperand(0, Src0
);
808 II
.setArgOperand(1, Src1
);
809 II
.setArgOperand(2, Src2
);
813 if (const ConstantFP
*C0
= dyn_cast
<ConstantFP
>(Src0
)) {
814 if (const ConstantFP
*C1
= dyn_cast
<ConstantFP
>(Src1
)) {
815 if (const ConstantFP
*C2
= dyn_cast
<ConstantFP
>(Src2
)) {
816 APFloat Result
= fmed3AMDGCN(C0
->getValueAPF(), C1
->getValueAPF(),
818 return IC
.replaceInstUsesWith(
819 II
, ConstantFP::get(IC
.Builder
.getContext(), Result
));
824 if (!ST
->hasMed3_16())
829 // Repeat floating-point width reduction done for minnum/maxnum.
830 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
831 if (matchFPExtFromF16(Src0
, X
) && matchFPExtFromF16(Src1
, Y
) &&
832 matchFPExtFromF16(Src2
, Z
)) {
833 Value
*NewCall
= IC
.Builder
.CreateIntrinsic(IID
, {X
->getType()},
834 {X
, Y
, Z
}, &II
, II
.getName());
835 return new FPExtInst(NewCall
, II
.getType());
840 case Intrinsic::amdgcn_icmp
:
841 case Intrinsic::amdgcn_fcmp
: {
842 const ConstantInt
*CC
= cast
<ConstantInt
>(II
.getArgOperand(2));
843 // Guard against invalid arguments.
844 int64_t CCVal
= CC
->getZExtValue();
845 bool IsInteger
= IID
== Intrinsic::amdgcn_icmp
;
846 if ((IsInteger
&& (CCVal
< CmpInst::FIRST_ICMP_PREDICATE
||
847 CCVal
> CmpInst::LAST_ICMP_PREDICATE
)) ||
848 (!IsInteger
&& (CCVal
< CmpInst::FIRST_FCMP_PREDICATE
||
849 CCVal
> CmpInst::LAST_FCMP_PREDICATE
)))
852 Value
*Src0
= II
.getArgOperand(0);
853 Value
*Src1
= II
.getArgOperand(1);
855 if (auto *CSrc0
= dyn_cast
<Constant
>(Src0
)) {
856 if (auto *CSrc1
= dyn_cast
<Constant
>(Src1
)) {
857 Constant
*CCmp
= ConstantFoldCompareInstOperands(
858 (ICmpInst::Predicate
)CCVal
, CSrc0
, CSrc1
, DL
);
859 if (CCmp
&& CCmp
->isNullValue()) {
860 return IC
.replaceInstUsesWith(
861 II
, IC
.Builder
.CreateSExt(CCmp
, II
.getType()));
864 // The result of V_ICMP/V_FCMP assembly instructions (which this
865 // intrinsic exposes) is one bit per thread, masked with the EXEC
866 // register (which contains the bitmask of live threads). So a
867 // comparison that always returns true is the same as a read of the
869 Function
*NewF
= Intrinsic::getDeclaration(
870 II
.getModule(), Intrinsic::read_register
, II
.getType());
871 Metadata
*MDArgs
[] = {MDString::get(II
.getContext(), "exec")};
872 MDNode
*MD
= MDNode::get(II
.getContext(), MDArgs
);
873 Value
*Args
[] = {MetadataAsValue::get(II
.getContext(), MD
)};
874 CallInst
*NewCall
= IC
.Builder
.CreateCall(NewF
, Args
);
875 NewCall
->addFnAttr(Attribute::Convergent
);
876 NewCall
->takeName(&II
);
877 return IC
.replaceInstUsesWith(II
, NewCall
);
880 // Canonicalize constants to RHS.
881 CmpInst::Predicate SwapPred
=
882 CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate
>(CCVal
));
883 II
.setArgOperand(0, Src1
);
884 II
.setArgOperand(1, Src0
);
886 2, ConstantInt::get(CC
->getType(), static_cast<int>(SwapPred
)));
890 if (CCVal
!= CmpInst::ICMP_EQ
&& CCVal
!= CmpInst::ICMP_NE
)
893 // Canonicalize compare eq with true value to compare != 0
894 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
895 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
896 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
897 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
899 if (CCVal
== CmpInst::ICMP_EQ
&&
900 ((match(Src1
, PatternMatch::m_One()) &&
901 match(Src0
, m_ZExt(PatternMatch::m_Value(ExtSrc
)))) ||
902 (match(Src1
, PatternMatch::m_AllOnes()) &&
903 match(Src0
, m_SExt(PatternMatch::m_Value(ExtSrc
))))) &&
904 ExtSrc
->getType()->isIntegerTy(1)) {
905 IC
.replaceOperand(II
, 1, ConstantInt::getNullValue(Src1
->getType()));
906 IC
.replaceOperand(II
, 2,
907 ConstantInt::get(CC
->getType(), CmpInst::ICMP_NE
));
911 CmpInst::Predicate SrcPred
;
915 // Fold compare eq/ne with 0 from a compare result as the predicate to the
916 // intrinsic. The typical use is a wave vote function in the library, which
917 // will be fed from a user code condition compared with 0. Fold in the
918 // redundant compare.
920 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
921 // -> llvm.amdgcn.[if]cmp(a, b, pred)
923 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
924 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
925 if (match(Src1
, PatternMatch::m_Zero()) &&
926 match(Src0
, PatternMatch::m_ZExtOrSExt(
927 m_Cmp(SrcPred
, PatternMatch::m_Value(SrcLHS
),
928 PatternMatch::m_Value(SrcRHS
))))) {
929 if (CCVal
== CmpInst::ICMP_EQ
)
930 SrcPred
= CmpInst::getInversePredicate(SrcPred
);
932 Intrinsic::ID NewIID
= CmpInst::isFPPredicate(SrcPred
)
933 ? Intrinsic::amdgcn_fcmp
934 : Intrinsic::amdgcn_icmp
;
936 Type
*Ty
= SrcLHS
->getType();
937 if (auto *CmpType
= dyn_cast
<IntegerType
>(Ty
)) {
938 // Promote to next legal integer type.
939 unsigned Width
= CmpType
->getBitWidth();
940 unsigned NewWidth
= Width
;
942 // Don't do anything for i1 comparisons.
948 else if (Width
<= 32)
950 else if (Width
<= 64)
953 break; // Can't handle this.
955 if (Width
!= NewWidth
) {
956 IntegerType
*CmpTy
= IC
.Builder
.getIntNTy(NewWidth
);
957 if (CmpInst::isSigned(SrcPred
)) {
958 SrcLHS
= IC
.Builder
.CreateSExt(SrcLHS
, CmpTy
);
959 SrcRHS
= IC
.Builder
.CreateSExt(SrcRHS
, CmpTy
);
961 SrcLHS
= IC
.Builder
.CreateZExt(SrcLHS
, CmpTy
);
962 SrcRHS
= IC
.Builder
.CreateZExt(SrcRHS
, CmpTy
);
965 } else if (!Ty
->isFloatTy() && !Ty
->isDoubleTy() && !Ty
->isHalfTy())
968 Function
*NewF
= Intrinsic::getDeclaration(
969 II
.getModule(), NewIID
, {II
.getType(), SrcLHS
->getType()});
970 Value
*Args
[] = {SrcLHS
, SrcRHS
,
971 ConstantInt::get(CC
->getType(), SrcPred
)};
972 CallInst
*NewCall
= IC
.Builder
.CreateCall(NewF
, Args
);
973 NewCall
->takeName(&II
);
974 return IC
.replaceInstUsesWith(II
, NewCall
);
979 case Intrinsic::amdgcn_mbcnt_hi
: {
980 // exec_hi is all 0, so this is just a copy.
982 return IC
.replaceInstUsesWith(II
, II
.getArgOperand(1));
985 case Intrinsic::amdgcn_ballot
: {
986 if (auto *Src
= dyn_cast
<ConstantInt
>(II
.getArgOperand(0))) {
988 // amdgcn.ballot(i1 0) is zero.
989 return IC
.replaceInstUsesWith(II
, Constant::getNullValue(II
.getType()));
992 if (ST
->isWave32() && II
.getType()->getIntegerBitWidth() == 64) {
993 // %b64 = call i64 ballot.i64(...)
995 // %b32 = call i32 ballot.i32(...)
996 // %b64 = zext i32 %b32 to i64
997 Value
*Call
= IC
.Builder
.CreateZExt(
998 IC
.Builder
.CreateIntrinsic(Intrinsic::amdgcn_ballot
,
999 {IC
.Builder
.getInt32Ty()},
1000 {II
.getArgOperand(0)}),
1002 Call
->takeName(&II
);
1003 return IC
.replaceInstUsesWith(II
, Call
);
1007 case Intrinsic::amdgcn_wqm_vote
: {
1008 // wqm_vote is identity when the argument is constant.
1009 if (!isa
<Constant
>(II
.getArgOperand(0)))
1012 return IC
.replaceInstUsesWith(II
, II
.getArgOperand(0));
1014 case Intrinsic::amdgcn_kill
: {
1015 const ConstantInt
*C
= dyn_cast
<ConstantInt
>(II
.getArgOperand(0));
1016 if (!C
|| !C
->getZExtValue())
1019 // amdgcn.kill(i1 1) is a no-op
1020 return IC
.eraseInstFromFunction(II
);
1022 case Intrinsic::amdgcn_update_dpp
: {
1023 Value
*Old
= II
.getArgOperand(0);
1025 auto *BC
= cast
<ConstantInt
>(II
.getArgOperand(5));
1026 auto *RM
= cast
<ConstantInt
>(II
.getArgOperand(3));
1027 auto *BM
= cast
<ConstantInt
>(II
.getArgOperand(4));
1028 if (BC
->isZeroValue() || RM
->getZExtValue() != 0xF ||
1029 BM
->getZExtValue() != 0xF || isa
<UndefValue
>(Old
))
1032 // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
1033 return IC
.replaceOperand(II
, 0, UndefValue::get(Old
->getType()));
1035 case Intrinsic::amdgcn_permlane16
:
1036 case Intrinsic::amdgcn_permlane16_var
:
1037 case Intrinsic::amdgcn_permlanex16
:
1038 case Intrinsic::amdgcn_permlanex16_var
: {
1039 // Discard vdst_in if it's not going to be read.
1040 Value
*VDstIn
= II
.getArgOperand(0);
1041 if (isa
<UndefValue
>(VDstIn
))
1044 // FetchInvalid operand idx.
1045 unsigned int FiIdx
= (IID
== Intrinsic::amdgcn_permlane16
||
1046 IID
== Intrinsic::amdgcn_permlanex16
)
1047 ? 4 /* for permlane16 and permlanex16 */
1048 : 3; /* for permlane16_var and permlanex16_var */
1050 // BoundCtrl operand idx.
1051 // For permlane16 and permlanex16 it should be 5
1052 // For Permlane16_var and permlanex16_var it should be 4
1053 unsigned int BcIdx
= FiIdx
+ 1;
1055 ConstantInt
*FetchInvalid
= cast
<ConstantInt
>(II
.getArgOperand(FiIdx
));
1056 ConstantInt
*BoundCtrl
= cast
<ConstantInt
>(II
.getArgOperand(BcIdx
));
1057 if (!FetchInvalid
->getZExtValue() && !BoundCtrl
->getZExtValue())
1060 return IC
.replaceOperand(II
, 0, UndefValue::get(VDstIn
->getType()));
1062 case Intrinsic::amdgcn_permlane64
:
1063 // A constant value is trivially uniform.
1064 if (Constant
*C
= dyn_cast
<Constant
>(II
.getArgOperand(0))) {
1065 return IC
.replaceInstUsesWith(II
, C
);
1068 case Intrinsic::amdgcn_readfirstlane
:
1069 case Intrinsic::amdgcn_readlane
: {
1070 // A constant value is trivially uniform.
1071 if (Constant
*C
= dyn_cast
<Constant
>(II
.getArgOperand(0))) {
1072 return IC
.replaceInstUsesWith(II
, C
);
1075 // The rest of these may not be safe if the exec may not be the same between
1077 Value
*Src
= II
.getArgOperand(0);
1078 Instruction
*SrcInst
= dyn_cast
<Instruction
>(Src
);
1079 if (SrcInst
&& SrcInst
->getParent() != II
.getParent())
1082 // readfirstlane (readfirstlane x) -> readfirstlane x
1083 // readlane (readfirstlane x), y -> readfirstlane x
1085 PatternMatch::m_Intrinsic
<Intrinsic::amdgcn_readfirstlane
>())) {
1086 return IC
.replaceInstUsesWith(II
, Src
);
1089 if (IID
== Intrinsic::amdgcn_readfirstlane
) {
1090 // readfirstlane (readlane x, y) -> readlane x, y
1091 if (match(Src
, PatternMatch::m_Intrinsic
<Intrinsic::amdgcn_readlane
>())) {
1092 return IC
.replaceInstUsesWith(II
, Src
);
1095 // readlane (readlane x, y), y -> readlane x, y
1096 if (match(Src
, PatternMatch::m_Intrinsic
<Intrinsic::amdgcn_readlane
>(
1097 PatternMatch::m_Value(),
1098 PatternMatch::m_Specific(II
.getArgOperand(1))))) {
1099 return IC
.replaceInstUsesWith(II
, Src
);
1105 case Intrinsic::amdgcn_trig_preop
: {
1106 // The intrinsic is declared with name mangling, but currently the
1107 // instruction only exists for f64
1108 if (!II
.getType()->isDoubleTy())
1111 Value
*Src
= II
.getArgOperand(0);
1112 Value
*Segment
= II
.getArgOperand(1);
1113 if (isa
<PoisonValue
>(Src
) || isa
<PoisonValue
>(Segment
))
1114 return IC
.replaceInstUsesWith(II
, PoisonValue::get(II
.getType()));
1116 if (isa
<UndefValue
>(Src
)) {
1117 auto *QNaN
= ConstantFP::get(
1118 II
.getType(), APFloat::getQNaN(II
.getType()->getFltSemantics()));
1119 return IC
.replaceInstUsesWith(II
, QNaN
);
1122 const ConstantFP
*Csrc
= dyn_cast
<ConstantFP
>(Src
);
1126 if (II
.isStrictFP())
1129 const APFloat
&Fsrc
= Csrc
->getValueAPF();
1131 auto *Quieted
= ConstantFP::get(II
.getType(), Fsrc
.makeQuiet());
1132 return IC
.replaceInstUsesWith(II
, Quieted
);
1135 const ConstantInt
*Cseg
= dyn_cast
<ConstantInt
>(Segment
);
1139 unsigned Exponent
= (Fsrc
.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff;
1140 unsigned SegmentVal
= Cseg
->getValue().trunc(5).getZExtValue();
1141 unsigned Shift
= SegmentVal
* 53;
1142 if (Exponent
> 1077)
1143 Shift
+= Exponent
- 1077;
1146 static const uint32_t TwoByPi
[] = {
1147 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1148 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1149 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1150 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1151 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1152 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1155 // Return 0 for outbound segment (hardware behavior).
1156 unsigned Idx
= Shift
>> 5;
1157 if (Idx
+ 2 >= std::size(TwoByPi
)) {
1158 APFloat Zero
= APFloat::getZero(II
.getType()->getFltSemantics());
1159 return IC
.replaceInstUsesWith(II
, ConstantFP::get(II
.getType(), Zero
));
1162 unsigned BShift
= Shift
& 0x1f;
1163 uint64_t Thi
= Make_64(TwoByPi
[Idx
], TwoByPi
[Idx
+ 1]);
1164 uint64_t Tlo
= Make_64(TwoByPi
[Idx
+ 2], 0);
1166 Thi
= (Thi
<< BShift
) | (Tlo
>> (64 - BShift
));
1168 APFloat Result
= APFloat((double)Thi
);
1170 int Scale
= -53 - Shift
;
1171 if (Exponent
>= 1968)
1174 Result
= scalbn(Result
, Scale
, RoundingMode::NearestTiesToEven
);
1175 return IC
.replaceInstUsesWith(II
, ConstantFP::get(Src
->getType(), Result
));
1177 case Intrinsic::amdgcn_fmul_legacy
: {
1178 Value
*Op0
= II
.getArgOperand(0);
1179 Value
*Op1
= II
.getArgOperand(1);
1181 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1182 // infinity, gives +0.0.
1183 // TODO: Move to InstSimplify?
1184 if (match(Op0
, PatternMatch::m_AnyZeroFP()) ||
1185 match(Op1
, PatternMatch::m_AnyZeroFP()))
1186 return IC
.replaceInstUsesWith(II
, ConstantFP::getZero(II
.getType()));
1188 // If we can prove we don't have one of the special cases then we can use a
1189 // normal fmul instruction instead.
1190 if (canSimplifyLegacyMulToMul(II
, Op0
, Op1
, IC
)) {
1191 auto *FMul
= IC
.Builder
.CreateFMulFMF(Op0
, Op1
, &II
);
1192 FMul
->takeName(&II
);
1193 return IC
.replaceInstUsesWith(II
, FMul
);
1197 case Intrinsic::amdgcn_fma_legacy
: {
1198 Value
*Op0
= II
.getArgOperand(0);
1199 Value
*Op1
= II
.getArgOperand(1);
1200 Value
*Op2
= II
.getArgOperand(2);
1202 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1203 // infinity, gives +0.0.
1204 // TODO: Move to InstSimplify?
1205 if (match(Op0
, PatternMatch::m_AnyZeroFP()) ||
1206 match(Op1
, PatternMatch::m_AnyZeroFP())) {
1207 // It's tempting to just return Op2 here, but that would give the wrong
1208 // result if Op2 was -0.0.
1209 auto *Zero
= ConstantFP::getZero(II
.getType());
1210 auto *FAdd
= IC
.Builder
.CreateFAddFMF(Zero
, Op2
, &II
);
1211 FAdd
->takeName(&II
);
1212 return IC
.replaceInstUsesWith(II
, FAdd
);
1215 // If we can prove we don't have one of the special cases then we can use a
1216 // normal fma instead.
1217 if (canSimplifyLegacyMulToMul(II
, Op0
, Op1
, IC
)) {
1218 II
.setCalledOperand(Intrinsic::getDeclaration(
1219 II
.getModule(), Intrinsic::fma
, II
.getType()));
1224 case Intrinsic::amdgcn_is_shared
:
1225 case Intrinsic::amdgcn_is_private
: {
1226 if (isa
<UndefValue
>(II
.getArgOperand(0)))
1227 return IC
.replaceInstUsesWith(II
, UndefValue::get(II
.getType()));
1229 if (isa
<ConstantPointerNull
>(II
.getArgOperand(0)))
1230 return IC
.replaceInstUsesWith(II
, ConstantInt::getFalse(II
.getType()));
1233 case Intrinsic::amdgcn_raw_buffer_store_format
:
1234 case Intrinsic::amdgcn_struct_buffer_store_format
:
1235 case Intrinsic::amdgcn_raw_tbuffer_store
:
1236 case Intrinsic::amdgcn_struct_tbuffer_store
:
1237 case Intrinsic::amdgcn_image_store_1d
:
1238 case Intrinsic::amdgcn_image_store_1darray
:
1239 case Intrinsic::amdgcn_image_store_2d
:
1240 case Intrinsic::amdgcn_image_store_2darray
:
1241 case Intrinsic::amdgcn_image_store_2darraymsaa
:
1242 case Intrinsic::amdgcn_image_store_2dmsaa
:
1243 case Intrinsic::amdgcn_image_store_3d
:
1244 case Intrinsic::amdgcn_image_store_cube
:
1245 case Intrinsic::amdgcn_image_store_mip_1d
:
1246 case Intrinsic::amdgcn_image_store_mip_1darray
:
1247 case Intrinsic::amdgcn_image_store_mip_2d
:
1248 case Intrinsic::amdgcn_image_store_mip_2darray
:
1249 case Intrinsic::amdgcn_image_store_mip_3d
:
1250 case Intrinsic::amdgcn_image_store_mip_cube
: {
1251 if (!isa
<FixedVectorType
>(II
.getArgOperand(0)->getType()))
1255 if (ST
->hasDefaultComponentBroadcast())
1256 DemandedElts
= defaultComponentBroadcast(II
.getArgOperand(0));
1257 else if (ST
->hasDefaultComponentZero())
1258 DemandedElts
= trimTrailingZerosInVector(IC
, II
.getArgOperand(0), &II
);
1262 int DMaskIdx
= getAMDGPUImageDMaskIntrinsic(II
.getIntrinsicID()) ? 1 : -1;
1263 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC
, II
, DemandedElts
, DMaskIdx
,
1265 return IC
.eraseInstFromFunction(II
);
1271 if (const AMDGPU::ImageDimIntrinsicInfo
*ImageDimIntr
=
1272 AMDGPU::getImageDimIntrinsicInfo(II
.getIntrinsicID())) {
1273 return simplifyAMDGCNImageIntrinsic(ST
, ImageDimIntr
, II
, IC
);
1275 return std::nullopt
;
1278 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1280 /// The result of simplifying amdgcn image and buffer store intrinsics is updating
/// definitions of the intrinsic's vector argument, not Uses of the result like
1282 /// image and buffer loads.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have struct returns.
1285 static Value
*simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner
&IC
,
// NOTE(review): this chunk is a lossy extraction -- the gaps in the embedded
// numbering (e.g. orig 1286-1287, 1289, 1352-1356, 1436-1442) are dropped
// source lines, including part of the parameter list, several statements,
// and the function's return/closing brace. Comments below annotate only the
// code that is visible here; confirm gap contents against the full source.
1288 int DMaskIdx
, bool IsLoad
) {
// The demanded vector type: the call's result type for loads, or the type of
// the stored value (operand 0) for stores.
1290 auto *IIVTy
= cast
<FixedVectorType
>(IsLoad
? II
.getType()
1291 : II
.getOperand(0)->getType());
1292 unsigned VWidth
= IIVTy
->getNumElements();
1295 Type
*EltTy
= IIVTy
->getElementType();
// Save/restore the builder's insertion point; new instructions are created
// immediately before the intrinsic call II.
1297 IRBuilderBase::InsertPointGuard
Guard(IC
.Builder
);
1298 IC
.Builder
.SetInsertPoint(&II
);
1300 // Assume the arguments are unchanged and later override them, if needed.
1301 SmallVector
<Value
*, 16> Args(II
.args());
// ActiveBits is one past the highest demanded element index; countr_zero is
// the number of unused (undemanded) components at the front of the vector.
1306 const unsigned ActiveBits
= DemandedElts
.getActiveBits();
1307 const unsigned UnusedComponentsAtFront
= DemandedElts
.countr_zero();
1309 // Start assuming the prefix of elements is demanded, but possibly clear
1310 // some other bits if there are trailing zeros (unused components at front)
1311 // and update offset.
1312 DemandedElts
= (1 << ActiveBits
) - 1;
1314 if (UnusedComponentsAtFront
> 0) {
// Sentinel: "no usable offset operand for this intrinsic".
1315 static const unsigned InvalidOffsetIdx
= 0xf;
// Select the byte-offset operand index per intrinsic kind. NOTE(review): the
// lines that declare OffsetIdx and assign it for the raw/struct buffer-load
// cases are among the dropped lines.
1318 switch (II
.getIntrinsicID()) {
1319 case Intrinsic::amdgcn_raw_buffer_load
:
1320 case Intrinsic::amdgcn_raw_ptr_buffer_load
:
1323 case Intrinsic::amdgcn_s_buffer_load
:
1324 // If resulting type is vec3, there is no point in trimming the
1325 // load with updated offset, as the vec3 would most likely be widened to
1326 // vec4 anyway during lowering.
1327 if (ActiveBits
== 4 && UnusedComponentsAtFront
== 1)
1328 OffsetIdx
= InvalidOffsetIdx
;
1332 case Intrinsic::amdgcn_struct_buffer_load
:
1333 case Intrinsic::amdgcn_struct_ptr_buffer_load
:
1337 // TODO: handle tbuffer* intrinsics.
1338 OffsetIdx
= InvalidOffsetIdx
;
1342 if (OffsetIdx
!= InvalidOffsetIdx
) {
1343 // Clear demanded bits and update the offset.
1344 DemandedElts
&= ~((1 << UnusedComponentsAtFront
) - 1);
1345 auto *Offset
= Args
[OffsetIdx
];
// Advance the offset by the byte size of the skipped front components.
1346 unsigned SingleComponentSizeInBits
=
1347 IC
.getDataLayout().getTypeSizeInBits(EltTy
);
1348 unsigned OffsetAdd
=
1349 UnusedComponentsAtFront
* SingleComponentSizeInBits
/ 8;
1350 auto *OffsetAddVal
= ConstantInt::get(Offset
->getType(), OffsetAdd
);
1351 Args
[OffsetIdx
] = IC
.Builder
.CreateAdd(Offset
, OffsetAddVal
);
// dmask handling: the caller passes DMaskIdx == 1 only for image intrinsics
// (see the getAMDGPUImageDMaskIntrinsic use at the call site). NOTE(review):
// the DMaskIdx < 0 guard presumably sits in the dropped lines 1352-1356.
1357 ConstantInt
*DMask
= cast
<ConstantInt
>(Args
[DMaskIdx
]);
1358 unsigned DMaskVal
= DMask
->getZExtValue() & 0xf;
1360 // dmask 0 has special semantics, do not simplify.
1364 // Mask off values that are undefined because the dmask doesn't cover them
1365 DemandedElts
&= (1 << llvm::popcount(DMaskVal
)) - 1;
// Rebuild a dmask that keeps only those enabled channels whose corresponding
// loaded/stored vector element is still demanded.
1367 unsigned NewDMaskVal
= 0;
1368 unsigned OrigLdStIdx
= 0;
1369 for (unsigned SrcIdx
= 0; SrcIdx
< 4; ++SrcIdx
) {
1370 const unsigned Bit
= 1 << SrcIdx
;
1371 if (!!(DMaskVal
& Bit
)) {
1372 if (!!DemandedElts
[OrigLdStIdx
])
1378 if (DMaskVal
!= NewDMaskVal
)
1379 Args
[DMaskIdx
] = ConstantInt::get(DMask
->getType(), NewDMaskVal
);
1382 unsigned NewNumElts
= DemandedElts
.popcount();
// No element demanded at all: the whole call folds to poison.
1384 return PoisonValue::get(IIVTy
);
// Every original element is still demanded as a contiguous low mask: keep
// the call as-is, at most updating the dmask operand in place.
1386 if (NewNumElts
>= VWidth
&& DemandedElts
.isMask()) {
1388 II
.setArgOperand(DMaskIdx
, Args
[DMaskIdx
]);
1392 // Validate function argument and return types, extracting overloaded types
1394 SmallVector
<Type
*, 6> OverloadTys
;
1395 if (!Intrinsic::getIntrinsicSignature(II
.getCalledFunction(), OverloadTys
))
// Shrink the overloaded data type to the demanded element count (a scalar
// when only one element survives).
1399 (NewNumElts
== 1) ? EltTy
: FixedVectorType::get(EltTy
, NewNumElts
);
1400 OverloadTys
[0] = NewTy
;
// Store path: narrow the stored vector to just the demanded elements via
// extractelement (single survivor) or shufflevector.
1403 SmallVector
<int, 8> EltMask
;
1404 for (unsigned OrigStoreIdx
= 0; OrigStoreIdx
< VWidth
; ++OrigStoreIdx
)
1405 if (DemandedElts
[OrigStoreIdx
])
1406 EltMask
.push_back(OrigStoreIdx
);
1408 if (NewNumElts
== 1)
1409 Args
[0] = IC
.Builder
.CreateExtractElement(II
.getOperand(0), EltMask
[0]);
1411 Args
[0] = IC
.Builder
.CreateShuffleVector(II
.getOperand(0), EltMask
);
// Re-declare the intrinsic with the narrowed overload types and rebuild the
// call, preserving the original name and metadata.
1414 Function
*NewIntrin
= Intrinsic::getDeclaration(
1415 II
.getModule(), II
.getIntrinsicID(), OverloadTys
);
1416 CallInst
*NewCall
= IC
.Builder
.CreateCall(NewIntrin
, Args
);
1417 NewCall
->takeName(&II
);
1418 NewCall
->copyMetadata(II
);
// Load path: widen the narrowed result back to the original vector type so
// existing users of II remain type-correct.
1421 if (NewNumElts
== 1) {
1422 return IC
.Builder
.CreateInsertElement(PoisonValue::get(IIVTy
), NewCall
,
1423 DemandedElts
.countr_zero());
// Shuffle mask mapping each originally-demanded lane to its position in the
// narrowed call result; undemanded lanes use index NewNumElts, which falls in
// the implicit second (poison) operand of the single-input shuffle.
1426 SmallVector
<int, 8> EltMask
;
1427 unsigned NewLoadIdx
= 0;
1428 for (unsigned OrigLoadIdx
= 0; OrigLoadIdx
< VWidth
; ++OrigLoadIdx
) {
1429 if (!!DemandedElts
[OrigLoadIdx
])
1430 EltMask
.push_back(NewLoadIdx
++);
1432 EltMask
.push_back(NewNumElts
);
1435 auto *Shuffle
= IC
.Builder
.CreateShuffleVector(NewCall
, EltMask
);
// NOTE(review): the `return Shuffle;` and closing brace belong to the
// dropped lines (orig 1436+).
1443 std::optional
<Value
*> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1444 InstCombiner
&IC
, IntrinsicInst
&II
, APInt DemandedElts
, APInt
&UndefElts
,
1445 APInt
&UndefElts2
, APInt
&UndefElts3
,
1446 std::function
<void(Instruction
*, unsigned, APInt
, APInt
&)>
1447 SimplifyAndSetOp
) const {
1448 switch (II
.getIntrinsicID()) {
1449 case Intrinsic::amdgcn_raw_buffer_load
:
1450 case Intrinsic::amdgcn_raw_ptr_buffer_load
:
1451 case Intrinsic::amdgcn_raw_buffer_load_format
:
1452 case Intrinsic::amdgcn_raw_ptr_buffer_load_format
:
1453 case Intrinsic::amdgcn_raw_tbuffer_load
:
1454 case Intrinsic::amdgcn_raw_ptr_tbuffer_load
:
1455 case Intrinsic::amdgcn_s_buffer_load
:
1456 case Intrinsic::amdgcn_struct_buffer_load
:
1457 case Intrinsic::amdgcn_struct_ptr_buffer_load
:
1458 case Intrinsic::amdgcn_struct_buffer_load_format
:
1459 case Intrinsic::amdgcn_struct_ptr_buffer_load_format
:
1460 case Intrinsic::amdgcn_struct_tbuffer_load
:
1461 case Intrinsic::amdgcn_struct_ptr_tbuffer_load
:
1462 return simplifyAMDGCNMemoryIntrinsicDemanded(IC
, II
, DemandedElts
);
1464 if (getAMDGPUImageDMaskIntrinsic(II
.getIntrinsicID())) {
1465 return simplifyAMDGCNMemoryIntrinsicDemanded(IC
, II
, DemandedElts
, 0);
1470 return std::nullopt
;