//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {
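
// Tracks a single pending fold: the instruction and operand index that will
// receive the folded value, plus the kind of value (register, immediate,
// frame index, or global address) being folded into it.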
struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false,
                int ShrinkOp = -1) :
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool isCommuted() const {
    return Commuted;
  }

  bool needsShrink() const {
    return ShrinkOpcode != -1;
  }

  int getShrinkOpcode() const {
    return ShrinkOpcode;
  }
};
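
// Machine function pass that folds immediates, frame indexes, globals, and
// copied registers directly into their uses, and also folds clamp and omod
// output modifiers back into the defining instructions.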
class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F32_e64:
  case AMDGPU::V_FMAC_F16_e64: {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
                   Opc == AMDGPU::V_FMAC_F16_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
                   Opc == AMDGPU::V_FMAC_F32_e64;

      unsigned Opc = IsFMA ?
        (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
        (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
      const MCInstrDesc &MadDesc = TII->get(Opc);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
    return false;
  }
  default:
    return false;
  }
}
// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
static bool frameIndexMayFold(const SIInstrInfo *TII,
                              const MachineInstr &UseMI,
                              int OpNo,
                              const MachineOperand &OpToFold) {
  return OpToFold.isFI() &&
    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
    OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
}
FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}
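
// Rewrite the use operand recorded in Fold with the folded value, handling
// packed 16-bit inline immediates, shrinking to a 32-bit encoding when
// requested, and plain immediate/frame-index/global/register replacement.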
static bool updateOperand(FoldCandidate &Fold,
                          const SIInstrInfo &TII,
                          const TargetRegisterInfo &TRI,
                          const GCNSubtarget &ST) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
        !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
        AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
                                       ST.hasInv2PiInlineImm())) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
        return false;
      // Only apply the following transformation if that operand requries
      // a packed immediate.
      switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
      case AMDGPU::OPERAND_REG_IMM_V2FP16:
      case AMDGPU::OPERAND_REG_IMM_V2INT16:
      case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
      case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
        // If upper part is all zero we do not need op_sel_hi.
        if (!isUInt<16>(Fold.ImmToFold)) {
          if (!(Fold.ImmToFold & 0xffff)) {
            Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
            Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
            Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
            return true;
          }
          Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
          return true;
        }
        break;
      default:
        break;
      }
    }
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    int Op32 = Fold.getShrinkOpcode();
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
    Register NewReg0 = MRI.createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
        .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->RemoveOperand(I);
    MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));

    if (Fold.isCommuted())
      TII.commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}
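
// Record a fold of OpToFold into operand OpNo of MI if it is (or can be made)
// legal, trying the MAC->MAD/FMA conversion and operand commutation when the
// operand is not directly legal in its current position.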
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
         Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
                   Opc == AMDGPU::V_FMAC_F16_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
                   Opc == AMDGPU::V_FMAC_F32_e64;
      unsigned NewOpc = IsFMA ?
        (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
        (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
      return true;
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    unsigned CommuteOpNo = OpNo;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        CommuteOpNo = CommuteIdx0;
    }

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
           Opc == AMDGPU::V_SUB_I32_e64 ||
           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
          (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

        // Verify the other operand is a VGPR, otherwise we would violate the
        // constant bus restriction.
        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
        if (!OtherOp.isReg() ||
            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
          return false;

        assert(MI->getOperand(1).isDef());

        // Make sure to get the 32-bit version of the commuted opcode.
        unsigned MaybeCommutedOpc = MI->getOpcode();
        int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);

        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
                                         Op32));
        return true;
      }

      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
    return true;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}
// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef() && !TII->isSDWA(MI);
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
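
// Try to fold an inline immediate (or a REG_SEQUENCE built from a splat of
// identical move-immediates) into an accumulator-class operand of UseMI.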
static bool tryToFoldACImm(const SIInstrInfo *TII,
                           const MachineOperand &OpToFold,
                           MachineInstr *UseMI,
                           unsigned UseOpIdx,
                           SmallVectorImpl<FoldCandidate> &FoldList) {
  const MCInstrDesc &Desc = UseMI->getDesc();
  const MCOperandInfo *OpInfo = Desc.OpInfo;
  if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
    return false;

  uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
  if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
      OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
    return false;

  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!Register::isVirtualRegister(UseReg))
    return false;

  if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
        return FC.UseMI == UseMI; }) != FoldList.end())
    return false;

  MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

  int64_t Imm;
  MachineOperand *Op;
  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    const MachineOperand &Sub = Def->getOperand(I);
    if (!Sub.isReg() || Sub.getSubReg())
      return false;
    MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
    while (SubDef && !SubDef->isMoveImmediate() &&
           !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
      SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
    if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
      return false;
    Op = &SubDef->getOperand(1);
    auto SubImm = Op->getImm();
    if (I == 1) {
      if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
        return false;

      Imm = SubImm;
      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
    return false;

  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
  return true;
}
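
// Attempt to fold OpToFold into operand UseOpIdx of UseMI, recursing through
// REG_SEQUENCE and COPY users and recording successful candidates in FoldList.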
void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  int UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands, only if it is a full
    // copy since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %1 = COPY %0:sub1
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
    //
    //  into
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    MachineRegisterInfo::use_iterator Next;
    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; RSUse = Next) {
      Next = std::next(RSUse);

      MachineInstr *RSUseMI = RSUse->getParent();

      if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
                         RSUse.getOperandNo(), FoldList))
        continue;

      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
    // Sanity check that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.
    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
      return;

    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
        MFI->getScratchRSrcReg())
      return;

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
    SOff->setReg(MFI->getStackPtrOffsetReg());
    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();

    // Don't fold into a copy to a physical register. Doing so would interfere
    // with the register coalescer's logic which would avoid redundant
    // initializations.
    if (DestReg.isPhysical())
      return;

    const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg);

    Register SrcReg = UseMI->getOperand(1).getReg();
    if (SrcReg.isVirtual()) { // XXX - This can be an assert?
      const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
        MachineRegisterInfo::use_iterator NextUse;
        SmallVector<FoldCandidate, 4> CopyUses;
        for (MachineRegisterInfo::use_iterator
               Use = MRI->use_begin(DestReg), E = MRI->use_end();
             Use != E; Use = NextUse) {
          NextUse = std::next(Use);
          FoldCandidate FC = FoldCandidate(Use->getParent(),
                                           Use.getOperandNo(), &UseMI->getOperand(1));
          CopyUses.push_back(FC);
        }
        for (auto & F : CopyUses) {
          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
                      FoldList, CopiesToReplace);
        }
      }
    }

    if (DestRC == &AMDGPU::AGPR_32RegClass &&
        TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
      UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
      CopiesToReplace.push_back(UseMI);
      return;
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->RemoveOperand(UseMI->getOperandNo(Tmp));
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        Register::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
        TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
        TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
        !UseMI->getOperand(1).getSubReg()) {
      unsigned Size = TII->getOpSize(*UseMI, 1);
      UseMI->getOperand(1).setReg(OpToFold.getReg());
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);
      if (Size != 4)
        return;
      if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
      else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
               TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
         AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        // FIXME: ChangeToImmediate should clear subreg
        UseMI->getOperand(1).setSubReg(0);
        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseOp.isImplicit() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunites. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    Register UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}
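
// Constant-fold a 32-bit bitwise or shift operation, returning the result in
// Result. Returns false for opcodes that are not handled.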
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}
static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}
/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}
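
// If Op is a register defined by a move-immediate, return the immediate
// operand of that def; otherwise return Op itself.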
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister ||
        !Register::isVirtualRegister(Op.getReg()))
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI,
                              MachineOperand *ImmOp) {
  unsigned Opc = MI->getOpcode();
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
    if (Src0->isImm() && Src0->getImm() == 0) {
      // v_lshl_or_b32 0, X, Y -> copy Y
      // v_lshl_or_b32 0, X, K -> v_mov_b32 K
      bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
      MI->RemoveOperand(Src1Idx);
      MI->RemoveOperand(Src0Idx);

      MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
      return true;
    }
  }

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}
// Try to fold an instruction into a simpler one
static bool tryFoldInst(const SIInstrInfo *TII,
                        MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
      Opc == AMDGPU::V_CNDMASK_B32_e64 ||
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
    int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
    int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
    if (Src1->isIdenticalTo(*Src0) &&
        (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
        (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
      LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
      auto &NewDesc =
          TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (Src2Idx != -1)
        MI->RemoveOperand(Src2Idx);
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
      if (Src1ModIdx != -1)
        MI->RemoveOperand(Src1ModIdx);
      if (Src0ModIdx != -1)
        MI->RemoveOperand(Src0ModIdx);
      mutateCopyOp(*MI, NewDesc);
      LLVM_DEBUG(dbgs() << *MI << '\n');
      return true;
    }
  }

  return false;
}
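
// Fold the value defined by MI (an immediate-like operand or a register) into
// all uses of its destination register, then apply the collected fold
// candidates.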
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    MachineRegisterInfo::use_iterator NextUse;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; Use = NextUse) {
      NextUse = std::next(Use);
      MachineInstr *UseMI = Use->getParent();
      unsigned OpNo = Use.getOperandNo();

      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.

      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');

        // Some constant folding cases change the same immediate's use to a new
        // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
        // again. The same constant folded instruction could also have a second
        // use operand.
        NextUse = MRI->use_begin(Dst.getReg());
        continue;
      }

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList,
                    CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
    }
  } else {
    // Folding register.
    SmallVector<MachineRegisterInfo::use_iterator, 4> UsesToProcess;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; ++Use)
      UsesToProcess.push_back(Use);

    for (auto U : UsesToProcess) {
      MachineInstr *UseMI = U->getParent();

      foldOperand(OpToFold, UseMI, U.getOperandNo(),
        FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold, *TII, *TRI, *ST)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI << '\n');
      tryFoldInst(TII, Fold.UseMI);
    } else if (Fold.isCommuted()) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
}
// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
                                                      : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}
// We obviously have multiple uses in a clamp since the register is used twice
// in the same instruction.
static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
  int Count = 0;
  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
       I != E; ++I) {
    if (++Count > 1)
      return false;
  }

  return true;
}
// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
                    << '\n');

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}
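
// Map a multiplication constant (0.5, 2.0, or 4.0, in f32 or f16 encoding) to
// the corresponding omod output-modifier value.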
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}
// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::make_pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::make_pair(nullptr, SIOutMods::NONE);

    return std::make_pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::make_pair(Src0, SIOutMods::MUL2);

    return std::make_pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::make_pair(nullptr, SIOutMods::NONE);
  }
}
// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}
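
// Walk the function in depth-first block order, folding the source operand of
// each foldable copy/move into its uses and erasing redundant rewrites of m0.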
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineBasicBlock::iterator I, Next;

    MachineOperand *CurrentKnownM0Val = nullptr;
    for (I = MBB->begin(); I != MBB->end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      tryFoldInst(TII, &MI);

      if (!TII->isFoldableCopy(MI)) {
        // TODO: Omod might be OK if there is NSZ only on the source
        // instruction, and not the omod multiply.
        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
            !tryFoldOMod(MI))
          tryFoldClamp(MI);

        // Saw an unknown clobber of m0, so we no longer know what it is.
        if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
          CurrentKnownM0Val = nullptr;
        continue;
      }

      // Specially track simple redefs of m0 to the same value in a block, so we
      // can erase the later ones.
      if (MI.getOperand(0).getReg() == AMDGPU::M0) {
        MachineOperand &NewM0Val = MI.getOperand(1);
        if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
          MI.eraseFromParent();
          continue;
        }

        // We aren't tracking other physical registers
        CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ?
          nullptr : &NewM0Val;
        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm =
        OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

      // FIXME: We could also be folding things like TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      if (OpToFold.isReg() && !Register::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %3 = COPY %vgpr0; VGPR_32:%3
      //    ...
      //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() && !Register::isVirtualRegister(Dst.getReg()))
        continue;

      foldInstOperand(MI, OpToFold);
    }
  }
  return true;
}