//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {
struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false, int ShrinkOp = -1) :
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool isCommuted() const {
    return Commuted;
  }

  bool needsShrink() const {
    return ShrinkOpcode != -1;
  }

  int getShrinkOpcode() const {
    return ShrinkOpcode;
  }
};
class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldLCSSAPhi(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
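
// Folding into src2 of a mac/fmac rewrites it to the corresponding mad/fma
// (see macToMad below), e.g. V_MAC_F32_e64 becomes V_MAD_F32_e64, so operand
// legality is checked against the replacement opcode.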
// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}
// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  unsigned NewOpc = macToMad(Opc);
  if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      const MCInstrDesc &MadDesc = TII->get(NewOpc);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
  }

  return false;
}
// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
static bool frameIndexMayFold(const SIInstrInfo *TII,
                              const MachineInstr &UseMI,
                              int OpNo,
                              const MachineOperand &OpToFold) {
  if (!OpToFold.isFI())
    return false;

  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
                                              AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
                                        AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(UseMI.getOpcode(),
                                        AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}
FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}
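
// Apply a collected FoldCandidate to its use instruction: rewrite the use
// operand to the folded immediate, frame index, global address or register,
// shrinking a VOP3 carry-out add/sub to its 32-bit form first when the
// candidate requests it.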
static bool updateOperand(FoldCandidate &Fold,
                          const SIInstrInfo &TII,
                          const TargetRegisterInfo &TRI,
                          const GCNSubtarget &ST) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
        !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
        (!ST.hasDOTOpSelHazard() ||
         !(MI->getDesc().TSFlags & SIInstrFlags::IsDOT)) &&
        AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
                                      ST.hasInv2PiInlineImm())) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
        // Only apply the following transformation if that operand requires
        // a packed immediate.
        switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
        case AMDGPU::OPERAND_REG_IMM_V2FP16:
        case AMDGPU::OPERAND_REG_IMM_V2INT16:
        case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
        case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
          // If upper part is all zero we do not need op_sel_hi.
          if (!isUInt<16>(Fold.ImmToFold)) {
            if (!(Fold.ImmToFold & 0xffff)) {
              Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
              Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
              Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
              return true;
            }
            Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
            Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
            return true;
          }
          break;
        default:
          break;
        }
      }
    }
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    int Op32 = Fold.getShrinkOpcode();
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
    Register NewReg0 = MRI.createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
        .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));

    if (Fold.isCommuted())
      TII.commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    if (Old.isTied()) {
      int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
      if (NewMFMAOpc == -1)
        return false;
      MI->setDesc(TII.get(NewMFMAOpc));
      MI->untieRegOperand(0);
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}
static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n  " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}
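
// Try to record OpToFold as a fold candidate for operand OpNo of MI. If the
// operand is not legal in that position, try rewriting mac->mad or
// s_setreg->s_setreg_imm32, or commuting the instruction, before giving up.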
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (OpToFold->isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    unsigned CommuteOpNo = OpNo;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        CommuteOpNo = CommuteIdx0;
    }

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
           Opc == AMDGPU::V_SUB_CO_U32_e64 ||
           Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
          (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

        // Verify the other operand is a VGPR, otherwise we would violate the
        // constant bus restriction.
        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
        if (!OtherOp.isReg() ||
            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
          return false;

        assert(MI->getOperand(1).isDef());

        // Make sure to get the 32-bit version of the commuted opcode.
        unsigned MaybeCommutedOpc = MI->getOpcode();
        int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);

        appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
        return true;
      }

      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
    return true;
  }

  // Check the case where we might introduce a second constant operand to a
  // scalar instruction
  if (TII->isSALU(MI->getOpcode())) {
    const MCInstrDesc &InstDesc = MI->getDesc();
    const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
    const SIRegisterInfo &SRI = TII->getRegisterInfo();

    // Fine if the operand can be encoded as an inline constant
    if (TII->isLiteralConstantLike(*OpToFold, OpInfo)) {
      if (!SRI.opCanUseInlineConstant(OpInfo.OperandType) ||
          !TII->isInlineConstant(*OpToFold, OpInfo)) {
        // Otherwise check for another constant
        for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
          auto &Op = MI->getOperand(i);
          if (OpNo != i &&
              TII->isLiteralConstantLike(Op, OpInfo))
            return false;
        }
      }
    }
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}
// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  if (UseMO.isUndef() || TII->isSDWA(MI))
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B32_e64:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64:
    // Do not fold into an indirect mov.
    return !MI.hasRegisterImplicitUseOperand(AMDGPU::M0);
  default:
    return true;
  }
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking it to foldable inline immediate if possible.
// Returns true on success.
static bool getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy,
    const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
  MachineInstr *Def = MRI.getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    MachineOperand *Sub = &Def->getOperand(I);
    assert(Sub->isReg());

    for (MachineInstr *SubDef = MRI.getVRegDef(Sub->getReg());
         SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
         !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
         SubDef = MRI.getVRegDef(Sub->getReg())) {
      MachineOperand *Op = &SubDef->getOperand(1);
      if (Op->isImm()) {
        if (TII->isInlineConstant(*Op, OpTy))
          Sub = Op;
        break;
      }
      if (!Op->isReg() || Op->getReg().isPhysical())
        break;
      Sub = Op;
    }

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
  }

  return true;
}
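
// Try to fold an immediate (directly, via a foldable copy, or as a splat
// initializer of a reg_sequence) into an operand that accepts an inline
// constant of the REG_INLINE_C / REG_INLINE_AC operand types.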
static bool tryToFoldACImm(const SIInstrInfo *TII,
                           const MachineOperand &OpToFold,
                           MachineInstr *UseMI,
                           unsigned UseOpIdx,
                           SmallVectorImpl<FoldCandidate> &FoldList) {
  const MCInstrDesc &Desc = UseMI->getDesc();
  const MCOperandInfo *OpInfo = Desc.OpInfo;
  if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
    return false;

  uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
  if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
       OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
      (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
       OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
    return false;

  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  if (isUseMIInFoldList(FoldList, UseMI))
    return false;

  MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();

  // Maybe it is just a COPY of an immediate itself.
  MachineInstr *Def = MRI.getVRegDef(UseReg);
  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
      return true;
    }
  }

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
    return false;

  int32_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return false;

    auto SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      if (!TII->isInlineConstant(*Op, OpTy) ||
          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;

      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
  return true;
}
void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  int UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) {
      MachineInstr *RSUseMI = RSUse.getParent();

      if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
                         RSUseMI->getOperandNo(&RSUse), FoldList))
        continue;

      if (RSUse.getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());

    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::getNamedOperandIdx(UseMI->getOpcode(),
                                   AMDGPU::OpName::vaddr) != -1 &&
        AMDGPU::getNamedOperandIdx(UseMI->getOpcode(),
                                   AMDGPU::OpName::saddr) == -1) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(UseMI->getOpcode());
      UseMI->setDesc(TII->get(NewOpc));
    }

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
        SmallVector<FoldCandidate, 4> CopyUses;
        for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
          // There's no point trying to fold into an implicit operand.
          if (Use.isImplicit())
            continue;

          CopyUses.emplace_back(Use.getParent(),
                                Use.getParent()->getOperandNo(&Use),
                                &UseMI->getOperand(1));
        }
        for (auto &F : CopyUses) {
          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
        }
      }

      if (DestRC == &AMDGPU::AGPR_32RegClass &&
          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        CopiesToReplace.push_back(UseMI);
        return;
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);

      // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
      // its initializers right here, so we will rematerialize immediates and
      // avoid copies via different reg classes.
      SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII,
                        *MRI)) {
        const DebugLoc &DL = UseMI->getDebugLoc();
        MachineBasicBlock &MBB = *UseMI->getParent();

        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
          UseMI->removeOperand(I);

        MachineInstrBuilder B(*MBB.getParent(), UseMI);
        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
        for (unsigned I = 0; I < Size / 4; ++I) {
          MachineOperand *Def = Defs[I].first;
          TargetInstrInfo::RegSubRegPair CopyToVGPR;
          if (Def->isImm() &&
              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
            int64_t Imm = Def->getImm();

            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
            B.addReg(Tmp);
          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
            auto Src = getRegSubRegPair(*Def);
            Def->setIsKill(false);
            if (!SeenAGPRs.insert(Src)) {
              // We cannot build a reg_sequence out of the same registers, they
              // must be copied. Better do it here before copyPhysReg() created
              // several reads to do the AGPR->VGPR->AGPR copy.
              CopyToVGPR = Src;
            } else {
              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
                       Src.SubReg);
            }
          } else {
            assert(Def->isReg());
            Def->setIsKill(false);
            auto Src = getRegSubRegPair(*Def);

            // Direct copy from SGPR to AGPR is not possible. To avoid creation
            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
            // create a copy here and track if we already have such a copy.
            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
              CopyToVGPR = Src;
            } else {
              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
              B.addReg(Tmp);
            }
          }

          if (CopyToVGPR.Reg) {
            Register Vgpr;
            if (VGPRCopies.count(CopyToVGPR)) {
              Vgpr = VGPRCopies[CopyToVGPR];
            } else {
              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
              VGPRCopies[CopyToVGPR] = Vgpr;
            }
            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
            B.addReg(Tmp);
          }

          B.addImm(Defs[I].second);
        }
        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
        return;
      }

      if (Size != 4)
        return;

      if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
      else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
               TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
      else if (ST->hasGFX90AInsts() &&
               TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
               TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
         AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseOp.isImplicit() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      // Don't fold if OpToFold doesn't hold an aligned register.
      const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        unsigned SubReg = OpToFold.getSubReg();
        const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
        RC = TRI->getCompatibleSubRegClass(RC, SubRC, SubReg);
        if (RC)
          RC = SubRC;
      }

      if (!RC || !TRI->isProperlyAlignedRC(*RC))
        return;
    }

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
      TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    Register UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}
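
// Constant-fold a 32-bit binary operation with both inputs known. Returns
// false for opcodes that are not handled here.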
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}
static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}
/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}
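
// Return the immediate that Op carries, looking through a move-immediate def
// of a virtual register if necessary; otherwise return Op itself.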
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister || !Op.getReg().isVirtual())
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI, const SIInstrInfo *TII,
                              MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0->isImm()) {
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}
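
// A v_cndmask_b32 whose two sources are identical (possibly both materialized
// from the same immediate) does not depend on the condition; tryFoldCndMask
// below rewrites it as a copy or mov of that value.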
// Try to fold an instruction into a simpler one
bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    auto *Src0Imm = getImmOrMaterializedImm(*MRI, *Src0);
    auto *Src1Imm = getImmOrMaterializedImm(*MRI, *Src1);
    if (!Src1Imm->isIdenticalTo(*Src0Imm))
      return false;
  }

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  mutateCopyOp(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}
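
// Fold away a v_and_b32 with a 0xffff mask when the instruction defining the
// other source already zeroes the high 16 bits of its result.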
bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  MachineOperand *Src0 = getImmOrMaterializedImm(*MRI, MI.getOperand(1));
  if (!Src0->isImm() || Src0->getImm() != 0xffff)
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode())) {
    Register Dst = MI.getOperand(0).getReg();
    MRI->replaceRegWith(Dst, SrcDef->getOperand(0).getReg());
    MI.eraseFromParent();
    return true;
  }

  return false;
}
bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);
  bool Changed = false;

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (tryConstantFoldOp(*MRI, TII, &UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    for (auto &Use :
         make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
      MachineInstr *UseMI = Use.getParent();
      unsigned OpNo = UseMI->getOperandNo(&Use);

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
    }
  } else {
    // Folding register.
    SmallVector<MachineOperand *, 4> UsesToProcess;
    for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
      UsesToProcess.push_back(&Use);
    for (auto U : UsesToProcess) {
      MachineInstr *UseMI = U->getParent();

      foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U),
                  FoldList, CopiesToReplace);
    }
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold, *TII, *TRI, *ST)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);
    } else if (Fold.isCommuted()) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
  return true;
}
// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
                                                      : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}
// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}
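
// Map a multiplication constant (0.5, 2.0 or 4.0 in the encoding of the
// multiply's type) to the corresponding output-modifier value.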
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
      return SIOutMods::DIV2;
    case 0x4000000000000000: // 2.0
      return SIOutMods::MUL2;
    case 0x4010000000000000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}
// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64) &&
         MFI->getMode().FP64FP16OutputDenormals))
      return std::make_pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::make_pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::make_pair(nullptr, SIOutMods::NONE);

    return std::make_pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64) &&
         MFI->getMode().FP64FP16OutputDenormals))
      return std::make_pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::make_pair(Src0, SIOutMods::MUL2);

    return std::make_pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::make_pair(nullptr, SIOutMods::NONE);
  }
}
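
// e.g. given
//   %1 = V_ADD_F32_e64 %a, %b
//   %2 = V_MUL_F32_e64 %1, 2.0
// the multiply can be removed by setting omod to MUL2 on the add, provided
// the add has no omod/clamp set and %1 has no other users.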
// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}
// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER, TII, *MRI))
    return false;

  for (auto &Def : Defs) {
    const auto *Op = Def.first;
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from AREG
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
      return false;
  }

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  while (UseMI->isCopy() && !Op->getSubReg()) {
    Reg = UseMI->getOperand(0).getReg();
    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
      return false;
    Op = &*MRI->use_nodbg_begin(Reg);
    UseMI = Op->getParent();
  }

  if (Op->getSubReg())
    return false;

  unsigned OpIdx = Op - &UseMI->getOperand(0);
  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const TargetRegisterClass *OpRC =
      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (unsigned I = 0; I < Defs.size(); ++I) {
    MachineOperand *Def = Defs[I].first;
    Def->setIsKill(false);
    if (TRI->isAGPR(*MRI, Def->getReg())) {
      RS.add(*Def);
    } else { // This is a copy
      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
      SubDef->getOperand(1).setIsKill(false);
      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
    }
    RS.addImm(Defs[I].second);
  }

  Op->setReg(Dst);
  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    Op->setReg(Reg);
    RS->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);

  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
  // in which case we can erase them all later in runOnMachineFunction.
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
  return true;
}
// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
// This should allow folding of an AGPR into a consumer which may support it.
//
// loop:                             // loop:
//   %1:vreg = COPY %0:areg          //   %1:areg = PHI %0:areg, %loop
// exit:                      =>     // exit:
//   %2:vreg = PHI %1:vreg, %loop    //   %2:vreg = COPY %1:areg
bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
  assert(PHI.isPHI());

  if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
    return false;

  Register PhiIn = PHI.getOperand(1).getReg();
  Register PhiOut = PHI.getOperand(0).getReg();
  if (PHI.getOperand(1).getSubReg() ||
      !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
    return false;

  // A single use should not matter for correctness, but if it has another use
  // inside the loop we may perform copy twice in a worst case.
  if (!MRI->hasOneNonDBGUse(PhiIn))
    return false;

  MachineInstr *Copy = MRI->getVRegDef(PhiIn);
  if (!Copy || !Copy->isCopy())
    return false;

  Register CopyIn = Copy->getOperand(1).getReg();
  if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
    return false;

  const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(1).setReg(CopyIn);
  PHI.getOperand(0).setReg(NewReg);

  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
    .addReg(NewReg, RegState::Kill);
  Copy->eraseFromParent(); // We know this copy had a single use.

  LLVM_DEBUG(dbgs() << "Folded " << PHI);

  return true;
}
// Attempt to convert VGPR load to an AGPR load.
bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
  assert(MI.mayLoad());
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
    return false;

  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();

  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  SmallVector<const MachineInstr*, 8> Users;
  SmallVector<Register, 8> MoveRegs;
  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg)) {
    Users.push_back(&I);
  }
  if (Users.empty())
    return false;

  // Check that all uses a copy to an agpr or a reg_sequence producing an agpr.
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg)) {
      Users.push_back(&U);
    }
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  LLVM_DEBUG(dbgs() << "Folded " << MI);

  return true;
}
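
// Pass entry point: visit blocks in depth-first order, run the peephole folds
// (cndmask, zero-high-bits, reg_sequence, LCSSA phi, load) on every
// instruction, and fold the source operand of each foldable copy/mov into its
// uses, erasing copies that end up dead.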
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;
  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineOperand *CurrentKnownM0Val = nullptr;
    for (auto &MI : make_early_inc_range(*MBB)) {
      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isPHI() && tryFoldLCSSAPhi(MI)) {
        Changed = true;
        continue;
      }

      if (MI.mayLoad() && tryFoldLoad(MI)) {
        Changed = true;
        continue;
      }

      if (!TII->isFoldableCopy(MI)) {
        // Saw an unknown clobber of m0, so we no longer know what it is.
        if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
          CurrentKnownM0Val = nullptr;

        // TODO: Omod might be OK if there is NSZ only on the source
        // instruction, and not the omod multiply.
        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
            !tryFoldOMod(MI))
          Changed |= tryFoldClamp(MI);

        continue;
      }

      // Specially track simple redefs of m0 to the same value in a block, so we
      // can erase the later ones.
      if (MI.getOperand(0).getReg() == AMDGPU::M0) {
        MachineOperand &NewM0Val = MI.getOperand(1);
        if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
          MI.eraseFromParent();
          Changed = true;
          continue;
        }

        // We aren't tracking other physical registers
        CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ?
          nullptr : &NewM0Val;
        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm =
        OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

      // FIXME: We could also be folding things like TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %3 = COPY %vgpr0; VGPR_32:%3
      //    ...
      //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
      if (!MI.getOperand(0).getReg().isVirtual())
        continue;

      Changed |= foldInstOperand(MI, OpToFold);

      // If we managed to fold all uses of this copy then we might as well
      // delete it now.
      // The only reason we need to follow chains of copies here is that
      // tryFoldRegSequence looks forward through copies before folding a
      // REG_SEQUENCE into its eventual users.
      auto *InstToErase = &MI;
      while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
        auto &SrcOp = InstToErase->getOperand(1);
        auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
        InstToErase->eraseFromParent();
        Changed = true;
        InstToErase = nullptr;
        if (!SrcReg || SrcReg.isPhysical())
          break;
        InstToErase = MRI->getVRegDef(SrcReg);
        if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
          break;
      }
      if (InstToErase && InstToErase->isRegSequence() &&
          MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
        InstToErase->eraseFromParent();
        Changed = true;
      }
    }
  }
  return Changed;
}