//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"

#define DEBUG_TYPE "si-fold-operands"

using namespace llvm;

namespace {
struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false, int ShrinkOp = -1)
      : UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Kind(FoldOp->getType()), Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool needsShrink() const { return ShrinkOpcode != -1; }
};
class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const MachineOperand &OpToFold) const;

  bool updateOperand(FoldCandidate &Fold) const;

  bool canUseImmWithOpSel(FoldCandidate &Fold) const;

  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        MachineOperand *OpToFold) const;
  bool isUseSafeToFold(const MachineInstr &MI,
                       const MachineOperand &UseMO) const;
  bool
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg, uint8_t OpTy) const;
  bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
                      unsigned UseOpIdx,
                      SmallVectorImpl<FoldCandidate> &FoldList) const;
  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
  bool tryConstantFoldOp(MachineInstr *MI) const;
  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
  bool tryFoldFoldableCopy(MachineInstr &MI,
                           MachineOperand *&CurrentKnownM0Val) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldPhiAGPR(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
                                             const TargetRegisterInfo &TRI,
                                             const MachineOperand &MO) {
  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
  if (const TargetRegisterClass *SubRC =
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
    RC = SubRC;
  return RC;
}
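// The MAC/FMAC forms tie src2 to the destination, which prevents folding an
// operand into src2; tryAddToFoldList temporarily switches to the untied
// MAD/FMA opcode returned below so that such a fold becomes legal.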
// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}
// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                       const MachineOperand &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}
FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}
bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  const uint64_t TSFlags = MI->getDesc().TSFlags;

  assert(Old.isReg() && Fold.isImm());

  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
      (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
    return false;

  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
  switch (OpType) {
  default:
    return false;
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_IMM_V2BF16:
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    break;
  }

  return true;
}
bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

  // If the literal can be inlined as-is, apply it and short-circuit the
  // tests below. The main motivation for this is to avoid unintuitive
  // uses of op_sel.
  if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  // Refer to op_sel/op_sel_hi and check if we can change the immediate and
  // op_sel in a way that allows an inline constant.
  int ModIdx = -1;
  unsigned SrcIdx = ~0;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
    ModIdx = AMDGPU::OpName::src0_modifiers;
    SrcIdx = 0;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
    ModIdx = AMDGPU::OpName::src1_modifiers;
    SrcIdx = 1;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
    ModIdx = AMDGPU::OpName::src2_modifiers;
    SrcIdx = 2;
  }
  assert(ModIdx != -1);
  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned ModVal = Mod.getImm();

  uint16_t ImmLo = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
  uint16_t ImmHi = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
  unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);

  // Helper function that attempts to inline the given value with a newly
  // chosen opsel pattern.
  auto tryFoldToInline = [&](uint32_t Imm) -> bool {
    if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
      Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
      Old.ChangeToImmediate(Imm);
      return true;
    }

    // Try to shuffle the halves around and leverage opsel to get an inline
    // constant.
    uint16_t Lo = static_cast<uint16_t>(Imm);
    uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
    if (Lo == Hi) {
      if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
        Mod.setImm(NewModVal);
        Old.ChangeToImmediate(Lo);
        return true;
      }

      if (static_cast<int16_t>(Lo) < 0) {
        int32_t SExt = static_cast<int16_t>(Lo);
        if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
          Mod.setImm(NewModVal);
          Old.ChangeToImmediate(SExt);
          return true;
        }
      }

      // This check is only useful for integer instructions
      if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
          OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) {
        if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
          Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
          return true;
        }
      }
    } else {
      uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
      if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
        Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
        Old.ChangeToImmediate(Swapped);
        return true;
      }
    }

    return false;
  };

  if (tryFoldToInline(Imm))
    return true;

  // Replace integer addition by subtraction and vice versa if it allows
  // folding the immediate to an inline constant.
  //
  // We should only ever get here for SrcIdx == 1 due to canonicalization
  // earlier in the pipeline, but we double-check here to be safe / fully
  // general.
  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
  if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
    unsigned ClampIdx =
        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;

    if (!Clamp) {
      uint16_t NegLo = -static_cast<uint16_t>(Imm);
      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
      uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

      if (tryFoldToInline(NegImm)) {
        unsigned NegOpcode =
            IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
        MI->setDesc(TII->get(NegOpcode));
        return true;
      }
    }
  }

  return false;
}
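// Note: the V_PK_ADD_U16/V_PK_SUB_U16 rewrite above relies on 16-bit unsigned
// wrap-around: with clamping disabled, adding K per lane is equivalent to
// subtracting -K, which may turn a non-inlinable literal into an inlinable one.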
bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
    if (tryFoldImmWithOpSel(Fold))
      return true;

    // We can't represent the candidate as an inline constant. Try as a literal
    // with the original opsel, checking constant bus limitations.
    MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
              Dst1.getReg())
          .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    if (Fold.Commuted)
      TII->commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    if (Old.isTied()) {
      int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
      if (NewMFMAOpc == -1)
        return false;
      MI->setDesc(TII->get(NewMFMAOpc));
      MI->untieRegOperand(0);
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}
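// The shrink path above is only taken for the carry-out add/sub opcodes chosen
// in tryAddToFoldList: VCC must be dead at the instruction, the carry result is
// copied out of VCC first if it still has uses, and the original instruction is
// kept as an IMPLICIT_DEF placeholder so that use iterators stay valid.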
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
}
static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n  " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}
bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                                      MachineInstr *MI, unsigned OpNo,
                                      MachineOperand *OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold->isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    // We have to fold into operand which would be Imm not into OpNo.
    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      // Untie Src2 of fmac.
      MI->untieRegOperand(3);
      // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
      if (OpNo == 1) {
        MachineOperand &Op1 = MI->getOperand(1);
        MachineOperand &Op2 = MI->getOperand(2);
        Register OldReg = Op1.getReg();
        // Operand 2 might be an inlinable constant
        if (Op2.isImm()) {
          Op1.ChangeToImmediate(Op2.getImm());
          Op2.ChangeToRegister(OldReg, false);
        } else {
          Op1.setReg(Op2.getReg());
          Op2.setReg(OldReg);
        }
      }
      return true;
    }
    MI->setDesc(TII->get(Opc));
    return false;
  };

  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
  if (!IsLegal && OpToFold->isImm()) {
    FoldCandidate Fold(MI, OpNo, OpToFold);
    IsLegal = canUseImmWithOpSel(Fold);
  }

  if (!IsLegal) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
                      AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
      if (AddOpSel)
        MI->addOperand(MachineOperand::CreateImm(0));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      if (AddOpSel)
        MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_fmac_f32 if we are trying to fold into Src2.
    // By transforming into fmaak we can untie Src2 and make folding legal.
    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    // Special case for s_setreg_b32
    if (OpToFold->isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
    if (!CanCommute)
      return false;

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
      return false;

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
      return false;

    int Op32 = -1;
    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
          (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }

      // Verify the other operand is a VGPR, otherwise we would violate the
      // constant bus restriction.
      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||
          !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
        return false;

      assert(MI->getOperand(1).isDef());

      // Make sure to get the 32-bit version of the commuted opcode.
      unsigned MaybeCommutedOpc = MI->getOpcode();
      Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
    return true;
  }

  // Inlineable constant might have been folded into Imm operand of fmaak or
  // fmamk and we are trying to fold a non-inlinable constant.
  if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
      !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
    unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
    MachineOperand &OpImm = MI->getOperand(ImmIdx);
    if (!OpImm.isReg() &&
        TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
      return tryToFoldAsFMAAKorMK();
  }

  // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
  // By changing into fmamk we can untie Src2.
  // If folding for Src0 happens first and it is identical operand to Src1 we
  // should avoid transforming into fmamk which requires commuting as it would
  // cause folding into Src1 to fail later on due to wrong OpNo used.
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }

  // Check the case where we might introduce a second constant operand to a
  // scalar instruction
  if (TII->isSALU(MI->getOpcode())) {
    const MCInstrDesc &InstDesc = MI->getDesc();
    const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

    // Fine if the operand can be encoded as an inline constant
    if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
      // Otherwise check for another constant
      for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
        auto &Op = MI->getOperand(i);
        if (OpNo != i && !Op.isReg() &&
            !TII->isInlineConstant(Op, InstDesc.operands()[i]))
          return false;
      }
    }
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}
bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
                                     const MachineOperand &UseMO) const {
  // Operands of SDWA instructions must be registers.
  return !TII->isSDWA(MI);
}
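// For example, for
//   %vreg = REG_SEQUENCE %a, %subreg.sub0, %b, %subreg.sub1
// getRegSeqInit records {%a, sub0} and {%b, sub1}, looking through foldable
// copies (including moves of inline immediates) that define each input.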
// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking it to foldable inline immediate if possible.
// Returns true on success.
bool SIFoldOperands::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy) const {
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    MachineOperand *Sub = &Def->getOperand(I);
    assert(Sub->isReg());

    for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
         SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
         !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
         SubDef = MRI->getVRegDef(Sub->getReg())) {
      MachineOperand *Op = &SubDef->getOperand(1);
      if (Op->isImm()) {
        if (TII->isInlineConstant(*Op, OpTy))
          Sub = Op;
        break;
      }
      if (!Op->isReg() || Op->getReg().isPhysical())
        break;
      Sub = Op;
    }

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
  }

  return true;
}
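// tryToFoldACImm folds an inline immediate into an operand that requires one:
// either the immediate itself, a foldable copy of it, or a REG_SEQUENCE whose
// initializers are all the same inline constant (a splat).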
bool SIFoldOperands::tryToFoldACImm(
    const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
    return false;

  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  if (isUseMIInFoldList(FoldList, UseMI))
    return false;

  // Maybe it is just a COPY of an immediate itself.
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
      return true;
    }
  }

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, OpTy))
    return false;

  int64_t Imm = 0;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return false;

    auto SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      if (!TII->isInlineConstant(*Op, OpTy) ||
          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;

      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
  return true;
}
void SIFoldOperands::foldOperand(
    MachineOperand &OpToFold,
    MachineInstr *UseMI,
    int UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList,
    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, *UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp->isReg() && OpToFold.isReg() &&
      (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
    return;

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    // Grab the use operands first
    SmallVector<MachineOperand *, 4> UsesToProcess;
    for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
      UsesToProcess.push_back(&Use);
    for (auto *RSUse : UsesToProcess) {
      MachineInstr *RSUseMI = RSUse->getParent();

      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
                         RSUseMI->getOperandNo(RSUse), FoldList))
        continue;

      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());

    const unsigned Opc = UseMI->getOpcode();
    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
      UseMI->setDesc(TII->get(NewOpc));
    }

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      if (DestRC == &AMDGPU::AGPR_32RegClass &&
          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        CopiesToReplace.push_back(UseMI);
        return;
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
    }
    UseMI->setDesc(TII->get(MovOp));

    if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
      const auto &SrcOp = UseMI->getOperand(UseOpIdx);
      MachineOperand NewSrcOp(SrcOp);
      MachineFunction *MF = UseMI->getParent()->getParent();
      UseMI->removeOperand(1);
      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
      UseMI->addOperand(NewSrcOp);                          // src0
      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
      UseOpIdx = 2;
      UseOp = &UseMI->getOperand(UseOpIdx);
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);

      // Remove kill flags as kills may now be out of order with uses.
      MRI->clearKillFlags(OpToFold.getReg());

      // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
      // its initializers right here, so we will rematerialize immediates and
      // avoid copies via different reg classes.
      SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        const DebugLoc &DL = UseMI->getDebugLoc();
        MachineBasicBlock &MBB = *UseMI->getParent();

        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
          UseMI->removeOperand(I);

        MachineInstrBuilder B(*MBB.getParent(), UseMI);
        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
        for (unsigned I = 0; I < Size / 4; ++I) {
          MachineOperand *Def = Defs[I].first;
          TargetInstrInfo::RegSubRegPair CopyToVGPR;
          if (Def->isImm() &&
              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
            int64_t Imm = Def->getImm();

            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
            B.addReg(Tmp);
          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
            auto Src = getRegSubRegPair(*Def);
            Def->setIsKill(false);
            if (!SeenAGPRs.insert(Src)) {
              // We cannot build a reg_sequence out of the same registers, they
              // must be copied. Better do it here before copyPhysReg() created
              // several reads to do the AGPR->VGPR->AGPR copy.
              CopyToVGPR = Src;
            } else {
              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
                       Src.SubReg);
            }
          } else {
            assert(Def->isReg());
            Def->setIsKill(false);
            auto Src = getRegSubRegPair(*Def);

            // Direct copy from SGPR to AGPR is not possible. To avoid creation
            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
            // create a copy here and track if we already have such a copy.
            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
              CopyToVGPR = Src;
            } else {
              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
              B.addReg(Tmp);
            }
          }

          if (CopyToVGPR.Reg) {
            Register Vgpr;
            if (VGPRCopies.count(CopyToVGPR)) {
              Vgpr = VGPRCopies[CopyToVGPR];
            } else {
              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
              VGPRCopies[CopyToVGPR] = Vgpr;
            }
            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
            B.addReg(Tmp);
          }

          B.addImm(Defs[I].second);
        }
        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
        return;
      }

      if (Size != 4)
        return;

      Register Reg0 = UseMI->getOperand(0).getReg();
      Register Reg1 = UseMI->getOperand(1).getReg();
      if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
      else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
      else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
               TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() || UseOp->isImplicit() ||
        UseDesc.operands()[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      // Don't fold if OpToFold doesn't hold an aligned register.
      const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        unsigned SubReg = OpToFold.getSubReg();
        if (const TargetRegisterClass *SubRC =
                TRI->getSubRegisterClass(RC, SubReg))
          RC = SubRC;
      }

      if (!RC || !TRI->isProperlyAlignedRC(*RC))
        return;
    }

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities.  The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
      TRI->getRegClass(FoldDesc.operands()[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
    Register UseReg = UseOp->getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
    if (AMDGPU::getRegBitWidth(*UseRC) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp->getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp->getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
}
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}
static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}
static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction. e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}
MachineOperand *
SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
  // If this has a subregister, it obviously is a register source.
  if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
      !Op.getReg().isVirtual())
    return &Op;

  MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return &ImmSrc;
  }

  return &Op;
}
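// For example, %d = V_OR_B32_e64 %x, 0 becomes a COPY of %x, and an operation
// whose operands are both known immediates is evaluated into a plain move.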
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;
  MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0->isImm()) {
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;
  MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}
// Try to fold an instruction into a simpler one
bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    auto *Src0Imm = getImmOrMaterializedImm(*Src0);
    auto *Src1Imm = getImmOrMaterializedImm(*Src1);
    if (!Src1Imm->isIdenticalTo(*Src0Imm))
      return false;
  }

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  mutateCopyOp(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}
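// Fold away a mask of the low 16 bits when the producer is already known to
// zero the high half, e.g.
//   %d:vgpr_32 = V_AND_B32_e64 0xffff, %v
// can be replaced by %v if the instruction defining %v zeroes bits 31:16.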
bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0->isImm() || Src0->getImm() != 0xffff)
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  Register Dst = MI.getOperand(0).getReg();
  MRI->replaceRegWith(Dst, Src1);
  if (!MI.getOperand(2).isKill())
    MRI->clearKillFlags(Src1);
  MI.eraseFromParent();
  return true;
}
bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);
  bool Changed = false;

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (tryConstantFoldOp(&UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

  SmallVector<MachineOperand *, 4> UsesToProcess;
  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
    UsesToProcess.push_back(&Use);
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();
    foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                CopiesToReplace);
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);
    } else if (Fold.Commuted) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
  return true;
}
bool SIFoldOperands::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  // Specially track simple redefs of m0 to the same value in a block, so we
  // can erase the later ones.
  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();
      return true;
    }

    // We aren't tracking other physical registers
    CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
                            ? nullptr
                            : &NewM0Val;
    return false;
  }

  MachineOperand &OpToFold = MI.getOperand(1);
  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  // FIXME: We could also be folding things like TargetIndexes.
  if (!FoldingImm && !OpToFold.isReg())
    return false;

  if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
    return false;

  // Prevent folding operands backwards in the function. For example,
  // the COPY opcode must not be replaced by 1 in this example:
  //
  //    %3 = COPY %vgpr0; VGPR_32:%3
  //    ...
  //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
  if (!MI.getOperand(0).getReg().isVirtual())
    return false;

  bool Changed = foldInstOperand(MI, OpToFold);

  // If we managed to fold all uses of this copy then we might as well
  // delete it now.
  // The only reason we need to follow chains of copies here is that
  // tryFoldRegSequence looks forward through copies before folding a
  // REG_SEQUENCE into its eventual users.
  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
    InstToErase->eraseFromParent();
    Changed = true;
    InstToErase = nullptr;
    if (!SrcReg || SrcReg.isPhysical())
      break;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
      break;
  }

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
    Changed = true;
  }

  return Changed;
}
// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (MI.mayRaiseFPException())
      return nullptr;

    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
                                                      : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}
1562 bool SIFoldOperands::tryFoldClamp(MachineInstr
&MI
) {
1563 const MachineOperand
*ClampSrc
= isClamp(MI
);
1564 if (!ClampSrc
|| !MRI
->hasOneNonDBGUser(ClampSrc
->getReg()))
1567 MachineInstr
*Def
= MRI
->getVRegDef(ClampSrc
->getReg());
1569 // The type of clamp must be compatible.
1570 if (TII
->getClampMask(*Def
) != TII
->getClampMask(MI
))
1573 if (Def
->mayRaiseFPException())
1576 MachineOperand
*DefClamp
= TII
->getNamedOperand(*Def
, AMDGPU::OpName::clamp
);
1580 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp
<< " into " << *Def
);
1582 // Clamp is applied after omod, so it is OK if omod is set.
1583 DefClamp
->setImm(1);
1585 Register DefReg
= Def
->getOperand(0).getReg();
1586 Register MIDstReg
= MI
.getOperand(0).getReg();
1587 if (TRI
->isSGPRReg(*MRI
, DefReg
)) {
1588 // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
1589 // instruction with a VGPR dst.
1590 BuildMI(*MI
.getParent(), MI
, MI
.getDebugLoc(), TII
->get(AMDGPU::COPY
),
1594 MRI
->replaceRegWith(MIDstReg
, DefReg
);
1596 MI
.eraseFromParent();
1598 // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1599 // instruction, so we might as well convert it to the more flexible VOP3-only
1601 if (TII
->convertToThreeAddress(*Def
, nullptr, nullptr))
1602 Def
->eraseFromParent();
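// The output modifier can only scale the result by 0.5, 2.0 or 4.0
// (SIOutMods::DIV2/MUL2/MUL4), so only those multiplier values are recognized
// below.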
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
      return SIOutMods::DIV2;
    case 0x4000000000000000: // 2.0
      return SIOutMods::MUL2;
    case 0x4010000000000000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}
// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output !=
             DenormalMode::PreserveSign) ||
        MI.mayRaiseFPException())
      return std::pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::pair(nullptr, SIOutMods::NONE);

    return std::pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::pair(Src0, SIOutMods::MUL2);

    return std::pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::pair(nullptr, SIOutMods::NONE);
  }
}
// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  if (Def->mayRaiseFPException())
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}
1772 // Try to fold a reg_sequence with vgpr output and agpr inputs into an
1773 // instruction which can take an agpr. So far that means a store.
bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
    return false;

  for (auto &[Op, SubIdx] : Defs) {
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from AREG
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
      return false;
  }

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  while (UseMI->isCopy() && !Op->getSubReg()) {
    Reg = UseMI->getOperand(0).getReg();
    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
      return false;
    Op = &*MRI->use_nodbg_begin(Reg);
    UseMI = Op->getParent();
  }

  if (Op->getSubReg())
    return false;

  unsigned OpIdx = Op - &UseMI->getOperand(0);
  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const TargetRegisterClass *OpRC =
      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (auto &[Def, SubIdx] : Defs) {
    Def->setIsKill(false);
    if (TRI->isAGPR(*MRI, Def->getReg())) {
      RS.add(*Def);
    } else { // This is a copy
      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
      SubDef->getOperand(1).setIsKill(false);
      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
    }
    RS.addImm(SubIdx);
  }

  Op->setReg(Dst);
  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    Op->setReg(Reg);
    RS->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);

  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
  // in which case we can erase them all later in runOnMachineFunction.
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();

  return true;
}
/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
/// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
static bool isAGPRCopy(const SIRegisterInfo &TRI,
                       const MachineRegisterInfo &MRI, const MachineInstr &Copy,
                       Register &OutReg, unsigned &OutSubReg) {
  assert(Copy.isCopy());

  const MachineOperand &CopySrc = Copy.getOperand(1);
  Register CopySrcReg = CopySrc.getReg();
  if (!CopySrcReg.isVirtual())
    return false;

  // Common case: copy from AGPR directly, e.g.
  //  %1:vgpr_32 = COPY %0:agpr_32
  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;
    OutSubReg = CopySrc.getSubReg();
    return true;
  }

  // Sometimes it can also involve two copies, e.g.
  //  %1:vgpr_256 = COPY %0:agpr_256
  //  %2:vgpr_32 = COPY %1:vgpr_256.sub0
  const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
  if (!CopySrcDef || !CopySrcDef->isCopy())
    return false;

  const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
  Register OtherCopySrcReg = OtherCopySrc.getReg();
  if (!OtherCopySrcReg.isVirtual() ||
      CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))
    return false;

  OutReg = OtherCopySrcReg;
  OutSubReg = CopySrc.getSubReg();
  return true;
}
// Try to hoist an AGPR to VGPR copy across a PHI.
// This should allow folding of an AGPR into a consumer which may support it.
//
// Example 1: LCSSA PHI
//   loop:
//     %1:vreg = COPY %0:areg
//   exit:
//     %2:vreg = PHI %1:vreg, %loop
//  =>
//   loop:
//   exit:
//     %1:areg = PHI %0:areg, %loop
//     %2:vreg = COPY %1:areg
//
// Example 2: PHI with multiple incoming values:
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//   loop:
//     %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
//     %3:areg = COPY %2:vreg
//     %4:areg = (instr using %3:areg)
//     %5:vreg = COPY %4:areg
//  =>
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//     %2:areg = COPY %1:vreg
//   loop:
//     %3:areg = PHI %2:areg, %entry, %4:areg, %loop
//     %4:areg = (instr using %3:areg)
bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
  assert(PHI.isPHI());

  Register PhiOut = PHI.getOperand(0).getReg();
  if (!TRI->isVGPR(*MRI, PhiOut))
    return false;

  // Iterate once over all incoming values of the PHI to check if this PHI is
  // eligible, and determine the exact AGPR RC we'll target.
  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
    if (!Copy || !Copy->isCopy())
      continue;

    Register AGPRSrc;
    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
    if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
      continue;

    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
      CopyInRC = SubRC;

    if (ARC && !ARC->hasSubClassEq(CopyInRC))
      return false;
    ARC = CopyInRC;
  }

  if (!ARC)
    return false;

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  // Rewrite the PHI's incoming values to ARC.
  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    Register Reg = MO.getReg();

    MachineBasicBlock::iterator InsertPt;
    MachineBasicBlock *InsertMBB = nullptr;

    // Look at the def of Reg, ignoring all copies.
    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {

      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
      // the copy was single-use, it will be removed by DCE later.
      if (Def->isCopy()) {
        Register AGPRSrc;
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
        if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
          MO.setReg(AGPRSrc);
          MO.setSubReg(AGPRSubReg);
          continue;
        }

        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
        // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
        // is unlikely to be profitable.
        //
        // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
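        //
        // A minimal sketch of this case (virtual register names are
        // illustrative only): for a multi-use copy
        //   %v:vgpr_32 = COPY %s:sgpr_32
        // the incoming value is rebuilt on GFX908 as
        //   %a:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %v:vgpr_32
        // rather than
        //   %a:agpr_32 = COPY %v:vgpr_32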
        MachineOperand &CopyIn = Def->getOperand(1);
        if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
      }

      InsertMBB = Def->getParent();
      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
    } else {
      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
      InsertPt = InsertMBB->getFirstTerminator();
    }

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)
                           .addReg(Reg);
    MO.setReg(NewReg);

    (void)MI;
    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
  }

  // Replace the PHI's result with a new register.
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

  // COPY that new register back to the original PhiOut register. This COPY
  // will usually be folded out later.
  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
      .addReg(NewReg);

  LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
  return true;
}
// Attempt to convert VGPR load to an AGPR load.
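//
// A minimal sketch (virtual registers and the load opcode are illustrative
// only): when every user of the loaded value is (transitively) a copy or
// reg_sequence feeding AGPRs,
//   %v:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %a:agpr_32 = COPY %v:vgpr_32
// the def is re-classed so the load writes the AGPR class directly,
//   %v:agpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
// and the remaining AGPR-to-AGPR copies can be cleaned up by later folds.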
bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
  assert(MI.mayLoad());
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
    return false;

  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();

  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  SmallVector<const MachineInstr *, 8> Users;
  SmallVector<Register, 8> MoveRegs;
  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
    Users.push_back(&I);

  if (Users.empty())
    return false;

  // Check that all uses are a copy to an agpr or a reg_sequence producing an
  // agpr.
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    // Physical registers may have more than one defining instruction.
    if (DstReg.isPhysical())
      return false;
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  LLVM_DEBUG(dbgs() << "Folded " << MI);

  return true;
}
// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
// For GFX90A and later, this is pretty much always a good thing, but for GFX908
// there are cases where it can create a lot more AGPR-AGPR copies, which are
// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
//
// This function looks at all AGPR PHIs in a basic block and collects their
// operands. Then, it checks for registers that are used more than once across
// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
// having to create one VGPR temporary per use, which can get very messy if
// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
// element).
//
// Example
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
//    %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
//    %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
// =>
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//    %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
//    %tmp_agpr:agpr_32 = COPY %tmp
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %tmp_agpr, %a, %x, %c
//    %1:areg = PHI %tmp_agpr, %a, %y, %c
//    %2:areg = PHI %tmp_agpr, %a, %z, %c
bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
  // This is only really needed on GFX908 where AGPR-AGPR copies are
  // unreasonably difficult.
  if (ST->hasGFX90AInsts())
    return false;

  // Look at all AGPR Phis and collect the register + subregister used.
  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
      RegToMO;

  for (auto &MI : MBB) {
    if (!MI.isPHI())
      break;

    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
      continue;

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);
      if (!PhiMO.getSubReg())
        continue;
      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
    }
  }

  // For all (Reg, SubReg) pairs that are used more than once, cache the value
  // in a VGPR.
  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)
      continue;

    const auto [Reg, SubReg] = Entry;
    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
    // out.
    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
    Register TempVGPR =
        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
            .addReg(Reg, /* flags */ 0, SubReg);

    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
    Register TempAGPR = MRI->createVirtualRegister(ARC);
    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
            TII->get(AMDGPU::COPY), TempAGPR)
        .addReg(TempVGPR);

    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
    for (MachineOperand *MO : MOs) {
      MO->setReg(TempAGPR);
      MO->setSubReg(AMDGPU::NoSubRegister);
      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
    }

    Changed = true;
  }

  return Changed;
}
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;
  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineOperand *CurrentKnownM0Val = nullptr;
    for (auto &MI : make_early_inc_range(*MBB)) {
      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
        Changed = true;
        continue;
      }

      if (MI.mayLoad() && tryFoldLoad(MI)) {
        Changed = true;
        continue;
      }

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        continue;
      }

      // Saw an unknown clobber of m0, so we no longer know what it is.
      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

      // TODO: Omod might be OK if there is NSZ only on the source
      // instruction, and not the omod multiply.
      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
          !tryFoldOMod(MI))
        Changed |= tryFoldClamp(MI);
    }

    Changed |= tryOptimizeAGPRPhis(*MBB);