//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "SIFoldOperands.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {
struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false, int ShrinkOp = -1)
      : UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
        Kind(FoldOp->getType()), Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool needsShrink() const { return ShrinkOpcode != -1; }
};
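// Shared implementation used by both the legacy and new pass manager
// wrappers: scans a function for foldable copies and materialized constants
// and folds them into their users.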
class SIFoldOperandsImpl {
public:
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const MachineOperand &OpToFold) const;

  // TODO: Just use TII::getVALUOp
  unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
    switch (Opc) {
    case AMDGPU::S_ADD_I32: {
      if (ST->hasAddNoCarry())
        return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
      return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
    }
    case AMDGPU::S_OR_B32:
      return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
    case AMDGPU::S_AND_B32:
      return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
    case AMDGPU::S_MUL_I32:
      return AMDGPU::V_MUL_LO_U32_e64;
    default:
      return AMDGPU::INSTRUCTION_LIST_END;
    }
  }

  bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
                                             MachineInstr &MI) const;

  bool updateOperand(FoldCandidate &Fold) const;

  bool canUseImmWithOpSel(FoldCandidate &Fold) const;

  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        MachineOperand *OpToFold) const;
  bool isUseSafeToFold(const MachineInstr &MI,
                       const MachineOperand &UseMO) const;
  bool
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg, uint8_t OpTy) const;
  bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
                      unsigned UseOpIdx,
                      SmallVectorImpl<FoldCandidate> &FoldList) const;
  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
  bool tryConstantFoldOp(MachineInstr *MI) const;
  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
  bool tryFoldFoldableCopy(MachineInstr &MI,
                           MachineOperand *&CurrentKnownM0Val) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldPhiAGPR(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

  SIFoldOperandsImpl() = default;

  bool run(MachineFunction &MF);
};
class SIFoldOperandsLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIFoldOperandsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    if (skipFunction(MF.getFunction()))
      return false;
    return SIFoldOperandsImpl().run(MF);
  }

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperandsLegacy, DEBUG_TYPE, "SI Fold Operands", false,
                false)

char SIFoldOperandsLegacy::ID = 0;

char &llvm::SIFoldOperandsLegacyID = SIFoldOperandsLegacy::ID;
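// Return the register class of MO's virtual register, narrowed to the
// matching subregister class when MO carries a subregister index.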
static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
                                             const TargetRegisterInfo &TRI,
                                             const MachineOperand &MO) {
  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
  if (const TargetRegisterClass *SubRC =
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
    RC = SubRC;
  return RC;
}
// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_fake16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}
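// Check whether a frame-index operand can be folded into operand OpNo of
// UseMI (selected scalar/VALU ALU ops, MUBUF vaddr, or FLAT scratch address
// operands).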
// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
bool SIFoldOperandsImpl::frameIndexMayFold(
    const MachineInstr &UseMI, int OpNo, const MachineOperand &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_AND_B32:
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32:
    // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
    // to insert the wave size shift at every point we use the index.
    // TODO: Fix depending on visit order to fold immediates into the operand
    return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64:
    return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  default:
    break;
  }

  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}
/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
///
///   => %vgpr = V_ADD_U32 x, frameindex
bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
    Register DstReg, Register SrcReg, MachineInstr &MI) const {
  if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
      MRI->hasOneNonDBGUse(SrcReg)) {
    MachineInstr *Def = MRI->getVRegDef(SrcReg);
    if (!Def || Def->getNumOperands() != 4)
      return false;

    MachineOperand *Src0 = &Def->getOperand(1);
    MachineOperand *Src1 = &Def->getOperand(2);

    // TODO: This is profitable with more operand types, and for more
    // opcodes. But ultimately this is working around poor / nonexistent
    // regbankselect.
    if (!Src0->isFI() && !Src1->isFI())
      return false;

    if (Src0->isFI())
      std::swap(Src0, Src1);

    const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
    unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
    if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
        !Def->getOperand(3).isDead()) // Check if scc is dead
      return false;

    MachineBasicBlock *MBB = Def->getParent();
    const DebugLoc &DL = Def->getDebugLoc();
    if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
      MachineInstrBuilder Add =
          BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);

      if (Add->getDesc().getNumDefs() == 2) {
        Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
        Add.addDef(CarryOutReg, RegState::Dead);
        MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
      }

      Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
      if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
        Add.addImm(0);

      Def->eraseFromParent();
      MI.eraseFromParent();
      return true;
    }

    assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);

    MachineBasicBlock::LivenessQueryResult Liveness =
        MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
    if (Liveness == MachineBasicBlock::LQR_Dead) {
      // TODO: If src1 satisfies operand constraints, use vop3 version.
      BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
          .add(*Src0)
          .add(*Src1)
          .setOperandDead(3) // implicit-def $vcc
          .setMIFlags(Def->getFlags());
      Def->eraseFromParent();
      MI.eraseFromParent();
      return true;
    }
  }

  return false;
}
FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
  return new SIFoldOperandsLegacy();
}
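// Check whether Fold's immediate targets a packed (V2*16) operand of an
// instruction where op_sel-based immediate folding is supported (not MAI,
// WMMA, SWMMAC, or hazard-prone DOT instructions).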
bool SIFoldOperandsImpl::canUseImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  const uint64_t TSFlags = MI->getDesc().TSFlags;

  assert(Old.isReg() && Fold.isImm());

  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
      (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
    return false;

  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
  switch (OpType) {
  default:
    return false;
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_IMM_V2BF16:
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    break;
  }

  return true;
}
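// Fold the immediate into the packed operand, adjusting op_sel/op_sel_hi (or
// flipping V_PK_ADD_U16/V_PK_SUB_U16) so the value becomes an inline
// constant. Returns false if no encoding was found.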
bool SIFoldOperandsImpl::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

  // If the literal can be inlined as-is, apply it and short-circuit the
  // tests below. The main motivation for this is to avoid unintuitive
  // uses of opsel.
  if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  // Refer to op_sel/op_sel_hi and check if we can change the immediate and
  // op_sel in a way that allows an inline constant.
  int ModIdx = -1;
  unsigned SrcIdx = ~0;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
    ModIdx = AMDGPU::OpName::src0_modifiers;
    SrcIdx = 0;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
    ModIdx = AMDGPU::OpName::src1_modifiers;
    SrcIdx = 1;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
    ModIdx = AMDGPU::OpName::src2_modifiers;
    SrcIdx = 2;
  }
  assert(ModIdx != -1);
  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned ModVal = Mod.getImm();

  uint16_t ImmLo = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
  uint16_t ImmHi = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
  unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);

  // Helper function that attempts to inline the given value with a newly
  // chosen opsel pattern.
  auto tryFoldToInline = [&](uint32_t Imm) -> bool {
    if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
      Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
      Old.ChangeToImmediate(Imm);
      return true;
    }

    // Try to shuffle the halves around and leverage opsel to get an inline
    // constant.
    uint16_t Lo = static_cast<uint16_t>(Imm);
    uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
    if (Lo == Hi) {
      if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
        Mod.setImm(NewModVal);
        Old.ChangeToImmediate(Lo);
        return true;
      }

      if (static_cast<int16_t>(Lo) < 0) {
        int32_t SExt = static_cast<int16_t>(Lo);
        if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
          Mod.setImm(NewModVal);
          Old.ChangeToImmediate(SExt);
          return true;
        }
      }

      // This check is only useful for integer instructions
      if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
          OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) {
        if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
          Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
          return true;
        }
      }
    } else {
      uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
      if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
        Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
        Old.ChangeToImmediate(Swapped);
        return true;
      }
    }

    return false;
  };

  if (tryFoldToInline(Imm))
    return true;

  // Replace integer addition by subtraction and vice versa if it allows
  // folding the immediate to an inline constant.
  //
  // We should only ever get here for SrcIdx == 1 due to canonicalization
  // earlier in the pipeline, but we double-check here to be safe / fully
  // general.
  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
  if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
    unsigned ClampIdx =
        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;

    if (!Clamp) {
      uint16_t NegLo = -static_cast<uint16_t>(Imm);
      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
      uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

      if (tryFoldToInline(NegImm)) {
        unsigned NegOpcode =
            IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
        MI->setDesc(TII->get(NegOpcode));
        return true;
      }
    }
  }

  return false;
}
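// Apply a collected fold candidate to its use instruction: rewrite the
// operand to an immediate, global, frame index, or register, shrinking the
// VOP3 instruction to its 32-bit form when the candidate requires it.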
bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
    if (tryFoldImmWithOpSel(Fold))
      return true;

    // We can't represent the candidate as an inline constant. Try as a literal
    // with the original opsel, checking constant bus limitations.
    MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
              Dst1.getReg())
          .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    if (Fold.Commuted)
      TII->commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    if (Old.isTied()) {
      int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
      if (NewMFMAOpc == -1)
        return false;
      MI->setDesc(TII->get(NewMFMAOpc));
      MI->untieRegOperand(0);
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
}
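// Record a new fold candidate unless one is already queued for the same
// operand of the same instruction.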
static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n  " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}
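// Try to make folding OpToFold into operand OpNo of MI legal, possibly by
// mutating MI (mac->mad, s_fmac->s_fmaak/s_fmamk, s_setreg->s_setreg_imm32)
// or by commuting its operands, and queue the resulting candidate.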
bool SIFoldOperandsImpl::tryAddToFoldList(
    SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
    MachineOperand *OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold->isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    // We have to fold into operand which would be Imm not into OpNo.
    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      // Untie Src2 of fmac.
      MI->untieRegOperand(3);
      // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
      if (OpNo == 1) {
        MachineOperand &Op1 = MI->getOperand(1);
        MachineOperand &Op2 = MI->getOperand(2);
        Register OldReg = Op1.getReg();
        // Operand 2 might be an inlinable constant
        if (Op2.isImm()) {
          Op1.ChangeToImmediate(Op2.getImm());
          Op2.ChangeToRegister(OldReg, false);
        } else {
          Op1.setReg(Op2.getReg());
          Op2.setReg(OldReg);
        }
      }
      return true;
    }
    MI->setDesc(TII->get(Opc));
    return false;
  };

  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
  if (!IsLegal && OpToFold->isImm()) {
    FoldCandidate Fold(MI, OpNo, OpToFold);
    IsLegal = canUseImmWithOpSel(Fold);
  }

  if (!IsLegal) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
                      AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
      if (AddOpSel)
        MI->addOperand(MachineOperand::CreateImm(0));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      if (AddOpSel)
        MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_fmac_f32 if we are trying to fold into Src2.
    // By transforming into fmaak we can untie Src2 and make folding legal.
    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    // Special case for s_setreg_b32
    if (OpToFold->isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
    if (!CanCommute)
      return false;

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
      return false;

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
      return false;

    int Op32 = -1;
    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
          (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }

      // Verify the other operand is a VGPR, otherwise we would violate the
      // constant bus restriction.
      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||
          !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
        return false;

      assert(MI->getOperand(1).isDef());

      // Make sure to get the 32-bit version of the commuted opcode.
      unsigned MaybeCommutedOpc = MI->getOpcode();
      Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
    return true;
  }

  // Inlineable constant might have been folded into Imm operand of fmaak or
  // fmamk and we are trying to fold a non-inlinable constant.
  if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
      !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
    unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
    MachineOperand &OpImm = MI->getOperand(ImmIdx);
    if (!OpImm.isReg() &&
        TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
      return tryToFoldAsFMAAKorMK();
  }

  // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
  // By changing into fmamk we can untie Src2.
  // If folding for Src0 happens first and it is identical operand to Src1 we
  // should avoid transforming into fmamk which requires commuting as it would
  // cause folding into Src1 to fail later on due to wrong OpNo used.
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }

  // Check the case where we might introduce a second constant operand to a
  // scalar instruction
  if (TII->isSALU(MI->getOpcode())) {
    const MCInstrDesc &InstDesc = MI->getDesc();
    const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

    // Fine if the operand can be encoded as an inline constant
    if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
      // Otherwise check for another constant
      for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
        auto &Op = MI->getOperand(i);
        if (OpNo != i && !Op.isReg() &&
            !TII->isInlineConstant(Op, InstDesc.operands()[i]))
          return false;
      }
    }
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}
bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
                                         const MachineOperand &UseMO) const {
  // Operands of SDWA instructions must be registers.
  return !TII->isSDWA(MI);
}
// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking it to foldable inline immediate if possible.
// Returns true on success.
bool SIFoldOperandsImpl::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy) const {
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    MachineOperand *Sub = &Def->getOperand(I);
    assert(Sub->isReg());

    for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
         SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
         !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
         SubDef = MRI->getVRegDef(Sub->getReg())) {
      MachineOperand *Op = &SubDef->getOperand(1);
      if (Op->isImm()) {
        if (TII->isInlineConstant(*Op, OpTy))
          Sub = Op;
        break;
      }
      if (!Op->isReg() || Op->getReg().isPhysical())
        break;
      Sub = Op;
    }

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
  }

  return true;
}
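// Fold an inline immediate (given directly, through a foldable copy, or as a
// splat reg_sequence) straight into an inlinable source operand of UseMI.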
bool SIFoldOperandsImpl::tryToFoldACImm(
    const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
    return false;

  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  if (isUseMIInFoldList(FoldList, UseMI))
    return false;

  // Maybe it is just a COPY of an immediate itself.
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
      return true;
    }
  }

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, OpTy))
    return false;

  int32_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return false;

    auto SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      if (!TII->isInlineConstant(*Op, OpTy) ||
          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;

      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
  return true;
}
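// Main per-use folding routine: either rewrites the use in place (frame
// indexes, immediates into copies, readfirstlane/readlane of constants) or
// queues a FoldCandidate for later application.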
void SIFoldOperandsImpl::foldOperand(
    MachineOperand &OpToFold, MachineInstr *UseMI, int UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList,
    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, *UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp->isReg() && OpToFold.isReg() &&
      (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
    return;

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    // Grab the use operands first
    SmallVector<MachineOperand *, 4> UsesToProcess;
    for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
      UsesToProcess.push_back(&Use);
    for (auto *RSUse : UsesToProcess) {
      MachineInstr *RSUseMI = RSUse->getParent();

      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
                         RSUseMI->getOperandNo(RSUse), FoldList))
        continue;

      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
                  CopiesToReplace);
    }
    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());

    const unsigned Opc = UseMI->getOpcode();
    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
      UseMI->setDesc(TII->get(NewOpc));
    }

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      if (DestRC == &AMDGPU::AGPR_32RegClass &&
          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        CopiesToReplace.push_back(UseMI);
        return;
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
    }
    UseMI->setDesc(TII->get(MovOp));

    if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
      const auto &SrcOp = UseMI->getOperand(UseOpIdx);
      MachineOperand NewSrcOp(SrcOp);
      MachineFunction *MF = UseMI->getParent()->getParent();
      UseMI->removeOperand(1);
      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
      UseMI->addOperand(NewSrcOp);                          // src0
      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
      UseOpIdx = 2;
      UseOp = &UseMI->getOperand(UseOpIdx);
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);

      // Remove kill flags as kills may now be out of order with uses.
      MRI->clearKillFlags(OpToFold.getReg());

      // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
      // its initializers right here, so we will rematerialize immediates and
      // avoid copies via different reg classes.
      SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        const DebugLoc &DL = UseMI->getDebugLoc();
        MachineBasicBlock &MBB = *UseMI->getParent();

        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
          UseMI->removeOperand(I);

        MachineInstrBuilder B(*MBB.getParent(), UseMI);
        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
        for (unsigned I = 0; I < Size / 4; ++I) {
          MachineOperand *Def = Defs[I].first;
          TargetInstrInfo::RegSubRegPair CopyToVGPR;
          if (Def->isImm() &&
              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
            int64_t Imm = Def->getImm();

            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
            B.addReg(Tmp);
          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
            auto Src = getRegSubRegPair(*Def);
            Def->setIsKill(false);
            if (!SeenAGPRs.insert(Src)) {
              // We cannot build a reg_sequence out of the same registers, they
              // must be copied. Better do it here before copyPhysReg() created
              // several reads to do the AGPR->VGPR->AGPR copy.
              CopyToVGPR = Src;
            } else {
              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
                       Src.SubReg);
            }
          } else {
            assert(Def->isReg());
            Def->setIsKill(false);
            auto Src = getRegSubRegPair(*Def);

            // Direct copy from SGPR to AGPR is not possible. To avoid creation
            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
            // create a copy here and track if we already have such a copy.
            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
              CopyToVGPR = Src;
            } else {
              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
              B.addReg(Tmp);
            }
          }

          if (CopyToVGPR.Reg) {
            Register Vgpr;
            if (VGPRCopies.count(CopyToVGPR)) {
              Vgpr = VGPRCopies[CopyToVGPR];
            } else {
              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
              VGPRCopies[CopyToVGPR] = Vgpr;
            }
            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
            B.addReg(Tmp);
          }

          B.addImm(Defs[I].second);
        }
        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
        return;
      }

      if (Size != 4)
        return;

      Register Reg0 = UseMI->getOperand(0).getReg();
      Register Reg1 = UseMI->getOperand(1).getReg();
      if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
      else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
      else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
               TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         UseOpIdx ==
             AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() || UseOp->isImplicit() ||
        UseDesc.operands()[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      // Don't fold if OpToFold doesn't hold an aligned register.
      const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        unsigned SubReg = OpToFold.getSubReg();
        if (const TargetRegisterClass *SubRC =
                TRI->getSubRegisterClass(RC, SubReg))
          RC = SubRC;
      }

      if (!RC || !TRI->isProperlyAlignedRC(*RC))
        return;
    }

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
      TRI->getRegClass(FoldDesc.operands()[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
    Register UseReg = UseOp->getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
    if (AMDGPU::getRegBitWidth(*UseRC) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp->getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp->getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
}
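// Evaluate a 32-bit bitwise/shift opcode with both operands known, writing
// the folded value to Result. Returns false for unhandled opcodes.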
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}
static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}
static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction. e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}
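// Look through a move-immediate def: if Op is a plain virtual register
// defined by one, return that def's immediate operand, otherwise Op itself.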
MachineOperand *
SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
  // If this has a subregister, it obviously is a register source.
  if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
      !Op.getReg().isVirtual())
    return &Op;

  MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return &ImmSrc;
  }

  return &Op;
}
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;
  MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0->isImm()) {
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;
  MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}
1430 bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr
&MI
) const {
1431 unsigned Opc
= MI
.getOpcode();
1432 if (Opc
!= AMDGPU::V_CNDMASK_B32_e32
&& Opc
!= AMDGPU::V_CNDMASK_B32_e64
&&
1433 Opc
!= AMDGPU::V_CNDMASK_B64_PSEUDO
)
1436 MachineOperand
*Src0
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
1437 MachineOperand
*Src1
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
1438 if (!Src1
->isIdenticalTo(*Src0
)) {
1439 auto *Src0Imm
= getImmOrMaterializedImm(*Src0
);
1440 auto *Src1Imm
= getImmOrMaterializedImm(*Src1
);
1441 if (!Src1Imm
->isIdenticalTo(*Src0Imm
))
1446 AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src1_modifiers
);
1448 AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src0_modifiers
);
1449 if ((Src1ModIdx
!= -1 && MI
.getOperand(Src1ModIdx
).getImm() != 0) ||
1450 (Src0ModIdx
!= -1 && MI
.getOperand(Src0ModIdx
).getImm() != 0))
1453 LLVM_DEBUG(dbgs() << "Folded " << MI
<< " into ");
1455 TII
->get(Src0
->isReg() ? (unsigned)AMDGPU::COPY
: getMovOpc(false));
1456 int Src2Idx
= AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src2
);
1458 MI
.removeOperand(Src2Idx
);
1459 MI
.removeOperand(AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src1
));
1460 if (Src1ModIdx
!= -1)
1461 MI
.removeOperand(Src1ModIdx
);
1462 if (Src0ModIdx
!= -1)
1463 MI
.removeOperand(Src0ModIdx
);
1464 mutateCopyOp(MI
, NewDesc
);
1465 LLVM_DEBUG(dbgs() << MI
);
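// Erase a v_and_b32 with a 0xffff mask when the other operand is produced by
// an instruction already known to zero the high 16 bits of its result.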
bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0->isImm() || Src0->getImm() != 0xffff)
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  Register Dst = MI.getOperand(0).getReg();
  MRI->replaceRegWith(Dst, Src1);
  if (!MI.getOperand(2).isKill())
    MRI->clearKillFlags(Src1);
  MI.eraseFromParent();
  return true;
}
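// Fold OpToFold into every use of MI's destination register: constant-fold
// users where possible, then collect and apply fold candidates.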
bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
                                         MachineOperand &OpToFold) const {
  // We need mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);
  bool Changed = false;

  if (OpToFold.isImm()) {
    for (auto &UseMI :
         make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (tryConstantFoldOp(&UseMI)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
        Changed = true;
      }
    }
  }

  SmallVector<MachineOperand *, 4> UsesToProcess;
  for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
    UsesToProcess.push_back(&Use);
  for (auto *U : UsesToProcess) {
    MachineInstr *UseMI = U->getParent();
    foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
                CopiesToReplace);
  }

  if (CopiesToReplace.empty() && FoldList.empty())
    return Changed;

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    assert(!Fold.isReg() || Fold.OpToFold);
    if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
      Register Reg = Fold.OpToFold->getReg();
      MachineInstr *DefMI = Fold.OpToFold->getParent();
      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
        continue;
    }
    if (updateOperand(Fold)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI);
    } else if (Fold.Commuted) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
  return true;
}
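// Process one foldable copy/mov: drop redundant rewrites of m0, try to fold
// the source into all users, and erase the chain of copies once it is dead.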
bool SIFoldOperandsImpl::tryFoldFoldableCopy(
    MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
  Register DstReg = MI.getOperand(0).getReg();
  // Specially track simple redefs of m0 to the same value in a block, so we
  // can erase the later ones.
  if (DstReg == AMDGPU::M0) {
    MachineOperand &NewM0Val = MI.getOperand(1);
    if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
      MI.eraseFromParent();
      return true;
    }

    // We aren't tracking other physical registers
    CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
                            ? nullptr
                            : &NewM0Val;
    return false;
  }

  MachineOperand *OpToFoldPtr;
  if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
    // Folding when any src_modifiers are non-zero is unsupported
    if (TII->hasAnyModifiersSet(MI))
      return false;
    OpToFoldPtr = &MI.getOperand(2);
  } else {
    OpToFoldPtr = &MI.getOperand(1);
  }
  MachineOperand &OpToFold = *OpToFoldPtr;
  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  // FIXME: We could also be folding things like TargetIndexes.
  if (!FoldingImm && !OpToFold.isReg())
    return false;

  if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
    return false;

  // Prevent folding operands backwards in the function. For example,
  // the COPY opcode must not be replaced by 1 in this example:
  //
  //    %3 = COPY %vgpr0; VGPR_32:%3
  //    ...
  //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
  if (!DstReg.isVirtual())
    return false;

  if (OpToFold.isReg() &&
      foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
    return true;

  bool Changed = foldInstOperand(MI, OpToFold);

  // If we managed to fold all uses of this copy then we might as well
  // delete it now.
  // The only reason we need to follow chains of copies here is that
  // tryFoldRegSequence looks forward through copies before folding a
  // REG_SEQUENCE into its eventual users.
  auto *InstToErase = &MI;
  while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    auto &SrcOp = InstToErase->getOperand(1);
    auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
    InstToErase->eraseFromParent();
    Changed = true;
    InstToErase = nullptr;
    if (!SrcReg || SrcReg.isPhysical())
      break;
    InstToErase = MRI->getVRegDef(SrcReg);
    if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
      break;
  }

  if (InstToErase && InstToErase->isRegSequence() &&
      MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
    InstToErase->eraseFromParent();
    Changed = true;
  }

  return Changed;
}
// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
const MachineOperand *
SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_PK_MAX_F16: {
    if (MI.mayRaiseFPException())
      return nullptr;

    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
                                                      : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}
// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  if (Def->mayRaiseFPException())
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);

  Register DefReg = Def->getOperand(0).getReg();
  Register MIDstReg = MI.getOperand(0).getReg();
  if (TRI->isSGPRReg(*MRI, DefReg)) {
    // Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
    // instruction with a VGPR dst.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            MIDstReg)
        .addReg(DefReg);
  } else {
    MRI->replaceRegWith(MIDstReg, DefReg);
  }
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mad/fma form.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}
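// Map the multiplier constant of a v_mul (0.5, 2.0 or 4.0 in the operand's
// floating-point width) to the corresponding output-modifier encoding.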
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64: {
    switch (Val) {
    case 0x3fe0000000000000: // 0.5
      return SIOutMods::DIV2;
    case 0x4000000000000000: // 2.0
      return SIOutMods::MUL2;
    case 0x4010000000000000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}
// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F64_e64:
  case AMDGPU::V_MUL_F64_pseudo_e64:
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_t16_e64:
  case AMDGPU::V_MUL_F16_fake16_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
          Op == AMDGPU::V_MUL_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output !=
             DenormalMode::PreserveSign) ||
        MI.mayRaiseFPException())
      return std::pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::pair(nullptr, SIOutMods::NONE);

    return std::pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_t16_e64:
  case AMDGPU::V_ADD_F16_fake16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 &&
         MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
          Op == AMDGPU::V_ADD_F16_fake16_e64) &&
         MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
      return std::pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::pair(Src0, SIOutMods::MUL2);

    return std::pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::pair(nullptr, SIOutMods::NONE);
  }
}
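
// In short, the shapes recognized above are (placeholder vregs, modifier
// operands elided):
//   %y = V_MUL_F*_e64 K, %x  with K in {0.5, 2.0, 4.0}  -->  {%x, DIV2/MUL2/MUL4}
//   %y = V_ADD_F*_e64 %x, %x                            -->  {%x, MUL2}
// Any other form, or any modifier already set, yields {nullptr, SIOutMods::NONE}.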
// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  if (Def->mayRaiseFPException())
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());

  // Kill flags can be wrong if we replaced a def inside a loop with a def
  // outside the loop.
  MRI->clearKillFlags(Def->getOperand(0).getReg());
  MI.eraseFromParent();

  // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
  // instruction, so we might as well convert it to the more flexible VOP3-only
  // mac/fmac instruction.
  if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
    Def->eraseFromParent();

  return true;
}
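
// Illustrative sketch of the omod fold performed above (placeholder vregs,
// modifier operands elided): given
//   %x = V_ADD_F32_e64 %a, %b        ; omod currently NONE
//   %y = V_MUL_F32_e64 2.0, %x       ; sole non-debug user of %x
// the multiply is absorbed into the defining add's output-modifier field:
//   %y = V_ADD_F32_e64 %a, %b        ; omod = MUL2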
// Try to fold a reg_sequence with vgpr output and agpr inputs into an
// instruction which can take an agpr. So far that means a store.
bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
  assert(MI.isRegSequence());
  auto Reg = MI.getOperand(0).getReg();

  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
      !MRI->hasOneNonDBGUse(Reg))
    return false;

  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
    return false;

  for (auto &[Op, SubIdx] : Defs) {
    if (!Op->isReg())
      return false;
    if (TRI->isAGPR(*MRI, Op->getReg()))
      continue;
    // Maybe this is a COPY from AREG
    const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
      return false;
    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
      return false;
  }

  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
  MachineInstr *UseMI = Op->getParent();
  while (UseMI->isCopy() && !Op->getSubReg()) {
    Reg = UseMI->getOperand(0).getReg();
    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
      return false;
    Op = &*MRI->use_nodbg_begin(Reg);
    UseMI = Op->getParent();
  }

  if (Op->getSubReg())
    return false;

  unsigned OpIdx = Op - &UseMI->getOperand(0);
  const MCInstrDesc &InstDesc = UseMI->getDesc();
  const TargetRegisterClass *OpRC =
      TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
  if (!OpRC || !TRI->isVectorSuperClass(OpRC))
    return false;

  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
  auto Dst = MRI->createVirtualRegister(NewDstRC);
  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::REG_SEQUENCE), Dst);

  for (auto &[Def, SubIdx] : Defs) {
    Def->setIsKill(false);
    if (TRI->isAGPR(*MRI, Def->getReg())) {
      RS.add(*Def);
    } else { // This is a copy
      MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
      SubDef->getOperand(1).setIsKill(false);
      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
    }
    RS.addImm(SubIdx);
  }

  Op->setReg(Dst);
  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
    Op->setReg(Reg);
    RS->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);

  // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
  // in which case we can erase them all later in runOnMachineFunction.
  if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
    MI.eraseFromParent();
  return true;
}
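
// Illustrative sketch of the rewrite above (placeholder vregs; the particular
// AGPR-capable store opcode is immaterial):
//   %v:vreg_128 = REG_SEQUENCE %a0:agpr_32, %subreg.sub0, ..., %a3:agpr_32, %subreg.sub3
//   GLOBAL_STORE_DWORDX4 %ptr, %v, ...
// becomes a REG_SEQUENCE into a new AGPR-class vreg that the store uses
// directly, and the now-dead VGPR REG_SEQUENCE is erased.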
/// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
/// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
static bool isAGPRCopy(const SIRegisterInfo &TRI,
                       const MachineRegisterInfo &MRI, const MachineInstr &Copy,
                       Register &OutReg, unsigned &OutSubReg) {
  assert(Copy.isCopy());

  const MachineOperand &CopySrc = Copy.getOperand(1);
  Register CopySrcReg = CopySrc.getReg();
  if (!CopySrcReg.isVirtual())
    return false;

  // Common case: copy from AGPR directly, e.g.
  //  %1:vgpr_32 = COPY %0:agpr_32
  if (TRI.isAGPR(MRI, CopySrcReg)) {
    OutReg = CopySrcReg;
    OutSubReg = CopySrc.getSubReg();
    return true;
  }

  // Sometimes it can also involve two copies, e.g.
  //  %1:vgpr_256 = COPY %0:agpr_256
  //  %2:vgpr_32 = COPY %1:vgpr_256.sub0
  const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
  if (!CopySrcDef || !CopySrcDef->isCopy())
    return false;

  const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
  Register OtherCopySrcReg = OtherCopySrc.getReg();
  if (!OtherCopySrcReg.isVirtual() ||
      CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
      OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
      !TRI.isAGPR(MRI, OtherCopySrcReg))
    return false;

  OutReg = OtherCopySrcReg;
  OutSubReg = CopySrc.getSubReg();
  return true;
}
// Try to hoist an AGPR to VGPR copy across a PHI.
// This should allow folding of an AGPR into a consumer which may support it.
//
// Example 1: LCSSA PHI
//   loop:
//     %1:vreg = COPY %0:areg
//   exit:
//     %2:vreg = PHI %1:vreg, %loop
//  =>
//   loop:
//   exit:
//     %1:areg = PHI %0:areg, %loop
//     %2:vreg = COPY %1:areg
//
// Example 2: PHI with multiple incoming values:
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//   loop:
//     %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
//     %3:areg = COPY %2:vreg
//     %4:areg = (instr using %3:areg)
//     %5:vreg = COPY %4:areg
//  =>
//   entry:
//     %1:vreg = GLOBAL_LOAD(..)
//     %2:areg = COPY %1:vreg
//   loop:
//     %3:areg = PHI %2:areg, %entry, %X:areg, %loop
//     %4:areg = (instr using %3:areg)
bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
  assert(PHI.isPHI());

  Register PhiOut = PHI.getOperand(0).getReg();
  if (!TRI->isVGPR(*MRI, PhiOut))
    return false;

  // Iterate once over all incoming values of the PHI to check if this PHI is
  // eligible, and determine the exact AGPR RC we'll target.
  const TargetRegisterClass *ARC = nullptr;
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
    if (!Copy || !Copy->isCopy())
      continue;

    Register AGPRSrc;
    unsigned AGPRRegMask = AMDGPU::NoSubRegister;
    if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
      continue;

    const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
    if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
      CopyInRC = SubRC;

    if (ARC && !ARC->hasSubClassEq(CopyInRC))
      return false;
    ARC = CopyInRC;
  }

  if (!ARC)
    return false;

  bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

  // Rewrite the PHI's incoming values to ARC.
  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    MachineOperand &MO = PHI.getOperand(K);
    Register Reg = MO.getReg();

    MachineBasicBlock::iterator InsertPt;
    MachineBasicBlock *InsertMBB = nullptr;

    // Look at the def of Reg, ignoring all copies.
    unsigned CopyOpc = AMDGPU::COPY;
    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
      // the copy was single-use, it will be removed by DCE later.
      if (Def->isCopy()) {
        Register AGPRSrc;
        unsigned AGPRSubReg = AMDGPU::NoSubRegister;
        if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
          MO.setReg(AGPRSrc);
          MO.setSubReg(AGPRSubReg);
          continue;
        }

        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
        // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
        // is unlikely to be profitable.
        //
        // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
        MachineOperand &CopyIn = Def->getOperand(1);
        if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
          CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
      }

      InsertMBB = Def->getParent();
      InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
    } else {
      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
      InsertPt = InsertMBB->getFirstTerminator();
    }

    Register NewReg = MRI->createVirtualRegister(ARC);
    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
                               TII->get(CopyOpc), NewReg)
                           .addReg(Reg);
    MO.setReg(NewReg);

    (void)MI;
    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
  }

  // Replace the PHI's result with a new register.
  Register NewReg = MRI->createVirtualRegister(ARC);
  PHI.getOperand(0).setReg(NewReg);

  // COPY that new register back to the original PhiOut register. This COPY will
  // usually be folded out later.
  MachineBasicBlock *MBB = PHI.getParent();
  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
          TII->get(AMDGPU::COPY), PhiOut)
      .addReg(NewReg);

  LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
  return true;
}
// Attempt to convert a VGPR load to an AGPR load.
bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
  assert(MI.mayLoad());
  if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
    return false;

  MachineOperand &Def = MI.getOperand(0);
  if (!Def.isDef())
    return false;

  Register DefReg = Def.getReg();

  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
    return false;

  SmallVector<const MachineInstr *, 8> Users;
  SmallVector<Register, 8> MoveRegs;
  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
    Users.push_back(&I);

  if (Users.empty())
    return false;

  // Check that every use is a copy to an agpr or a reg_sequence producing an
  // agpr.
  while (!Users.empty()) {
    const MachineInstr *I = Users.pop_back_val();
    if (!I->isCopy() && !I->isRegSequence())
      return false;
    Register DstReg = I->getOperand(0).getReg();
    // Physical registers may have more than one defining instruction.
    if (DstReg.isPhysical())
      return false;
    if (TRI->isAGPR(*MRI, DstReg))
      continue;
    MoveRegs.push_back(DstReg);
    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
      Users.push_back(&U);
  }

  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
  if (!TII->isOperandLegal(MI, 0, &Def)) {
    MRI->setRegClass(DefReg, RC);
    return false;
  }

  while (!MoveRegs.empty()) {
    Register Reg = MoveRegs.pop_back_val();
    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
  }

  LLVM_DEBUG(dbgs() << "Folded " << MI);

  return true;
}
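
// Illustrative sketch (placeholder vregs; the specific load opcode is
// immaterial): if every transitive use of the loaded value ends up in an AGPR,
// e.g.
//   %v:vgpr_32 = GLOBAL_LOAD_DWORD %ptr, ...
//   %a:agpr_32 = COPY %v
// then %v (and any intermediate copy/reg_sequence results) are simply retyped
// to the equivalent AGPR register class, provided the load accepts an AGPR def.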
// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
// For GFX90A and later, this is pretty much always a good thing, but for GFX908
// there are cases where it can create a lot more AGPR-AGPR copies, which are
// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
//
// This function looks at all AGPR PHIs in a basic block and collects their
// operands. Then, it checks for registers that are used more than once across
// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
// having to create one VGPR temporary per use, which can get very messy if
// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
// element).
//
// Example
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
//    %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
//    %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
// =>
//  a:
//    %in:agpr_256 = COPY %foo:vgpr_256
//    %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
//    %tmp_agpr:agpr_32 = COPY %tmp
//  c:
//    %x:agpr_32 = ..
//  b:
//    %0:areg = PHI %tmp_agpr, %a, %x, %c
//    %1:areg = PHI %tmp_agpr, %a, %y, %c
//    %2:areg = PHI %tmp_agpr, %a, %z, %c
bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
  // This is only really needed on GFX908 where AGPR-AGPR copies are
  // unreasonably difficult.
  if (ST->hasGFX90AInsts())
    return false;

  // Look at all AGPR Phis and collect the register + subregister used.
  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
      RegToMO;

  for (auto &MI : MBB) {
    if (!MI.isPHI())
      break;

    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
      continue;

    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
      MachineOperand &PhiMO = MI.getOperand(K);
      if (!PhiMO.getSubReg())
        continue;
      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
    }
  }

  // For all (Reg, SubReg) pairs that are used more than once, cache the value
  // in a VGPR.
  bool Changed = false;
  for (const auto &[Entry, MOs] : RegToMO) {
    if (MOs.size() == 1)
      continue;

    const auto [Reg, SubReg] = Entry;
    MachineInstr *Def = MRI->getVRegDef(Reg);
    MachineBasicBlock *DefMBB = Def->getParent();

    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
    // out.
    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
    Register TempVGPR =
        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
    MachineInstr *VGPRCopy =
        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
            .addReg(Reg, /* flags */ 0, SubReg);

    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
    Register TempAGPR = MRI->createVirtualRegister(ARC);
    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
            TII->get(AMDGPU::COPY), TempAGPR)
        .addReg(TempVGPR);

    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
    for (MachineOperand *MO : MOs) {
      MO->setReg(TempAGPR);
      MO->setSubReg(AMDGPU::NoSubRegister);
      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
    }

    Changed = true;
  }

  return Changed;
}
bool SIFoldOperandsImpl::run(MachineFunction &MF) {
  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  //
  // FIXME: Also need to check strictfp
  bool IsIEEEMode = MFI->getMode().IEEE;
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  bool Changed = false;
  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineOperand *CurrentKnownM0Val = nullptr;
    for (auto &MI : make_early_inc_range(*MBB)) {
      Changed |= tryFoldCndMask(MI);

      if (tryFoldZeroHighBits(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
        Changed = true;
        continue;
      }

      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
        Changed = true;
        continue;
      }

      if (MI.mayLoad() && tryFoldLoad(MI)) {
        Changed = true;
        continue;
      }

      if (TII->isFoldableCopy(MI)) {
        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
        continue;
      }

      // Saw an unknown clobber of m0, so we no longer know what it is.
      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
        CurrentKnownM0Val = nullptr;

      // TODO: Omod might be OK if there is NSZ only on the source
      // instruction, and not the omod multiply.
      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
          !tryFoldOMod(MI))
        Changed |= tryFoldClamp(MI);
    }

    Changed |= tryOptimizeAGPRPhis(*MBB);
  }

  return Changed;
}
PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
                                          MachineFunctionAnalysisManager &) {
  bool Changed = SIFoldOperandsImpl().run(MF);
  if (!Changed) {
    return PreservedAnalyses::all();
  }
  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();