//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

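// A deferred fold: records which operand of which use instruction should be
// rewritten and what it should become (a register, an immediate, or a frame
// index), plus whether the use had to be commuted and, if applicable, the
// 32-bit opcode the use must be shrunk to.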
struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false,
                int ShrinkOp = -1) :
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isCommuted() const {
    return Commuted;
  }

  bool needsShrink() const {
    return ShrinkOpcode != -1;
  }

  int getShrinkOpcode() const {
    return ShrinkOpcode;
  }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   unsigned UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F32_e64: {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      unsigned Opc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
      const MCInstrDesc &MadDesc = TII->get(Opc);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
    return false;
  }
  default:
    return false;
  }
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

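// Apply the rewrite recorded in Fold to its use operand. Returns false if the
// fold turns out to be illegal at this point (e.g. op_sel is already set on a
// packed operand, or VCC is not dead where a shrunken carry-out would need
// it), leaving the instruction unchanged.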
static bool updateOperand(FoldCandidate &Fold,
                          const SIInstrInfo &TII,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
        return false;
      // If upper part is all zero we do not need op_sel_hi.
      if (!isUInt<16>(Fold.ImmToFold)) {
        if (!(Fold.ImmToFold & 0xffff)) {
          Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
          Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
          return true;
        }
        Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
      }
    }

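    // Folding an immediate here may require shrinking a 64-bit encoded
    // add/sub to its 32-bit form, whose carry-out implicitly writes VCC, so
    // this is only legal when VCC is known to be dead at this point.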
    if (Fold.needsShrink()) {
      MachineBasicBlock *MBB = MI->getParent();
      auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
      if (Liveness != MachineBasicBlock::LQR_Dead)
        return false;

      MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
      int Op32 = Fold.getShrinkOpcode();
      MachineOperand &Dst0 = MI->getOperand(0);
      MachineOperand &Dst1 = MI->getOperand(1);
      assert(Dst0.isDef() && Dst1.isDef());

      bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());

      const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
      unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
      const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
      unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);

      MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);

      if (HaveNonDbgCarryUse) {
        BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
          .addReg(AMDGPU::VCC, RegState::Kill);
      }

      // Keep the old instruction around to avoid breaking iterators, but
      // replace the outputs with dummy registers.
      Dst0.setReg(NewReg0);
      Dst1.setReg(NewReg1);

      if (Fold.isCommuted())
        TII.commuteInstruction(*Inst32, false);
      return true;
    }

    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);

    Old.setIsUndef(New->isUndef());
    return true;
  }

  // FIXME: Handle physical registers.

  return false;
}

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

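// Queue a fold of OpToFold into operand OpNo of MI. If the operand is not
// directly legal, progressively more invasive fallbacks are tried: rewriting
// v_mac/v_fmac into its mad/fma form, switching s_setreg_b32 to the immediate
// variant, and finally commuting the instruction.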
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
         Opc == AMDGPU::V_FMAC_F32_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
      unsigned NewOpc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
      return true;
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    unsigned CommuteOpNo = OpNo;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        CommuteOpNo = CommuteIdx0;
    }

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
           Opc == AMDGPU::V_SUB_I32_e64 ||
           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
          OpToFold->isImm()) {
        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

        // Verify the other operand is a VGPR, otherwise we would violate the
        // constant bus restriction.
        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
        if (!OtherOp.isReg() ||
            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
          return false;

        assert(MI->getOperand(1).isDef());

        int Op32 = AMDGPU::getVOPe32(Opc);
        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
                                         Op32));
        return true;
      }

      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
    return true;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef() && !TII->isSDWA(MI);
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

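// Try to fold OpToFold into the use at UseOpIdx of UseMI, recursing through
// REG_SEQUENCE results and SGPR->VGPR copies where necessary. Successful
// candidates are appended to FoldList; copies that must be turned into movs
// are recorded in CopiesToReplace.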
void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  unsigned UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands, only if it is a full
    // copy since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %1 = COPY %0:sub1
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
    //
    //  into
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  bool FoldingImm = OpToFold.isImm();

  if (FoldingImm && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI->getRegClass(DestReg) :
      TRI->getPhysRegClass(DestReg);

    unsigned SrcReg = UseMI->getOperand(1).getReg();
    if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
        TargetRegisterInfo::isVirtualRegister(SrcReg)) {
      const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
      if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
        MachineRegisterInfo::use_iterator NextUse;
        SmallVector<FoldCandidate, 4> CopyUses;
        for (MachineRegisterInfo::use_iterator
               Use = MRI->use_begin(DestReg), E = MRI->use_end();
             Use != E; Use = NextUse) {
          NextUse = std::next(Use);
          FoldCandidate FC = FoldCandidate(Use->getParent(),
                                           Use.getOperandNo(),
                                           &UseMI->getOperand(1));
          CopyUses.push_back(FC);
        }
        for (auto &F : CopyUses) {
          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
                      FoldList, CopiesToReplace);
        }
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
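    // Not the immediate-into-copy case: only queue a fold if the use is an
    // explicit operand with a defined register class.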
    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseOp.isImplicit() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImm) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunites. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI->getRegClass(UseReg) :
      TRI->getPhysRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}

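// Constant-fold a 32-bit binary ALU operation. Shift amounts are masked to the
// low five bits, mirroring the hardware behaviour for out-of-range shifts.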
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

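// If Op is a virtual register whose definition is a move-immediate, return the
// immediate operand of that def; otherwise return Op itself.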
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister ||
        !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}

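// Called while an immediate definition is being folded: once the constant is
// visible, a user may collapse to a mov, a copy, or a smaller operation.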
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI,
                              MachineOperand *ImmOp) {
  unsigned Opc = MI->getOpcode();
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
    if (Src0->isImm() && Src0->getImm() == 0) {
      // v_lshl_or_b32 0, X, Y -> copy Y
      // v_lshl_or_b32 0, X, K -> v_mov_b32 K
      bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
      MI->RemoveOperand(Src1Idx);
      MI->RemoveOperand(Src0Idx);

      MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
      return true;
    }
  }

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one
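// e.g. a v_cndmask_b32 whose two value sources are identical reduces to a copy
// (or a v_mov_b32 of the immediate), regardless of the condition operand.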
static bool tryFoldInst(const SIInstrInfo *TII,
                        MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
      Opc == AMDGPU::V_CNDMASK_B32_e64 ||
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
    if (Src1->isIdenticalTo(*Src0)) {
      LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (Src2Idx != -1)
        MI->RemoveOperand(Src2Idx);
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
                                               : getMovOpc(false)));
      LLVM_DEBUG(dbgs() << *MI << '\n');
      return true;
    }
  }

  return false;
}

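// Fold the source operand of a foldable copy/mov (OpToFold) into the uses of
// its result register. Candidates are collected into FoldList first and only
// applied afterwards, because rewriting operands invalidates the use iterator.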
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    MachineRegisterInfo::use_iterator NextUse;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; Use = NextUse) {
      NextUse = std::next(Use);
      MachineInstr *UseMI = Use->getParent();
      unsigned OpNo = Use.getOperandNo();

      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');

        // Some constant folding cases change the same immediate's use to a new
        // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
        // again. The same constant folded instruction could also have a second
        // use operand.
        NextUse = MRI->use_begin(Dst.getReg());
        FoldList.clear();
        continue;
      }

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //   v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
    }
  } else {
    // Folding register.
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; ++Use) {
      MachineInstr *UseMI = Use->getParent();

      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
                  FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    if (updateOperand(Fold, *TII, *TRI)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI << '\n');
      tryFoldInst(TII, Fold.UseMI);
    } else if (Fold.isCommuted()) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
}

// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// We obviously have multiple uses in a clamp since the register is used twice
// in the same instruction.
static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
  int Count = 0;
  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
       I != E; ++I) {
    if (++Count > 1)
      return false;
  }

  return true;
}

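// If MI clamps a value produced by an instruction that itself supports the
// clamp bit, set clamp on that def and delete MI.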
// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
                    << '\n');

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

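// Map a multiplier immediate (0.5, 2.0 or 4.0, in the width of the multiply)
// to the corresponding output-modifier encoding, or SIOutMods::NONE.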
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::make_pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::make_pair(nullptr, SIOutMods::NONE);

    return std::make_pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::make_pair(Src0, SIOutMods::MUL2);

    return std::make_pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::make_pair(nullptr, SIOutMods::NONE);
  }
}

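// If MI is a multiply (or an x+x add) that matches an output modifier and the
// defining instruction has no omod or clamp set, fold the modifier into that
// def and delete MI.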
// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

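// Walk the function in depth-first block order. Foldable copies have the
// source of their definition folded into their uses; other instructions are
// candidates for omod/clamp folding (omod is skipped when IEEE mode is on or
// signed zeros must be preserved).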
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  bool IsIEEEMode = ST->enableIEEEBit(MF);
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB->begin(); I != MBB->end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      tryFoldInst(TII, &MI);

      if (!TII->isFoldableCopy(MI)) {
        // TODO: Omod might be OK if there is NSZ only on the source
        // instruction, and not the omod multiply.
        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
            !tryFoldOMod(MI))
          tryFoldClamp(MI);

        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %3 = COPY %vgpr0; VGPR_32:%3
      //    ...
      //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      foldInstOperand(MI, OpToFold);
    }
  }
  return true;
}