//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-fold-operands"

using namespace llvm;

namespace {

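// One pending fold: an immediate, frame index, or register operand recorded
// against a specific operand (UseOpNo) of a specific user (UseMI). Commuted
// marks folds that required commuting the user; ShrinkOpcode, when not -1,
// names the 32-bit opcode the user should be shrunk to while folding.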
struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned char UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false, int ShrinkOp = -1) :
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isCommuted() const {
    return Commuted;
  }

  bool needsShrink() const {
    return ShrinkOpcode != -1;
  }

  int getShrinkOpcode() const {
    return ShrinkOpcode;
  }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;

  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   unsigned UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

// Wrapper around isInlineConstant that understands special cases when
// instruction types are replaced during operand folding.
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F32_e64: {
    // Special case for mac. Since this is replaced with mad when folded into
    // src2, we need to check the legality for the final instruction.
    int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    if (static_cast<int>(OpNo) == Src2Idx) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;

      unsigned Opc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
      const MCInstrDesc &MadDesc = TII->get(Opc);
      return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
    }
    return false;
  }
  default:
    return false;
  }
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

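// Applies a queued FoldCandidate to its use instruction. Immediates and frame
// indices overwrite the use operand in place (packed instructions additionally
// have their op_sel/op_sel_hi bits adjusted), register folds are rewritten with
// substVirtReg, and candidates marked needsShrink() also rewrite the VOP3
// add/sub into its 32-bit form with the carry output in VCC.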
static bool updateOperand(FoldCandidate &Fold,
                          const SIInstrInfo &TII,
                          const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
      // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
      // already set.
      unsigned Opcode = MI->getOpcode();
      int OpNo = MI->getOperandNo(&Old);
      int ModIdx = -1;
      if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
        ModIdx = AMDGPU::OpName::src0_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
        ModIdx = AMDGPU::OpName::src1_modifiers;
      else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
        ModIdx = AMDGPU::OpName::src2_modifiers;
      assert(ModIdx != -1);
      ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
      MachineOperand &Mod = MI->getOperand(ModIdx);
      unsigned Val = Mod.getImm();
      if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
        return false;
      // If upper part is all zero we do not need op_sel_hi.
      if (!isUInt<16>(Fold.ImmToFold)) {
        if (!(Fold.ImmToFold & 0xffff)) {
          Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
          Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
          return true;
        }
        Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
      }
    }

    if (Fold.needsShrink()) {
      MachineBasicBlock *MBB = MI->getParent();
      auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
      if (Liveness != MachineBasicBlock::LQR_Dead)
        return false;

      MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
      int Op32 = Fold.getShrinkOpcode();
      MachineOperand &Dst0 = MI->getOperand(0);
      MachineOperand &Dst1 = MI->getOperand(1);
      assert(Dst0.isDef() && Dst1.isDef());

      bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());

      const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
      unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
      const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
      unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);

      MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);

      if (HaveNonDbgCarryUse) {
        BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
          .addReg(AMDGPU::VCC, RegState::Kill);
      }

      // Keep the old instruction around to avoid breaking iterators, but
      // replace the outputs with dummy registers.
      Dst0.setReg(NewReg0);
      Dst1.setReg(NewReg1);

      if (Fold.isCommuted())
        TII.commuteInstruction(*Inst32, false);
      return true;
    }

    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);

    Old.setIsUndef(New->isUndef());
    return true;
  }

  // FIXME: Handle physical registers.
  return false;
}

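// Returns true if some candidate in FoldList already targets MI; in that case
// MI must not be commuted to enable another fold, or the earlier fold could
// become illegal.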
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  for (auto Candidate : FoldList) {
    if (Candidate.UseMI == MI)
      return true;
  }
  return false;
}

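// Queues OpToFold as a fold into operand OpNo of MI when it is, or can be
// made, legal there: the mac/fmac -> mad/fma promotion, the s_setreg_b32
// immediate form, and operand commutation are all tried before giving up.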
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                             MachineInstr *MI, unsigned OpNo,
                             MachineOperand *OpToFold,
                             const SIInstrInfo *TII) {
  if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {

    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned Opc = MI->getOpcode();
    if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
         Opc == AMDGPU::V_FMAC_F32_e64) &&
        (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
      unsigned NewOpc = IsFMA ?
        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);

      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_setreg_b32
    if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
      MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
      return true;
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    unsigned CommuteOpNo = OpNo;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);

    if (CanCommute) {
      if (CommuteIdx0 == OpNo)
        CommuteOpNo = CommuteIdx1;
      else if (CommuteIdx1 == OpNo)
        CommuteOpNo = CommuteIdx0;
    }

    // One of operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
                       !MI->getOperand(CommuteIdx1).isReg()))
      return false;

    if (!CanCommute ||
        !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
      return false;

    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc == AMDGPU::V_ADD_I32_e64 ||
           Opc == AMDGPU::V_SUB_I32_e64 ||
           Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
          OpToFold->isImm()) {
        MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

        // Verify the other operand is a VGPR, otherwise we would violate the
        // constant bus restriction.
        unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
        MachineOperand &OtherOp = MI->getOperand(OtherIdx);
        if (!OtherOp.isReg() ||
            !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
          return false;

        assert(MI->getOperand(1).isDef());

        int Op32 = AMDGPU::getVOPe32(Opc);
        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
                                         Op32));
        return true;
      }

      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
      return false;
    }

    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
    return true;
  }

  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
  return true;
}

// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
static bool isUseSafeToFold(const SIInstrInfo *TII,
                            const MachineInstr &MI,
                            const MachineOperand &UseMO) {
  return !UseMO.isUndef() && !TII->isSDWA(MI);
  //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}

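// Collects fold candidates for one use of the value being folded. Literal
// folds into REG_SEQUENCE are redirected to the REG_SEQUENCE's own users,
// foldable COPYs are rewritten to the appropriate mov so the immediate can be
// folded, and 64-bit immediates are split per subregister use.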
void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  unsigned UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands, only if it is a full
    // copy since a subregister use tied to a full register def doesn't really
    // make sense. e.g. don't fold:
    //
    // %1 = COPY %0:sub1
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
    //
    //  into
    // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  bool FoldingImm = OpToFold.isImm();

  if (FoldingImm && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI->getRegClass(DestReg) :
      TRI->getPhysRegClass(DestReg);

    unsigned SrcReg = UseMI->getOperand(1).getReg();
    if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
        TargetRegisterInfo::isVirtualRegister(SrcReg)) {
      const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
      if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
        MachineRegisterInfo::use_iterator NextUse;
        SmallVector<FoldCandidate, 4> CopyUses;
        for (MachineRegisterInfo::use_iterator
               Use = MRI->use_begin(DestReg), E = MRI->use_end();
             Use != E; Use = NextUse) {
          NextUse = std::next(Use);
          FoldCandidate FC = FoldCandidate(Use->getParent(),
            Use.getOperandNo(), &UseMI->getOperand(1));
          CopyUses.push_back(FC);
        }
        for (auto &F : CopyUses) {
          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
                      FoldList, CopiesToReplace);
        }
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
        TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
        TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
        !UseMI->getOperand(1).getSubReg()) {
      UseMI->getOperand(1).setReg(OpToFold.getReg());
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);
      return;
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseOp.isImplicit() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImm) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI->getRegClass(UseReg) :
      TRI->getPhysRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}

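// Evaluates a 32-bit bitwise or shift operation over two known constants.
// tryConstantFoldOp below uses this to rewrite, e.g., a v_or_b32 whose two
// sources are (materialized) constants into a single v_mov_b32 of the result.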
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

/// Remove any leftover implicit operands from mutating the instruction. e.g.
/// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
/// anymore.
static void stripExtraCopyOperands(MachineInstr &MI) {
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() +
                    Desc.getNumImplicitUses() +
                    Desc.getNumImplicitDefs();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.RemoveOperand(I);
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);
  stripExtraCopyOperands(MI);
}

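// Looks through a register operand to the immediate it was materialized from:
// if a virtual register without a subregister is defined by a move-immediate,
// the move's immediate source operand is returned instead of the register
// operand itself.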
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
                                               MachineOperand &Op) {
  if (Op.isReg()) {
    // If this has a subregister, it obviously is a register source.
    if (Op.getSubReg() != AMDGPU::NoSubRegister ||
        !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
      return &Op;

    MachineInstr *Def = MRI.getVRegDef(Op.getReg());
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &ImmSrc = Def->getOperand(1);
      if (ImmSrc.isImm())
        return &ImmSrc;
    }
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
                              const SIInstrInfo *TII,
                              MachineInstr *MI,
                              MachineOperand *ImmOp) {
  unsigned Opc = MI->getOpcode();
  if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
      Opc == AMDGPU::S_NOT_B32) {
    MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
  MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
    if (Src0->isImm() && Src0->getImm() == 0) {
      // v_lshl_or_b32 0, X, Y -> copy Y
      // v_lshl_or_b32 0, X, K -> v_mov_b32 K
      bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
      MI->RemoveOperand(Src1Idx);
      MI->RemoveOperand(Src0Idx);

      MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
      return true;
    }
  }

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    const SIRegisterInfo &TRI = TII->getRegisterInfo();
    bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->RemoveOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->RemoveOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      stripExtraCopyOperands(*MI);
    } else
      return false;

    return true;
  }

  if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
      MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
      MI->getOpcode() == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->RemoveOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one
static bool tryFoldInst(const SIInstrInfo *TII,
                        MachineInstr *MI) {
  unsigned Opc = MI->getOpcode();

  if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
      Opc == AMDGPU::V_CNDMASK_B32_e64    ||
      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
    if (Src1->isIdenticalTo(*Src0)) {
      LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (Src2Idx != -1)
        MI->RemoveOperand(Src2Idx);
      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
                                               : getMovOpc(false)));
      LLVM_DEBUG(dbgs() << *MI << '\n');
      return true;
    }
  }

  return false;
}

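// Driver for one foldable def: walks every use of MI's destination register,
// constant-folds users where possible, folds an inline-legal immediate into
// all uses but a literal into at most one, and finally applies the queued
// FoldCandidates (restoring operand order where a commuted fold failed).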
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    MachineRegisterInfo::use_iterator NextUse;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; Use = NextUse) {
      NextUse = std::next(Use);
      MachineInstr *UseMI = Use->getParent();
      unsigned OpNo = Use.getOperandNo();

      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.

      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
        LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');

        // Some constant folding cases change the same immediate's use to a new
        // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
        // instruction again. The same constant folded instruction could also
        // have a second use operand.
        NextUse = MRI->use_begin(Dst.getReg());
        continue;
      }

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
    }
  } else {
    // Folding register.
    SmallVector<MachineRegisterInfo::use_iterator, 4> UsesToProcess;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; ++Use) {
      UsesToProcess.push_back(Use);
    }
    for (auto U : UsesToProcess) {
      MachineInstr *UseMI = U->getParent();

      foldOperand(OpToFold, UseMI, U.getOperandNo(),
                  FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    if (updateOperand(Fold, *TII, *TRI)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
                        << static_cast<int>(Fold.UseOpNo) << " of "
                        << *Fold.UseMI << '\n');
      tryFoldInst(TII, Fold.UseMI);
    } else if (Fold.isCommuted()) {
      // Restoring instruction's original operand order if fold has failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
}

// Clamp patterns are canonically selected to v_max_* instructions, so only
// handle them.
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F64:
  case AMDGPU::V_PK_MAX_F16: {
    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

// We obviously have multiple uses in a clamp since the register is used twice
// in the same instruction.
static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
  int Count = 0;
  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
       I != E; ++I) {
    if (++Count > 1)
      return false;
  }

  return true;
}

// FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
  const MachineOperand *ClampSrc = isClamp(MI);
  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());

  // The type of clamp must be compatible.
  if (TII->getClampMask(*Def) != TII->getClampMask(MI))
    return false;

  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
  if (!DefClamp)
    return false;

  LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
                    << '\n');

  // Clamp is applied after omod, so it is OK if omod is set.
  DefClamp->setImm(1);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

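// Maps the constant multiplier of a v_mul_f32/v_mul_f16 to the output-modifier
// encoding: 0.5 -> DIV2, 2.0 -> MUL2, 4.0 -> MUL4, anything else -> NONE.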
static int getOModValue(unsigned Opc, int64_t Val) {
  switch (Opc) {
  case AMDGPU::V_MUL_F32_e64: {
    switch (static_cast<uint32_t>(Val)) {
    case 0x3f000000: // 0.5
      return SIOutMods::DIV2;
    case 0x40000000: // 2.0
      return SIOutMods::MUL2;
    case 0x40800000: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  case AMDGPU::V_MUL_F16_e64: {
    switch (static_cast<uint16_t>(Val)) {
    case 0x3800: // 0.5
      return SIOutMods::DIV2;
    case 0x4000: // 2.0
      return SIOutMods::MUL2;
    case 0x4400: // 4.0
      return SIOutMods::MUL4;
    default:
      return SIOutMods::NONE;
    }
  }
  default:
    llvm_unreachable("invalid mul opcode");
  }
}

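// isOMod returns the operand that survives folding a multiply or add by an
// omod-expressible constant, together with the SIOutMods value to apply, or
// {nullptr, NONE} if the instruction cannot be folded this way.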
// FIXME: Does this really not support denormals with f16?
// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
// handled, so will anything other than that break?
std::pair<const MachineOperand *, int>
SIFoldOperands::isOMod(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MUL_F32_e64:
  case AMDGPU::V_MUL_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    const MachineOperand *RegOp = nullptr;
    const MachineOperand *ImmOp = nullptr;
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (Src0->isImm()) {
      ImmOp = Src0;
      RegOp = Src1;
    } else if (Src1->isImm()) {
      ImmOp = Src1;
      RegOp = Src0;
    } else
      return std::make_pair(nullptr, SIOutMods::NONE);

    int OMod = getOModValue(Op, ImmOp->getImm());
    if (OMod == SIOutMods::NONE ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
      return std::make_pair(nullptr, SIOutMods::NONE);

    return std::make_pair(RegOp, OMod);
  }
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F16_e64: {
    // If output denormals are enabled, omod is ignored.
    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
      return std::make_pair(nullptr, SIOutMods::NONE);

    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
        Src0->getSubReg() == Src1->getSubReg() &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return std::make_pair(Src0, SIOutMods::MUL2);

    return std::make_pair(nullptr, SIOutMods::NONE);
  }
  default:
    return std::make_pair(nullptr, SIOutMods::NONE);
  }
}

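// Folds a multiply or add recognized by isOMod into the omod operand of the
// instruction defining its source, then erases the original multiply/add.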
// FIXME: Does this need to check IEEE bit on function?
bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
  const MachineOperand *RegOp;
  int OMod;
  std::tie(RegOp, OMod) = isOMod(MI);
  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
    return false;

  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    return false;

  // Clamp is applied after omod. If the source already has clamp set, don't
  // fold it.
  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    return false;

  LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');

  DefOMod->setImm(OMod);
  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}

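// Pass entry point: visits blocks in depth-first order; every instruction is
// first offered to tryFoldInst, foldable copies/moves have their source folded
// into all users via foldInstOperand, and the remaining instructions are only
// considered for clamp/omod folding.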
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
  // correctly handle signed zeros.
  bool IsIEEEMode = ST->enableIEEEBit(MF);
  bool HasNSZ = MFI->hasNoSignedZerosFPMath();

  for (MachineBasicBlock *MBB : depth_first(&MF)) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB->begin(); I != MBB->end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      tryFoldInst(TII, &MI);

      if (!TII->isFoldableCopy(MI)) {
        // TODO: Omod might be OK if there is NSZ only on the source
        // instruction, and not the omod multiply.
        if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
            !tryFoldOMod(MI))
          tryFoldClamp(MI);
        continue;
      }

      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %3 = COPY %vgpr0; VGPR_32:%3
      //    ...
      //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      foldInstOperand(MI, OpToFold);
    }
  }
  return true;
}