1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 /// The pass tries to use the 32-bit encoding for instructions when possible.
8 //===----------------------------------------------------------------------===//
12 #include "GCNSubtarget.h"
13 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14 #include "Utils/AMDGPUBaseInfo.h"
15 #include "llvm/ADT/Statistic.h"
16 #include "llvm/CodeGen/MachineFunctionPass.h"
18 #define DEBUG_TYPE "si-shrink-instructions"
// Pass statistics.
// NumInstructionsShrunk is incremented in runOnMachineFunction right after
// buildShrunkInst() has produced the 32-bit replacement instruction.
20 STATISTIC(NumInstructionsShrunk
,
21 "Number of 64-bit instruction reduced to 32-bit.");
// NumLiteralConstantsFolded is incremented in foldImmediates each time the
// source of a move-immediate has been folded into an instruction's src0.
22 STATISTIC(NumLiteralConstantsFolded
,
23 "Number of literal constants folded into 32-bit instructions.");
29 class SIShrinkInstructions
: public MachineFunctionPass
{
31 MachineRegisterInfo
*MRI
;
32 const GCNSubtarget
*ST
;
33 const SIInstrInfo
*TII
;
34 const SIRegisterInfo
*TRI
;
40 SIShrinkInstructions() : MachineFunctionPass(ID
) {
43 bool foldImmediates(MachineInstr
&MI
, bool TryToCommute
= true) const;
44 bool shouldShrinkTrue16(MachineInstr
&MI
) const;
45 bool isKImmOperand(const MachineOperand
&Src
) const;
46 bool isKUImmOperand(const MachineOperand
&Src
) const;
47 bool isKImmOrKUImmOperand(const MachineOperand
&Src
, bool &IsUnsigned
) const;
48 void copyExtraImplicitOps(MachineInstr
&NewMI
, MachineInstr
&MI
) const;
49 void shrinkScalarCompare(MachineInstr
&MI
) const;
50 void shrinkMIMG(MachineInstr
&MI
) const;
51 void shrinkMadFma(MachineInstr
&MI
) const;
52 bool shrinkScalarLogicOp(MachineInstr
&MI
) const;
53 bool tryReplaceDeadSDST(MachineInstr
&MI
) const;
54 bool instAccessReg(iterator_range
<MachineInstr::const_mop_iterator
> &&R
,
55 Register Reg
, unsigned SubReg
) const;
56 bool instReadsReg(const MachineInstr
*MI
, unsigned Reg
,
57 unsigned SubReg
) const;
58 bool instModifiesReg(const MachineInstr
*MI
, unsigned Reg
,
59 unsigned SubReg
) const;
60 TargetInstrInfo::RegSubRegPair
getSubRegForIndex(Register Reg
, unsigned Sub
,
62 void dropInstructionKeepingImpDefs(MachineInstr
&MI
) const;
63 MachineInstr
*matchSwap(MachineInstr
&MovT
) const;
65 bool runOnMachineFunction(MachineFunction
&MF
) override
;
67 StringRef
getPassName() const override
{ return "SI Shrink Instructions"; }
69 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
71 MachineFunctionPass::getAnalysisUsage(AU
);
75 } // End anonymous namespace.
// Pass registration: the pass is registered under the DEBUG_TYPE string
// ("si-shrink-instructions") with the display name "SI Shrink Instructions".
// NOTE(review): the meaning of the two trailing `false` arguments is defined by
// the INITIALIZE_PASS macro, which is not visible here — confirm there.
77 INITIALIZE_PASS(SIShrinkInstructions
, DEBUG_TYPE
,
78 "SI Shrink Instructions", false, false)
// Definition of the pass identifier that the constructor hands to
// MachineFunctionPass.
80 char SIShrinkInstructions::ID
= 0;
/// Factory entry point: allocates a new SIShrinkInstructions pass object with
/// `new` and returns it through a FunctionPass pointer.
82 FunctionPass
*llvm::createSIShrinkInstructionsPass() {
83 return new SIShrinkInstructions();
86 /// This function checks \p MI for operands defined by a move immediate
87 /// instruction and then folds the literal constant into the instruction if it
88 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
///
/// Only the src0 operand is inspected. What can be folded is the source of the
/// defining move when it is an immediate, a frame index or a global address.
///
/// \param MI the instruction whose src0 is examined and possibly rewritten.
/// \param TryToCommute when true and \p MI is commutable, \p MI is commuted and
/// the fold is retried once; the retry passes false, so it cannot recurse
/// further.
///
/// NOTE(review): the return statements fall on lines that are not part of this
/// view. The recursive call site tests the result as a condition, so the value
/// presumably means "a constant was folded" — confirm against the full file.
89 bool SIShrinkInstructions::foldImmediates(MachineInstr
&MI
,
90 bool TryToCommute
) const {
91 assert(TII
->isVOP1(MI
) || TII
->isVOP2(MI
) || TII
->isVOPC(MI
));
93 int Src0Idx
= AMDGPU::getNamedOperandIdx(MI
.getOpcode(), AMDGPU::OpName::src0
);
96 MachineOperand
&Src0
= MI
.getOperand(Src0Idx
);
// Folding is attempted only for a virtual register whose unique definition is
// a move-immediate.
98 Register Reg
= Src0
.getReg();
99 if (Reg
.isVirtual()) {
100 MachineInstr
*Def
= MRI
->getUniqueVRegDef(Reg
);
101 if (Def
&& Def
->isMoveImmediate()) {
// Operand 1 of the move is the value being materialised.
102 MachineOperand
&MovSrc
= Def
->getOperand(1);
103 bool ConstantFolded
= false;
// Substitute the move's source only if the target reports it as a legal src0
// for MI; the three supported operand kinds are handled one by one.
105 if (TII
->isOperandLegal(MI
, Src0Idx
, &MovSrc
)) {
106 if (MovSrc
.isImm()) {
107 Src0
.ChangeToImmediate(MovSrc
.getImm());
108 ConstantFolded
= true;
109 } else if (MovSrc
.isFI()) {
110 Src0
.ChangeToFrameIndex(MovSrc
.getIndex());
111 ConstantFolded
= true;
112 } else if (MovSrc
.isGlobal()) {
113 Src0
.ChangeToGA(MovSrc
.getGlobal(), MovSrc
.getOffset(),
114 MovSrc
.getTargetFlags());
115 ConstantFolded
= true;
119 if (ConstantFolded
) {
// The defining move is erased once Reg has no remaining non-debug uses.
120 if (MRI
->use_nodbg_empty(Reg
))
121 Def
->eraseFromParent();
122 ++NumLiteralConstantsFolded
;
129 // We have failed to fold src0, so commute the instruction and try again.
130 if (TryToCommute
&& MI
.isCommutable()) {
131 if (TII
->commuteInstruction(MI
)) {
132 if (foldImmediates(MI
, false))
// NOTE(review): second commute of the same instruction — presumably restores
// the original operand order when the retry did not fold; the lines between
// the retry and this call are not visible here, confirm.
136 TII
->commuteInstruction(MI
);
/// Decides whether a True16 instruction may be shrunk, based on the registers
/// named by its explicit operands.
///
/// The visible code walks the explicit operands of \p MI, asserts that the
/// operand's register is not virtual, and tests whether that register is a
/// member of VGPR_32 but not of VGPR_32_Lo128.
///
/// NOTE(review): the statement guarded by that test and the function's return
/// statements are on lines not present in this view; presumably a register
/// outside the Lo128 class vetoes shrinking — confirm against the full file.
/// The sentence in the original comment below is likewise cut short.
143 /// Do not shrink the instruction if its registers are not expressible in the
145 bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr
&MI
) const {
146 for (unsigned I
= 0, E
= MI
.getNumExplicitOperands(); I
!= E
; ++I
) {
147 const MachineOperand
&MO
= MI
.getOperand(I
);
149 Register Reg
= MO
.getReg();
// Callers are expected to reach this point only after register allocation.
150 assert(!Reg
.isVirtual() && "Prior checks should ensure we only shrink "
151 "True16 Instructions post-RA");
// In VGPR_32 but outside the VGPR_32_Lo128 subset.
152 if (AMDGPU::VGPR_32RegClass
.contains(Reg
) &&
153 !AMDGPU::VGPR_32_Lo128RegClass
.contains(Reg
))
/// \returns true if the immediate held by \p Src, after SignExtend64 is applied
/// with a width of 32, fits in a signed 16-bit integer and the target does not
/// already treat the operand as an inline constant.
///
/// \p Src must be an immediate operand: getImm() is called unconditionally.
160 bool SIShrinkInstructions::isKImmOperand(const MachineOperand
&Src
) const {
161 return isInt
<16>(SignExtend64(Src
.getImm(), 32)) &&
162 !TII
->isInlineConstant(*Src
.getParent(), Src
.getOperandNo());
/// Unsigned counterpart of isKImmOperand.
///
/// \returns true if the immediate held by \p Src fits in an unsigned 16-bit
/// integer and the target does not already treat the operand as an inline
/// constant.
///
/// \p Src must be an immediate operand: getImm() is called unconditionally.
165 bool SIShrinkInstructions::isKUImmOperand(const MachineOperand
&Src
) const {
166 return isUInt
<16>(Src
.getImm()) &&
167 !TII
->isInlineConstant(*Src
.getParent(), Src
.getOperandNo());
/// Combined signed/unsigned 16-bit immediate test.
///
/// The signed interpretation is tried first, then the unsigned one. In either
/// case the result is true only when the target does not treat \p Src as an
/// inline constant.
///
/// \param Src immediate operand to test (getImm() is called unconditionally).
/// \param IsUnsigned output flag.
/// NOTE(review): the assignments to \p IsUnsigned and the final fall-through
/// return are on lines not present in this view; presumably the flag is false
/// on the signed path and true on the unsigned path — confirm.
170 bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand
&Src
,
171 bool &IsUnsigned
) const {
// Signed 16-bit check.
172 if (isInt
<16>(SignExtend64(Src
.getImm(), 32))) {
174 return !TII
->isInlineConstant(Src
);
// Unsigned 16-bit check.
177 if (isUInt
<16>(Src
.getImm())) {
179 return !TII
->isInlineConstant(Src
);
185 /// \returns the opcode of an instruction a move immediate of the constant \p
186 /// Src can be replaced with if the constant is replaced with \p ModifiedImm.
189 /// If the bitreverse of a constant is an inline immediate, reverse the
190 /// immediate and return the bitreverse opcode.
192 /// If the bitwise negation of a constant is an inline immediate, invert the
193 /// immediate and return the bitwise not opcode.
///
/// \param TII instruction info used for the inline-constant queries.
/// \param Src immediate operand of the move (getImm() is truncated to 32 bits).
/// \param ModifiedImm output; overwritten with each candidate value as it is
/// tried, so it is only meaningful when a rewrite opcode is returned.
/// \param Scalar selects S_BREV_B32 over V_BFREV_B32_e32 for the bit-reverse
/// case.
///
/// NOTE(review): the early-exit and "no rewrite" return statements are on lines
/// not present in this view. Callers compare the result against 0, so 0 is
/// presumably the "no rewrite" value — confirm.
194 static unsigned canModifyToInlineImmOp32(const SIInstrInfo
*TII
,
195 const MachineOperand
&Src
,
196 int32_t &ModifiedImm
, bool Scalar
) {
// Nothing to gain when the constant is already an inline constant.
197 if (TII
->isInlineConstant(Src
))
199 int32_t SrcImm
= static_cast<int32_t>(Src
.getImm());
202 // We could handle the scalar case here, but we would need to check
203 // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth
204 // it, as the reasonable values are already covered by s_movk_i32.
// Candidate 1: bitwise complement.
205 ModifiedImm
= ~SrcImm
;
206 if (TII
->isInlineConstant(APInt(32, ModifiedImm
)))
207 return AMDGPU::V_NOT_B32_e32
;
// Candidate 2: bit reversal.
210 ModifiedImm
= reverseBits
<int32_t>(SrcImm
);
211 if (TII
->isInlineConstant(APInt(32, ModifiedImm
)))
212 return Scalar
? AMDGPU::S_BREV_B32
: AMDGPU::V_BFREV_B32_e32
;
217 /// Copy implicit register operands from specified instruction to this
218 /// instruction that are not part of the instruction definition.
///
/// \param NewMI instruction that receives the copied operands.
/// \param MI instruction the operands are read from; it is not modified here.
///
/// Operands that are register masks are copied as well as implicit registers.
219 void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr
&NewMI
,
220 MachineInstr
&MI
) const {
221 MachineFunction
&MF
= *MI
.getMF();
// Start just past everything the instruction description accounts for: its
// declared operands plus its declared implicit uses and implicit defs.
222 for (unsigned i
= MI
.getDesc().getNumOperands() +
223 MI
.getDesc().implicit_uses().size() +
224 MI
.getDesc().implicit_defs().size(),
225 e
= MI
.getNumOperands();
227 const MachineOperand
&MO
= MI
.getOperand(i
);
228 if ((MO
.isReg() && MO
.isImplicit()) || MO
.isRegMask())
229 NewMI
.addOperand(MF
, MO
);
/// Tries to rewrite a scalar compare into its SOPK (16-bit immediate) form.
///
/// The SOPK opcode comes from AMDGPU::getSOPKOp(). The unsigned eq/ne opcodes
/// get special treatment because their immediate may be encoded either signed
/// or unsigned. All other opcodes require the immediate to fit the
/// signedness reported by SIInstrInfo::sopkIsZext().
///
/// NOTE(review): several guards and early returns (for example the checks on
/// Src0/Src1 and on the getSOPKOp() result) are on lines not present in this
/// view, as is the use of NewDesc — confirm against the full file.
233 void SIShrinkInstructions::shrinkScalarCompare(MachineInstr
&MI
) const {
237 // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
238 // get constants on the RHS.
239 if (!MI
.getOperand(0).isReg())
240 TII
->commuteInstruction(MI
, false, 0, 1);
242 // cmpk requires src0 to be a register
243 const MachineOperand
&Src0
= MI
.getOperand(0);
247 MachineOperand
&Src1
= MI
.getOperand(1);
251 int SOPKOpc
= AMDGPU::getSOPKOp(MI
.getOpcode());
255 // eq/ne is special because the imm16 can be treated as signed or unsigned,
256 // and initially selected to the unsigned versions.
257 if (SOPKOpc
== AMDGPU::S_CMPK_EQ_U32
|| SOPKOpc
== AMDGPU::S_CMPK_LG_U32
) {
259 if (isKImmOrKUImmOperand(Src1
, HasUImm
)) {
// Switch to the signed opcode variant.
// NOTE(review): the condition selecting this path (presumably on HasUImm) is
// on a line not visible here — confirm.
261 SOPKOpc
= (SOPKOpc
== AMDGPU::S_CMPK_EQ_U32
) ?
262 AMDGPU::S_CMPK_EQ_I32
: AMDGPU::S_CMPK_LG_I32
;
// NOTE(review): this site calls SignExtend32 with a width of 32, whereas the
// other immediate rewrites in this file use SignExtend64(..., 32) — confirm the
// width is intended.
263 Src1
.setImm(SignExtend32(Src1
.getImm(), 32));
266 MI
.setDesc(TII
->get(SOPKOpc
));
272 const MCInstrDesc
&NewDesc
= TII
->get(SOPKOpc
);
// The immediate must fit the signedness of the SOPK opcode: unsigned 16-bit
// for zero-extending opcodes, signed 16-bit otherwise.
274 if ((SIInstrInfo::sopkIsZext(SOPKOpc
) && isKUImmOperand(Src1
)) ||
275 (!SIInstrInfo::sopkIsZext(SOPKOpc
) && isKImmOperand(Src1
))) {
// For the signed opcodes the stored immediate is rewritten in sign-extended
// form.
276 if (!SIInstrInfo::sopkIsZext(SOPKOpc
))
277 Src1
.setImm(SignExtend64(Src1
.getImm(), 32));
282 // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
283 void SIShrinkInstructions::shrinkMIMG(MachineInstr
&MI
) const {
284 const AMDGPU::MIMGInfo
*Info
= AMDGPU::getMIMGInfo(MI
.getOpcode());
289 switch (Info
->MIMGEncoding
) {
290 case AMDGPU::MIMGEncGfx10NSA
:
291 NewEncoding
= AMDGPU::MIMGEncGfx10Default
;
293 case AMDGPU::MIMGEncGfx11NSA
:
294 NewEncoding
= AMDGPU::MIMGEncGfx11Default
;
301 AMDGPU::getNamedOperandIdx(MI
.getOpcode(), AMDGPU::OpName::vaddr0
);
302 unsigned NewAddrDwords
= Info
->VAddrDwords
;
303 const TargetRegisterClass
*RC
;
305 if (Info
->VAddrDwords
== 2) {
306 RC
= &AMDGPU::VReg_64RegClass
;
307 } else if (Info
->VAddrDwords
== 3) {
308 RC
= &AMDGPU::VReg_96RegClass
;
309 } else if (Info
->VAddrDwords
== 4) {
310 RC
= &AMDGPU::VReg_128RegClass
;
311 } else if (Info
->VAddrDwords
== 5) {
312 RC
= &AMDGPU::VReg_160RegClass
;
313 } else if (Info
->VAddrDwords
== 6) {
314 RC
= &AMDGPU::VReg_192RegClass
;
315 } else if (Info
->VAddrDwords
== 7) {
316 RC
= &AMDGPU::VReg_224RegClass
;
317 } else if (Info
->VAddrDwords
== 8) {
318 RC
= &AMDGPU::VReg_256RegClass
;
319 } else if (Info
->VAddrDwords
== 9) {
320 RC
= &AMDGPU::VReg_288RegClass
;
321 } else if (Info
->VAddrDwords
== 10) {
322 RC
= &AMDGPU::VReg_320RegClass
;
323 } else if (Info
->VAddrDwords
== 11) {
324 RC
= &AMDGPU::VReg_352RegClass
;
325 } else if (Info
->VAddrDwords
== 12) {
326 RC
= &AMDGPU::VReg_384RegClass
;
328 RC
= &AMDGPU::VReg_512RegClass
;
332 unsigned VgprBase
= 0;
333 unsigned NextVgpr
= 0;
335 bool IsKill
= NewAddrDwords
== Info
->VAddrDwords
;
336 const unsigned NSAMaxSize
= ST
->getNSAMaxSize();
337 const bool IsPartialNSA
= NewAddrDwords
> NSAMaxSize
;
338 const unsigned EndVAddr
= IsPartialNSA
? NSAMaxSize
: Info
->VAddrOperands
;
339 for (unsigned Idx
= 0; Idx
< EndVAddr
; ++Idx
) {
340 const MachineOperand
&Op
= MI
.getOperand(VAddr0Idx
+ Idx
);
341 unsigned Vgpr
= TRI
->getHWRegIndex(Op
.getReg());
342 unsigned Dwords
= TRI
->getRegSizeInBits(Op
.getReg(), *MRI
) / 32;
343 assert(Dwords
> 0 && "Un-implemented for less than 32 bit regs");
347 NextVgpr
= Vgpr
+ Dwords
;
348 } else if (Vgpr
== NextVgpr
) {
349 NextVgpr
= Vgpr
+ Dwords
;
360 if (VgprBase
+ NewAddrDwords
> 256)
363 // Further check for implicit tied operands - this may be present if TFE is
365 int TFEIdx
= AMDGPU::getNamedOperandIdx(MI
.getOpcode(), AMDGPU::OpName::tfe
);
366 int LWEIdx
= AMDGPU::getNamedOperandIdx(MI
.getOpcode(), AMDGPU::OpName::lwe
);
367 unsigned TFEVal
= (TFEIdx
== -1) ? 0 : MI
.getOperand(TFEIdx
).getImm();
368 unsigned LWEVal
= (LWEIdx
== -1) ? 0 : MI
.getOperand(LWEIdx
).getImm();
370 if (TFEVal
|| LWEVal
) {
371 // TFE/LWE is enabled so we need to deal with an implicit tied operand
372 for (unsigned i
= LWEIdx
+ 1, e
= MI
.getNumOperands(); i
!= e
; ++i
) {
373 if (MI
.getOperand(i
).isReg() && MI
.getOperand(i
).isTied() &&
374 MI
.getOperand(i
).isImplicit()) {
375 // This is the tied operand
378 "found more than one tied implicit operand when expecting only 1");
380 MI
.untieRegOperand(ToUntie
);
385 unsigned NewOpcode
= AMDGPU::getMIMGOpcode(Info
->BaseOpcode
, NewEncoding
,
386 Info
->VDataDwords
, NewAddrDwords
);
387 MI
.setDesc(TII
->get(NewOpcode
));
388 MI
.getOperand(VAddr0Idx
).setReg(RC
->getRegister(VgprBase
));
389 MI
.getOperand(VAddr0Idx
).setIsUndef(IsUndef
);
390 MI
.getOperand(VAddr0Idx
).setIsKill(IsKill
);
392 for (unsigned i
= 1; i
< EndVAddr
; ++i
)
393 MI
.removeOperand(VAddr0Idx
+ 1);
397 AMDGPU::getNamedOperandIdx(MI
.getOpcode(), AMDGPU::OpName::vdata
),
398 ToUntie
- (EndVAddr
- 1));
402 // Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
403 void SIShrinkInstructions::shrinkMadFma(MachineInstr
&MI
) const {
404 // Pre-GFX10 VOP3 instructions like MAD/FMA cannot take a literal operand so
405 // there is no reason to try to shrink them.
406 if (!ST
->hasVOP3Literal())
409 // There is no advantage to doing this pre-RA.
410 if (!MF
->getProperties().hasProperty(
411 MachineFunctionProperties::Property::NoVRegs
))
414 if (TII
->hasAnyModifiersSet(MI
))
417 const unsigned Opcode
= MI
.getOpcode();
418 MachineOperand
&Src0
= *TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
419 MachineOperand
&Src1
= *TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
420 MachineOperand
&Src2
= *TII
->getNamedOperand(MI
, AMDGPU::OpName::src2
);
421 unsigned NewOpcode
= AMDGPU::INSTRUCTION_LIST_END
;
425 // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
426 if (Src2
.isImm() && !TII
->isInlineConstant(Src2
)) {
427 if (Src1
.isReg() && TRI
->isVGPR(*MRI
, Src1
.getReg()))
429 else if (Src0
.isReg() && TRI
->isVGPR(*MRI
, Src0
.getReg()))
436 llvm_unreachable("Unexpected mad/fma opcode!");
437 case AMDGPU::V_MAD_F32_e64
:
438 NewOpcode
= AMDGPU::V_MADAK_F32
;
440 case AMDGPU::V_FMA_F32_e64
:
441 NewOpcode
= AMDGPU::V_FMAAK_F32
;
443 case AMDGPU::V_MAD_F16_e64
:
444 NewOpcode
= AMDGPU::V_MADAK_F16
;
446 case AMDGPU::V_FMA_F16_e64
:
447 case AMDGPU::V_FMA_F16_gfx9_e64
:
448 NewOpcode
= ST
->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
449 : AMDGPU::V_FMAAK_F16
;
454 // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
455 if (Src2
.isReg() && TRI
->isVGPR(*MRI
, Src2
.getReg())) {
456 if (Src1
.isImm() && !TII
->isInlineConstant(Src1
))
458 else if (Src0
.isImm() && !TII
->isInlineConstant(Src0
))
465 llvm_unreachable("Unexpected mad/fma opcode!");
466 case AMDGPU::V_MAD_F32_e64
:
467 NewOpcode
= AMDGPU::V_MADMK_F32
;
469 case AMDGPU::V_FMA_F32_e64
:
470 NewOpcode
= AMDGPU::V_FMAMK_F32
;
472 case AMDGPU::V_MAD_F16_e64
:
473 NewOpcode
= AMDGPU::V_MADMK_F16
;
475 case AMDGPU::V_FMA_F16_e64
:
476 case AMDGPU::V_FMA_F16_gfx9_e64
:
477 NewOpcode
= ST
->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
478 : AMDGPU::V_FMAMK_F16
;
483 if (NewOpcode
== AMDGPU::INSTRUCTION_LIST_END
)
486 if (AMDGPU::isTrue16Inst(NewOpcode
) && !shouldShrinkTrue16(MI
))
490 // Swap Src0 and Src1 by building a new instruction.
491 BuildMI(*MI
.getParent(), MI
, MI
.getDebugLoc(), TII
->get(NewOpcode
),
492 MI
.getOperand(0).getReg())
496 .setMIFlags(MI
.getFlags());
497 MI
.eraseFromParent();
499 TII
->removeModOperands(MI
);
500 MI
.setDesc(TII
->get(NewOpcode
));
504 /// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
505 /// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
506 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or
507 /// XNOR (as a ^ b == ~(a ^ ~b)).
508 /// \returns true if the caller should continue the machine function iterator
509 bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr
&MI
) const {
510 unsigned Opc
= MI
.getOpcode();
511 const MachineOperand
*Dest
= &MI
.getOperand(0);
512 MachineOperand
*Src0
= &MI
.getOperand(1);
513 MachineOperand
*Src1
= &MI
.getOperand(2);
514 MachineOperand
*SrcReg
= Src0
;
515 MachineOperand
*SrcImm
= Src1
;
517 if (!SrcImm
->isImm() ||
518 AMDGPU::isInlinableLiteral32(SrcImm
->getImm(), ST
->hasInv2PiInlineImm()))
521 uint32_t Imm
= static_cast<uint32_t>(SrcImm
->getImm());
524 if (Opc
== AMDGPU::S_AND_B32
) {
525 if (isPowerOf2_32(~Imm
)) {
526 NewImm
= llvm::countr_one(Imm
);
527 Opc
= AMDGPU::S_BITSET0_B32
;
528 } else if (AMDGPU::isInlinableLiteral32(~Imm
, ST
->hasInv2PiInlineImm())) {
530 Opc
= AMDGPU::S_ANDN2_B32
;
532 } else if (Opc
== AMDGPU::S_OR_B32
) {
533 if (isPowerOf2_32(Imm
)) {
534 NewImm
= llvm::countr_zero(Imm
);
535 Opc
= AMDGPU::S_BITSET1_B32
;
536 } else if (AMDGPU::isInlinableLiteral32(~Imm
, ST
->hasInv2PiInlineImm())) {
538 Opc
= AMDGPU::S_ORN2_B32
;
540 } else if (Opc
== AMDGPU::S_XOR_B32
) {
541 if (AMDGPU::isInlinableLiteral32(~Imm
, ST
->hasInv2PiInlineImm())) {
543 Opc
= AMDGPU::S_XNOR_B32
;
546 llvm_unreachable("unexpected opcode");
550 if (Dest
->getReg().isVirtual() && SrcReg
->isReg()) {
551 MRI
->setRegAllocationHint(Dest
->getReg(), 0, SrcReg
->getReg());
552 MRI
->setRegAllocationHint(SrcReg
->getReg(), 0, Dest
->getReg());
556 if (SrcReg
->isReg() && SrcReg
->getReg() == Dest
->getReg()) {
557 const bool IsUndef
= SrcReg
->isUndef();
558 const bool IsKill
= SrcReg
->isKill();
559 MI
.setDesc(TII
->get(Opc
));
560 if (Opc
== AMDGPU::S_BITSET0_B32
||
561 Opc
== AMDGPU::S_BITSET1_B32
) {
562 Src0
->ChangeToImmediate(NewImm
);
563 // Remove the immediate and add the tied input.
564 MI
.getOperand(2).ChangeToRegister(Dest
->getReg(), /*IsDef*/ false,
565 /*isImp*/ false, IsKill
,
566 /*isDead*/ false, IsUndef
);
567 MI
.tieOperands(0, 2);
569 SrcImm
->setImm(NewImm
);
577 // This is the same as MachineInstr::readsRegister/modifiesRegister except
578 // it takes subregs into account.
//
// R is the operand range to scan; Reg/SubReg identify the register (and
// optional sub-register index) being looked for.
//
// Two cases are distinguished per operand:
//  * both registers physical: they are compared with TRI->regsOverlap();
//  * same virtual register: the lane masks of SubReg and of the operand's own
//    sub-register index are intersected.
//
// NOTE(review): the operand filter at the top of the loop, the test applied to
// Overlap and all return statements are on lines not present in this view —
// presumably any overlap yields true and the fall-through yields false;
// confirm against the full file.
579 bool SIShrinkInstructions::instAccessReg(
580 iterator_range
<MachineInstr::const_mop_iterator
> &&R
, Register Reg
,
581 unsigned SubReg
) const {
582 for (const MachineOperand
&MO
: R
) {
586 if (Reg
.isPhysical() && MO
.getReg().isPhysical()) {
587 if (TRI
->regsOverlap(Reg
, MO
.getReg()))
589 } else if (MO
.getReg() == Reg
&& Reg
.isVirtual()) {
590 LaneBitmask Overlap
= TRI
->getSubRegIndexLaneMask(SubReg
) &
591 TRI
->getSubRegIndexLaneMask(MO
.getSubReg());
/// Sub-register-aware read query: forwards the operand range returned by
/// MI->uses() to instAccessReg() and returns its result.
///
/// \param MI instruction to inspect.
/// \param Reg register being looked for.
/// \param SubReg sub-register index of \p Reg that is of interest.
599 bool SIShrinkInstructions::instReadsReg(const MachineInstr
*MI
, unsigned Reg
,
600 unsigned SubReg
) const {
601 return instAccessReg(MI
->uses(), Reg
, SubReg
);
/// Sub-register-aware write query: forwards the operand range returned by
/// MI->defs() to instAccessReg() and returns its result.
///
/// \param MI instruction to inspect.
/// \param Reg register being looked for.
/// \param SubReg sub-register index of \p Reg that is of interest.
604 bool SIShrinkInstructions::instModifiesReg(const MachineInstr
*MI
, unsigned Reg
,
605 unsigned SubReg
) const {
606 return instAccessReg(MI
->defs(), Reg
, SubReg
);
/// Maps a register / sub-register index pair plus a channel number `I` to the
/// pair that names that single channel.
///
/// When \p Reg is exactly 32 bits wide the inputs are returned unchanged.
/// Otherwise:
///  * for a physical register, \p Reg itself is replaced by its sub-register
///    for channel I;
///  * the other visible assignment rewrites \p Sub to the sub-register index
///    for channel (I + channel of \p Sub).
///
/// NOTE(review): the third parameter `I` is declared on a line not present in
/// this view, as is the `else` that presumably separates the two assignments
/// (physical vs. virtual register) — confirm against the full file.
609 TargetInstrInfo::RegSubRegPair
610 SIShrinkInstructions::getSubRegForIndex(Register Reg
, unsigned Sub
,
612 if (TRI
->getRegSizeInBits(Reg
, *MRI
) != 32) {
613 if (Reg
.isPhysical()) {
614 Reg
= TRI
->getSubReg(Reg
, TRI
->getSubRegFromChannel(I
));
616 Sub
= TRI
->getSubRegFromChannel(I
+ TRI
->getChannelFromSubReg(Sub
));
619 return TargetInstrInfo::RegSubRegPair(Reg
, Sub
);
/// Erases \p MI from its parent block, first emitting IMPLICIT_DEF
/// instructions in front of it for operands that lie beyond those accounted
/// for by the instruction description.
///
/// NOTE(review): the loop condition/increment and the filter deciding which of
/// those operands get an IMPLICIT_DEF are on lines not present in this view;
/// going by the function name it is presumably the implicit register defs —
/// confirm against the full file.
622 void SIShrinkInstructions::dropInstructionKeepingImpDefs(
623 MachineInstr
&MI
) const {
// Start just past the description's declared operands, implicit uses and
// implicit defs.
624 for (unsigned i
= MI
.getDesc().getNumOperands() +
625 MI
.getDesc().implicit_uses().size() +
626 MI
.getDesc().implicit_defs().size(),
627 e
= MI
.getNumOperands();
629 const MachineOperand
&Op
= MI
.getOperand(i
);
// Inserted before MI, reusing MI's debug location.
632 BuildMI(*MI
.getParent(), MI
.getIterator(), MI
.getDebugLoc(),
633 TII
->get(AMDGPU::IMPLICIT_DEF
), Op
.getReg());
636 MI
.eraseFromParent();
646 // mov t, x (t is potentially dead and move eliminated)
649 // Returns the next valid instruction pointer if it was able to create v_swap_b32.
651 // This shall not be done too early, so as not to prevent possible folding which may
652 // remove matched moves, and this should preferably be done before RA to
653 // release saved registers and also possibly after RA which can insert copies
656 // This is really just a generic peephole that is not a canonical shrinking,
657 // although requirements match the pass placement and it reduces code size too.
658 MachineInstr
*SIShrinkInstructions::matchSwap(MachineInstr
&MovT
) const {
659 assert(MovT
.getOpcode() == AMDGPU::V_MOV_B32_e32
||
660 MovT
.getOpcode() == AMDGPU::COPY
);
662 Register T
= MovT
.getOperand(0).getReg();
663 unsigned Tsub
= MovT
.getOperand(0).getSubReg();
664 MachineOperand
&Xop
= MovT
.getOperand(1);
668 Register X
= Xop
.getReg();
669 unsigned Xsub
= Xop
.getSubReg();
671 unsigned Size
= TII
->getOpSize(MovT
, 0) / 4;
673 if (!TRI
->isVGPR(*MRI
, X
))
676 const unsigned SearchLimit
= 16;
678 bool KilledT
= false;
679 for (auto Iter
= std::next(MovT
.getIterator()),
680 E
= MovT
.getParent()->instr_end();
681 Iter
!= E
&& Count
< SearchLimit
&& !KilledT
; ++Iter
, ++Count
) {
683 MachineInstr
*MovY
= &*Iter
;
684 KilledT
= MovY
->killsRegister(T
, TRI
);
686 if ((MovY
->getOpcode() != AMDGPU::V_MOV_B32_e32
&&
687 MovY
->getOpcode() != AMDGPU::COPY
) ||
688 !MovY
->getOperand(1).isReg() ||
689 MovY
->getOperand(1).getReg() != T
||
690 MovY
->getOperand(1).getSubReg() != Tsub
)
693 Register Y
= MovY
->getOperand(0).getReg();
694 unsigned Ysub
= MovY
->getOperand(0).getSubReg();
696 if (!TRI
->isVGPR(*MRI
, Y
))
699 MachineInstr
*MovX
= nullptr;
700 for (auto IY
= MovY
->getIterator(), I
= std::next(MovT
.getIterator());
702 if (instReadsReg(&*I
, X
, Xsub
) || instModifiesReg(&*I
, Y
, Ysub
) ||
703 instModifiesReg(&*I
, T
, Tsub
) ||
704 (MovX
&& instModifiesReg(&*I
, X
, Xsub
))) {
708 if (!instReadsReg(&*I
, Y
, Ysub
)) {
709 if (!MovX
&& instModifiesReg(&*I
, X
, Xsub
)) {
716 (I
->getOpcode() != AMDGPU::V_MOV_B32_e32
&&
717 I
->getOpcode() != AMDGPU::COPY
) ||
718 I
->getOperand(0).getReg() != X
||
719 I
->getOperand(0).getSubReg() != Xsub
) {
724 if (Size
> 1 && (I
->getNumImplicitOperands() > (I
->isCopy() ? 0U : 1U)))
733 LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT
<< *MovX
<< *MovY
);
735 for (unsigned I
= 0; I
< Size
; ++I
) {
736 TargetInstrInfo::RegSubRegPair X1
, Y1
;
737 X1
= getSubRegForIndex(X
, Xsub
, I
);
738 Y1
= getSubRegForIndex(Y
, Ysub
, I
);
739 MachineBasicBlock
&MBB
= *MovT
.getParent();
740 auto MIB
= BuildMI(MBB
, MovX
->getIterator(), MovT
.getDebugLoc(),
741 TII
->get(AMDGPU::V_SWAP_B32
))
742 .addDef(X1
.Reg
, 0, X1
.SubReg
)
743 .addDef(Y1
.Reg
, 0, Y1
.SubReg
)
744 .addReg(Y1
.Reg
, 0, Y1
.SubReg
)
745 .addReg(X1
.Reg
, 0, X1
.SubReg
).getInstr();
746 if (MovX
->hasRegisterImplicitUseOperand(AMDGPU::EXEC
)) {
747 // Drop implicit EXEC.
748 MIB
->removeOperand(MIB
->getNumExplicitOperands());
749 MIB
->copyImplicitOps(*MBB
.getParent(), *MovX
);
752 MovX
->eraseFromParent();
753 dropInstructionKeepingImpDefs(*MovY
);
754 MachineInstr
*Next
= &*std::next(MovT
.getIterator());
756 if (T
.isVirtual() && MRI
->use_nodbg_empty(T
)) {
757 dropInstructionKeepingImpDefs(MovT
);
759 Xop
.setIsKill(false);
760 for (int I
= MovT
.getNumImplicitOperands() - 1; I
>= 0; --I
) {
761 unsigned OpNo
= MovT
.getNumExplicitOperands() + I
;
762 const MachineOperand
&Op
= MovT
.getOperand(OpNo
);
763 if (Op
.isKill() && TRI
->regsOverlap(X
, Op
.getReg()))
764 MovT
.removeOperand(OpNo
);
774 // If an instruction has dead sdst replace it with NULL register on gfx1030+
//
// Steps visible here:
//  1. bail out unless the subtarget reports hasGFX10_3Insts();
//  2. look up the named `sdst` operand of MI;
//  3. bail out if that register is physical or still has non-debug uses;
//  4. otherwise point the operand at SGPR_NULL (wave32) or SGPR_NULL64.
//
// NOTE(review): the return statements and any null check on the looked-up
// operand are on lines not present in this view; presumably the result is true
// only when the operand was replaced — confirm against the full file.
775 bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr
&MI
) const {
776 if (!ST
->hasGFX10_3Insts())
779 MachineOperand
*Op
= TII
->getNamedOperand(MI
, AMDGPU::OpName::sdst
);
782 Register SDstReg
= Op
->getReg();
// Only a virtual register with no remaining non-debug uses is treated as dead.
783 if (SDstReg
.isPhysical() || !MRI
->use_nodbg_empty(SDstReg
))
786 Op
->setReg(ST
->isWave32() ? AMDGPU::SGPR_NULL
: AMDGPU::SGPR_NULL64
);
790 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction
&MF
) {
791 if (skipFunction(MF
.getFunction()))
795 MRI
= &MF
.getRegInfo();
796 ST
= &MF
.getSubtarget
<GCNSubtarget
>();
797 TII
= ST
->getInstrInfo();
798 TRI
= &TII
->getRegisterInfo();
800 unsigned VCCReg
= ST
->isWave32() ? AMDGPU::VCC_LO
: AMDGPU::VCC
;
802 std::vector
<unsigned> I1Defs
;
804 for (MachineFunction::iterator BI
= MF
.begin(), BE
= MF
.end();
807 MachineBasicBlock
&MBB
= *BI
;
808 MachineBasicBlock::iterator I
, Next
;
809 for (I
= MBB
.begin(); I
!= MBB
.end(); I
= Next
) {
811 MachineInstr
&MI
= *I
;
813 if (MI
.getOpcode() == AMDGPU::V_MOV_B32_e32
) {
814 // If this has a literal constant source that is the same as the
815 // reversed bits of an inline immediate, replace with a bitreverse of
816 // that constant. This saves 4 bytes in the common case of materializing
819 // Test if we are after regalloc. We only want to do this after any
820 // optimizations happen because this will confuse them.
821 // XXX - not exactly a check for post-regalloc run.
822 MachineOperand
&Src
= MI
.getOperand(1);
823 if (Src
.isImm() && MI
.getOperand(0).getReg().isPhysical()) {
826 canModifyToInlineImmOp32(TII
, Src
, ModImm
, /*Scalar=*/false);
827 if (ModOpcode
!= 0) {
828 MI
.setDesc(TII
->get(ModOpcode
));
829 Src
.setImm(static_cast<int64_t>(ModImm
));
835 if (ST
->hasSwap() && (MI
.getOpcode() == AMDGPU::V_MOV_B32_e32
||
836 MI
.getOpcode() == AMDGPU::COPY
)) {
837 if (auto *NextMI
= matchSwap(MI
)) {
838 Next
= NextMI
->getIterator();
843 // Try to use S_ADDK_I32 and S_MULK_I32.
844 if (MI
.getOpcode() == AMDGPU::S_ADD_I32
||
845 MI
.getOpcode() == AMDGPU::S_MUL_I32
) {
846 const MachineOperand
*Dest
= &MI
.getOperand(0);
847 MachineOperand
*Src0
= &MI
.getOperand(1);
848 MachineOperand
*Src1
= &MI
.getOperand(2);
850 if (!Src0
->isReg() && Src1
->isReg()) {
851 if (TII
->commuteInstruction(MI
, false, 1, 2))
852 std::swap(Src0
, Src1
);
855 // FIXME: This could work better if hints worked with subregisters. If
856 // we have a vector add of a constant, we usually don't get the correct
857 // allocation due to the subregister usage.
858 if (Dest
->getReg().isVirtual() && Src0
->isReg()) {
859 MRI
->setRegAllocationHint(Dest
->getReg(), 0, Src0
->getReg());
860 MRI
->setRegAllocationHint(Src0
->getReg(), 0, Dest
->getReg());
864 if (Src0
->isReg() && Src0
->getReg() == Dest
->getReg()) {
865 if (Src1
->isImm() && isKImmOperand(*Src1
)) {
866 unsigned Opc
= (MI
.getOpcode() == AMDGPU::S_ADD_I32
) ?
867 AMDGPU::S_ADDK_I32
: AMDGPU::S_MULK_I32
;
869 Src1
->setImm(SignExtend64(Src1
->getImm(), 32));
870 MI
.setDesc(TII
->get(Opc
));
871 MI
.tieOperands(0, 1);
876 // Try to use s_cmpk_*
877 if (MI
.isCompare() && TII
->isSOPC(MI
)) {
878 shrinkScalarCompare(MI
);
882 // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
883 if (MI
.getOpcode() == AMDGPU::S_MOV_B32
) {
884 const MachineOperand
&Dst
= MI
.getOperand(0);
885 MachineOperand
&Src
= MI
.getOperand(1);
887 if (Src
.isImm() && Dst
.getReg().isPhysical()) {
890 if (isKImmOperand(Src
)) {
891 MI
.setDesc(TII
->get(AMDGPU::S_MOVK_I32
));
892 Src
.setImm(SignExtend64(Src
.getImm(), 32));
893 } else if ((ModOpc
= canModifyToInlineImmOp32(TII
, Src
, ModImm
,
895 MI
.setDesc(TII
->get(ModOpc
));
896 Src
.setImm(static_cast<int64_t>(ModImm
));
903 // Shrink scalar logic operations.
904 if (MI
.getOpcode() == AMDGPU::S_AND_B32
||
905 MI
.getOpcode() == AMDGPU::S_OR_B32
||
906 MI
.getOpcode() == AMDGPU::S_XOR_B32
) {
907 if (shrinkScalarLogicOp(MI
))
911 if (TII
->isMIMG(MI
.getOpcode()) &&
912 ST
->getGeneration() >= AMDGPUSubtarget::GFX10
&&
913 MF
.getProperties().hasProperty(
914 MachineFunctionProperties::Property::NoVRegs
)) {
919 if (!TII
->isVOP3(MI
))
922 if (MI
.getOpcode() == AMDGPU::V_MAD_F32_e64
||
923 MI
.getOpcode() == AMDGPU::V_FMA_F32_e64
||
924 MI
.getOpcode() == AMDGPU::V_MAD_F16_e64
||
925 MI
.getOpcode() == AMDGPU::V_FMA_F16_e64
||
926 MI
.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64
) {
931 if (!TII
->hasVALU32BitEncoding(MI
.getOpcode())) {
932 // If there is no chance we will shrink it and use VCC as sdst to get
933 // a 32 bit form try to replace dead sdst with NULL.
934 tryReplaceDeadSDST(MI
);
938 if (!TII
->canShrink(MI
, *MRI
)) {
939 // Try commuting the instruction and see if that enables us to shrink
941 if (!MI
.isCommutable() || !TII
->commuteInstruction(MI
) ||
942 !TII
->canShrink(MI
, *MRI
)) {
943 tryReplaceDeadSDST(MI
);
948 int Op32
= AMDGPU::getVOPe32(MI
.getOpcode());
950 if (TII
->isVOPC(Op32
)) {
951 MachineOperand
&Op0
= MI
.getOperand(0);
953 // Exclude VOPCX instructions as these don't explicitly write a
955 Register DstReg
= Op0
.getReg();
956 if (DstReg
.isVirtual()) {
957 // VOPC instructions can only write to the VCC register. We can't
958 // force them to use VCC here, because this is only one register and
959 // cannot deal with sequences which would require multiple copies of
960 // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
962 // So, instead of forcing the instruction to write to VCC, we
963 // provide a hint to the register allocator to use VCC and then we
964 // will run this pass again after RA and shrink it if it outputs to
966 MRI
->setRegAllocationHint(DstReg
, 0, VCCReg
);
969 if (DstReg
!= VCCReg
)
974 if (Op32
== AMDGPU::V_CNDMASK_B32_e32
) {
975 // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
977 const MachineOperand
*Src2
=
978 TII
->getNamedOperand(MI
, AMDGPU::OpName::src2
);
981 Register SReg
= Src2
->getReg();
982 if (SReg
.isVirtual()) {
983 MRI
->setRegAllocationHint(SReg
, 0, VCCReg
);
990 // Check for the bool flag output for instructions like V_ADD_I32_e64.
991 const MachineOperand
*SDst
= TII
->getNamedOperand(MI
,
992 AMDGPU::OpName::sdst
);
997 if (SDst
->getReg() != VCCReg
) {
998 if (SDst
->getReg().isVirtual())
999 MRI
->setRegAllocationHint(SDst
->getReg(), 0, VCCReg
);
1003 // All of the instructions with carry outs also have an SGPR input in
1005 const MachineOperand
*Src2
= TII
->getNamedOperand(MI
,
1006 AMDGPU::OpName::src2
);
1007 if (Src2
&& Src2
->getReg() != VCCReg
) {
1008 if (Src2
->getReg().isVirtual())
1009 MRI
->setRegAllocationHint(Src2
->getReg(), 0, VCCReg
);
1017 // Pre-GFX10, shrinking VOP3 instructions pre-RA gave us the chance to
1018 // fold an immediate into the shrunk instruction as a literal operand. In
1019 // GFX10 VOP3 instructions can take a literal operand anyway, so there is
1020 // no advantage to doing this.
1021 if (ST
->hasVOP3Literal() &&
1022 !MF
.getProperties().hasProperty(
1023 MachineFunctionProperties::Property::NoVRegs
))
1026 if (ST
->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI
.getOpcode()) &&
1027 !shouldShrinkTrue16(MI
))
1030 // We can shrink this instruction
1031 LLVM_DEBUG(dbgs() << "Shrinking " << MI
);
1033 MachineInstr
*Inst32
= TII
->buildShrunkInst(MI
, Op32
);
1034 ++NumInstructionsShrunk
;
1036 // Copy extra operands not present in the instruction definition.
1037 copyExtraImplicitOps(*Inst32
, MI
);
1039 // Copy deadness from the old explicit vcc def to the new implicit def.
1040 if (SDst
&& SDst
->isDead())
1041 Inst32
->findRegisterDefOperand(VCCReg
, /*TRI=*/nullptr)->setIsDead();
1043 MI
.eraseFromParent();
1044 foldImmediates(*Inst32
);
1046 LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32
<< '\n');