//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-shrink-instructions"
STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");
namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  void shrinkMIMG(MachineInstr &MI);

  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.
INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}
/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2 or VOPC instruction.
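/// For example (illustrative MIR, register numbers are arbitrary):
///   %1:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec
///   %2:vgpr_32 = V_ADD_U32_e32 %1, %0, implicit $exec
/// can become, when %1 has no other uses:
///   %2:vgpr_32 = V_ADD_U32_e32 1234, %0, implicit $exec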
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  // Try to fold src0.
  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    unsigned Reg = Src0.getReg();
    if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                               isUInt<32>(MovSrc.getImm()))) {
          // It's possible to have only one component of a super-reg defined by
          // a single mov, so we need to clear any subregister flag.
          Src0.setSubReg(0);
          Src0.ChangeToImmediate(MovSrc.getImm());
          ConstantFolded = true;
        } else if (MovSrc.isFI()) {
          Src0.setSubReg(0);
          Src0.ChangeToFrameIndex(MovSrc.getIndex());
          ConstantFolded = true;
        } else if (MovSrc.isGlobal()) {
          Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                          MovSrc.getTargetFlags());
          ConstantFolded = true;
        }

        if (ConstantFolded) {
          assert(MRI.use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, TII, MRI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}
static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}
/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
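/// For example, 0x80000000 is the bit reverse of the inline immediate 1, so a
/// mov of that literal can instead be encoded as a bitreverse of 1.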
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}
/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().getNumImplicitUses() +
                    MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}
static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
                     AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}
// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords <= 8) {
    RC = &AMDGPU::VReg_256RegClass;
    NewAddrDwords = 8;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled.
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand.
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand.
        assert(ToUntie == -1 &&
               "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.RemoveOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}
/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
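/// For example (illustrative):
///   S_AND_B32 s0, s0, 0xffffbfff  ->  S_BITSET0_B32 s0, 14
///   S_AND_B32 s0, s0, 0xffffffc0  ->  S_ANDN2_B32   s0, s0, 63
/// Both forms avoid the 32-bit literal that the original encoding required.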
static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
                                MachineRegisterInfo &MRI,
                                const SIInstrInfo *TII,
                                MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (SrcImm->isImm() &&
      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
    uint32_t NewImm = 0;

    if (Opc == AMDGPU::S_AND_B32) {
      if (isPowerOf2_32(~Imm)) {
        NewImm = countTrailingOnes(Imm);
        Opc = AMDGPU::S_BITSET0_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ANDN2_B32;
      }
    } else if (Opc == AMDGPU::S_OR_B32) {
      if (isPowerOf2_32(Imm)) {
        NewImm = countTrailingZeros(Imm);
        Opc = AMDGPU::S_BITSET1_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ORN2_B32;
      }
    } else if (Opc == AMDGPU::S_XOR_B32) {
      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_XNOR_B32;
      }
    } else {
      llvm_unreachable("unexpected opcode");
    }

    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
        SrcImm == Src0) {
      if (!TII->commuteInstruction(MI, false, 1, 2))
        NewImm = 0;
    }

    if (NewImm != 0) {
      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
          SrcReg->isReg()) {
        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
        return true;
      }

      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
        MI.setDesc(TII->get(Opc));
        if (Opc == AMDGPU::S_BITSET0_B32 ||
            Opc == AMDGPU::S_BITSET1_B32) {
          Src0->ChangeToImmediate(NewImm);
          // Remove the immediate and add the tied input.
          MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
          MI.tieOperands(0, 2);
        } else {
          SrcImm->setImm(NewImm);
        }
      }
    }
  }

  return false;
}
// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                          unsigned Reg, unsigned SubReg,
                          const SIRegisterInfo &TRI) {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
        TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
      if (TRI.regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg &&
               TargetRegisterInfo::isVirtualRegister(Reg)) {
      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}
static bool instReadsReg(const MachineInstr *MI,
                         unsigned Reg, unsigned SubReg,
                         const SIRegisterInfo &TRI) {
  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
}

static bool instModifiesReg(const MachineInstr *MI,
                            unsigned Reg, unsigned SubReg,
                            const SIRegisterInfo &TRI) {
  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
}
static TargetInstrInfo::RegSubRegPair
getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
    } else {
      LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
      Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}
// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and the move can be eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create
// v_swap_b32.
//
// This should not be done too early, so that it does not prevent possible
// folding which may remove the matched moves; it should preferably be done
// before RA to release saved registers, and also possibly after RA, which can
// insert copies too.
//
// This is really just a generic peephole rather than a canonical shrinking,
// although the requirements match the pass placement and it reduces code size
// too.
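// For example (illustrative register choices):
//   v_mov_b32 v2, v0
//   v_mov_b32 v0, v1
//   v_mov_b32 v1, v2
// becomes
//   v_swap_b32 v0, v1
// with the first mov also removed when v2 has no other uses.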
static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
                               const SIInstrInfo *TII) {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  unsigned T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;

  unsigned X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  if (!TRI.isVGPR(MRI, X))
    return nullptr;

  for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
    if (YTop.getSubReg() != Tsub)
      continue;

    MachineInstr &MovY = *YTop.getParent();
    if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY.getOpcode() != AMDGPU::COPY) ||
        MovY.getOperand(1).getSubReg() != Tsub)
      continue;

    unsigned Y = MovY.getOperand(0).getReg();
    unsigned Ysub = MovY.getOperand(0).getSubReg();

    if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
      continue;

    MachineInstr *MovX = nullptr;
    auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
    for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub, TRI) ||
          instModifiesReg(&*I, Y, Ysub, TRI) ||
          instModifiesReg(&*I, T, Tsub, TRI) ||
          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
      BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
              TII->get(AMDGPU::V_SWAP_B32))
        .addDef(X1.Reg, 0, X1.SubReg)
        .addDef(Y1.Reg, 0, Y1.SubReg)
        .addReg(Y1.Reg, 0, Y1.SubReg)
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
    }
    MovX->eraseFromParent();
    MovY.eraseFromParent();
    MachineInstr *Next = &*std::next(MovT.getIterator());
    if (MRI.use_nodbg_empty(T))
      MovT.eraseFromParent();
    else
      Xop.setIsKill(false);

    return Next;
  }

  return nullptr;
}
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                           MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
          Next = NextMI->getIterator();
          continue;
        }
      }
      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // to a nop encoding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }
      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }
      // Try to use s_cmpk_*.
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }
      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
          continue;
        }
        if (DstReg != VCCReg)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;

        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != VCCReg) {
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction.
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}