//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
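//
// For example (illustrative only): a VOP3-encoded
//   v_add_f32_e64 v0, v1, v2
// can be re-emitted in the 4-byte-shorter VOP2 form
//   v_add_f32_e32 v0, v1, v2
// when no VOP3-only modifiers or operands are required.
//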
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  void shrinkMIMG(MachineInstr &MI);

  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
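///
/// An illustrative (hypothetical) input:
///   %1 = V_MOV_B32_e32 0x12345678
///   %2 = V_ADD_F32_e32 %1, %3
/// becomes, when %1 has no other uses,
///   %2 = V_ADD_F32_e32 0x12345678, %3
/// and the V_MOV_B32 is erased.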
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0.
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Register::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                               isUInt<32>(MovSrc.getImm()))) {
          // It's possible to have only one component of a super-reg defined by
          // a single mov, so we need to clear any subregister flag.
          Src0.setSubReg(0);
          Src0.ChangeToImmediate(MovSrc.getImm());
          ConstantFolded = true;
        } else if (MovSrc.isFI()) {
          Src0.setSubReg(0);
          Src0.ChangeToFrameIndex(MovSrc.getIndex());
          ConstantFolded = true;
        } else if (MovSrc.isGlobal()) {
          Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                          MovSrc.getTargetFlags());
          ConstantFolded = true;
        }

        if (ConstantFolded) {
          assert(MRI.use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, TII, MRI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}
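
// Note: the isKImm* helpers below check whether an operand's immediate fits
// the 16-bit literal field of the SOPK-style encodings (s_movk, s_addk,
// s_cmpk, ...) and is not already an inline constant, in which case shrinking
// would not save anything. For instance, 0x1234 qualifies, while 64 does not
// since it is already an inline constant.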

static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
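///
/// For example, 0x80000000 (a lone sign bit) is not an inline constant, but
/// its bit reversal is 1, which is; so the value can be materialized with a
/// v_bfrev_b32/s_brev_b32 of 1 instead of carrying a 32-bit literal.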
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().getNumImplicitUses() +
                    MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
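  //
  // For example (illustrative): s_cmp_gt_u32 s0, 0x3000 can be rewritten as
  // s_cmpk_gt_u32 s0, 0x3000, dropping the extra 32-bit literal dword since
  // 0x3000 fits in the 16-bit immediate field.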
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
            AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
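//
// For example (illustrative), an NSA image_sample whose address operands
// happen to live in v4, v5 and v6 can be re-encoded in the default GFX10 form
// using the contiguous register tuple v[4:6], which needs a shorter encoding.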
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords <= 8) {
    RC = &AMDGPU::VReg_256RegClass;
    NewAddrDwords = 8;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - this may be present if TFE is
  // enabled.
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(ToUntie == -1 &&
               "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.RemoveOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
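///
/// Illustrative examples (not taken from the source):
///   s_and_b32 s0, s0, 0xffff7fff  ->  s_bitset0_b32 s0, 15
///   s_or_b32  s0, s0, 0xffffffc0  ->  s_orn2_b32    s0, s0, 63
/// In both cases a 32-bit literal is replaced by an inline operand.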
static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
                                MachineRegisterInfo &MRI,
                                const SIInstrInfo *TII,
                                MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (SrcImm->isImm() &&
      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
    uint32_t NewImm = 0;

    if (Opc == AMDGPU::S_AND_B32) {
      if (isPowerOf2_32(~Imm)) {
        NewImm = countTrailingOnes(Imm);
        Opc = AMDGPU::S_BITSET0_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ANDN2_B32;
      }
    } else if (Opc == AMDGPU::S_OR_B32) {
      if (isPowerOf2_32(Imm)) {
        NewImm = countTrailingZeros(Imm);
        Opc = AMDGPU::S_BITSET1_B32;
      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_ORN2_B32;
      }
    } else if (Opc == AMDGPU::S_XOR_B32) {
      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
        NewImm = ~Imm;
        Opc = AMDGPU::S_XNOR_B32;
      }
    } else {
      llvm_unreachable("unexpected opcode");
    }

    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
        SrcImm == Src0) {
      if (!TII->commuteInstruction(MI, false, 1, 2))
        NewImm = 0;
    }

    if (NewImm != 0) {
      if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) {
        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
        return true;
      }

      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
        MI.setDesc(TII->get(Opc));
        if (Opc == AMDGPU::S_BITSET0_B32 ||
            Opc == AMDGPU::S_BITSET1_B32) {
          Src0->ChangeToImmediate(NewImm);
          // Remove the immediate and add the tied input.
          MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
          MI.tieOperands(0, 2);
        } else {
          SrcImm->setImm(NewImm);
        }
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                          unsigned Reg, unsigned SubReg,
                          const SIRegisterInfo &TRI) {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Register::isPhysicalRegister(Reg) &&
        Register::isPhysicalRegister(MO.getReg())) {
      if (TRI.regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Register::isVirtualRegister(Reg)) {
      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

static bool instReadsReg(const MachineInstr *MI,
                         unsigned Reg, unsigned SubReg,
                         const SIRegisterInfo &TRI) {
  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
}

static bool instModifiesReg(const MachineInstr *MI,
                            unsigned Reg, unsigned SubReg,
                            const SIRegisterInfo &TRI) {
  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
}

static TargetInstrInfo::RegSubRegPair
getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
    if (Register::isPhysicalRegister(Reg)) {
      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
    } else {
      LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
      Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns next valid instruction pointer if was able to create v_swap_b32.
//
// This shall not be done too early, so as not to prevent possible folding
// which may remove the matched moves; it should preferably be done before RA
// to release saved registers, and possibly also after RA, which can insert
// copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
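//
// A concrete (illustrative) instance of the pattern above:
//   v_mov_b32 v2, v0
//   v_mov_b32 v0, v1
//   v_mov_b32 v1, v2
// becomes
//   v_swap_b32 v0, v1
// with the copy through v2 dropped when v2 has no other uses.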
static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
                               const SIInstrInfo *TII) {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  if (!TRI.isVGPR(MRI, X))
    return nullptr;

  for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
    if (YTop.getSubReg() != Tsub)
      continue;

    MachineInstr &MovY = *YTop.getParent();
    if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY.getOpcode() != AMDGPU::COPY) ||
        MovY.getOperand(1).getSubReg() != Tsub)
      continue;

    Register Y = MovY.getOperand(0).getReg();
    unsigned Ysub = MovY.getOperand(0).getSubReg();

    if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
      continue;

    MachineInstr *MovX = nullptr;
    auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
    for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub, TRI) ||
          instModifiesReg(&*I, Y, Ysub, TRI) ||
          instModifiesReg(&*I, T, Tsub, TRI) ||
          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
      BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
              TII->get(AMDGPU::V_SWAP_B32))
        .addDef(X1.Reg, 0, X1.SubReg)
        .addDef(Y1.Reg, 0, Y1.SubReg)
        .addReg(Y1.Reg, 0, Y1.SubReg)
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
    }
    MovX->eraseFromParent();
    MovY.eraseFromParent();
    MachineInstr *Next = &*std::next(MovT.getIterator());
    if (MRI.use_nodbg_empty(T))
      MovT.eraseFromParent();
    else
      Xop.setIsKill(false);

    return Next;
  }

  return nullptr;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            Register::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                           MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
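      //
      // Worked example (illustrative): s_nop 2 followed by s_nop 3 wait 3 and
      // 4 cycles respectively; the combined 7-cycle wait still fits the
      // 8-cycle limit, so the pair becomes a single s_nop 6.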
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // to a nop.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Register::isVirtualRegister(Dest->getReg()) && Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
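      //
      // For instance (illustrative), s_mov_b32 s0, 0x3039 (12345) becomes
      // s_movk_i32 s0, 0x3039, dropping the extra literal dword.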
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Register::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        Register DstReg = MI.getOperand(0).getReg();
        if (Register::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
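          //
          // For example (illustrative), v_cmp_gt_u32_e64 s[2:3], v0, v1 can
          // only shrink to v_cmp_gt_u32_e32 vcc, v0, v1, which writes its
          // result to VCC implicitly, so the destination must end up in VCC.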
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
          continue;
        }
        if (DstReg != VCCReg)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
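        //
        // Illustratively, v_cndmask_b32_e64 v0, v1, v2, s[4:5] can only become
        // v_cndmask_b32_e32 v0, v1, v2, which reads its mask from VCC, so the
        // mask register is hinted toward VCC as well.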
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (Register::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (Register::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Register::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}