//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

static unsigned SkipThreshold;

static cl::opt<unsigned, true> SkipThresholdFlag(
    "amdgpu-skip-threshold", cl::Hidden,
    cl::desc(
        "Number of instructions before jumping over divergent control flow"),
    cl::location(SkipThreshold), cl::init(12));
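
// Note (illustrative usage): being a hidden cl::opt, the threshold can still
// be overridden explicitly on the command line, e.g.
// `llc -amdgpu-skip-threshold=<n>`.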

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
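  //
  // For illustration, in wave64 (register numbers made up):
  //   s_mov_b64      s[2:3], -1
  //   s_and_b64      vcc, exec, s[2:3]
  //   s_cbranch_vccz .LBB0_3
  // folds down to:
  //   s_cbranch_execz .LBB0_3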
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5; // Give up if the AND is not found within this many
                          // instructions before the branch.
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }

  if (A == E)
    return false;
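  // At this point A is the S_AND/S_ANDN2 that defines vcc; operands 1 and 2
  // are its source operands.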
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If sreg is only used in the AND instruction, fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;
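
  // If nothing between the AND and the branch reads vcc and the AND's SCC
  // result is dead, the AND itself is redundant: keep vcc alive with a plain
  // MOV when the branch does not kill it, then delete the AND.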
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
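
// Illustrative sketch (assembly syntax approximate, registers made up) of
// what optimizeSetGPR() does: when two identical S_SET_GPR_IDX_ON
// instructions are separated only by code that leaves M0 and the index
// register untouched, e.g.
//   s_set_gpr_idx_on  s2, gpr_idx(SRC0)
//   v_mov_b32_e32     v1, v5
//   s_set_gpr_idx_off
//   s_set_gpr_idx_on  s2, gpr_idx(SRC0)
//   v_mov_b32_e32     v2, v6
//   s_set_gpr_idx_off
// the second ON and the intervening OFF are removed, leaving a single ON/OFF
// pair around both moves.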
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // MI is identical to First; scan the instructions in between to verify
  // that the index set up by First is still valid when MI is reached.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn ||
            !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
               I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
              I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain cbranch out of the loop lest it become infinite.
      if (I->isConditionalBranch())
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
          TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}
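
// For illustration (hypothetical blocks): a forward skip such as
//   s_cbranch_execz .LBB0_2
// .LBB0_1:                       ; short divergent section
//   v_add_f32_e32 v0, v1, v2
// .LBB0_2:
// can be dropped when falling through .LBB0_1 with EXEC = 0 is cheaper than
// the branch itself, as judged by mustRetainExeczBranch() above.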
// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        continue;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        continue;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do the expensive checks in optimizeSetGPR()
    // and limit the distance to 20 instructions for compile-time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI :
         make_early_inc_range(make_range(MBB.instr_begin(), MBB.instr_end()))) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}