1 //===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This pass removes redundant S_OR_B64 instructions enabling lanes in
11 /// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any
12 /// vector instructions between them we can keep only the outer SI_END_CF, given
13 /// that the CFG is structured and the exec bits of the outer end statement are
14 /// always not less than the exec bits of the inner one.
16 /// This needs to be done before RA to eliminate the registers holding saved
17 /// exec bits, but after the register coalescer so that no vector register
18 /// copies remain in between different end cf statements.
20 //===----------------------------------------------------------------------===//
23 #include "AMDGPUSubtarget.h"
24 #include "SIInstrInfo.h"
25 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
26 #include "llvm/CodeGen/LiveIntervals.h"
27 #include "llvm/CodeGen/MachineFunctionPass.h"
31 #define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
// Machine function pass that performs the pre-RA exec-mask optimizations
// implemented in this file. The declarations visible here are the cached
// target/register info pointers, two iterator-skipping helpers, the
// constructor, and the runOnMachineFunction / getPassName / getAnalysisUsage
// overrides.
35 class SIOptimizeExecMaskingPreRA
: public MachineFunctionPass
{
// Cached per-function state; assigned at the start of runOnMachineFunction.
37 const SIRegisterInfo
*TRI
;
38 const SIInstrInfo
*TII
;
39 MachineRegisterInfo
*MRI
;
// Iterator helper over the range [I, E); defined out of line.
42 MachineBasicBlock::iterator
skipIgnoreExecInsts(
43 MachineBasicBlock::iterator I
, MachineBasicBlock::iterator E
) const;
// Same idea, but MBB is taken by reference to pointer so the definition can
// retarget the caller's block pointer.
45 MachineBasicBlock::iterator
skipIgnoreExecInstsTrivialSucc(
46 MachineBasicBlock
*&MBB
,
47 MachineBasicBlock::iterator It
) const;
// The constructor passes ID to the base class and runs the pass initializer
// against the global PassRegistry.
52 SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID
) {
53 initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
56 bool runOnMachineFunction(MachineFunction
&MF
) override
;
58 StringRef
getPassName() const override
{
59 return "SI optimize exec mask operations pre-RA";
// Declares LiveIntervals as a required analysis, then defers to the base
// class implementation.
62 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
63 AU
.addRequired
<LiveIntervals
>();
65 MachineFunctionPass::getAnalysisUsage(AU
);
69 } // End anonymous namespace.
// Pass registration: the BEGIN/END pair names the pass class, uses DEBUG_TYPE
// as its argument string and gives the same description string that
// getPassName() returns; LiveIntervals is declared as a dependency.
71 INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA
, DEBUG_TYPE
,
72 "SI optimize exec mask operations pre-RA", false, false)
73 INITIALIZE_PASS_DEPENDENCY(LiveIntervals
)
74 INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA
, DEBUG_TYPE
,
75 "SI optimize exec mask operations pre-RA", false, false)
// Storage for the pass ID that the constructor hands to MachineFunctionPass.
77 char SIOptimizeExecMaskingPreRA::ID
= 0;
// llvm-namespace reference bound to the pass ID above.
79 char &llvm::SIOptimizeExecMaskingPreRAID
= SIOptimizeExecMaskingPreRA::ID
;
// Factory: returns a newly allocated SIOptimizeExecMaskingPreRA as a
// FunctionPass pointer.
81 FunctionPass
*llvm::createSIOptimizeExecMaskingPreRAPass() {
82 return new SIOptimizeExecMaskingPreRA();
// Returns whether MI is an S_OR that modifies the exec register: either
// S_OR_B32 modifying EXEC_LO, or S_OR_B64 modifying EXEC.
// NOTE(review): the 32-bit form is presumably selected for wave32 subtargets
// via ST -- confirm against the full source.
85 static bool isEndCF(const MachineInstr
&MI
, const SIRegisterInfo
*TRI
,
86 const GCNSubtarget
&ST
) {
// 32-bit variant.
88 return MI
.getOpcode() == AMDGPU::S_OR_B32
&&
89 MI
.modifiesRegister(AMDGPU::EXEC_LO
, TRI
);
// 64-bit variant.
92 return MI
.getOpcode() == AMDGPU::S_OR_B64
&&
93 MI
.modifiesRegister(AMDGPU::EXEC
, TRI
);
// Checks whether MI is a COPY whose source (operand 1) is the exec register
// of the current wave size.
// NOTE(review): presumably returns true exactly when that check succeeds --
// confirm against the full source.
96 static bool isFullExecCopy(const MachineInstr
& MI
, const GCNSubtarget
& ST
) {
// EXEC_LO for wave32, EXEC otherwise.
97 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
99 if (MI
.isCopy() && MI
.getOperand(1).getReg() == Exec
) {
// A copy from exec is expected to be a full copy.
100 assert(MI
.isFullCopy());
// Examines src1 and then src0 of MI looking for a register operand that is
// not the exec register; the final fallback is AMDGPU::NoRegister.
// NOTE(review): presumably the first such register is returned -- confirm
// against the full source.
107 static unsigned getOrNonExecReg(const MachineInstr
&MI
,
108 const SIInstrInfo
&TII
,
109 const GCNSubtarget
& ST
) {
// EXEC_LO for wave32, EXEC otherwise.
110 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
// Try src1 first.
111 auto Op
= TII
.getNamedOperand(MI
, AMDGPU::OpName::src1
);
112 if (Op
->isReg() && Op
->getReg() != Exec
)
// Then src0.
114 Op
= TII
.getNamedOperand(MI
, AMDGPU::OpName::src0
);
115 if (Op
->isReg() && Op
->getReg() != Exec
)
// Neither source qualified.
117 return AMDGPU::NoRegister
;
// Looks up the instruction that defines the non-exec source register of MI
// (as found by getOrNonExecReg) and checks it with isFullExecCopy.
// NOTE(review): presumably returns that defining instruction on success and
// a null pointer otherwise -- confirm against the full source.
120 static MachineInstr
* getOrExecSource(const MachineInstr
&MI
,
121 const SIInstrInfo
&TII
,
122 const MachineRegisterInfo
&MRI
,
123 const GCNSubtarget
& ST
) {
124 auto SavedExec
= getOrNonExecReg(MI
, TII
, ST
);
// No non-exec register operand was found.
125 if (SavedExec
== AMDGPU::NoRegister
)
// The register must have a unique defining instruction that copies exec.
127 auto SaveExecInst
= MRI
.getUniqueVRegDef(SavedExec
);
128 if (!SaveExecInst
|| !isFullExecCopy(*SaveExecInst
, ST
))
133 /// Skip over instructions that don't care about the exec mask.
/// Walks the range [I, E) and queries TII->mayReadEXEC for each instruction.
134 MachineBasicBlock::iterator
SIOptimizeExecMaskingPreRA::skipIgnoreExecInsts(
135 MachineBasicBlock::iterator I
, MachineBasicBlock::iterator E
) const {
136 for ( ; I
!= E
; ++I
) {
// NOTE(review): presumably the walk stops at the first instruction that may
// read exec -- confirm against the full source.
137 if (TII
->mayReadEXEC(*MRI
, *I
))
144 // Skip to the next instruction, ignoring debug instructions, and trivial block
145 // boundaries (blocks that have one (typically fallthrough) successor, and the
146 // successor has one predecessor).
// MBB is a reference to the caller's block pointer.
147 MachineBasicBlock::iterator
148 SIOptimizeExecMaskingPreRA::skipIgnoreExecInstsTrivialSucc(
149 MachineBasicBlock
*&MBB
,
150 MachineBasicBlock::iterator It
) const {
// Skip exec-insensitive instructions within the current block.
153 It
= skipIgnoreExecInsts(It
, MBB
->end());
// Tests whether the scan stopped inside the block, or the block does not have
// exactly one successor.
154 if (It
!= MBB
->end() || MBB
->succ_size() != 1)
157 // If there is one trivial successor, advance to the next block.
158 MachineBasicBlock
*Succ
= *MBB
->succ_begin();
160 // TODO: Is this really necessary?
161 if (!MBB
->isLayoutSuccessor(Succ
))
173 // %sel = V_CNDMASK_B32_e64 0, 1, %cc
174 // %cmp = V_CMP_NE_U32 1, %1
175 // $vcc = S_AND_B64 $exec, %cmp
178 // $vcc = S_ANDN2_B64 $exec, %cc
181 // It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
182 // rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
183 // only 3 first instructions are really needed. S_AND_B64 with exec is a
184 // required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
187 // Returns %cc register on success.
// Every visible failure path returns AMDGPU::NoRegister.
188 static unsigned optimizeVcndVcmpPair(MachineBasicBlock
&MBB
,
189 const GCNSubtarget
&ST
,
190 MachineRegisterInfo
&MRI
,
191 LiveIntervals
*LIS
) {
192 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
193 const SIInstrInfo
*TII
= ST
.getInstrInfo();
// Pick the 32-bit or 64-bit opcodes and registers according to the wave size.
194 bool Wave32
= ST
.isWave32();
195 const unsigned AndOpc
= Wave32
? AMDGPU::S_AND_B32
: AMDGPU::S_AND_B64
;
196 const unsigned Andn2Opc
= Wave32
? AMDGPU::S_ANDN2_B32
: AMDGPU::S_ANDN2_B64
;
197 const unsigned CondReg
= Wave32
? AMDGPU::VCC_LO
: AMDGPU::VCC
;
198 const unsigned ExecReg
= Wave32
? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
// Look for a terminator that branches on VCCZ or VCCNZ; give up if the block
// has none.
200 auto I
= llvm::find_if(MBB
.terminators(), [](const MachineInstr
&MI
) {
201 unsigned Opc
= MI
.getOpcode();
202 return Opc
== AMDGPU::S_CBRANCH_VCCZ
||
203 Opc
== AMDGPU::S_CBRANCH_VCCNZ
; });
204 if (I
== MBB
.terminators().end())
205 return AMDGPU::NoRegister
;
// The reaching definition of the condition register must be the AND opcode
// with two register sources.
207 auto *And
= TRI
->findReachingDef(CondReg
, AMDGPU::NoSubRegister
,
209 if (!And
|| And
->getOpcode() != AndOpc
||
210 !And
->getOperand(1).isReg() || !And
->getOperand(2).isReg())
211 return AMDGPU::NoRegister
;
// One AND source must be exec; the other one is taken as the compare result
// (CmpReg / CmpSubReg). Operand 1 is assumed first and swapped for operand 2
// when operand 1 turns out to be exec.
213 MachineOperand
*AndCC
= &And
->getOperand(1);
214 Register CmpReg
= AndCC
->getReg();
215 unsigned CmpSubReg
= AndCC
->getSubReg();
216 if (CmpReg
== ExecReg
) {
217 AndCC
= &And
->getOperand(2);
218 CmpReg
= AndCC
->getReg();
219 CmpSubReg
= AndCC
->getSubReg();
220 } else if (And
->getOperand(2).getReg() != ExecReg
) {
221 return AMDGPU::NoRegister
;
// CmpReg must be defined by a V_CMP_NE_U32 (e32 or e64) located in the same
// block as the AND.
224 auto *Cmp
= TRI
->findReachingDef(CmpReg
, CmpSubReg
, *And
, MRI
, LIS
);
225 if (!Cmp
|| !(Cmp
->getOpcode() == AMDGPU::V_CMP_NE_U32_e32
||
226 Cmp
->getOpcode() == AMDGPU::V_CMP_NE_U32_e64
) ||
227 Cmp
->getParent() != And
->getParent())
228 return AMDGPU::NoRegister
;
// Inspect the compare operands: by the check at original line 234, Op1 must
// be a register and Op2 the immediate 1.
230 MachineOperand
*Op1
= TII
->getNamedOperand(*Cmp
, AMDGPU::OpName::src0
);
231 MachineOperand
*Op2
= TII
->getNamedOperand(*Cmp
, AMDGPU::OpName::src1
);
232 if (Op1
->isImm() && Op2
->isReg())
234 if (!Op1
->isReg() || !Op2
->isImm() || Op2
->getImm() != 1)
235 return AMDGPU::NoRegister
;
// The compared register must be produced by a V_CNDMASK_B32_e64.
237 Register SelReg
= Op1
->getReg();
238 auto *Sel
= TRI
->findReachingDef(SelReg
, Op1
->getSubReg(), *Cmp
, MRI
, LIS
);
239 if (!Sel
|| Sel
->getOpcode() != AMDGPU::V_CNDMASK_B32_e64
)
240 return AMDGPU::NoRegister
;
// Source modifiers on the select disqualify the pattern.
242 if (TII
->hasModifiersSet(*Sel
, AMDGPU::OpName::src0_modifiers
) ||
243 TII
->hasModifiersSet(*Sel
, AMDGPU::OpName::src1_modifiers
))
244 return AMDGPU::NoRegister
;
// The select must have the form `0, 1, %cc`: immediate 0, immediate 1 and a
// register condition.
246 Op1
= TII
->getNamedOperand(*Sel
, AMDGPU::OpName::src0
);
247 Op2
= TII
->getNamedOperand(*Sel
, AMDGPU::OpName::src1
);
248 MachineOperand
*CC
= TII
->getNamedOperand(*Sel
, AMDGPU::OpName::src2
);
249 if (!Op1
->isImm() || !Op2
->isImm() || !CC
->isReg() ||
250 Op1
->getImm() != 0 || Op2
->getImm() != 1)
251 return AMDGPU::NoRegister
;
253 LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel
<< '\t' << *Cmp
<< '\t'
// Replace the AND with the ANDN2 opcode, inserted at the AND's position and
// writing the AND's destination register; the CC register keeps its undef
// flag and subregister. The LiveIntervals instruction maps are updated around
// the swap.
256 Register CCReg
= CC
->getReg();
257 LIS
->RemoveMachineInstrFromMaps(*And
);
258 MachineInstr
*Andn2
=
259 BuildMI(MBB
, *And
, And
->getDebugLoc(), TII
->get(Andn2Opc
),
260 And
->getOperand(0).getReg())
262 .addReg(CCReg
, getUndefRegState(CC
->isUndef()), CC
->getSubReg());
263 And
->eraseFromParent();
264 LIS
->InsertMachineInstrInMaps(*Andn2
);
266 LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2
<< '\n');
268 // Try to remove compare. Cmp value should not be used in between the cmp
269 // and s_and_b64 if VCC or just unused if any other register.
270 if ((Register::isVirtualRegister(CmpReg
) && MRI
.use_nodbg_empty(CmpReg
)) ||
271 (CmpReg
== CondReg
&&
272 std::none_of(std::next(Cmp
->getIterator()), Andn2
->getIterator(),
273 [&](const MachineInstr
&MI
) {
274 return MI
.readsRegister(CondReg
, TRI
);
276 LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp
<< '\n');
278 LIS
->RemoveMachineInstrFromMaps(*Cmp
);
279 Cmp
->eraseFromParent();
281 // Try to remove v_cndmask_b32.
// Only done when SelReg is virtual and has no remaining non-debug uses.
282 if (Register::isVirtualRegister(SelReg
) && MRI
.use_nodbg_empty(SelReg
)) {
283 LLVM_DEBUG(dbgs() << "Erasing: " << *Sel
<< '\n');
285 LIS
->RemoveMachineInstrFromMaps(*Sel
);
286 Sel
->eraseFromParent();
// Pass entry point. For each basic block it (1) tries the
// v_cndmask/v_cmp/s_and folding, (2) removes instructions without effect in
// front of S_ENDPGM, and (3) collapses adjacent end-cf S_OR instructions,
// collecting registers whose live intervals need recomputing along the way.
293 bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction
&MF
) {
294 if (skipFunction(MF
.getFunction()))
// Cache subtarget information in the pass members used by the helpers.
297 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
298 TRI
= ST
.getRegisterInfo();
299 TII
= ST
.getInstrInfo();
300 MRI
= &MF
.getRegInfo();
// Note: this local reference shadows the MRI member (a pointer) for the rest
// of the function.
302 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
303 LiveIntervals
*LIS
= &getAnalysis
<LiveIntervals
>();
// Registers whose liveness is recomputed at the end; seeded with both halves
// of exec.
304 DenseSet
<unsigned> RecalcRegs({AMDGPU::EXEC_LO
, AMDGPU::EXEC_HI
});
305 unsigned Exec
= ST
.isWave32() ? AMDGPU::EXEC_LO
: AMDGPU::EXEC
;
306 bool Changed
= false;
308 for (MachineBasicBlock
&MBB
: MF
) {
// A non-zero result is the %cc register of a folded pattern; queue it along
// with VCC_LO, VCC_HI and SCC.
310 if (unsigned Reg
= optimizeVcndVcmpPair(MBB
, ST
, MRI
, LIS
)) {
311 RecalcRegs
.insert(Reg
);
312 RecalcRegs
.insert(AMDGPU::VCC_LO
);
313 RecalcRegs
.insert(AMDGPU::VCC_HI
);
314 RecalcRegs
.insert(AMDGPU::SCC
);
318 // Try to remove unneeded instructions before s_endpgm.
319 if (MBB
.succ_empty()) {
323 // Skip this if the endpgm has any implicit uses, otherwise we would need
324 // to be careful to update / remove them.
325 // S_ENDPGM always has a single imm operand that is not used other than to
326 // end up in the encoding
327 MachineInstr
&Term
= MBB
.back();
328 if (Term
.getOpcode() != AMDGPU::S_ENDPGM
|| Term
.getNumOperands() != 1)
// Worklist of blocks to scan, starting with the terminating block itself.
331 SmallVector
<MachineBasicBlock
*, 4> Blocks({&MBB
});
333 while (!Blocks
.empty()) {
334 auto CurBB
= Blocks
.pop_back_val();
// Scan the block from its last instruction backwards.
335 auto I
= CurBB
->rbegin(), E
= CurBB
->rend();
337 if (I
->isUnconditionalBranch() || I
->getOpcode() == AMDGPU::S_ENDPGM
)
339 else if (I
->isBranch())
344 if (I
->isDebugInstr()) {
// Instructions with any of these properties are tested for here.
// NOTE(review): presumably they end the backward scan -- confirm against the
// full source.
349 if (I
->mayStore() || I
->isBarrier() || I
->isCall() ||
350 I
->hasUnmodeledSideEffects() || I
->hasOrderedMemoryRef())
354 << "Removing no effect instruction: " << *I
<< '\n');
// Queue registers from the instruction's operands for interval recomputation.
356 for (auto &Op
: I
->operands()) {
358 RecalcRegs
.insert(Op
.getReg());
// Take the following iterator before I is erased.
361 auto Next
= std::next(I
);
362 LIS
->RemoveMachineInstrFromMaps(*I
);
363 I
->eraseFromParent();
372 // Try to ascend predecessors.
// Only predecessors with exactly one successor are added to the worklist.
373 for (auto *Pred
: CurBB
->predecessors()) {
374 if (Pred
->succ_size() == 1)
375 Blocks
.push_back(Pred
);
381 // Try to collapse adjacent endifs.
// Lead is the first non-debug instruction of the block. The visible condition
// tests for: not exactly one successor, no such instruction, or Lead not being
// an end-cf.
383 auto Lead
= skipDebugInstructionsForward(MBB
.begin(), E
);
384 if (MBB
.succ_size() != 1 || Lead
== E
|| !isEndCF(*Lead
, TRI
, ST
))
// Look past exec-insensitive instructions (possibly into trivial successor
// blocks, which updates TmpMBB) for a second end-cf whose saved exec comes
// from a full copy of exec.
387 MachineBasicBlock
*TmpMBB
= &MBB
;
388 auto NextLead
= skipIgnoreExecInstsTrivialSucc(TmpMBB
, std::next(Lead
));
389 if (NextLead
== TmpMBB
->end() || !isEndCF(*NextLead
, TRI
, ST
) ||
390 !getOrExecSource(*NextLead
, *TII
, MRI
, ST
))
393 LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead
<< '\n');
// Capture the exec copy and saved-exec register feeding Lead before Lead is
// erased.
395 auto SaveExec
= getOrExecSource(*Lead
, *TII
, MRI
, ST
);
396 unsigned SaveExecReg
= getOrNonExecReg(*Lead
, *TII
, ST
);
397 for (auto &Op
: Lead
->operands()) {
399 RecalcRegs
.insert(Op
.getReg());
402 LIS
->RemoveMachineInstrFromMaps(*Lead
);
403 Lead
->eraseFromParent();
// Rebuild the saved-exec register's live interval after Lead's removal.
405 LIS
->removeInterval(SaveExecReg
);
406 LIS
->createAndComputeVirtRegInterval(SaveExecReg
);
411 // If the only use of saved exec in the removed instruction is S_AND_B64
412 // fold the copy now.
413 if (!SaveExec
|| !SaveExec
->isFullCopy())
// A non-debug use of SavedExec outside the copy's own block marks the fold as
// unsafe.
416 Register SavedExec
= SaveExec
->getOperand(0).getReg();
417 bool SafeToReplace
= true;
418 for (auto& U
: MRI
.use_nodbg_instructions(SavedExec
)) {
419 if (U
.getParent() != SaveExec
->getParent()) {
420 SafeToReplace
= false;
424 LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec
<< '\n');
// Erase the copy and rewrite all references to SavedExec to use exec directly.
// NOTE(review): presumably guarded by SafeToReplace -- confirm against the
// full source.
428 LIS
->RemoveMachineInstrFromMaps(*SaveExec
);
429 SaveExec
->eraseFromParent();
430 MRI
.replaceRegWith(SavedExec
, Exec
);
431 LIS
->removeInterval(SavedExec
);
// Refresh liveness for every queued register: a virtual register has its
// interval dropped and, if it is still referenced, recomputed.
436 for (auto Reg
: RecalcRegs
) {
437 if (Register::isVirtualRegister(Reg
)) {
438 LIS
->removeInterval(Reg
);
439 if (!MRI
.reg_empty(Reg
))
440 LIS
->createAndComputeVirtRegInterval(Reg
);
// NOTE(review): presumably the non-virtual (physical register) case.
442 LIS
->removeAllRegUnitsForPhysReg(Reg
);