//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass removes redundant S_OR_B64 instructions enabling lanes in
/// the exec mask. If two SI_END_CF (lowered as S_OR_B64) come together
/// without any vector instructions between them, we can keep only the outer
/// SI_END_CF, given that the CFG is structured and the exec bits of the outer
/// end statement are always not less than the exec bits of the inner one.
///
/// This needs to be done before RA to eliminate the saved exec mask
/// registers, but after the register coalescer so that no vector register
/// copies remain between different end-cf statements.
///
//===----------------------------------------------------------------------===//
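//
// For illustration only, a sketch of the redundancy described above
// (hypothetical MIR; block and register names are invented):
//
//   bb.1:
//     $exec = S_OR_B64 $exec, %inner_saved   ; inner end-cf, removable
//     ; ...only SALU instructions that do not read $exec...
//   bb.2:                                    ; the single layout successor
//     $exec = S_OR_B64 $exec, %outer_saved   ; outer end-cf, kept
//
// Since the CFG is structured, %outer_saved re-enables at least the lanes
// re-enabled by %inner_saved, so only the outer S_OR_B64 needs to survive.
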
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"

namespace {

class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
public:
  static char ID;

  SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations pre-RA";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                      "SI optimize exec mask operations pre-RA", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                    "SI optimize exec mask operations pre-RA", false, false)

char SIOptimizeExecMaskingPreRA::ID = 0;

char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID;

FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
  return new SIOptimizeExecMaskingPreRA();
}

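// Note: the AMDGPU pass config runs this pass between the register coalescer
// and register allocation (see AMDGPUTargetMachine.cpp), matching the
// placement constraint stated in the file comment above.
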
static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI) {
  return MI.getOpcode() == AMDGPU::S_OR_B64 &&
         MI.modifiesRegister(AMDGPU::EXEC, TRI);
}

static bool isFullExecCopy(const MachineInstr &MI) {
  return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC;
}
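
// For reference (illustrative MIR): isFullExecCopy matches
//   %0:sreg_64 = COPY $exec
// but not a partial copy such as
//   %0.sub0:sreg_64 = COPY $exec_lo
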
static unsigned getOrNonExecReg(const MachineInstr &MI,
                                const SIInstrInfo &TII) {
  auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
    return Op->getReg();
  Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
  if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
    return Op->getReg();
  return AMDGPU::NoRegister;
}
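
// For example (illustrative MIR), given
//   $exec = S_OR_B64 $exec, %saved
// getOrNonExecReg returns %saved; it returns AMDGPU::NoRegister when neither
// source operand is a register other than exec (e.g. an immediate, or exec
// itself).
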
static MachineInstr *getOrExecSource(const MachineInstr &MI,
                                     const SIInstrInfo &TII,
                                     const MachineRegisterInfo &MRI) {
  auto SavedExec = getOrNonExecReg(MI, TII);
  if (SavedExec == AMDGPU::NoRegister)
    return nullptr;
  auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
  if (!SaveExecInst || !isFullExecCopy(*SaveExecInst))
    return nullptr;
  return SaveExecInst;
}

// Optimize sequence
//    %sel = V_CNDMASK_B32_e64 0, 1, %cc
//    %cmp = V_CMP_NE_U32 1, %sel
//    $vcc = S_AND_B64 $exec, %cmp
//    S_CBRANCH_VCC[N]Z
// =>
//    $vcc = S_ANDN2_B64 $exec, %cc
//    S_CBRANCH_VCC[N]Z
//
// This is the negation pattern inserted by DAGCombiner::visitBRCOND() in
// rebuildSetCC(). We start the search from the S_CBRANCH to avoid an
// exhaustive scan, but only the first three instructions are really needed.
// The S_AND_B64 with exec is a required part of the pattern since
// V_CNDMASK_B32 writes zeroes for inactive lanes.
//
// Returns the %cc register on success.
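//
// Lane-wise justification: for an active lane, %sel is 1 iff %cc is set, so
// V_CMP_NE_U32 1, %sel computes !%cc there; for an inactive lane, %sel is 0,
// making the compare true, which is why masking with $exec is required.
// Hence $exec & %cmp == $exec & ~%cc, i.e. the S_ANDN2_B64 above.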
static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
                                     const GCNSubtarget &ST,
                                     MachineRegisterInfo &MRI,
                                     LiveIntervals *LIS) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const unsigned AndOpc = AMDGPU::S_AND_B64;
  const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64;
  const unsigned CondReg = AMDGPU::VCC;
  const unsigned ExecReg = AMDGPU::EXEC;

  auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
                           unsigned Opc = MI.getOpcode();
                           return Opc == AMDGPU::S_CBRANCH_VCCZ ||
                                  Opc == AMDGPU::S_CBRANCH_VCCNZ; });
  if (I == MBB.terminators().end())
    return AMDGPU::NoRegister;

  auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister,
                                   *I, MRI, LIS);
  if (!And || And->getOpcode() != AndOpc ||
      !And->getOperand(1).isReg() || !And->getOperand(2).isReg())
    return AMDGPU::NoRegister;

  MachineOperand *AndCC = &And->getOperand(1);
  unsigned CmpReg = AndCC->getReg();
  unsigned CmpSubReg = AndCC->getSubReg();
  if (CmpReg == ExecReg) {
    AndCC = &And->getOperand(2);
    CmpReg = AndCC->getReg();
    CmpSubReg = AndCC->getSubReg();
  } else if (And->getOperand(2).getReg() != ExecReg) {
    return AMDGPU::NoRegister;
  }

  auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS);
  if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
                Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
      Cmp->getParent() != And->getParent())
    return AMDGPU::NoRegister;

  MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
  MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
  // Canonicalize so that the register is in Op1 and the immediate in Op2.
  if (Op1->isImm() && Op2->isReg())
    std::swap(Op1, Op2);
  if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
    return AMDGPU::NoRegister;

  unsigned SelReg = Op1->getReg();
  auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
  if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
    return AMDGPU::NoRegister;

  Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
  Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
  MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
  if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
      Op1->getImm() != 0 || Op2->getImm() != 1)
    return AMDGPU::NoRegister;

  LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t'
                    << *Cmp << '\t' << *And);

  unsigned CCReg = CC->getReg();
  LIS->RemoveMachineInstrFromMaps(*And);
  MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
                                TII->get(Andn2Opc), And->getOperand(0).getReg())
                            .addReg(ExecReg)
                            .addReg(CCReg, 0, CC->getSubReg());
  And->eraseFromParent();
  LIS->InsertMachineInstrInMaps(*Andn2);

  LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');

  // Try to remove the compare. If the compare result is VCC, it must not be
  // read between the cmp and the s_and_b64; for any other register it must
  // simply have no other uses.
  if ((TargetRegisterInfo::isVirtualRegister(CmpReg) &&
       MRI.use_nodbg_empty(CmpReg)) ||
      (CmpReg == CondReg &&
       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
                    [&](const MachineInstr &MI) {
                      return MI.readsRegister(CondReg, TRI); }))) {
    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');

    LIS->RemoveMachineInstrFromMaps(*Cmp);
    Cmp->eraseFromParent();

    // Try to remove v_cndmask_b32.
    if (TargetRegisterInfo::isVirtualRegister(SelReg) &&
        MRI.use_nodbg_empty(SelReg)) {
      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');

      LIS->RemoveMachineInstrFromMaps(*Sel);
      Sel->eraseFromParent();
    }
  }

  return CCReg;
}

bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
  DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {

    if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) {
      RecalcRegs.insert(Reg);
      RecalcRegs.insert(AMDGPU::VCC_LO);
      RecalcRegs.insert(AMDGPU::VCC_HI);
      RecalcRegs.insert(AMDGPU::SCC);
      Changed = true;
    }

    // Try to remove unneeded instructions before s_endpgm.
    if (MBB.succ_empty()) {
      if (MBB.empty())
        continue;

      // Skip this if the endpgm has any implicit uses, otherwise we would
      // need to be careful to update / remove them.
      MachineInstr &Term = MBB.back();
      if (Term.getOpcode() != AMDGPU::S_ENDPGM ||
          Term.getNumOperands() != 0)
        continue;

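      // For illustration (hypothetical MIR): in a block ending with
      //   %v:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
      //   S_ENDPGM
      // the V_MOV_B32 has no visible effect and is erased; the walk then
      // ascends into any predecessor whose sole successor is this block.
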
      SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});

      while (!Blocks.empty()) {
        auto CurBB = Blocks.pop_back_val();
        auto I = CurBB->rbegin(), E = CurBB->rend();

        // Skip over the terminator; give up on conditional branches.
        if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
          ++I;
        else if (I->isBranch())
          continue;

        while (I != E) {
          if (I->isDebugInstr()) {
            I = std::next(I);
            continue;
          }

          if (I->mayStore() || I->isBarrier() || I->isCall() ||
              I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
            break;

          LLVM_DEBUG(dbgs()
                     << "Removing no effect instruction: " << *I << '\n');

          for (auto &Op : I->operands()) {
            if (Op.isReg())
              RecalcRegs.insert(Op.getReg());
          }

          auto Next = std::next(I);
          LIS->RemoveMachineInstrFromMaps(*I);
          I->eraseFromParent();
          I = Next;

          Changed = true;
        }

        if (I != E)
          continue;

        // Try to ascend predecessors.
        for (auto *Pred : CurBB->predecessors()) {
          if (Pred->succ_size() == 1)
            Blocks.push_back(Pred);
        }
      }
      continue;
    }

    // Try to collapse adjacent endifs.
    auto Lead = MBB.begin(), E = MBB.end();
    if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
      continue;

    const MachineBasicBlock *Succ = *MBB.succ_begin();
    if (!MBB.isLayoutSuccessor(Succ))
      continue;

    auto I = std::next(Lead);

    // Everything between the two end-cf instructions must be scalar and must
    // not read the exec mask.
    for ( ; I != E; ++I)
      if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI))
        break;

    if (I != E)
      continue;

    const auto NextLead = Succ->begin();
    if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) ||
        !getOrExecSource(*NextLead, *TII, MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');

    auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
    unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
    for (auto &Op : Lead->operands()) {
      if (Op.isReg())
        RecalcRegs.insert(Op.getReg());
    }

    LIS->RemoveMachineInstrFromMaps(*Lead);
    Lead->eraseFromParent();
    if (SaveExecReg) {
      LIS->removeInterval(SaveExecReg);
      LIS->createAndComputeVirtRegInterval(SaveExecReg);
    }

    Changed = true;

    // The erased S_OR_B64 consumed a full copy of exec; if all remaining uses
    // of that copy stay in its defining block, fold the copy now.
    if (!SaveExec || !SaveExec->isFullCopy())
      continue;

    unsigned SavedExec = SaveExec->getOperand(0).getReg();
    bool SafeToReplace = true;
    for (auto &U : MRI.use_nodbg_instructions(SavedExec)) {
      if (U.getParent() != SaveExec->getParent()) {
        SafeToReplace = false;
        break;
      }
    }

    if (SafeToReplace) {
      LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');

      LIS->RemoveMachineInstrFromMaps(*SaveExec);
      SaveExec->eraseFromParent();
      MRI.replaceRegWith(SavedExec, AMDGPU::EXEC);
      LIS->removeInterval(SavedExec);

      Changed = true;
    }
  }

  if (Changed) {
    for (auto Reg : RecalcRegs) {
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        LIS->removeInterval(Reg);
        if (!MRI.reg_empty(Reg))
          LIS->createAndComputeVirtRegInterval(Reg);
      } else {
        for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U)
          LIS->removeRegUnit(*U);
      }
    }
  }

  return Changed;
}