//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass removes redundant S_OR_B64 instructions that re-enable lanes in
/// the exec mask. If two SI_END_CF pseudos (lowered as S_OR_B64) come together
/// without any vector instructions between them, only the outer SI_END_CF
/// needs to be kept: because the CFG is structured, the exec mask restored by
/// the outer end-cf is always a superset of the exec mask restored by the
/// inner one.
///
/// This needs to be done before register allocation to eliminate the saved
/// exec mask registers, but after the register coalescer so that there are no
/// vector register copies in between the different end-cf instructions.
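///
/// A schematic example of the pattern being collapsed (register names are
/// illustrative only, not actual MIR):
///
///   %outer_saved = COPY $exec                 ; created for the outer region
///   ...
///   bb.1:
///     $exec = S_OR_B64 $exec, %inner_saved    ; inner SI_END_CF, removed
///     ...                                     ; only SALU insts, exec unread
///   bb.2:                                     ; single layout successor
///     $exec = S_OR_B64 $exec, %outer_saved    ; outer SI_END_CF, kept
///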
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"

namespace {

class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
public:
  static char ID;

  SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations pre-RA";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                      "SI optimize exec mask operations pre-RA", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                    "SI optimize exec mask operations pre-RA", false, false)

char SIOptimizeExecMaskingPreRA::ID = 0;

char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID;

FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
  return new SIOptimizeExecMaskingPreRA();
}
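
// An SI_END_CF pseudo is lowered into an S_OR_B64 that redefines exec;
// recognize that form by the opcode plus a def of exec.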
static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI) {
  return MI.getOpcode() == AMDGPU::S_OR_B64 &&
         MI.modifiesRegister(AMDGPU::EXEC, TRI);
}

static bool isFullExecCopy(const MachineInstr &MI) {
  return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC;
}
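
// Return the S_OR_B64 source operand that is not exec itself, or
// AMDGPU::NoRegister if there is no such register operand.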
static unsigned getOrNonExecReg(const MachineInstr &MI,
                                const SIInstrInfo &TII) {
  auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
    return Op->getReg();
  Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
  if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
    return Op->getReg();
  return AMDGPU::NoRegister;
}
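
// For an end-cf S_OR_B64, return the instruction defining its saved-exec
// operand if that definition is a full copy of exec, otherwise nullptr.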
static MachineInstr *getOrExecSource(const MachineInstr &MI,
                                     const SIInstrInfo &TII,
                                     const MachineRegisterInfo &MRI) {
  auto SavedExec = getOrNonExecReg(MI, TII);
  if (SavedExec == AMDGPU::NoRegister)
    return nullptr;
  auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
  if (!SaveExecInst || !isFullExecCopy(*SaveExecInst))
    return nullptr;
  return SaveExecInst;
}

bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
  DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
  bool Changed = false;
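
  // Walk every block: first prune side-effect free code that only leads to
  // s_endpgm, then try to collapse adjacent end-cf (exec-restoring S_OR_B64)
  // instructions. Registers touched by anything we erase are collected in
  // RecalcRegs so their liveness can be recomputed afterwards.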
  for (MachineBasicBlock &MBB : MF) {

    // Try to remove unneeded instructions before s_endpgm.
    if (MBB.succ_empty()) {
      if (MBB.empty())
        continue;

      // Skip this if the endpgm has any implicit uses, otherwise we would need
      // to be careful to update / remove them.
      MachineInstr &Term = MBB.back();
      if (Term.getOpcode() != AMDGPU::S_ENDPGM ||
          Term.getNumOperands() != 0)
        continue;

      SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});

      while (!Blocks.empty()) {
        auto CurBB = Blocks.pop_back_val();
        auto I = CurBB->rbegin(), E = CurBB->rend();

        // Step over a trailing s_endpgm or unconditional branch; any other
        // terminator means the block cannot be pruned.
        if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
          ++I;
        else if (I->isBranch())
          continue;

        // Walk backwards and erase side-effect free instructions until
        // something that must be kept is reached.
        while (I != E) {
          if (I->isDebugInstr()) {
            I = std::next(I);
            continue;
          }

          if (I->mayStore() || I->isBarrier() || I->isCall() ||
              I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
            break;

          LLVM_DEBUG(dbgs()
                     << "Removing no effect instruction: " << *I << '\n');

          for (auto &Op : I->operands()) {
            if (Op.isReg())
              RecalcRegs.insert(Op.getReg());
          }

          auto Next = std::next(I);
          LIS->RemoveMachineInstrFromMaps(*I);
          I->eraseFromParent();
          I = Next;

          Changed = true;
        }

        // Only ascend into predecessors if everything in this block was
        // removed.
        if (I != E)
          continue;

        // Try to ascend predecessors.
        for (auto *Pred : CurBB->predecessors()) {
          if (Pred->succ_size() == 1)
            Blocks.push_back(Pred);
        }
      }
      continue;
    }

    // Try to collapse adjacent endifs.
    auto Lead = MBB.begin(), E = MBB.end();
    if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
      continue;

    const MachineBasicBlock *Succ = *MBB.succ_begin();
    if (!MBB.isLayoutSuccessor(Succ))
      continue;

    // Everything after the leading end-cf must be SALU and must not read exec,
    // otherwise the two end-cf instructions cannot be collapsed.
    auto I = std::next(Lead);

    for ( ; I != E; ++I)
      if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI))
        break;

    if (I != E)
      continue;

    const auto NextLead = Succ->begin();
    if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) ||
        !getOrExecSource(*NextLead, *TII, MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');

    auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
    unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
    for (auto &Op : Lead->operands()) {
      if (Op.isReg())
        RecalcRegs.insert(Op.getReg());
    }

    LIS->RemoveMachineInstrFromMaps(*Lead);
    Lead->eraseFromParent();
    if (SaveExecReg) {
      LIS->removeInterval(SaveExecReg);
      LIS->createAndComputeVirtRegInterval(SaveExecReg);
    }

    Changed = true;

    // If the only use of saved exec in the removed instruction is S_AND_B64,
    // fold the copy now: once every remaining use of the copy lives in the
    // same block as the copy itself, the copy can be replaced by exec.
    if (!SaveExec || !SaveExec->isFullCopy())
      continue;

    unsigned SavedExec = SaveExec->getOperand(0).getReg();
    bool SafeToReplace = true;
    for (auto &U : MRI.use_nodbg_instructions(SavedExec)) {
      if (U.getParent() != SaveExec->getParent()) {
        SafeToReplace = false;
        break;
      }
    }

    if (SafeToReplace) {
      LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');

      LIS->RemoveMachineInstrFromMaps(*SaveExec);
      SaveExec->eraseFromParent();
      MRI.replaceRegWith(SavedExec, AMDGPU::EXEC);
      LIS->removeInterval(SavedExec);
    }
  }

  if (Changed) {
    for (auto Reg : RecalcRegs) {
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        LIS->removeInterval(Reg);
        if (!MRI.reg_empty(Reg))
          LIS->createAndComputeVirtRegInterval(Reg);
      } else {
        for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U)
          LIS->removeRegUnit(*U);
      }
    }
  }

  return Changed;
}