1 //===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This pass performs exec mask handling peephole optimizations which need
11 /// to be done before register allocation to reduce register pressure.
13 //===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
28 class SIOptimizeExecMaskingPreRA
: public MachineFunctionPass
{
30 const SIRegisterInfo
*TRI
;
31 const SIInstrInfo
*TII
;
32 MachineRegisterInfo
*MRI
;
37 unsigned OrSaveExecOpc
;
42 Register
optimizeVcndVcmpPair(MachineBasicBlock
&MBB
);
43 bool optimizeElseBranch(MachineBasicBlock
&MBB
);
48 SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID
) {
49 initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
52 bool runOnMachineFunction(MachineFunction
&MF
) override
;
54 StringRef
getPassName() const override
{
55 return "SI optimize exec mask operations pre-RA";
58 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
59 AU
.addRequired
<LiveIntervals
>();
61 MachineFunctionPass::getAnalysisUsage(AU
);
65 } // End anonymous namespace.
67 INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA
, DEBUG_TYPE
,
68 "SI optimize exec mask operations pre-RA", false, false)
69 INITIALIZE_PASS_DEPENDENCY(LiveIntervals
)
70 INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA
, DEBUG_TYPE
,
71 "SI optimize exec mask operations pre-RA", false, false)
73 char SIOptimizeExecMaskingPreRA::ID
= 0;
75 char &llvm::SIOptimizeExecMaskingPreRAID
= SIOptimizeExecMaskingPreRA::ID
;
77 FunctionPass
*llvm::createSIOptimizeExecMaskingPreRAPass() {
78 return new SIOptimizeExecMaskingPreRA();
81 // See if there is a def between \p AndIdx and \p SelIdx that needs to live
83 static bool isDefBetween(const LiveRange
&LR
, SlotIndex AndIdx
,
85 LiveQueryResult AndLRQ
= LR
.Query(AndIdx
);
86 return (!AndLRQ
.isKill() && AndLRQ
.valueIn() != LR
.Query(SelIdx
).valueOut());
89 // FIXME: Why do we bother trying to handle physical registers here?
90 static bool isDefBetween(const SIRegisterInfo
&TRI
,
91 LiveIntervals
*LIS
, Register Reg
,
92 const MachineInstr
&Sel
, const MachineInstr
&And
) {
93 SlotIndex AndIdx
= LIS
->getInstructionIndex(And
);
94 SlotIndex SelIdx
= LIS
->getInstructionIndex(Sel
);
97 return isDefBetween(LIS
->getInterval(Reg
), AndIdx
, SelIdx
);
99 for (MCRegUnitIterator
UI(Reg
.asMCReg(), &TRI
); UI
.isValid(); ++UI
) {
100 if (isDefBetween(LIS
->getRegUnit(*UI
), AndIdx
, SelIdx
))
108 // %sel = V_CNDMASK_B32_e64 0, 1, %cc
109 // %cmp = V_CMP_NE_U32 1, %1
110 // $vcc = S_AND_B64 $exec, %cmp
113 // $vcc = S_ANDN2_B64 $exec, %cc
116 // It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
117 // rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
118 // only 3 first instructions are really needed. S_AND_B64 with exec is a
119 // required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
122 // Returns %cc register on success.
124 SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock
&MBB
) {
125 auto I
= llvm::find_if(MBB
.terminators(), [](const MachineInstr
&MI
) {
126 unsigned Opc
= MI
.getOpcode();
127 return Opc
== AMDGPU::S_CBRANCH_VCCZ
||
128 Opc
== AMDGPU::S_CBRANCH_VCCNZ
; });
129 if (I
== MBB
.terminators().end())
133 TRI
->findReachingDef(CondReg
, AMDGPU::NoSubRegister
, *I
, *MRI
, LIS
);
134 if (!And
|| And
->getOpcode() != AndOpc
||
135 !And
->getOperand(1).isReg() || !And
->getOperand(2).isReg())
138 MachineOperand
*AndCC
= &And
->getOperand(1);
139 Register CmpReg
= AndCC
->getReg();
140 unsigned CmpSubReg
= AndCC
->getSubReg();
141 if (CmpReg
== Register(ExecReg
)) {
142 AndCC
= &And
->getOperand(2);
143 CmpReg
= AndCC
->getReg();
144 CmpSubReg
= AndCC
->getSubReg();
145 } else if (And
->getOperand(2).getReg() != Register(ExecReg
)) {
149 auto *Cmp
= TRI
->findReachingDef(CmpReg
, CmpSubReg
, *And
, *MRI
, LIS
);
150 if (!Cmp
|| !(Cmp
->getOpcode() == AMDGPU::V_CMP_NE_U32_e32
||
151 Cmp
->getOpcode() == AMDGPU::V_CMP_NE_U32_e64
) ||
152 Cmp
->getParent() != And
->getParent())
155 MachineOperand
*Op1
= TII
->getNamedOperand(*Cmp
, AMDGPU::OpName::src0
);
156 MachineOperand
*Op2
= TII
->getNamedOperand(*Cmp
, AMDGPU::OpName::src1
);
157 if (Op1
->isImm() && Op2
->isReg())
159 if (!Op1
->isReg() || !Op2
->isImm() || Op2
->getImm() != 1)
162 Register SelReg
= Op1
->getReg();
163 auto *Sel
= TRI
->findReachingDef(SelReg
, Op1
->getSubReg(), *Cmp
, *MRI
, LIS
);
164 if (!Sel
|| Sel
->getOpcode() != AMDGPU::V_CNDMASK_B32_e64
)
167 if (TII
->hasModifiersSet(*Sel
, AMDGPU::OpName::src0_modifiers
) ||
168 TII
->hasModifiersSet(*Sel
, AMDGPU::OpName::src1_modifiers
))
171 Op1
= TII
->getNamedOperand(*Sel
, AMDGPU::OpName::src0
);
172 Op2
= TII
->getNamedOperand(*Sel
, AMDGPU::OpName::src1
);
173 MachineOperand
*CC
= TII
->getNamedOperand(*Sel
, AMDGPU::OpName::src2
);
174 if (!Op1
->isImm() || !Op2
->isImm() || !CC
->isReg() ||
175 Op1
->getImm() != 0 || Op2
->getImm() != 1)
178 Register CCReg
= CC
->getReg();
180 // If there was a def between the select and the and, we would need to move it
182 if (isDefBetween(*TRI
, LIS
, CCReg
, *Sel
, *And
))
185 LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel
<< '\t' << *Cmp
<< '\t'
188 LIS
->RemoveMachineInstrFromMaps(*And
);
189 MachineInstr
*Andn2
=
190 BuildMI(MBB
, *And
, And
->getDebugLoc(), TII
->get(Andn2Opc
),
191 And
->getOperand(0).getReg())
193 .addReg(CCReg
, getUndefRegState(CC
->isUndef()), CC
->getSubReg());
194 MachineOperand
&AndSCC
= And
->getOperand(3);
195 assert(AndSCC
.getReg() == AMDGPU::SCC
);
196 MachineOperand
&Andn2SCC
= Andn2
->getOperand(3);
197 assert(Andn2SCC
.getReg() == AMDGPU::SCC
);
198 Andn2SCC
.setIsDead(AndSCC
.isDead());
199 And
->eraseFromParent();
200 LIS
->InsertMachineInstrInMaps(*Andn2
);
202 LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2
<< '\n');
204 // Try to remove compare. Cmp value should not used in between of cmp
205 // and s_and_b64 if VCC or just unused if any other register.
206 if ((CmpReg
.isVirtual() && MRI
->use_nodbg_empty(CmpReg
)) ||
207 (CmpReg
== Register(CondReg
) &&
208 std::none_of(std::next(Cmp
->getIterator()), Andn2
->getIterator(),
209 [&](const MachineInstr
&MI
) {
210 return MI
.readsRegister(CondReg
, TRI
);
212 LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp
<< '\n');
214 LIS
->RemoveMachineInstrFromMaps(*Cmp
);
215 Cmp
->eraseFromParent();
217 // Try to remove v_cndmask_b32.
218 if (SelReg
.isVirtual() && MRI
->use_nodbg_empty(SelReg
)) {
219 LLVM_DEBUG(dbgs() << "Erasing: " << *Sel
<< '\n');
221 LIS
->RemoveMachineInstrFromMaps(*Sel
);
222 Sel
->eraseFromParent();
230 // %dst = S_OR_SAVEEXEC %src
231 // ... instructions not modifying exec ...
232 // %tmp = S_AND $exec, %dst
233 // $exec = S_XOR_term $exec, %tmp
235 // %dst = S_OR_SAVEEXEC %src
236 // ... instructions not modifying exec ...
237 // $exec = S_XOR_term $exec, %dst
239 // Clean up potentially unnecessary code added for safety during
240 // control flow lowering.
242 // Return whether any changes were made to MBB.
243 bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock
&MBB
) {
247 // Check this is an else block.
248 auto First
= MBB
.begin();
249 MachineInstr
&SaveExecMI
= *First
;
250 if (SaveExecMI
.getOpcode() != OrSaveExecOpc
)
253 auto I
= llvm::find_if(MBB
.terminators(), [this](const MachineInstr
&MI
) {
254 return MI
.getOpcode() == XorTermrOpc
;
256 if (I
== MBB
.terminators().end())
259 MachineInstr
&XorTermMI
= *I
;
260 if (XorTermMI
.getOperand(1).getReg() != Register(ExecReg
))
263 Register SavedExecReg
= SaveExecMI
.getOperand(0).getReg();
264 Register DstReg
= XorTermMI
.getOperand(2).getReg();
266 // Find potentially unnecessary S_AND
267 MachineInstr
*AndExecMI
= nullptr;
269 while (I
!= First
&& !AndExecMI
) {
270 if (I
->getOpcode() == AndOpc
&& I
->getOperand(0).getReg() == DstReg
&&
271 I
->getOperand(1).getReg() == Register(ExecReg
))
278 // Check for exec modifying instructions.
279 // Note: exec defs do not create live ranges beyond the
280 // instruction so isDefBetween cannot be used.
281 // Instead just check that the def segments are adjacent.
282 SlotIndex StartIdx
= LIS
->getInstructionIndex(SaveExecMI
);
283 SlotIndex EndIdx
= LIS
->getInstructionIndex(*AndExecMI
);
284 for (MCRegUnitIterator
UI(ExecReg
, TRI
); UI
.isValid(); ++UI
) {
285 LiveRange
&RegUnit
= LIS
->getRegUnit(*UI
);
286 if (RegUnit
.find(StartIdx
) != std::prev(RegUnit
.find(EndIdx
)))
290 // Remove unnecessary S_AND
291 LIS
->removeInterval(SavedExecReg
);
292 LIS
->removeInterval(DstReg
);
294 SaveExecMI
.getOperand(0).setReg(DstReg
);
296 LIS
->RemoveMachineInstrFromMaps(*AndExecMI
);
297 AndExecMI
->eraseFromParent();
299 LIS
->createAndComputeVirtRegInterval(DstReg
);
304 bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction
&MF
) {
305 if (skipFunction(MF
.getFunction()))
308 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
309 TRI
= ST
.getRegisterInfo();
310 TII
= ST
.getInstrInfo();
311 MRI
= &MF
.getRegInfo();
312 LIS
= &getAnalysis
<LiveIntervals
>();
314 const bool Wave32
= ST
.isWave32();
315 AndOpc
= Wave32
? AMDGPU::S_AND_B32
: AMDGPU::S_AND_B64
;
316 Andn2Opc
= Wave32
? AMDGPU::S_ANDN2_B32
: AMDGPU::S_ANDN2_B64
;
318 Wave32
? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64
;
319 XorTermrOpc
= Wave32
? AMDGPU::S_XOR_B32_term
: AMDGPU::S_XOR_B64_term
;
320 CondReg
= MCRegister::from(Wave32
? AMDGPU::VCC_LO
: AMDGPU::VCC
);
321 ExecReg
= MCRegister::from(Wave32
? AMDGPU::EXEC_LO
: AMDGPU::EXEC
);
323 DenseSet
<Register
> RecalcRegs({AMDGPU::EXEC_LO
, AMDGPU::EXEC_HI
});
324 bool Changed
= false;
326 for (MachineBasicBlock
&MBB
: MF
) {
328 if (optimizeElseBranch(MBB
)) {
329 RecalcRegs
.insert(AMDGPU::SCC
);
333 if (Register Reg
= optimizeVcndVcmpPair(MBB
)) {
334 RecalcRegs
.insert(Reg
);
335 RecalcRegs
.insert(AMDGPU::VCC_LO
);
336 RecalcRegs
.insert(AMDGPU::VCC_HI
);
337 RecalcRegs
.insert(AMDGPU::SCC
);
341 // Try to remove unneeded instructions before s_endpgm.
342 if (MBB
.succ_empty()) {
346 // Skip this if the endpgm has any implicit uses, otherwise we would need
347 // to be careful to update / remove them.
348 // S_ENDPGM always has a single imm operand that is not used other than to
349 // end up in the encoding
350 MachineInstr
&Term
= MBB
.back();
351 if (Term
.getOpcode() != AMDGPU::S_ENDPGM
|| Term
.getNumOperands() != 1)
354 SmallVector
<MachineBasicBlock
*, 4> Blocks({&MBB
});
356 while (!Blocks
.empty()) {
357 auto CurBB
= Blocks
.pop_back_val();
358 auto I
= CurBB
->rbegin(), E
= CurBB
->rend();
360 if (I
->isUnconditionalBranch() || I
->getOpcode() == AMDGPU::S_ENDPGM
)
362 else if (I
->isBranch())
367 if (I
->isDebugInstr()) {
372 if (I
->mayStore() || I
->isBarrier() || I
->isCall() ||
373 I
->hasUnmodeledSideEffects() || I
->hasOrderedMemoryRef())
377 << "Removing no effect instruction: " << *I
<< '\n');
379 for (auto &Op
: I
->operands()) {
381 RecalcRegs
.insert(Op
.getReg());
384 auto Next
= std::next(I
);
385 LIS
->RemoveMachineInstrFromMaps(*I
);
386 I
->eraseFromParent();
395 // Try to ascend predecessors.
396 for (auto *Pred
: CurBB
->predecessors()) {
397 if (Pred
->succ_size() == 1)
398 Blocks
.push_back(Pred
);
404 // If the only user of a logical operation is move to exec, fold it now
405 // to prevent forming of saveexec. I.e:
407 // %0:sreg_64 = COPY $exec
408 // %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64
410 // %1 = S_AND_B64 $exec, %2:sreg_64
411 unsigned ScanThreshold
= 10;
412 for (auto I
= MBB
.rbegin(), E
= MBB
.rend(); I
!= E
413 && ScanThreshold
--; ++I
) {
414 // Continue scanning if this is not a full exec copy
415 if (!(I
->isFullCopy() && I
->getOperand(1).getReg() == Register(ExecReg
)))
418 Register SavedExec
= I
->getOperand(0).getReg();
419 if (SavedExec
.isVirtual() && MRI
->hasOneNonDBGUse(SavedExec
)) {
420 MachineInstr
*SingleExecUser
= &*MRI
->use_instr_nodbg_begin(SavedExec
);
421 int Idx
= SingleExecUser
->findRegisterUseOperandIdx(SavedExec
);
423 if (SingleExecUser
->getParent() == I
->getParent() &&
424 !SingleExecUser
->getOperand(Idx
).isImplicit() &&
425 TII
->isOperandLegal(*SingleExecUser
, Idx
, &I
->getOperand(1))) {
426 LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I
<< '\n');
427 LIS
->RemoveMachineInstrFromMaps(*I
);
428 I
->eraseFromParent();
429 MRI
->replaceRegWith(SavedExec
, ExecReg
);
430 LIS
->removeInterval(SavedExec
);
439 for (auto Reg
: RecalcRegs
) {
440 if (Reg
.isVirtual()) {
441 LIS
->removeInterval(Reg
);
442 if (!MRI
->reg_empty(Reg
))
443 LIS
->createAndComputeVirtRegInterval(Reg
);
445 LIS
->removeAllRegUnitsForPhysReg(Reg
);