1 //===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This pass performs exec mask handling peephole optimizations which need
11 /// to be done before register allocation to reduce register pressure.
13 //===----------------------------------------------------------------------===//
16 #include "GCNSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "llvm/CodeGen/LiveIntervals.h"
19 #include "llvm/CodeGen/MachineFunctionPass.h"
20 #include "llvm/InitializePasses.h"
24 #define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
// Machine function pass implementing the exec mask peephole optimizations
// named by getPassName() below.
28 class SIOptimizeExecMaskingPreRA
: public MachineFunctionPass
{
// Target register info, target instruction info and machine register info.
// All three are assigned at the start of runOnMachineFunction().
30 const SIRegisterInfo
*TRI
;
31 const SIInstrInfo
*TII
;
32 MachineRegisterInfo
*MRI
;
// Opcode that optimizeElseBranch() expects on the first instruction of a
// block.
37 unsigned OrSaveExecOpc
;
// Per-basic-block peephole helpers, defined later in this file.
42 bool optimizeVcndVcmpPair(MachineBasicBlock
&MBB
);
43 bool optimizeElseBranch(MachineBasicBlock
&MBB
);
// The constructor runs the pass initializer against the global PassRegistry.
48 SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID
) {
49 initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
// Pass entry point; defined out of line.
52 bool runOnMachineFunction(MachineFunction
&MF
) override
;
// Human-readable pass name.
54 StringRef
getPassName() const override
{
55 return "SI optimize exec mask operations pre-RA";
// Declares LiveIntervalsWrapperPass as a required analysis, then forwards to
// the base class implementation.
58 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
59 AU
.addRequired
<LiveIntervalsWrapperPass
>();
61 MachineFunctionPass::getAnalysisUsage(AU
);
65 } // End anonymous namespace.
// Pass registration under DEBUG_TYPE, declaring LiveIntervalsWrapperPass as a
// dependency of this pass.
67 INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA
, DEBUG_TYPE
,
68 "SI optimize exec mask operations pre-RA", false, false)
69 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass
)
70 INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA
, DEBUG_TYPE
,
71 "SI optimize exec mask operations pre-RA", false, false)
// Pass identification object. llvm::SIOptimizeExecMaskingPreRAID is a
// reference bound to it.
73 char SIOptimizeExecMaskingPreRA::ID
= 0;
75 char &llvm::SIOptimizeExecMaskingPreRAID
= SIOptimizeExecMaskingPreRA::ID
;
// Factory function: returns a newly allocated SIOptimizeExecMaskingPreRA.
77 FunctionPass
*llvm::createSIOptimizeExecMaskingPreRAPass() {
78 return new SIOptimizeExecMaskingPreRA();
81 // See if there is a def between \p AndIdx and \p SelIdx that needs to live
83 static bool isDefBetween(const LiveRange
&LR
, SlotIndex AndIdx
,
// Liveness of \p LR at the position of the And.
85 LiveQueryResult AndLRQ
= LR
.Query(AndIdx
);
// True when the range is not killed at AndIdx and the value live into AndIdx
// differs from the value live out of SelIdx.
86 return (!AndLRQ
.isKill() && AndLRQ
.valueIn() != LR
.Query(SelIdx
).valueOut());
89 // FIXME: Why do we bother trying to handle physical registers here?
// Instruction-based overload: converts \p And and \p Sel into register slot
// indexes and applies the LiveRange check either to LIS->getInterval(Reg) or
// to the live range of each register unit of \p Reg.
90 static bool isDefBetween(const SIRegisterInfo
&TRI
,
91 LiveIntervals
*LIS
, Register Reg
,
92 const MachineInstr
&Sel
, const MachineInstr
&And
) {
93 SlotIndex AndIdx
= LIS
->getInstructionIndex(And
).getRegSlot();
94 SlotIndex SelIdx
= LIS
->getInstructionIndex(Sel
).getRegSlot();
// Check against the live interval of Reg.
97 return isDefBetween(LIS
->getInterval(Reg
), AndIdx
, SelIdx
);
// Check each register unit of Reg individually.
99 for (MCRegUnit Unit
: TRI
.regunits(Reg
.asMCReg())) {
100 if (isDefBetween(LIS
->getRegUnit(Unit
), AndIdx
, SelIdx
))
108 // %sel = V_CNDMASK_B32_e64 0, 1, %cc
109 // %cmp = V_CMP_NE_U32 1, %sel
110 // $vcc = S_AND_B64 $exec, %cmp
113 // $vcc = S_ANDN2_B64 $exec, %cc
116 // It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
117 // rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
118 // only the first 3 instructions are really needed. S_AND_B64 with exec is a
119 // required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
122 // Returns true on success.
123 bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock
&MBB
) {
// Locate an S_CBRANCH_VCCZ / S_CBRANCH_VCCNZ among the block terminators.
124 auto I
= llvm::find_if(MBB
.terminators(), [](const MachineInstr
&MI
) {
125 unsigned Opc
= MI
.getOpcode();
126 return Opc
== AMDGPU::S_CBRANCH_VCCZ
||
127 Opc
== AMDGPU::S_CBRANCH_VCCNZ
; });
128 if (I
== MBB
.terminators().end())
// Look up the def of CondReg that reaches the branch.
132 TRI
->findReachingDef(CondReg
, AMDGPU::NoSubRegister
, *I
, *MRI
, LIS
);
// And must exist, use the expected AND opcode, and have register operands at
// indexes 1 and 2.
133 if (!And
|| And
->getOpcode() != AndOpc
||
134 !And
->getOperand(1).isReg() || !And
->getOperand(2).isReg())
// Start from operand 1 as the compare result. If operand 1 is ExecReg, use
// operand 2 instead.
137 MachineOperand
*AndCC
= &And
->getOperand(1);
138 Register CmpReg
= AndCC
->getReg();
139 unsigned CmpSubReg
= AndCC
->getSubReg();
140 if (CmpReg
== Register(ExecReg
)) {
141 AndCC
= &And
->getOperand(2);
142 CmpReg
= AndCC
->getReg();
143 CmpSubReg
= AndCC
->getSubReg();
144 } else if (And
->getOperand(2).getReg() != Register(ExecReg
)) {
// The reaching def of the compare result must be a V_CMP_NE_U32 (e32 or e64)
// located in the same block as the AND.
148 auto *Cmp
= TRI
->findReachingDef(CmpReg
, CmpSubReg
, *And
, *MRI
, LIS
);
149 if (!Cmp
|| !(Cmp
->getOpcode() == AMDGPU::V_CMP_NE_U32_e32
||
150 Cmp
->getOpcode() == AMDGPU::V_CMP_NE_U32_e64
) ||
151 Cmp
->getParent() != And
->getParent())
// Inspect the two compare sources.
154 MachineOperand
*Op1
= TII
->getNamedOperand(*Cmp
, AMDGPU::OpName::src0
);
155 MachineOperand
*Op2
= TII
->getNamedOperand(*Cmp
, AMDGPU::OpName::src1
);
156 if (Op1
->isImm() && Op2
->isReg())
// Require a register compared against the immediate 1.
158 if (!Op1
->isReg() || !Op2
->isImm() || Op2
->getImm() != 1)
161 Register SelReg
= Op1
->getReg();
162 if (SelReg
.isPhysical())
// The compared register must be defined by a V_CNDMASK_B32_e64 that has no
// src0/src1 modifiers set.
165 auto *Sel
= TRI
->findReachingDef(SelReg
, Op1
->getSubReg(), *Cmp
, *MRI
, LIS
);
166 if (!Sel
|| Sel
->getOpcode() != AMDGPU::V_CNDMASK_B32_e64
)
169 if (TII
->hasModifiersSet(*Sel
, AMDGPU::OpName::src0_modifiers
) ||
170 TII
->hasModifiersSet(*Sel
, AMDGPU::OpName::src1_modifiers
))
// Op1/Op2 are reused for the select sources. The select must pick between
// the immediates 0 and 1 based on a register condition CC.
173 Op1
= TII
->getNamedOperand(*Sel
, AMDGPU::OpName::src0
);
174 Op2
= TII
->getNamedOperand(*Sel
, AMDGPU::OpName::src1
);
175 MachineOperand
*CC
= TII
->getNamedOperand(*Sel
, AMDGPU::OpName::src2
);
176 if (!Op1
->isImm() || !Op2
->isImm() || !CC
->isReg() ||
177 Op1
->getImm() != 0 || Op2
->getImm() != 1)
180 Register CCReg
= CC
->getReg();
182 // If there was a def between the select and the and, we would need to move it
184 if (isDefBetween(*TRI
, LIS
, CCReg
, *Sel
, *And
))
187 // Cannot safely mirror live intervals with PHI nodes, so check for these
188 // before optimization.
189 SlotIndex SelIdx
= LIS
->getInstructionIndex(*Sel
);
190 LiveInterval
*SelLI
= &LIS
->getInterval(SelReg
);
191 if (llvm::any_of(SelLI
->vnis(),
192 [](const VNInfo
*VNI
) {
193 return VNI
->isPHIDef();
197 // TODO: Guard against implicit def operands?
198 LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel
<< '\t' << *Cmp
<< '\t'
// Build the ANDN2 in front of the AND, writing the AND's destination register
// and reading CCReg with CC's undef flag and subregister index.
201 MachineInstr
*Andn2
=
202 BuildMI(MBB
, *And
, And
->getDebugLoc(), TII
->get(Andn2Opc
),
203 And
->getOperand(0).getReg())
205 .addReg(CCReg
, getUndefRegState(CC
->isUndef()), CC
->getSubReg());
// Operand 3 of both instructions is asserted to be SCC. Copy the dead flag of
// the AND's SCC operand to the new instruction.
206 MachineOperand
&AndSCC
= And
->getOperand(3);
207 assert(AndSCC
.getReg() == AMDGPU::SCC
);
208 MachineOperand
&Andn2SCC
= Andn2
->getOperand(3);
209 assert(Andn2SCC
.getReg() == AMDGPU::SCC
);
210 Andn2SCC
.setIsDead(AndSCC
.isDead());
// Swap the AND for the ANDN2 in the slot index maps, then delete the AND.
212 SlotIndex AndIdx
= LIS
->ReplaceMachineInstrInMaps(*And
, *Andn2
);
213 And
->eraseFromParent();
215 LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2
<< '\n');
217 // Update live intervals for CCReg before potentially removing CmpReg/SelReg,
218 // and their associated liveness information.
219 SlotIndex CmpIdx
= LIS
->getInstructionIndex(*Cmp
);
220 if (CCReg
.isVirtual()) {
221 LiveInterval
&CCLI
= LIS
->getInterval(CCReg
);
222 auto CCQ
= CCLI
.Query(SelIdx
.getRegSlot());
224 LIS
->removeInterval(CCReg
);
225 LIS
->createAndComputeVirtRegInterval(CCReg
);
228 LIS
->removeAllRegUnitsForPhysReg(CCReg
);
230 // Try to remove the compare. The cmp value must not be used between the cmp
231 // and the s_and_b64 if it is VCC, or must be unused if it is any other register.
232 LiveInterval
*CmpLI
= CmpReg
.isVirtual() ? &LIS
->getInterval(CmpReg
) : nullptr;
233 if ((CmpLI
&& CmpLI
->Query(AndIdx
.getRegSlot()).isKill()) ||
234 (CmpReg
== Register(CondReg
) &&
235 std::none_of(std::next(Cmp
->getIterator()), Andn2
->getIterator(),
236 [&](const MachineInstr
&MI
) {
237 return MI
.readsRegister(CondReg
, TRI
);
239 LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp
<< '\n');
241 LIS
->removeVRegDefAt(*CmpLI
, CmpIdx
.getRegSlot());
242 LIS
->RemoveMachineInstrFromMaps(*Cmp
);
243 Cmp
->eraseFromParent();
245 // Try to remove v_cndmask_b32.
246 // Kill status must be checked before shrinking the live range.
247 bool IsKill
= SelLI
->Query(CmpIdx
.getRegSlot()).isKill();
248 LIS
->shrinkToUses(SelLI
);
249 bool IsDead
= SelLI
->Query(SelIdx
.getRegSlot()).isDeadDef();
250 if (MRI
->use_nodbg_empty(SelReg
) && (IsKill
|| IsDead
)) {
251 LLVM_DEBUG(dbgs() << "Erasing: " << *Sel
<< '\n');
253 LIS
->removeVRegDefAt(*SelLI
, SelIdx
.getRegSlot());
254 LIS
->RemoveMachineInstrFromMaps(*Sel
);
// Read the flag from operand 0 before Sel is erased.
255 bool ShrinkSel
= Sel
->getOperand(0).readsReg();
256 Sel
->eraseFromParent();
258 // The result of the V_CNDMASK was a subreg def which counted as a read
259 // from the other parts of the reg. Shrink their live ranges.
260 LIS
->shrinkToUses(SelLI
);
269 // %dst = S_OR_SAVEEXEC %src
270 // ... instructions not modifying exec ...
271 // %tmp = S_AND $exec, %dst
272 // $exec = S_XOR_term $exec, %tmp
274 // %dst = S_OR_SAVEEXEC %src
275 // ... instructions not modifying exec ...
276 // $exec = S_XOR_term $exec, %dst
278 // Clean up potentially unnecessary code added for safety during
279 // control flow lowering.
281 // Return whether any changes were made to MBB.
282 bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock
&MBB
) {
286 // Check this is an else block.
287 auto First
= MBB
.begin();
288 MachineInstr
&SaveExecMI
= *First
;
289 if (SaveExecMI
.getOpcode() != OrSaveExecOpc
)
// Find the terminator whose opcode is XorTermrOpc.
292 auto I
= llvm::find_if(MBB
.terminators(), [this](const MachineInstr
&MI
) {
293 return MI
.getOpcode() == XorTermrOpc
;
295 if (I
== MBB
.terminators().end())
// Operand 1 of that terminator is compared against ExecReg.
298 MachineInstr
&XorTermMI
= *I
;
299 if (XorTermMI
.getOperand(1).getReg() != Register(ExecReg
))
// SavedExecReg is the register defined by the save-exec instruction. DstReg
// is operand 2 of the XOR terminator.
302 Register SavedExecReg
= SaveExecMI
.getOperand(0).getReg();
303 Register DstReg
= XorTermMI
.getOperand(2).getReg();
305 // Find potentially unnecessary S_AND
306 MachineInstr
*AndExecMI
= nullptr;
// The candidate has opcode AndOpc, defines DstReg, and has ExecReg as
// operand 1.
308 while (I
!= First
&& !AndExecMI
) {
309 if (I
->getOpcode() == AndOpc
&& I
->getOperand(0).getReg() == DstReg
&&
310 I
->getOperand(1).getReg() == Register(ExecReg
))
317 // Check for exec modifying instructions.
318 // Note: exec defs do not create live ranges beyond the
319 // instruction so isDefBetween cannot be used.
320 // Instead just check that the def segments are adjacent.
321 SlotIndex StartIdx
= LIS
->getInstructionIndex(SaveExecMI
);
322 SlotIndex EndIdx
= LIS
->getInstructionIndex(*AndExecMI
);
323 for (MCRegUnit Unit
: TRI
->regunits(ExecReg
)) {
324 LiveRange
&RegUnit
= LIS
->getRegUnit(Unit
);
325 if (RegUnit
.find(StartIdx
) != std::prev(RegUnit
.find(EndIdx
)))
329 // Remove unnecessary S_AND
330 LIS
->removeInterval(SavedExecReg
);
331 LIS
->removeInterval(DstReg
);
// Make the save-exec instruction define DstReg directly.
333 SaveExecMI
.getOperand(0).setReg(DstReg
);
335 LIS
->RemoveMachineInstrFromMaps(*AndExecMI
);
336 AndExecMI
->eraseFromParent();
// Rebuild the live interval of DstReg after the rewrite.
338 LIS
->createAndComputeVirtRegInterval(DstReg
);
// Pass entry point. Runs optimizeElseBranch() and optimizeVcndVcmpPair() on
// every block, prunes instructions in front of S_ENDPGM, folds single-use
// copies of exec, and finally refreshes liveness for the registers collected
// in RecalcRegs.
343 bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction
&MF
) {
344 if (skipFunction(MF
.getFunction()))
// Cache subtarget-derived info and the LiveIntervals analysis.
347 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
348 TRI
= ST
.getRegisterInfo();
349 TII
= ST
.getInstrInfo();
350 MRI
= &MF
.getRegInfo();
351 LIS
= &getAnalysis
<LiveIntervalsWrapperPass
>().getLIS();
// Pick the 32-bit or 64-bit opcode and register variants based on the
// wavefront size.
353 const bool Wave32
= ST
.isWave32();
354 AndOpc
= Wave32
? AMDGPU::S_AND_B32
: AMDGPU::S_AND_B64
;
355 Andn2Opc
= Wave32
? AMDGPU::S_ANDN2_B32
: AMDGPU::S_ANDN2_B64
;
357 Wave32
? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64
;
358 XorTermrOpc
= Wave32
? AMDGPU::S_XOR_B32_term
: AMDGPU::S_XOR_B64_term
;
359 CondReg
= MCRegister::from(Wave32
? AMDGPU::VCC_LO
: AMDGPU::VCC
);
360 ExecReg
= MCRegister::from(Wave32
? AMDGPU::EXEC_LO
: AMDGPU::EXEC
);
// Registers whose liveness is refreshed at the end of the function.
// EXEC_LO and EXEC_HI are always included.
362 DenseSet
<Register
> RecalcRegs({AMDGPU::EXEC_LO
, AMDGPU::EXEC_HI
});
363 bool Changed
= false;
365 for (MachineBasicBlock
&MBB
: MF
) {
// A successful else-branch rewrite marks SCC for recomputation.
367 if (optimizeElseBranch(MBB
)) {
368 RecalcRegs
.insert(AMDGPU::SCC
);
// A successful vcnd/vcmp fold marks VCC_LO, VCC_HI and SCC for recomputation.
372 if (optimizeVcndVcmpPair(MBB
)) {
373 RecalcRegs
.insert(AMDGPU::VCC_LO
);
374 RecalcRegs
.insert(AMDGPU::VCC_HI
);
375 RecalcRegs
.insert(AMDGPU::SCC
);
379 // Try to remove unneeded instructions before s_endpgm.
380 if (MBB
.succ_empty()) {
384 // Skip this if the endpgm has any implicit uses, otherwise we would need
385 // to be careful to update / remove them.
386 // S_ENDPGM always has a single imm operand that is not used other than to
387 // end up in the encoding
388 MachineInstr
&Term
= MBB
.back();
389 if (Term
.getOpcode() != AMDGPU::S_ENDPGM
|| Term
.getNumOperands() != 1)
// Worklist of blocks to scan from the bottom up, seeded with this block.
392 SmallVector
<MachineBasicBlock
*, 4> Blocks({&MBB
});
394 while (!Blocks
.empty()) {
395 auto CurBB
= Blocks
.pop_back_val();
396 auto I
= CurBB
->rbegin(), E
= CurBB
->rend();
398 if (I
->isUnconditionalBranch() || I
->getOpcode() == AMDGPU::S_ENDPGM
)
400 else if (I
->isBranch())
405 if (I
->isDebugInstr()) {
// Test for stores, barriers, calls, unmodeled side effects and ordered memory
// references.
410 if (I
->mayStore() || I
->isBarrier() || I
->isCall() ||
411 I
->hasUnmodeledSideEffects() || I
->hasOrderedMemoryRef())
415 << "Removing no effect instruction: " << *I
<< '\n');
// Queue the registers named by the instruction's operands for recomputation.
417 for (auto &Op
: I
->operands()) {
419 RecalcRegs
.insert(Op
.getReg());
// Capture the following iterator position before erasing the instruction.
422 auto Next
= std::next(I
);
423 LIS
->RemoveMachineInstrFromMaps(*I
);
424 I
->eraseFromParent();
433 // Try to ascend predecessors.
434 for (auto *Pred
: CurBB
->predecessors()) {
435 if (Pred
->succ_size() == 1)
436 Blocks
.push_back(Pred
);
442 // If the only user of a logical operation is move to exec, fold it now
443 // to prevent forming of saveexec. I.e.:
445 // %0:sreg_64 = COPY $exec
446 // %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64
448 // %1 = S_AND_B64 $exec, %2:sreg_64
// The backwards scan below visits at most ScanThreshold instructions.
449 unsigned ScanThreshold
= 10;
450 for (auto I
= MBB
.rbegin(), E
= MBB
.rend(); I
!= E
451 && ScanThreshold
--; ++I
) {
452 // Continue scanning if this is not a full exec copy
453 if (!(I
->isFullCopy() && I
->getOperand(1).getReg() == Register(ExecReg
)))
// The copy destination must be a virtual register with exactly one
// non-debug use.
456 Register SavedExec
= I
->getOperand(0).getReg();
457 if (SavedExec
.isVirtual() && MRI
->hasOneNonDBGUse(SavedExec
)) {
458 MachineInstr
*SingleExecUser
= &*MRI
->use_instr_nodbg_begin(SavedExec
);
459 int Idx
= SingleExecUser
->findRegisterUseOperandIdx(SavedExec
,
// Fold only when the user is in the same block, the use is an explicit
// operand, and exec is a legal operand at that position.
462 if (SingleExecUser
->getParent() == I
->getParent() &&
463 !SingleExecUser
->getOperand(Idx
).isImplicit() &&
464 TII
->isOperandLegal(*SingleExecUser
, Idx
, &I
->getOperand(1))) {
465 LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I
<< '\n');
466 LIS
->RemoveMachineInstrFromMaps(*I
);
467 I
->eraseFromParent();
468 MRI
->replaceRegWith(SavedExec
, ExecReg
);
469 LIS
->removeInterval(SavedExec
);
// Refresh liveness for the collected registers. A virtual register has its
// interval dropped and is recomputed only if it is still referenced.
// removeAllRegUnitsForPhysReg() discards cached register unit ranges.
478 for (auto Reg
: RecalcRegs
) {
479 if (Reg
.isVirtual()) {
480 LIS
->removeInterval(Reg
);
481 if (!MRI
->reg_empty(Reg
))
482 LIS
->createAndComputeVirtRegInterval(Reg
);
484 LIS
->removeAllRegUnitsForPhysReg(Reg
);