//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
///   %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
///   %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %exec  // Restore the exec mask for the Then block
/// %exec = S_XOR_B64 %sgpr0, %exec   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0 // Do the THEN block
/// label1:
/// %exec = S_OR_B64 %exec, %sgpr0    // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
51 #include "AMDGPUSubtarget.h"
52 #include "SIInstrInfo.h"
53 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
54 #include "llvm/ADT/SmallVector.h"
55 #include "llvm/ADT/StringRef.h"
56 #include "llvm/CodeGen/LiveIntervals.h"
57 #include "llvm/CodeGen/MachineBasicBlock.h"
58 #include "llvm/CodeGen/MachineFunction.h"
59 #include "llvm/CodeGen/MachineFunctionPass.h"
60 #include "llvm/CodeGen/MachineInstr.h"
61 #include "llvm/CodeGen/MachineInstrBuilder.h"
62 #include "llvm/CodeGen/MachineOperand.h"
63 #include "llvm/CodeGen/MachineRegisterInfo.h"
64 #include "llvm/CodeGen/Passes.h"
65 #include "llvm/CodeGen/SlotIndexes.h"
66 #include "llvm/CodeGen/TargetRegisterInfo.h"
67 #include "llvm/MC/MCRegisterInfo.h"
68 #include "llvm/Pass.h"
74 #define DEBUG_TYPE "si-lower-control-flow"

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  MachineRegisterInfo *MRI = nullptr;

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);
  void emitEndCf(MachineInstr &MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

  void combineMasks(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    AU.addPreservedID(MachineLoopInfoID);
    AU.addPreservedID(MachineDominatorsID);
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)
static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  MachineOperand &ImpDefSCC = MI.getOperand(3);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  ImpDefSCC.setIsDead(IsDead);
}
133 char &llvm::SILowerControlFlowID
= SILowerControlFlow::ID
;
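
// A "simple" SI_IF is one whose saved-exec result is used only by the
// matching SI_END_CF and whose region contains no kill terminators; in that
// case the full saved exec mask can be returned instead of just the cleared
// bits.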
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
                       const SIInstrInfo *TII) {
  unsigned SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

  if (U == MRI->use_instr_nodbg_end() ||
      std::next(U) != MRI->use_instr_nodbg_end() ||
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
  // If there is any such terminator, simplifications are not safe.
  auto SMBB = MI.getParent();
  auto EMBB = U->getParent();
  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(SMBB->succ_begin(),
                                               SMBB->succ_end());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (MBB == EMBB || !Visited.insert(MBB).second)
      continue;
    for (auto &Term : MBB->terminators())
      if (TII->isKillTerminator(Term.getOpcode()))
        return false;

    Worklist.append(MBB->succ_begin(), MBB->succ_end());
  }

  return true;
}
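
// Lower SI_IF: copy the incoming exec mask, AND it with the condition to
// form the new exec mask, and emit a mask branch so the conditional block
// can later be skipped when no lanes are active.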
void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  MachineOperand &SaveExec = MI.getOperand(0);
  MachineOperand &Cond = MI.getOperand(1);
  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
         Cond.getSubReg() == AMDGPU::NoSubRegister);

  unsigned SaveExecReg = SaveExec.getReg();

  MachineOperand &ImpDefSCC = MI.getOperand(4);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  // If there is only one use of save exec register and that use is SI_END_CF,
  // we can optimize SI_IF by returning the full saved exec mask instead of
  // just cleared bits.
  bool SimpleIf = isSimpleIf(MI, MRI, TII);

  // Add an implicit def of exec to discourage scheduling VALU after this which
  // will interfere with trying to form s_and_saveexec_b64 later.
  unsigned CopyReg = SimpleIf ? SaveExecReg
                       : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
  MachineInstr *CopyExec =
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
          .addReg(AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);

  unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

  MachineInstr *And =
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
          .addReg(CopyReg)
          //.addReg(AMDGPU::EXEC)
          .addReg(Cond.getReg());
  setImpSCCDefDead(*And, true);

  MachineInstr *Xor = nullptr;
  if (!SimpleIf) {
    Xor =
        BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
            .addReg(Tmp)
            .addReg(CopyReg);
    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
  }

  // Use a copy that is a terminator to get correct spill code placement with
  // the fast register allocator.
  MachineInstr *SetExec =
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
          .addReg(Tmp, RegState::Kill);

  // Insert a pseudo terminator to help keep the verifier happy. This will also
  // be used later when inserting skips.
  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
                            .add(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->InsertMachineInstrInMaps(*CopyExec);

  // Replace with the AND so we don't need to fix the live interval for the
  // condition register.
  LIS->ReplaceMachineInstrInMaps(MI, *And);

  if (!SimpleIf)
    LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*SetExec);
  LIS->InsertMachineInstrInMaps(*NewBr);

  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // value number.
  LIS->removeInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(Tmp);
  if (!SimpleIf)
    LIS->createAndComputeVirtRegInterval(CopyReg);
}
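
// Lower SI_ELSE: recompute exec for the else block from the mask saved by
// SI_IF, and emit a mask branch so the block can be skipped when no lanes
// take the else path.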
void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned DstReg = MI.getOperand(0).getReg();
  assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);

  bool ExecModified = MI.getOperand(3).getImm() != 0;
  MachineBasicBlock::iterator Start = MBB.begin();

  // We are running before TwoAddressInstructions, and si_else's operands are
  // tied. In order to correctly tie the registers, split this into a copy of
  // the src like it does.
  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
  MachineInstr *CopyExec =
      BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
          .add(MI.getOperand(1)); // Saved EXEC

  // This must be inserted before phis and any spill code inserted before the
  // else.
  unsigned SaveReg = ExecModified ?
      MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
  MachineInstr *OrSaveExec =
      BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
          .addReg(CopyReg);

  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  if (ExecModified) {
    MachineInstr *And =
        BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
            .addReg(AMDGPU::EXEC)
            .addReg(SaveReg);

    if (LIS)
      LIS->InsertMachineInstrInMaps(*And);
  }

  MachineInstr *Xor =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(DstReg);

  MachineInstr *Branch =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
          .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*CopyExec);
  LIS->InsertMachineInstrInMaps(*OrSaveExec);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  // src reg is tied to dst reg.
  LIS->removeInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(CopyReg);
  if (ExecModified)
    LIS->createAndComputeVirtRegInterval(SaveReg);

  // Let this be recomputed.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
}
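
// Lower SI_IF_BREAK: accumulate the lanes that want to leave the loop (the
// break condition, masked by exec) into the existing break mask.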
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  auto Dst = MI.getOperand(0).getReg();

  // Skip ANDing with exec if the break condition is already masked by exec
  // because it is a V_CMP in the same basic block. (We know the break
  // condition operand was an i1 in IR, so if it is a VALU instruction it must
  // be one with a carry-out.)
  bool SkipAnding = false;
  if (MI.getOperand(1).isReg()) {
    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
      SkipAnding = Def->getParent() == MI.getParent()
                   && SIInstrInfo::isVALU(*Def);
    }
  }

  // AND the break condition operand with exec, then OR that into the "loop
  // exit" mask.
  MachineInstr *And = nullptr, *Or = nullptr;
  if (!SkipAnding) {
    And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
              .addReg(AMDGPU::EXEC)
              .add(MI.getOperand(1));
    Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
             .addReg(Dst)
             .add(MI.getOperand(2));
  } else
    Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
             .add(MI.getOperand(1))
             .add(MI.getOperand(2));

  if (LIS) {
    if (And)
      LIS->InsertMachineInstrInMaps(*And);
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
  }

  MI.eraseFromParent();
}
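
// Lower SI_LOOP: clear the exiting lanes from exec and branch back to the
// loop target while any lanes remain active.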
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .add(MI.getOperand(0));

  MachineInstr *Branch =
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .add(MI.getOperand(1));

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}
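
// Lower SI_END_CF: re-enable the lanes that were disabled for the
// conditional region by ORing the saved mask back into exec, at the start
// of the join block.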
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt = MBB.begin();
  MachineInstr *NewMI =
      BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .add(MI.getOperand(0));

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
}

// Returns replacement operands for a logical operation: either a single
// operand for exec, or two operands if the source was another equivalent
// operation.
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
                                          SmallVectorImpl<MachineOperand> &Src) const {
  MachineOperand &Op = MI.getOperand(OpNo);
  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
    Src.push_back(Op);
    return;
  }

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getParent() != MI.getParent() ||
      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
    return;

  // Make sure we do not modify exec between def and use.
  // A copy with implicitly defined exec inserted earlier is an exclusion, it
  // does not really modify exec.
  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
        !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
      return;

  for (const auto &SrcOp : Def->explicit_operands())
    if (SrcOp.isReg() && SrcOp.isUse() &&
        (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
         SrcOp.getReg() == AMDGPU::EXEC))
      Src.push_back(SrcOp);
}

// Search and combine pairs of equivalent instructions, like
// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
// One of the operands is the exec mask.
void SILowerControlFlow::combineMasks(MachineInstr &MI) {
  assert(MI.getNumExplicitOperands() == 3);
  SmallVector<MachineOperand, 4> Ops;
  unsigned OpToReplace = 1;
  findMaskOperands(MI, 1, Ops);
  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
  findMaskOperands(MI, 2, Ops);
  if (Ops.size() != 3) return;

  unsigned UniqueOpndIdx;
  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else return;

  unsigned Reg = MI.getOperand(OpToReplace).getReg();
  MI.RemoveOperand(OpToReplace);
  MI.addOperand(Ops[UniqueOpndIdx]);
  if (MRI->use_empty(Reg))
    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}
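
// Walk every block, expand each control-flow pseudo in place, and re-run
// combineMasks over newly inserted instructions to clean up redundant
// exec-mask arithmetic.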
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  MRI = &MF.getRegInfo();

  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock::iterator I, Next, Last;

    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
        emitIf(MI);
        break;

      case AMDGPU::SI_ELSE:
        emitElse(MI);
        break;

      case AMDGPU::SI_IF_BREAK:
        emitIfBreak(MI);
        break;

      case AMDGPU::SI_LOOP:
        emitLoop(MI);
        break;

      case AMDGPU::SI_END_CF:
        emitEndCf(MI);
        break;

      case AMDGPU::S_AND_B64:
      case AMDGPU::S_OR_B64:
        // Cleanup bit manipulations on exec mask
        combineMasks(MI);
        Last = I;
        continue;

      default:
        Last = I;
        continue;
      }

      // Replay newly inserted code to combine masks
      Next = (Last == MBB.end()) ? MBB.begin() : Last;
    }
  }

  return true;
}