//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
///   %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
///   %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc   // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0             // This instruction is an optional
///                                    // optimization which allows us to
///                                    // branch if all the bits of
///                                    // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0  // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0  // Save the IF lanes and restore the
///                                    // full exec mask
/// %exec = S_XOR_B64 %sgpr0, %exec    // Clear the IF lanes from the exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0  // Do the ELSE block
/// label1:
/// %exec = S_OR_B64 %exec, %sgpr0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"
namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  MachineRegisterInfo *MRI = nullptr;

  const TargetRegisterClass *BoolRC = nullptr;
  unsigned AndOpc;
  unsigned OrOpc;
  unsigned XorOpc;
  unsigned MovTermOpc;
  unsigned Andn2TermOpc;
  unsigned XorTermrOpc;
  unsigned OrSaveExecOpc;
  unsigned Exec;

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);
  void emitEndCf(MachineInstr &MI);

  Register getSaveExec(MachineInstr *MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

  void combineMasks(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    AU.addPreservedID(MachineLoopInfoID);
    AU.addPreservedID(MachineDominatorsID);
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)
static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  MachineOperand &ImpDefSCC = MI.getOperand(3);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  ImpDefSCC.setIsDead(IsDead);
}

char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
                       const SIInstrInfo *TII) {
  Register SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

  if (U == MRI->use_instr_nodbg_end() ||
      std::next(U) != MRI->use_instr_nodbg_end() ||
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

  // Check for SI_KILL_*_TERMINATOR on any path from the if to the endif.
  // If there is any such terminator, simplifications are not safe.
  auto SMBB = MI.getParent();
  auto EMBB = U->getParent();
  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(SMBB->succ_begin(),
                                               SMBB->succ_end());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (MBB == EMBB || !Visited.insert(MBB).second)
      continue;
    for (auto &Term : MBB->terminators())
      if (TII->isKillTerminator(Term.getOpcode()))
        return false;

    Worklist.append(MBB->succ_begin(), MBB->succ_end());
  }

  return true;
}
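// Illustrative example (hypothetical MIR, not from the original source): a
// "simple" if is one whose saved-exec result is used only by the matching
// SI_END_CF and whose region contains no kill terminators:
//   %0:sreg_64 = SI_IF %vcc, %bb.2
//   ...                        ; no SI_KILL_*_TERMINATOR on any path
//   SI_END_CF %0               ; sole user of %0
// In that case emitIf can hand back the full saved exec mask and skip the XOR.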
Register SILowerControlFlow::getSaveExec(MachineInstr *MI) {
  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand &SaveExec = MI->getOperand(0);
  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister);

  Register SaveExecReg = SaveExec.getReg();
  unsigned FalseTermOpc =
      TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
  MachineBasicBlock::iterator I = (MI);
  MachineBasicBlock::iterator J = std::next(I);
  if (J != MBB->end() && J->getOpcode() == FalseTermOpc &&
      J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) {
    SaveExecReg = J->getOperand(0).getReg();
    J->eraseFromParent();
  }
  return SaveExecReg;
}
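// Illustrative sketch (hypothetical MIR, wave64 case): getSaveExec peeks
// through a terminator move of the pseudo's result, so
//   %0:sreg_64 = SI_IF %vcc, %bb.2
//   %1:sreg_64 = S_MOV_B64_term %0
// is lowered as if %1 were the save-exec destination, and the now-redundant
// S_MOV_B64_term is erased.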
void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);
  Register SaveExecReg = getSaveExec(&MI);
  MachineOperand &Cond = MI.getOperand(1);
  assert(Cond.getSubReg() == AMDGPU::NoSubRegister);

  MachineOperand &ImpDefSCC = MI.getOperand(4);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  // If there is only one use of save exec register and that use is SI_END_CF,
  // we can optimize SI_IF by returning the full saved exec mask instead of
  // just cleared bits.
  bool SimpleIf = isSimpleIf(MI, MRI, TII);

  // Add an implicit def of exec to discourage scheduling VALU after this which
  // will interfere with trying to form s_and_saveexec_b64 later.
  Register CopyReg = SimpleIf ? SaveExecReg
                              : MRI->createVirtualRegister(BoolRC);
  MachineInstr *CopyExec =
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
          .addReg(Exec)
          .addReg(Exec, RegState::ImplicitDefine);

  Register Tmp = MRI->createVirtualRegister(BoolRC);

  MachineInstr *And =
      BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
          .addReg(CopyReg)
          .add(Cond);

  setImpSCCDefDead(*And, true);

  MachineInstr *Xor = nullptr;
  if (!SimpleIf) {
    Xor = BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
              .addReg(Tmp)
              .addReg(CopyReg);
    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
  }

  // Use a copy that is a terminator to get correct spill code placement with
  // fast register allocation.
  MachineInstr *SetExec =
      BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
          .addReg(Tmp, RegState::Kill);

  // Insert a pseudo terminator to help keep the verifier happy. This will also
  // be used later when inserting skips.
  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
                            .add(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->InsertMachineInstrInMaps(*CopyExec);

  // Replace with the AND so we don't need to fix the live interval for the
  // condition register.
  LIS->ReplaceMachineInstrInMaps(MI, *And);

  if (!SimpleIf)
    LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*SetExec);
  LIS->InsertMachineInstrInMaps(*NewBr);

  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
  LIS->removeInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(Tmp);
  if (!SimpleIf)
    LIS->createAndComputeVirtRegInterval(CopyReg);
}
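// For reference, an illustrative wave64 expansion of SI_IF (register names
// hypothetical); the S_XOR_B64 is omitted in the "simple" if case:
//   %copy:sreg_64 = COPY $exec, implicit-def $exec
//   %tmp:sreg_64  = S_AND_B64 %copy, %cond, implicit-def dead $scc
//   %dst:sreg_64  = S_XOR_B64 %tmp, %copy, implicit-def $scc
//   $exec         = S_MOV_B64_term killed %tmp
//   SI_MASK_BRANCH %endif_bb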
void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = getSaveExec(&MI);

  bool ExecModified = MI.getOperand(3).getImm() != 0;
  MachineBasicBlock::iterator Start = MBB.begin();

  // We are running before TwoAddressInstructions, and si_else's operands are
  // tied. In order to correctly tie the registers, split this into a copy of
  // the src like it does.
  Register CopyReg = MRI->createVirtualRegister(BoolRC);
  MachineInstr *CopyExec =
      BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
          .add(MI.getOperand(1)); // Saved EXEC

  // This must be inserted before phis and any spill code inserted before the
  // else.
  Register SaveReg = ExecModified ?
      MRI->createVirtualRegister(BoolRC) : DstReg;
  MachineInstr *OrSaveExec =
      BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
          .addReg(CopyReg);

  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  if (ExecModified) {
    MachineInstr *And =
        BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
            .addReg(Exec)
            .addReg(SaveReg);

    if (LIS)
      LIS->InsertMachineInstrInMaps(*And);
  }

  MachineInstr *Xor =
      BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
          .addReg(Exec)
          .addReg(DstReg);

  MachineInstr *Branch =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
          .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*CopyExec);
  LIS->InsertMachineInstrInMaps(*OrSaveExec);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  // src reg is tied to dst reg.
  LIS->removeInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(CopyReg);
  if (ExecModified)
    LIS->createAndComputeVirtRegInterval(SaveReg);

  // Let this be recomputed.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
}
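// For reference, an illustrative wave64 expansion of SI_ELSE (register names
// hypothetical); the S_AND_B64 is emitted only when exec was modified in the
// then-region:
//   %copy:sreg_64 = COPY %saved_exec          ; at block start
//   %save:sreg_64 = S_OR_SAVEEXEC_B64 %copy   ; at block start
//   %dst:sreg_64  = S_AND_B64 $exec, %save    ; only if ExecModified
//   $exec = S_XOR_B64_term $exec, %dst
//   SI_MASK_BRANCH %endif_bb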
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  auto Dst = getSaveExec(&MI);

  // Skip ANDing with exec if the break condition is already masked by exec
  // because it is a V_CMP in the same basic block. (We know the break
  // condition operand was an i1 in IR, so if it is a VALU instruction it must
  // be one with a carry-out.)
  bool SkipAnding = false;
  if (MI.getOperand(1).isReg()) {
    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
      SkipAnding = Def->getParent() == MI.getParent()
          && SIInstrInfo::isVALU(*Def);
    }
  }

  // AND the break condition operand with exec, then OR that into the "loop
  // exit" mask.
  MachineInstr *And = nullptr, *Or = nullptr;
  if (!SkipAnding) {
    And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Dst)
              .addReg(Exec)
              .add(MI.getOperand(1));
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .addReg(Dst)
             .add(MI.getOperand(2));
  } else {
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .add(MI.getOperand(1))
             .add(MI.getOperand(2));
  }

  if (LIS) {
    if (And)
      LIS->InsertMachineInstrInMaps(*And);
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
  }

  MI.eraseFromParent();
}
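// For reference, an illustrative wave64 expansion of SI_IF_BREAK (names
// hypothetical). The general case is
//   %dst:sreg_64 = S_AND_B64 $exec, %break_cond, implicit-def $scc
//   %dst:sreg_64 = S_OR_B64 %dst, %loop_mask, implicit-def $scc
// and when the condition is a V_CMP in the same block the AND is skipped:
//   %dst:sreg_64 = S_OR_B64 %break_cond, %loop_mask, implicit-def $scc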
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
      BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
          .addReg(Exec)
          .add(MI.getOperand(0));

  MachineInstr *Branch =
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .add(MI.getOperand(1));

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}
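// For reference, an illustrative wave64 expansion of SI_LOOP (names
// hypothetical):
//   $exec = S_ANDN2_B64_term $exec, %loop_mask
//   S_CBRANCH_EXECNZ %loop_header_bb
// Lanes recorded in the loop-exit mask are cleared from exec, and the wave
// branches back to the header while any lane remains active.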
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  unsigned CFMask = MI.getOperand(0).getReg();
  MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt =
      Def && Def->getParent() == &MBB
          ? std::next(MachineBasicBlock::iterator(Def))
          : MBB.begin();
  MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
                            .addReg(Exec)
                            .add(MI.getOperand(0));

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
}
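// For reference, an illustrative wave64 expansion of SI_END_CF (names
// hypothetical), placed right after the instruction defining the mask:
//   $exec = S_OR_B64 $exec, %saved_mask
// This re-enables the lanes that were disabled for the just-closed region.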
// Returns the replacement operands for a logical operation: either the single
// operand itself when it cannot be traced further, or the two source operands
// when the source was another equivalent operation.
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
       SmallVectorImpl<MachineOperand> &Src) const {
  MachineOperand &Op = MI.getOperand(OpNo);
  if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) {
    Src.push_back(Op);
    return;
  }

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getParent() != MI.getParent() ||
      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
    return;

  // Make sure we do not modify exec between def and use.
  // A copy with an implicitly defined exec inserted earlier is an exception;
  // it does not really modify exec.
  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
        !(I->isCopy() && I->getOperand(0).getReg() != Exec))
      return;

  for (const auto &SrcOp : Def->explicit_operands())
    if (SrcOp.isReg() && SrcOp.isUse() &&
        (Register::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == Exec))
      Src.push_back(SrcOp);
}
// Search and combine pairs of equivalent instructions, like
// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
// One of the operands is the exec mask.
void SILowerControlFlow::combineMasks(MachineInstr &MI) {
  assert(MI.getNumExplicitOperands() == 3);
  SmallVector<MachineOperand, 4> Ops;
  unsigned OpToReplace = 1;
  findMaskOperands(MI, 1, Ops);
  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
  findMaskOperands(MI, 2, Ops);
  if (Ops.size() != 3) return;

  unsigned UniqueOpndIdx;
  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else return;

  Register Reg = MI.getOperand(OpToReplace).getReg();
  MI.RemoveOperand(OpToReplace);
  MI.addOperand(Ops[UniqueOpndIdx]);
  if (MRI->use_empty(Reg))
    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}
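// Worked example (hypothetical registers) of the rewrite performed by
// combineMasks together with findMaskOperands:
//   %0:sreg_64 = S_AND_B64 $exec, %y
//   %1:sreg_64 = S_AND_B64 $exec, %0
// Tracing %0 to its defining S_AND_B64 yields the operand set
// {$exec, $exec, %y}; the duplicated $exec identifies %y as the unique
// operand, so the second instruction becomes
//   %1:sreg_64 = S_AND_B64 $exec, %y
// and the now-dead definition of %0 is erased.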
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  MRI = &MF.getRegInfo();
  BoolRC = TRI->getBoolRC();

  if (ST.isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    OrOpc = AMDGPU::S_OR_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    MovTermOpc = AMDGPU::S_MOV_B32_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
    XorTermrOpc = AMDGPU::S_XOR_B32_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    OrOpc = AMDGPU::S_OR_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    MovTermOpc = AMDGPU::S_MOV_B64_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
    XorTermrOpc = AMDGPU::S_XOR_B64_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    Exec = AMDGPU::EXEC;
  }

  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock::iterator I, Next, Last;

    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
        emitIf(MI);
        break;

      case AMDGPU::SI_ELSE:
        emitElse(MI);
        break;

      case AMDGPU::SI_IF_BREAK:
        emitIfBreak(MI);
        break;

      case AMDGPU::SI_LOOP:
        emitLoop(MI);
        break;

      case AMDGPU::SI_END_CF:
        emitEndCf(MI);
        break;

      case AMDGPU::S_AND_B64:
      case AMDGPU::S_OR_B64:
      case AMDGPU::S_AND_B32:
      case AMDGPU::S_OR_B32:
        // Cleanup bit manipulations on exec mask
        combineMasks(MI);
        Last = I;
        continue;

      default:
        Last = I;
        continue;
      }

      // Replay newly inserted code to combine masks: rewinding Next to Last
      // revisits the instructions just emitted by the emit* helpers, so any
      // new S_AND_*/S_OR_* on the exec mask can be simplified by combineMasks.
      Next = (Last == MBB.end()) ? MBB.begin() : Last;
    }
  }

  return true;
}