//===-- SILowerControlFlow.cpp - Use predicates for control flow ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single Vector ALU).  Typically, for predicates, a Vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one bit per
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
///   %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
///   %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc   // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0             // This instruction is an optional
///                                    // optimization which allows us to
///                                    // branch if all the bits of
///                                    // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0  // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0  // Restore the exec mask for the ELSE block
/// %exec = S_XOR_B64 %sgpr0, %exec    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0  // Do the ELSE block
/// label1:
/// %exec = S_OR_B64 %exec, %sgpr0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
52 #include "AMDGPUSubtarget.h"
53 #include "SIInstrInfo.h"
54 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
55 #include "llvm/ADT/SmallVector.h"
56 #include "llvm/ADT/StringRef.h"
57 #include "llvm/CodeGen/LiveIntervals.h"
58 #include "llvm/CodeGen/MachineBasicBlock.h"
59 #include "llvm/CodeGen/MachineFunction.h"
60 #include "llvm/CodeGen/MachineFunctionPass.h"
61 #include "llvm/CodeGen/MachineInstr.h"
62 #include "llvm/CodeGen/MachineInstrBuilder.h"
63 #include "llvm/CodeGen/MachineOperand.h"
64 #include "llvm/CodeGen/MachineRegisterInfo.h"
65 #include "llvm/CodeGen/Passes.h"
66 #include "llvm/CodeGen/SlotIndexes.h"
67 #include "llvm/CodeGen/TargetRegisterInfo.h"
68 #include "llvm/MC/MCRegisterInfo.h"
69 #include "llvm/Pass.h"
75 #define DEBUG_TYPE "si-lower-control-flow"

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  MachineRegisterInfo *MRI = nullptr;

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitBreak(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitElseBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);
  void emitEndCf(MachineInstr &MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

  void combineMasks(MachineInstr &MI);

public:
  static char ID;

  SILowerControlFlow() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    AU.addPreservedID(MachineLoopInfoID);
    AU.addPreservedID(MachineDominatorsID);
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  MachineOperand &ImpDefSCC = MI.getOperand(3);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  ImpDefSCC.setIsDead(IsDead);
}
136 char &llvm::SILowerControlFlowID
= SILowerControlFlow::ID
;
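
// Returns true if the only use of the exec mask saved by this SI_IF is the
// matching SI_END_CF and no kill terminator can run between them, so the IF
// can be lowered without preserving the cleared lanes in a separate mask.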
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
                       const SIInstrInfo *TII) {
  unsigned SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

  if (U == MRI->use_instr_nodbg_end() ||
      std::next(U) != MRI->use_instr_nodbg_end() ||
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

  // Check for SI_KILL_*_TERMINATOR on the path from the if to the endif.
  // If there is any such terminator, simplifications are not safe.
  auto SMBB = MI.getParent();
  auto EMBB = U->getParent();
  DenseSet<const MachineBasicBlock*> Visited;
  SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
                                              SMBB->succ_end());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (MBB == EMBB || !Visited.insert(MBB).second)
      continue;
    for (auto &Term : MBB->terminators())
      if (TII->isKillTerminator(Term.getOpcode()))
        return false;

    Worklist.append(MBB->succ_begin(), MBB->succ_end());
  }

  return true;
}
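
// Lower SI_IF: copy EXEC, AND it with the condition to form the new exec mask,
// optionally XOR the new mask with the old EXEC so the saved register holds
// only the lanes that skip the block, switch EXEC with a terminator MOV, and
// end the block with a SI_MASK_BRANCH so later passes can insert skips.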
void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  MachineOperand &SaveExec = MI.getOperand(0);
  MachineOperand &Cond = MI.getOperand(1);
  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
         Cond.getSubReg() == AMDGPU::NoSubRegister);

  unsigned SaveExecReg = SaveExec.getReg();

  MachineOperand &ImpDefSCC = MI.getOperand(4);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  // If there is only one use of save exec register and that use is SI_END_CF,
  // we can optimize SI_IF by returning the full saved exec mask instead of
  // just cleared bits.
  bool SimpleIf = isSimpleIf(MI, MRI, TII);

  // Add an implicit def of exec to discourage scheduling VALU after this which
  // will interfere with trying to form s_and_saveexec_b64 later.
  unsigned CopyReg = SimpleIf ? SaveExecReg
                       : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
  MachineInstr *CopyExec =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
    .addReg(AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);

  unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

  MachineInstr *And =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
    .addReg(CopyReg)
    //.addReg(AMDGPU::EXEC)
    .addReg(Cond.getReg());
  setImpSCCDefDead(*And, true);

  MachineInstr *Xor = nullptr;
  if (!SimpleIf) {
    Xor =
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
      .addReg(Tmp)
      .addReg(CopyReg);
    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
  }

  // Use a copy that is a terminator to get correct spill code placement with
  // fast regalloc.
  MachineInstr *SetExec =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
    .addReg(Tmp, RegState::Kill);

  // Insert a pseudo terminator to help keep the verifier happy. This will also
  // be used later when inserting skips.
  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .add(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->InsertMachineInstrInMaps(*CopyExec);

  // Replace the SI_IF with the AND so we don't need to fix the live interval
  // for the condition register.
  LIS->ReplaceMachineInstrInMaps(MI, *And);

  if (!SimpleIf)
    LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*SetExec);
  LIS->InsertMachineInstrInMaps(*NewBr);

  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
  LIS->removeInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(Tmp);
  if (!SimpleIf)
    LIS->createAndComputeVirtRegInterval(CopyReg);
}
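
// Lower SI_ELSE: the copy of the saved mask and the S_OR_SAVEEXEC_B64 are
// inserted at the start of the block (before any phis or spill code), and the
// S_XOR_B64_term that flips EXEC over to the "else" lanes plus the
// SI_MASK_BRANCH are inserted at the SI_ELSE itself.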
void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned DstReg = MI.getOperand(0).getReg();
  assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);

  bool ExecModified = MI.getOperand(3).getImm() != 0;
  MachineBasicBlock::iterator Start = MBB.begin();

  // We are running before TwoAddressInstructions, and si_else's operands are
  // tied. In order to correctly tie the registers, split this into a copy of
  // the src like it does.
  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
  MachineInstr *CopyExec =
    BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
      .add(MI.getOperand(1)); // Saved EXEC

  // This must be inserted before phis and any spill code inserted before the
  // else.
  unsigned SaveReg = ExecModified ?
    MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
  MachineInstr *OrSaveExec =
    BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
    .addReg(CopyReg);

  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  if (ExecModified) {
    MachineInstr *And =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
      .addReg(AMDGPU::EXEC)
      .addReg(SaveReg);

    if (LIS)
      LIS->InsertMachineInstrInMaps(*And);
  }

  MachineInstr *Xor =
    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(DstReg);

  MachineInstr *Branch =
    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
    .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*CopyExec);
  LIS->InsertMachineInstrInMaps(*OrSaveExec);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  // src reg is tied to dst reg.
  LIS->removeInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(CopyReg);
  if (ExecModified)
    LIS->createAndComputeVirtRegInterval(SaveReg);

  // Let this be recomputed.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
}
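
// Lower SI_BREAK: OR the currently active lanes (EXEC) into the accumulated
// break mask.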
void SILowerControlFlow::emitBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();

  MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .add(MI.getOperand(1));

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
  MI.eraseFromParent();
}
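
// Lower SI_IF_BREAK: AND the break condition with EXEC (unless it is already
// masked by a V_CMP in the same block) and OR the result into the break mask.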
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  auto Dst = MI.getOperand(0).getReg();

  // Skip ANDing with exec if the break condition is already masked by exec
  // because it is a V_CMP in the same basic block. (We know the break
  // condition operand was an i1 in IR, so if it is a VALU instruction it must
  // be one with a carry-out.)
  bool SkipAnding = false;
  if (MI.getOperand(1).isReg()) {
    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
      SkipAnding = Def->getParent() == MI.getParent()
          && SIInstrInfo::isVALU(*Def);
    }
  }

  // AND the break condition operand with exec, then OR that into the "loop
  // exit" mask.
  MachineInstr *And = nullptr, *Or = nullptr;
  if (!SkipAnding) {
    And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
             .addReg(AMDGPU::EXEC)
             .add(MI.getOperand(1));
    Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
             .addReg(Dst)
             .add(MI.getOperand(2));
  } else
    Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
             .add(MI.getOperand(1))
             .add(MI.getOperand(2));

  if (LIS) {
    if (And)
      LIS->InsertMachineInstrInMaps(*And);
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
  }

  MI.eraseFromParent();
}

void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
  // Lowered in the same way as emitIfBreak above.
  emitIfBreak(MI);
}
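
// Lower SI_LOOP: clear the lanes recorded in the break mask from EXEC with
// S_ANDN2_B64_term, then branch back to the loop header while any lanes
// remain active (S_CBRANCH_EXECNZ).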
void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .add(MI.getOperand(0));

  MachineInstr *Branch =
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .add(MI.getOperand(1));

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}
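
// Lower SI_END_CF: re-enable the lanes saved by the matching SI_IF/SI_ELSE by
// ORing the saved mask back into EXEC at the start of the join block.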
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt = MBB.begin();
  MachineInstr *NewMI =
    BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .add(MI.getOperand(0));

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
}

// Returns the replacement operands for a logical operation: either a single
// result (for exec) or two operands if the source was another equivalent
// operation.
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
       SmallVectorImpl<MachineOperand> &Src) const {
  MachineOperand &Op = MI.getOperand(OpNo);
  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
    Src.push_back(Op);
    return;
  }

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getParent() != MI.getParent() ||
      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
    return;

  // Make sure we do not modify exec between def and use.
  // A copy with implicitly defined exec inserted earlier is an exclusion; it
  // does not really modify exec.
  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
        !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
      return;

  for (const auto &SrcOp : Def->explicit_operands())
    if (SrcOp.isReg() && SrcOp.isUse() &&
        (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
         SrcOp.getReg() == AMDGPU::EXEC))
      Src.push_back(SrcOp);
}

// Search and combine pairs of equivalent instructions, like
// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
// One of the operands is the exec mask.
void SILowerControlFlow::combineMasks(MachineInstr &MI) {
  assert(MI.getNumExplicitOperands() == 3);
  SmallVector<MachineOperand, 4> Ops;
  unsigned OpToReplace = 1;
  findMaskOperands(MI, 1, Ops);
  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
  findMaskOperands(MI, 2, Ops);
  if (Ops.size() != 3) return;

  unsigned UniqueOpndIdx;
  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else return;

  unsigned Reg = MI.getOperand(OpToReplace).getReg();
  MI.RemoveOperand(OpToReplace);
  MI.addOperand(Ops[UniqueOpndIdx]);
  if (MRI->use_empty(Reg))
    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}
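
// Walk every block, lowering the control-flow pseudos in place and then
// re-running combineMasks over newly inserted S_AND_B64/S_OR_B64 exec-mask
// manipulations to clean them up.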
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  MRI = &MF.getRegInfo();

  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

    MachineBasicBlock::iterator I, Next, Last;

    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
        emitIf(MI);
        break;

      case AMDGPU::SI_ELSE:
        emitElse(MI);
        break;

      case AMDGPU::SI_BREAK:
        emitBreak(MI);
        break;

      case AMDGPU::SI_IF_BREAK:
        emitIfBreak(MI);
        break;

      case AMDGPU::SI_ELSE_BREAK:
        emitElseBreak(MI);
        break;

      case AMDGPU::SI_LOOP:
        emitLoop(MI);
        break;

      case AMDGPU::SI_END_CF:
        emitEndCf(MI);
        break;

      case AMDGPU::S_AND_B64:
      case AMDGPU::S_OR_B64:
        // Cleanup bit manipulations on exec mask
        combineMasks(MI);
        Last = I;
        continue;

      default:
        Last = I;
        continue;
      }

      // Replay newly inserted code to combine masks
      Next = (Last == MBB.end()) ? MBB.begin() : Last;