//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//
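
// Editorial sketch (not from the original source) of the transformation this
// pass performs: given a divergent branch lowered by SILowerControlFlow,
//
//   SI_MASK_BRANCH %bb.flow        ; pseudo marking the join point
//   %bb.then:
//     ...long run of exec-masked instructions...
//
// the pass inserts a real branch so a wave whose exec mask is zero jumps
// over the region instead of stepping through it with all lanes disabled:
//
//   S_CBRANCH_EXECZ %bb.flow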
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;
#define DEBUG_TYPE "si-insert-skips"
static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);
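
// Editorial note: the threshold is a heuristic trade-off. A skip costs one
// extra s_cbranch_execz on every wave, so only regions longer than
// SkipThreshold real instructions are expected to amortize that cost.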
namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}
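
// Editorial note: the meta-opcodes above expand to no machine code, so
// shouldSkip() below does not count them toward the skip threshold.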
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}
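
// Editorial sketch of the uniform-loop hazard handled above: with EXEC = 0 a
// vector compare updates no lanes, so VCC stays 0 and a loop such as
//
//   loop:
//     v_cmp_ge_u32 vcc, ...      ; EXEC = 0: VCC is never set
//     s_cbranch_vccnz exit       ; exit branch is never taken
//     s_branch loop              ; so the wave spins forever
//
// never terminates; branching over the whole loop with s_cbranch_execz
// avoids this.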
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)  // vm
    .addImm(0)  // compr
    .addImm(0); // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}
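
// Editorial sketch of what skipIfDead() emits for a pixel shader whose wave
// may be entirely dead after a kill:
//
//   s_cbranch_execnz BB_next           ; some lanes still live, keep going
//   ; SkipBB (EXEC == 0):
//   exp null off, off, off, off done vm
//   s_endpgm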
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }
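
    // Editorial example of the inversion: a kill guarded by "x < 1.0"
    // (ISD::SETOLT) is emitted as "v_cmpx_gt_f32 1.0, x", because the inline
    // immediate must be the first source operand.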
    assert(MI.getOperand(0).isReg());

    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
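      // (Editorial: the VOPC e32 encoding requires src1 to be a VGPR, which
      // is why the shorter form is only used when the condition value lives
      // in a VGPR; the e64 form below also handles SGPR sources and carries
      // the explicit modifier operands.)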
    } else {
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .addReg(AMDGPU::VCC, RegState::Define)
          .addImm(0)  // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0)  // src1 modifiers
          .add(MI.getOperand(0))
          .addImm(0); // omod
    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
          .addImm(0);
      break;
    }

    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}
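
// Editorial summary of the SI_KILL_I1 lowering above: lanes whose condition
// equals KillVal are disabled, i.e. for KillVal == -1 the new mask is
// "exec & ~cond" (S_ANDN2_B64) and for KillVal == 0 it is "exec & cond"
// (S_AND_B64).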
MachineBasicBlock *SIInsertSkips::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}
// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}
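
// Editorial note: SI_MASK_BRANCH itself emits no instructions; the
// S_CBRANCH_EXECZ built above is placed immediately after it, so the wave
// branches to DestBB only when shouldSkip() judged the guarded region long
// enough to be worth skipping.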
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;
      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // Remove the unconditional branch when a skip block has been
          // inserted after the current block; the skip block's
          // S_CBRANCH_EXECNZ already jumps past the two kill instructions
          // when the exec mask is non-zero.
          MI.eraseFromParent();
        }
        break;
      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;
      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();