//===-- SIInsertSkips.cpp - Use predicates for control flow --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//
17 #include "AMDGPUSubtarget.h"
18 #include "SIInstrInfo.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "llvm/ADT/SmallVector.h"
22 #include "llvm/ADT/StringRef.h"
23 #include "llvm/CodeGen/MachineBasicBlock.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineFunctionPass.h"
26 #include "llvm/CodeGen/MachineInstr.h"
27 #include "llvm/CodeGen/MachineInstrBuilder.h"
28 #include "llvm/CodeGen/MachineOperand.h"
29 #include "llvm/IR/CallingConv.h"
30 #include "llvm/IR/DebugLoc.h"
31 #include "llvm/MC/MCAsmInfo.h"
32 #include "llvm/Pass.h"
33 #include "llvm/Support/CommandLine.h"
34 #include "llvm/Target/TargetMachine.h"
41 #define DEBUG_TYPE "si-insert-skips"
static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);
namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

  bool optimizeVccBranch(MachineInstr &MI) const;

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
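// Returns true if MI will not expand to any real ISA instructions (meta
// instructions and the SI_MASK_BRANCH placeholder), so it does not count
// toward the skip threshold.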
static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
  if (MI.isMetaInstruction())
    return true;

  // Handle target specific opcodes.
  switch (MI.getOpcode()) {
  case AMDGPU::SI_MASK_BRANCH:
    return true;
  default:
    return false;
  }
}
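// Decide whether a skip branch is worthwhile: scan the blocks between From and
// To and return true as soon as an instruction is found that is unsafe or too
// expensive to execute with EXEC == 0, or once SkipThreshold is reached.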
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(*I))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
          I->getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}
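// After a kill in a pixel shader, if the remainder of the function is worth
// skipping, insert a block that runs only when EXEC is zero: it exports to the
// null target and ends the program, while live waves branch over it to NextBB.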
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09)  // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)   // vm
    .addImm(0)   // compr
    .addImm(0);  // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);

  return true;
}
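// Lower a SI_KILL_*_TERMINATOR pseudo into the instructions that clear the
// killed lanes from EXEC: a V_CMPX compare for the float form, or an
// S_MOV/S_AND/S_ANDN2 of EXEC for the i1 form.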
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }
    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
    if (ST.hasNoSdstCMPX())
      Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);

    assert(MI.getOperand(0).isReg());

    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
      if (!ST.hasNoSdstCMPX())
        I.addReg(AMDGPU::VCC, RegState::Define);

      I.addImm(0)  // src0 modifiers
        .add(MI.getOperand(1))
        .addImm(0)  // src1 modifiers
        .add(MI.getOperand(0));

      I.addImm(0);  // omod
    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
                                                     : AMDGPU::S_MOV_B64), Exec)
          .addImm(0);
      break;
    }

    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    if (ST.isWave32())
      Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
        .addReg(Exec)
        .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}
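// Create an empty block immediately after MBB and make it a successor; the
// caller fills it with the code that should only run when EXEC is zero.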
MachineBasicBlock *SIInsertSkips::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}
// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}
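// Rough sketch of the rewrite performed below (wave64 pseudo-assembly, for
// illustration only, not taken from a real compilation):
//
//   s_mov_b64        s[0:1], -1
//   s_and_b64        vcc, exec, s[0:1]
//   s_cbranch_vccnz  LABEL
// =>
//   s_cbranch_execnz LABEL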
bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1
  // vcc = S_AND_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && Op2.getImm() != -1)
    return Changed;

  unsigned SReg = AMDGPU::NoRegister;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E ||
        !M->isMoveImmediate() ||
        !M->getOperand(1).isImm() ||
        M->getOperand(1).getImm() != -1)
      return Changed;
    // If sreg is only used in the and instruction, fold the immediate
    // into the and and drop the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(-1);
      M->eraseFromParent();
    }
  }
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else {
    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
                               : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
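// Driver: walk every block, expand SI_MASK_BRANCH and SI_KILL_* pseudos, drop
// branches to the layout successor, redirect an early SI_RETURN_TO_EPILOG to
// an empty trailing block, and rewrite VCC branches whose condition is just
// "exec & -1" into EXEC branches.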
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;
      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // Remove the unconditional branch when a skip block has been
          // inserted after the current one, so that when the exec mask is
          // non-zero the skip branch jumps over the two instructions that
          // finish off the dead wave.
          MI.eraseFromParent();
        }
        break;
      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (NextBB != BE && skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;
      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        MadeChange |= optimizeVccBranch(MI);
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}