//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//
17 #include "AMDGPU.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIInstrInfo.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/StringRef.h"
24 #include "llvm/CodeGen/MachineBasicBlock.h"
25 #include "llvm/CodeGen/MachineFunction.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineInstr.h"
28 #include "llvm/CodeGen/MachineInstrBuilder.h"
29 #include "llvm/CodeGen/MachineOperand.h"
30 #include "llvm/IR/CallingConv.h"
31 #include "llvm/IR/DebugLoc.h"
32 #include "llvm/MC/MCAsmInfo.h"
33 #include "llvm/Pass.h"
34 #include "llvm/Support/CommandLine.h"
35 #include "llvm/Target/TargetMachine.h"
36 #include <cassert>
37 #include <cstdint>
38 #include <iterator>
40 using namespace llvm;
42 #define DEBUG_TYPE "si-insert-skips"
44 static cl::opt<unsigned> SkipThresholdFlag(
45 "amdgpu-skip-threshold",
46 cl::desc("Number of instructions before jumping over divergent control flow"),
47 cl::init(12), cl::Hidden);
namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}
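
// Returns true if branching over the blocks from From up to (not including)
// To is worthwhile: either the region reaches the instruction-count threshold,
// or it contains a VCC branch or an instruction with unwanted effects when
// EXEC is empty, in which case the skip is required for correctness.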
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}
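
// For pixel shaders, insert a block after MBB that exports to the null target
// and ends the program, and branch over it to NextBB while EXEC is still
// non-zero. Returns true if the early-exit block was inserted.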
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)  // vm
    .addImm(0)  // compr
    .addImm(0); // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}
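
// Lower a SI_KILL_* pseudo terminator into instructions that update EXEC
// directly: a V_CMPX compare for the float-compare variant, and
// S_MOV_B64/S_AND_B64/S_ANDN2_B64 of EXEC for the i1 variant.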
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    assert(MI.getOperand(0).isReg());

    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .addReg(AMDGPU::VCC, RegState::Define)
          .addImm(0)  // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0)  // src1 modifiers
          .add(MI.getOperand(0))
          .addImm(0); // omod
    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
          .addImm(0);
      break;
    }

    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}
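
// Create a new, initially empty block immediately after MBB and record it as
// a successor of MBB. The caller fills it with the early-exit sequence.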
MachineBasicBlock *SIInsertSkips::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}
// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}
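
// Walk the function: insert S_CBRANCH_EXECZ skips over SI_MASK_BRANCH regions,
// lower SI_KILL_* terminators, drop redundant S_BRANCHes, and redirect
// misplaced SI_RETURN_TO_EPILOG to an empty block at the end of the function.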
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // Remove this unconditional branch when a skip block has been
          // inserted after the current block, so that execution skips the two
          // instructions performing the kill if the exec mask is non-zero.
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}