//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"
static unsigned SkipThreshold;

static cl::opt<unsigned, true> SkipThresholdFlag(
    "amdgpu-skip-threshold", cl::Hidden,
    cl::desc(
        "Number of instructions before jumping over divergent control flow"),
    cl::location(SkipThreshold), cl::init(12));
namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
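  // Walk backwards from the branch looking for the S_AND/S_ANDN2 that
  // defines vcc, giving up after a few instructions or as soon as EXEC is
  // clobbered.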
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }

  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
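  // Canonicalize: if EXEC is the second operand of the AND, commute it into
  // the first so the checks below only have to inspect Op1.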
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }

  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
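  // If the mask is a register, keep scanning backwards for the instruction
  // that defines it; only a move of the immediate 0 or -1 is acceptable.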
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First, if sreg is only used in the AND instruction, fold the immediate
    // into the AND and delete the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }

    // Remove AND instruction
    A->eraseFromParent();
  }
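
  // The condition is now known to be either 0 or EXEC itself; rewrite the
  // branch accordingly.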
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }
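
  // The rewritten branch no longer reads vcc: drop the use operand and let
  // the target re-add the implicit operands appropriate for the new opcode.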
  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
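
// If MI is an S_SET_GPR_IDX_ON identical to First, and nothing between the
// two changes the index mode, M0, or the index register, then MI (and any
// intervening S_SET_GPR_IDX_OFF) is redundant and can be removed.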
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to
  // verify that the second one is indeed redundant.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn ||
            !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
               I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
              I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;
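
  // A lone conditional branch leaves FalseMBB null; the false path is then
  // the layout fall-through block.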
  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}
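
// Returns true if the execz branch from From to To must be kept: either an
// instruction in the region is unsafe or expensive to execute with EXEC = 0,
// or the region is long enough that skipping it is still profitable.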
bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;
    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain cbranch out of the loop lest it become infinite.
      if (I->isConditionalBranch())
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
          TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

// Returns true if the skip branch instruction is removed.
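// A typical pattern this removes (illustrative; block labels invented):
//   s_cbranch_execz .LBB0_2  ; skip the divergent body when no lanes are live
//   ...                      ; short body, harmless with EXEC = 0
// .LBB0_2:
// When the body is cheap and safe to run with all lanes disabled, falling
// through is faster than paying for the branch.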
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}
bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;
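
  // Block numbers must be current here: removeExeczBranch compares them to
  // recognize forward branches.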
  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do the expensive checks in optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(
             make_range(MBB.instr_begin(), MBB.instr_end()))) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}