Revert " [LoongArch][ISel] Check the number of sign bits in `PatGprGpr_32` (#107432)"
[llvm-project.git] / llvm / lib / Target / AMDGPU / AMDGPUInsertDelayAlu.cpp
blob7619a39bac9c1422b7f92082fe766562f945be98
1 //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert s_delay_alu instructions to avoid stalls on GFX11+.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIInstrInfo.h"
18 #include "llvm/ADT/SetVector.h"
20 using namespace llvm;
22 #define DEBUG_TYPE "amdgpu-insert-delay-alu"
24 namespace {
26 class AMDGPUInsertDelayAlu : public MachineFunctionPass {
27 public:
28 static char ID;
30 const SIInstrInfo *SII;
31 const TargetRegisterInfo *TRI;
33 TargetSchedModel SchedModel;
35 AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
37 void getAnalysisUsage(AnalysisUsage &AU) const override {
38 AU.setPreservesCFG();
39 MachineFunctionPass::getAnalysisUsage(AU);
42 // Return true if MI waits for all outstanding VALU instructions to complete.
43 static bool instructionWaitsForVALU(const MachineInstr &MI) {
44 // These instruction types wait for VA_VDST==0 before issuing.
45 const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
46 SIInstrFlags::FLAT | SIInstrFlags::MIMG |
47 SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
48 if (MI.getDesc().TSFlags & VA_VDST_0)
49 return true;
50 if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
51 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
52 return true;
53 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
54 AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
55 return true;
56 return false;
59 // Types of delay that can be encoded in an s_delay_alu instruction.
60 enum DelayType { VALU, TRANS, SALU, OTHER };
62 // Get the delay type for an instruction with the specified TSFlags.
63 static DelayType getDelayType(uint64_t TSFlags) {
64 if (TSFlags & SIInstrFlags::TRANS)
65 return TRANS;
66 if (TSFlags & SIInstrFlags::VALU)
67 return VALU;
68 if (TSFlags & SIInstrFlags::SALU)
69 return SALU;
70 return OTHER;
73 // Information about the last instruction(s) that wrote to a particular
74 // regunit. In straight-line code there will only be one such instruction, but
75 // when control flow converges we merge the delay information from each path
76 // to represent the union of the worst-case delays of each type.
77 struct DelayInfo {
78 // One larger than the maximum number of (non-TRANS) VALU instructions we
79 // can encode in an s_delay_alu instruction.
80 static constexpr unsigned VALU_MAX = 5;
82 // One larger than the maximum number of TRANS instructions we can encode in
83 // an s_delay_alu instruction.
84 static constexpr unsigned TRANS_MAX = 4;
86 // One larger than the maximum number of SALU cycles we can encode in an
87 // s_delay_alu instruction.
88 static constexpr unsigned SALU_CYCLES_MAX = 4;
90 // If it was written by a (non-TRANS) VALU, remember how many clock cycles
91 // are left until it completes, and how many other (non-TRANS) VALU we have
92 // seen since it was issued.
93 uint8_t VALUCycles = 0;
94 uint8_t VALUNum = VALU_MAX;
96 // If it was written by a TRANS, remember how many clock cycles are left
97 // until it completes, and how many other TRANS we have seen since it was
98 // issued.
99 uint8_t TRANSCycles = 0;
100 uint8_t TRANSNum = TRANS_MAX;
101 // Also remember how many other (non-TRANS) VALU we have seen since it was
102 // issued. When an instruction depends on both a prior TRANS and a prior
103 // non-TRANS VALU, this is used to decide whether to encode a wait for just
104 // one or both of them.
105 uint8_t TRANSNumVALU = VALU_MAX;
107 // If it was written by an SALU, remember how many clock cycles are left
108 // until it completes.
109 uint8_t SALUCycles = 0;
111 DelayInfo() = default;
113 DelayInfo(DelayType Type, unsigned Cycles) {
114 switch (Type) {
115 default:
116 llvm_unreachable("unexpected type");
117 case VALU:
118 VALUCycles = Cycles;
119 VALUNum = 0;
120 break;
121 case TRANS:
122 TRANSCycles = Cycles;
123 TRANSNum = 0;
124 TRANSNumVALU = 0;
125 break;
126 case SALU:
127 // Guard against pseudo-instructions like SI_CALL which are marked as
128 // SALU but with a very high latency.
129 SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
130 break;
134 bool operator==(const DelayInfo &RHS) const {
135 return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
136 TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
137 TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
140 bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
142 // Merge another DelayInfo into this one, to represent the union of the
143 // worst-case delays of each type.
144 void merge(const DelayInfo &RHS) {
145 VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
146 VALUNum = std::min(VALUNum, RHS.VALUNum);
147 TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
148 TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
149 TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
150 SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
153 // Update this DelayInfo after issuing an instruction. IsVALU should be 1
154 // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
155 // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
156 // instruction. Return true if there is no longer any useful delay info.
157 bool advance(DelayType Type, unsigned Cycles) {
158 bool Erase = true;
160 VALUNum += (Type == VALU);
161 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
162 // Forget about the VALU instruction. It was too far back or has
163 // definitely completed by now.
164 VALUNum = VALU_MAX;
165 VALUCycles = 0;
166 } else {
167 VALUCycles -= Cycles;
168 Erase = false;
171 TRANSNum += (Type == TRANS);
172 TRANSNumVALU += (Type == VALU);
173 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
174 // Forget about any TRANS instruction. It was too far back or has
175 // definitely completed by now.
176 TRANSNum = TRANS_MAX;
177 TRANSNumVALU = VALU_MAX;
178 TRANSCycles = 0;
179 } else {
180 TRANSCycles -= Cycles;
181 Erase = false;
184 if (SALUCycles <= Cycles) {
185 // Forget about any SALU instruction. It has definitely completed by
186 // now.
187 SALUCycles = 0;
188 } else {
189 SALUCycles -= Cycles;
190 Erase = false;
193 return Erase;
196 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
197 void dump() const {
198 if (VALUCycles)
199 dbgs() << " VALUCycles=" << (int)VALUCycles;
200 if (VALUNum < VALU_MAX)
201 dbgs() << " VALUNum=" << (int)VALUNum;
202 if (TRANSCycles)
203 dbgs() << " TRANSCycles=" << (int)TRANSCycles;
204 if (TRANSNum < TRANS_MAX)
205 dbgs() << " TRANSNum=" << (int)TRANSNum;
206 if (TRANSNumVALU < VALU_MAX)
207 dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
208 if (SALUCycles)
209 dbgs() << " SALUCycles=" << (int)SALUCycles;
211 #endif
214 // A map from regunits to the delay info for that regunit.
215 struct DelayState : DenseMap<unsigned, DelayInfo> {
216 // Merge another DelayState into this one by merging the delay info for each
217 // regunit.
218 void merge(const DelayState &RHS) {
219 for (const auto &KV : RHS) {
220 iterator It;
221 bool Inserted;
222 std::tie(It, Inserted) = insert(KV);
223 if (!Inserted)
224 It->second.merge(KV.second);
228 // Advance the delay info for each regunit, erasing any that are no longer
229 // useful.
230 void advance(DelayType Type, unsigned Cycles) {
231 iterator Next;
232 for (auto I = begin(), E = end(); I != E; I = Next) {
233 Next = std::next(I);
234 if (I->second.advance(Type, Cycles))
235 erase(I);
239 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
240 void dump(const TargetRegisterInfo *TRI) const {
241 if (empty()) {
242 dbgs() << " empty\n";
243 return;
246 // Dump DelayInfo for each RegUnit in numerical order.
247 SmallVector<const_iterator, 8> Order;
248 Order.reserve(size());
249 for (const_iterator I = begin(), E = end(); I != E; ++I)
250 Order.push_back(I);
251 llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
252 return A->first < B->first;
254 for (const_iterator I : Order) {
255 dbgs() << " " << printRegUnit(I->first, TRI);
256 I->second.dump();
257 dbgs() << "\n";
260 #endif
263 // The saved delay state at the end of each basic block.
264 DenseMap<MachineBasicBlock *, DelayState> BlockState;
266 // Emit an s_delay_alu instruction if necessary before MI.
267 MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
268 MachineInstr *LastDelayAlu) {
269 unsigned Imm = 0;
271 // Wait for a TRANS instruction.
272 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
273 Imm |= 4 + Delay.TRANSNum;
275 // Wait for a VALU instruction (if it's more recent than any TRANS
276 // instruction that we're also waiting for).
277 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
278 Delay.VALUNum <= Delay.TRANSNumVALU) {
279 if (Imm & 0xf)
280 Imm |= Delay.VALUNum << 7;
281 else
282 Imm |= Delay.VALUNum;
285 // Wait for an SALU instruction.
286 if (Delay.SALUCycles) {
287 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
288 if (Imm & 0x780) {
289 // We have already encoded a VALU and a TRANS delay. There's no room in
290 // the encoding for an SALU delay as well, so just drop it.
291 } else if (Imm & 0xf) {
292 Imm |= (Delay.SALUCycles + 8) << 7;
293 } else {
294 Imm |= Delay.SALUCycles + 8;
298 // Don't emit the s_delay_alu instruction if there's nothing to wait for.
299 if (!Imm)
300 return LastDelayAlu;
302 // If we only need to wait for one instruction, try encoding it in the last
303 // s_delay_alu that we emitted.
304 if (!(Imm & 0x780) && LastDelayAlu) {
305 unsigned Skip = 0;
306 for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
307 E = MachineBasicBlock::instr_iterator(MI);
308 ++I != E;) {
309 if (!I->isBundle() && !I->isMetaInstruction())
310 ++Skip;
312 if (Skip < 6) {
313 MachineOperand &Op = LastDelayAlu->getOperand(0);
314 unsigned LastImm = Op.getImm();
315 assert((LastImm & ~0xf) == 0 &&
316 "Remembered an s_delay_alu with no room for another delay!");
317 LastImm |= Imm << 7 | Skip << 4;
318 Op.setImm(LastImm);
319 return nullptr;
323 auto &MBB = *MI.getParent();
324 MachineInstr *DelayAlu =
325 BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
326 // Remember the s_delay_alu for next time if there is still room in it to
327 // encode another delay.
328 return (Imm & 0x780) ? nullptr : DelayAlu;
331 bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
332 DelayState State;
333 for (auto *Pred : MBB.predecessors())
334 State.merge(BlockState[Pred]);
336 LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB)
337 << "\n";
338 State.dump(TRI););
340 bool Changed = false;
341 MachineInstr *LastDelayAlu = nullptr;
343 // Iterate over the contents of bundles, but don't emit any instructions
344 // inside a bundle.
345 for (auto &MI : MBB.instrs()) {
346 if (MI.isBundle() || MI.isMetaInstruction())
347 continue;
349 // Ignore some more instructions that do not generate any code.
350 switch (MI.getOpcode()) {
351 case AMDGPU::SI_RETURN_TO_EPILOG:
352 continue;
355 DelayType Type = getDelayType(MI.getDesc().TSFlags);
357 if (instructionWaitsForVALU(MI)) {
358 // Forget about all outstanding VALU delays.
359 // TODO: This is overkill since it also forgets about SALU delays.
360 State = DelayState();
361 } else if (Type != OTHER) {
362 DelayInfo Delay;
363 // TODO: Scan implicit uses too?
364 for (const auto &Op : MI.explicit_uses()) {
365 if (Op.isReg()) {
366 // One of the operands of the writelane is also the output operand.
367 // This creates the insertion of redundant delays. Hence, we have to
368 // ignore this operand.
369 if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
370 continue;
371 for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
372 auto It = State.find(Unit);
373 if (It != State.end()) {
374 Delay.merge(It->second);
375 State.erase(Unit);
380 if (Emit && !MI.isBundledWithPred()) {
381 // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
382 // just ignore them?
383 LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
387 if (Type != OTHER) {
388 // TODO: Scan implicit defs too?
389 for (const auto &Op : MI.defs()) {
390 unsigned Latency = SchedModel.computeOperandLatency(
391 &MI, Op.getOperandNo(), nullptr, 0);
392 for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
393 State[Unit] = DelayInfo(Type, Latency);
397 // Advance by the number of cycles it takes to issue this instruction.
398 // TODO: Use a more advanced model that accounts for instructions that
399 // take multiple cycles to issue on a particular pipeline.
400 unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
401 // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
402 // instructions on the assumption that they will usually have to be issued
403 // twice?
404 State.advance(Type, Cycles);
406 LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI););
409 if (Emit) {
410 assert(State == BlockState[&MBB] &&
411 "Basic block state should not have changed on final pass!");
412 } else if (State != BlockState[&MBB]) {
413 BlockState[&MBB] = std::move(State);
414 Changed = true;
416 return Changed;
419 bool runOnMachineFunction(MachineFunction &MF) override {
420 if (skipFunction(MF.getFunction()))
421 return false;
423 LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
424 << "\n");
426 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
427 if (!ST.hasDelayAlu())
428 return false;
430 SII = ST.getInstrInfo();
431 TRI = ST.getRegisterInfo();
433 SchedModel.init(&ST);
435 // Calculate the delay state for each basic block, iterating until we reach
436 // a fixed point.
437 SetVector<MachineBasicBlock *> WorkList;
438 for (auto &MBB : reverse(MF))
439 WorkList.insert(&MBB);
440 while (!WorkList.empty()) {
441 auto &MBB = *WorkList.pop_back_val();
442 bool Changed = runOnMachineBasicBlock(MBB, false);
443 if (Changed)
444 WorkList.insert(MBB.succ_begin(), MBB.succ_end());
447 LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
449 // Make one last pass over all basic blocks to emit s_delay_alu
450 // instructions.
451 bool Changed = false;
452 for (auto &MBB : MF)
453 Changed |= runOnMachineBasicBlock(MBB, true);
454 return Changed;
458 } // namespace
460 char AMDGPUInsertDelayAlu::ID = 0;
462 char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
464 INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
465 false, false)