1 //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// Insert s_delay_alu instructions to avoid stalls on GFX11+.
12 //===----------------------------------------------------------------------===//
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIInstrInfo.h"
18 #include "llvm/ADT/SetVector.h"
22 #define DEBUG_TYPE "amdgpu-insert-delay-alu"
26 class AMDGPUInsertDelayAlu
: public MachineFunctionPass
{
30 const SIInstrInfo
*SII
;
31 const TargetRegisterInfo
*TRI
;
33 TargetSchedModel SchedModel
;
35 AMDGPUInsertDelayAlu() : MachineFunctionPass(ID
) {}
37 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
39 MachineFunctionPass::getAnalysisUsage(AU
);
42 // Return true if MI waits for all outstanding VALU instructions to complete.
43 static bool instructionWaitsForVALU(const MachineInstr
&MI
) {
44 // These instruction types wait for VA_VDST==0 before issuing.
45 const uint64_t VA_VDST_0
= SIInstrFlags::DS
| SIInstrFlags::EXP
|
46 SIInstrFlags::FLAT
| SIInstrFlags::MIMG
|
47 SIInstrFlags::MTBUF
| SIInstrFlags::MUBUF
;
48 if (MI
.getDesc().TSFlags
& VA_VDST_0
)
50 if (MI
.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32
||
51 MI
.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64
)
53 if (MI
.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR
&&
54 AMDGPU::DepCtr::decodeFieldVaVdst(MI
.getOperand(0).getImm()) == 0)
59 // Types of delay that can be encoded in an s_delay_alu instruction.
60 enum DelayType
{ VALU
, TRANS
, SALU
, OTHER
};
62 // Get the delay type for an instruction with the specified TSFlags.
63 static DelayType
getDelayType(uint64_t TSFlags
) {
64 if (TSFlags
& SIInstrFlags::TRANS
)
66 if (TSFlags
& SIInstrFlags::VALU
)
68 if (TSFlags
& SIInstrFlags::SALU
)
73 // Information about the last instruction(s) that wrote to a particular
74 // regunit. In straight-line code there will only be one such instruction, but
75 // when control flow converges we merge the delay information from each path
76 // to represent the union of the worst-case delays of each type.
78 // One larger than the maximum number of (non-TRANS) VALU instructions we
79 // can encode in an s_delay_alu instruction.
80 static constexpr unsigned VALU_MAX
= 5;
82 // One larger than the maximum number of TRANS instructions we can encode in
83 // an s_delay_alu instruction.
84 static constexpr unsigned TRANS_MAX
= 4;
86 // One larger than the maximum number of SALU cycles we can encode in an
87 // s_delay_alu instruction.
88 static constexpr unsigned SALU_CYCLES_MAX
= 4;
90 // If it was written by a (non-TRANS) VALU, remember how many clock cycles
91 // are left until it completes, and how many other (non-TRANS) VALU we have
92 // seen since it was issued.
93 uint8_t VALUCycles
= 0;
94 uint8_t VALUNum
= VALU_MAX
;
96 // If it was written by a TRANS, remember how many clock cycles are left
97 // until it completes, and how many other TRANS we have seen since it was
99 uint8_t TRANSCycles
= 0;
100 uint8_t TRANSNum
= TRANS_MAX
;
101 // Also remember how many other (non-TRANS) VALU we have seen since it was
102 // issued. When an instruction depends on both a prior TRANS and a prior
103 // non-TRANS VALU, this is used to decide whether to encode a wait for just
104 // one or both of them.
105 uint8_t TRANSNumVALU
= VALU_MAX
;
107 // If it was written by an SALU, remember how many clock cycles are left
108 // until it completes.
109 uint8_t SALUCycles
= 0;
111 DelayInfo() = default;
113 DelayInfo(DelayType Type
, unsigned Cycles
) {
116 llvm_unreachable("unexpected type");
122 TRANSCycles
= Cycles
;
127 // Guard against pseudo-instructions like SI_CALL which are marked as
128 // SALU but with a very high latency.
129 SALUCycles
= std::min(Cycles
, SALU_CYCLES_MAX
);
134 bool operator==(const DelayInfo
&RHS
) const {
135 return VALUCycles
== RHS
.VALUCycles
&& VALUNum
== RHS
.VALUNum
&&
136 TRANSCycles
== RHS
.TRANSCycles
&& TRANSNum
== RHS
.TRANSNum
&&
137 TRANSNumVALU
== RHS
.TRANSNumVALU
&& SALUCycles
== RHS
.SALUCycles
;
140 bool operator!=(const DelayInfo
&RHS
) const { return !(*this == RHS
); }
142 // Merge another DelayInfo into this one, to represent the union of the
143 // worst-case delays of each type.
144 void merge(const DelayInfo
&RHS
) {
145 VALUCycles
= std::max(VALUCycles
, RHS
.VALUCycles
);
146 VALUNum
= std::min(VALUNum
, RHS
.VALUNum
);
147 TRANSCycles
= std::max(TRANSCycles
, RHS
.TRANSCycles
);
148 TRANSNum
= std::min(TRANSNum
, RHS
.TRANSNum
);
149 TRANSNumVALU
= std::min(TRANSNumVALU
, RHS
.TRANSNumVALU
);
150 SALUCycles
= std::max(SALUCycles
, RHS
.SALUCycles
);
153 // Update this DelayInfo after issuing an instruction. IsVALU should be 1
154 // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
155 // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
156 // instruction. Return true if there is no longer any useful delay info.
157 bool advance(DelayType Type
, unsigned Cycles
) {
160 VALUNum
+= (Type
== VALU
);
161 if (VALUNum
>= VALU_MAX
|| VALUCycles
<= Cycles
) {
162 // Forget about the VALU instruction. It was too far back or has
163 // definitely completed by now.
167 VALUCycles
-= Cycles
;
171 TRANSNum
+= (Type
== TRANS
);
172 TRANSNumVALU
+= (Type
== VALU
);
173 if (TRANSNum
>= TRANS_MAX
|| TRANSCycles
<= Cycles
) {
174 // Forget about any TRANS instruction. It was too far back or has
175 // definitely completed by now.
176 TRANSNum
= TRANS_MAX
;
177 TRANSNumVALU
= VALU_MAX
;
180 TRANSCycles
-= Cycles
;
184 if (SALUCycles
<= Cycles
) {
185 // Forget about any SALU instruction. It has definitely completed by
189 SALUCycles
-= Cycles
;
196 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
199 dbgs() << " VALUCycles=" << (int)VALUCycles
;
200 if (VALUNum
< VALU_MAX
)
201 dbgs() << " VALUNum=" << (int)VALUNum
;
203 dbgs() << " TRANSCycles=" << (int)TRANSCycles
;
204 if (TRANSNum
< TRANS_MAX
)
205 dbgs() << " TRANSNum=" << (int)TRANSNum
;
206 if (TRANSNumVALU
< VALU_MAX
)
207 dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU
;
209 dbgs() << " SALUCycles=" << (int)SALUCycles
;
214 // A map from regunits to the delay info for that regunit.
215 struct DelayState
: DenseMap
<unsigned, DelayInfo
> {
216 // Merge another DelayState into this one by merging the delay info for each
218 void merge(const DelayState
&RHS
) {
219 for (const auto &KV
: RHS
) {
222 std::tie(It
, Inserted
) = insert(KV
);
224 It
->second
.merge(KV
.second
);
228 // Advance the delay info for each regunit, erasing any that are no longer
230 void advance(DelayType Type
, unsigned Cycles
) {
232 for (auto I
= begin(), E
= end(); I
!= E
; I
= Next
) {
234 if (I
->second
.advance(Type
, Cycles
))
239 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
240 void dump(const TargetRegisterInfo
*TRI
) const {
242 dbgs() << " empty\n";
246 // Dump DelayInfo for each RegUnit in numerical order.
247 SmallVector
<const_iterator
, 8> Order
;
248 Order
.reserve(size());
249 for (const_iterator I
= begin(), E
= end(); I
!= E
; ++I
)
251 llvm::sort(Order
, [](const const_iterator
&A
, const const_iterator
&B
) {
252 return A
->first
< B
->first
;
254 for (const_iterator I
: Order
) {
255 dbgs() << " " << printRegUnit(I
->first
, TRI
);
263 // The saved delay state at the end of each basic block.
264 DenseMap
<MachineBasicBlock
*, DelayState
> BlockState
;
266 // Emit an s_delay_alu instruction if necessary before MI.
267 MachineInstr
*emitDelayAlu(MachineInstr
&MI
, DelayInfo Delay
,
268 MachineInstr
*LastDelayAlu
) {
271 // Wait for a TRANS instruction.
272 if (Delay
.TRANSNum
< DelayInfo::TRANS_MAX
)
273 Imm
|= 4 + Delay
.TRANSNum
;
275 // Wait for a VALU instruction (if it's more recent than any TRANS
276 // instruction that we're also waiting for).
277 if (Delay
.VALUNum
< DelayInfo::VALU_MAX
&&
278 Delay
.VALUNum
<= Delay
.TRANSNumVALU
) {
280 Imm
|= Delay
.VALUNum
<< 7;
282 Imm
|= Delay
.VALUNum
;
285 // Wait for an SALU instruction.
286 if (Delay
.SALUCycles
) {
287 assert(Delay
.SALUCycles
< DelayInfo::SALU_CYCLES_MAX
);
289 // We have already encoded a VALU and a TRANS delay. There's no room in
290 // the encoding for an SALU delay as well, so just drop it.
291 } else if (Imm
& 0xf) {
292 Imm
|= (Delay
.SALUCycles
+ 8) << 7;
294 Imm
|= Delay
.SALUCycles
+ 8;
298 // Don't emit the s_delay_alu instruction if there's nothing to wait for.
302 // If we only need to wait for one instruction, try encoding it in the last
303 // s_delay_alu that we emitted.
304 if (!(Imm
& 0x780) && LastDelayAlu
) {
306 for (auto I
= MachineBasicBlock::instr_iterator(LastDelayAlu
),
307 E
= MachineBasicBlock::instr_iterator(MI
);
309 if (!I
->isBundle() && !I
->isMetaInstruction())
313 MachineOperand
&Op
= LastDelayAlu
->getOperand(0);
314 unsigned LastImm
= Op
.getImm();
315 assert((LastImm
& ~0xf) == 0 &&
316 "Remembered an s_delay_alu with no room for another delay!");
317 LastImm
|= Imm
<< 7 | Skip
<< 4;
323 auto &MBB
= *MI
.getParent();
324 MachineInstr
*DelayAlu
=
325 BuildMI(MBB
, MI
, DebugLoc(), SII
->get(AMDGPU::S_DELAY_ALU
)).addImm(Imm
);
326 // Remember the s_delay_alu for next time if there is still room in it to
327 // encode another delay.
328 return (Imm
& 0x780) ? nullptr : DelayAlu
;
331 bool runOnMachineBasicBlock(MachineBasicBlock
&MBB
, bool Emit
) {
333 for (auto *Pred
: MBB
.predecessors())
334 State
.merge(BlockState
[Pred
]);
336 LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB
)
340 bool Changed
= false;
341 MachineInstr
*LastDelayAlu
= nullptr;
343 // Iterate over the contents of bundles, but don't emit any instructions
345 for (auto &MI
: MBB
.instrs()) {
346 if (MI
.isBundle() || MI
.isMetaInstruction())
349 // Ignore some more instructions that do not generate any code.
350 switch (MI
.getOpcode()) {
351 case AMDGPU::SI_RETURN_TO_EPILOG
:
355 DelayType Type
= getDelayType(MI
.getDesc().TSFlags
);
357 if (instructionWaitsForVALU(MI
)) {
358 // Forget about all outstanding VALU delays.
359 // TODO: This is overkill since it also forgets about SALU delays.
360 State
= DelayState();
361 } else if (Type
!= OTHER
) {
363 // TODO: Scan implicit uses too?
364 for (const auto &Op
: MI
.explicit_uses()) {
366 // One of the operands of the writelane is also the output operand.
367 // This creates the insertion of redundant delays. Hence, we have to
368 // ignore this operand.
369 if (MI
.getOpcode() == AMDGPU::V_WRITELANE_B32
&& Op
.isTied())
371 for (MCRegUnit Unit
: TRI
->regunits(Op
.getReg())) {
372 auto It
= State
.find(Unit
);
373 if (It
!= State
.end()) {
374 Delay
.merge(It
->second
);
380 if (Emit
&& !MI
.isBundledWithPred()) {
381 // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
383 LastDelayAlu
= emitDelayAlu(MI
, Delay
, LastDelayAlu
);
388 // TODO: Scan implicit defs too?
389 for (const auto &Op
: MI
.defs()) {
390 unsigned Latency
= SchedModel
.computeOperandLatency(
391 &MI
, Op
.getOperandNo(), nullptr, 0);
392 for (MCRegUnit Unit
: TRI
->regunits(Op
.getReg()))
393 State
[Unit
] = DelayInfo(Type
, Latency
);
397 // Advance by the number of cycles it takes to issue this instruction.
398 // TODO: Use a more advanced model that accounts for instructions that
399 // take multiple cycles to issue on a particular pipeline.
400 unsigned Cycles
= SIInstrInfo::getNumWaitStates(MI
);
401 // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
402 // instructions on the assumption that they will usually have to be issued
404 State
.advance(Type
, Cycles
);
406 LLVM_DEBUG(dbgs() << " State after " << MI
; State
.dump(TRI
););
410 assert(State
== BlockState
[&MBB
] &&
411 "Basic block state should not have changed on final pass!");
412 } else if (State
!= BlockState
[&MBB
]) {
413 BlockState
[&MBB
] = std::move(State
);
419 bool runOnMachineFunction(MachineFunction
&MF
) override
{
420 if (skipFunction(MF
.getFunction()))
423 LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF
.getName()
426 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
427 if (!ST
.hasDelayAlu())
430 SII
= ST
.getInstrInfo();
431 TRI
= ST
.getRegisterInfo();
433 SchedModel
.init(&ST
);
435 // Calculate the delay state for each basic block, iterating until we reach
437 SetVector
<MachineBasicBlock
*> WorkList
;
438 for (auto &MBB
: reverse(MF
))
439 WorkList
.insert(&MBB
);
440 while (!WorkList
.empty()) {
441 auto &MBB
= *WorkList
.pop_back_val();
442 bool Changed
= runOnMachineBasicBlock(MBB
, false);
444 WorkList
.insert(MBB
.succ_begin(), MBB
.succ_end());
447 LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
449 // Make one last pass over all basic blocks to emit s_delay_alu
451 bool Changed
= false;
453 Changed
|= runOnMachineBasicBlock(MBB
, true);
460 char AMDGPUInsertDelayAlu::ID
= 0;
462 char &llvm::AMDGPUInsertDelayAluID
= AMDGPUInsertDelayAlu::ID
;
464 INITIALIZE_PASS(AMDGPUInsertDelayAlu
, DEBUG_TYPE
, "AMDGPU Insert Delay ALU",