1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements hazard recognizers for scheduling on GCN processors.
11 //===----------------------------------------------------------------------===//
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIDefines.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineOperand.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/ErrorHandling.h"
36 //===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
38 //===----------------------------------------------------------------------===//
40 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction
&MF
) :
41 IsHazardRecognizerMode(false),
42 CurrCycleInstr(nullptr),
44 ST(MF
.getSubtarget
<GCNSubtarget
>()),
45 TII(*ST
.getInstrInfo()),
46 TRI(TII
.getRegisterInfo()),
47 ClauseUses(TRI
.getNumRegUnits()),
48 ClauseDefs(TRI
.getNumRegUnits()) {
49 MaxLookAhead
= MF
.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0
) ? 18 : 5;
50 TSchedModel
.init(&ST
);
53 void GCNHazardRecognizer::EmitInstruction(SUnit
*SU
) {
54 EmitInstruction(SU
->getInstr());
57 void GCNHazardRecognizer::EmitInstruction(MachineInstr
*MI
) {
61 static bool isDivFMas(unsigned Opcode
) {
62 return Opcode
== AMDGPU::V_DIV_FMAS_F32
|| Opcode
== AMDGPU::V_DIV_FMAS_F64
;
65 static bool isSGetReg(unsigned Opcode
) {
66 return Opcode
== AMDGPU::S_GETREG_B32
;
69 static bool isSSetReg(unsigned Opcode
) {
70 return Opcode
== AMDGPU::S_SETREG_B32
|| Opcode
== AMDGPU::S_SETREG_IMM32_B32
;
73 static bool isRWLane(unsigned Opcode
) {
74 return Opcode
== AMDGPU::V_READLANE_B32
|| Opcode
== AMDGPU::V_WRITELANE_B32
;
77 static bool isRFE(unsigned Opcode
) {
78 return Opcode
== AMDGPU::S_RFE_B64
;
81 static bool isSMovRel(unsigned Opcode
) {
83 case AMDGPU::S_MOVRELS_B32
:
84 case AMDGPU::S_MOVRELS_B64
:
85 case AMDGPU::S_MOVRELD_B32
:
86 case AMDGPU::S_MOVRELD_B64
:
93 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo
&TII
,
94 const MachineInstr
&MI
) {
95 if (TII
.isAlwaysGDS(MI
.getOpcode()))
98 switch (MI
.getOpcode()) {
99 case AMDGPU::S_SENDMSG
:
100 case AMDGPU::S_SENDMSGHALT
:
101 case AMDGPU::S_TTRACEDATA
:
103 // These DS opcodes don't support GDS.
105 case AMDGPU::DS_PERMUTE_B32
:
106 case AMDGPU::DS_BPERMUTE_B32
:
109 if (TII
.isDS(MI
.getOpcode())) {
110 int GDS
= AMDGPU::getNamedOperandIdx(MI
.getOpcode(),
111 AMDGPU::OpName::gds
);
112 if (MI
.getOperand(GDS
).getImm())
119 static bool isPermlane(const MachineInstr
&MI
) {
120 unsigned Opcode
= MI
.getOpcode();
121 return Opcode
== AMDGPU::V_PERMLANE16_B32
||
122 Opcode
== AMDGPU::V_PERMLANEX16_B32
;
125 static unsigned getHWReg(const SIInstrInfo
*TII
, const MachineInstr
&RegInstr
) {
126 const MachineOperand
*RegOp
= TII
->getNamedOperand(RegInstr
,
127 AMDGPU::OpName::simm16
);
128 return RegOp
->getImm() & AMDGPU::Hwreg::ID_MASK_
;
131 ScheduleHazardRecognizer::HazardType
132 GCNHazardRecognizer::getHazardType(SUnit
*SU
, int Stalls
) {
133 MachineInstr
*MI
= SU
->getInstr();
137 if (SIInstrInfo::isSMRD(*MI
) && checkSMRDHazards(MI
) > 0)
140 // FIXME: Should flat be considered vmem?
141 if ((SIInstrInfo::isVMEM(*MI
) ||
142 SIInstrInfo::isFLAT(*MI
))
143 && checkVMEMHazards(MI
) > 0)
146 if (ST
.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI
) > 0)
149 if (checkFPAtomicToDenormModeHazard(MI
) > 0)
152 if (ST
.hasNoDataDepHazard())
155 if (SIInstrInfo::isVALU(*MI
) && checkVALUHazards(MI
) > 0)
158 if (SIInstrInfo::isDPP(*MI
) && checkDPPHazards(MI
) > 0)
161 if (isDivFMas(MI
->getOpcode()) && checkDivFMasHazards(MI
) > 0)
164 if (isRWLane(MI
->getOpcode()) && checkRWLaneHazards(MI
) > 0)
167 if (isSGetReg(MI
->getOpcode()) && checkGetRegHazards(MI
) > 0)
170 if (isSSetReg(MI
->getOpcode()) && checkSetRegHazards(MI
) > 0)
173 if (isRFE(MI
->getOpcode()) && checkRFEHazards(MI
) > 0)
176 if (ST
.hasReadM0MovRelInterpHazard() &&
177 (TII
.isVINTRP(*MI
) || isSMovRel(MI
->getOpcode())) &&
178 checkReadM0Hazards(MI
) > 0)
181 if (ST
.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII
, *MI
) &&
182 checkReadM0Hazards(MI
) > 0)
185 if (SIInstrInfo::isMAI(*MI
) && checkMAIHazards(MI
) > 0)
188 if ((MI
->mayLoad() || MI
->mayStore()) && checkMAILdStHazards(MI
) > 0)
191 if (MI
->isInlineAsm() && checkInlineAsmHazards(MI
) > 0)
194 if (checkAnyInstHazards(MI
) > 0)
200 static void insertNoopInBundle(MachineInstr
*MI
, const SIInstrInfo
&TII
) {
201 BuildMI(*MI
->getParent(), MI
, MI
->getDebugLoc(), TII
.get(AMDGPU::S_NOP
))
205 void GCNHazardRecognizer::processBundle() {
206 MachineBasicBlock::instr_iterator MI
= std::next(CurrCycleInstr
->getIterator());
207 MachineBasicBlock::instr_iterator E
= CurrCycleInstr
->getParent()->instr_end();
208 // Check bundled MachineInstr's for hazards.
209 for (; MI
!= E
&& MI
->isInsideBundle(); ++MI
) {
210 CurrCycleInstr
= &*MI
;
211 unsigned WaitStates
= PreEmitNoopsCommon(CurrCycleInstr
);
213 if (IsHazardRecognizerMode
)
214 fixHazards(CurrCycleInstr
);
216 for (unsigned i
= 0; i
< WaitStates
; ++i
)
217 insertNoopInBundle(CurrCycleInstr
, TII
);
219 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
220 // include the bundled MI directly after, only add a maximum of
221 // (MaxLookAhead - 1) noops to EmittedInstrs.
222 for (unsigned i
= 0, e
= std::min(WaitStates
, MaxLookAhead
- 1); i
< e
; ++i
)
223 EmittedInstrs
.push_front(nullptr);
225 EmittedInstrs
.push_front(CurrCycleInstr
);
226 EmittedInstrs
.resize(MaxLookAhead
);
228 CurrCycleInstr
= nullptr;
231 unsigned GCNHazardRecognizer::PreEmitNoops(SUnit
*SU
) {
232 IsHazardRecognizerMode
= false;
233 return PreEmitNoopsCommon(SU
->getInstr());
236 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr
*MI
) {
237 IsHazardRecognizerMode
= true;
239 unsigned W
= PreEmitNoopsCommon(MI
);
241 CurrCycleInstr
= nullptr;
245 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr
*MI
) {
249 int WaitStates
= std::max(0, checkAnyInstHazards(MI
));
251 if (SIInstrInfo::isSMRD(*MI
))
252 return std::max(WaitStates
, checkSMRDHazards(MI
));
254 if (SIInstrInfo::isVMEM(*MI
) || SIInstrInfo::isFLAT(*MI
))
255 WaitStates
= std::max(WaitStates
, checkVMEMHazards(MI
));
257 if (ST
.hasNSAtoVMEMBug())
258 WaitStates
= std::max(WaitStates
, checkNSAtoVMEMHazard(MI
));
260 WaitStates
= std::max(WaitStates
, checkFPAtomicToDenormModeHazard(MI
));
262 if (ST
.hasNoDataDepHazard())
265 if (SIInstrInfo::isVALU(*MI
))
266 WaitStates
= std::max(WaitStates
, checkVALUHazards(MI
));
268 if (SIInstrInfo::isDPP(*MI
))
269 WaitStates
= std::max(WaitStates
, checkDPPHazards(MI
));
271 if (isDivFMas(MI
->getOpcode()))
272 WaitStates
= std::max(WaitStates
, checkDivFMasHazards(MI
));
274 if (isRWLane(MI
->getOpcode()))
275 WaitStates
= std::max(WaitStates
, checkRWLaneHazards(MI
));
277 if (MI
->isInlineAsm())
278 return std::max(WaitStates
, checkInlineAsmHazards(MI
));
280 if (isSGetReg(MI
->getOpcode()))
281 return std::max(WaitStates
, checkGetRegHazards(MI
));
283 if (isSSetReg(MI
->getOpcode()))
284 return std::max(WaitStates
, checkSetRegHazards(MI
));
286 if (isRFE(MI
->getOpcode()))
287 return std::max(WaitStates
, checkRFEHazards(MI
));
289 if (ST
.hasReadM0MovRelInterpHazard() && (TII
.isVINTRP(*MI
) ||
290 isSMovRel(MI
->getOpcode())))
291 return std::max(WaitStates
, checkReadM0Hazards(MI
));
293 if (ST
.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII
, *MI
))
294 return std::max(WaitStates
, checkReadM0Hazards(MI
));
296 if (SIInstrInfo::isMAI(*MI
))
297 return std::max(WaitStates
, checkMAIHazards(MI
));
299 if (MI
->mayLoad() || MI
->mayStore())
300 return std::max(WaitStates
, checkMAILdStHazards(MI
));
305 void GCNHazardRecognizer::EmitNoop() {
306 EmittedInstrs
.push_front(nullptr);
309 void GCNHazardRecognizer::AdvanceCycle() {
310 // When the scheduler detects a stall, it will call AdvanceCycle() without
311 // emitting any instructions.
315 // Do not track non-instructions which do not affect the wait states.
316 // If included, these instructions can lead to buffer overflow such that
317 // detectable hazards are missed.
318 if (CurrCycleInstr
->isImplicitDef() || CurrCycleInstr
->isDebugInstr() ||
319 CurrCycleInstr
->isKill())
322 if (CurrCycleInstr
->isBundle()) {
327 unsigned NumWaitStates
= TII
.getNumWaitStates(*CurrCycleInstr
);
329 // Keep track of emitted instructions
330 EmittedInstrs
.push_front(CurrCycleInstr
);
332 // Add a nullptr for each additional wait state after the first. Make sure
333 // not to add more than getMaxLookAhead() items to the list, since we
334 // truncate the list to that size right after this loop.
335 for (unsigned i
= 1, e
= std::min(NumWaitStates
, getMaxLookAhead());
337 EmittedInstrs
.push_front(nullptr);
340 // getMaxLookahead() is the largest number of wait states we will ever need
341 // to insert, so there is no point in keeping track of more than that many
343 EmittedInstrs
.resize(getMaxLookAhead());
345 CurrCycleInstr
= nullptr;
348 void GCNHazardRecognizer::RecedeCycle() {
349 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
352 //===----------------------------------------------------------------------===//
354 //===----------------------------------------------------------------------===//
356 typedef function_ref
<bool(MachineInstr
*, int WaitStates
)> IsExpiredFn
;
358 // Returns a minimum wait states since \p I walking all predecessors.
359 // Only scans until \p IsExpired does not return true.
360 // Can only be run in a hazard recognizer mode.
361 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard
,
362 MachineBasicBlock
*MBB
,
363 MachineBasicBlock::reverse_instr_iterator I
,
365 IsExpiredFn IsExpired
,
366 DenseSet
<const MachineBasicBlock
*> &Visited
) {
367 for (auto E
= MBB
->instr_rend(); I
!= E
; ++I
) {
368 // Don't add WaitStates for parent BUNDLE instructions.
375 if (I
->isInlineAsm() || I
->isImplicitDef() || I
->isDebugInstr())
378 WaitStates
+= SIInstrInfo::getNumWaitStates(*I
);
380 if (IsExpired(&*I
, WaitStates
))
381 return std::numeric_limits
<int>::max();
384 int MinWaitStates
= WaitStates
;
386 for (MachineBasicBlock
*Pred
: MBB
->predecessors()) {
387 if (!Visited
.insert(Pred
).second
)
390 int W
= getWaitStatesSince(IsHazard
, Pred
, Pred
->instr_rbegin(),
391 WaitStates
, IsExpired
, Visited
);
393 if (W
== std::numeric_limits
<int>::max())
396 MinWaitStates
= Found
? std::min(MinWaitStates
, W
) : W
;
397 if (IsExpired(nullptr, MinWaitStates
))
398 return MinWaitStates
;
404 return MinWaitStates
;
406 return std::numeric_limits
<int>::max();
409 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard
,
411 IsExpiredFn IsExpired
) {
412 DenseSet
<const MachineBasicBlock
*> Visited
;
413 return getWaitStatesSince(IsHazard
, MI
->getParent(),
414 std::next(MI
->getReverseIterator()),
415 0, IsExpired
, Visited
);
418 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard
, int Limit
) {
419 if (IsHazardRecognizerMode
) {
420 auto IsExpiredFn
= [Limit
] (MachineInstr
*, int WaitStates
) {
421 return WaitStates
>= Limit
;
423 return ::getWaitStatesSince(IsHazard
, CurrCycleInstr
, IsExpiredFn
);
427 for (MachineInstr
*MI
: EmittedInstrs
) {
432 if (MI
->isInlineAsm())
437 if (WaitStates
>= Limit
)
440 return std::numeric_limits
<int>::max();
443 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg
,
444 IsHazardFn IsHazardDef
,
446 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
448 auto IsHazardFn
= [IsHazardDef
, TRI
, Reg
] (MachineInstr
*MI
) {
449 return IsHazardDef(MI
) && MI
->modifiesRegister(Reg
, TRI
);
452 return getWaitStatesSince(IsHazardFn
, Limit
);
455 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard
,
457 auto IsHazardFn
= [IsHazard
] (MachineInstr
*MI
) {
458 return isSSetReg(MI
->getOpcode()) && IsHazard(MI
);
461 return getWaitStatesSince(IsHazardFn
, Limit
);
464 //===----------------------------------------------------------------------===//
465 // No-op Hazard Detection
466 //===----------------------------------------------------------------------===//
468 static void addRegUnits(const SIRegisterInfo
&TRI
,
469 BitVector
&BV
, unsigned Reg
) {
470 for (MCRegUnitIterator
RUI(Reg
, &TRI
); RUI
.isValid(); ++RUI
)
474 static void addRegsToSet(const SIRegisterInfo
&TRI
,
475 iterator_range
<MachineInstr::const_mop_iterator
> Ops
,
477 for (const MachineOperand
&Op
: Ops
) {
479 addRegUnits(TRI
, Set
, Op
.getReg());
483 void GCNHazardRecognizer::addClauseInst(const MachineInstr
&MI
) {
484 // XXX: Do we need to worry about implicit operands
485 addRegsToSet(TRI
, MI
.defs(), ClauseDefs
);
486 addRegsToSet(TRI
, MI
.uses(), ClauseUses
);
489 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr
*MEM
) {
490 // SMEM soft clause are only present on VI+, and only matter if xnack is
492 if (!ST
.isXNACKEnabled())
495 bool IsSMRD
= TII
.isSMRD(*MEM
);
499 // A soft-clause is any group of consecutive SMEM instructions. The
500 // instructions in this group may return out of order and/or may be
501 // replayed (i.e. the same instruction issued more than once).
503 // In order to handle these situations correctly we need to make sure that
504 // when a clause has more than one instruction, no instruction in the clause
505 // writes to a register that is read by another instruction in the clause
// (including itself). If we encounter this situation, we need to break the
507 // clause by inserting a non SMEM instruction.
509 for (MachineInstr
*MI
: EmittedInstrs
) {
510 // When we hit a non-SMEM instruction then we have passed the start of the
511 // clause and we can stop.
515 if (IsSMRD
!= SIInstrInfo::isSMRD(*MI
))
521 if (ClauseDefs
.none())
524 // We need to make sure not to put loads and stores in the same clause if they
525 // use the same address. For now, just start a new clause whenever we see a
532 // If the set of defs and uses intersect then we cannot add this instruction
533 // to the clause, so we have a hazard.
534 return ClauseDefs
.anyCommon(ClauseUses
) ? 1 : 0;
537 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr
*SMRD
) {
538 int WaitStatesNeeded
= 0;
540 WaitStatesNeeded
= checkSoftClauseHazards(SMRD
);
542 // This SMRD hazard only affects SI.
543 if (!ST
.hasSMRDReadVALUDefHazard())
544 return WaitStatesNeeded
;
546 // A read of an SGPR by SMRD instruction requires 4 wait states when the
547 // SGPR was written by a VALU instruction.
548 int SmrdSgprWaitStates
= 4;
549 auto IsHazardDefFn
= [this] (MachineInstr
*MI
) { return TII
.isVALU(*MI
); };
550 auto IsBufferHazardDefFn
= [this] (MachineInstr
*MI
) { return TII
.isSALU(*MI
); };
552 bool IsBufferSMRD
= TII
.isBufferSMRD(*SMRD
);
554 for (const MachineOperand
&Use
: SMRD
->uses()) {
557 int WaitStatesNeededForUse
=
558 SmrdSgprWaitStates
- getWaitStatesSinceDef(Use
.getReg(), IsHazardDefFn
,
560 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
562 // This fixes what appears to be undocumented hardware behavior in SI where
563 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
564 // needs some number of nops in between. We don't know how many we need, but
565 // let's use 4. This wasn't discovered before probably because the only
566 // case when this happens is when we expand a 64-bit pointer into a full
567 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
568 // probably never encountered in the closed-source land.
570 int WaitStatesNeededForUse
=
571 SmrdSgprWaitStates
- getWaitStatesSinceDef(Use
.getReg(),
574 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
578 return WaitStatesNeeded
;
581 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr
* VMEM
) {
582 if (!ST
.hasVMEMReadSGPRVALUDefHazard())
585 int WaitStatesNeeded
= checkSoftClauseHazards(VMEM
);
587 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
588 // SGPR was written by a VALU Instruction.
589 const int VmemSgprWaitStates
= 5;
590 auto IsHazardDefFn
= [this] (MachineInstr
*MI
) { return TII
.isVALU(*MI
); };
591 for (const MachineOperand
&Use
: VMEM
->uses()) {
592 if (!Use
.isReg() || TRI
.isVGPR(MF
.getRegInfo(), Use
.getReg()))
595 int WaitStatesNeededForUse
=
596 VmemSgprWaitStates
- getWaitStatesSinceDef(Use
.getReg(), IsHazardDefFn
,
598 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
600 return WaitStatesNeeded
;
603 int GCNHazardRecognizer::checkDPPHazards(MachineInstr
*DPP
) {
604 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
605 const SIInstrInfo
*TII
= ST
.getInstrInfo();
607 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
608 int DppVgprWaitStates
= 2;
609 int DppExecWaitStates
= 5;
610 int WaitStatesNeeded
= 0;
611 auto IsHazardDefFn
= [TII
] (MachineInstr
*MI
) { return TII
->isVALU(*MI
); };
613 for (const MachineOperand
&Use
: DPP
->uses()) {
614 if (!Use
.isReg() || !TRI
->isVGPR(MF
.getRegInfo(), Use
.getReg()))
616 int WaitStatesNeededForUse
=
617 DppVgprWaitStates
- getWaitStatesSinceDef(Use
.getReg(),
618 [](MachineInstr
*) { return true; },
620 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
623 WaitStatesNeeded
= std::max(
625 DppExecWaitStates
- getWaitStatesSinceDef(AMDGPU::EXEC
, IsHazardDefFn
,
628 return WaitStatesNeeded
;
631 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr
*DivFMas
) {
632 const SIInstrInfo
*TII
= ST
.getInstrInfo();
634 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
636 const int DivFMasWaitStates
= 4;
637 auto IsHazardDefFn
= [TII
] (MachineInstr
*MI
) { return TII
->isVALU(*MI
); };
638 int WaitStatesNeeded
= getWaitStatesSinceDef(AMDGPU::VCC
, IsHazardDefFn
,
641 return DivFMasWaitStates
- WaitStatesNeeded
;
644 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr
*GetRegInstr
) {
645 const SIInstrInfo
*TII
= ST
.getInstrInfo();
646 unsigned GetRegHWReg
= getHWReg(TII
, *GetRegInstr
);
648 const int GetRegWaitStates
= 2;
649 auto IsHazardFn
= [TII
, GetRegHWReg
] (MachineInstr
*MI
) {
650 return GetRegHWReg
== getHWReg(TII
, *MI
);
652 int WaitStatesNeeded
= getWaitStatesSinceSetReg(IsHazardFn
, GetRegWaitStates
);
654 return GetRegWaitStates
- WaitStatesNeeded
;
657 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr
*SetRegInstr
) {
658 const SIInstrInfo
*TII
= ST
.getInstrInfo();
659 unsigned HWReg
= getHWReg(TII
, *SetRegInstr
);
661 const int SetRegWaitStates
= ST
.getSetRegWaitStates();
662 auto IsHazardFn
= [TII
, HWReg
] (MachineInstr
*MI
) {
663 return HWReg
== getHWReg(TII
, *MI
);
665 int WaitStatesNeeded
= getWaitStatesSinceSetReg(IsHazardFn
, SetRegWaitStates
);
666 return SetRegWaitStates
- WaitStatesNeeded
;
669 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr
&MI
) {
673 const SIInstrInfo
*TII
= ST
.getInstrInfo();
674 unsigned Opcode
= MI
.getOpcode();
675 const MCInstrDesc
&Desc
= MI
.getDesc();
677 int VDataIdx
= AMDGPU::getNamedOperandIdx(Opcode
, AMDGPU::OpName::vdata
);
680 VDataRCID
= Desc
.OpInfo
[VDataIdx
].RegClass
;
682 if (TII
->isMUBUF(MI
) || TII
->isMTBUF(MI
)) {
683 // There is no hazard if the instruction does not use vector regs
687 // For MUBUF/MTBUF instructions this hazard only exists if the
688 // instruction is not using a register in the soffset field.
689 const MachineOperand
*SOffset
=
690 TII
->getNamedOperand(MI
, AMDGPU::OpName::soffset
);
691 // If we have no soffset operand, then assume this field has been
692 // hardcoded to zero.
693 if (AMDGPU::getRegBitWidth(VDataRCID
) > 64 &&
694 (!SOffset
|| !SOffset
->isReg()))
698 // MIMG instructions create a hazard if they don't use a 256-bit T# and
699 // the store size is greater than 8 bytes and they have more than two bits
700 // of their dmask set.
701 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
702 if (TII
->isMIMG(MI
)) {
703 int SRsrcIdx
= AMDGPU::getNamedOperandIdx(Opcode
, AMDGPU::OpName::srsrc
);
704 assert(SRsrcIdx
!= -1 &&
705 AMDGPU::getRegBitWidth(Desc
.OpInfo
[SRsrcIdx
].RegClass
) == 256);
709 if (TII
->isFLAT(MI
)) {
710 int DataIdx
= AMDGPU::getNamedOperandIdx(Opcode
, AMDGPU::OpName::vdata
);
711 if (AMDGPU::getRegBitWidth(Desc
.OpInfo
[DataIdx
].RegClass
) > 64)
718 int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand
&Def
,
719 const MachineRegisterInfo
&MRI
) {
720 // Helper to check for the hazard where VMEM instructions that store more than
721 // 8 bytes can have there store data over written by the next instruction.
722 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
724 const int VALUWaitStates
= 1;
725 int WaitStatesNeeded
= 0;
727 if (!TRI
->isVGPR(MRI
, Def
.getReg()))
728 return WaitStatesNeeded
;
729 Register Reg
= Def
.getReg();
730 auto IsHazardFn
= [this, Reg
, TRI
] (MachineInstr
*MI
) {
731 int DataIdx
= createsVALUHazard(*MI
);
732 return DataIdx
>= 0 &&
733 TRI
->regsOverlap(MI
->getOperand(DataIdx
).getReg(), Reg
);
735 int WaitStatesNeededForDef
=
736 VALUWaitStates
- getWaitStatesSince(IsHazardFn
, VALUWaitStates
);
737 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForDef
);
739 return WaitStatesNeeded
;
742 int GCNHazardRecognizer::checkVALUHazards(MachineInstr
*VALU
) {
743 // This checks for the hazard where VMEM instructions that store more than
// 8 bytes can have their store data overwritten by the next instruction.
745 if (!ST
.has12DWordStoreHazard())
748 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
749 int WaitStatesNeeded
= 0;
751 for (const MachineOperand
&Def
: VALU
->defs()) {
752 WaitStatesNeeded
= std::max(WaitStatesNeeded
, checkVALUHazardsHelper(Def
, MRI
));
755 return WaitStatesNeeded
;
758 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr
*IA
) {
759 // This checks for hazards associated with inline asm statements.
760 // Since inline asms can contain just about anything, we use this
761 // to call/leverage other check*Hazard routines. Note that
762 // this function doesn't attempt to address all possible inline asm
763 // hazards (good luck), but is a collection of what has been
764 // problematic thus far.
766 // see checkVALUHazards()
767 if (!ST
.has12DWordStoreHazard())
770 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
771 int WaitStatesNeeded
= 0;
773 for (unsigned I
= InlineAsm::MIOp_FirstOperand
, E
= IA
->getNumOperands();
775 const MachineOperand
&Op
= IA
->getOperand(I
);
776 if (Op
.isReg() && Op
.isDef()) {
777 WaitStatesNeeded
= std::max(WaitStatesNeeded
, checkVALUHazardsHelper(Op
, MRI
));
781 return WaitStatesNeeded
;
784 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr
*RWLane
) {
785 const SIInstrInfo
*TII
= ST
.getInstrInfo();
786 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
787 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
789 const MachineOperand
*LaneSelectOp
=
790 TII
->getNamedOperand(*RWLane
, AMDGPU::OpName::src1
);
792 if (!LaneSelectOp
->isReg() || !TRI
->isSGPRReg(MRI
, LaneSelectOp
->getReg()))
795 Register LaneSelectReg
= LaneSelectOp
->getReg();
796 auto IsHazardFn
= [TII
] (MachineInstr
*MI
) {
797 return TII
->isVALU(*MI
);
800 const int RWLaneWaitStates
= 4;
801 int WaitStatesSince
= getWaitStatesSinceDef(LaneSelectReg
, IsHazardFn
,
803 return RWLaneWaitStates
- WaitStatesSince
;
806 int GCNHazardRecognizer::checkRFEHazards(MachineInstr
*RFE
) {
807 if (!ST
.hasRFEHazards())
810 const SIInstrInfo
*TII
= ST
.getInstrInfo();
812 const int RFEWaitStates
= 1;
814 auto IsHazardFn
= [TII
] (MachineInstr
*MI
) {
815 return getHWReg(TII
, *MI
) == AMDGPU::Hwreg::ID_TRAPSTS
;
817 int WaitStatesNeeded
= getWaitStatesSinceSetReg(IsHazardFn
, RFEWaitStates
);
818 return RFEWaitStates
- WaitStatesNeeded
;
821 int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr
*MI
) {
822 if (MI
->isDebugInstr())
825 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
826 if (!ST
.hasSMovFedHazard())
829 // Check for any instruction reading an SGPR after a write from
831 int MovFedWaitStates
= 1;
832 int WaitStatesNeeded
= 0;
834 for (const MachineOperand
&Use
: MI
->uses()) {
835 if (!Use
.isReg() || TRI
->isVGPR(MF
.getRegInfo(), Use
.getReg()))
837 auto IsHazardFn
= [] (MachineInstr
*MI
) {
838 return MI
->getOpcode() == AMDGPU::S_MOV_FED_B32
;
840 int WaitStatesNeededForUse
=
841 MovFedWaitStates
- getWaitStatesSinceDef(Use
.getReg(), IsHazardFn
,
843 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
846 return WaitStatesNeeded
;
849 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr
*MI
) {
850 const SIInstrInfo
*TII
= ST
.getInstrInfo();
851 const int SMovRelWaitStates
= 1;
852 auto IsHazardFn
= [TII
] (MachineInstr
*MI
) {
853 return TII
->isSALU(*MI
);
855 return SMovRelWaitStates
- getWaitStatesSinceDef(AMDGPU::M0
, IsHazardFn
,
859 void GCNHazardRecognizer::fixHazards(MachineInstr
*MI
) {
860 fixVMEMtoScalarWriteHazards(MI
);
861 fixVcmpxPermlaneHazards(MI
);
862 fixSMEMtoVectorWriteHazards(MI
);
863 fixVcmpxExecWARHazard(MI
);
864 fixLdsBranchVmemWARHazard(MI
);
867 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr
*MI
) {
868 if (!ST
.hasVcmpxPermlaneHazard() || !isPermlane(*MI
))
871 const SIInstrInfo
*TII
= ST
.getInstrInfo();
872 auto IsHazardFn
= [TII
] (MachineInstr
*MI
) {
873 return TII
->isVOPC(*MI
);
876 auto IsExpiredFn
= [] (MachineInstr
*MI
, int) {
879 unsigned Opc
= MI
->getOpcode();
880 return SIInstrInfo::isVALU(*MI
) &&
881 Opc
!= AMDGPU::V_NOP_e32
&&
882 Opc
!= AMDGPU::V_NOP_e64
&&
883 Opc
!= AMDGPU::V_NOP_sdwa
;
886 if (::getWaitStatesSince(IsHazardFn
, MI
, IsExpiredFn
) ==
887 std::numeric_limits
<int>::max())
890 // V_NOP will be discarded by SQ.
// Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
892 // which is always a VGPR and available.
893 auto *Src0
= TII
->getNamedOperand(*MI
, AMDGPU::OpName::src0
);
894 Register Reg
= Src0
->getReg();
895 bool IsUndef
= Src0
->isUndef();
896 BuildMI(*MI
->getParent(), MI
, MI
->getDebugLoc(),
897 TII
->get(AMDGPU::V_MOV_B32_e32
))
898 .addReg(Reg
, RegState::Define
| (IsUndef
? RegState::Dead
: 0))
899 .addReg(Reg
, IsUndef
? RegState::Undef
: RegState::Kill
);
904 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr
*MI
) {
905 if (!ST
.hasVMEMtoScalarWriteHazard())
908 if (!SIInstrInfo::isSALU(*MI
) && !SIInstrInfo::isSMRD(*MI
))
911 if (MI
->getNumDefs() == 0)
914 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
916 auto IsHazardFn
= [TRI
, MI
] (MachineInstr
*I
) {
917 if (!SIInstrInfo::isVMEM(*I
) && !SIInstrInfo::isDS(*I
) &&
918 !SIInstrInfo::isFLAT(*I
))
921 for (const MachineOperand
&Def
: MI
->defs()) {
922 MachineOperand
*Op
= I
->findRegisterUseOperand(Def
.getReg(), false, TRI
);
930 auto IsExpiredFn
= [] (MachineInstr
*MI
, int) {
931 return MI
&& (SIInstrInfo::isVALU(*MI
) ||
932 (MI
->getOpcode() == AMDGPU::S_WAITCNT
&&
933 !MI
->getOperand(0).getImm()));
936 if (::getWaitStatesSince(IsHazardFn
, MI
, IsExpiredFn
) ==
937 std::numeric_limits
<int>::max())
940 const SIInstrInfo
*TII
= ST
.getInstrInfo();
941 BuildMI(*MI
->getParent(), MI
, MI
->getDebugLoc(), TII
->get(AMDGPU::V_NOP_e32
));
945 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr
*MI
) {
946 if (!ST
.hasSMEMtoVectorWriteHazard())
949 if (!SIInstrInfo::isVALU(*MI
))
953 switch (MI
->getOpcode()) {
954 case AMDGPU::V_READLANE_B32
:
955 case AMDGPU::V_READLANE_B32_gfx10
:
956 case AMDGPU::V_READFIRSTLANE_B32
:
957 SDSTName
= AMDGPU::OpName::vdst
;
960 SDSTName
= AMDGPU::OpName::sdst
;
964 const SIInstrInfo
*TII
= ST
.getInstrInfo();
965 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
966 const AMDGPU::IsaVersion IV
= AMDGPU::getIsaVersion(ST
.getCPU());
967 const MachineOperand
*SDST
= TII
->getNamedOperand(*MI
, SDSTName
);
969 for (const auto &MO
: MI
->implicit_operands()) {
970 if (MO
.isDef() && TRI
->isSGPRClass(TRI
->getPhysRegClass(MO
.getReg()))) {
980 const Register SDSTReg
= SDST
->getReg();
981 auto IsHazardFn
= [SDSTReg
, TRI
] (MachineInstr
*I
) {
982 return SIInstrInfo::isSMRD(*I
) && I
->readsRegister(SDSTReg
, TRI
);
985 auto IsExpiredFn
= [TII
, IV
] (MachineInstr
*MI
, int) {
987 if (TII
->isSALU(*MI
)) {
988 switch (MI
->getOpcode()) {
989 case AMDGPU::S_SETVSKIP
:
990 case AMDGPU::S_VERSION
:
991 case AMDGPU::S_WAITCNT_VSCNT
:
992 case AMDGPU::S_WAITCNT_VMCNT
:
993 case AMDGPU::S_WAITCNT_EXPCNT
:
// These instructions cannot mitigate the hazard.
996 case AMDGPU::S_WAITCNT_LGKMCNT
:
997 // Reducing lgkmcnt count to 0 always mitigates the hazard.
998 return (MI
->getOperand(1).getImm() == 0) &&
999 (MI
->getOperand(0).getReg() == AMDGPU::SGPR_NULL
);
1000 case AMDGPU::S_WAITCNT
: {
1001 const int64_t Imm
= MI
->getOperand(0).getImm();
1002 AMDGPU::Waitcnt Decoded
= AMDGPU::decodeWaitcnt(IV
, Imm
);
1003 return (Decoded
.LgkmCnt
== 0);
1006 // SOPP instructions cannot mitigate the hazard.
1007 if (TII
->isSOPP(*MI
))
1009 // At this point the SALU can be assumed to mitigate the hazard
1011 // (a) it is independent of the at risk SMEM (breaking chain),
1013 // (b) it is dependent on the SMEM, in which case an appropriate
1014 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1015 // SMEM instruction.
1023 if (::getWaitStatesSince(IsHazardFn
, MI
, IsExpiredFn
) ==
1024 std::numeric_limits
<int>::max())
1027 BuildMI(*MI
->getParent(), MI
, MI
->getDebugLoc(),
1028 TII
->get(AMDGPU::S_MOV_B32
), AMDGPU::SGPR_NULL
)
1033 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr
*MI
) {
1034 if (!ST
.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI
))
1037 const SIRegisterInfo
*TRI
= ST
.getRegisterInfo();
1038 if (!MI
->modifiesRegister(AMDGPU::EXEC
, TRI
))
1041 auto IsHazardFn
= [TRI
] (MachineInstr
*I
) {
1042 if (SIInstrInfo::isVALU(*I
))
1044 return I
->readsRegister(AMDGPU::EXEC
, TRI
);
1047 const SIInstrInfo
*TII
= ST
.getInstrInfo();
1048 auto IsExpiredFn
= [TII
, TRI
] (MachineInstr
*MI
, int) {
1051 if (SIInstrInfo::isVALU(*MI
)) {
1052 if (TII
->getNamedOperand(*MI
, AMDGPU::OpName::sdst
))
1054 for (auto MO
: MI
->implicit_operands())
1055 if (MO
.isDef() && TRI
->isSGPRClass(TRI
->getPhysRegClass(MO
.getReg())))
1058 if (MI
->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR
&&
1059 (MI
->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1064 if (::getWaitStatesSince(IsHazardFn
, MI
, IsExpiredFn
) ==
1065 std::numeric_limits
<int>::max())
1068 BuildMI(*MI
->getParent(), MI
, MI
->getDebugLoc(),
1069 TII
->get(AMDGPU::S_WAITCNT_DEPCTR
))
1074 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr
*MI
) {
1075 if (!ST
.hasLdsBranchVmemWARHazard())
1078 auto IsHazardInst
= [] (const MachineInstr
*MI
) {
1079 if (SIInstrInfo::isDS(*MI
))
1081 if (SIInstrInfo::isVMEM(*MI
) || SIInstrInfo::isSegmentSpecificFLAT(*MI
))
1086 auto InstType
= IsHazardInst(MI
);
1090 auto IsExpiredFn
= [&IsHazardInst
] (MachineInstr
*I
, int) {
1091 return I
&& (IsHazardInst(I
) ||
1092 (I
->getOpcode() == AMDGPU::S_WAITCNT_VSCNT
&&
1093 I
->getOperand(0).getReg() == AMDGPU::SGPR_NULL
&&
1094 !I
->getOperand(1).getImm()));
1097 auto IsHazardFn
= [InstType
, &IsHazardInst
] (MachineInstr
*I
) {
1101 auto IsHazardFn
= [InstType
, IsHazardInst
] (MachineInstr
*I
) {
1102 auto InstType2
= IsHazardInst(I
);
1103 return InstType2
&& InstType
!= InstType2
;
1106 auto IsExpiredFn
= [InstType
, &IsHazardInst
] (MachineInstr
*I
, int) {
1110 auto InstType2
= IsHazardInst(I
);
1111 if (InstType
== InstType2
)
1114 return I
->getOpcode() == AMDGPU::S_WAITCNT_VSCNT
&&
1115 I
->getOperand(0).getReg() == AMDGPU::SGPR_NULL
&&
1116 !I
->getOperand(1).getImm();
1119 return ::getWaitStatesSince(IsHazardFn
, I
, IsExpiredFn
) !=
1120 std::numeric_limits
<int>::max();
1123 if (::getWaitStatesSince(IsHazardFn
, MI
, IsExpiredFn
) ==
1124 std::numeric_limits
<int>::max())
1127 const SIInstrInfo
*TII
= ST
.getInstrInfo();
1128 BuildMI(*MI
->getParent(), MI
, MI
->getDebugLoc(),
1129 TII
->get(AMDGPU::S_WAITCNT_VSCNT
))
1130 .addReg(AMDGPU::SGPR_NULL
, RegState::Undef
)
1136 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr
*MI
) {
1137 int NSAtoVMEMWaitStates
= 1;
1139 if (!ST
.hasNSAtoVMEMBug())
1142 if (!SIInstrInfo::isMUBUF(*MI
) && !SIInstrInfo::isMTBUF(*MI
))
1145 const SIInstrInfo
*TII
= ST
.getInstrInfo();
1146 const auto *Offset
= TII
->getNamedOperand(*MI
, AMDGPU::OpName::offset
);
1147 if (!Offset
|| (Offset
->getImm() & 6) == 0)
1150 auto IsHazardFn
= [TII
] (MachineInstr
*I
) {
1151 if (!SIInstrInfo::isMIMG(*I
))
1153 const AMDGPU::MIMGInfo
*Info
= AMDGPU::getMIMGInfo(I
->getOpcode());
1154 return Info
->MIMGEncoding
== AMDGPU::MIMGEncGfx10NSA
&&
1155 TII
->getInstSizeInBytes(*I
) >= 16;
1158 return NSAtoVMEMWaitStates
- getWaitStatesSince(IsHazardFn
, 1);
1161 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr
*MI
) {
1162 int FPAtomicToDenormModeWaitStates
= 3;
1164 if (MI
->getOpcode() != AMDGPU::S_DENORM_MODE
)
1167 auto IsHazardFn
= [] (MachineInstr
*I
) {
1168 if (!SIInstrInfo::isVMEM(*I
) && !SIInstrInfo::isFLAT(*I
))
1170 return SIInstrInfo::isFPAtomic(*I
);
1173 auto IsExpiredFn
= [] (MachineInstr
*MI
, int WaitStates
) {
1174 if (WaitStates
>= 3 || SIInstrInfo::isVALU(*MI
))
1177 switch (MI
->getOpcode()) {
1178 case AMDGPU::S_WAITCNT
:
1179 case AMDGPU::S_WAITCNT_VSCNT
:
1180 case AMDGPU::S_WAITCNT_VMCNT
:
1181 case AMDGPU::S_WAITCNT_EXPCNT
:
1182 case AMDGPU::S_WAITCNT_LGKMCNT
:
1183 case AMDGPU::S_WAITCNT_IDLE
:
1193 return FPAtomicToDenormModeWaitStates
-
1194 ::getWaitStatesSince(IsHazardFn
, MI
, IsExpiredFn
);
1197 int GCNHazardRecognizer::checkMAIHazards(MachineInstr
*MI
) {
1198 assert(SIInstrInfo::isMAI(*MI
));
1200 int WaitStatesNeeded
= 0;
1201 unsigned Opc
= MI
->getOpcode();
1203 auto IsVALUFn
= [] (MachineInstr
*MI
) {
1204 return SIInstrInfo::isVALU(*MI
);
1207 if (Opc
!= AMDGPU::V_ACCVGPR_READ_B32
) { // MFMA or v_accvgpr_write
1208 const int LegacyVALUWritesVGPRWaitStates
= 2;
1209 const int VALUWritesExecWaitStates
= 4;
1210 const int MaxWaitStates
= 4;
1212 int WaitStatesNeededForUse
= VALUWritesExecWaitStates
-
1213 getWaitStatesSinceDef(AMDGPU::EXEC
, IsVALUFn
, MaxWaitStates
);
1214 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
1216 if (WaitStatesNeeded
< MaxWaitStates
) {
1217 for (const MachineOperand
&Use
: MI
->explicit_uses()) {
1218 const int MaxWaitStates
= 2;
1220 if (!Use
.isReg() || !TRI
.isVGPR(MF
.getRegInfo(), Use
.getReg()))
1223 int WaitStatesNeededForUse
= LegacyVALUWritesVGPRWaitStates
-
1224 getWaitStatesSinceDef(Use
.getReg(), IsVALUFn
, MaxWaitStates
);
1225 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
1227 if (WaitStatesNeeded
== MaxWaitStates
)
1233 auto IsMFMAFn
= [] (MachineInstr
*MI
) {
1234 return SIInstrInfo::isMAI(*MI
) &&
1235 MI
->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32
&&
1236 MI
->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32
;
1239 for (const MachineOperand
&Op
: MI
->explicit_operands()) {
1240 if (!Op
.isReg() || !TRI
.isAGPR(MF
.getRegInfo(), Op
.getReg()))
1243 if (Op
.isDef() && Opc
!= AMDGPU::V_ACCVGPR_WRITE_B32
)
1246 const int MFMAWritesAGPROverlappedSrcABWaitStates
= 4;
1247 const int MFMAWritesAGPROverlappedSrcCWaitStates
= 2;
1248 const int MFMA4x4WritesAGPRAccVgprReadWaitStates
= 4;
1249 const int MFMA16x16WritesAGPRAccVgprReadWaitStates
= 10;
1250 const int MFMA32x32WritesAGPRAccVgprReadWaitStates
= 18;
1251 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates
= 1;
1252 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates
= 7;
1253 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates
= 15;
1254 const int MaxWaitStates
= 18;
1255 Register Reg
= Op
.getReg();
1256 unsigned HazardDefLatency
= 0;
1258 auto IsOverlappedMFMAFn
= [Reg
, &IsMFMAFn
, &HazardDefLatency
, this]
1259 (MachineInstr
*MI
) {
1262 Register DstReg
= MI
->getOperand(0).getReg();
1265 HazardDefLatency
= std::max(HazardDefLatency
,
1266 TSchedModel
.computeInstrLatency(MI
));
1267 return TRI
.regsOverlap(DstReg
, Reg
);
1270 int WaitStatesSinceDef
= getWaitStatesSinceDef(Reg
, IsOverlappedMFMAFn
,
1272 int NeedWaitStates
= MFMAWritesAGPROverlappedSrcABWaitStates
;
1273 int SrcCIdx
= AMDGPU::getNamedOperandIdx(Opc
, AMDGPU::OpName::src2
);
1274 int OpNo
= MI
->getOperandNo(&Op
);
1275 if (OpNo
== SrcCIdx
) {
1276 NeedWaitStates
= MFMAWritesAGPROverlappedSrcCWaitStates
;
1277 } else if (Opc
== AMDGPU::V_ACCVGPR_READ_B32
) {
1278 switch (HazardDefLatency
) {
1279 case 2: NeedWaitStates
= MFMA4x4WritesAGPRAccVgprReadWaitStates
;
1281 case 8: NeedWaitStates
= MFMA16x16WritesAGPRAccVgprReadWaitStates
;
1283 case 16: LLVM_FALLTHROUGH
;
1284 default: NeedWaitStates
= MFMA32x32WritesAGPRAccVgprReadWaitStates
;
1287 } else if (Opc
== AMDGPU::V_ACCVGPR_WRITE_B32
) {
1288 switch (HazardDefLatency
) {
1289 case 2: NeedWaitStates
= MFMA4x4WritesAGPRAccVgprWriteWaitStates
;
1291 case 8: NeedWaitStates
= MFMA16x16WritesAGPRAccVgprWriteWaitStates
;
1293 case 16: LLVM_FALLTHROUGH
;
1294 default: NeedWaitStates
= MFMA32x32WritesAGPRAccVgprWriteWaitStates
;
1299 int WaitStatesNeededForUse
= NeedWaitStates
- WaitStatesSinceDef
;
1300 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
1302 if (WaitStatesNeeded
== MaxWaitStates
)
1303 return WaitStatesNeeded
; // Early exit.
1305 auto IsAccVgprWriteFn
= [Reg
, this] (MachineInstr
*MI
) {
1306 if (MI
->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32
)
1308 Register DstReg
= MI
->getOperand(0).getReg();
1309 return TRI
.regsOverlap(Reg
, DstReg
);
1312 const int AccVGPRWriteMFMAReadSrcCWaitStates
= 1;
1313 const int AccVGPRWriteMFMAReadSrcABWaitStates
= 3;
1314 const int AccVGPRWriteAccVgprReadWaitStates
= 3;
1315 NeedWaitStates
= AccVGPRWriteMFMAReadSrcABWaitStates
;
1316 if (OpNo
== SrcCIdx
)
1317 NeedWaitStates
= AccVGPRWriteMFMAReadSrcCWaitStates
;
1318 else if (Opc
== AMDGPU::V_ACCVGPR_READ_B32
)
1319 NeedWaitStates
= AccVGPRWriteAccVgprReadWaitStates
;
1321 WaitStatesNeededForUse
= NeedWaitStates
-
1322 getWaitStatesSinceDef(Reg
, IsAccVgprWriteFn
, MaxWaitStates
);
1323 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
1325 if (WaitStatesNeeded
== MaxWaitStates
)
1326 return WaitStatesNeeded
; // Early exit.
1329 if (Opc
== AMDGPU::V_ACCVGPR_WRITE_B32
) {
1330 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates
= 0;
1331 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates
= 5;
1332 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates
= 13;
1333 const int MaxWaitStates
= 13;
1334 Register DstReg
= MI
->getOperand(0).getReg();
1335 unsigned HazardDefLatency
= 0;
1337 auto IsSrcCMFMAFn
= [DstReg
, &IsMFMAFn
, &HazardDefLatency
, this]
1338 (MachineInstr
*MI
) {
1341 Register Reg
= TII
.getNamedOperand(*MI
, AMDGPU::OpName::src2
)->getReg();
1342 HazardDefLatency
= std::max(HazardDefLatency
,
1343 TSchedModel
.computeInstrLatency(MI
));
1344 return TRI
.regsOverlap(Reg
, DstReg
);
1347 int WaitStatesSince
= getWaitStatesSince(IsSrcCMFMAFn
, MaxWaitStates
);
1349 switch (HazardDefLatency
) {
1350 case 2: NeedWaitStates
= MFMA4x4ReadSrcCAccVgprWriteWaitStates
;
1352 case 8: NeedWaitStates
= MFMA16x16ReadSrcCAccVgprWriteWaitStates
;
1354 case 16: LLVM_FALLTHROUGH
;
1355 default: NeedWaitStates
= MFMA32x32ReadSrcCAccVgprWriteWaitStates
;
1359 int WaitStatesNeededForUse
= NeedWaitStates
- WaitStatesSince
;
1360 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
1363 return WaitStatesNeeded
;
1366 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr
*MI
) {
1367 if (!ST
.hasMAIInsts())
1370 int WaitStatesNeeded
= 0;
1372 auto IsAccVgprReadFn
= [] (MachineInstr
*MI
) {
1373 return MI
->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32
;
1376 for (const MachineOperand
&Op
: MI
->explicit_uses()) {
1377 if (!Op
.isReg() || !TRI
.isVGPR(MF
.getRegInfo(), Op
.getReg()))
1380 Register Reg
= Op
.getReg();
1382 const int AccVgprReadLdStWaitStates
= 2;
1383 const int VALUWriteAccVgprReadLdStDepVALUWaitStates
= 1;
1384 const int MaxWaitStates
= 2;
1386 int WaitStatesNeededForUse
= AccVgprReadLdStWaitStates
-
1387 getWaitStatesSinceDef(Reg
, IsAccVgprReadFn
, MaxWaitStates
);
1388 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
1390 if (WaitStatesNeeded
== MaxWaitStates
)
1391 return WaitStatesNeeded
; // Early exit.
1393 auto IsVALUAccVgprReadCheckFn
= [Reg
, this] (MachineInstr
*MI
) {
1394 if (MI
->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32
)
1396 auto IsVALUFn
= [] (MachineInstr
*MI
) {
1397 return SIInstrInfo::isVALU(*MI
) && !SIInstrInfo::isMAI(*MI
);
1399 return getWaitStatesSinceDef(Reg
, IsVALUFn
, 2 /*MaxWaitStates*/) <
1400 std::numeric_limits
<int>::max();
1403 WaitStatesNeededForUse
= VALUWriteAccVgprReadLdStDepVALUWaitStates
-
1404 getWaitStatesSince(IsVALUAccVgprReadCheckFn
, MaxWaitStates
);
1405 WaitStatesNeeded
= std::max(WaitStatesNeeded
, WaitStatesNeededForUse
);
1408 return WaitStatesNeeded
;