//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

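// For example, with -amdgpu-mfma-padding-ratio=50 and a neighboring MFMA whose
// pipeline takes 16 wait states, checkMFMAPadding() below requests
// 16 * 50 / 100 = 8 wait states of padding, minus however many wait states
// have already elapsed since that MFMA.
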
//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

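// Construction. The lookahead window is deeper (19 entries) when AGPRs are in
// use, since AGPR usage generally means MFMA code, and the MFMA hazards
// checked below can require far more wait states than the ordinary
// VALU/SMEM/VMEM hazards, which only need a small window (5 entries).
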
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return AMDGPU::getMAIIsDGEMM(Opcode);
}

static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
}

static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i)
    EmittedInstrs.push_front(nullptr);

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search.
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(Unit);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clause are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // load or store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs.
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      const SIInstrInfo *TII = ST.getInstrInfo();
      if (SIInstrInfo::isSDWA(MI)) {
        if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
          if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
            return false;
      } else {
        if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
            !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
                  ->getImm() &
              SISrcMods::DST_OP_SEL))
          return false;
      }
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
        Register Def = Dst->getReg();

        for (const MachineOperand &Use : VALU->explicit_uses()) {
          if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded =
          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return (TII->isVOPC(MI) ||
            ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
           MI.modifiesRegister(AMDGPU::EXEC, TRI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire hazard
    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
        !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
  // according to the type of VMEM instruction.
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  }

  return true;
}

bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  //
  // Where:
  // intv1 + intv2 <= 2 VALUs
  // intv3 <= 4 VALUs
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track registers writes
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate state if something changed
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALUs post exec change
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALUs pre exec change
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);

  return true;
}

bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  //
  // Where:
  // intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    int VALUs = 0;
    int TRANS = 0;
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

    // Track registers writes
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
    return false;

  // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
  // avoided.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));

  return true;
}

bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
      return false;

    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
    // with the dest(matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall)
    // but Index can't overlap with PrevDstReg.
    if (AMDGPU::isGFX12Plus(ST)) {
      if (SIInstrInfo::isSWMMAC(*MI)) {
        const Register CurIndex =
            TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
        if (TRI->regsOverlap(PrevDstReg, CurIndex))
          return true;
      }
      return false;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}

*MI
) {
1776 if (!ST
.hasShift64HighRegBug())
1778 assert(!ST
.hasExtendedWaitCounts());
1780 switch (MI
->getOpcode()) {
1783 case AMDGPU::V_LSHLREV_B64_e64
:
1784 case AMDGPU::V_LSHRREV_B64_e64
:
1785 case AMDGPU::V_ASHRREV_I64_e64
:
1789 MachineOperand
*Amt
= TII
.getNamedOperand(*MI
, AMDGPU::OpName::src0
);
1793 Register AmtReg
= Amt
->getReg();
1794 const MachineRegisterInfo
&MRI
= MF
.getRegInfo();
1795 // Check if this is a last VGPR in the allocation block.
1796 if (!TRI
.isVGPR(MRI
, AmtReg
) || ((AmtReg
- AMDGPU::VGPR0
) & 7) != 7)
1799 if (AmtReg
!= AMDGPU::VGPR255
&& MRI
.isPhysRegUsed(AmtReg
+ 1))
1802 MachineOperand
*Src1
= TII
.getNamedOperand(*MI
, AMDGPU::OpName::src1
);
1803 bool OverlappedSrc
= Src1
->isReg() && TRI
.regsOverlap(Src1
->getReg(), AmtReg
);
1804 bool OverlappedDst
= MI
->modifiesRegister(AmtReg
, &TRI
);
1805 bool Overlapped
= OverlappedSrc
|| OverlappedDst
;
1807 assert(!OverlappedDst
|| !OverlappedSrc
||
1808 Src1
->getReg() == MI
->getOperand(0).getReg());
1809 assert(ST
.needsAlignedVGPRs());
1810 static_assert(AMDGPU::VGPR0
+ 1 == AMDGPU::VGPR1
);
1813 for (MCRegister Reg
: Overlapped
? AMDGPU::VReg_64_Align2RegClass
1814 : AMDGPU::VGPR_32RegClass
) {
1815 if (!MI
->modifiesRegister(Reg
, &TRI
) && !MI
->readsRegister(Reg
, &TRI
)) {
1821 Register NewAmt
= Overlapped
? (Register
)TRI
.getSubReg(NewReg
, AMDGPU::sub1
)
1826 NewAmtLo
= TRI
.getSubReg(NewReg
, AMDGPU::sub0
);
1828 DebugLoc DL
= MI
->getDebugLoc();
1829 MachineBasicBlock
*MBB
= MI
->getParent();
1830 // Insert a full wait count because found register might be pending a wait.
1831 BuildMI(*MBB
, MI
, DL
, TII
.get(AMDGPU::S_WAITCNT
))
1834 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1837 BuildMI(*MBB
, MI
, DL
, TII
.get(AMDGPU::V_SWAP_B32
), NewAmtLo
)
1839 .addReg(AmtReg
- 1, RegState::Undef
)
1840 .addReg(NewAmtLo
, RegState::Undef
));
1841 runOnInstruction(BuildMI(*MBB
, MI
, DL
, TII
.get(AMDGPU::V_SWAP_B32
), NewAmt
)
1843 .addReg(AmtReg
, RegState::Undef
)
1844 .addReg(NewAmt
, RegState::Undef
));
1846 // Instructions emitted after the current instruction will be processed by the
1847 // parent loop of the hazard recognizer in a natural way.
1848 BuildMI(*MBB
, std::next(MI
->getIterator()), DL
, TII
.get(AMDGPU::V_SWAP_B32
),
1854 BuildMI(*MBB
, std::next(MI
->getIterator()), DL
, TII
.get(AMDGPU::V_SWAP_B32
),
1858 .addReg(AmtReg
- 1);
1860 // Re-running hazard recognizer on the modified instruction is not necessary,
1861 // inserted V_SWAP_B32 has already both read and write new registers so
1862 // hazards related to these register has already been handled.
1863 Amt
->setReg(NewAmt
);
1864 Amt
->setIsKill(false);
1865 // We do not update liveness, so verifier may see it as undef.
1868 MI
->getOperand(0).setReg(NewReg
);
1869 if (OverlappedSrc
) {
1870 Src1
->setReg(NewReg
);
1871 Src1
->setIsKill(false);
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())
    return 0;
  assert(!ST.hasExtendedWaitCounts());

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

1942 int GCNHazardRecognizer::checkMAIHazards(MachineInstr
*MI
) {
1943 assert(SIInstrInfo::isMAI(*MI
));
1945 return ST
.hasGFX90AInsts() ? checkMAIHazards90A(MI
) : checkMAIHazards908(MI
);
int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
  // Early exit if no padding is requested.
  if (MFMAPaddingRatio == 0)
    return 0;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
    return 0;

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI))
      return false;

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
    return true;
  };

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}

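// MAI (MFMA / accvgpr) hazards for gfx908: the required wait states are keyed
// off the producer's pipeline latency (2, 8 or 16 passes) and off whether the
// consumer reads the overlapping AGPR as SrcC, via v_accvgpr_read, or via
// v_accvgpr_write.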
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}

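// The gfx940 helpers below express the required wait states as a linear
// function of the producing MFMA's pass count.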
static int
GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  return NumPasses + 1;
}

static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
  return NumPasses;
}

static int
GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
  return NumPasses + 3;
}

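// MAI hazards for gfx90a and later: an MFMA consuming a VGPR written by a
// legacy VALU or by another MFMA needs a latency-dependent number of wait
// states, with separate tables for SrcC and SrcA/B operands and for DGEMM
// versus S/HGEMM producers.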
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
  };

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
           !SIInstrInfo::isDOT(MI);
  };

  if (!SIInstrInfo::isMFMA(*MI))
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default: {
          int NumPasses = TSchedModel.computeInstrLatency(MI1);
          if (ST.hasGFX940Insts()) {
            if (isXDL(ST, *MI) && !isXDL(ST, *MI1))
              break;

            NeedWaitStates =
                isXDL(ST, *MI1)
                    ? GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses)
                    : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
                          NumPasses);
            break;
          }

          switch (NumPasses) {
          case 2:
            NeedWaitStates =
                isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                             : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates =
                isDGEMM(Opc)
                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            NeedWaitStates =
                isDGEMM(Opc)
                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
        }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default: {
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
          NeedWaitStates =
              isXDL(ST, *MI1)
                  ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses)
                  : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses);
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 4:
          llvm_unreachable("unexpected number of passes for mfma");
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
          [[fallthrough]];
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
      }
    }

    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}

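// Check wait states needed between a recent v_accvgpr_read (or a VALU that
// also feeds a v_accvgpr_read/write) and a load/store consuming the same
// VGPR; only relevant on MAI subtargets before gfx90a.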
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

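// The gfx940 helpers below cover VALU read and write-after-write hazards
// against an MFMA result, again expressed as a function of the MFMA's pass
// count; SMFMA and XDL pipelines use different offsets.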
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  return NumPasses + 3;
}

static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  return NumPasses + 3;
}

static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  return NumPasses + 2;
}

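// Hazards between MFMA/DOT producers and VALU, VMEM, FLAT, DS or EXP
// consumers on gfx90a and later: covers reads of MFMA results, the gfx90a
// VALU->DGEMM->VMEM workaround, DGEMM before 64-bit FMA, and WAR/WAW against
// MFMA operands.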
int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return isDGEMM(MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A()
  if (SIInstrInfo::isMFMA(*MI))
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  int WaitStatesNeeded = 0;

  bool IsMem = SIInstrInfo::isVMEM(*MI) ||
               SIInstrInfo::isFLAT(*MI) ||
               SIInstrInfo::isDS(*MI);
  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  const MachineInstr *MFMA = nullptr;
  Register Reg;
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // Found DGEMM on reverse traversal to def.
    if (isDGEMM(MI.getOpcode()))
      DGEMMAfterVALUWrite = true;

    // Only hazard if register is defined by a VALU and a DGEMM is found after
    // the def.
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      // Workaround for HW data hazard bug observed only in GFX90A. When there
      // is a DGEMM instruction in-between a VALU and a VMEM instruction it
      // causes the SQ to incorrectly not insert the two wait states needed
      // between the two instructions to avoid the data hazard.
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                    DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        }
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = IsMemOrExport
                               ? DMFMA16x16WriteVgprMemExpReadWaitStates
                               : DMFMA16x16WriteVgprVALUReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            isXDL(ST, *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
                : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of cycles for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            isXDL(ST, *MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(NumPasses)
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        default:
          llvm_unreachable("Unexpected number of passes for mfma");
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !isXDL(ST, MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 4:  assert(ST.hasGFX940Insts());
             NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

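// Scheduler hook: prefer another candidate if this MFMA would issue while a
// previous MFMA is still within its pipeline latency.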
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;

  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMFMA(MI))
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}

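// On subtargets with the VALU mask-write hazard, insert
// s_waitcnt_depctr sa_sdst(0) after a SALU SGPR write whose result may have
// been consumed as a VALU mask.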
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. SALU writes SGPR
  //   3. SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient.
  // In practice this happens <10% of the time, hence this always assumes
  // the hazard exists if 1 and 2 are present to avoid searching.

  const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
  if (!SDSTOp || !SDSTOp->isReg())
    return false;

  const Register HazardReg = SDSTOp->getReg();
  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)
    return false;

  auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_e32:
    case AMDGPU::V_CNDMASK_B16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      // These implicitly read VCC as mask source.
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_e64:
    case AMDGPU::V_CNDMASK_B16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Only check mask register overlaps.
      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
    }
    default:
      return false;
    }
  };

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
    // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
      return true;

    // VALU access to any SGPR or literal constant other than HazardReg
    // mitigates hazard. No need to check HazardReg here as this will
    // only be called when !IsHazardFn.
    if (!SIInstrInfo::isVALU(I))
      return false;
    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      const MachineOperand &Op = I.getOperand(OpNo);
      if (Op.isReg()) {
        Register OpReg = Op.getReg();
        // Only consider uses
        if (!Op.isUse())
          continue;
        // Ignore EXEC
        if (OpReg == AMDGPU::EXEC ||
            OpReg == AMDGPU::EXEC_LO ||
            OpReg == AMDGPU::EXEC_HI)
          continue;
        // Ignore all implicit uses except VCC
        if (Op.isImplicit()) {
          if (OpReg == AMDGPU::VCC ||
              OpReg == AMDGPU::VCC_LO ||
              OpReg == AMDGPU::VCC_HI)
            return true;
          continue;
        }
        if (TRI.isSGPRReg(MRI, OpReg))
          return true;
      } else {
        const MCInstrDesc &InstDesc = I.getDesc();
        const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
        if (!TII.isInlineConstant(Op, OpInfo))
          return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  auto NextMI = std::next(MI->getIterator());

  // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
  BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));

  // SALU write may be s_getpc in a bundle.
  if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
    // Update offsets of any references in the bundle.
    while (NextMI != MI->getParent()->end() &&
           NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + 4);
      }
      ++NextMI;
    }
  }

  return true;
}

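// Insert an S_SETPRIO with the given priority at the start of the entry block
// unless one of at least that priority is already there; returns true if the
// function was changed.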
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}

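// Workaround for subtargets that require exports to run at lowered priority:
// after the last export in a sequence, drop to PostExportPriority, wait for
// exports (when not at the end of the shader), pad with two s_nops, then
// restore NormalPriority; also make sure the entry block raised priority
// first.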
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure shader with calls raises priority at entry.
    // This ensures correct priority if exports exist in callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise minimum priority unless in workaround.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check entry priority at each export (as there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only need WA at end of sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // Assume appropriate S_SETPRIO after export means WA already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);