[x86] fix assert with horizontal math + broadcast of vector (PR43402)
[llvm-core.git] / lib / Target / AMDGPU / GCNHazardRecognizer.cpp
blob958b8019c7267e7590c758d43f98285ff8f5115e
1 //===-- GCNHazardRecognizer.cpp - GCN Hazard Recognizer Impls -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
11 //===----------------------------------------------------------------------===//
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIDefines.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineOperand.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include <algorithm>
29 #include <cassert>
30 #include <limits>
31 #include <set>
32 #include <vector>
34 using namespace llvm;
36 //===----------------------------------------------------------------------===//
37 // Hazard Recognizer Implementation
38 //===----------------------------------------------------------------------===//
40 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
41 IsHazardRecognizerMode(false),
42 CurrCycleInstr(nullptr),
43 MF(MF),
44 ST(MF.getSubtarget<GCNSubtarget>()),
45 TII(*ST.getInstrInfo()),
46 TRI(TII.getRegisterInfo()),
47 ClauseUses(TRI.getNumRegUnits()),
48 ClauseDefs(TRI.getNumRegUnits()) {
49 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
50 TSchedModel.init(&ST);
53 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
54 EmitInstruction(SU->getInstr());
57 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
58 CurrCycleInstr = MI;
61 static bool isDivFMas(unsigned Opcode) {
62 return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
65 static bool isSGetReg(unsigned Opcode) {
66 return Opcode == AMDGPU::S_GETREG_B32;
69 static bool isSSetReg(unsigned Opcode) {
70 return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
73 static bool isRWLane(unsigned Opcode) {
74 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
77 static bool isRFE(unsigned Opcode) {
78 return Opcode == AMDGPU::S_RFE_B64;
81 static bool isSMovRel(unsigned Opcode) {
82 switch (Opcode) {
83 case AMDGPU::S_MOVRELS_B32:
84 case AMDGPU::S_MOVRELS_B64:
85 case AMDGPU::S_MOVRELD_B32:
86 case AMDGPU::S_MOVRELD_B64:
87 return true;
88 default:
89 return false;
93 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
94 const MachineInstr &MI) {
95 if (TII.isAlwaysGDS(MI.getOpcode()))
96 return true;
98 switch (MI.getOpcode()) {
99 case AMDGPU::S_SENDMSG:
100 case AMDGPU::S_SENDMSGHALT:
101 case AMDGPU::S_TTRACEDATA:
102 return true;
103 // These DS opcodes don't support GDS.
104 case AMDGPU::DS_NOP:
105 case AMDGPU::DS_PERMUTE_B32:
106 case AMDGPU::DS_BPERMUTE_B32:
107 return false;
108 default:
109 if (TII.isDS(MI.getOpcode())) {
110 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
111 AMDGPU::OpName::gds);
112 if (MI.getOperand(GDS).getImm())
113 return true;
115 return false;
119 static bool isPermlane(const MachineInstr &MI) {
120 unsigned Opcode = MI.getOpcode();
121 return Opcode == AMDGPU::V_PERMLANE16_B32 ||
122 Opcode == AMDGPU::V_PERMLANEX16_B32;
125 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
126 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
127 AMDGPU::OpName::simm16);
128 return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
131 ScheduleHazardRecognizer::HazardType
132 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
133 MachineInstr *MI = SU->getInstr();
134 if (MI->isBundle())
135 return NoHazard;
137 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
138 return NoopHazard;
140 // FIXME: Should flat be considered vmem?
141 if ((SIInstrInfo::isVMEM(*MI) ||
142 SIInstrInfo::isFLAT(*MI))
143 && checkVMEMHazards(MI) > 0)
144 return NoopHazard;
146 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
147 return NoopHazard;
149 if (checkFPAtomicToDenormModeHazard(MI) > 0)
150 return NoopHazard;
152 if (ST.hasNoDataDepHazard())
153 return NoHazard;
155 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
156 return NoopHazard;
158 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
159 return NoopHazard;
161 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
162 return NoopHazard;
164 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
165 return NoopHazard;
167 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
168 return NoopHazard;
170 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
171 return NoopHazard;
173 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
174 return NoopHazard;
176 if (ST.hasReadM0MovRelInterpHazard() &&
177 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
178 checkReadM0Hazards(MI) > 0)
179 return NoopHazard;
181 if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
182 checkReadM0Hazards(MI) > 0)
183 return NoopHazard;
185 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
186 return NoopHazard;
188 if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0)
189 return NoopHazard;
191 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
192 return NoopHazard;
194 if (checkAnyInstHazards(MI) > 0)
195 return NoopHazard;
197 return NoHazard;
200 static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
201 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
202 .addImm(0);
205 void GCNHazardRecognizer::processBundle() {
206 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
207 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
208 // Check bundled MachineInstr's for hazards.
209 for (; MI != E && MI->isInsideBundle(); ++MI) {
210 CurrCycleInstr = &*MI;
211 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
213 if (IsHazardRecognizerMode)
214 fixHazards(CurrCycleInstr);
216 for (unsigned i = 0; i < WaitStates; ++i)
217 insertNoopInBundle(CurrCycleInstr, TII);
219 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
220 // include the bundled MI directly after, only add a maximum of
221 // (MaxLookAhead - 1) noops to EmittedInstrs.
222 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
223 EmittedInstrs.push_front(nullptr);
225 EmittedInstrs.push_front(CurrCycleInstr);
226 EmittedInstrs.resize(MaxLookAhead);
228 CurrCycleInstr = nullptr;
231 unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
232 IsHazardRecognizerMode = false;
233 return PreEmitNoopsCommon(SU->getInstr());
236 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
237 IsHazardRecognizerMode = true;
238 CurrCycleInstr = MI;
239 unsigned W = PreEmitNoopsCommon(MI);
240 fixHazards(MI);
241 CurrCycleInstr = nullptr;
242 return W;
245 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
246 if (MI->isBundle())
247 return 0;
249 int WaitStates = std::max(0, checkAnyInstHazards(MI));
251 if (SIInstrInfo::isSMRD(*MI))
252 return std::max(WaitStates, checkSMRDHazards(MI));
254 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
255 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
257 if (ST.hasNSAtoVMEMBug())
258 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
260 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
262 if (ST.hasNoDataDepHazard())
263 return WaitStates;
265 if (SIInstrInfo::isVALU(*MI))
266 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
268 if (SIInstrInfo::isDPP(*MI))
269 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
271 if (isDivFMas(MI->getOpcode()))
272 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
274 if (isRWLane(MI->getOpcode()))
275 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
277 if (MI->isInlineAsm())
278 return std::max(WaitStates, checkInlineAsmHazards(MI));
280 if (isSGetReg(MI->getOpcode()))
281 return std::max(WaitStates, checkGetRegHazards(MI));
283 if (isSSetReg(MI->getOpcode()))
284 return std::max(WaitStates, checkSetRegHazards(MI));
286 if (isRFE(MI->getOpcode()))
287 return std::max(WaitStates, checkRFEHazards(MI));
289 if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
290 isSMovRel(MI->getOpcode())))
291 return std::max(WaitStates, checkReadM0Hazards(MI));
293 if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
294 return std::max(WaitStates, checkReadM0Hazards(MI));
296 if (SIInstrInfo::isMAI(*MI))
297 return std::max(WaitStates, checkMAIHazards(MI));
299 if (MI->mayLoad() || MI->mayStore())
300 return std::max(WaitStates, checkMAILdStHazards(MI));
302 return WaitStates;
305 void GCNHazardRecognizer::EmitNoop() {
306 EmittedInstrs.push_front(nullptr);
309 void GCNHazardRecognizer::AdvanceCycle() {
310 // When the scheduler detects a stall, it will call AdvanceCycle() without
311 // emitting any instructions.
312 if (!CurrCycleInstr)
313 return;
315 // Do not track non-instructions which do not affect the wait states.
316 // If included, these instructions can lead to buffer overflow such that
317 // detectable hazards are missed.
318 if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
319 CurrCycleInstr->isKill())
320 return;
322 if (CurrCycleInstr->isBundle()) {
323 processBundle();
324 return;
327 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
329 // Keep track of emitted instructions
330 EmittedInstrs.push_front(CurrCycleInstr);
332 // Add a nullptr for each additional wait state after the first. Make sure
333 // not to add more than getMaxLookAhead() items to the list, since we
334 // truncate the list to that size right after this loop.
335 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
336 i < e; ++i) {
337 EmittedInstrs.push_front(nullptr);
340 // getMaxLookahead() is the largest number of wait states we will ever need
341 // to insert, so there is no point in keeping track of more than that many
342 // wait states.
343 EmittedInstrs.resize(getMaxLookAhead());
345 CurrCycleInstr = nullptr;
348 void GCNHazardRecognizer::RecedeCycle() {
349 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
352 //===----------------------------------------------------------------------===//
353 // Helper Functions
354 //===----------------------------------------------------------------------===//
356 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
358 // Returns a minimum wait states since \p I walking all predecessors.
359 // Only scans until \p IsExpired does not return true.
360 // Can only be run in a hazard recognizer mode.
361 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
362 MachineBasicBlock *MBB,
363 MachineBasicBlock::reverse_instr_iterator I,
364 int WaitStates,
365 IsExpiredFn IsExpired,
366 DenseSet<const MachineBasicBlock *> &Visited) {
367 for (auto E = MBB->instr_rend(); I != E; ++I) {
368 // Don't add WaitStates for parent BUNDLE instructions.
369 if (I->isBundle())
370 continue;
372 if (IsHazard(&*I))
373 return WaitStates;
375 if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
376 continue;
378 WaitStates += SIInstrInfo::getNumWaitStates(*I);
380 if (IsExpired(&*I, WaitStates))
381 return std::numeric_limits<int>::max();
384 int MinWaitStates = WaitStates;
385 bool Found = false;
386 for (MachineBasicBlock *Pred : MBB->predecessors()) {
387 if (!Visited.insert(Pred).second)
388 continue;
390 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
391 WaitStates, IsExpired, Visited);
393 if (W == std::numeric_limits<int>::max())
394 continue;
396 MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
397 if (IsExpired(nullptr, MinWaitStates))
398 return MinWaitStates;
400 Found = true;
403 if (Found)
404 return MinWaitStates;
406 return std::numeric_limits<int>::max();
409 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
410 MachineInstr *MI,
411 IsExpiredFn IsExpired) {
412 DenseSet<const MachineBasicBlock *> Visited;
413 return getWaitStatesSince(IsHazard, MI->getParent(),
414 std::next(MI->getReverseIterator()),
415 0, IsExpired, Visited);
418 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
419 if (IsHazardRecognizerMode) {
420 auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
421 return WaitStates >= Limit;
423 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
426 int WaitStates = 0;
427 for (MachineInstr *MI : EmittedInstrs) {
428 if (MI) {
429 if (IsHazard(MI))
430 return WaitStates;
432 if (MI->isInlineAsm())
433 continue;
435 ++WaitStates;
437 if (WaitStates >= Limit)
438 break;
440 return std::numeric_limits<int>::max();
443 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
444 IsHazardFn IsHazardDef,
445 int Limit) {
446 const SIRegisterInfo *TRI = ST.getRegisterInfo();
448 auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
449 return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
452 return getWaitStatesSince(IsHazardFn, Limit);
455 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
456 int Limit) {
457 auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
458 return isSSetReg(MI->getOpcode()) && IsHazard(MI);
461 return getWaitStatesSince(IsHazardFn, Limit);
464 //===----------------------------------------------------------------------===//
465 // No-op Hazard Detection
466 //===----------------------------------------------------------------------===//
468 static void addRegUnits(const SIRegisterInfo &TRI,
469 BitVector &BV, unsigned Reg) {
470 for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
471 BV.set(*RUI);
474 static void addRegsToSet(const SIRegisterInfo &TRI,
475 iterator_range<MachineInstr::const_mop_iterator> Ops,
476 BitVector &Set) {
477 for (const MachineOperand &Op : Ops) {
478 if (Op.isReg())
479 addRegUnits(TRI, Set, Op.getReg());
483 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
484 // XXX: Do we need to worry about implicit operands
485 addRegsToSet(TRI, MI.defs(), ClauseDefs);
486 addRegsToSet(TRI, MI.uses(), ClauseUses);
489 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
490 // SMEM soft clause are only present on VI+, and only matter if xnack is
491 // enabled.
492 if (!ST.isXNACKEnabled())
493 return 0;
495 bool IsSMRD = TII.isSMRD(*MEM);
497 resetClause();
499 // A soft-clause is any group of consecutive SMEM instructions. The
500 // instructions in this group may return out of order and/or may be
501 // replayed (i.e. the same instruction issued more than once).
503 // In order to handle these situations correctly we need to make sure that
504 // when a clause has more than one instruction, no instruction in the clause
505 // writes to a register that is read by another instruction in the clause
506 // (including itself). If we encounter this situaion, we need to break the
507 // clause by inserting a non SMEM instruction.
509 for (MachineInstr *MI : EmittedInstrs) {
510 // When we hit a non-SMEM instruction then we have passed the start of the
511 // clause and we can stop.
512 if (!MI)
513 break;
515 if (IsSMRD != SIInstrInfo::isSMRD(*MI))
516 break;
518 addClauseInst(*MI);
521 if (ClauseDefs.none())
522 return 0;
524 // We need to make sure not to put loads and stores in the same clause if they
525 // use the same address. For now, just start a new clause whenever we see a
526 // store.
527 if (MEM->mayStore())
528 return 1;
530 addClauseInst(*MEM);
532 // If the set of defs and uses intersect then we cannot add this instruction
533 // to the clause, so we have a hazard.
534 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
537 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
538 int WaitStatesNeeded = 0;
540 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
542 // This SMRD hazard only affects SI.
543 if (!ST.hasSMRDReadVALUDefHazard())
544 return WaitStatesNeeded;
546 // A read of an SGPR by SMRD instruction requires 4 wait states when the
547 // SGPR was written by a VALU instruction.
548 int SmrdSgprWaitStates = 4;
549 auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
550 auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
552 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
554 for (const MachineOperand &Use : SMRD->uses()) {
555 if (!Use.isReg())
556 continue;
557 int WaitStatesNeededForUse =
558 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
559 SmrdSgprWaitStates);
560 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
562 // This fixes what appears to be undocumented hardware behavior in SI where
563 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
564 // needs some number of nops in between. We don't know how many we need, but
565 // let's use 4. This wasn't discovered before probably because the only
566 // case when this happens is when we expand a 64-bit pointer into a full
567 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
568 // probably never encountered in the closed-source land.
569 if (IsBufferSMRD) {
570 int WaitStatesNeededForUse =
571 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
572 IsBufferHazardDefFn,
573 SmrdSgprWaitStates);
574 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
578 return WaitStatesNeeded;
581 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
582 if (!ST.hasVMEMReadSGPRVALUDefHazard())
583 return 0;
585 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
587 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
588 // SGPR was written by a VALU Instruction.
589 const int VmemSgprWaitStates = 5;
590 auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
591 for (const MachineOperand &Use : VMEM->uses()) {
592 if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
593 continue;
595 int WaitStatesNeededForUse =
596 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
597 VmemSgprWaitStates);
598 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
600 return WaitStatesNeeded;
603 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
604 const SIRegisterInfo *TRI = ST.getRegisterInfo();
605 const SIInstrInfo *TII = ST.getInstrInfo();
607 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
608 int DppVgprWaitStates = 2;
609 int DppExecWaitStates = 5;
610 int WaitStatesNeeded = 0;
611 auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
613 for (const MachineOperand &Use : DPP->uses()) {
614 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
615 continue;
616 int WaitStatesNeededForUse =
617 DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
618 [](MachineInstr *) { return true; },
619 DppVgprWaitStates);
620 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
623 WaitStatesNeeded = std::max(
624 WaitStatesNeeded,
625 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
626 DppExecWaitStates));
628 return WaitStatesNeeded;
631 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
632 const SIInstrInfo *TII = ST.getInstrInfo();
634 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
635 // instruction.
636 const int DivFMasWaitStates = 4;
637 auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
638 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
639 DivFMasWaitStates);
641 return DivFMasWaitStates - WaitStatesNeeded;
644 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
645 const SIInstrInfo *TII = ST.getInstrInfo();
646 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
648 const int GetRegWaitStates = 2;
649 auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
650 return GetRegHWReg == getHWReg(TII, *MI);
652 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
654 return GetRegWaitStates - WaitStatesNeeded;
657 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
658 const SIInstrInfo *TII = ST.getInstrInfo();
659 unsigned HWReg = getHWReg(TII, *SetRegInstr);
661 const int SetRegWaitStates = ST.getSetRegWaitStates();
662 auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
663 return HWReg == getHWReg(TII, *MI);
665 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
666 return SetRegWaitStates - WaitStatesNeeded;
669 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
670 if (!MI.mayStore())
671 return -1;
673 const SIInstrInfo *TII = ST.getInstrInfo();
674 unsigned Opcode = MI.getOpcode();
675 const MCInstrDesc &Desc = MI.getDesc();
677 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
678 int VDataRCID = -1;
679 if (VDataIdx != -1)
680 VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
682 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
683 // There is no hazard if the instruction does not use vector regs
684 // (like wbinvl1)
685 if (VDataIdx == -1)
686 return -1;
687 // For MUBUF/MTBUF instructions this hazard only exists if the
688 // instruction is not using a register in the soffset field.
689 const MachineOperand *SOffset =
690 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
691 // If we have no soffset operand, then assume this field has been
692 // hardcoded to zero.
693 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
694 (!SOffset || !SOffset->isReg()))
695 return VDataIdx;
698 // MIMG instructions create a hazard if they don't use a 256-bit T# and
699 // the store size is greater than 8 bytes and they have more than two bits
700 // of their dmask set.
701 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
702 if (TII->isMIMG(MI)) {
703 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
704 assert(SRsrcIdx != -1 &&
705 AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
706 (void)SRsrcIdx;
709 if (TII->isFLAT(MI)) {
710 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
711 if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
712 return DataIdx;
715 return -1;
718 int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
719 const MachineRegisterInfo &MRI) {
720 // Helper to check for the hazard where VMEM instructions that store more than
721 // 8 bytes can have there store data over written by the next instruction.
722 const SIRegisterInfo *TRI = ST.getRegisterInfo();
724 const int VALUWaitStates = 1;
725 int WaitStatesNeeded = 0;
727 if (!TRI->isVGPR(MRI, Def.getReg()))
728 return WaitStatesNeeded;
729 Register Reg = Def.getReg();
730 auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
731 int DataIdx = createsVALUHazard(*MI);
732 return DataIdx >= 0 &&
733 TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
735 int WaitStatesNeededForDef =
736 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
737 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
739 return WaitStatesNeeded;
742 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
743 // This checks for the hazard where VMEM instructions that store more than
744 // 8 bytes can have there store data over written by the next instruction.
745 if (!ST.has12DWordStoreHazard())
746 return 0;
748 const MachineRegisterInfo &MRI = MF.getRegInfo();
749 int WaitStatesNeeded = 0;
751 for (const MachineOperand &Def : VALU->defs()) {
752 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
755 return WaitStatesNeeded;
758 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
759 // This checks for hazards associated with inline asm statements.
760 // Since inline asms can contain just about anything, we use this
761 // to call/leverage other check*Hazard routines. Note that
762 // this function doesn't attempt to address all possible inline asm
763 // hazards (good luck), but is a collection of what has been
764 // problematic thus far.
766 // see checkVALUHazards()
767 if (!ST.has12DWordStoreHazard())
768 return 0;
770 const MachineRegisterInfo &MRI = MF.getRegInfo();
771 int WaitStatesNeeded = 0;
773 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
774 I != E; ++I) {
775 const MachineOperand &Op = IA->getOperand(I);
776 if (Op.isReg() && Op.isDef()) {
777 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
781 return WaitStatesNeeded;
784 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
785 const SIInstrInfo *TII = ST.getInstrInfo();
786 const SIRegisterInfo *TRI = ST.getRegisterInfo();
787 const MachineRegisterInfo &MRI = MF.getRegInfo();
789 const MachineOperand *LaneSelectOp =
790 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
792 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
793 return 0;
795 Register LaneSelectReg = LaneSelectOp->getReg();
796 auto IsHazardFn = [TII] (MachineInstr *MI) {
797 return TII->isVALU(*MI);
800 const int RWLaneWaitStates = 4;
801 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
802 RWLaneWaitStates);
803 return RWLaneWaitStates - WaitStatesSince;
806 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
807 if (!ST.hasRFEHazards())
808 return 0;
810 const SIInstrInfo *TII = ST.getInstrInfo();
812 const int RFEWaitStates = 1;
814 auto IsHazardFn = [TII] (MachineInstr *MI) {
815 return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
817 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
818 return RFEWaitStates - WaitStatesNeeded;
821 int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
822 if (MI->isDebugInstr())
823 return 0;
825 const SIRegisterInfo *TRI = ST.getRegisterInfo();
826 if (!ST.hasSMovFedHazard())
827 return 0;
829 // Check for any instruction reading an SGPR after a write from
830 // s_mov_fed_b32.
831 int MovFedWaitStates = 1;
832 int WaitStatesNeeded = 0;
834 for (const MachineOperand &Use : MI->uses()) {
835 if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
836 continue;
837 auto IsHazardFn = [] (MachineInstr *MI) {
838 return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
840 int WaitStatesNeededForUse =
841 MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
842 MovFedWaitStates);
843 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
846 return WaitStatesNeeded;
849 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
850 const SIInstrInfo *TII = ST.getInstrInfo();
851 const int SMovRelWaitStates = 1;
852 auto IsHazardFn = [TII] (MachineInstr *MI) {
853 return TII->isSALU(*MI);
855 return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
856 SMovRelWaitStates);
859 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
860 fixVMEMtoScalarWriteHazards(MI);
861 fixVcmpxPermlaneHazards(MI);
862 fixSMEMtoVectorWriteHazards(MI);
863 fixVcmpxExecWARHazard(MI);
864 fixLdsBranchVmemWARHazard(MI);
867 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
868 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
869 return false;
871 const SIInstrInfo *TII = ST.getInstrInfo();
872 auto IsHazardFn = [TII] (MachineInstr *MI) {
873 return TII->isVOPC(*MI);
876 auto IsExpiredFn = [] (MachineInstr *MI, int) {
877 if (!MI)
878 return false;
879 unsigned Opc = MI->getOpcode();
880 return SIInstrInfo::isVALU(*MI) &&
881 Opc != AMDGPU::V_NOP_e32 &&
882 Opc != AMDGPU::V_NOP_e64 &&
883 Opc != AMDGPU::V_NOP_sdwa;
886 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
887 std::numeric_limits<int>::max())
888 return false;
890 // V_NOP will be discarded by SQ.
891 // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
892 // which is always a VGPR and available.
893 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
894 Register Reg = Src0->getReg();
895 bool IsUndef = Src0->isUndef();
896 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
897 TII->get(AMDGPU::V_MOV_B32_e32))
898 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
899 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
901 return true;
904 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
905 if (!ST.hasVMEMtoScalarWriteHazard())
906 return false;
908 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
909 return false;
911 if (MI->getNumDefs() == 0)
912 return false;
914 const SIRegisterInfo *TRI = ST.getRegisterInfo();
916 auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
917 if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
918 !SIInstrInfo::isFLAT(*I))
919 return false;
921 for (const MachineOperand &Def : MI->defs()) {
922 MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
923 if (!Op)
924 continue;
925 return true;
927 return false;
930 auto IsExpiredFn = [] (MachineInstr *MI, int) {
931 return MI && (SIInstrInfo::isVALU(*MI) ||
932 (MI->getOpcode() == AMDGPU::S_WAITCNT &&
933 !MI->getOperand(0).getImm()));
936 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
937 std::numeric_limits<int>::max())
938 return false;
940 const SIInstrInfo *TII = ST.getInstrInfo();
941 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
942 return true;
945 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
946 if (!ST.hasSMEMtoVectorWriteHazard())
947 return false;
949 if (!SIInstrInfo::isVALU(*MI))
950 return false;
952 unsigned SDSTName;
953 switch (MI->getOpcode()) {
954 case AMDGPU::V_READLANE_B32:
955 case AMDGPU::V_READFIRSTLANE_B32:
956 SDSTName = AMDGPU::OpName::vdst;
957 break;
958 default:
959 SDSTName = AMDGPU::OpName::sdst;
960 break;
963 const SIInstrInfo *TII = ST.getInstrInfo();
964 const SIRegisterInfo *TRI = ST.getRegisterInfo();
965 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
966 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
967 if (!SDST) {
968 for (const auto &MO : MI->implicit_operands()) {
969 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
970 SDST = &MO;
971 break;
976 if (!SDST)
977 return false;
979 const Register SDSTReg = SDST->getReg();
980 auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
981 return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
984 auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
985 if (MI) {
986 if (TII->isSALU(*MI)) {
987 switch (MI->getOpcode()) {
988 case AMDGPU::S_SETVSKIP:
989 case AMDGPU::S_VERSION:
990 case AMDGPU::S_WAITCNT_VSCNT:
991 case AMDGPU::S_WAITCNT_VMCNT:
992 case AMDGPU::S_WAITCNT_EXPCNT:
993 // These instructions cannot not mitigate the hazard.
994 return false;
995 case AMDGPU::S_WAITCNT_LGKMCNT:
996 // Reducing lgkmcnt count to 0 always mitigates the hazard.
997 return (MI->getOperand(1).getImm() == 0) &&
998 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
999 case AMDGPU::S_WAITCNT: {
1000 const int64_t Imm = MI->getOperand(0).getImm();
1001 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1002 return (Decoded.LgkmCnt == 0);
1004 default:
1005 // SOPP instructions cannot mitigate the hazard.
1006 if (TII->isSOPP(*MI))
1007 return false;
1008 // At this point the SALU can be assumed to mitigate the hazard
1009 // because either:
1010 // (a) it is independent of the at risk SMEM (breaking chain),
1011 // or
1012 // (b) it is dependent on the SMEM, in which case an appropriate
1013 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1014 // SMEM instruction.
1015 return true;
1019 return false;
1022 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1023 std::numeric_limits<int>::max())
1024 return false;
1026 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1027 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1028 .addImm(0);
1029 return true;
1032 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1033 if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1034 return false;
1036 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1037 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1038 return false;
1040 auto IsHazardFn = [TRI] (MachineInstr *I) {
1041 if (SIInstrInfo::isVALU(*I))
1042 return false;
1043 return I->readsRegister(AMDGPU::EXEC, TRI);
1046 const SIInstrInfo *TII = ST.getInstrInfo();
1047 auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1048 if (!MI)
1049 return false;
1050 if (SIInstrInfo::isVALU(*MI)) {
1051 if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1052 return true;
1053 for (auto MO : MI->implicit_operands())
1054 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1055 return true;
1057 if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1058 (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1059 return true;
1060 return false;
1063 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1064 std::numeric_limits<int>::max())
1065 return false;
1067 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1068 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1069 .addImm(0xfffe);
1070 return true;
1073 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1074 if (!ST.hasLdsBranchVmemWARHazard())
1075 return false;
1077 auto IsHazardInst = [] (const MachineInstr *MI) {
1078 if (SIInstrInfo::isDS(*MI))
1079 return 1;
1080 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1081 return 2;
1082 return 0;
1085 auto InstType = IsHazardInst(MI);
1086 if (!InstType)
1087 return false;
1089 auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1090 return I && (IsHazardInst(I) ||
1091 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1092 I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1093 !I->getOperand(1).getImm()));
1096 auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1097 if (!I->isBranch())
1098 return false;
1100 auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1101 auto InstType2 = IsHazardInst(I);
1102 return InstType2 && InstType != InstType2;
1105 auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1106 if (!I)
1107 return false;
1109 auto InstType2 = IsHazardInst(I);
1110 if (InstType == InstType2)
1111 return true;
1113 return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1114 I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1115 !I->getOperand(1).getImm();
1118 return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1119 std::numeric_limits<int>::max();
1122 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1123 std::numeric_limits<int>::max())
1124 return false;
1126 const SIInstrInfo *TII = ST.getInstrInfo();
1127 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1128 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1129 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1130 .addImm(0);
1132 return true;
1135 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1136 int NSAtoVMEMWaitStates = 1;
1138 if (!ST.hasNSAtoVMEMBug())
1139 return 0;
1141 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1142 return 0;
1144 const SIInstrInfo *TII = ST.getInstrInfo();
1145 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1146 if (!Offset || (Offset->getImm() & 6) == 0)
1147 return 0;
1149 auto IsHazardFn = [TII] (MachineInstr *I) {
1150 if (!SIInstrInfo::isMIMG(*I))
1151 return false;
1152 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1153 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1154 TII->getInstSizeInBytes(*I) >= 16;
1157 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1160 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1161 int FPAtomicToDenormModeWaitStates = 3;
1163 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1164 return 0;
1166 auto IsHazardFn = [] (MachineInstr *I) {
1167 if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1168 return false;
1169 return SIInstrInfo::isFPAtomic(*I);
1172 auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1173 if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1174 return true;
1176 switch (MI->getOpcode()) {
1177 case AMDGPU::S_WAITCNT:
1178 case AMDGPU::S_WAITCNT_VSCNT:
1179 case AMDGPU::S_WAITCNT_VMCNT:
1180 case AMDGPU::S_WAITCNT_EXPCNT:
1181 case AMDGPU::S_WAITCNT_LGKMCNT:
1182 case AMDGPU::S_WAITCNT_IDLE:
1183 return true;
1184 default:
1185 break;
1188 return false;
1192 return FPAtomicToDenormModeWaitStates -
1193 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1196 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1197 assert(SIInstrInfo::isMAI(*MI));
1199 int WaitStatesNeeded = 0;
1200 unsigned Opc = MI->getOpcode();
1202 auto IsVALUFn = [] (MachineInstr *MI) {
1203 return SIInstrInfo::isVALU(*MI);
1206 if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1207 const int LegacyVALUWritesVGPRWaitStates = 2;
1208 const int VALUWritesExecWaitStates = 4;
1209 const int MaxWaitStates = 4;
1211 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1212 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1213 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1215 if (WaitStatesNeeded < MaxWaitStates) {
1216 for (const MachineOperand &Use : MI->explicit_uses()) {
1217 const int MaxWaitStates = 2;
1219 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1220 continue;
1222 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1223 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1224 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1226 if (WaitStatesNeeded == MaxWaitStates)
1227 break;
1232 auto IsMFMAFn = [] (MachineInstr *MI) {
1233 return SIInstrInfo::isMAI(*MI) &&
1234 MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1235 MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1238 for (const MachineOperand &Op : MI->explicit_operands()) {
1239 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1240 continue;
1242 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1243 continue;
1245 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1246 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1247 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1248 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1249 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1250 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1251 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1252 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1253 const int MaxWaitStates = 18;
1254 Register Reg = Op.getReg();
1255 unsigned HazardDefLatency = 0;
1257 auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1258 (MachineInstr *MI) {
1259 if (!IsMFMAFn(MI))
1260 return false;
1261 Register DstReg = MI->getOperand(0).getReg();
1262 if (DstReg == Reg)
1263 return false;
1264 HazardDefLatency = std::max(HazardDefLatency,
1265 TSchedModel.computeInstrLatency(MI));
1266 return TRI.regsOverlap(DstReg, Reg);
1269 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1270 MaxWaitStates);
1271 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1272 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1273 int OpNo = MI->getOperandNo(&Op);
1274 if (OpNo == SrcCIdx) {
1275 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1276 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1277 switch (HazardDefLatency) {
1278 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1279 break;
1280 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1281 break;
1282 case 16: LLVM_FALLTHROUGH;
1283 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1284 break;
1286 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1287 switch (HazardDefLatency) {
1288 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1289 break;
1290 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1291 break;
1292 case 16: LLVM_FALLTHROUGH;
1293 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1294 break;
1298 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1299 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1301 if (WaitStatesNeeded == MaxWaitStates)
1302 return WaitStatesNeeded; // Early exit.
1304 auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1305 if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1306 return false;
1307 Register DstReg = MI->getOperand(0).getReg();
1308 return TRI.regsOverlap(Reg, DstReg);
1311 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1312 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1313 const int AccVGPRWriteAccVgprReadWaitStates = 3;
1314 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1315 if (OpNo == SrcCIdx)
1316 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1317 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1318 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1320 WaitStatesNeededForUse = NeedWaitStates -
1321 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1322 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1324 if (WaitStatesNeeded == MaxWaitStates)
1325 return WaitStatesNeeded; // Early exit.
1328 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1329 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1330 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1331 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1332 const int MaxWaitStates = 13;
1333 Register DstReg = MI->getOperand(0).getReg();
1334 unsigned HazardDefLatency = 0;
1336 auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1337 (MachineInstr *MI) {
1338 if (!IsMFMAFn(MI))
1339 return false;
1340 Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1341 HazardDefLatency = std::max(HazardDefLatency,
1342 TSchedModel.computeInstrLatency(MI));
1343 return TRI.regsOverlap(Reg, DstReg);
1346 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1347 int NeedWaitStates;
1348 switch (HazardDefLatency) {
1349 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1350 break;
1351 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1352 break;
1353 case 16: LLVM_FALLTHROUGH;
1354 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1355 break;
1358 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1359 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1362 return WaitStatesNeeded;
1365 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1366 if (!ST.hasMAIInsts())
1367 return 0;
1369 int WaitStatesNeeded = 0;
1371 auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1372 return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1375 for (const MachineOperand &Op : MI->explicit_uses()) {
1376 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1377 continue;
1379 Register Reg = Op.getReg();
1381 const int AccVgprReadLdStWaitStates = 2;
1382 const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
1383 const int MaxWaitStates = 2;
1385 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1386 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1387 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1389 if (WaitStatesNeeded == MaxWaitStates)
1390 return WaitStatesNeeded; // Early exit.
1392 auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
1393 if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1394 return false;
1395 auto IsVALUFn = [] (MachineInstr *MI) {
1396 return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1398 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1399 std::numeric_limits<int>::max();
1402 WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
1403 getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
1404 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1407 return WaitStatesNeeded;