lib/Target/AMDGPU/GCNHazardRecognizer.cpp
//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <algorithm>
#include <cassert>
#include <limits>
#include <set>
#include <vector>

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//
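
// The lookahead window is sized for the deepest hazard this recognizer can be
// asked about: when the function uses AGPRs (so MFMA/ACCVGPR hazards are
// possible, see checkMAIHazards() below) up to 18 wait states are checked,
// otherwise 5 covers the remaining hazards. This reading is inferred from the
// wait-state constants used later in this file.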
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}
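
// Top-level query used by the scheduler: returns NoopHazard when the
// candidate instruction would need at least one wait state (so the scheduler
// should pick something else or emit a no-op), and NoHazard otherwise. The
// same per-hazard checks are reused by PreEmitNoopsCommon() below to compute
// the exact number of wait states.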
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return NoopHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return NoopHazard;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return NoopHazard;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return NoopHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return NoopHazard;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return NoopHazard;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return NoopHazard;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return NoopHazard;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return NoopHazard;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return NoopHazard;

  if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0)
    return NoopHazard;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return NoopHazard;

  if (checkAnyInstHazards(MI) > 0)
    return NoopHazard;

  return NoHazard;
}

static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
      .addImm(0);
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode)
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0; i < WaitStates; ++i)
      insertNoopInBundle(CurrCycleInstr, TII);

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
  IsHazardRecognizerMode = false;
  return PreEmitNoopsCommon(SU->getInstr());
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = std::max(0, checkAnyInstHazards(MI));

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (MI->mayLoad() || MI->mayStore())
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr)
    return;

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
      CurrCycleInstr->isKill())
    return;

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessor blocks. Scanning stops once \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineBasicBlock *MBB,
                              MachineBasicBlock::reverse_instr_iterator I,
                              int WaitStates,
                              IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(&*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(&*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = WaitStates;
  bool Found = false;
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    if (W == std::numeric_limits<int>::max())
      continue;

    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
    if (IsExpired(nullptr, MinWaitStates))
      return MinWaitStates;

    Found = true;
  }

  if (Found)
    return MinWaitStates;

  return std::numeric_limits<int>::max();
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              MachineInstr *MI,
                              IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}
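
// Two search modes: in hazard recognizer mode (the post-RA hazard pass) the
// search walks backwards through the CFG from CurrCycleInstr using the
// recursive helper above; otherwise it replays the scheduler's EmittedInstrs
// buffer, which only remembers the last getMaxLookAhead() wait states.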
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI,
                        BitVector &BV, unsigned Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD != SIInstrInfo::isSMRD(*MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if they
  // use the same address. For now, just start a new clause whenever we see a
  // store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
    // needs some number of nops in between. We don't know how many we need, but
    // let's use 4. This wasn't discovered before probably because the only
    // case when this happens is when we expand a 64-bit pointer into a full
    // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
    // probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                              [](MachineInstr *) { return true; },
                              DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
    return GetRegHWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
    return HWReg == getHWReg(TII, *MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                                const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVGPR(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
    int DataIdx = createsVALUHazard(*MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVALU(*MI);
  };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
  if (MI->isDebugInstr())
    return 0;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!ST.hasSMovFedHazard())
    return 0;

  // Check for any instruction reading an SGPR after a write from
  // s_mov_fed_b32.
  int MovFedWaitStates = 1;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Use : MI->uses()) {
    if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    auto IsHazardFn = [] (MachineInstr *MI) {
      return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
    };
    int WaitStatesNeededForUse =
        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
                                                 MovFedWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isSALU(*MI);
  };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}
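
// The fix* routines below are only reached in hazard recognizer mode (see
// PreEmitNoops(MachineInstr *)). Instead of counting wait states, each one
// searches backwards for a specific hazard pattern and, if found, inserts a
// mitigating instruction (e.g. a v_nop, a dummy v_mov/s_mov, or a targeted
// waitcnt) immediately before the offending instruction.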
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII] (MachineInstr *MI) {
    return TII->isVOPC(*MI);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    unsigned Opc = MI->getOpcode();
    return SIInstrInfo::isVALU(*MI) &&
           Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 &&
           Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
        !SIInstrInfo::isFLAT(*I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int) {
    return MI && (SIInstrInfo::isVALU(*MI) ||
                  (MI->getOpcode() == AMDGPU::S_WAITCNT &&
                   !MI->getOperand(0).getImm()));
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}
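
// Hazard: this VALU writes an SGPR that an earlier, possibly still
// outstanding SMEM instruction reads. The fix inserts "s_mov_b32 null, 0"
// before the VALU; per the IsExpiredFn logic below, any ordinary SALU that is
// independent of the at-risk SMEM is enough to mitigate the hazard.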
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
    return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
    if (MI) {
      if (TII->isSALU(*MI)) {
        switch (MI->getOpcode()) {
        case AMDGPU::S_SETVSKIP:
        case AMDGPU::S_VERSION:
        case AMDGPU::S_WAITCNT_VSCNT:
        case AMDGPU::S_WAITCNT_VMCNT:
        case AMDGPU::S_WAITCNT_EXPCNT:
          // These instructions cannot mitigate the hazard.
          return false;
        case AMDGPU::S_WAITCNT_LGKMCNT:
          // Reducing lgkmcnt count to 0 always mitigates the hazard.
          return (MI->getOperand(1).getImm() == 0) &&
                 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
        case AMDGPU::S_WAITCNT: {
          const int64_t Imm = MI->getOperand(0).getImm();
          AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
          return (Decoded.LgkmCnt == 0);
        }
        default:
          // SOPP instructions cannot mitigate the hazard.
          if (TII->isSOPP(*MI))
            return false;
          // At this point the SALU can be assumed to mitigate the hazard
          // because either:
          // (a) it is independent of the at risk SMEM (breaking chain),
          // or
          // (b) it is dependent on the SMEM, in which case an appropriate
          //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
          //     SMEM instruction.
          return true;
        }
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI] (MachineInstr *I) {
    if (SIInstrInfo::isVALU(*I))
      return false;
    return I->readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
    if (!MI)
      return false;
    if (SIInstrInfo::isVALU(*MI)) {
      if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI->implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}
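
// Hazard: an LDS (DS) access and a VMEM access (including segment-specific
// FLAT) of different kinds separated by a branch; IsHazardInst classifies DS
// as 1 and VMEM/FLAT as 2, and the hazard requires the kinds to differ across
// the branch. The fix inserts "s_waitcnt_vscnt null, 0" before the second
// access.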
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  auto IsHazardInst = [] (const MachineInstr *MI) {
    if (SIInstrInfo::isDS(*MI))
      return 1;
    if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
    return I && (IsHazardInst(I) ||
                 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                  I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                  !I->getOperand(1).getImm()));
  };

  auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
    if (!I->isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
      if (!I)
        return false;

      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I->getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII] (MachineInstr *I) {
    if (!SIInstrInfo::isMIMG(*I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(*I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [] (MachineInstr *I) {
    if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
      return false;
    return SIInstrInfo::isFPAtomic(*I);
  };

  auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
      return true;

    switch (MI->getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAITCNT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
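
// The MFMA wait-state tables below are keyed on the producer's scheduling
// latency as reported by TSchedModel.computeInstrLatency(): the cases of 2,
// 8 and 16 cycles appear to correspond to the 4x4, 16x16 and 32x32 MFMA
// variants, judging by the *4x4*/*16x16*/*32x32* constant names used in each
// switch.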
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isVALU(*MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [] (MachineInstr *MI) {
    return SIInstrInfo::isMAI(*MI) &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
           MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
                              (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
               break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
               break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
               break;
      case 16: LLVM_FALLTHROUGH;
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
               break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
        return false;
      Register DstReg = MI->getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
                        (MachineInstr *MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency = std::max(HazardDefLatency,
                                  TSchedModel.computeInstrLatency(MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
             break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
             break;
    case 16: LLVM_FALLTHROUGH;
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  if (!ST.hasMAIInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [] (MachineInstr *MI) {
    return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
      if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
        return false;
      auto IsVALUFn = [] (MachineInstr *MI) {
        return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}