1 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This contains a MachineSchedStrategy implementation for maximizing wave
11 /// occupancy on GCN hardware.
12 ///
13 /// This pass will apply multiple scheduling stages to the same function.
14 /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15 /// entry point for the scheduling of those regions is
16 /// GCNScheduleDAGMILive::runSchedStages.
18 /// Generally, the reason for having multiple scheduling stages is to account
19 /// for the kernel-wide effect of register usage on occupancy. Usually, only a
20 /// few scheduling regions will have register pressure high enough to limit
21 /// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22 /// other regions.
23 ///
24 //===----------------------------------------------------------------------===//
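// Editorial note (not part of the upstream file): for the max-occupancy
// strategy below, runSchedStages() visits the stages in the order they are
// pushed by GCNMaxOccupancySchedStrategy, each stage re-scheduling only the
// regions its initGCNRegion() accepts:
//   OccInitialSchedule -> UnclusteredHighRPReschedule
//     -> ClusteredLowOccupancyReschedule -> PreRARematerialize
// The max-ILP strategy instead runs a single ILPInitialSchedule stage.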
26 #include "GCNSchedStrategy.h"
27 #include "AMDGPUIGroupLP.h"
28 #include "SIMachineFunctionInfo.h"
29 #include "llvm/CodeGen/RegisterClassInfo.h"
31 #define DEBUG_TYPE "machine-scheduler"
33 using namespace llvm;
35 static cl::opt<bool> DisableUnclusterHighRP(
36 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
37 cl::desc("Disable unclustered high register pressure "
38 "reduction scheduling stage."),
39 cl::init(false));
41 static cl::opt<bool> DisableClusteredLowOccupancy(
42 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
43 cl::desc("Disable clustered low occupancy "
44 "rescheduling for ILP scheduling stage."),
45 cl::init(false));
47 static cl::opt<unsigned> ScheduleMetricBias(
48 "amdgpu-schedule-metric-bias", cl::Hidden,
49 cl::desc(
50 "Sets the bias which adds weight to occupancy vs latency. Set it to "
51 "100 to chase the occupancy only."),
52 cl::init(10));
54 static cl::opt<bool>
55 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
56 cl::desc("Relax occupancy targets for kernels which are memory "
57 "bound (amdgpu-membound-threshold), or "
58 "Wave Limited (amdgpu-limit-wave-threshold)."),
59 cl::init(false));
61 static cl::opt<bool> GCNTrackers(
62 "amdgpu-use-amdgpu-trackers", cl::Hidden,
63 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
64 cl::init(false));
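// Illustrative usage (assumed invocation; these cl::opt flags are parsed by
// any tool that runs this scheduler, e.g. llc):
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 \
//       -amdgpu-schedule-metric-bias=100 -amdgpu-use-amdgpu-trackers kernel.ll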
66 const unsigned ScheduleMetrics::ScaleFactor = 100;
68 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
69 : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
70 DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
73 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
74 GenericScheduler::initialize(DAG);
76 MF = &DAG->MF;
78 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
80 SGPRExcessLimit =
81 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
82 VGPRExcessLimit =
83 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
85 SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
86 // Set the initial TargetOccupancy to the maximum occupancy that we can
87 // achieve for this function. This effectively sets a lower bound on the
88 // 'Critical' register limits in the scheduler.
89 // Allow for lower occupancy targets if the kernel is wave limited or memory
90 // bound and the relaxed occupancy feature is enabled.
91 TargetOccupancy =
92 RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
93 SGPRCriticalLimit =
94 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
96 if (!KnownExcessRP) {
97 VGPRCriticalLimit =
98 std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
99 } else {
100 // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
101 // it returns a reasonably small number for targets with lots of VGPRs, such
102 // as GFX10 and GFX11.
103 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
104 "VGPRCriticalLimit calculation method.\n");
106 unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
107 unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
108 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
109 VGPRBudget = std::max(VGPRBudget, Granule);
110 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
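// Illustrative arithmetic with assumed values: if Addressable = 256,
// TargetOccupancy = 8 and Granule = 8, then VGPRBudget =
// alignDown(256 / 8, 8) = 32 and VGPRCriticalLimit = min(32, VGPRExcessLimit).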
113 // Subtract error margin and bias from register limits and avoid overflow.
114 SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
115 VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
116 SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
117 VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
119 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
120 << ", VGPRExcessLimit = " << VGPRExcessLimit
121 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
122 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
125 /// Checks whether \p SU can use the cached DAG pressure diffs to compute the
126 /// current register pressure.
128 /// This works for the common case, but it has a few exceptions that have been
129 /// observed through trial and error:
130 /// - Explicit physical register operands
131 /// - Subregister definitions
133 /// In both of those cases, PressureDiff doesn't represent the actual pressure,
134 /// and querying LiveIntervals through the RegPressureTracker is needed to get
135 /// an accurate value.
137 /// We should eventually only use PressureDiff for maximum performance, but this
138 /// already allows 80% of SUs to take the fast path without changing scheduling
139 /// at all. Further changes would either change scheduling, or require a lot
140 /// more logic to recover an accurate pressure estimate from the PressureDiffs.
141 static bool canUsePressureDiffs(const SUnit &SU) {
142 if (!SU.isInstr())
143 return false;
145 // Cannot use pressure diffs for subregister defs or with physregs; they are
146 // imprecise in both cases.
147 for (const auto &Op : SU.getInstr()->operands()) {
148 if (!Op.isReg() || Op.isImplicit())
149 continue;
150 if (Op.getReg().isPhysical() ||
151 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
152 return false;
154 return true;
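// Illustrative examples (not from the upstream file): an SU whose instruction
// defines a subregister (e.g. a V_MOV_B32 writing %0.sub0 of a vreg_64), or
// one with an explicit physical register operand (e.g. a COPY into $vgpr0),
// takes the slow RegPressureTracker path; a plain full-register virtual def
// can use the cached PressureDiffs fast path.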
157 static void getRegisterPressures(
158 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
159 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
160 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
161 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
162 // getDownwardPressure() and getUpwardPressure() make temporary changes to
163 // the tracker, so we need to pass those functions a non-const copy.
164 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
165 if (!GCNTrackers) {
166 AtTop
167 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
168 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
170 return;
173 // GCNTrackers
174 Pressure.resize(4, 0);
175 MachineInstr *MI = SU->getInstr();
176 GCNRegPressure NewPressure;
177 if (AtTop) {
178 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
179 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
180 } else {
181 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
182 TempUpwardTracker.recede(*MI);
183 NewPressure = TempUpwardTracker.getPressure();
185 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
186 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
187 NewPressure.getArchVGPRNum();
188 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
191 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
192 bool AtTop,
193 const RegPressureTracker &RPTracker,
194 const SIRegisterInfo *SRI,
195 unsigned SGPRPressure,
196 unsigned VGPRPressure, bool IsBottomUp) {
197 Cand.SU = SU;
198 Cand.AtTop = AtTop;
200 if (!DAG->isTrackingPressure())
201 return;
203 Pressure.clear();
204 MaxPressure.clear();
206 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
207 // possible over querying the RegPressureTracker.
209 // The RegPressureTracker will make a lot of LIS queries, which are very
210 // expensive; it is considered a slow function in this context.
212 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
213 // trivial lookup into an array. It is pretty much free.
215 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
216 // PressureDiffs.
217 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
218 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
219 DownwardTracker, UpwardTracker, DAG, SRI);
220 } else {
221 // Reserve 4 slots.
222 Pressure.resize(4, 0);
223 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
224 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
226 for (const auto &Diff : DAG->getPressureDiff(SU)) {
227 if (!Diff.isValid())
228 continue;
229 // PressureDiffs is always bottom-up so if we're working top-down we need
230 // to invert its sign.
231 Pressure[Diff.getPSet()] +=
232 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
235 #ifdef EXPENSIVE_CHECKS
236 std::vector<unsigned> CheckPressure, CheckMaxPressure;
237 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
238 DownwardTracker, UpwardTracker, DAG, SRI);
239 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
240 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
241 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
242 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
243 errs() << "Register Pressure is inaccurate when calculated through "
244 "PressureDiff\n"
245 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
246 << ", expected "
247 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
248 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
249 << ", expected "
250 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
251 report_fatal_error("inaccurate register pressure calculation");
253 #endif
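// Note: this verification path is only compiled in when LLVM is configured
// with expensive checks enabled, e.g. -DLLVM_ENABLE_EXPENSIVE_CHECKS=ON.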
256 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
257 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
259 // If two instructions increase the pressure of different register sets
260 // by the same amount, the generic scheduler will prefer to schedule the
261 // instruction that increases the set with the fewest registers,
262 // which in our case would be SGPRs. This is rarely what we want, so
263 // when we report excess/critical register pressure, we do it either
264 // only for VGPRs or only for SGPRs.
266 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
267 const unsigned MaxVGPRPressureInc = 16;
268 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
269 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
271 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
272 // to increase the likelihood we don't go over the limits. We should improve
273 // the analysis to look through dependencies to find the path with the least
274 // register pressure.
276 // We only need to update the RPDelta for instructions that increase register
277 // pressure. Instructions that decrease or keep reg pressure the same will be
278 // marked as RegExcess in tryCandidate() when they are compared with
279 // instructions that increase the register pressure.
280 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
281 HasHighPressure = true;
282 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
283 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
286 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
287 HasHighPressure = true;
288 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
289 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
292 // Register pressure is considered 'CRITICAL' if it is approaching a value
293 // that would reduce the wave occupancy for the execution unit. When
294 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
295 // have the same cost, so we don't need to prefer one over the other.
297 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
298 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
300 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
301 HasHighPressure = true;
302 if (SGPRDelta > VGPRDelta) {
303 Cand.RPDelta.CriticalMax =
304 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
305 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
306 } else {
307 Cand.RPDelta.CriticalMax =
308 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
309 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
314 // This function is mostly cut and pasted from
315 // GenericScheduler::pickNodeFromQueue()
316 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
317 const CandPolicy &ZonePolicy,
318 const RegPressureTracker &RPTracker,
319 SchedCandidate &Cand,
320 bool IsBottomUp) {
321 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
322 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
323 unsigned SGPRPressure = 0;
324 unsigned VGPRPressure = 0;
325 if (DAG->isTrackingPressure()) {
326 if (!GCNTrackers) {
327 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
328 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
329 } else {
330 GCNRPTracker *T = IsBottomUp
331 ? static_cast<GCNRPTracker *>(&UpwardTracker)
332 : static_cast<GCNRPTracker *>(&DownwardTracker);
333 SGPRPressure = T->getPressure().getSGPRNum();
334 VGPRPressure = T->getPressure().getArchVGPRNum();
337 ReadyQueue &Q = Zone.Available;
338 for (SUnit *SU : Q) {
340 SchedCandidate TryCand(ZonePolicy);
341 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
342 VGPRPressure, IsBottomUp);
343 // Pass SchedBoundary only when comparing nodes from the same boundary.
344 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
345 tryCandidate(Cand, TryCand, ZoneArg);
346 if (TryCand.Reason != NoCand) {
347 // Initialize resource delta if needed in case future heuristics query it.
348 if (TryCand.ResDelta == SchedResourceDelta())
349 TryCand.initResourceDelta(Zone.DAG, SchedModel);
350 Cand.setBest(TryCand);
351 LLVM_DEBUG(traceCandidate(Cand));
356 // This function is mostly cut and pasted from
357 // GenericScheduler::pickNodeBidirectional()
358 SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
359 // Schedule as far as possible in the direction of no choice. This is most
360 // efficient, but also provides the best heuristics for CriticalPSets.
361 if (SUnit *SU = Bot.pickOnlyChoice()) {
362 IsTopNode = false;
363 return SU;
365 if (SUnit *SU = Top.pickOnlyChoice()) {
366 IsTopNode = true;
367 return SU;
369 // Set the bottom-up policy based on the state of the current bottom zone and
370 // the instructions outside the zone, including the top zone.
371 CandPolicy BotPolicy;
372 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
373 // Set the top-down policy based on the state of the current top zone and
374 // the instructions outside the zone, including the bottom zone.
375 CandPolicy TopPolicy;
376 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
378 // See if BotCand is still valid (because we previously scheduled from Top).
379 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
380 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
381 BotCand.Policy != BotPolicy) {
382 BotCand.reset(CandPolicy());
383 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
384 /*IsBottomUp=*/true);
385 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
386 } else {
387 LLVM_DEBUG(traceCandidate(BotCand));
388 #ifndef NDEBUG
389 if (VerifyScheduling) {
390 SchedCandidate TCand;
391 TCand.reset(CandPolicy());
392 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
393 /*IsBottomUp=*/true);
394 assert(TCand.SU == BotCand.SU &&
395 "Last pick result should correspond to re-picking right now");
397 #endif
400 // Check if the top Q has a better candidate.
401 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
402 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
403 TopCand.Policy != TopPolicy) {
404 TopCand.reset(CandPolicy());
405 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
406 /*IsBottomUp=*/false);
407 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
408 } else {
409 LLVM_DEBUG(traceCandidate(TopCand));
410 #ifndef NDEBUG
411 if (VerifyScheduling) {
412 SchedCandidate TCand;
413 TCand.reset(CandPolicy());
414 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
415 /*IsBottomUp=*/false);
416 assert(TCand.SU == TopCand.SU &&
417 "Last pick result should correspond to re-picking right now");
419 #endif
422 // Pick best from BotCand and TopCand.
423 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
424 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
425 SchedCandidate Cand = BotCand;
426 TopCand.Reason = NoCand;
427 tryCandidate(Cand, TopCand, nullptr);
428 if (TopCand.Reason != NoCand) {
429 Cand.setBest(TopCand);
431 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
433 IsTopNode = Cand.AtTop;
434 return Cand.SU;
437 // This function is mostly cut and pasted from
438 // GenericScheduler::pickNode()
439 SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
440 if (DAG->top() == DAG->bottom()) {
441 assert(Top.Available.empty() && Top.Pending.empty() &&
442 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
443 return nullptr;
445 SUnit *SU;
446 do {
447 if (RegionPolicy.OnlyTopDown) {
448 SU = Top.pickOnlyChoice();
449 if (!SU) {
450 CandPolicy NoPolicy;
451 TopCand.reset(NoPolicy);
452 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
453 /*IsBottomUp=*/false);
454 assert(TopCand.Reason != NoCand && "failed to find a candidate");
455 SU = TopCand.SU;
457 IsTopNode = true;
458 } else if (RegionPolicy.OnlyBottomUp) {
459 SU = Bot.pickOnlyChoice();
460 if (!SU) {
461 CandPolicy NoPolicy;
462 BotCand.reset(NoPolicy);
463 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
464 /*IsBottomUp=*/true);
465 assert(BotCand.Reason != NoCand && "failed to find a candidate");
466 SU = BotCand.SU;
468 IsTopNode = false;
469 } else {
470 SU = pickNodeBidirectional(IsTopNode);
472 } while (SU->isScheduled);
474 if (SU->isTopReady())
475 Top.removeReady(SU);
476 if (SU->isBottomReady())
477 Bot.removeReady(SU);
479 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
480 << *SU->getInstr());
481 return SU;
484 void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
485 if (GCNTrackers) {
486 MachineInstr *MI = SU->getInstr();
487 IsTopNode ? (void)DownwardTracker.advance(MI, false)
488 : UpwardTracker.recede(*MI);
491 return GenericScheduler::schedNode(SU, IsTopNode);
494 GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
495 assert(CurrentStage && CurrentStage != SchedStages.end());
496 return *CurrentStage;
499 bool GCNSchedStrategy::advanceStage() {
500 assert(CurrentStage != SchedStages.end());
501 if (!CurrentStage)
502 CurrentStage = SchedStages.begin();
503 else
504 CurrentStage++;
506 return CurrentStage != SchedStages.end();
509 bool GCNSchedStrategy::hasNextStage() const {
510 assert(CurrentStage);
511 return std::next(CurrentStage) != SchedStages.end();
514 GCNSchedStageID GCNSchedStrategy::getNextStage() const {
515 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
516 return *std::next(CurrentStage);
519 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
520 const MachineSchedContext *C, bool IsLegacyScheduler)
521 : GCNSchedStrategy(C) {
522 SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
523 SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
524 SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
525 SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
526 GCNTrackers = GCNTrackers & !IsLegacyScheduler;
529 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
530 : GCNSchedStrategy(C) {
531 SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
534 bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
535 SchedCandidate &TryCand,
536 SchedBoundary *Zone) const {
537 // Initialize the candidate if needed.
538 if (!Cand.isValid()) {
539 TryCand.Reason = NodeOrder;
540 return true;
543 // Avoid spilling by exceeding the register limit.
544 if (DAG->isTrackingPressure() &&
545 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
546 RegExcess, TRI, DAG->MF))
547 return TryCand.Reason != NoCand;
549 // Bias PhysReg defs and copies towards their uses and definitions, respectively.
550 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
551 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
552 return TryCand.Reason != NoCand;
554 bool SameBoundary = Zone != nullptr;
555 if (SameBoundary) {
556 // Prioritize instructions that read unbuffered resources by stall cycles.
557 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
558 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
559 return TryCand.Reason != NoCand;
561 // Avoid critical resource consumption and balance the schedule.
562 TryCand.initResourceDelta(DAG, SchedModel);
563 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
564 TryCand, Cand, ResourceReduce))
565 return TryCand.Reason != NoCand;
566 if (tryGreater(TryCand.ResDelta.DemandedResources,
567 Cand.ResDelta.DemandedResources, TryCand, Cand,
568 ResourceDemand))
569 return TryCand.Reason != NoCand;
571 // Unconditionally try to reduce latency.
572 if (tryLatency(TryCand, Cand, *Zone))
573 return TryCand.Reason != NoCand;
575 // Weak edges are for clustering and other constraints.
576 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
577 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
578 return TryCand.Reason != NoCand;
581 // Keep clustered nodes together to encourage downstream peephole
582 // optimizations which may reduce resource requirements.
584 // This is a best effort to set things up for a post-RA pass. Optimizations
585 // like generating loads of multiple registers should ideally be done within
586 // the scheduler pass by combining the loads during DAG postprocessing.
587 const SUnit *CandNextClusterSU =
588 Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
589 const SUnit *TryCandNextClusterSU =
590 TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
591 if (tryGreater(TryCand.SU == TryCandNextClusterSU,
592 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
593 return TryCand.Reason != NoCand;
595 // Avoid increasing the max critical pressure in the scheduled region.
596 if (DAG->isTrackingPressure() &&
597 tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
598 TryCand, Cand, RegCritical, TRI, DAG->MF))
599 return TryCand.Reason != NoCand;
601 // Avoid increasing the max pressure of the entire region.
602 if (DAG->isTrackingPressure() &&
603 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
604 Cand, RegMax, TRI, DAG->MF))
605 return TryCand.Reason != NoCand;
607 if (SameBoundary) {
608 // Fall through to original instruction order.
609 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
610 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
611 TryCand.Reason = NodeOrder;
612 return true;
615 return false;
618 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
619 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
620 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
621 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
622 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
623 RegionLiveOuts(this, /*IsLiveOut=*/true) {
625 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
626 if (RelaxedOcc) {
627 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
628 if (MinOccupancy != StartingOccupancy)
629 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
630 << ".\n");
634 std::unique_ptr<GCNSchedStage>
635 GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
636 switch (SchedStageID) {
637 case GCNSchedStageID::OccInitialSchedule:
638 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
639 case GCNSchedStageID::UnclusteredHighRPReschedule:
640 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
641 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
642 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
643 case GCNSchedStageID::PreRARematerialize:
644 return std::make_unique<PreRARematStage>(SchedStageID, *this);
645 case GCNSchedStageID::ILPInitialSchedule:
646 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
649 llvm_unreachable("Unknown SchedStageID.");
652 void GCNScheduleDAGMILive::schedule() {
653 // Collect all scheduling regions. The actual scheduling is performed in
654 // GCNScheduleDAGMILive::finalizeSchedule.
655 Regions.push_back(std::pair(RegionBegin, RegionEnd));
658 GCNRegPressure
659 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
660 GCNDownwardRPTracker RPTracker(*LIS);
661 RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
662 return RPTracker.moveMaxPressure();
665 static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
666 MachineBasicBlock::iterator RegionEnd) {
667 auto REnd = RegionEnd == RegionBegin->getParent()->end()
668 ? std::prev(RegionEnd)
669 : RegionEnd;
670 return &*skipDebugInstructionsBackward(REnd, RegionBegin);
673 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
674 const MachineBasicBlock *MBB) {
675 GCNDownwardRPTracker RPTracker(*LIS);
677 // If the block has only one successor then the live-ins of that successor
678 // are live-outs of the current block. We can reuse the calculated live set
679 // if the successor will be sent to scheduling past the current block.
681 // However, due to a bug in LiveInterval analysis it may happen that two
682 // predecessors of the same successor block have different lane bitmasks for
683 // a live-out register. Work around that by sticking to a one-to-one
684 // relationship, i.e. one predecessor with one successor block.
685 const MachineBasicBlock *OnlySucc = nullptr;
686 if (MBB->succ_size() == 1) {
687 auto *Candidate = *MBB->succ_begin();
688 if (!Candidate->empty() && Candidate->pred_size() == 1) {
689 SlotIndexes *Ind = LIS->getSlotIndexes();
690 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
691 OnlySucc = Candidate;
695 // The scheduler sends regions from the end of the block upwards.
696 size_t CurRegion = RegionIdx;
697 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
698 if (Regions[CurRegion].first->getParent() != MBB)
699 break;
700 --CurRegion;
702 auto I = MBB->begin();
703 auto LiveInIt = MBBLiveIns.find(MBB);
704 auto &Rgn = Regions[CurRegion];
705 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
706 if (LiveInIt != MBBLiveIns.end()) {
707 auto LiveIn = std::move(LiveInIt->second);
708 RPTracker.reset(*MBB->begin(), &LiveIn);
709 MBBLiveIns.erase(LiveInIt);
710 } else {
711 I = Rgn.first;
712 auto LRS = BBLiveInMap.lookup(NonDbgMI);
713 #ifdef EXPENSIVE_CHECKS
714 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
715 #endif
716 RPTracker.reset(*I, &LRS);
719 for (;;) {
720 I = RPTracker.getNext();
722 if (Regions[CurRegion].first == I || NonDbgMI == I) {
723 LiveIns[CurRegion] = RPTracker.getLiveRegs();
724 RPTracker.clearMaxPressure();
727 if (Regions[CurRegion].second == I) {
728 Pressure[CurRegion] = RPTracker.moveMaxPressure();
729 if (CurRegion-- == RegionIdx)
730 break;
732 RPTracker.advanceToNext();
733 RPTracker.advanceBeforeNext();
736 if (OnlySucc) {
737 if (I != MBB->end()) {
738 RPTracker.advanceToNext();
739 RPTracker.advance(MBB->end());
741 RPTracker.advanceBeforeNext();
742 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
746 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
747 GCNScheduleDAGMILive::getRegionLiveInMap() const {
748 assert(!Regions.empty());
749 std::vector<MachineInstr *> RegionFirstMIs;
750 RegionFirstMIs.reserve(Regions.size());
751 auto I = Regions.rbegin(), E = Regions.rend();
752 auto *BB = I->first->getParent();
753 do {
754 auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
755 RegionFirstMIs.push_back(MI);
756 do {
757 ++I;
758 } while (I != E && I->first->getParent() == BB);
759 } while (I != E);
760 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
763 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
764 GCNScheduleDAGMILive::getRegionLiveOutMap() const {
765 assert(!Regions.empty());
766 std::vector<MachineInstr *> RegionLastMIs;
767 RegionLastMIs.reserve(Regions.size());
768 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
769 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
771 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
774 void RegionPressureMap::buildLiveRegMap() {
775 IdxToInstruction.clear();
777 RegionLiveRegMap =
778 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
779 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
780 MachineInstr *RegionKey =
781 IsLiveOut
782 ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
783 : &*DAG->Regions[I].first;
784 IdxToInstruction[I] = RegionKey;
788 void GCNScheduleDAGMILive::finalizeSchedule() {
789 // Start actual scheduling here. This function is called by the base
790 // MachineScheduler after all regions have been recorded by
791 // GCNScheduleDAGMILive::schedule().
792 LiveIns.resize(Regions.size());
793 Pressure.resize(Regions.size());
794 RescheduleRegions.resize(Regions.size());
795 RegionsWithHighRP.resize(Regions.size());
796 RegionsWithExcessRP.resize(Regions.size());
797 RegionsWithMinOcc.resize(Regions.size());
798 RegionsWithIGLPInstrs.resize(Regions.size());
799 RescheduleRegions.set();
800 RegionsWithHighRP.reset();
801 RegionsWithExcessRP.reset();
802 RegionsWithMinOcc.reset();
803 RegionsWithIGLPInstrs.reset();
805 runSchedStages();
808 void GCNScheduleDAGMILive::runSchedStages() {
809 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
811 if (!Regions.empty()) {
812 BBLiveInMap = getRegionLiveInMap();
813 if (GCNTrackers)
814 RegionLiveOuts.buildLiveRegMap();
817 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
818 while (S.advanceStage()) {
819 auto Stage = createSchedStage(S.getCurrentStage());
820 if (!Stage->initGCNSchedStage())
821 continue;
823 for (auto Region : Regions) {
824 RegionBegin = Region.first;
825 RegionEnd = Region.second;
826 // Set up for scheduling the region and check whether it should be skipped.
827 if (!Stage->initGCNRegion()) {
828 Stage->advanceRegion();
829 exitRegion();
830 continue;
833 if (GCNTrackers) {
834 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
835 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
836 GCNRPTracker::LiveRegSet *RegionLiveIns =
837 &LiveIns[Stage->getRegionIdx()];
839 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
840 ->reset(MRI, *RegionLiveIns);
841 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
842 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
843 Stage->getRegionIdx()));
846 ScheduleDAGMILive::schedule();
847 Stage->finalizeGCNRegion();
850 Stage->finalizeGCNSchedStage();
854 #ifndef NDEBUG
855 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
856 switch (StageID) {
857 case GCNSchedStageID::OccInitialSchedule:
858 OS << "Max Occupancy Initial Schedule";
859 break;
860 case GCNSchedStageID::UnclusteredHighRPReschedule:
861 OS << "Unclustered High Register Pressure Reschedule";
862 break;
863 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
864 OS << "Clustered Low Occupancy Reschedule";
865 break;
866 case GCNSchedStageID::PreRARematerialize:
867 OS << "Pre-RA Rematerialize";
868 break;
869 case GCNSchedStageID::ILPInitialSchedule:
870 OS << "Max ILP Initial Schedule";
871 break;
874 return OS;
876 #endif
878 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
879 : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
880 MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
882 bool GCNSchedStage::initGCNSchedStage() {
883 if (!DAG.LIS)
884 return false;
886 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
887 return true;
890 bool UnclusteredHighRPStage::initGCNSchedStage() {
891 if (DisableUnclusterHighRP)
892 return false;
894 if (!GCNSchedStage::initGCNSchedStage())
895 return false;
897 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
898 return false;
900 SavedMutations.swap(DAG.Mutations);
901 DAG.addMutation(
902 createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
904 InitialOccupancy = DAG.MinOccupancy;
905 // Aggressively try to reduce register pressure in the unclustered high RP
906 // stage. Temporarily increase the occupancy target in the region.
907 S.SGPRLimitBias = S.HighRPSGPRBias;
908 S.VGPRLimitBias = S.HighRPVGPRBias;
909 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
910 MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
912 LLVM_DEBUG(
913 dbgs()
914 << "Retrying function scheduling without clustering. "
915 "Aggressivly try to reduce register pressure to achieve occupancy "
916 << DAG.MinOccupancy << ".\n");
918 return true;
921 bool ClusteredLowOccStage::initGCNSchedStage() {
922 if (DisableClusteredLowOccupancy)
923 return false;
925 if (!GCNSchedStage::initGCNSchedStage())
926 return false;
928 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
929 // been dropped. All regions will have already been scheduled with the ideal
930 // occupancy targets.
931 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
932 return false;
934 LLVM_DEBUG(
935 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
936 << DAG.MinOccupancy << ".\n");
937 return true;
940 bool PreRARematStage::initGCNSchedStage() {
941 if (!GCNSchedStage::initGCNSchedStage())
942 return false;
944 if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
945 return false;
947 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
948 // Check maximum occupancy
949 if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
950 DAG.MinOccupancy)
951 return false;
953 // FIXME: This pass will invalidate cached MBBLiveIns for regions
954 // in between the defs and the region we sank the def to. Cached pressure
955 // for regions where a def is sunk from will also be invalidated. This will
956 // need to be fixed if there is another pass after this pass.
957 assert(!S.hasNextStage());
959 collectRematerializableInstructions();
960 if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
961 return false;
963 LLVM_DEBUG(
964 dbgs() << "Retrying function scheduling with improved occupancy of "
965 << DAG.MinOccupancy << " from rematerializing\n");
966 return true;
969 void GCNSchedStage::finalizeGCNSchedStage() {
970 DAG.finishBlock();
971 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
974 void UnclusteredHighRPStage::finalizeGCNSchedStage() {
975 SavedMutations.swap(DAG.Mutations);
976 S.SGPRLimitBias = S.VGPRLimitBias = 0;
977 if (DAG.MinOccupancy > InitialOccupancy) {
978 for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
979 DAG.RegionsWithMinOcc[IDX] =
980 DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
982 LLVM_DEBUG(dbgs() << StageID
983 << " stage successfully increased occupancy to "
984 << DAG.MinOccupancy << '\n');
987 GCNSchedStage::finalizeGCNSchedStage();
990 bool GCNSchedStage::initGCNRegion() {
991 // Check whether this new region is also a new block.
992 if (DAG.RegionBegin->getParent() != CurrentMBB)
993 setupNewBlock();
995 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
996 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
998 // Skip empty scheduling regions (0 or 1 schedulable instructions).
999 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1000 return false;
1002 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1003 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1004 << " " << CurrentMBB->getName()
1005 << "\n From: " << *DAG.begin() << " To: ";
1006 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1007 else dbgs() << "End";
1008 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1010 // Save original instruction order before scheduling for possible revert.
1011 Unsched.clear();
1012 Unsched.reserve(DAG.NumRegionInstrs);
1013 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1014 StageID == GCNSchedStageID::ILPInitialSchedule) {
1015 for (auto &I : DAG) {
1016 Unsched.push_back(&I);
1017 if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
1018 I.getOpcode() == AMDGPU::IGLP_OPT)
1019 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1021 } else {
1022 for (auto &I : DAG)
1023 Unsched.push_back(&I);
1026 PressureBefore = DAG.Pressure[RegionIdx];
1028 LLVM_DEBUG(
1029 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1030 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1031 << "Region live-in pressure: "
1032 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1033 << "Region register pressure: " << print(PressureBefore));
1035 S.HasHighPressure = false;
1036 S.KnownExcessRP = isRegionWithExcessRP();
1038 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1039 StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
1040 SavedMutations.clear();
1041 SavedMutations.swap(DAG.Mutations);
1042 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1043 StageID == GCNSchedStageID::ILPInitialSchedule;
1044 DAG.addMutation(createIGroupLPDAGMutation(
1045 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1046 : AMDGPU::SchedulingPhase::PreRAReentry));
1049 return true;
1052 bool UnclusteredHighRPStage::initGCNRegion() {
1053 // Only reschedule regions with the minimum occupancy or regions that may have
1054 // spilling (excess register pressure).
1055 if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
1056 DAG.MinOccupancy <= InitialOccupancy) &&
1057 !DAG.RegionsWithExcessRP[RegionIdx])
1058 return false;
1060 return GCNSchedStage::initGCNRegion();
1063 bool ClusteredLowOccStage::initGCNRegion() {
1064 // We may need to reschedule this region if it wasn't rescheduled in the last
1065 // stage, or if we found it was testing critical register pressure limits in
1066 // the unclustered reschedule stage. The latter is because we may not have been
1067 // able to raise the min occupancy in the previous stage so the region may be
1068 // overly constrained even if it was already rescheduled.
1069 if (!DAG.RegionsWithHighRP[RegionIdx])
1070 return false;
1072 return GCNSchedStage::initGCNRegion();
1075 bool PreRARematStage::initGCNRegion() {
1076 if (!DAG.RescheduleRegions[RegionIdx])
1077 return false;
1079 return GCNSchedStage::initGCNRegion();
1082 void GCNSchedStage::setupNewBlock() {
1083 if (CurrentMBB)
1084 DAG.finishBlock();
1086 CurrentMBB = DAG.RegionBegin->getParent();
1087 DAG.startBlock(CurrentMBB);
1088 // Get the real RP for the region if it hasn't been calculated before. After
1089 // the initial schedule stage, real RP will be collected after scheduling.
1090 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1091 StageID == GCNSchedStageID::ILPInitialSchedule)
1092 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1095 void GCNSchedStage::finalizeGCNRegion() {
1096 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1097 DAG.RescheduleRegions[RegionIdx] = false;
1098 if (S.HasHighPressure)
1099 DAG.RegionsWithHighRP[RegionIdx] = true;
1101 // Revert scheduling if we have dropped occupancy or there is some other
1102 // reason that the original schedule is better.
1103 checkScheduling();
1105 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1106 StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1107 SavedMutations.swap(DAG.Mutations);
1109 DAG.exitRegion();
1110 RegionIdx++;
1113 void GCNSchedStage::checkScheduling() {
1114 // Check the results of scheduling.
1115 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1117 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1118 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1120 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1121 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1122 DAG.Pressure[RegionIdx] = PressureAfter;
1123 DAG.RegionsWithMinOcc[RegionIdx] =
1124 PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1126 // Early out if we have achieved the occupancy target.
1127 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1128 return;
1131 unsigned TargetOccupancy =
1132 std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
1133 unsigned WavesAfter =
1134 std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
1135 unsigned WavesBefore =
1136 std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
1137 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1138 << ", after " << WavesAfter << ".\n");
1140 // We may not be able to keep the current target occupancy because of the just
1141 // scheduled region. We might still be able to revert scheduling if the
1142 // occupancy before was higher, or if the current schedule has register
1143 // pressure higher than the excess limits which could lead to more spilling.
1144 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1146 // Allow memory bound functions to drop to 4 waves if not limited by an
1147 // attribute.
1148 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1149 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1150 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1151 << MFI.getMinAllowedOccupancy() << " waves\n");
1152 NewOccupancy = WavesAfter;
1155 if (NewOccupancy < DAG.MinOccupancy) {
1156 DAG.MinOccupancy = NewOccupancy;
1157 MFI.limitOccupancy(DAG.MinOccupancy);
1158 DAG.RegionsWithMinOcc.reset();
1159 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1160 << DAG.MinOccupancy << ".\n");
1162 // The maximum number of arch VGPRs on a non-unified register file, or the
1163 // maximum VGPRs + AGPRs in the unified register file case.
1164 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1165 // The maximum number of arch VGPRs for both unified and non-unified register
1166 // files.
1167 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1168 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1170 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1171 PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
1172 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1173 PressureAfter.getSGPRNum() > MaxSGPRs) {
1174 DAG.RescheduleRegions[RegionIdx] = true;
1175 DAG.RegionsWithHighRP[RegionIdx] = true;
1176 DAG.RegionsWithExcessRP[RegionIdx] = true;
1179 // Revert if this region's schedule would cause a drop in occupancy or
1180 // spilling.
1181 if (shouldRevertScheduling(WavesAfter)) {
1182 revertScheduling();
1183 } else {
1184 DAG.Pressure[RegionIdx] = PressureAfter;
1185 DAG.RegionsWithMinOcc[RegionIdx] =
1186 PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1190 unsigned
1191 GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1192 DenseMap<unsigned, unsigned> &ReadyCycles,
1193 const TargetSchedModel &SM) {
1194 unsigned ReadyCycle = CurrCycle;
1195 for (auto &D : SU.Preds) {
1196 if (D.isAssignedRegDep()) {
1197 MachineInstr *DefMI = D.getSUnit()->getInstr();
1198 unsigned Latency = SM.computeInstrLatency(DefMI);
1199 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1200 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1203 ReadyCycles[SU.NodeNum] = ReadyCycle;
1204 return ReadyCycle;
1207 #ifndef NDEBUG
1208 struct EarlierIssuingCycle {
1209 bool operator()(std::pair<MachineInstr *, unsigned> A,
1210 std::pair<MachineInstr *, unsigned> B) const {
1211 return A.second < B.second;
1215 static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1216 EarlierIssuingCycle> &ReadyCycles) {
1217 if (ReadyCycles.empty())
1218 return;
1219 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1220 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1221 << " ##################\n# Cycle #\t\t\tInstruction "
1223 " \n";
1224 unsigned IPrev = 1;
1225 for (auto &I : ReadyCycles) {
1226 if (I.second > IPrev + 1)
1227 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1228 << " CYCLES DETECTED ******************************\n\n";
1229 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1230 IPrev = I.second;
1233 #endif
1235 ScheduleMetrics
1236 GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1237 #ifndef NDEBUG
1238 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1239 ReadyCyclesSorted;
1240 #endif
1241 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1242 unsigned SumBubbles = 0;
1243 DenseMap<unsigned, unsigned> ReadyCycles;
1244 unsigned CurrCycle = 0;
1245 for (auto &SU : InputSchedule) {
1246 unsigned ReadyCycle =
1247 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1248 SumBubbles += ReadyCycle - CurrCycle;
1249 #ifndef NDEBUG
1250 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1251 #endif
1252 CurrCycle = ++ReadyCycle;
1254 #ifndef NDEBUG
1255 LLVM_DEBUG(
1256 printScheduleModel(ReadyCyclesSorted);
1257 dbgs() << "\n\t"
1258 << "Metric: "
1259 << (SumBubbles
1260 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1261 : 1)
1262 << "\n\n");
1263 #endif
1265 return ScheduleMetrics(CurrCycle, SumBubbles);
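// Illustrative arithmetic with assumed values: a schedule whose last
// instruction becomes ready at cycle 40 with 10 accumulated bubble cycles
// yields a metric of (10 * ScaleFactor) / 40 = 25; fewer bubbles per cycle
// means a smaller (better) metric.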
1268 ScheduleMetrics
1269 GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1270 #ifndef NDEBUG
1271 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1272 ReadyCyclesSorted;
1273 #endif
1274 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1275 unsigned SumBubbles = 0;
1276 DenseMap<unsigned, unsigned> ReadyCycles;
1277 unsigned CurrCycle = 0;
1278 for (auto &MI : DAG) {
1279 SUnit *SU = DAG.getSUnit(&MI);
1280 if (!SU)
1281 continue;
1282 unsigned ReadyCycle =
1283 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1284 SumBubbles += ReadyCycle - CurrCycle;
1285 #ifndef NDEBUG
1286 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1287 #endif
1288 CurrCycle = ++ReadyCycle;
1290 #ifndef NDEBUG
1291 LLVM_DEBUG(
1292 printScheduleModel(ReadyCyclesSorted);
1293 dbgs() << "\n\t"
1294 << "Metric: "
1295 << (SumBubbles
1296 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1297 : 1)
1298 << "\n\n");
1299 #endif
1301 return ScheduleMetrics(CurrCycle, SumBubbles);
1304 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1305 if (WavesAfter < DAG.MinOccupancy)
1306 return true;
1308 return false;
1311 bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1312 if (PressureAfter == PressureBefore)
1313 return false;
1315 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1316 return true;
1318 if (mayCauseSpilling(WavesAfter))
1319 return true;
1321 return false;
1324 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1325 // If RP is not reduced in the unclustered reschedule stage, revert to the
1326 // old schedule.
1327 if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
1328 mayCauseSpilling(WavesAfter)) ||
1329 GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1330 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1331 return true;
1334 // Do not attempt to relax schedule even more if we are already spilling.
1335 if (isRegionWithExcessRP())
1336 return false;
1338 LLVM_DEBUG(
1339 dbgs()
1340 << "\n\t *** In shouldRevertScheduling ***\n"
1341 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
1342 ScheduleMetrics MBefore =
1343 getScheduleMetrics(DAG.SUnits);
1344 LLVM_DEBUG(
1345 dbgs()
1346 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1347 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1348 unsigned OldMetric = MBefore.getMetric();
1349 unsigned NewMetric = MAfter.getMetric();
1350 unsigned WavesBefore =
1351 std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
1352 unsigned Profit =
1353 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1354 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1355 NewMetric) /
1356 ScheduleMetrics::ScaleFactor;
1357 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1358 << MAfter << "Profit: " << Profit << "\n");
1359 return Profit < ScheduleMetrics::ScaleFactor;
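// Illustrative arithmetic with assumed values: WavesBefore = 4, WavesAfter = 5,
// OldMetric = 50, NewMetric = 60 and the default ScheduleMetricBias of 10 give
// Profit = ((5 * 100) / 4 * ((50 + 10) * 100) / 60) / 100
//        = (125 * 6000 / 60) / 100 = 125 >= ScaleFactor (100),
// so the unclustered schedule is kept rather than reverted.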
1362 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1363 if (PressureAfter == PressureBefore)
1364 return false;
1366 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1367 return true;
1369 if (mayCauseSpilling(WavesAfter))
1370 return true;
1372 return false;
1375 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1376 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1377 return true;
1379 if (mayCauseSpilling(WavesAfter))
1380 return true;
1382 return false;
1385 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1386 if (mayCauseSpilling(WavesAfter))
1387 return true;
1389 return false;
1392 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1393 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1394 !PressureAfter.less(MF, PressureBefore)) {
1395 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1396 return true;
1399 return false;
1402 void GCNSchedStage::revertScheduling() {
1403 DAG.RegionsWithMinOcc[RegionIdx] =
1404 PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
1405 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1406 DAG.RescheduleRegions[RegionIdx] =
1407 S.hasNextStage() &&
1408 S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
1409 DAG.RegionEnd = DAG.RegionBegin;
1410 int SkippedDebugInstr = 0;
1411 for (MachineInstr *MI : Unsched) {
1412 if (MI->isDebugInstr()) {
1413 ++SkippedDebugInstr;
1414 continue;
1417 if (MI->getIterator() != DAG.RegionEnd) {
1418 DAG.BB->remove(MI);
1419 DAG.BB->insert(DAG.RegionEnd, MI);
1420 if (!MI->isDebugInstr())
1421 DAG.LIS->handleMove(*MI, true);
1424 // Reset read-undef flags and update them later.
1425 for (auto &Op : MI->all_defs())
1426 Op.setIsUndef(false);
1427 RegisterOperands RegOpers;
1428 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1429 if (!MI->isDebugInstr()) {
1430 if (DAG.ShouldTrackLaneMasks) {
1431 // Adjust liveness and add missing dead+read-undef flags.
1432 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1433 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1434 } else {
1435 // Adjust for missing dead-def flags.
1436 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1439 DAG.RegionEnd = MI->getIterator();
1440 ++DAG.RegionEnd;
1441 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1444 // After reverting the schedule, debug instrs will now be at the end of the
1445 // block and RegionEnd will point to the first debug instr. Increment
1446 // RegionEnd past debug instrs to the actual end of the scheduling region.
1447 while (SkippedDebugInstr-- > 0)
1448 ++DAG.RegionEnd;
1450 // If Unsched.front() instruction is a debug instruction, this will actually
1451 // shrink the region since we moved all debug instructions to the end of the
1452 // block. Find the first instruction that is not a debug instruction.
1453 DAG.RegionBegin = Unsched.front()->getIterator();
1454 if (DAG.RegionBegin->isDebugInstr()) {
1455 for (MachineInstr *MI : Unsched) {
1456 if (MI->isDebugInstr())
1457 continue;
1458 DAG.RegionBegin = MI->getIterator();
1459 break;
1463 // Then move the debug instructions back into their correct place and set
1464 // RegionBegin and RegionEnd if needed.
1465 DAG.placeDebugValues();
1467 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1470 void PreRARematStage::collectRematerializableInstructions() {
1471 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
1472 for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
1473 Register Reg = Register::index2VirtReg(I);
1474 if (!DAG.LIS->hasInterval(Reg))
1475 continue;
1477 // TODO: Handle AGPR and SGPR rematerialization
1478 if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
1479 !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
1480 continue;
1482 MachineOperand *Op = DAG.MRI.getOneDef(Reg);
1483 MachineInstr *Def = Op->getParent();
1484 if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
1485 continue;
1487 MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
1488 if (Def->getParent() == UseI->getParent())
1489 continue;
1491 // We are only collecting defs that are defined in another block and are
1492 // live-through or used inside regions at MinOccupancy. This means that the
1493 // register must be in the live-in set for the region.
1494 bool AddedToRematList = false;
1495 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1496 auto It = DAG.LiveIns[I].find(Reg);
1497 if (It != DAG.LiveIns[I].end() && !It->second.none()) {
1498 if (DAG.RegionsWithMinOcc[I]) {
1499 RematerializableInsts[I][Def] = UseI;
1500 AddedToRematList = true;
1503 // Collect regions with the rematerializable reg as a live-in to avoid
1504 // searching later when updating RP.
1505 RematDefToLiveInRegions[Def].push_back(I);
1508 if (!AddedToRematList)
1509 RematDefToLiveInRegions.erase(Def);
1513 bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
1514 const TargetInstrInfo *TII) {
1515 // Temporary copies of cached variables we will be modifying and replacing if
1516 // sinking succeeds.
1517 SmallVector<
1518 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
1519 NewRegions;
1520 DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
1521 DenseMap<unsigned, GCNRegPressure> NewPressure;
1522 BitVector NewRescheduleRegions;
1523 LiveIntervals *LIS = DAG.LIS;
1525 NewRegions.resize(DAG.Regions.size());
1526 NewRescheduleRegions.resize(DAG.Regions.size());
1528 // Collect only regions that have a rematerializable def as a live-in.
1529 SmallSet<unsigned, 16> ImpactedRegions;
1530 for (const auto &It : RematDefToLiveInRegions)
1531 ImpactedRegions.insert(It.second.begin(), It.second.end());
1533 // Make copies of register pressure and live-ins cache that will be updated
1534 // as we rematerialize.
1535 for (auto Idx : ImpactedRegions) {
1536 NewPressure[Idx] = DAG.Pressure[Idx];
1537 NewLiveIns[Idx] = DAG.LiveIns[Idx];
1539 NewRegions = DAG.Regions;
1540 NewRescheduleRegions.reset();
1542 DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
1543 bool Improved = false;
1544 for (auto I : ImpactedRegions) {
1545 if (!DAG.RegionsWithMinOcc[I])
1546 continue;
1548 Improved = false;
1549 int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
1550 int SGPRUsage = NewPressure[I].getSGPRNum();
1552 // TODO: Handle occupancy drop due to AGPR and SGPR.
1553 // Check if the cause of the occupancy drop is VGPR usage and not SGPR.
1554 if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
1555 break;
1557 // The occupancy of this region could have been improved by a previous
1558 // iteration's sinking of defs.
1559 if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
1560 NewRescheduleRegions[I] = true;
1561 Improved = true;
1562 continue;
1565 // First check if we have enough trivially rematerializable instructions to
1566 // improve occupancy. Optimistically assume all instructions we are able to
1567 // sink decrease RP.
1568 int TotalSinkableRegs = 0;
1569 for (const auto &It : RematerializableInsts[I]) {
1570 MachineInstr *Def = It.first;
1571 Register DefReg = Def->getOperand(0).getReg();
1572 TotalSinkableRegs +=
1573 SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
1575 int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
1576 unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
1577 // If in the most optimistic scenario, we cannot improve occupancy, then do
1578 // not attempt to sink any instructions.
1579 if (OptimisticOccupancy <= DAG.MinOccupancy)
1580 break;
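    // Sink defs one at a time, recomputing live-ins and pressure for every
    // affected region after each sink, and stop as soon as this region's
    // occupancy rises above MinOccupancy.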
    unsigned ImproveOccupancy = 0;
    SmallVector<MachineInstr *, 4> SinkedDefs;
    for (auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      MachineBasicBlock::iterator InsertPos =
          MachineBasicBlock::iterator(It.second);
      Register Reg = Def->getOperand(0).getReg();
      // Rematerialize MI to its use block. Since we are only rematerializing
      // instructions that do not have any virtual reg uses, we do not need to
      // call LiveRangeEdit::allUsesAvailableAt() and
      // LiveRangeEdit::canRematerializeAt().
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
      MachineInstr *NewMI = &*std::prev(InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
      InsertedMIToOldDef[NewMI] = Def;

      // Update region boundaries in the scheduling region we sunk from, since
      // we may sink an instruction that was at the beginning or end of its
      // region.
      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                                 /*Removing =*/true);

      // Update region boundaries in the region we sunk to.
      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);

      LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
      for (auto Idx : RematDefToLiveInRegions[Def]) {
        NewLiveIns[Idx].erase(Reg);
        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
          // Def is live-through and not used in this block.
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
        } else {
          // Def is used and rematerialized into this block.
          GCNDownwardRPTracker RPT(*LIS);
          auto *NonDbgMI = &*skipDebugInstructionsForward(
              NewRegions[Idx].first, NewRegions[Idx].second);
          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
          RPT.advance(NewRegions[Idx].second);
          NewPressure[Idx] = RPT.moveMaxPressure();
        }
      }

      SinkedDefs.push_back(Def);
      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
      if (ImproveOccupancy > DAG.MinOccupancy)
        break;
    }

    // Remove defs we just sunk from all regions' lists of sinkable defs.
    for (auto &Def : SinkedDefs)
      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
        RematerializableInsts[TrackedIdx].erase(Def);

    if (ImproveOccupancy <= DAG.MinOccupancy)
      break;

    NewRescheduleRegions[I] = true;
    Improved = true;
  }

  if (!Improved) {
    // Occupancy was not improved for all regions that were at MinOccupancy.
    // Undo sinking and remove newly rematerialized instructions.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy was improved for all regions.
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    DAG.BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  // Update live-ins, register pressure, and regions caches.
  for (auto Idx : ImpactedRegions) {
    DAG.LiveIns[Idx] = NewLiveIns[Idx];
    DAG.Pressure[Idx] = NewPressure[Idx];
    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
  }
  DAG.Regions = NewRegions;
  DAG.RescheduleRegions = NewRescheduleRegions;

  if (GCNTrackers)
    DAG.RegionLiveOuts.buildLiveRegMap();

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  return true;
}

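// A def qualifies for sinking only if it is trivially rematerializable and
// reads no virtual registers; the latter is what lets the sinking above skip
// the LiveRangeEdit availability checks at the new insertion point.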
// Copied from MachineLICM
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
  if (!DAG.TII->isTriviallyReMaterializable(MI))
    return false;

  for (const MachineOperand &MO : MI.all_uses())
    if (MO.getReg().isVirtual())
      return false;

  return true;
}

// When removing, we will have to check both the beginning and the end of the
// region. When inserting, we only have to check whether we are inserting NewMI
// in front of a scheduling region and do not need to check the end, since we
// will only ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
                              MachineBasicBlock::iterator>> &RegionBoundaries,
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = RegionBoundaries.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != RegionBoundaries[I].first->getParent())
      return;

    if (Removing && MI == RegionBoundaries[I].first &&
        MI == RegionBoundaries[I].second) {
      // MI is in a region of size 1; after removal the region becomes empty,
      // so set RegionBegin and RegionEnd to the block's past-the-end iterator.
      RegionBoundaries[I] =
          std::pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == RegionBoundaries[I].first) {
      if (Removing)
        RegionBoundaries[I] =
            std::pair(std::next(MI), RegionBoundaries[I].second);
      else
        // Inserted NewMI in front of region, set new RegionBegin to NewMI.
        RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
                                        RegionBoundaries[I].second);
      return;
    }
    if (Removing && MI == RegionBoundaries[I].second) {
      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
      return;
    }
  }
}

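// Returns true if any instruction in the scheduling region is an IGLP
// scheduling directive (SCHED_GROUP_BARRIER or IGLP_OPT).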
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
    unsigned Opc = MI->getOpcode();
    return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
  });
}

GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
    bool RemoveKillFlags)
    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}

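// Regions containing IGLP directives get the post-RA IGroupLP mutation instead
// of the default mutation set; the default set is stashed here and restored in
// finalizeSchedule().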
void GCNPostScheduleDAGMILive::schedule() {
  HasIGLPInstrs = hasIGLPInstrs(this);
  if (HasIGLPInstrs) {
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
  }

  ScheduleDAGMI::schedule();
}

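// Restore the default mutations that schedule() stashed away for IGLP regions.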
void GCNPostScheduleDAGMILive::finalizeSchedule() {
  if (HasIGLPInstrs)
    SavedMutations.swap(Mutations);

  ScheduleDAGMI::finalizeSchedule();
}