1 //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This contains a MachineSchedStrategy implementation for maximizing wave
11 /// occupancy on GCN hardware.
12 ///
13 /// This pass will apply multiple scheduling stages to the same function.
14 /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15 /// entry point for the scheduling of those regions is
16 /// GCNScheduleDAGMILive::runSchedStages.
18 /// Generally, the reason for having multiple scheduling stages is to account
19 /// for the kernel-wide effect of register usage on occupancy. Usually, only a
20 /// few scheduling regions will have register pressure high enough to limit
21 /// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22 /// other regions.
23 ///
24 //===----------------------------------------------------------------------===//
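// Editorial note (not part of the upstream file): for the max-occupancy
// strategy below, runSchedStages() visits the stages in the order they are
// pushed by GCNMaxOccupancySchedStrategy, each stage re-scheduling only the
// regions its initGCNRegion() accepts:
//   OccInitialSchedule -> UnclusteredHighRPReschedule
//     -> ClusteredLowOccupancyReschedule -> PreRARematerialize
// The max-ILP strategy instead runs a single ILPInitialSchedule stage.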
26 #include "GCNSchedStrategy.h"
27 #include "AMDGPUIGroupLP.h"
28 #include "SIMachineFunctionInfo.h"
29 #include "llvm/CodeGen/RegisterClassInfo.h"
31 #define DEBUG_TYPE "machine-scheduler"
33 using namespace llvm;
35 static cl::opt<bool> DisableUnclusterHighRP(
36 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
37 cl::desc("Disable unclustered high register pressure "
38 "reduction scheduling stage."),
39 cl::init(false));
41 static cl::opt<bool> DisableClusteredLowOccupancy(
42 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
43 cl::desc("Disable clustered low occupancy "
44 "rescheduling for ILP scheduling stage."),
45 cl::init(false));
47 static cl::opt<unsigned> ScheduleMetricBias(
48 "amdgpu-schedule-metric-bias", cl::Hidden,
49 cl::desc(
50 "Sets the bias which adds weight to occupancy vs latency. Set it to "
51 "100 to chase the occupancy only."),
52 cl::init(10));
54 static cl::opt<bool>
55 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
56 cl::desc("Relax occupancy targets for kernels which are memory "
57 "bound (amdgpu-membound-threshold), or "
58 "Wave Limited (amdgpu-limit-wave-threshold)."),
59 cl::init(false));
61 static cl::opt<bool> GCNTrackers(
62 "amdgpu-use-amdgpu-trackers", cl::Hidden,
63 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
64 cl::init(false));
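// Illustrative usage (assumed invocation; these cl::opt flags are parsed by
// any tool that runs this scheduler, e.g. llc):
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 \
//       -amdgpu-schedule-metric-bias=100 -amdgpu-use-amdgpu-trackers kernel.ll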
66 const unsigned ScheduleMetrics::ScaleFactor = 100;
68 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
69 : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
70 DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
73 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
74 GenericScheduler::initialize(DAG);
76 MF = &DAG->MF;
78 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
80 SGPRExcessLimit =
81 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
82 VGPRExcessLimit =
83 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
85 SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
86 // Set the initial TargetOccupancy to the maximum occupancy that we can
87 // achieve for this function. This effectively sets a lower bound on the
88 // 'Critical' register limits in the scheduler.
89 // Allow for lower occupancy targets if the kernel is wave limited or memory
90 // bound and the relaxed occupancy feature is enabled.
91 TargetOccupancy =
92 RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
93 SGPRCriticalLimit =
94 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
96 if (!KnownExcessRP) {
97 VGPRCriticalLimit =
98 std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
99 } else {
100 // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
101 // it returns a reasonably small number for targets with lots of VGPRs, such
102 // as GFX10 and GFX11.
103 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
104 "VGPRCriticalLimit calculation method.\n");
106 unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
107 unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
108 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
109 VGPRBudget = std::max(VGPRBudget, Granule);
110 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
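// Illustrative arithmetic with assumed values: if Addressable = 256,
// TargetOccupancy = 8 and Granule = 8, then VGPRBudget =
// alignDown(256 / 8, 8) = 32 and VGPRCriticalLimit = min(32, VGPRExcessLimit).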
113 // Subtract error margin and bias from register limits and avoid overflow.
114 SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
115 VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
116 SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
117 VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
119 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
120 << ", VGPRExcessLimit = " << VGPRExcessLimit
121 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
122 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
125 /// Checks whether \p SU can use the cached DAG pressure diffs to compute the
126 /// current register pressure.
128 /// This works for the common case, but it has a few exceptions that have been
129 /// observed through trial and error:
130 /// - Explicit physical register operands
131 /// - Subregister definitions
133 /// In both of those cases, PressureDiff doesn't represent the actual pressure,
134 /// and querying LiveIntervals through the RegPressureTracker is needed to get
135 /// an accurate value.
137 /// We should eventually only use PressureDiff for maximum performance, but this
138 /// already allows 80% of SUs to take the fast path without changing scheduling
139 /// at all. Further changes would either change scheduling, or require a lot
140 /// more logic to recover an accurate pressure estimate from the PressureDiffs.
141 static bool canUsePressureDiffs(const SUnit &SU) {
142 if (!SU.isInstr())
143 return false;
145 // Cannot use pressure diffs for subregister defs or with physregs; they are
146 // imprecise in both cases.
147 for (const auto &Op : SU.getInstr()->operands()) {
148 if (!Op.isReg() || Op.isImplicit())
149 continue;
150 if (Op.getReg().isPhysical() ||
151 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
152 return false;
154 return true;
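// Illustrative examples (not from the upstream file): an SU whose instruction
// defines a subregister (e.g. a V_MOV_B32 writing %0.sub0 of a vreg_64), or
// one with an explicit physical register operand (e.g. a COPY into $vgpr0),
// takes the slow RegPressureTracker path; a plain full-register virtual def
// can use the cached PressureDiffs fast path.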
157 static void getRegisterPressures(
158 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
159 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
160 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
161 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
162 // getDownwardPressure() and getUpwardPressure() make temporary changes to
163 // the tracker, so we need to pass those functions a non-const copy.
164 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
165 if (!GCNTrackers) {
166 AtTop
167 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
168 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
170 return;
173 // GCNTrackers
174 Pressure.resize(4, 0);
175 MachineInstr *MI = SU->getInstr();
176 GCNRegPressure NewPressure;
177 if (AtTop) {
178 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
179 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
180 } else {
181 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
182 TempUpwardTracker.recede(*MI);
183 NewPressure = TempUpwardTracker.getPressure();
185 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
186 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
187 NewPressure.getArchVGPRNum();
188 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
191 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
192 bool AtTop,
193 const RegPressureTracker &RPTracker,
194 const SIRegisterInfo *SRI,
195 unsigned SGPRPressure,
196 unsigned VGPRPressure, bool IsBottomUp) {
197 Cand.SU = SU;
198 Cand.AtTop = AtTop;
200 if (!DAG->isTrackingPressure())
201 return;
203 Pressure.clear();
204 MaxPressure.clear();
206 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
207 // possible over querying the RegPressureTracker.
209 // The RegPressureTracker will make a lot of LIS queries, which are very
210 // expensive; it is considered a slow function in this context.
212 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
213 // trivial lookup into an array. It is pretty much free.
215 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
216 // PressureDiffs.
217 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
218 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
219 DownwardTracker, UpwardTracker, DAG, SRI);
220 } else {
221 // Reserve 4 slots.
222 Pressure.resize(4, 0);
223 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
224 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
226 for (const auto &Diff : DAG->getPressureDiff(SU)) {
227 if (!Diff.isValid())
228 continue;
229 // PressureDiffs is always bottom-up so if we're working top-down we need
230 // to invert its sign.
231 Pressure[Diff.getPSet()] +=
232 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
235 #ifdef EXPENSIVE_CHECKS
236 std::vector<unsigned> CheckPressure, CheckMaxPressure;
237 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
238 DownwardTracker, UpwardTracker, DAG, SRI);
239 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
240 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
241 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
242 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
243 errs() << "Register Pressure is inaccurate when calculated through "
244 "PressureDiff\n"
245 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
246 << ", expected "
247 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
248 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
249 << ", expected "
250 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
251 report_fatal_error("inaccurate register pressure calculation");
253 #endif
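// Note: this verification path is only compiled in when LLVM is configured
// with expensive checks enabled, e.g. -DLLVM_ENABLE_EXPENSIVE_CHECKS=ON.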
256 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
257 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
259 // If two instructions increase the pressure of different register sets
260 // by the same amount, the generic scheduler will prefer to schedule the
261 // instruction that increases the set with the fewest registers,
262 // which in our case would be SGPRs. This is rarely what we want, so
263 // when we report excess/critical register pressure, we do it either
264 // only for VGPRs or only for SGPRs.
266 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
267 const unsigned MaxVGPRPressureInc = 16;
268 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
269 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
271 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
272 // to increase the likelihood we don't go over the limits. We should improve
273 // the analysis to look through dependencies to find the path with the least
274 // register pressure.
276 // We only need to update the RPDelta for instructions that increase register
277 // pressure. Instructions that decrease or keep reg pressure the same will be
278 // marked as RegExcess in tryCandidate() when they are compared with
279 // instructions that increase the register pressure.
280 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
281 HasHighPressure = true;
282 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
283 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
286 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
287 HasHighPressure = true;
288 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
289 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
292 // Register pressure is considered 'CRITICAL' if it is approaching a value
293 // that would reduce the wave occupancy for the execution unit. When
294 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
295 // have the same cost, so we don't need to prefer one over the other.
297 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
298 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
300 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
301 HasHighPressure = true;
302 if (SGPRDelta > VGPRDelta) {
303 Cand.RPDelta.CriticalMax =
304 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
305 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
306 } else {
307 Cand.RPDelta.CriticalMax =
308 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
309 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
314 // This function is mostly cut and pasted from
315 // GenericScheduler::pickNodeFromQueue()
316 void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
317 const CandPolicy &ZonePolicy,
318 const RegPressureTracker &RPTracker,
319 SchedCandidate &Cand,
320 bool IsBottomUp) {
321 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
322 ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
323 unsigned SGPRPressure = 0;
324 unsigned VGPRPressure = 0;
325 if (DAG->isTrackingPressure()) {
326 if (!GCNTrackers) {
327 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
328 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
329 } else {
330 GCNRPTracker *T = IsBottomUp
331 ? static_cast<GCNRPTracker *>(&UpwardTracker)
332 : static_cast<GCNRPTracker *>(&DownwardTracker);
333 SGPRPressure = T->getPressure().getSGPRNum();
334 VGPRPressure = T->getPressure().getArchVGPRNum();
337 ReadyQueue &Q = Zone.Available;
338 for (SUnit *SU : Q) {
340 SchedCandidate TryCand(ZonePolicy);
341 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
342 VGPRPressure, IsBottomUp);
343 // Pass SchedBoundary only when comparing nodes from the same boundary.
344 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
345 tryCandidate(Cand, TryCand, ZoneArg);
346 if (TryCand.Reason != NoCand) {
347 // Initialize resource delta if needed in case future heuristics query it.
348 if (TryCand.ResDelta == SchedResourceDelta())
349 TryCand.initResourceDelta(Zone.DAG, SchedModel);
350 Cand.setBest(TryCand);
351 LLVM_DEBUG(traceCandidate(Cand));
356 // This function is mostly cut and pasted from
357 // GenericScheduler::pickNodeBidirectional()
358 SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
359 // Schedule as far as possible in the direction of no choice. This is most
360 // efficient, but also provides the best heuristics for CriticalPSets.
361 if (SUnit *SU = Bot.pickOnlyChoice()) {
362 IsTopNode = false;
363 return SU;
365 if (SUnit *SU = Top.pickOnlyChoice()) {
366 IsTopNode = true;
367 return SU;
369 // Set the bottom-up policy based on the state of the current bottom zone and
370 // the instructions outside the zone, including the top zone.
371 CandPolicy BotPolicy;
372 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
373 // Set the top-down policy based on the state of the current top zone and
374 // the instructions outside the zone, including the bottom zone.
375 CandPolicy TopPolicy;
376 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
378 // See if BotCand is still valid (because we previously scheduled from Top).
379 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
380 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
381 BotCand.Policy != BotPolicy) {
382 BotCand.reset(CandPolicy());
383 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
384 /*IsBottomUp=*/true);
385 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
386 } else {
387 LLVM_DEBUG(traceCandidate(BotCand));
388 #ifndef NDEBUG
389 if (VerifyScheduling) {
390 SchedCandidate TCand;
391 TCand.reset(CandPolicy());
392 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
393 /*IsBottomUp=*/true);
394 assert(TCand.SU == BotCand.SU &&
395 "Last pick result should correspond to re-picking right now");
397 #endif
400 // Check if the top Q has a better candidate.
401 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
402 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
403 TopCand.Policy != TopPolicy) {
404 TopCand.reset(CandPolicy());
405 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
406 /*IsBottomUp=*/false);
407 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
408 } else {
409 LLVM_DEBUG(traceCandidate(TopCand));
410 #ifndef NDEBUG
411 if (VerifyScheduling) {
412 SchedCandidate TCand;
413 TCand.reset(CandPolicy());
414 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
415 /*IsBottomUp=*/false);
416 assert(TCand.SU == TopCand.SU &&
417 "Last pick result should correspond to re-picking right now");
419 #endif
422 // Pick best from BotCand and TopCand.
423 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
424 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
425 SchedCandidate Cand = BotCand;
426 TopCand.Reason = NoCand;
427 tryCandidate(Cand, TopCand, nullptr);
428 if (TopCand.Reason != NoCand) {
429 Cand.setBest(TopCand);
431 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
433 IsTopNode = Cand.AtTop;
434 return Cand.SU;
437 // This function is mostly cut and pasted from
438 // GenericScheduler::pickNode()
439 SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
440 if (DAG->top() == DAG->bottom()) {
441 assert(Top.Available.empty() && Top.Pending.empty() &&
442 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
443 return nullptr;
445 SUnit *SU;
446 do {
447 if (RegionPolicy.OnlyTopDown) {
448 SU = Top.pickOnlyChoice();
449 if (!SU) {
450 CandPolicy NoPolicy;
451 TopCand.reset(NoPolicy);
452 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
453 /*IsBottomUp=*/false);
454 assert(TopCand.Reason != NoCand && "failed to find a candidate");
455 SU = TopCand.SU;
457 IsTopNode = true;
458 } else if (RegionPolicy.OnlyBottomUp) {
459 SU = Bot.pickOnlyChoice();
460 if (!SU) {
461 CandPolicy NoPolicy;
462 BotCand.reset(NoPolicy);
463 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
464 /*IsBottomUp=*/true);
465 assert(BotCand.Reason != NoCand && "failed to find a candidate");
466 SU = BotCand.SU;
468 IsTopNode = false;
469 } else {
470 SU = pickNodeBidirectional(IsTopNode);
472 } while (SU->isScheduled);
474 if (SU->isTopReady())
475 Top.removeReady(SU);
476 if (SU->isBottomReady())
477 Bot.removeReady(SU);
479 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
480 << *SU->getInstr());
481 return SU;
484 void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
485 if (GCNTrackers) {
486 MachineInstr *MI = SU->getInstr();
487 IsTopNode ? (void)DownwardTracker.advance(MI, false)
488 : UpwardTracker.recede(*MI);
491 return GenericScheduler::schedNode(SU, IsTopNode);
494 GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
495 assert(CurrentStage && CurrentStage != SchedStages.end());
496 return *CurrentStage;
499 bool GCNSchedStrategy::advanceStage() {
500 assert(CurrentStage != SchedStages.end());
501 if (!CurrentStage)
502 CurrentStage = SchedStages.begin();
503 else
504 CurrentStage++;
506 return CurrentStage != SchedStages.end();
509 bool GCNSchedStrategy::hasNextStage() const {
510 assert(CurrentStage);
511 return std::next(CurrentStage) != SchedStages.end();
514 GCNSchedStageID GCNSchedStrategy::getNextStage() const {
515 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
516 return *std::next(CurrentStage);
519 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
520 const MachineSchedContext *C, bool IsLegacyScheduler)
521 : GCNSchedStrategy(C) {
522 SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
523 SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
524 SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
525 SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
526 GCNTrackers = GCNTrackers & !IsLegacyScheduler;
529 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
530 : GCNSchedStrategy(C) {
531 SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
534 bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
535 SchedCandidate &TryCand,
536 SchedBoundary *Zone) const {
537 // Initialize the candidate if needed.
538 if (!Cand.isValid()) {
539 TryCand.Reason = NodeOrder;
540 return true;
543 // Avoid spilling by exceeding the register limit.
544 if (DAG->isTrackingPressure() &&
545 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
546 RegExcess, TRI, DAG->MF))
547 return TryCand.Reason != NoCand;
549 // Bias PhysReg defs and copies towards their uses and definitions, respectively.
550 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
551 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
552 return TryCand.Reason != NoCand;
554 bool SameBoundary = Zone != nullptr;
555 if (SameBoundary) {
556 // Prioritize instructions that read unbuffered resources by stall cycles.
557 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
558 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
559 return TryCand.Reason != NoCand;
561 // Avoid critical resource consumption and balance the schedule.
562 TryCand.initResourceDelta(DAG, SchedModel);
563 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
564 TryCand, Cand, ResourceReduce))
565 return TryCand.Reason != NoCand;
566 if (tryGreater(TryCand.ResDelta.DemandedResources,
567 Cand.ResDelta.DemandedResources, TryCand, Cand,
568 ResourceDemand))
569 return TryCand.Reason != NoCand;
571 // Unconditionally try to reduce latency.
572 if (tryLatency(TryCand, Cand, *Zone))
573 return TryCand.Reason != NoCand;
575 // Weak edges are for clustering and other constraints.
576 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
577 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
578 return TryCand.Reason != NoCand;
581 // Keep clustered nodes together to encourage downstream peephole
582 // optimizations which may reduce resource requirements.
584 // This is a best effort to set things up for a post-RA pass. Optimizations
585 // like generating loads of multiple registers should ideally be done within
586 // the scheduler pass by combining the loads during DAG postprocessing.
587 const SUnit *CandNextClusterSU =
588 Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
589 const SUnit *TryCandNextClusterSU =
590 TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
591 if (tryGreater(TryCand.SU == TryCandNextClusterSU,
592 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
593 return TryCand.Reason != NoCand;
595 // Avoid increasing the max critical pressure in the scheduled region.
596 if (DAG->isTrackingPressure() &&
597 tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
598 TryCand, Cand, RegCritical, TRI, DAG->MF))
599 return TryCand.Reason != NoCand;
601 // Avoid increasing the max pressure of the entire region.
602 if (DAG->isTrackingPressure() &&
603 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
604 Cand, RegMax, TRI, DAG->MF))
605 return TryCand.Reason != NoCand;
607 if (SameBoundary) {
608 // Fall through to original instruction order.
609 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
610 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
611 TryCand.Reason = NodeOrder;
612 return true;
615 return false;
618 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
619 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
620 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
621 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
622 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
623 RegionLiveOuts(this, /*IsLiveOut=*/true) {
625 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
626 if (RelaxedOcc) {
627 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
628 if (MinOccupancy != StartingOccupancy)
629 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
630 << ".\n");
634 std::unique_ptr<GCNSchedStage>
635 GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
636 switch (SchedStageID) {
637 case GCNSchedStageID::OccInitialSchedule:
638 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
639 case GCNSchedStageID::UnclusteredHighRPReschedule:
640 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
641 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
642 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
643 case GCNSchedStageID::PreRARematerialize:
644 return std::make_unique<PreRARematStage>(SchedStageID, *this);
645 case GCNSchedStageID::ILPInitialSchedule:
646 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
649 llvm_unreachable("Unknown SchedStageID.");
652 void GCNScheduleDAGMILive::schedule() {
653 // Collect all scheduling regions. The actual scheduling is performed in
654 // GCNScheduleDAGMILive::finalizeSchedule.
655 Regions.push_back(std::pair(RegionBegin, RegionEnd));
658 GCNRegPressure
659 GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
660 GCNDownwardRPTracker RPTracker(*LIS);
661 RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
662 return RPTracker.moveMaxPressure();
665 static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
666 MachineBasicBlock::iterator RegionEnd) {
667 auto REnd = RegionEnd == RegionBegin->getParent()->end()
668 ? std::prev(RegionEnd)
669 : RegionEnd;
670 return &*skipDebugInstructionsBackward(REnd, RegionBegin);
673 void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
674 const MachineBasicBlock *MBB) {
675 GCNDownwardRPTracker RPTracker(*LIS);
677 // If the block has only one successor then the live-ins of that successor
678 // are live-outs of the current block. We can reuse the calculated live set
679 // if the successor will be sent to scheduling past the current block.
681 // However, due to a bug in LiveInterval analysis it may happen that two
682 // predecessors of the same successor block have different lane bitmasks for
683 // a live-out register. Work around that by sticking to a one-to-one
684 // relationship, i.e. one predecessor with one successor block.
685 const MachineBasicBlock *OnlySucc = nullptr;
686 if (MBB->succ_size() == 1) {
687 auto *Candidate = *MBB->succ_begin();
688 if (!Candidate->empty() && Candidate->pred_size() == 1) {
689 SlotIndexes *Ind = LIS->getSlotIndexes();
690 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
691 OnlySucc = Candidate;
695 // The scheduler sends regions from the end of the block upwards.
696 size_t CurRegion = RegionIdx;
697 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
698 if (Regions[CurRegion].first->getParent() != MBB)
699 break;
700 --CurRegion;
702 auto I = MBB->begin();
703 auto LiveInIt = MBBLiveIns.find(MBB);
704 auto &Rgn = Regions[CurRegion];
705 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
706 if (LiveInIt != MBBLiveIns.end()) {
707 auto LiveIn = std::move(LiveInIt->second);
708 RPTracker.reset(*MBB->begin(), &LiveIn);
709 MBBLiveIns.erase(LiveInIt);
710 } else {
711 I = Rgn.first;
712 auto LRS = BBLiveInMap.lookup(NonDbgMI);
713 #ifdef EXPENSIVE_CHECKS
714 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
715 #endif
716 RPTracker.reset(*I, &LRS);
719 for (;;) {
720 I = RPTracker.getNext();
722 if (Regions[CurRegion].first == I || NonDbgMI == I) {
723 LiveIns[CurRegion] = RPTracker.getLiveRegs();
724 RPTracker.clearMaxPressure();
727 if (Regions[CurRegion].second == I) {
728 Pressure[CurRegion] = RPTracker.moveMaxPressure();
729 if (CurRegion-- == RegionIdx)
730 break;
732 RPTracker.advanceToNext();
733 RPTracker.advanceBeforeNext();
736 if (OnlySucc) {
737 if (I != MBB->end()) {
738 RPTracker.advanceToNext();
739 RPTracker.advance(MBB->end());
741 RPTracker.advanceBeforeNext();
742 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
746 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
747 GCNScheduleDAGMILive::getRegionLiveInMap() const {
748 assert(!Regions.empty());
749 std::vector<MachineInstr *> RegionFirstMIs;
750 RegionFirstMIs.reserve(Regions.size());
751 auto I = Regions.rbegin(), E = Regions.rend();
752 auto *BB = I->first->getParent();
753 do {
754 auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
755 RegionFirstMIs.push_back(MI);
756 do {
757 ++I;
758 } while (I != E && I->first->getParent() == BB);
759 } while (I != E);
760 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
763 DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
764 GCNScheduleDAGMILive::getRegionLiveOutMap() const {
765 assert(!Regions.empty());
766 std::vector<MachineInstr *> RegionLastMIs;
767 RegionLastMIs.reserve(Regions.size());
768 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
769 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
771 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
774 void RegionPressureMap::buildLiveRegMap() {
775 IdxToInstruction.clear();
777 RegionLiveRegMap =
778 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
779 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
780 MachineInstr *RegionKey =
781 IsLiveOut
782 ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
783 : &*DAG->Regions[I].first;
784 IdxToInstruction[I] = RegionKey;
788 void GCNScheduleDAGMILive::finalizeSchedule() {
789 // Start actual scheduling here. This function is called by the base
790 // MachineScheduler after all regions have been recorded by
791 // GCNScheduleDAGMILive::schedule().
792 LiveIns.resize(Regions.size());
793 Pressure.resize(Regions.size());
794 RescheduleRegions.resize(Regions.size());
795 RegionsWithHighRP.resize(Regions.size());
796 RegionsWithExcessRP.resize(Regions.size());
797 RegionsWithMinOcc.resize(Regions.size());
798 RegionsWithIGLPInstrs.resize(Regions.size());
799 RescheduleRegions.set();
800 RegionsWithHighRP.reset();
801 RegionsWithExcessRP.reset();
802 RegionsWithMinOcc.reset();
803 RegionsWithIGLPInstrs.reset();
805 runSchedStages();
808 void GCNScheduleDAGMILive::runSchedStages() {
809 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
811 if (!Regions.empty()) {
812 BBLiveInMap = getRegionLiveInMap();
813 if (GCNTrackers)
814 RegionLiveOuts.buildLiveRegMap();
817 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
818 while (S.advanceStage()) {
819 auto Stage = createSchedStage(S.getCurrentStage());
820 if (!Stage->initGCNSchedStage())
821 continue;
823 for (auto Region : Regions) {
824 RegionBegin = Region.first;
825 RegionEnd = Region.second;
826 // Set up for scheduling the region and check whether it should be skipped.
827 if (!Stage->initGCNRegion()) {
828 Stage->advanceRegion();
829 exitRegion();
830 continue;
833 if (GCNTrackers) {
834 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
835 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
836 GCNRPTracker::LiveRegSet *RegionLiveIns =
837 &LiveIns[Stage->getRegionIdx()];
839 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
840 ->reset(MRI, *RegionLiveIns);
841 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
842 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
843 Stage->getRegionIdx()));
846 ScheduleDAGMILive::schedule();
847 Stage->finalizeGCNRegion();
850 Stage->finalizeGCNSchedStage();
854 #ifndef NDEBUG
855 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
856 switch (StageID) {
857 case GCNSchedStageID::OccInitialSchedule:
858 OS << "Max Occupancy Initial Schedule";
859 break;
860 case GCNSchedStageID::UnclusteredHighRPReschedule:
861 OS << "Unclustered High Register Pressure Reschedule";
862 break;
863 case GCNSchedStageID::ClusteredLowOccupancyReschedule:
864 OS << "Clustered Low Occupancy Reschedule";
865 break;
866 case GCNSchedStageID::PreRARematerialize:
867 OS << "Pre-RA Rematerialize";
868 break;
869 case GCNSchedStageID::ILPInitialSchedule:
870 OS << "Max ILP Initial Schedule";
871 break;
874 return OS;
876 #endif
878 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
879 : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
880 MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
882 bool GCNSchedStage::initGCNSchedStage() {
883 if (!DAG.LIS)
884 return false;
886 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
887 return true;
890 bool UnclusteredHighRPStage::initGCNSchedStage() {
891 if (DisableUnclusterHighRP)
892 return false;
894 if (!GCNSchedStage::initGCNSchedStage())
895 return false;
897 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
898 return false;
900 SavedMutations.swap(DAG.Mutations);
901 DAG.addMutation(
902 createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
904 InitialOccupancy = DAG.MinOccupancy;
905 // Aggressively try to reduce register pressure in the unclustered high RP
906 // stage. Temporarily increase the occupancy target in the region.
907 S.SGPRLimitBias = S.HighRPSGPRBias;
908 S.VGPRLimitBias = S.HighRPVGPRBias;
909 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
910 MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
912 LLVM_DEBUG(
913 dbgs()
914 << "Retrying function scheduling without clustering. "
915 "Aggressivly try to reduce register pressure to achieve occupancy "
916 << DAG.MinOccupancy << ".\n");
918 return true;
921 bool ClusteredLowOccStage::initGCNSchedStage() {
922 if (DisableClusteredLowOccupancy)
923 return false;
925 if (!GCNSchedStage::initGCNSchedStage())
926 return false;
928 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
929 // been dropped. All regions will have already been scheduled with the ideal
930 // occupancy targets.
931 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
932 return false;
934 LLVM_DEBUG(
935 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
936 << DAG.MinOccupancy << ".\n");
937 return true;
940 bool PreRARematStage::initGCNSchedStage() {
941 if (!GCNSchedStage::initGCNSchedStage())
942 return false;
944 if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
945 return false;
947 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
948 // Check maximum occupancy
949 if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
950 DAG.MinOccupancy)
951 return false;
953 // FIXME: This pass will invalidate cached MBBLiveIns for regions
954 // in between the defs and the region we sank the def to. Cached pressure
955 // for regions where a def is sunk from will also be invalidated. This will
956 // need to be fixed if there is another pass after this pass.
957 assert(!S.hasNextStage());
959 collectRematerializableInstructions();
960 if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
961 return false;
963 LLVM_DEBUG(
964 dbgs() << "Retrying function scheduling with improved occupancy of "
965 << DAG.MinOccupancy << " from rematerializing\n");
966 return true;
969 void GCNSchedStage::finalizeGCNSchedStage() {
970 DAG.finishBlock();
971 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
974 void UnclusteredHighRPStage::finalizeGCNSchedStage() {
975 SavedMutations.swap(DAG.Mutations);
976 S.SGPRLimitBias = S.VGPRLimitBias = 0;
977 if (DAG.MinOccupancy > InitialOccupancy) {
978 for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
979 DAG.RegionsWithMinOcc[IDX] =
980 DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
982 LLVM_DEBUG(dbgs() << StageID
983 << " stage successfully increased occupancy to "
984 << DAG.MinOccupancy << '\n');
987 GCNSchedStage::finalizeGCNSchedStage();
990 bool GCNSchedStage::initGCNRegion() {
991 // Check whether this new region is also a new block.
992 if (DAG.RegionBegin->getParent() != CurrentMBB)
993 setupNewBlock();
995 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
996 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
998 // Skip empty scheduling regions (0 or 1 schedulable instructions).
999 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1000 return false;
1002 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1003 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1004 << " " << CurrentMBB->getName()
1005 << "\n From: " << *DAG.begin() << " To: ";
1006 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1007 else dbgs() << "End";
1008 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1010 // Save original instruction order before scheduling for possible revert.
1011 Unsched.clear();
1012 Unsched.reserve(DAG.NumRegionInstrs);
1013 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1014 StageID == GCNSchedStageID::ILPInitialSchedule) {
1015 for (auto &I : DAG) {
1016 Unsched.push_back(&I);
1017 if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
1018 I.getOpcode() == AMDGPU::IGLP_OPT)
1019 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1021 } else {
1022 for (auto &I : DAG)
1023 Unsched.push_back(&I);
1026 PressureBefore = DAG.Pressure[RegionIdx];
1028 LLVM_DEBUG(
1029 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1030 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1031 << "Region live-in pressure: "
1032 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1033 << "Region register pressure: " << print(PressureBefore));
1035 S.HasHighPressure = false;
1036 S.KnownExcessRP = isRegionWithExcessRP();
1038 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1039 StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
1040 SavedMutations.clear();
1041 SavedMutations.swap(DAG.Mutations);
1042 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1043 StageID == GCNSchedStageID::ILPInitialSchedule;
1044 DAG.addMutation(createIGroupLPDAGMutation(
1045 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1046 : AMDGPU::SchedulingPhase::PreRAReentry));
1049 return true;
1052 bool UnclusteredHighRPStage::initGCNRegion() {
1053 // Only reschedule regions with the minimum occupancy or regions that may have
1054 // spilling (excess register pressure).
1055 if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
1056 DAG.MinOccupancy <= InitialOccupancy) &&
1057 !DAG.RegionsWithExcessRP[RegionIdx])
1058 return false;
1060 return GCNSchedStage::initGCNRegion();
1063 bool ClusteredLowOccStage::initGCNRegion() {
1064 // We may need to reschedule this region if it wasn't rescheduled in the last
1065 // stage, or if we found it was testing critical register pressure limits in
1066 // the unclustered reschedule stage. The latter is because we may not have been
1067 // able to raise the min occupancy in the previous stage so the region may be
1068 // overly constrained even if it was already rescheduled.
1069 if (!DAG.RegionsWithHighRP[RegionIdx])
1070 return false;
1072 return GCNSchedStage::initGCNRegion();
1075 bool PreRARematStage::initGCNRegion() {
1076 if (!DAG.RescheduleRegions[RegionIdx])
1077 return false;
1079 return GCNSchedStage::initGCNRegion();
1082 void GCNSchedStage::setupNewBlock() {
1083 if (CurrentMBB)
1084 DAG.finishBlock();
1086 CurrentMBB = DAG.RegionBegin->getParent();
1087 DAG.startBlock(CurrentMBB);
1088 // Get the real RP for the region if it hasn't been calculated before. After
1089 // the initial schedule stage, real RP will be collected after scheduling.
1090 if (StageID == GCNSchedStageID::OccInitialSchedule ||
1091 StageID == GCNSchedStageID::ILPInitialSchedule)
1092 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1095 void GCNSchedStage::finalizeGCNRegion() {
1096 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1097 DAG.RescheduleRegions[RegionIdx] = false;
1098 if (S.HasHighPressure)
1099 DAG.RegionsWithHighRP[RegionIdx] = true;
1101 // Revert scheduling if we have dropped occupancy or there is some other
1102 // reason that the original schedule is better.
1103 checkScheduling();
1105 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1106 StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
1107 SavedMutations.swap(DAG.Mutations);
1109 DAG.exitRegion();
1110 RegionIdx++;
1113 void GCNSchedStage::checkScheduling() {
1114 // Check the results of scheduling.
1115 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1117 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1118 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1120 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1121 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1122 DAG.Pressure[RegionIdx] = PressureAfter;
1123 DAG.RegionsWithMinOcc[RegionIdx] =
1124 PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1126 // Early out if we have achieved the occupancy target.
1127 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1128 return;
1131 unsigned TargetOccupancy =
1132 std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
1133 unsigned WavesAfter =
1134 std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
1135 unsigned WavesBefore =
1136 std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
1137 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1138 << ", after " << WavesAfter << ".\n");
1140 // We may not be able to keep the current target occupancy because of the just
1141 // scheduled region. We might still be able to revert scheduling if the
1142 // occupancy before was higher, or if the current schedule has register
1143 // pressure higher than the excess limits which could lead to more spilling.
1144 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1146 // Allow memory bound functions to drop to 4 waves if not limited by an
1147 // attribute.
1148 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1149 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1150 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1151 << MFI.getMinAllowedOccupancy() << " waves\n");
1152 NewOccupancy = WavesAfter;
1155 if (NewOccupancy < DAG.MinOccupancy) {
1156 DAG.MinOccupancy = NewOccupancy;
1157 MFI.limitOccupancy(DAG.MinOccupancy);
1158 DAG.RegionsWithMinOcc.reset();
1159 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1160 << DAG.MinOccupancy << ".\n");
1162 // The maximum number of arch VGPRs on a non-unified register file, or the
1163 // maximum VGPRs + AGPRs in the unified register file case.
1164 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1165 // The maximum number of arch VGPRs for both unified and non-unified register
1166 // files.
1167 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1168 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1170 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1171 PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
1172 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1173 PressureAfter.getSGPRNum() > MaxSGPRs) {
1174 DAG.RescheduleRegions[RegionIdx] = true;
1175 DAG.RegionsWithHighRP[RegionIdx] = true;
1176 DAG.RegionsWithExcessRP[RegionIdx] = true;
1179 // Revert if this region's schedule would cause a drop in occupancy or
1180 // spilling.
1181 if (shouldRevertScheduling(WavesAfter)) {
1182 revertScheduling();
1183 } else {
1184 DAG.Pressure[RegionIdx] = PressureAfter;
1185 DAG.RegionsWithMinOcc[RegionIdx] =
1186 PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
1190 unsigned
1191 GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1192 DenseMap<unsigned, unsigned> &ReadyCycles,
1193 const TargetSchedModel &SM) {
1194 unsigned ReadyCycle = CurrCycle;
1195 for (auto &D : SU.Preds) {
1196 if (D.isAssignedRegDep()) {
1197 MachineInstr *DefMI = D.getSUnit()->getInstr();
1198 unsigned Latency = SM.computeInstrLatency(DefMI);
1199 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1200 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1203 ReadyCycles[SU.NodeNum] = ReadyCycle;
1204 return ReadyCycle;
1207 #ifndef NDEBUG
1208 struct EarlierIssuingCycle {
1209 bool operator()(std::pair<MachineInstr *, unsigned> A,
1210 std::pair<MachineInstr *, unsigned> B) const {
1211 return A.second < B.second;
1215 static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1216 EarlierIssuingCycle> &ReadyCycles) {
1217 if (ReadyCycles.empty())
1218 return;
1219 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1220 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1221 << " ##################\n# Cycle #\t\t\tInstruction "
1223 " \n";
1224 unsigned IPrev = 1;
1225 for (auto &I : ReadyCycles) {
1226 if (I.second > IPrev + 1)
1227 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1228 << " CYCLES DETECTED ******************************\n\n";
1229 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1230 IPrev = I.second;
1233 #endif
1235 ScheduleMetrics
1236 GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1237 #ifndef NDEBUG
1238 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1239 ReadyCyclesSorted;
1240 #endif
1241 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1242 unsigned SumBubbles = 0;
1243 DenseMap<unsigned, unsigned> ReadyCycles;
1244 unsigned CurrCycle = 0;
1245 for (auto &SU : InputSchedule) {
1246 unsigned ReadyCycle =
1247 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1248 SumBubbles += ReadyCycle - CurrCycle;
1249 #ifndef NDEBUG
1250 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1251 #endif
1252 CurrCycle = ++ReadyCycle;
1254 #ifndef NDEBUG
1255 LLVM_DEBUG(
1256 printScheduleModel(ReadyCyclesSorted);
1257 dbgs() << "\n\t"
1258 << "Metric: "
1259 << (SumBubbles
1260 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1261 : 1)
1262 << "\n\n");
1263 #endif
1265 return ScheduleMetrics(CurrCycle, SumBubbles);
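// Illustrative arithmetic with assumed values: a schedule whose last
// instruction becomes ready at cycle 40 with 10 accumulated bubble cycles
// yields a metric of (10 * ScaleFactor) / 40 = 25; fewer bubbles per cycle
// means a smaller (better) metric.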
1268 ScheduleMetrics
1269 GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1270 #ifndef NDEBUG
1271 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1272 ReadyCyclesSorted;
1273 #endif
1274 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1275 unsigned SumBubbles = 0;
1276 DenseMap<unsigned, unsigned> ReadyCycles;
1277 unsigned CurrCycle = 0;
1278 for (auto &MI : DAG) {
1279 SUnit *SU = DAG.getSUnit(&MI);
1280 if (!SU)
1281 continue;
1282 unsigned ReadyCycle =
1283 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1284 SumBubbles += ReadyCycle - CurrCycle;
1285 #ifndef NDEBUG
1286 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1287 #endif
1288 CurrCycle = ++ReadyCycle;
1290 #ifndef NDEBUG
1291 LLVM_DEBUG(
1292 printScheduleModel(ReadyCyclesSorted);
1293 dbgs() << "\n\t"
1294 << "Metric: "
1295 << (SumBubbles
1296 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1297 : 1)
1298 << "\n\n");
1299 #endif
1301 return ScheduleMetrics(CurrCycle, SumBubbles);
1304 bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1305 if (WavesAfter < DAG.MinOccupancy)
1306 return true;
1308 return false;
1311 bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1312 if (PressureAfter == PressureBefore)
1313 return false;
1315 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1316 return true;
1318 if (mayCauseSpilling(WavesAfter))
1319 return true;
1321 return false;
1324 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1325 // If RP is not reduced in the unclustered reschedule stage, revert to the
1326 // old schedule.
1327 if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
1328 mayCauseSpilling(WavesAfter)) ||
1329 GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1330 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1331 return true;
1334 // Do not attempt to relax schedule even more if we are already spilling.
1335 if (isRegionWithExcessRP())
1336 return false;
1338 LLVM_DEBUG(
1339 dbgs()
1340 << "\n\t *** In shouldRevertScheduling ***\n"
1341 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
1342 ScheduleMetrics MBefore =
1343 getScheduleMetrics(DAG.SUnits);
1344 LLVM_DEBUG(
1345 dbgs()
1346 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1347 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1348 unsigned OldMetric = MBefore.getMetric();
1349 unsigned NewMetric = MAfter.getMetric();
1350 unsigned WavesBefore =
1351 std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
1352 unsigned Profit =
1353 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1354 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1355 NewMetric) /
1356 ScheduleMetrics::ScaleFactor;
1357 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1358 << MAfter << "Profit: " << Profit << "\n");
1359 return Profit < ScheduleMetrics::ScaleFactor;
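// Illustrative arithmetic with assumed values: WavesBefore = 4, WavesAfter = 5,
// OldMetric = 50, NewMetric = 60 and the default ScheduleMetricBias of 10 give
// Profit = ((5 * 100) / 4 * ((50 + 10) * 100) / 60) / 100
//        = (125 * 6000 / 60) / 100 = 125 >= ScaleFactor (100),
// so the unclustered schedule is kept rather than reverted.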
1362 bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1363 if (PressureAfter == PressureBefore)
1364 return false;
1366 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1367 return true;
1369 if (mayCauseSpilling(WavesAfter))
1370 return true;
1372 return false;
1375 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1376 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1377 return true;
1379 if (mayCauseSpilling(WavesAfter))
1380 return true;
1382 return false;
1385 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1386 if (mayCauseSpilling(WavesAfter))
1387 return true;
1389 return false;
1392 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1393 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1394 !PressureAfter.less(MF, PressureBefore)) {
1395 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1396 return true;
1399 return false;
1402 void GCNSchedStage::revertScheduling() {
1403 DAG.RegionsWithMinOcc[RegionIdx] =
1404 PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
1405 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1406 DAG.RescheduleRegions[RegionIdx] =
1407 S.hasNextStage() &&
1408 S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
1409 DAG.RegionEnd = DAG.RegionBegin;
1410 int SkippedDebugInstr = 0;
1411 for (MachineInstr *MI : Unsched) {
1412 if (MI->isDebugInstr()) {
1413 ++SkippedDebugInstr;
1414 continue;
1417 if (MI->getIterator() != DAG.RegionEnd) {
1418 DAG.BB->remove(MI);
1419 DAG.BB->insert(DAG.RegionEnd, MI);
1420 if (!MI->isDebugInstr())
1421 DAG.LIS->handleMove(*MI, true);
1424 // Reset read-undef flags and update them later.
1425 for (auto &Op : MI->all_defs())
1426 Op.setIsUndef(false);
1427 RegisterOperands RegOpers;
1428 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1429 if (!MI->isDebugInstr()) {
1430 if (DAG.ShouldTrackLaneMasks) {
1431 // Adjust liveness and add missing dead+read-undef flags.
1432 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1433 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1434 } else {
1435 // Adjust for missing dead-def flags.
1436 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1439 DAG.RegionEnd = MI->getIterator();
1440 ++DAG.RegionEnd;
1441 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1444 // After reverting the schedule, debug instrs will now be at the end of the
1445 // block and RegionEnd will point to the first debug instr. Increment
1446 // RegionEnd past debug instrs to the actual end of the scheduling region.
1447 while (SkippedDebugInstr-- > 0)
1448 ++DAG.RegionEnd;
1450 // If Unsched.front() instruction is a debug instruction, this will actually
1451 // shrink the region since we moved all debug instructions to the end of the
1452 // block. Find the first instruction that is not a debug instruction.
1453 DAG.RegionBegin = Unsched.front()->getIterator();
1454 if (DAG.RegionBegin->isDebugInstr()) {
1455 for (MachineInstr *MI : Unsched) {
1456 if (MI->isDebugInstr())
1457 continue;
1458 DAG.RegionBegin = MI->getIterator();
1459 break;
1463 // Then move the debug instructions back into their correct place and set
1464 // RegionBegin and RegionEnd if needed.
1465 DAG.placeDebugValues();
1467 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1470 void PreRARematStage::collectRematerializableInstructions() {
1471 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
1472 for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
1473 Register Reg = Register::index2VirtReg(I);
1474 if (!DAG.LIS->hasInterval(Reg))
1475 continue;
1477 // TODO: Handle AGPR and SGPR rematerialization
1478 if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
1479 !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
1480 continue;
1482 MachineOperand *Op = DAG.MRI.getOneDef(Reg);
1483 MachineInstr *Def = Op->getParent();
1484 if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
1485 continue;
1487 MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
1488 if (Def->getParent() == UseI->getParent())
1489 continue;
1491 // We are only collecting defs that are defined in another block and are
1492 // live-through or used inside regions at MinOccupancy. This means that the
1493 // register must be in the live-in set for the region.
1494 bool AddedToRematList = false;
1495 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1496 auto It = DAG.LiveIns[I].find(Reg);
1497 if (It != DAG.LiveIns[I].end() && !It->second.none()) {
1498 if (DAG.RegionsWithMinOcc[I]) {
1499 RematerializableInsts[I][Def] = UseI;
1500 AddedToRematList = true;
1503 // Collect regions with the rematerializable reg as a live-in to avoid
1504 // searching later when updating RP.
1505 RematDefToLiveInRegions[Def].push_back(I);
1508 if (!AddedToRematList)
1509 RematDefToLiveInRegions.erase(Def);
1513 bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
1514 const TargetInstrInfo *TII) {
1515 // Temporary copies of cached variables we will be modifying and replacing if
1516 // sinking succeeds.
1517 SmallVector<
1518 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
1519 NewRegions;
1520 DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
1521 DenseMap<unsigned, GCNRegPressure> NewPressure;
1522 BitVector NewRescheduleRegions;
1523 LiveIntervals *LIS = DAG.LIS;
1525 NewRegions.resize(DAG.Regions.size());
1526 NewRescheduleRegions.resize(DAG.Regions.size());
1528 // Collect only regions that have a rematerializable def as a live-in.
1529 SmallSet<unsigned, 16> ImpactedRegions;
1530 for (const auto &It : RematDefToLiveInRegions)
1531 ImpactedRegions.insert(It.second.begin(), It.second.end());
1533 // Make copies of register pressure and live-ins cache that will be updated
1534 // as we rematerialize.
1535 for (auto Idx : ImpactedRegions) {
1536 NewPressure[Idx] = DAG.Pressure[Idx];
1537 NewLiveIns[Idx] = DAG.LiveIns[Idx];
1539 NewRegions = DAG.Regions;
1540 NewRescheduleRegions.reset();
1542 DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
1543 bool Improved = false;
1544 for (auto I : ImpactedRegions) {
1545 if (!DAG.RegionsWithMinOcc[I])
1546 continue;
1548 Improved = false;
1549 int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
1550 int SGPRUsage = NewPressure[I].getSGPRNum();
1552 // TODO: Handle occupancy drop due to AGPR and SGPR.
1553 // Check if the cause of the occupancy drop is VGPR usage and not SGPR.
1554 if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
1555 break;
1557 // The occupancy of this region could have been improved by a previous
1558 // iteration's sinking of defs.
1559 if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
1560 NewRescheduleRegions[I] = true;
1561 Improved = true;
1562 continue;
1565 // First check if we have enough trivially rematerializable instructions to
1566 // improve occupancy. Optimistically assume all instructions we are able to
1567 // sink decrease RP.
1568 int TotalSinkableRegs = 0;
1569 for (const auto &It : RematerializableInsts[I]) {
1570 MachineInstr *Def = It.first;
1571 Register DefReg = Def->getOperand(0).getReg();
1572 TotalSinkableRegs +=
1573 SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
1575 int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
1576 unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
1577 // If in the most optimistic scenario, we cannot improve occupancy, then do
1578 // not attempt to sink any instructions.
1579 if (OptimisticOccupancy <= DAG.MinOccupancy)
1580 break;
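    // Sink defs one at a time, recomputing live-ins and pressure for every
    // affected region after each sink, and stop as soon as this region's
    // occupancy rises above MinOccupancy.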
    unsigned ImproveOccupancy = 0;
    SmallVector<MachineInstr *, 4> SinkedDefs;
    for (auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      MachineBasicBlock::iterator InsertPos =
          MachineBasicBlock::iterator(It.second);
      Register Reg = Def->getOperand(0).getReg();
      // Rematerialize MI to its use block. Since we are only rematerializing
      // instructions that do not have any virtual reg uses, we do not need to
      // call LiveRangeEdit::allUsesAvailableAt() and
      // LiveRangeEdit::canRematerializeAt().
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
      MachineInstr *NewMI = &*std::prev(InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
      InsertedMIToOldDef[NewMI] = Def;

      // Update region boundaries in the scheduling region we sunk from, since
      // we may sink an instruction that was at the beginning or end of its
      // region.
      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                                 /*Removing =*/true);

      // Update region boundaries in the region we sunk to.
      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);

      LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
      for (auto Idx : RematDefToLiveInRegions[Def]) {
        NewLiveIns[Idx].erase(Reg);
        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
          // Def is live-through and not used in this block.
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
        } else {
          // Def is used and rematerialized into this block.
          GCNDownwardRPTracker RPT(*LIS);
          auto *NonDbgMI = &*skipDebugInstructionsForward(
              NewRegions[Idx].first, NewRegions[Idx].second);
          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
          RPT.advance(NewRegions[Idx].second);
          NewPressure[Idx] = RPT.moveMaxPressure();
        }
      }

      SinkedDefs.push_back(Def);
      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
      if (ImproveOccupancy > DAG.MinOccupancy)
        break;
    }

    // Remove defs we just sunk from all regions' lists of sinkable defs.
    for (auto &Def : SinkedDefs)
      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
        RematerializableInsts[TrackedIdx].erase(Def);

    if (ImproveOccupancy <= DAG.MinOccupancy)
      break;

    NewRescheduleRegions[I] = true;
    Improved = true;
  }

  if (!Improved) {
    // Occupancy was not improved for all regions that were at MinOccupancy.
    // Undo sinking and remove newly rematerialized instructions.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy was improved for all regions.
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    DAG.BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS.
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  // Update live-ins, register pressure, and regions caches.
  for (auto Idx : ImpactedRegions) {
    DAG.LiveIns[Idx] = NewLiveIns[Idx];
    DAG.Pressure[Idx] = NewPressure[Idx];
    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
  }
  DAG.Regions = NewRegions;
  DAG.RescheduleRegions = NewRescheduleRegions;

  if (GCNTrackers)
    DAG.RegionLiveOuts.buildLiveRegMap();

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  return true;
}

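// A def qualifies for sinking only if it is trivially rematerializable and
// reads no virtual registers; the latter is what lets the sinking above skip
// the LiveRangeEdit availability checks at the new insertion point.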
// Copied from MachineLICM
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
  if (!DAG.TII->isTriviallyReMaterializable(MI))
    return false;

  for (const MachineOperand &MO : MI.all_uses())
    if (MO.getReg().isVirtual())
      return false;

  return true;
}

// When removing, we will have to check both the beginning and the end of the
// region. When inserting, we only have to check whether we are inserting NewMI
// in front of a scheduling region and do not need to check the end, since we
// will only ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
                              MachineBasicBlock::iterator>> &RegionBoundaries,
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = RegionBoundaries.size();
  // Search for the first region of the block where MI is located.
  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != RegionBoundaries[I].first->getParent())
      return;

    if (Removing && MI == RegionBoundaries[I].first &&
        MI == RegionBoundaries[I].second) {
      // MI is in a region of size 1; after removal the region becomes empty,
      // so set RegionBegin and RegionEnd to the block's past-the-end iterator.
      RegionBoundaries[I] =
          std::pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == RegionBoundaries[I].first) {
      if (Removing)
        RegionBoundaries[I] =
            std::pair(std::next(MI), RegionBoundaries[I].second);
      else
        // Inserted NewMI in front of region, set new RegionBegin to NewMI.
        RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
                                        RegionBoundaries[I].second);
      return;
    }
    if (Removing && MI == RegionBoundaries[I].second) {
      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
      return;
    }
  }
}

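// Returns true if any instruction in the scheduling region is an IGLP
// scheduling directive (SCHED_GROUP_BARRIER or IGLP_OPT).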
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
    unsigned Opc = MI->getOpcode();
    return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
  });
}

GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
    bool RemoveKillFlags)
    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}

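// Regions containing IGLP directives get the post-RA IGroupLP mutation instead
// of the default mutation set; the default set is stashed here and restored in
// finalizeSchedule().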
void GCNPostScheduleDAGMILive::schedule() {
  HasIGLPInstrs = hasIGLPInstrs(this);
  if (HasIGLPInstrs) {
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
  }

  ScheduleDAGMI::schedule();
}

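// Restore the default mutations that schedule() stashed away for IGLP regions.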
void GCNPostScheduleDAGMILive::finalizeSchedule() {
  if (HasIGLPInstrs)
    SavedMutations.swap(Mutations);

  ScheduleDAGMI::finalizeSchedule();
}