//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
///
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//

#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

#define DEBUG_TYPE "machine-scheduler"

using namespace llvm;
static cl::opt<bool> DisableUnclusterHighRP(
    "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
    cl::desc("Disable unclustered high register pressure "
             "reduction scheduling stage."),
    cl::init(false));

static cl::opt<bool> DisableClusteredLowOccupancy(
    "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
    cl::desc("Disable clustered low occupancy "
             "rescheduling for ILP scheduling stage."),
    cl::init(false));

static cl::opt<unsigned> ScheduleMetricBias(
    "amdgpu-schedule-metric-bias", cl::Hidden,
    cl::desc(
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(10));

static cl::opt<bool>
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(false));
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      HasHighPressure(false) {}

void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  GenericScheduler::initialize(DAG);

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  SGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
  VGPRExcessLimit =
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if kernel is wave limited or memory
  // bound, and using the relaxed occupancy feature.
  TargetOccupancy =
      RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
  SGPRCriticalLimit =
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);

  if (!KnownExcessRP) {
    VGPRCriticalLimit =
        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
  } else {
    // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
    // returns a reasonably small number for targets with lots of VGPRs, such
    // as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");

    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
  }

  // Subtract error margin and bias from register limits and avoid overflow.
  SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
  VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
  SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
  VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);

  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}
/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
/// current register pressure.
///
/// This works for the common case, but it has a few exceptions that have been
/// observed through trial and error:
///   - Explicit physical register operands
///   - Subregister definitions
///
/// In both of those cases, PressureDiff doesn't represent the actual pressure,
/// and querying LiveIntervals through the RegPressureTracker is needed to get
/// an accurate value.
///
/// We should eventually only use PressureDiff for maximum performance, but this
/// already allows 80% of SUs to take the fast path without changing scheduling
/// at all. Further changes would either change scheduling, or require a lot
/// more logic to recover an accurate pressure estimate from the PressureDiffs.
static bool canUsePressureDiffs(const SUnit &SU) {
  if (!SU.isInstr())
    return false;

  // Cannot use pressure diffs for subregister defs or with physregs, it's
  // imprecise in both cases.
  for (const auto &Op : SU.getInstr()->operands()) {
    if (!Op.isReg() || Op.isImplicit())
      continue;
    if (Op.getReg().isPhysical() ||
        (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
      return false;
  }

  return true;
}
static void getRegisterPressures(bool AtTop,
                                 const RegPressureTracker &RPTracker, SUnit *SU,
                                 std::vector<unsigned> &Pressure,
                                 std::vector<unsigned> &MaxPressure) {
  // getDownwardPressure() and getUpwardPressure() make temporary changes to
  // the tracker, so we need to pass those functions a non-const copy.
  RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
  if (AtTop)
    TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else
    TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
}
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure, bool IsBottomUp) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  Pressure.clear();
  MaxPressure.clear();

  // We try to use the cached PressureDiffs in the ScheduleDAG whenever
  // possible over querying the RegPressureTracker.
  //
  // RegPressureTracker will make a lot of LIS queries which are very
  // expensive, it is considered a slow function in this context.
  //
  // PressureDiffs are precomputed and cached, and getPressureDiff is just a
  // trivial lookup into an array. It is pretty much free.
  //
  // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
  // PressureDiffs.
  if (AtTop || !canUsePressureDiffs(*SU)) {
    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure);
  } else {
    Pressure.resize(4, 0);
    Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;

    for (const auto &Diff : DAG->getPressureDiff(SU)) {
      if (!Diff.isValid())
        continue;
      // PressureDiffs is always bottom-up so if we're working top-down we need
      // to invert its sign.
      Pressure[Diff.getPSet()] +=
          (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
    }

#ifdef EXPENSIVE_CHECKS
    std::vector<unsigned> CheckPressure, CheckMaxPressure;
    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure);
    if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
        Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
      errs() << "Register Pressure is inaccurate when calculated through "
                "PressureDiff\n"
             << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
             << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
      report_fatal_error("inaccurate register pressure calculation");
    }
#endif
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // has the same cost, so we don't need to prefer one over the other.

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand,
                                         bool IsBottomUp) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  if (DAG->isTrackingPressure()) {
    SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
    VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
  }
  ReadyQueue &Q = Zone.Available;
  for (SUnit *SU : Q) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure, IsBottomUp);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      Cand.setBest(TryCand);
      LLVM_DEBUG(traceCandidate(Cand));
    }
  }
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
  // Schedule as far as possible in the direction of no choice. This is most
  // efficient, but also provides the best heuristics for CriticalPSets.
  if (SUnit *SU = Bot.pickOnlyChoice()) {
    IsTopNode = false;
    return SU;
  }
  if (SUnit *SU = Top.pickOnlyChoice()) {
    IsTopNode = true;
    return SU;
  }
  // Set the bottom-up policy based on the state of the current bottom zone and
  // the instructions outside the zone, including the top zone.
  CandPolicy BotPolicy;
  setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
  // Set the top-down policy based on the state of the current top zone and
  // the instructions outside the zone, including the bottom zone.
  CandPolicy TopPolicy;
  setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);

  // See if BotCand is still valid (because we previously scheduled from Top).
  LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
  if (!BotCand.isValid() || BotCand.SU->isScheduled ||
      BotCand.Policy != BotPolicy) {
    BotCand.reset(CandPolicy());
    pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
                      /*IsBottomUp=*/true);
    assert(BotCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(BotCand));
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
                        /*IsBottomUp=*/true);
      assert(TCand.SU == BotCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
  }

  // Check if the top Q has a better candidate.
  LLVM_DEBUG(dbgs() << "Picking from Top:\n");
  if (!TopCand.isValid() || TopCand.SU->isScheduled ||
      TopCand.Policy != TopPolicy) {
    TopCand.reset(CandPolicy());
    pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
                      /*IsBottomUp=*/false);
    assert(TopCand.Reason != NoCand && "failed to find the first candidate");
  } else {
    LLVM_DEBUG(traceCandidate(TopCand));
    if (VerifyScheduling) {
      SchedCandidate TCand;
      TCand.reset(CandPolicy());
      pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
                        /*IsBottomUp=*/false);
      assert(TCand.SU == TopCand.SU &&
             "Last pick result should correspond to re-picking right now");
    }
  }

  // Pick best from BotCand and TopCand.
  LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
             dbgs() << "Bot Cand: "; traceCandidate(BotCand););
  SchedCandidate Cand = BotCand;
  TopCand.Reason = NoCand;
  tryCandidate(Cand, TopCand, nullptr);
  if (TopCand.Reason != NoCand) {
    Cand.setBest(TopCand);
  }

  LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

  IsTopNode = Cand.AtTop;
  return Cand.SU;
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  SUnit *SU;
  do {
    if (RegionPolicy.OnlyTopDown) {
      SU = Top.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
                          /*IsBottomUp=*/false);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      SU = Bot.pickOnlyChoice();
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
                          /*IsBottomUp=*/true);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode);
    }
  } while (SU->isScheduled);

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}

GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
  assert(CurrentStage && CurrentStage != SchedStages.end());
  return *CurrentStage;
}
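
// Advance to the next scheduling stage. The first call selects the first
// recorded stage; returns false once every stage has been consumed.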
bool GCNSchedStrategy::advanceStage() {
  assert(CurrentStage != SchedStages.end());
  if (!CurrentStage)
    CurrentStage = SchedStages.begin();
  else
    CurrentStage++;

  return CurrentStage != SchedStages.end();
}

bool GCNSchedStrategy::hasNextStage() const {
  assert(CurrentStage);
  return std::next(CurrentStage) != SchedStages.end();
}

GCNSchedStageID GCNSchedStrategy::getNextStage() const {
  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
  return *std::next(CurrentStage);
}
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
}

GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
}
bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Avoid spilling by exceeding the register limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;

    // Avoid critical resource consumption and balance the schedule.
    TryCand.initResourceDelta(DAG, SchedModel);
    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
    if (tryGreater(TryCand.ResDelta.DemandedResources,
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
                   ResourceDemand))
      return TryCand.Reason != NoCand;

    // Unconditionally try to reduce latency.
    if (tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  const SUnit *CandNextClusterSU =
      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  const SUnit *TryCandNextClusterSU =
      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {

  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  if (RelaxedOcc) {
    MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
    if (MinOccupancy != StartingOccupancy)
      LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
                        << ".\n");
  }
}
std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  switch (SchedStageID) {
  case GCNSchedStageID::OccInitialSchedule:
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
  case GCNSchedStageID::PreRARematerialize:
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
  case GCNSchedStageID::ILPInitialSchedule:
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}
void GCNScheduleDAGMILive::schedule() {
  // Collect all scheduling regions. The actual scheduling is performed in
  // GCNScheduleDAGMILive::finalizeSchedule.
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
}

GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
  GCNDownwardRPTracker RPTracker(*LIS);
  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}
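
// Compute cached live-in sets and max register pressure for the scheduling
// regions of \p MBB by walking the block once with a downward RP tracker.
// When safe, also caches the block's live-outs as the single successor's
// live-ins.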
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has only one successor then live-ins of that successor are
  // live-outs of the current block. We can reuse the calculated live set if
  // the successor will be sent to scheduling past the current block.
  //
  // However, due to a bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Work around that by sticking to a one-to-one
  // relationship, i.e. one predecessor with one successor block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1) {
    auto *Candidate = *MBB->succ_begin();
    if (!Candidate->empty() && Candidate->pred_size() == 1) {
      SlotIndexes *Ind = LIS->getSlotIndexes();
      if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
        OnlySucc = Candidate;
    }
  }

  // Scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
    }
    RPTracker.advanceToNext();
    RPTracker.advanceBeforeNext();
  }

  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    RPTracker.advanceBeforeNext();
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}
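
// Map the first non-debug instruction of each block's first recorded region
// to the registers live before it; the result seeds the per-block RP
// trackers above.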
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getBBLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> BBStarters;
  BBStarters.reserve(Regions.size());
  auto I = Regions.rbegin(), E = Regions.rend();
  auto *BB = I->first->getParent();
  do {
    auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
    BBStarters.push_back(MI);
    do {
      ++I;
    } while (I != E && I->first->getParent() == BB);
  } while (I != E);
  return getLiveRegMap(BBStarters, false /*After*/, *LIS);
}
void GCNScheduleDAGMILive::finalizeSchedule() {
  // Start actual scheduling here. This function is called by the base
  // MachineScheduler after all regions have been recorded by
  // GCNScheduleDAGMILive::schedule().
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RescheduleRegions.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithMinOcc.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
  RescheduleRegions.set();
  RegionsWithHighRP.reset();
  RegionsWithExcessRP.reset();
  RegionsWithMinOcc.reset();
  RegionsWithIGLPInstrs.reset();

  runSchedStages();
}
void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  if (!Regions.empty())
    BBLiveInMap = getBBLiveInMap();

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  while (S.advanceStage()) {
    auto Stage = createSchedStage(S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Setup for scheduling the region and check whether it should be skipped.
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      ScheduleDAGMILive::schedule();
      Stage->finalizeGCNRegion();
    }

    Stage->finalizeGCNSchedStage();
  }
}
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
  switch (StageID) {
  case GCNSchedStageID::OccInitialSchedule:
    OS << "Max Occupancy Initial Schedule";
    break;
  case GCNSchedStageID::UnclusteredHighRPReschedule:
    OS << "Unclustered High Register Pressure Reschedule";
    break;
  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
    OS << "Clustered Low Occupancy Reschedule";
    break;
  case GCNSchedStageID::PreRARematerialize:
    OS << "Pre-RA Rematerialize";
    break;
  case GCNSchedStageID::ILPInitialSchedule:
    OS << "Max ILP Initial Schedule";
    break;
  }
  return OS;
}
GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}

bool GCNSchedStage::initGCNSchedStage() {
  if (!DAG.LIS)
    return false;

  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
  return true;
}
bool UnclusteredHighRPStage::initGCNSchedStage() {
  if (DisableUnclusterHighRP)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(
      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));

  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase occupancy target in the region.
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;
  if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
    MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << DAG.MinOccupancy << ".\n");

  return true;
}
bool ClusteredLowOccStage::initGCNSchedStage() {
  if (DisableClusteredLowOccupancy)
    return false;

  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  // Don't bother trying to improve ILP in lower RP regions if occupancy has
  // not been dropped. All regions will have already been scheduled with the
  // ideal occupancy targets.
  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
             << DAG.MinOccupancy << ".\n");
  return true;
}
bool PreRARematStage::initGCNSchedStage() {
  if (!GCNSchedStage::initGCNSchedStage())
    return false;

  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
    return false;

  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
  // Check maximum occupancy
  if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
      DAG.MinOccupancy)
    return false;

  // FIXME: This pass will invalidate cached MBBLiveIns for regions
  // in between the defs and the region we sink the def to. Cached pressure
  // for regions where a def is sunk from will also be invalidated. Will
  // need to be fixed if there is another pass after this pass.
  assert(!S.hasNextStage());

  collectRematerializableInstructions();
  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with improved occupancy of "
             << DAG.MinOccupancy << " from rematerializing\n");
  return true;
}
void GCNSchedStage::finalizeGCNSchedStage() {
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}

void UnclusteredHighRPStage::finalizeGCNSchedStage() {
  SavedMutations.swap(DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
      DAG.RegionsWithMinOcc[IDX] =
          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;

    LLVM_DEBUG(dbgs() << StageID
                      << " stage successfully increased occupancy to "
                      << DAG.MinOccupancy << '\n');
  }

  GCNSchedStage::finalizeGCNSchedStage();
}
bool GCNSchedStage::initGCNRegion() {
  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

  // Skip empty scheduling regions (0 or 1 schedulable instructions).
  if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n  From: " << *DAG.begin() << "    To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save original instruction order before scheduling for possible revert.
  Unsched.clear();
  Unsched.reserve(DAG.NumRegionInstrs);
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule) {
    for (auto &I : DAG) {
      Unsched.push_back(&I);
      if (I.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
          I.getOpcode() == AMDGPU::IGLP_OPT)
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(&I);
  }

  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure: "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
    SavedMutations.clear();
    SavedMutations.swap(DAG.Mutations);
    bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
                          StageID == GCNSchedStageID::ILPInitialSchedule;
    DAG.addMutation(createIGroupLPDAGMutation(
        IsInitialStage ? AMDGPU::SchedulingPhase::Initial
                       : AMDGPU::SchedulingPhase::PreRAReentry));
  }

  return true;
}
bool UnclusteredHighRPStage::initGCNRegion() {
  // Only reschedule regions with the minimum occupancy or regions that may
  // have spilling (excess register pressure).
  if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
       DAG.MinOccupancy <= InitialOccupancy) &&
      !DAG.RegionsWithExcessRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool ClusteredLowOccStage::initGCNRegion() {
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage so the region
  // may be overly constrained even if it was already rescheduled.
  if (!DAG.RegionsWithHighRP[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}

bool PreRARematStage::initGCNRegion() {
  if (!DAG.RescheduleRegions[RegionIdx])
    return false;

  return GCNSchedStage::initGCNRegion();
}
void GCNSchedStage::setupNewBlock() {
  if (CurrentMBB)
    DAG.finishBlock();

  CurrentMBB = DAG.RegionBegin->getParent();
  DAG.startBlock(CurrentMBB);
  // Get real RP for the region if it hasn't been calculated before. After the
  // initial schedule stage real RP will be collected after scheduling.
  if (StageID == GCNSchedStageID::OccInitialSchedule ||
      StageID == GCNSchedStageID::ILPInitialSchedule)
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}
void GCNSchedStage::finalizeGCNRegion() {
  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
  DAG.RescheduleRegions[RegionIdx] = false;
  if (S.HasHighPressure)
    DAG.RegionsWithHighRP[RegionIdx] = true;

  // Revert scheduling if we have dropped occupancy or there is some other
  // reason that the original schedule is better.
  checkScheduling();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
      StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
    SavedMutations.swap(DAG.Mutations);

  DAG.exitRegion();
  RegionIdx++;
}
void GCNSchedStage::checkScheduling() {
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);
  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  unsigned TargetOccupancy =
      std::min(S.getTargetOccupancy(), ST.getOccupancyWithLocalMemSize(MF));
  unsigned WavesAfter =
      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
  unsigned WavesBefore =
      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the
  // just scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(DAG.MinOccupancy);
    DAG.RegionsWithMinOcc.reset();
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }

  // The maximum number of arch VGPR on non-unified register file, or the
  // maximum VGPR + AGPR in the unified register file case.
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  // The maximum number of arch VGPR for both unified and non-unified register
  // file.
  unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);

  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
      PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RescheduleRegions[RegionIdx] = true;
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
  if (shouldRevertScheduling(WavesAfter)) {
    revertScheduling();
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
  }
}
unsigned
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
                                      DenseMap<unsigned, unsigned> &ReadyCycles,
                                      const TargetSchedModel &SM) {
  unsigned ReadyCycle = CurrCycle;
  for (auto &D : SU.Preds) {
    if (D.isAssignedRegDep()) {
      MachineInstr *DefMI = D.getSUnit()->getInstr();
      unsigned Latency = SM.computeInstrLatency(DefMI);
      unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
      ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
    }
  }
  ReadyCycles[SU.NodeNum] = ReadyCycle;
  return ReadyCycle;
}
struct EarlierIssuingCycle {
  bool operator()(std::pair<MachineInstr *, unsigned> A,
                  std::pair<MachineInstr *, unsigned> B) const {
    return A.second < B.second;
  }
};

static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                                        EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction\n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
    IPrev = I.second;
  }
}
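
// Build a latency metric for the given schedule: issue the instructions in
// order, accumulate the bubble cycles spent waiting on operands, and return
// the schedule length together with the accumulated bubbles.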
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &SU : InputSchedule) {
    unsigned ReadyCycle =
        computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
    ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
    CurrCycle = ++ReadyCycle;
  }
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");

  return ScheduleMetrics(CurrCycle, SumBubbles);
}
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  for (auto &MI : DAG) {
    SUnit *SU = DAG.getSUnit(&MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
    SumBubbles += ReadyCycle - CurrCycle;
    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
    CurrCycle = ++ReadyCycle;
  }
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");

  return ScheduleMetrics(CurrCycle, SumBubbles);
}
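
// Common revert criterion shared by all stages: undo the new schedule
// whenever it drops occupancy below the function's current minimum.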
bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (WavesAfter < DAG.MinOccupancy)
    return true;

  return false;
}

bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
       mayCauseSpilling(WavesAfter)) ||
      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    return true;
  }

  // Do not attempt to relax schedule even more if we are already spilling.
  if (isRegionWithExcessRP())
    return false;

  LLVM_DEBUG(
      dbgs() << "\n\t *** In shouldRevertScheduling ***\n"
             << " *********** BEFORE UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
  LLVM_DEBUG(
      dbgs() << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
  unsigned WavesBefore =
      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
       NewMetric) /
      ScheduleMetrics::ScaleFactor;
  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
                    << MAfter << "Profit: " << Profit << "\n");
  return Profit < ScheduleMetrics::ScaleFactor;
}
bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (PressureAfter == PressureBefore)
    return false;

  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
    return true;

  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}

bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
  if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
      !PressureAfter.less(MF, PressureBefore)) {
    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    return true;
  }

  return false;
}
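
// Restore the original pre-scheduling instruction order for the region and
// bring LiveIntervals, debug values, and the cached region boundaries back
// in sync with it.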
void GCNSchedStage::revertScheduling() {
  DAG.RegionsWithMinOcc[RegionIdx] =
      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  DAG.RescheduleRegions[RegionIdx] =
      S.hasNextStage() &&
      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
  DAG.RegionEnd = DAG.RegionBegin;
  int SkippedDebugInstr = 0;
  for (MachineInstr *MI : Unsched) {
    if (MI->isDebugInstr()) {
      ++SkippedDebugInstr;
      continue;
    }

    if (MI->getIterator() != DAG.RegionEnd) {
      DAG.BB->remove(MI);
      DAG.BB->insert(DAG.RegionEnd, MI);
      if (!MI->isDebugInstr())
        DAG.LIS->handleMove(*MI, true);
    }

    // Reset read-undef flags and update them later.
    for (auto &Op : MI->all_defs())
      Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
    if (!MI->isDebugInstr()) {
      if (DAG.ShouldTrackLaneMasks) {
        // Adjust liveness and add missing dead+read-undef flags.
        SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
        RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
      } else {
        // Adjust for missing dead-def flags.
        RegOpers.detectDeadDefs(*MI, *DAG.LIS);
      }
    }
    DAG.RegionEnd = MI->getIterator();
    ++DAG.RegionEnd;
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // After reverting schedule, debug instrs will now be at the end of the block
  // and RegionEnd will point to the first debug instr. Increment RegionEnd
  // past debug instrs to the actual end of the scheduling region.
  while (SkippedDebugInstr-- > 0)
    ++DAG.RegionEnd;

  // If Unsched.front() instruction is a debug instruction, this will actually
  // shrink the region since we moved all debug instructions to the end of the
  // block. Find the first instruction that is not a debug instruction.
  DAG.RegionBegin = Unsched.front()->getIterator();
  if (DAG.RegionBegin->isDebugInstr()) {
    for (MachineInstr *MI : Unsched) {
      if (MI->isDebugInstr())
        continue;
      DAG.RegionBegin = MI->getIterator();
      break;
    }
  }

  // Then move the debug instructions back into their correct place and set
  // RegionBegin and RegionEnd if needed.
  DAG.placeDebugValues();

  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
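
// Gather single-def, single-use VGPR defs that are trivially rematerializable
// and live into regions currently at MinOccupancy, keyed by region so that
// sinkTriviallyRematInsts() can update the cached pressure incrementally.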
void PreRARematStage::collectRematerializableInstructions() {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
    Register Reg = Register::index2VirtReg(I);
    if (!DAG.LIS->hasInterval(Reg))
      continue;

    // TODO: Handle AGPR and SGPR rematerialization
    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
      continue;

    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
    MachineInstr *Def = Op->getParent();
    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
      continue;

    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
    if (Def->getParent() == UseI->getParent())
      continue;

    // We are only collecting defs that are defined in another block and are
    // live-through or used inside regions at MinOccupancy. This means that the
    // register must be in the live-in set for the region.
    bool AddedToRematList = false;
    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
      auto It = DAG.LiveIns[I].find(Reg);
      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
        if (DAG.RegionsWithMinOcc[I]) {
          RematerializableInsts[I][Def] = UseI;
          AddedToRematList = true;
        }

        // Collect regions with rematerializable reg as live-in to avoid
        // searching later when updating RP.
        RematDefToLiveInRegions[Def].push_back(I);
      }
    }
    if (!AddedToRematList)
      RematDefToLiveInRegions.erase(Def);
  }
}
bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                              const TargetInstrInfo *TII) {
  // Temporary copies of cached variables we will be modifying and replacing if
  // sinking succeeds.
  SmallVector<
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
      NewRegions;
  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
  DenseMap<unsigned, GCNRegPressure> NewPressure;
  BitVector NewRescheduleRegions;
  LiveIntervals *LIS = DAG.LIS;

  NewRegions.resize(DAG.Regions.size());
  NewRescheduleRegions.resize(DAG.Regions.size());

  // Collect only regions that have a rematerializable def as a live-in.
  SmallSet<unsigned, 16> ImpactedRegions;
  for (const auto &It : RematDefToLiveInRegions)
    ImpactedRegions.insert(It.second.begin(), It.second.end());

  // Make copies of register pressure and live-ins cache that will be updated
  // as we rematerialize.
  for (auto Idx : ImpactedRegions) {
    NewPressure[Idx] = DAG.Pressure[Idx];
    NewLiveIns[Idx] = DAG.LiveIns[Idx];
  }
  NewRegions = DAG.Regions;
  NewRescheduleRegions.reset();

  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
  bool Improved = false;
  for (auto I : ImpactedRegions) {
    if (!DAG.RegionsWithMinOcc[I])
      continue;

    Improved = false;
    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
    int SGPRUsage = NewPressure[I].getSGPRNum();

    // TODO: Handle occupancy drop due to AGPR and SGPR.
    // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
      break;

    // The occupancy of this region could have been improved by a previous
    // iteration's sinking of defs.
    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
      NewRescheduleRegions[I] = true;
      Improved = true;
      continue;
    }

    // First check if we have enough trivially rematerializable instructions to
    // improve occupancy. Optimistically assume all instructions we are able to
    // sink decreased RP.
    int TotalSinkableRegs = 0;
    for (const auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      Register DefReg = Def->getOperand(0).getReg();
      TotalSinkableRegs +=
          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
    }
    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
    // If in the most optimistic scenario, we cannot improve occupancy, then do
    // not attempt to sink any instructions.
    if (OptimisticOccupancy <= DAG.MinOccupancy)
      break;

    unsigned ImproveOccupancy = 0;
    SmallVector<MachineInstr *, 4> SinkedDefs;
    for (auto &It : RematerializableInsts[I]) {
      MachineInstr *Def = It.first;
      MachineBasicBlock::iterator InsertPos =
          MachineBasicBlock::iterator(It.second);
      Register Reg = Def->getOperand(0).getReg();
      // Rematerialize MI to its use block. Since we are only rematerializing
      // instructions that do not have any virtual reg uses, we do not need to
      // call LiveRangeEdit::allUsesAvailableAt() and
      // LiveRangeEdit::canRematerializeAt().
      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
      MachineInstr *NewMI = &*std::prev(InsertPos);
      LIS->InsertMachineInstrInMaps(*NewMI);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
      InsertedMIToOldDef[NewMI] = Def;

      // Update region boundaries in the scheduling region we sunk from since
      // we may sink an instruction that was at the beginning or end of its
      // region.
      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
                                 /*Removing =*/true);

      // Update region boundaries in the region we sunk to.
      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);

      LaneBitmask PrevMask = NewLiveIns[I][Reg];
      // FIXME: Also update cached pressure for where the def was sunk from.
      // Update RP for all regions that have this reg as a live-in and remove
      // the reg from all regions as a live-in.
      for (auto Idx : RematDefToLiveInRegions[Def]) {
        NewLiveIns[Idx].erase(Reg);
        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
          // Def is live-through and not used in this block.
          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
        } else {
          // Def is used and rematerialized into this block.
          GCNDownwardRPTracker RPT(*LIS);
          auto *NonDbgMI = &*skipDebugInstructionsForward(
              NewRegions[Idx].first, NewRegions[Idx].second);
          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
          RPT.advance(NewRegions[Idx].second);
          NewPressure[Idx] = RPT.moveMaxPressure();
        }
      }

      SinkedDefs.push_back(Def);
      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
      if (ImproveOccupancy > DAG.MinOccupancy)
        break;
    }

    // Remove defs we just sunk from all regions' list of sinkable defs.
    for (auto &Def : SinkedDefs)
      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
        RematerializableInsts[TrackedIdx].erase(Def);

    if (ImproveOccupancy <= DAG.MinOccupancy)
      break;

    NewRescheduleRegions[I] = true;
    Improved = true;
  }

  if (!Improved) {
    // Occupancy was not improved for all regions that were at MinOccupancy.
    // Undo sinking and remove newly rematerialized instructions.
    for (auto &Entry : InsertedMIToOldDef) {
      MachineInstr *MI = Entry.first;
      MachineInstr *OldMI = Entry.second;
      Register Reg = MI->getOperand(0).getReg();
      LIS->RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
      OldMI->clearRegisterDeads(Reg);
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    return false;
  }

  // Occupancy was improved for all regions.
  for (auto &Entry : InsertedMIToOldDef) {
    MachineInstr *MI = Entry.first;
    MachineInstr *OldMI = Entry.second;

    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
    DAG.BBLiveInMap.erase(OldMI);

    // Remove OldMI and update LIS
    Register Reg = MI->getOperand(0).getReg();
    LIS->RemoveMachineInstrFromMaps(*OldMI);
    OldMI->eraseFromParent();
    LIS->removeInterval(Reg);
    LIS->createAndComputeVirtRegInterval(Reg);
  }

  // Update live-ins, register pressure, and regions caches.
  for (auto Idx : ImpactedRegions) {
    DAG.LiveIns[Idx] = NewLiveIns[Idx];
    DAG.Pressure[Idx] = NewPressure[Idx];
    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
  }
  DAG.Regions = NewRegions;
  DAG.RescheduleRegions = NewRescheduleRegions;

  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

  return true;
}
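
// A def qualifies only if TII considers it trivially rematerializable and it
// reads no virtual registers, so it can be recreated at its use without
// extending any other live range.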
// Copied from MachineLICM
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
  if (!DAG.TII->isTriviallyReMaterializable(MI))
    return false;

  for (const MachineOperand &MO : MI.all_uses())
    if (MO.getReg().isVirtual())
      return false;

  return true;
}
// When removing, we will have to check both beginning and ending of the region.
// When inserting, we will only have to check if we are inserting NewMI in front
// of a scheduling region and do not need to check the ending since we will only
// ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
    SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
                              MachineBasicBlock::iterator>> &RegionBoundaries,
    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
  unsigned I = 0, E = RegionBoundaries.size();
  // Search for first region of the block where MI is located
  while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
    ++I;

  for (; I != E; ++I) {
    if (MI->getParent() != RegionBoundaries[I].first->getParent())
      return;

    if (Removing && MI == RegionBoundaries[I].first &&
        MI == RegionBoundaries[I].second) {
      // MI is in a region with size 1, after removing, the region will be
      // size 0, set RegionBegin and RegionEnd to pass end of block iterator.
      RegionBoundaries[I] =
          std::pair(MI->getParent()->end(), MI->getParent()->end());
      return;
    }
    if (MI == RegionBoundaries[I].first) {
      if (Removing)
        RegionBoundaries[I] =
            std::pair(std::next(MI), RegionBoundaries[I].second);
      else
        // Inserted NewMI in front of region, set new RegionBegin to NewMI
        RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
                                        RegionBoundaries[I].second);
      return;
    }
    if (Removing && MI == RegionBoundaries[I].second) {
      RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
      return;
    }
  }
}
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  return std::any_of(
      DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
        unsigned Opc = MI->getOpcode();
        return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
      });
}

GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
    bool RemoveKillFlags)
    : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}

void GCNPostScheduleDAGMILive::schedule() {
  HasIGLPInstrs = hasIGLPInstrs(this);
  if (HasIGLPInstrs) {
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
  }

  ScheduleDAGMI::schedule();
}

void GCNPostScheduleDAGMILive::finalizeSchedule() {
  if (HasIGLPInstrs)
    SavedMutations.swap(Mutations);

  ScheduleDAGMI::finalizeSchedule();
}