//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact a lot of instructions in the
// shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from
// different wavefronts of different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
//   We want the blocks to help control register usage and hide high
//   latencies later. To help control register usage, we typically want all
//   local computations, when for example you create a result that can be
//   consumed right away, to be contained in a block. Block inputs and
//   outputs would typically be important results that are needed in several
//   locations of the shader. Since we do want blocks to help hide high
//   latencies, we want the instructions inside the block to have a minimal
//   set of dependencies on high latencies. It will make it easy to pick
//   blocks to hide specific high latencies.
//   The block creation algorithm is divided into several steps, and several
//   variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
//   At that step we take into account only register usage and hiding
//   low latency instructions.
//
// Third the block order is chosen. There we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency to get the cache (or global mem) data to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if wavefront count is high, we
// have to try to have as many instructions hiding high latencies as possible.
// The OpenCL doc says for example latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.
//
// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// per line.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
// cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.
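//
// A rough sanity check of the 400-cycles / 10-wavefronts figure above (our
// own arithmetic, not from the docs): with one instruction issued per
// wavefront every 4 cycles and 10 wavefronts interleaved, about 40 cycles
// separate two consecutive instructions of the same wavefront, so roughly
// 400 / 40 = 10 of its own instructions cover a 400-cycle access.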

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:      return "NOCAND";
  case RegUsage:    return "REGUSAGE";
  case Latency:     return "LATENCY";
  case Successor:   return "SUCCESSOR";
  case Depth:       return "DEPTH";
  case NodeOrder:   return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif
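
// tryLess / tryGreater below compare one candidate criterion at a time: a
// strictly better TryVal marks TryCand as the winner (TryCand.Reason =
// Reason), a strictly worse one keeps Cand (possibly lowering Cand.Reason),
// and both decisive cases return true. On a tie, setRepeat(Reason) is
// recorded and false is returned so the caller moves to the next criterion.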
namespace llvm::SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace llvm::SISched

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If in the block there are a lot of constant loads, the SGPR usage
  // could go quite high, thus going above the arbitrary limit of 60 will
  // encourage using the already loaded constants (in order to release some
  // SGPRs) before loading more.
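  //
  // Each tryLess / tryGreater test below is one strict priority level: the
  // first decisive comparison ends the evaluation, and later criteria are
  // only reached on ties.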
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}

SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}

// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}

// Returns true if the register was set between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (RegMaskPair.RegUnit.isVirtual())
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }

  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of isDefBetween removes the case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    Register Reg = RegMaskPair.RegUnit;
    if (Reg.isVirtual() &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bits registers
  // for example, but they actually correspond to 4 actual 32 bits registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionalPressure.
  InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}

void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}

/// Release the successors of SU that are inside the block
/// (InOrOutBlock = true) or outside it (InOrOutBlock = false).
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert(!SU->NumPredsLeft);
  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}

void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                     SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.emplace_back(Succ, Kind);

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }

  return B->second;
}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] ||
              (CurrentColoring[k] != ProposedColor &&
               CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there'll be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU.
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // These cannot live in another block (because they
        // depend (order dependency) on one of the
        // instructions in the block, and are required for the
        // high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.
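  //
  // Illustrative example (not from the original comments): with high latency
  // groups H1 and H2, an SU reachable only from H1 depends on the combination
  // {H1}, one reachable from both depends on {H1, H2}, and each distinct
  // combination is mapped to its own fresh color via ColorCombinations.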

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency

  for (const SUnit &SU : DAG->SUnits) {
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU.NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU.NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU.NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*llvm::max_element(CurrentBottomUpReservedDependencyColoring) == 0 &&
      *llvm::max_element(CurrentTopDownReservedDependencyColoring) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction into that block.
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}

void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: Vgpr constant loading.
    // Low latency instructions usually have a predecessor (the address)
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}

void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}

void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}

void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all
          // other exports that are not depended on by non-exports, directly
          // or indirectly. Simply skipping this particular export but
          // grouping all others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
             for (SIScheduleBlock *Block : CurrentBlocks)
               Block->printDebug(true););
}

// Two functions taken from Codegen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}
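
// The block-level sort below is, in effect, Kahn's algorithm run from the
// leaves: blocks with no successors seed the worklist and indices are
// assigned from DAGSize downward, so every predecessor ends up with a
// smaller top-down index than its successors (checked by the assert at the
// end).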
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}

void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG(for (SIScheduleBlock *Block : CurrentBlocks)
               Block->printDebug(true););
}

void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks  BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output.
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (SIScheduleBlock *Block : Blocks) {
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal.
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (SIScheduleBlock *Block : Blocks) {
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled,
                       TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block
                  : ReadyBlocks) dbgs()
             << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg
                  : LiveRegs) dbgs()
             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}

// Tracking of currently alive registers to determine VGPR Usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (Register Reg : Regs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert(Pos != LiveRegs.end() && // Reg must be live.
           LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
           LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
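  // Entries come out negative when scheduling this block would release
  // pressure (an input register whose last remaining consumer is this block
  // dies here), and positive when the block's outputs add new live registers.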

  for (Register Reg : InRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (Register Reg : OutRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}

// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (SIScheduleBlock *Block : ScheduledBlocks) {
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from scheduleDAG.cpp
// Does a topological sort over the SUs.
// Both TopDown and BottomUp
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}
// Move low latencies further from their user without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would
// take into account SGPR usage (based on VGPR usage
// and the corresponding wavefront count), and that would
// try to merge groups of loads when it makes sense, etc.
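// Schematic example (hypothetical schedule, all SUs independent except the
// final use): given the order [a, b, load, use] where 'load' is low latency
// and 'use' consumes its result, the loop below hoists the load to produce
// [load, a, b, use], so that 'a' and 'b' execute while the load completes.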
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep &PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also moves the COPY instructions on which
    // the low latency instructions depend.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep &SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}
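// Restore the dependency counters (and isScheduled flags) saved in
// SUnitsLinksBackup, so that the same DAG can be scheduled again when
// another variant is attempted.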
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}
// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
template<typename _Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (_Iterator RegI = First; RegI != End; ++RegI) {
    Register Reg = *RegI;
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
        SgprUsage += PSetI.getWeight();
    }
  }
}
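// Entry point of the scheduler: builds the DAG, classifies each SU as low
// or high latency, runs the default variant, retries more register-friendly
// variants while VGPR usage stays high, hoists low latency instructions,
// and finally emits the chosen instruction order.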
void SIScheduleDAGMI::schedule()
{
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();

  LLVM_DEBUG(dump());
  if (ViewMISchedDAGs)
    viewGraph();

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      bool OffsetIsScalable;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
      IsHighLatencySU[i] = 1;
  }
  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
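  // On GCN hardware a wavefront has at most 256 VGPRs available; the more
  // VGPRs a schedule needs, the fewer wavefronts can run concurrently, and
  // close to the limit the register allocator may have to spill. The 180
  // and 200 VGPR thresholds below therefore trigger progressively more
  // register-friendly variants.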
  // If VGPR usage is extremely high, try other good performing variants
  // which could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // If VGPR usage is still extremely high, we may spill. Try other variants
  // which are less performing, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();
  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (unsigned I : ScheduledSUnits) {
    SUnit *SU = &SUnits[I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}