//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//
14 #include "SIMachineScheduler.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/CodeGen/LiveInterval.h"
22 #include "llvm/CodeGen/LiveIntervals.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/RegisterPressure.h"
27 #include "llvm/CodeGen/SlotIndexes.h"
28 #include "llvm/CodeGen/TargetRegisterInfo.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/ErrorHandling.h"
31 #include "llvm/Support/raw_ostream.h"
41 #define DEBUG_TYPE "machine-scheduler"
// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
// for all the SGPR load instructions before your current instruction to
// have finished.
// . When accessing the result of an VGPR load instruction, you have to wait
// for all the VGPR load instructions previous to the VGPR load instruction
// you are interested in to finish.
// . The less the register pressure, the best load latencies are hidden
//
// Moreover some specifities (like the fact a lot of instructions in the shader
// have few dependencies) makes the generic scheduler have some unpredictable
// behaviours. For example when register pressure becomes high, it can either
// manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in VGPR you won't need until the end of the shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if different types, etc)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
//   We want the blocks to help control register usage and hide high latencies
//   later. To help control register usage, we typically want all local
//   computations, when for example you create a result that can be consumed
//   right away, to be contained in a block. Block inputs and outputs would
//   typically be important results that are needed in several locations of
//   the shader. Since we do want blocks to help hide high latencies, we want
//   the instructions inside the block to have a minimal set of dependencies
//   on high latencies. It will make it easy to pick blocks to hide specific
//   high latencies.
//   The block creation algorithm is divided into several steps, and several
//   variants can be tried during the scheduling process.
//
// Second the order of the instructions inside the blocks is chosen.
//   At that step we do take into account only register usage and hiding
//   low latency instructions
//
// Third the block order is chosen, there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency to get the cache (or global mem) data go to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, thus why even if wavefront count is high, we have
// to try have as many instructions hiding high latencies as possible.
// The OpenCL doc says for example latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.
//
// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
// cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling
// As we expect the driver to be able to preload the constants soon.
143 static const char *getReasonStr(SIScheduleCandReason Reason
) {
145 case NoCand
: return "NOCAND";
146 case RegUsage
: return "REGUSAGE";
147 case Latency
: return "LATENCY";
148 case Successor
: return "SUCCESSOR";
149 case Depth
: return "DEPTH";
150 case NodeOrder
: return "ORDER";
152 llvm_unreachable("Unknown reason!");
159 static bool tryLess(int TryVal
, int CandVal
,
160 SISchedulerCandidate
&TryCand
,
161 SISchedulerCandidate
&Cand
,
162 SIScheduleCandReason Reason
) {
163 if (TryVal
< CandVal
) {
164 TryCand
.Reason
= Reason
;
167 if (TryVal
> CandVal
) {
168 if (Cand
.Reason
> Reason
)
169 Cand
.Reason
= Reason
;
172 Cand
.setRepeat(Reason
);
176 static bool tryGreater(int TryVal
, int CandVal
,
177 SISchedulerCandidate
&TryCand
,
178 SISchedulerCandidate
&Cand
,
179 SIScheduleCandReason Reason
) {
180 if (TryVal
> CandVal
) {
181 TryCand
.Reason
= Reason
;
184 if (TryVal
< CandVal
) {
185 if (Cand
.Reason
> Reason
)
186 Cand
.Reason
= Reason
;
189 Cand
.setRepeat(Reason
);
192 } // end namespace SISched
193 } // end namespace llvm
// SIScheduleBlock //
197 void SIScheduleBlock::addUnit(SUnit
*SU
) {
198 NodeNum2Index
[SU
->NodeNum
] = SUnits
.size();
199 SUnits
.push_back(SU
);
203 void SIScheduleBlock::traceCandidate(const SISchedCandidate
&Cand
) {
205 dbgs() << " SU(" << Cand
.SU
->NodeNum
<< ") " << getReasonStr(Cand
.Reason
);
210 void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate
&Cand
,
211 SISchedCandidate
&TryCand
) {
212 // Initialize the candidate if needed.
213 if (!Cand
.isValid()) {
214 TryCand
.Reason
= NodeOrder
;
218 if (Cand
.SGPRUsage
> 60 &&
219 SISched::tryLess(TryCand
.SGPRUsage
, Cand
.SGPRUsage
,
220 TryCand
, Cand
, RegUsage
))
223 // Schedule low latency instructions as top as possible.
224 // Order of priority is:
225 // . Low latency instructions which do not depend on other low latency
226 // instructions we haven't waited for
227 // . Other instructions which do not depend on low latency instructions
228 // we haven't waited for
230 // . All other instructions
231 // Goal is to get: low latency instructions - independent instructions
232 // - (eventually some more low latency instructions)
233 // - instructions that depend on the first low latency instructions.
234 // If in the block there is a lot of constant loads, the SGPR usage
235 // could go quite high, thus above the arbitrary limit of 60 will encourage
236 // use the already loaded constants (in order to release some SGPRs) before
238 if (SISched::tryLess(TryCand
.HasLowLatencyNonWaitedParent
,
239 Cand
.HasLowLatencyNonWaitedParent
,
240 TryCand
, Cand
, SIScheduleCandReason::Depth
))
243 if (SISched::tryGreater(TryCand
.IsLowLatency
, Cand
.IsLowLatency
,
244 TryCand
, Cand
, SIScheduleCandReason::Depth
))
247 if (TryCand
.IsLowLatency
&&
248 SISched::tryLess(TryCand
.LowLatencyOffset
, Cand
.LowLatencyOffset
,
249 TryCand
, Cand
, SIScheduleCandReason::Depth
))
252 if (SISched::tryLess(TryCand
.VGPRUsage
, Cand
.VGPRUsage
,
253 TryCand
, Cand
, RegUsage
))
256 // Fall through to original instruction order.
257 if (TryCand
.SU
->NodeNum
< Cand
.SU
->NodeNum
) {
258 TryCand
.Reason
= NodeOrder
;
262 SUnit
* SIScheduleBlock::pickNode() {
263 SISchedCandidate TopCand
;
265 for (SUnit
* SU
: TopReadySUs
) {
266 SISchedCandidate TryCand
;
267 std::vector
<unsigned> pressure
;
268 std::vector
<unsigned> MaxPressure
;
269 // Predict register usage after this instruction.
271 TopRPTracker
.getDownwardPressure(SU
->getInstr(), pressure
, MaxPressure
);
272 TryCand
.SGPRUsage
= pressure
[DAG
->getSGPRSetID()];
273 TryCand
.VGPRUsage
= pressure
[DAG
->getVGPRSetID()];
274 TryCand
.IsLowLatency
= DAG
->IsLowLatencySU
[SU
->NodeNum
];
275 TryCand
.LowLatencyOffset
= DAG
->LowLatencyOffset
[SU
->NodeNum
];
276 TryCand
.HasLowLatencyNonWaitedParent
=
277 HasLowLatencyNonWaitedParent
[NodeNum2Index
[SU
->NodeNum
]];
278 tryCandidateTopDown(TopCand
, TryCand
);
279 if (TryCand
.Reason
!= NoCand
)
280 TopCand
.setBest(TryCand
);
287 // Schedule something valid.
288 void SIScheduleBlock::fastSchedule() {
293 for (SUnit
* SU
: SUnits
) {
294 if (!SU
->NumPredsLeft
)
295 TopReadySUs
.push_back(SU
);
298 while (!TopReadySUs
.empty()) {
299 SUnit
*SU
= TopReadySUs
[0];
300 ScheduledSUnits
.push_back(SU
);
307 // Returns if the register was set between first and last.
308 static bool isDefBetween(unsigned Reg
,
309 SlotIndex First
, SlotIndex Last
,
310 const MachineRegisterInfo
*MRI
,
311 const LiveIntervals
*LIS
) {
312 for (MachineRegisterInfo::def_instr_iterator
313 UI
= MRI
->def_instr_begin(Reg
),
314 UE
= MRI
->def_instr_end(); UI
!= UE
; ++UI
) {
315 const MachineInstr
* MI
= &*UI
;
316 if (MI
->isDebugValue())
318 SlotIndex InstSlot
= LIS
->getInstructionIndex(*MI
).getRegSlot();
319 if (InstSlot
>= First
&& InstSlot
<= Last
)
325 void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock
,
326 MachineBasicBlock::iterator EndBlock
) {
327 IntervalPressure Pressure
, BotPressure
;
328 RegPressureTracker
RPTracker(Pressure
), BotRPTracker(BotPressure
);
329 LiveIntervals
*LIS
= DAG
->getLIS();
330 MachineRegisterInfo
*MRI
= DAG
->getMRI();
331 DAG
->initRPTracker(TopRPTracker
);
332 DAG
->initRPTracker(BotRPTracker
);
333 DAG
->initRPTracker(RPTracker
);
335 // Goes though all SU. RPTracker captures what had to be alive for the SUs
336 // to execute, and what is still alive at the end.
337 for (SUnit
* SU
: ScheduledSUnits
) {
338 RPTracker
.setPos(SU
->getInstr());
342 // Close the RPTracker to finalize live ins/outs.
343 RPTracker
.closeRegion();
345 // Initialize the live ins and live outs.
346 TopRPTracker
.addLiveRegs(RPTracker
.getPressure().LiveInRegs
);
347 BotRPTracker
.addLiveRegs(RPTracker
.getPressure().LiveOutRegs
);
349 // Do not Track Physical Registers, because it messes up.
350 for (const auto &RegMaskPair
: RPTracker
.getPressure().LiveInRegs
) {
351 if (Register::isVirtualRegister(RegMaskPair
.RegUnit
))
352 LiveInRegs
.insert(RegMaskPair
.RegUnit
);
355 // There is several possibilities to distinguish:
356 // 1) Reg is not input to any instruction in the block, but is output of one
357 // 2) 1) + read in the block and not needed after it
358 // 3) 1) + read in the block but needed in another block
359 // 4) Reg is input of an instruction but another block will read it too
360 // 5) Reg is input of an instruction and then rewritten in the block.
361 // result is not read in the block (implies used in another block)
362 // 6) Reg is input of an instruction and then rewritten in the block.
363 // result is read in the block and not needed in another block
364 // 7) Reg is input of an instruction and then rewritten in the block.
365 // result is read in the block but also needed in another block
366 // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
367 // We want LiveOutRegs to contain only Regs whose content will be read after
368 // in another block, and whose content was written in the current block,
369 // that is we want it to get 1, 3, 5, 7
370 // Since we made the MIs of a block to be packed all together before
371 // scheduling, then the LiveIntervals were correct, and the RPTracker was
372 // able to correctly handle 5 vs 6, 2 vs 3.
373 // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
374 // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
375 // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
376 // The use of findDefBetween removes the case 4.
377 for (const auto &RegMaskPair
: RPTracker
.getPressure().LiveOutRegs
) {
378 unsigned Reg
= RegMaskPair
.RegUnit
;
379 if (Register::isVirtualRegister(Reg
) &&
380 isDefBetween(Reg
, LIS
->getInstructionIndex(*BeginBlock
).getRegSlot(),
381 LIS
->getInstructionIndex(*EndBlock
).getRegSlot(), MRI
,
383 LiveOutRegs
.insert(Reg
);
387 // Pressure = sum_alive_registers register size
388 // Internally llvm will represent some registers as big 128 bits registers
389 // for example, but they actually correspond to 4 actual 32 bits registers.
390 // Thus Pressure is not equal to num_alive_registers * constant.
391 LiveInPressure
= TopPressure
.MaxSetPressure
;
392 LiveOutPressure
= BotPressure
.MaxSetPressure
;
394 // Prepares TopRPTracker for top down scheduling.
395 TopRPTracker
.closeTop();
398 void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock
,
399 MachineBasicBlock::iterator EndBlock
) {
403 // PreScheduling phase to set LiveIn and LiveOut.
404 initRegPressure(BeginBlock
, EndBlock
);
407 // Schedule for real now.
411 for (SUnit
* SU
: SUnits
) {
412 if (!SU
->NumPredsLeft
)
413 TopReadySUs
.push_back(SU
);
416 while (!TopReadySUs
.empty()) {
417 SUnit
*SU
= pickNode();
418 ScheduledSUnits
.push_back(SU
);
419 TopRPTracker
.setPos(SU
->getInstr());
420 TopRPTracker
.advance();
424 // TODO: compute InternalAdditionnalPressure.
425 InternalAdditionnalPressure
.resize(TopPressure
.MaxSetPressure
.size());
427 // Check everything is right.
429 assert(SUnits
.size() == ScheduledSUnits
.size() &&
430 TopReadySUs
.empty());
431 for (SUnit
* SU
: SUnits
) {
432 assert(SU
->isScheduled
&&
433 SU
->NumPredsLeft
== 0);
440 void SIScheduleBlock::undoSchedule() {
441 for (SUnit
* SU
: SUnits
) {
442 SU
->isScheduled
= false;
443 for (SDep
& Succ
: SU
->Succs
) {
444 if (BC
->isSUInBlock(Succ
.getSUnit(), ID
))
445 undoReleaseSucc(SU
, &Succ
);
448 HasLowLatencyNonWaitedParent
.assign(SUnits
.size(), 0);
449 ScheduledSUnits
.clear();
453 void SIScheduleBlock::undoReleaseSucc(SUnit
*SU
, SDep
*SuccEdge
) {
454 SUnit
*SuccSU
= SuccEdge
->getSUnit();
456 if (SuccEdge
->isWeak()) {
457 ++SuccSU
->WeakPredsLeft
;
460 ++SuccSU
->NumPredsLeft
;
463 void SIScheduleBlock::releaseSucc(SUnit
*SU
, SDep
*SuccEdge
) {
464 SUnit
*SuccSU
= SuccEdge
->getSUnit();
466 if (SuccEdge
->isWeak()) {
467 --SuccSU
->WeakPredsLeft
;
471 if (SuccSU
->NumPredsLeft
== 0) {
472 dbgs() << "*** Scheduling failed! ***\n";
473 DAG
->dumpNode(*SuccSU
);
474 dbgs() << " has been released too many times!\n";
475 llvm_unreachable(nullptr);
479 --SuccSU
->NumPredsLeft
;
482 /// Release Successors of the SU that are in the block or not.
483 void SIScheduleBlock::releaseSuccessors(SUnit
*SU
, bool InOrOutBlock
) {
484 for (SDep
& Succ
: SU
->Succs
) {
485 SUnit
*SuccSU
= Succ
.getSUnit();
487 if (SuccSU
->NodeNum
>= DAG
->SUnits
.size())
490 if (BC
->isSUInBlock(SuccSU
, ID
) != InOrOutBlock
)
493 releaseSucc(SU
, &Succ
);
494 if (SuccSU
->NumPredsLeft
== 0 && InOrOutBlock
)
495 TopReadySUs
.push_back(SuccSU
);
499 void SIScheduleBlock::nodeScheduled(SUnit
*SU
) {
501 assert (!SU
->NumPredsLeft
);
502 std::vector
<SUnit
*>::iterator I
= llvm::find(TopReadySUs
, SU
);
503 if (I
== TopReadySUs
.end()) {
504 dbgs() << "Data Structure Bug in SI Scheduler\n";
505 llvm_unreachable(nullptr);
507 TopReadySUs
.erase(I
);
509 releaseSuccessors(SU
, true);
510 // Scheduling this node will trigger a wait,
511 // thus propagate to other instructions that they do not need to wait either.
512 if (HasLowLatencyNonWaitedParent
[NodeNum2Index
[SU
->NodeNum
]])
513 HasLowLatencyNonWaitedParent
.assign(SUnits
.size(), 0);
515 if (DAG
->IsLowLatencySU
[SU
->NodeNum
]) {
516 for (SDep
& Succ
: SU
->Succs
) {
517 std::map
<unsigned, unsigned>::iterator I
=
518 NodeNum2Index
.find(Succ
.getSUnit()->NodeNum
);
519 if (I
!= NodeNum2Index
.end())
520 HasLowLatencyNonWaitedParent
[I
->second
] = 1;
523 SU
->isScheduled
= true;
526 void SIScheduleBlock::finalizeUnits() {
527 // We remove links from outside blocks to enable scheduling inside the block.
528 for (SUnit
* SU
: SUnits
) {
529 releaseSuccessors(SU
, false);
530 if (DAG
->IsHighLatencySU
[SU
->NodeNum
])
531 HighLatencyBlock
= true;
533 HasLowLatencyNonWaitedParent
.resize(SUnits
.size(), 0);
536 // we maintain ascending order of IDs
537 void SIScheduleBlock::addPred(SIScheduleBlock
*Pred
) {
538 unsigned PredID
= Pred
->getID();
540 // Check if not already predecessor.
541 for (SIScheduleBlock
* P
: Preds
) {
542 if (PredID
== P
->getID())
545 Preds
.push_back(Pred
);
547 assert(none_of(Succs
,
548 [=](std::pair
<SIScheduleBlock
*,
549 SIScheduleBlockLinkKind
> S
) {
550 return PredID
== S
.first
->getID();
552 "Loop in the Block Graph!");
555 void SIScheduleBlock::addSucc(SIScheduleBlock
*Succ
,
556 SIScheduleBlockLinkKind Kind
) {
557 unsigned SuccID
= Succ
->getID();
559 // Check if not already predecessor.
560 for (std::pair
<SIScheduleBlock
*, SIScheduleBlockLinkKind
> &S
: Succs
) {
561 if (SuccID
== S
.first
->getID()) {
562 if (S
.second
== SIScheduleBlockLinkKind::NoData
&&
563 Kind
== SIScheduleBlockLinkKind::Data
)
568 if (Succ
->isHighLatencyBlock())
569 ++NumHighLatencySuccessors
;
570 Succs
.push_back(std::make_pair(Succ
, Kind
));
572 assert(none_of(Preds
,
573 [=](SIScheduleBlock
*P
) { return SuccID
== P
->getID(); }) &&
574 "Loop in the Block Graph!");
578 void SIScheduleBlock::printDebug(bool full
) {
579 dbgs() << "Block (" << ID
<< ")\n";
583 dbgs() << "\nContains High Latency Instruction: "
584 << HighLatencyBlock
<< '\n';
585 dbgs() << "\nDepends On:\n";
586 for (SIScheduleBlock
* P
: Preds
) {
587 P
->printDebug(false);
590 dbgs() << "\nSuccessors:\n";
591 for (std::pair
<SIScheduleBlock
*, SIScheduleBlockLinkKind
> S
: Succs
) {
592 if (S
.second
== SIScheduleBlockLinkKind::Data
)
593 dbgs() << "(Data Dep) ";
594 S
.first
->printDebug(false);
598 dbgs() << "LiveInPressure " << LiveInPressure
[DAG
->getSGPRSetID()] << ' '
599 << LiveInPressure
[DAG
->getVGPRSetID()] << '\n';
600 dbgs() << "LiveOutPressure " << LiveOutPressure
[DAG
->getSGPRSetID()] << ' '
601 << LiveOutPressure
[DAG
->getVGPRSetID()] << "\n\n";
602 dbgs() << "LiveIns:\n";
603 for (unsigned Reg
: LiveInRegs
)
604 dbgs() << printVRegOrUnit(Reg
, DAG
->getTRI()) << ' ';
606 dbgs() << "\nLiveOuts:\n";
607 for (unsigned Reg
: LiveOutRegs
)
608 dbgs() << printVRegOrUnit(Reg
, DAG
->getTRI()) << ' ';
611 dbgs() << "\nInstructions:\n";
613 for (const SUnit
* SU
: SUnits
)
616 for (const SUnit
* SU
: SUnits
)
620 dbgs() << "///////////////////////\n";
// SIScheduleBlockCreator //
626 SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI
*DAG
) :
630 SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
633 SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant
) {
634 std::map
<SISchedulerBlockCreatorVariant
, SIScheduleBlocks
>::iterator B
=
635 Blocks
.find(BlockVariant
);
636 if (B
== Blocks
.end()) {
637 SIScheduleBlocks Res
;
638 createBlocksForVariant(BlockVariant
);
640 scheduleInsideBlocks();
642 Res
.Blocks
= CurrentBlocks
;
643 Res
.TopDownIndex2Block
= TopDownIndex2Block
;
644 Res
.TopDownBlock2Index
= TopDownBlock2Index
;
645 Blocks
[BlockVariant
] = Res
;
652 bool SIScheduleBlockCreator::isSUInBlock(SUnit
*SU
, unsigned ID
) {
653 if (SU
->NodeNum
>= DAG
->SUnits
.size())
655 return CurrentBlocks
[Node2CurrentBlock
[SU
->NodeNum
]]->getID() == ID
;
658 void SIScheduleBlockCreator::colorHighLatenciesAlone() {
659 unsigned DAGSize
= DAG
->SUnits
.size();
661 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
662 SUnit
*SU
= &DAG
->SUnits
[i
];
663 if (DAG
->IsHighLatencySU
[SU
->NodeNum
]) {
664 CurrentColoring
[SU
->NodeNum
] = NextReservedID
++;
670 hasDataDependencyPred(const SUnit
&SU
, const SUnit
&FromSU
) {
671 for (const auto &PredDep
: SU
.Preds
) {
672 if (PredDep
.getSUnit() == &FromSU
&&
673 PredDep
.getKind() == llvm::SDep::Data
)
679 void SIScheduleBlockCreator::colorHighLatenciesGroups() {
680 unsigned DAGSize
= DAG
->SUnits
.size();
681 unsigned NumHighLatencies
= 0;
683 int Color
= NextReservedID
;
685 std::set
<unsigned> FormingGroup
;
687 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
688 SUnit
*SU
= &DAG
->SUnits
[i
];
689 if (DAG
->IsHighLatencySU
[SU
->NodeNum
])
693 if (NumHighLatencies
== 0)
696 if (NumHighLatencies
<= 6)
698 else if (NumHighLatencies
<= 12)
703 for (unsigned SUNum
: DAG
->TopDownIndex2SU
) {
704 const SUnit
&SU
= DAG
->SUnits
[SUNum
];
705 if (DAG
->IsHighLatencySU
[SU
.NodeNum
]) {
706 unsigned CompatibleGroup
= true;
707 int ProposedColor
= Color
;
708 std::vector
<int> AdditionalElements
;
710 // We don't want to put in the same block
711 // two high latency instructions that depend
713 // One way would be to check canAddEdge
714 // in both directions, but that currently is not
715 // enough because there the high latency order is
716 // enforced (via links).
717 // Instead, look at the dependencies between the
718 // high latency instructions and deduce if it is
719 // a data dependency or not.
720 for (unsigned j
: FormingGroup
) {
722 std::vector
<int> SubGraph
;
723 // By construction (topological order), if SU and
724 // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
725 // in the parent graph of SU.
727 SubGraph
= DAG
->GetTopo()->GetSubGraph(SU
, DAG
->SUnits
[j
],
729 assert(!HasSubGraph
);
731 SubGraph
= DAG
->GetTopo()->GetSubGraph(DAG
->SUnits
[j
], SU
,
734 continue; // No dependencies between each other
735 else if (SubGraph
.size() > 5) {
736 // Too many elements would be required to be added to the block.
737 CompatibleGroup
= false;
741 // Check the type of dependency
742 for (unsigned k
: SubGraph
) {
743 // If in the path to join the two instructions,
744 // there is another high latency instruction,
745 // or instructions colored for another block
747 if (DAG
->IsHighLatencySU
[k
] ||
748 (CurrentColoring
[k
] != ProposedColor
&&
749 CurrentColoring
[k
] != 0)) {
750 CompatibleGroup
= false;
753 // If one of the SU in the subgraph depends on the result of SU j,
754 // there'll be a data dependency.
755 if (hasDataDependencyPred(DAG
->SUnits
[k
], DAG
->SUnits
[j
])) {
756 CompatibleGroup
= false;
760 if (!CompatibleGroup
)
762 // Same check for the SU
763 if (hasDataDependencyPred(SU
, DAG
->SUnits
[j
])) {
764 CompatibleGroup
= false;
767 // Add all the required instructions to the block
768 // These cannot live in another block (because they
769 // depend (order dependency) on one of the
770 // instruction in the block, and are required for the
771 // high latency instruction we add.
772 AdditionalElements
.insert(AdditionalElements
.end(),
773 SubGraph
.begin(), SubGraph
.end());
776 if (CompatibleGroup
) {
777 FormingGroup
.insert(SU
.NodeNum
);
778 for (unsigned j
: AdditionalElements
)
779 CurrentColoring
[j
] = ProposedColor
;
780 CurrentColoring
[SU
.NodeNum
] = ProposedColor
;
783 // Found one incompatible instruction,
784 // or has filled a big enough group.
785 // -> start a new one.
786 if (!CompatibleGroup
) {
787 FormingGroup
.clear();
788 Color
= ++NextReservedID
;
789 ProposedColor
= Color
;
790 FormingGroup
.insert(SU
.NodeNum
);
791 CurrentColoring
[SU
.NodeNum
] = ProposedColor
;
793 } else if (Count
== GroupSize
) {
794 FormingGroup
.clear();
795 Color
= ++NextReservedID
;
796 ProposedColor
= Color
;
803 void SIScheduleBlockCreator::colorComputeReservedDependencies() {
804 unsigned DAGSize
= DAG
->SUnits
.size();
805 std::map
<std::set
<unsigned>, unsigned> ColorCombinations
;
807 CurrentTopDownReservedDependencyColoring
.clear();
808 CurrentBottomUpReservedDependencyColoring
.clear();
810 CurrentTopDownReservedDependencyColoring
.resize(DAGSize
, 0);
811 CurrentBottomUpReservedDependencyColoring
.resize(DAGSize
, 0);
813 // Traverse TopDown, and give different colors to SUs depending
814 // on which combination of High Latencies they depend on.
816 for (unsigned SUNum
: DAG
->TopDownIndex2SU
) {
817 SUnit
*SU
= &DAG
->SUnits
[SUNum
];
818 std::set
<unsigned> SUColors
;
821 if (CurrentColoring
[SU
->NodeNum
]) {
822 CurrentTopDownReservedDependencyColoring
[SU
->NodeNum
] =
823 CurrentColoring
[SU
->NodeNum
];
827 for (SDep
& PredDep
: SU
->Preds
) {
828 SUnit
*Pred
= PredDep
.getSUnit();
829 if (PredDep
.isWeak() || Pred
->NodeNum
>= DAGSize
)
831 if (CurrentTopDownReservedDependencyColoring
[Pred
->NodeNum
] > 0)
832 SUColors
.insert(CurrentTopDownReservedDependencyColoring
[Pred
->NodeNum
]);
834 // Color 0 by default.
835 if (SUColors
.empty())
837 // Same color than parents.
838 if (SUColors
.size() == 1 && *SUColors
.begin() > DAGSize
)
839 CurrentTopDownReservedDependencyColoring
[SU
->NodeNum
] =
842 std::map
<std::set
<unsigned>, unsigned>::iterator Pos
=
843 ColorCombinations
.find(SUColors
);
844 if (Pos
!= ColorCombinations
.end()) {
845 CurrentTopDownReservedDependencyColoring
[SU
->NodeNum
] = Pos
->second
;
847 CurrentTopDownReservedDependencyColoring
[SU
->NodeNum
] =
849 ColorCombinations
[SUColors
] = NextNonReservedID
++;
854 ColorCombinations
.clear();
856 // Same as before, but BottomUp.
858 for (unsigned SUNum
: DAG
->BottomUpIndex2SU
) {
859 SUnit
*SU
= &DAG
->SUnits
[SUNum
];
860 std::set
<unsigned> SUColors
;
863 if (CurrentColoring
[SU
->NodeNum
]) {
864 CurrentBottomUpReservedDependencyColoring
[SU
->NodeNum
] =
865 CurrentColoring
[SU
->NodeNum
];
869 for (SDep
& SuccDep
: SU
->Succs
) {
870 SUnit
*Succ
= SuccDep
.getSUnit();
871 if (SuccDep
.isWeak() || Succ
->NodeNum
>= DAGSize
)
873 if (CurrentBottomUpReservedDependencyColoring
[Succ
->NodeNum
] > 0)
874 SUColors
.insert(CurrentBottomUpReservedDependencyColoring
[Succ
->NodeNum
]);
877 if (SUColors
.empty())
879 // Same color than parents.
880 if (SUColors
.size() == 1 && *SUColors
.begin() > DAGSize
)
881 CurrentBottomUpReservedDependencyColoring
[SU
->NodeNum
] =
884 std::map
<std::set
<unsigned>, unsigned>::iterator Pos
=
885 ColorCombinations
.find(SUColors
);
886 if (Pos
!= ColorCombinations
.end()) {
887 CurrentBottomUpReservedDependencyColoring
[SU
->NodeNum
] = Pos
->second
;
889 CurrentBottomUpReservedDependencyColoring
[SU
->NodeNum
] =
891 ColorCombinations
[SUColors
] = NextNonReservedID
++;
897 void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
898 unsigned DAGSize
= DAG
->SUnits
.size();
899 std::map
<std::pair
<unsigned, unsigned>, unsigned> ColorCombinations
;
901 // Every combination of colors given by the top down
902 // and bottom up Reserved node dependency
904 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
905 SUnit
*SU
= &DAG
->SUnits
[i
];
906 std::pair
<unsigned, unsigned> SUColors
;
908 // High latency instructions: already given.
909 if (CurrentColoring
[SU
->NodeNum
])
912 SUColors
.first
= CurrentTopDownReservedDependencyColoring
[SU
->NodeNum
];
913 SUColors
.second
= CurrentBottomUpReservedDependencyColoring
[SU
->NodeNum
];
915 std::map
<std::pair
<unsigned, unsigned>, unsigned>::iterator Pos
=
916 ColorCombinations
.find(SUColors
);
917 if (Pos
!= ColorCombinations
.end()) {
918 CurrentColoring
[SU
->NodeNum
] = Pos
->second
;
920 CurrentColoring
[SU
->NodeNum
] = NextNonReservedID
;
921 ColorCombinations
[SUColors
] = NextNonReservedID
++;
926 void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
927 unsigned DAGSize
= DAG
->SUnits
.size();
928 std::vector
<int> PendingColoring
= CurrentColoring
;
930 assert(DAGSize
>= 1 &&
931 CurrentBottomUpReservedDependencyColoring
.size() == DAGSize
&&
932 CurrentTopDownReservedDependencyColoring
.size() == DAGSize
);
933 // If there is no reserved block at all, do nothing. We don't want
934 // everything in one block.
935 if (*std::max_element(CurrentBottomUpReservedDependencyColoring
.begin(),
936 CurrentBottomUpReservedDependencyColoring
.end()) == 0 &&
937 *std::max_element(CurrentTopDownReservedDependencyColoring
.begin(),
938 CurrentTopDownReservedDependencyColoring
.end()) == 0)
941 for (unsigned SUNum
: DAG
->BottomUpIndex2SU
) {
942 SUnit
*SU
= &DAG
->SUnits
[SUNum
];
943 std::set
<unsigned> SUColors
;
944 std::set
<unsigned> SUColorsPending
;
946 if (CurrentColoring
[SU
->NodeNum
] <= (int)DAGSize
)
949 if (CurrentBottomUpReservedDependencyColoring
[SU
->NodeNum
] > 0 ||
950 CurrentTopDownReservedDependencyColoring
[SU
->NodeNum
] > 0)
953 for (SDep
& SuccDep
: SU
->Succs
) {
954 SUnit
*Succ
= SuccDep
.getSUnit();
955 if (SuccDep
.isWeak() || Succ
->NodeNum
>= DAGSize
)
957 if (CurrentBottomUpReservedDependencyColoring
[Succ
->NodeNum
] > 0 ||
958 CurrentTopDownReservedDependencyColoring
[Succ
->NodeNum
] > 0)
959 SUColors
.insert(CurrentColoring
[Succ
->NodeNum
]);
960 SUColorsPending
.insert(PendingColoring
[Succ
->NodeNum
]);
962 // If there is only one child/parent block, and that block
963 // is not among the ones we are removing in this path, then
964 // merge the instruction to that block
965 if (SUColors
.size() == 1 && SUColorsPending
.size() == 1)
966 PendingColoring
[SU
->NodeNum
] = *SUColors
.begin();
967 else // TODO: Attribute new colors depending on color
968 // combination of children.
969 PendingColoring
[SU
->NodeNum
] = NextNonReservedID
++;
971 CurrentColoring
= PendingColoring
;
975 void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
976 unsigned DAGSize
= DAG
->SUnits
.size();
977 unsigned PreviousColor
;
978 std::set
<unsigned> SeenColors
;
983 PreviousColor
= CurrentColoring
[0];
985 for (unsigned i
= 1, e
= DAGSize
; i
!= e
; ++i
) {
986 SUnit
*SU
= &DAG
->SUnits
[i
];
987 unsigned CurrentColor
= CurrentColoring
[i
];
988 unsigned PreviousColorSave
= PreviousColor
;
989 assert(i
== SU
->NodeNum
);
991 if (CurrentColor
!= PreviousColor
)
992 SeenColors
.insert(PreviousColor
);
993 PreviousColor
= CurrentColor
;
995 if (CurrentColoring
[SU
->NodeNum
] <= (int)DAGSize
)
998 if (SeenColors
.find(CurrentColor
) == SeenColors
.end())
1001 if (PreviousColorSave
!= CurrentColor
)
1002 CurrentColoring
[i
] = NextNonReservedID
++;
1004 CurrentColoring
[i
] = CurrentColoring
[i
-1];
1008 void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
1009 unsigned DAGSize
= DAG
->SUnits
.size();
1011 for (unsigned SUNum
: DAG
->BottomUpIndex2SU
) {
1012 SUnit
*SU
= &DAG
->SUnits
[SUNum
];
1013 std::set
<unsigned> SUColors
;
1015 if (CurrentColoring
[SU
->NodeNum
] <= (int)DAGSize
)
1018 // No predecessor: Vgpr constant loading.
1019 // Low latency instructions usually have a predecessor (the address)
1020 if (SU
->Preds
.size() > 0 && !DAG
->IsLowLatencySU
[SU
->NodeNum
])
1023 for (SDep
& SuccDep
: SU
->Succs
) {
1024 SUnit
*Succ
= SuccDep
.getSUnit();
1025 if (SuccDep
.isWeak() || Succ
->NodeNum
>= DAGSize
)
1027 SUColors
.insert(CurrentColoring
[Succ
->NodeNum
]);
1029 if (SUColors
.size() == 1)
1030 CurrentColoring
[SU
->NodeNum
] = *SUColors
.begin();
1034 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
1035 unsigned DAGSize
= DAG
->SUnits
.size();
1037 for (unsigned SUNum
: DAG
->BottomUpIndex2SU
) {
1038 SUnit
*SU
= &DAG
->SUnits
[SUNum
];
1039 std::set
<unsigned> SUColors
;
1041 if (CurrentColoring
[SU
->NodeNum
] <= (int)DAGSize
)
1044 for (SDep
& SuccDep
: SU
->Succs
) {
1045 SUnit
*Succ
= SuccDep
.getSUnit();
1046 if (SuccDep
.isWeak() || Succ
->NodeNum
>= DAGSize
)
1048 SUColors
.insert(CurrentColoring
[Succ
->NodeNum
]);
1050 if (SUColors
.size() == 1)
1051 CurrentColoring
[SU
->NodeNum
] = *SUColors
.begin();
1055 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
1056 unsigned DAGSize
= DAG
->SUnits
.size();
1058 for (unsigned SUNum
: DAG
->BottomUpIndex2SU
) {
1059 SUnit
*SU
= &DAG
->SUnits
[SUNum
];
1060 std::set
<unsigned> SUColors
;
1062 if (CurrentColoring
[SU
->NodeNum
] <= (int)DAGSize
)
1065 for (SDep
& SuccDep
: SU
->Succs
) {
1066 SUnit
*Succ
= SuccDep
.getSUnit();
1067 if (SuccDep
.isWeak() || Succ
->NodeNum
>= DAGSize
)
1069 SUColors
.insert(CurrentColoring
[Succ
->NodeNum
]);
1071 if (SUColors
.size() == 1 && *SUColors
.begin() <= DAGSize
)
1072 CurrentColoring
[SU
->NodeNum
] = *SUColors
.begin();
1076 void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
1077 unsigned DAGSize
= DAG
->SUnits
.size();
1078 std::map
<unsigned, unsigned> ColorCount
;
1080 for (unsigned SUNum
: DAG
->BottomUpIndex2SU
) {
1081 SUnit
*SU
= &DAG
->SUnits
[SUNum
];
1082 unsigned color
= CurrentColoring
[SU
->NodeNum
];
1083 ++ColorCount
[color
];
1086 for (unsigned SUNum
: DAG
->BottomUpIndex2SU
) {
1087 SUnit
*SU
= &DAG
->SUnits
[SUNum
];
1088 unsigned color
= CurrentColoring
[SU
->NodeNum
];
1089 std::set
<unsigned> SUColors
;
1091 if (CurrentColoring
[SU
->NodeNum
] <= (int)DAGSize
)
1094 if (ColorCount
[color
] > 1)
1097 for (SDep
& SuccDep
: SU
->Succs
) {
1098 SUnit
*Succ
= SuccDep
.getSUnit();
1099 if (SuccDep
.isWeak() || Succ
->NodeNum
>= DAGSize
)
1101 SUColors
.insert(CurrentColoring
[Succ
->NodeNum
]);
1103 if (SUColors
.size() == 1 && *SUColors
.begin() != color
) {
1104 --ColorCount
[color
];
1105 CurrentColoring
[SU
->NodeNum
] = *SUColors
.begin();
1106 ++ColorCount
[*SUColors
.begin()];
1111 void SIScheduleBlockCreator::cutHugeBlocks() {
1115 void SIScheduleBlockCreator::regroupNoUserInstructions() {
1116 unsigned DAGSize
= DAG
->SUnits
.size();
1117 int GroupID
= NextNonReservedID
++;
1119 for (unsigned SUNum
: DAG
->BottomUpIndex2SU
) {
1120 SUnit
*SU
= &DAG
->SUnits
[SUNum
];
1121 bool hasSuccessor
= false;
1123 if (CurrentColoring
[SU
->NodeNum
] <= (int)DAGSize
)
1126 for (SDep
& SuccDep
: SU
->Succs
) {
1127 SUnit
*Succ
= SuccDep
.getSUnit();
1128 if (SuccDep
.isWeak() || Succ
->NodeNum
>= DAGSize
)
1130 hasSuccessor
= true;
1133 CurrentColoring
[SU
->NodeNum
] = GroupID
;
1137 void SIScheduleBlockCreator::colorExports() {
1138 unsigned ExportColor
= NextNonReservedID
++;
1139 SmallVector
<unsigned, 8> ExpGroup
;
1141 // Put all exports together in a block.
1142 // The block will naturally end up being scheduled last,
1143 // thus putting exports at the end of the schedule, which
1144 // is better for performance.
1145 // However we must ensure, for safety, the exports can be put
1146 // together in the same block without any other instruction.
1147 // This could happen, for example, when scheduling after regalloc
1148 // if reloading a spilled register from memory using the same
1149 // register than used in a previous export.
1150 // If that happens, do not regroup the exports.
1151 for (unsigned SUNum
: DAG
->TopDownIndex2SU
) {
1152 const SUnit
&SU
= DAG
->SUnits
[SUNum
];
1153 if (SIInstrInfo::isEXP(*SU
.getInstr())) {
1154 // Check the EXP can be added to the group safely,
1155 // ie without needing any other instruction.
1156 // The EXP is allowed to depend on other EXP
1157 // (they will be in the same group).
1158 for (unsigned j
: ExpGroup
) {
1160 std::vector
<int> SubGraph
;
1161 // By construction (topological order), if SU and
1162 // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
1163 // in the parent graph of SU.
1165 SubGraph
= DAG
->GetTopo()->GetSubGraph(SU
, DAG
->SUnits
[j
],
1167 assert(!HasSubGraph
);
1169 SubGraph
= DAG
->GetTopo()->GetSubGraph(DAG
->SUnits
[j
], SU
,
1172 continue; // No dependencies between each other
1174 // SubGraph contains all the instructions required
1175 // between EXP SUnits[j] and EXP SU.
1176 for (unsigned k
: SubGraph
) {
1177 if (!SIInstrInfo::isEXP(*DAG
->SUnits
[k
].getInstr()))
1178 // Other instructions than EXP would be required in the group.
1179 // Abort the groupping.
1184 ExpGroup
.push_back(SUNum
);
1188 // The group can be formed. Give the color.
1189 for (unsigned j
: ExpGroup
)
1190 CurrentColoring
[j
] = ExportColor
;
1193 void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant
) {
1194 unsigned DAGSize
= DAG
->SUnits
.size();
1195 std::map
<unsigned,unsigned> RealID
;
1197 CurrentBlocks
.clear();
1198 CurrentColoring
.clear();
1199 CurrentColoring
.resize(DAGSize
, 0);
1200 Node2CurrentBlock
.clear();
1202 // Restore links previous scheduling variant has overridden.
1203 DAG
->restoreSULinksLeft();
1206 NextNonReservedID
= DAGSize
+ 1;
1208 LLVM_DEBUG(dbgs() << "Coloring the graph\n");
1210 if (BlockVariant
== SISchedulerBlockCreatorVariant::LatenciesGrouped
)
1211 colorHighLatenciesGroups();
1213 colorHighLatenciesAlone();
1214 colorComputeReservedDependencies();
1215 colorAccordingToReservedDependencies();
1216 colorEndsAccordingToDependencies();
1217 if (BlockVariant
== SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive
)
1218 colorForceConsecutiveOrderInGroup();
1219 regroupNoUserInstructions();
1220 colorMergeConstantLoadsNextGroup();
1221 colorMergeIfPossibleNextGroupOnlyForReserved();
1224 // Put SUs of same color into same block
1225 Node2CurrentBlock
.resize(DAGSize
, -1);
1226 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
1227 SUnit
*SU
= &DAG
->SUnits
[i
];
1228 unsigned Color
= CurrentColoring
[SU
->NodeNum
];
1229 if (RealID
.find(Color
) == RealID
.end()) {
1230 int ID
= CurrentBlocks
.size();
1231 BlockPtrs
.push_back(std::make_unique
<SIScheduleBlock
>(DAG
, this, ID
));
1232 CurrentBlocks
.push_back(BlockPtrs
.rbegin()->get());
1235 CurrentBlocks
[RealID
[Color
]]->addUnit(SU
);
1236 Node2CurrentBlock
[SU
->NodeNum
] = RealID
[Color
];
1239 // Build dependencies between blocks.
1240 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
1241 SUnit
*SU
= &DAG
->SUnits
[i
];
1242 int SUID
= Node2CurrentBlock
[i
];
1243 for (SDep
& SuccDep
: SU
->Succs
) {
1244 SUnit
*Succ
= SuccDep
.getSUnit();
1245 if (SuccDep
.isWeak() || Succ
->NodeNum
>= DAGSize
)
1247 if (Node2CurrentBlock
[Succ
->NodeNum
] != SUID
)
1248 CurrentBlocks
[SUID
]->addSucc(CurrentBlocks
[Node2CurrentBlock
[Succ
->NodeNum
]],
1249 SuccDep
.isCtrl() ? NoData
: Data
);
1251 for (SDep
& PredDep
: SU
->Preds
) {
1252 SUnit
*Pred
= PredDep
.getSUnit();
1253 if (PredDep
.isWeak() || Pred
->NodeNum
>= DAGSize
)
1255 if (Node2CurrentBlock
[Pred
->NodeNum
] != SUID
)
1256 CurrentBlocks
[SUID
]->addPred(CurrentBlocks
[Node2CurrentBlock
[Pred
->NodeNum
]]);
1260 // Free root and leafs of all blocks to enable scheduling inside them.
1261 for (unsigned i
= 0, e
= CurrentBlocks
.size(); i
!= e
; ++i
) {
1262 SIScheduleBlock
*Block
= CurrentBlocks
[i
];
1263 Block
->finalizeUnits();
1265 LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
1266 for (unsigned i
= 0, e
= CurrentBlocks
.size(); i
!= e
; ++i
) {
1267 SIScheduleBlock
*Block
= CurrentBlocks
[i
];
1268 Block
->printDebug(true);
1272 // Two functions taken from Codegen/MachineScheduler.cpp
1274 /// Non-const version.
1275 static MachineBasicBlock::iterator
1276 nextIfDebug(MachineBasicBlock::iterator I
,
1277 MachineBasicBlock::const_iterator End
) {
1278 for (; I
!= End
; ++I
) {
1279 if (!I
->isDebugInstr())
1285 void SIScheduleBlockCreator::topologicalSort() {
1286 unsigned DAGSize
= CurrentBlocks
.size();
1287 std::vector
<int> WorkList
;
1289 LLVM_DEBUG(dbgs() << "Topological Sort\n");
1291 WorkList
.reserve(DAGSize
);
1292 TopDownIndex2Block
.resize(DAGSize
);
1293 TopDownBlock2Index
.resize(DAGSize
);
1294 BottomUpIndex2Block
.resize(DAGSize
);
1296 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
1297 SIScheduleBlock
*Block
= CurrentBlocks
[i
];
1298 unsigned Degree
= Block
->getSuccs().size();
1299 TopDownBlock2Index
[i
] = Degree
;
1301 WorkList
.push_back(i
);
1306 while (!WorkList
.empty()) {
1307 int i
= WorkList
.back();
1308 SIScheduleBlock
*Block
= CurrentBlocks
[i
];
1309 WorkList
.pop_back();
1310 TopDownBlock2Index
[i
] = --Id
;
1311 TopDownIndex2Block
[Id
] = i
;
1312 for (SIScheduleBlock
* Pred
: Block
->getPreds()) {
1313 if (!--TopDownBlock2Index
[Pred
->getID()])
1314 WorkList
.push_back(Pred
->getID());
1319 // Check correctness of the ordering.
1320 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
1321 SIScheduleBlock
*Block
= CurrentBlocks
[i
];
1322 for (SIScheduleBlock
* Pred
: Block
->getPreds()) {
1323 assert(TopDownBlock2Index
[i
] > TopDownBlock2Index
[Pred
->getID()] &&
1324 "Wrong Top Down topological sorting");
1329 BottomUpIndex2Block
= std::vector
<int>(TopDownIndex2Block
.rbegin(),
1330 TopDownIndex2Block
.rend());
1333 void SIScheduleBlockCreator::scheduleInsideBlocks() {
1334 unsigned DAGSize
= CurrentBlocks
.size();
1336 LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
1338 // We do schedule a valid scheduling such that a Block corresponds
1339 // to a range of instructions.
1340 LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
1341 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
1342 SIScheduleBlock
*Block
= CurrentBlocks
[i
];
1343 Block
->fastSchedule();
1346 // Note: the following code, and the part restoring previous position
1347 // is by far the most expensive operation of the Scheduler.
1349 // Do not update CurrentTop.
1350 MachineBasicBlock::iterator CurrentTopFastSched
= DAG
->getCurrentTop();
1351 std::vector
<MachineBasicBlock::iterator
> PosOld
;
1352 std::vector
<MachineBasicBlock::iterator
> PosNew
;
1353 PosOld
.reserve(DAG
->SUnits
.size());
1354 PosNew
.reserve(DAG
->SUnits
.size());
1356 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
1357 int BlockIndice
= TopDownIndex2Block
[i
];
1358 SIScheduleBlock
*Block
= CurrentBlocks
[BlockIndice
];
1359 std::vector
<SUnit
*> SUs
= Block
->getScheduledUnits();
1361 for (SUnit
* SU
: SUs
) {
1362 MachineInstr
*MI
= SU
->getInstr();
1363 MachineBasicBlock::iterator Pos
= MI
;
1364 PosOld
.push_back(Pos
);
1365 if (&*CurrentTopFastSched
== MI
) {
1366 PosNew
.push_back(Pos
);
1367 CurrentTopFastSched
= nextIfDebug(++CurrentTopFastSched
,
1368 DAG
->getCurrentBottom());
1370 // Update the instruction stream.
1371 DAG
->getBB()->splice(CurrentTopFastSched
, DAG
->getBB(), MI
);
1373 // Update LiveIntervals.
1374 // Note: Moving all instructions and calling handleMove every time
1375 // is the most cpu intensive operation of the scheduler.
1376 // It would gain a lot if there was a way to recompute the
1377 // LiveIntervals for the entire scheduling region.
1378 DAG
->getLIS()->handleMove(*MI
, /*UpdateFlags=*/true);
1379 PosNew
.push_back(CurrentTopFastSched
);
1384 // Now we have Block of SUs == Block of MI.
1385 // We do the final schedule for the instructions inside the block.
1386 // The property that all the SUs of the Block are grouped together as MI
1387 // is used for correct reg usage tracking.
1388 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
1389 SIScheduleBlock
*Block
= CurrentBlocks
[i
];
1390 std::vector
<SUnit
*> SUs
= Block
->getScheduledUnits();
1391 Block
->schedule((*SUs
.begin())->getInstr(), (*SUs
.rbegin())->getInstr());
1394 LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
1395 // Restore old ordering (which prevents a LIS->handleMove bug).
1396 for (unsigned i
= PosOld
.size(), e
= 0; i
!= e
; --i
) {
1397 MachineBasicBlock::iterator POld
= PosOld
[i
-1];
1398 MachineBasicBlock::iterator PNew
= PosNew
[i
-1];
1400 // Update the instruction stream.
1401 DAG
->getBB()->splice(POld
, DAG
->getBB(), PNew
);
1403 // Update LiveIntervals.
1404 DAG
->getLIS()->handleMove(*POld
, /*UpdateFlags=*/true);
1408 LLVM_DEBUG(for (unsigned i
= 0, e
= CurrentBlocks
.size(); i
!= e
; ++i
) {
1409 SIScheduleBlock
*Block
= CurrentBlocks
[i
];
1410 Block
->printDebug(true);
1414 void SIScheduleBlockCreator::fillStats() {
1415 unsigned DAGSize
= CurrentBlocks
.size();
1417 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
1418 int BlockIndice
= TopDownIndex2Block
[i
];
1419 SIScheduleBlock
*Block
= CurrentBlocks
[BlockIndice
];
1420 if (Block
->getPreds().empty())
1424 for (SIScheduleBlock
*Pred
: Block
->getPreds()) {
1425 if (Depth
< Pred
->Depth
+ Pred
->getCost())
1426 Depth
= Pred
->Depth
+ Pred
->getCost();
1428 Block
->Depth
= Depth
;
1432 for (unsigned i
= 0, e
= DAGSize
; i
!= e
; ++i
) {
1433 int BlockIndice
= BottomUpIndex2Block
[i
];
1434 SIScheduleBlock
*Block
= CurrentBlocks
[BlockIndice
];
1435 if (Block
->getSuccs().empty())
1438 unsigned Height
= 0;
1439 for (const auto &Succ
: Block
->getSuccs())
1440 Height
= std::max(Height
, Succ
.first
->Height
+ Succ
.first
->getCost());
1441 Block
->Height
= Height
;
1446 // SIScheduleBlockScheduler //
1448 SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI
*DAG
,
1449 SISchedulerBlockSchedulerVariant Variant
,
1450 SIScheduleBlocks BlocksStruct
) :
1451 DAG(DAG
), Variant(Variant
), Blocks(BlocksStruct
.Blocks
),
1452 LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
1453 SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
1455 // Fill the usage of every output
1456 // Warning: while by construction we always have a link between two blocks
1457 // when one needs a result from the other, the number of users of an output
1458 // is not the sum of child blocks having as input the same virtual register.
1459 // Here is an example. A produces x and y. B eats x and produces x'.
1460 // C eats x' and y. The register coalescer may have attributed the same
1461 // virtual register to x and x'.
1462 // To count accurately, we do a topological sort. In case the register is
1463 // found for several parents, we increment the usage of the one with the
1464 // highest topological index.
1465 LiveOutRegsNumUsages
.resize(Blocks
.size());
1466 for (unsigned i
= 0, e
= Blocks
.size(); i
!= e
; ++i
) {
1467 SIScheduleBlock
*Block
= Blocks
[i
];
1468 for (unsigned Reg
: Block
->getInRegs()) {
1471 for (SIScheduleBlock
* Pred
: Block
->getPreds()) {
1472 std::set
<unsigned> PredOutRegs
= Pred
->getOutRegs();
1473 std::set
<unsigned>::iterator RegPos
= PredOutRegs
.find(Reg
);
1475 if (RegPos
!= PredOutRegs
.end()) {
1477 if (topoInd
< BlocksStruct
.TopDownBlock2Index
[Pred
->getID()]) {
1478 topoInd
= BlocksStruct
.TopDownBlock2Index
[Pred
->getID()];
1486 int PredID
= BlocksStruct
.TopDownIndex2Block
[topoInd
];
1487 ++LiveOutRegsNumUsages
[PredID
][Reg
];
1491 LastPosHighLatencyParentScheduled
.resize(Blocks
.size(), 0);
1492 BlockNumPredsLeft
.resize(Blocks
.size());
1493 BlockNumSuccsLeft
.resize(Blocks
.size());
1495 for (unsigned i
= 0, e
= Blocks
.size(); i
!= e
; ++i
) {
1496 SIScheduleBlock
*Block
= Blocks
[i
];
1497 BlockNumPredsLeft
[i
] = Block
->getPreds().size();
1498 BlockNumSuccsLeft
[i
] = Block
->getSuccs().size();
1502 for (unsigned i
= 0, e
= Blocks
.size(); i
!= e
; ++i
) {
1503 SIScheduleBlock
*Block
= Blocks
[i
];
1504 assert(Block
->getID() == i
);
1508 std::set
<unsigned> InRegs
= DAG
->getInRegs();
1509 addLiveRegs(InRegs
);
1511 // Increase LiveOutRegsNumUsages for blocks
1512 // producing registers consumed in another
1513 // scheduling region.
1514 for (unsigned Reg
: DAG
->getOutRegs()) {
1515 for (unsigned i
= 0, e
= Blocks
.size(); i
!= e
; ++i
) {
1516 // Do reverse traversal
1517 int ID
= BlocksStruct
.TopDownIndex2Block
[Blocks
.size()-1-i
];
1518 SIScheduleBlock
*Block
= Blocks
[ID
];
1519 const std::set
<unsigned> &OutRegs
= Block
->getOutRegs();
1521 if (OutRegs
.find(Reg
) == OutRegs
.end())
1524 ++LiveOutRegsNumUsages
[ID
][Reg
];
1529 // Fill LiveRegsConsumers for regs that were already
1530 // defined before scheduling.
1531 for (unsigned i
= 0, e
= Blocks
.size(); i
!= e
; ++i
) {
1532 SIScheduleBlock
*Block
= Blocks
[i
];
1533 for (unsigned Reg
: Block
->getInRegs()) {
1535 for (SIScheduleBlock
* Pred
: Block
->getPreds()) {
1536 std::set
<unsigned> PredOutRegs
= Pred
->getOutRegs();
1537 std::set
<unsigned>::iterator RegPos
= PredOutRegs
.find(Reg
);
1539 if (RegPos
!= PredOutRegs
.end()) {
1546 ++LiveRegsConsumers
[Reg
];
1550 for (unsigned i
= 0, e
= Blocks
.size(); i
!= e
; ++i
) {
1551 SIScheduleBlock
*Block
= Blocks
[i
];
1552 if (BlockNumPredsLeft
[i
] == 0) {
1553 ReadyBlocks
.push_back(Block
);
1557 while (SIScheduleBlock
*Block
= pickBlock()) {
1558 BlocksScheduled
.push_back(Block
);
1559 blockScheduled(Block
);
1562 LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock
*Block
1563 : BlocksScheduled
) {
1564 dbgs() << ' ' << Block
->getID();
1568 bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate
&Cand
,
1569 SIBlockSchedCandidate
&TryCand
) {
1570 if (!Cand
.isValid()) {
1571 TryCand
.Reason
= NodeOrder
;
1575 // Try to hide high latencies.
1576 if (SISched::tryLess(TryCand
.LastPosHighLatParentScheduled
,
1577 Cand
.LastPosHighLatParentScheduled
, TryCand
, Cand
, Latency
))
1579 // Schedule high latencies early so you can hide them better.
1580 if (SISched::tryGreater(TryCand
.IsHighLatency
, Cand
.IsHighLatency
,
1581 TryCand
, Cand
, Latency
))
1583 if (TryCand
.IsHighLatency
&& SISched::tryGreater(TryCand
.Height
, Cand
.Height
,
1584 TryCand
, Cand
, Depth
))
1586 if (SISched::tryGreater(TryCand
.NumHighLatencySuccessors
,
1587 Cand
.NumHighLatencySuccessors
,
1588 TryCand
, Cand
, Successor
))
1593 bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate
&Cand
,
1594 SIBlockSchedCandidate
&TryCand
) {
1595 if (!Cand
.isValid()) {
1596 TryCand
.Reason
= NodeOrder
;
1600 if (SISched::tryLess(TryCand
.VGPRUsageDiff
> 0, Cand
.VGPRUsageDiff
> 0,
1601 TryCand
, Cand
, RegUsage
))
1603 if (SISched::tryGreater(TryCand
.NumSuccessors
> 0,
1604 Cand
.NumSuccessors
> 0,
1605 TryCand
, Cand
, Successor
))
1607 if (SISched::tryGreater(TryCand
.Height
, Cand
.Height
, TryCand
, Cand
, Depth
))
1609 if (SISched::tryLess(TryCand
.VGPRUsageDiff
, Cand
.VGPRUsageDiff
,
1610 TryCand
, Cand
, RegUsage
))
1615 SIScheduleBlock
*SIScheduleBlockScheduler::pickBlock() {
1616 SIBlockSchedCandidate Cand
;
1617 std::vector
<SIScheduleBlock
*>::iterator Best
;
1618 SIScheduleBlock
*Block
;
1619 if (ReadyBlocks
.empty())
1622 DAG
->fillVgprSgprCost(LiveRegs
.begin(), LiveRegs
.end(),
1623 VregCurrentUsage
, SregCurrentUsage
);
1624 if (VregCurrentUsage
> maxVregUsage
)
1625 maxVregUsage
= VregCurrentUsage
;
1626 if (SregCurrentUsage
> maxSregUsage
)
1627 maxSregUsage
= SregCurrentUsage
;
1628 LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
1629 for (SIScheduleBlock
*Block
1630 : ReadyBlocks
) dbgs()
1631 << Block
->getID() << ' ';
1632 dbgs() << "\nCurrent Live:\n";
1635 << printVRegOrUnit(Reg
, DAG
->getTRI()) << ' ';
1637 dbgs() << "Current VGPRs: " << VregCurrentUsage
<< '\n';
1638 dbgs() << "Current SGPRs: " << SregCurrentUsage
<< '\n';);
1640 Cand
.Block
= nullptr;
1641 for (std::vector
<SIScheduleBlock
*>::iterator I
= ReadyBlocks
.begin(),
1642 E
= ReadyBlocks
.end(); I
!= E
; ++I
) {
1643 SIBlockSchedCandidate TryCand
;
1645 TryCand
.IsHighLatency
= TryCand
.Block
->isHighLatencyBlock();
1646 TryCand
.VGPRUsageDiff
=
1647 checkRegUsageImpact(TryCand
.Block
->getInRegs(),
1648 TryCand
.Block
->getOutRegs())[DAG
->getVGPRSetID()];
1649 TryCand
.NumSuccessors
= TryCand
.Block
->getSuccs().size();
1650 TryCand
.NumHighLatencySuccessors
=
1651 TryCand
.Block
->getNumHighLatencySuccessors();
1652 TryCand
.LastPosHighLatParentScheduled
=
1653 (unsigned int) std::max
<int> (0,
1654 LastPosHighLatencyParentScheduled
[TryCand
.Block
->getID()] -
1655 LastPosWaitedHighLatency
);
1656 TryCand
.Height
= TryCand
.Block
->Height
;
1657 // Try not to increase VGPR usage too much, else we may spill.
1658 if (VregCurrentUsage
> 120 ||
1659 Variant
!= SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage
) {
1660 if (!tryCandidateRegUsage(Cand
, TryCand
) &&
1661 Variant
!= SISchedulerBlockSchedulerVariant::BlockRegUsage
)
1662 tryCandidateLatency(Cand
, TryCand
);
1664 if (!tryCandidateLatency(Cand
, TryCand
))
1665 tryCandidateRegUsage(Cand
, TryCand
);
1667 if (TryCand
.Reason
!= NoCand
) {
1668 Cand
.setBest(TryCand
);
1670 LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand
.Block
->getID() << ' '
1671 << getReasonStr(Cand
.Reason
) << '\n');
1675 LLVM_DEBUG(dbgs() << "Picking: " << Cand
.Block
->getID() << '\n';
1676 dbgs() << "Is a block with high latency instruction: "
1677 << (Cand
.IsHighLatency
? "yes\n" : "no\n");
1678 dbgs() << "Position of last high latency dependency: "
1679 << Cand
.LastPosHighLatParentScheduled
<< '\n';
1680 dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff
<< '\n';
1684 ReadyBlocks
.erase(Best
);
1688 // Tracking of currently alive registers to determine VGPR Usage.
1690 void SIScheduleBlockScheduler::addLiveRegs(std::set
<unsigned> &Regs
) {
1691 for (unsigned Reg
: Regs
) {
1692 // For now only track virtual registers.
1693 if (!Register::isVirtualRegister(Reg
))
1695 // If not already in the live set, then add it.
1696 (void) LiveRegs
.insert(Reg
);
1700 void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock
*Block
,
1701 std::set
<unsigned> &Regs
) {
1702 for (unsigned Reg
: Regs
) {
1703 // For now only track virtual registers.
1704 std::set
<unsigned>::iterator Pos
= LiveRegs
.find(Reg
);
1705 assert (Pos
!= LiveRegs
.end() && // Reg must be live.
1706 LiveRegsConsumers
.find(Reg
) != LiveRegsConsumers
.end() &&
1707 LiveRegsConsumers
[Reg
] >= 1);
1708 --LiveRegsConsumers
[Reg
];
1709 if (LiveRegsConsumers
[Reg
] == 0)
1710 LiveRegs
.erase(Pos
);
1714 void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock
*Parent
) {
1715 for (const auto &Block
: Parent
->getSuccs()) {
1716 if (--BlockNumPredsLeft
[Block
.first
->getID()] == 0)
1717 ReadyBlocks
.push_back(Block
.first
);
1719 if (Parent
->isHighLatencyBlock() &&
1720 Block
.second
== SIScheduleBlockLinkKind::Data
)
1721 LastPosHighLatencyParentScheduled
[Block
.first
->getID()] = NumBlockScheduled
;
1725 void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock
*Block
) {
1726 decreaseLiveRegs(Block
, Block
->getInRegs());
1727 addLiveRegs(Block
->getOutRegs());
1728 releaseBlockSuccs(Block
);
1729 for (std::map
<unsigned, unsigned>::iterator RegI
=
1730 LiveOutRegsNumUsages
[Block
->getID()].begin(),
1731 E
= LiveOutRegsNumUsages
[Block
->getID()].end(); RegI
!= E
; ++RegI
) {
1732 std::pair
<unsigned, unsigned> RegP
= *RegI
;
1733 // We produce this register, thus it must not be previously alive.
1734 assert(LiveRegsConsumers
.find(RegP
.first
) == LiveRegsConsumers
.end() ||
1735 LiveRegsConsumers
[RegP
.first
] == 0);
1736 LiveRegsConsumers
[RegP
.first
] += RegP
.second
;
1738 if (LastPosHighLatencyParentScheduled
[Block
->getID()] >
1739 (unsigned)LastPosWaitedHighLatency
)
1740 LastPosWaitedHighLatency
=
1741 LastPosHighLatencyParentScheduled
[Block
->getID()];
1742 ++NumBlockScheduled
;
1746 SIScheduleBlockScheduler::checkRegUsageImpact(std::set
<unsigned> &InRegs
,
1747 std::set
<unsigned> &OutRegs
) {
1748 std::vector
<int> DiffSetPressure
;
1749 DiffSetPressure
.assign(DAG
->getTRI()->getNumRegPressureSets(), 0);
1751 for (unsigned Reg
: InRegs
) {
1752 // For now only track virtual registers.
1753 if (!Register::isVirtualRegister(Reg
))
1755 if (LiveRegsConsumers
[Reg
] > 1)
1757 PSetIterator PSetI
= DAG
->getMRI()->getPressureSets(Reg
);
1758 for (; PSetI
.isValid(); ++PSetI
) {
1759 DiffSetPressure
[*PSetI
] -= PSetI
.getWeight();
1763 for (unsigned Reg
: OutRegs
) {
1764 // For now only track virtual registers.
1765 if (!Register::isVirtualRegister(Reg
))
1767 PSetIterator PSetI
= DAG
->getMRI()->getPressureSets(Reg
);
1768 for (; PSetI
.isValid(); ++PSetI
) {
1769 DiffSetPressure
[*PSetI
] += PSetI
.getWeight();
1773 return DiffSetPressure
;
1778 struct SIScheduleBlockResult
1779 SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant
,
1780 SISchedulerBlockSchedulerVariant ScheduleVariant
) {
1781 SIScheduleBlocks Blocks
= BlockCreator
.getBlocks(BlockVariant
);
1782 SIScheduleBlockScheduler
Scheduler(DAG
, ScheduleVariant
, Blocks
);
1783 std::vector
<SIScheduleBlock
*> ScheduledBlocks
;
1784 struct SIScheduleBlockResult Res
;
1786 ScheduledBlocks
= Scheduler
.getBlocks();
1788 for (unsigned b
= 0; b
< ScheduledBlocks
.size(); ++b
) {
1789 SIScheduleBlock
*Block
= ScheduledBlocks
[b
];
1790 std::vector
<SUnit
*> SUs
= Block
->getScheduledUnits();
1792 for (SUnit
* SU
: SUs
)
1793 Res
.SUs
.push_back(SU
->NodeNum
);
1796 Res
.MaxSGPRUsage
= Scheduler
.getSGPRUsage();
1797 Res
.MaxVGPRUsage
= Scheduler
.getVGPRUsage();
1801 // SIScheduleDAGMI //
1803 SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext
*C
) :
1804 ScheduleDAGMILive(C
, std::make_unique
<GenericScheduler
>(C
)) {
1805 SITII
= static_cast<const SIInstrInfo
*>(TII
);
1806 SITRI
= static_cast<const SIRegisterInfo
*>(TRI
);
1808 VGPRSetID
= SITRI
->getVGPRPressureSet();
1809 SGPRSetID
= SITRI
->getSGPRPressureSet();
1812 SIScheduleDAGMI::~SIScheduleDAGMI() = default;
1814 // Code adapted from scheduleDAG.cpp
1815 // Does a topological sort over the SUs.
1816 // Both TopDown and BottomUp
1817 void SIScheduleDAGMI::topologicalSort() {
1818 Topo
.InitDAGTopologicalSorting();
1820 TopDownIndex2SU
= std::vector
<int>(Topo
.begin(), Topo
.end());
1821 BottomUpIndex2SU
= std::vector
<int>(Topo
.rbegin(), Topo
.rend());
1824 // Move low latencies further from their user without
1825 // increasing SGPR usage (in general)
1826 // This is to be replaced by a better pass that would
1827 // take into account SGPR usage (based on VGPR Usage
1828 // and the corresponding wavefront count), that would
1829 // try to merge groups of loads if it make sense, etc
1830 void SIScheduleDAGMI::moveLowLatencies() {
1831 unsigned DAGSize
= SUnits
.size();
1832 int LastLowLatencyUser
= -1;
1833 int LastLowLatencyPos
= -1;
1835 for (unsigned i
= 0, e
= ScheduledSUnits
.size(); i
!= e
; ++i
) {
1836 SUnit
*SU
= &SUnits
[ScheduledSUnits
[i
]];
1837 bool IsLowLatencyUser
= false;
1838 unsigned MinPos
= 0;
1840 for (SDep
& PredDep
: SU
->Preds
) {
1841 SUnit
*Pred
= PredDep
.getSUnit();
1842 if (SITII
->isLowLatencyInstruction(*Pred
->getInstr())) {
1843 IsLowLatencyUser
= true;
1845 if (Pred
->NodeNum
>= DAGSize
)
1847 unsigned PredPos
= ScheduledSUnitsInv
[Pred
->NodeNum
];
1848 if (PredPos
>= MinPos
)
1849 MinPos
= PredPos
+ 1;
1852 if (SITII
->isLowLatencyInstruction(*SU
->getInstr())) {
1853 unsigned BestPos
= LastLowLatencyUser
+ 1;
1854 if ((int)BestPos
<= LastLowLatencyPos
)
1855 BestPos
= LastLowLatencyPos
+ 1;
1856 if (BestPos
< MinPos
)
1859 for (unsigned u
= i
; u
> BestPos
; --u
) {
1860 ++ScheduledSUnitsInv
[ScheduledSUnits
[u
-1]];
1861 ScheduledSUnits
[u
] = ScheduledSUnits
[u
-1];
1863 ScheduledSUnits
[BestPos
] = SU
->NodeNum
;
1864 ScheduledSUnitsInv
[SU
->NodeNum
] = BestPos
;
1866 LastLowLatencyPos
= BestPos
;
1867 if (IsLowLatencyUser
)
1868 LastLowLatencyUser
= BestPos
;
1869 } else if (IsLowLatencyUser
) {
1870 LastLowLatencyUser
= i
;
1871 // Moves COPY instructions on which depends
1872 // the low latency instructions too.
1873 } else if (SU
->getInstr()->getOpcode() == AMDGPU::COPY
) {
1874 bool CopyForLowLat
= false;
1875 for (SDep
& SuccDep
: SU
->Succs
) {
1876 SUnit
*Succ
= SuccDep
.getSUnit();
1877 if (SuccDep
.isWeak() || Succ
->NodeNum
>= DAGSize
)
1879 if (SITII
->isLowLatencyInstruction(*Succ
->getInstr())) {
1880 CopyForLowLat
= true;
1886 for (unsigned u
= i
; u
> MinPos
; --u
) {
1887 ++ScheduledSUnitsInv
[ScheduledSUnits
[u
-1]];
1888 ScheduledSUnits
[u
] = ScheduledSUnits
[u
-1];
1890 ScheduledSUnits
[MinPos
] = SU
->NodeNum
;
1891 ScheduledSUnitsInv
[SU
->NodeNum
] = MinPos
;
1897 void SIScheduleDAGMI::restoreSULinksLeft() {
1898 for (unsigned i
= 0, e
= SUnits
.size(); i
!= e
; ++i
) {
1899 SUnits
[i
].isScheduled
= false;
1900 SUnits
[i
].WeakPredsLeft
= SUnitsLinksBackup
[i
].WeakPredsLeft
;
1901 SUnits
[i
].NumPredsLeft
= SUnitsLinksBackup
[i
].NumPredsLeft
;
1902 SUnits
[i
].WeakSuccsLeft
= SUnitsLinksBackup
[i
].WeakSuccsLeft
;
1903 SUnits
[i
].NumSuccsLeft
= SUnitsLinksBackup
[i
].NumSuccsLeft
;
1907 // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1908 template<typename _Iterator
> void
1909 SIScheduleDAGMI::fillVgprSgprCost(_Iterator First
, _Iterator End
,
1910 unsigned &VgprUsage
, unsigned &SgprUsage
) {
1913 for (_Iterator RegI
= First
; RegI
!= End
; ++RegI
) {
1914 unsigned Reg
= *RegI
;
1915 // For now only track virtual registers
1916 if (!Register::isVirtualRegister(Reg
))
1918 PSetIterator PSetI
= MRI
.getPressureSets(Reg
);
1919 for (; PSetI
.isValid(); ++PSetI
) {
1920 if (*PSetI
== VGPRSetID
)
1921 VgprUsage
+= PSetI
.getWeight();
1922 else if (*PSetI
== SGPRSetID
)
1923 SgprUsage
+= PSetI
.getWeight();
1928 void SIScheduleDAGMI::schedule()
1930 SmallVector
<SUnit
*, 8> TopRoots
, BotRoots
;
1931 SIScheduleBlockResult Best
, Temp
;
1932 LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
1934 buildDAGWithRegPressure();
1938 findRootsAndBiasEdges(TopRoots
, BotRoots
);
1939 // We reuse several ScheduleDAGMI and ScheduleDAGMILive
1940 // functions, but to make them happy we must initialize
1941 // the default Scheduler implementation (even if we do not
1943 SchedImpl
->initialize(this);
1944 initQueues(TopRoots
, BotRoots
);
1946 // Fill some stats to help scheduling.
1948 SUnitsLinksBackup
= SUnits
;
1949 IsLowLatencySU
.clear();
1950 LowLatencyOffset
.clear();
1951 IsHighLatencySU
.clear();
1953 IsLowLatencySU
.resize(SUnits
.size(), 0);
1954 LowLatencyOffset
.resize(SUnits
.size(), 0);
1955 IsHighLatencySU
.resize(SUnits
.size(), 0);
1957 for (unsigned i
= 0, e
= (unsigned)SUnits
.size(); i
!= e
; ++i
) {
1958 SUnit
*SU
= &SUnits
[i
];
1959 const MachineOperand
*BaseLatOp
;
1961 if (SITII
->isLowLatencyInstruction(*SU
->getInstr())) {
1962 IsLowLatencySU
[i
] = 1;
1963 if (SITII
->getMemOperandWithOffset(*SU
->getInstr(), BaseLatOp
, OffLatReg
,
1965 LowLatencyOffset
[i
] = OffLatReg
;
1966 } else if (SITII
->isHighLatencyInstruction(*SU
->getInstr()))
1967 IsHighLatencySU
[i
] = 1;
1970 SIScheduler
Scheduler(this);
1971 Best
= Scheduler
.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone
,
1972 SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage
);
1974 // if VGPR usage is extremely high, try other good performing variants
1975 // which could lead to lower VGPR usage
1976 if (Best
.MaxVGPRUsage
> 180) {
1977 static const std::pair
<SISchedulerBlockCreatorVariant
,
1978 SISchedulerBlockSchedulerVariant
>
1980 { LatenciesAlone
, BlockRegUsageLatency
},
1981 // { LatenciesAlone, BlockRegUsage },
1982 { LatenciesGrouped
, BlockLatencyRegUsage
},
1983 // { LatenciesGrouped, BlockRegUsageLatency },
1984 // { LatenciesGrouped, BlockRegUsage },
1985 { LatenciesAlonePlusConsecutive
, BlockLatencyRegUsage
},
1986 // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1987 // { LatenciesAlonePlusConsecutive, BlockRegUsage }
1989 for (std::pair
<SISchedulerBlockCreatorVariant
, SISchedulerBlockSchedulerVariant
> v
: Variants
) {
1990 Temp
= Scheduler
.scheduleVariant(v
.first
, v
.second
);
1991 if (Temp
.MaxVGPRUsage
< Best
.MaxVGPRUsage
)
1995 // If VGPR usage is still extremely high, we may spill. Try other variants
1996 // that perform worse, but that could lead to lower VGPR usage.
1997 if (Best
.MaxVGPRUsage
> 200) {
1998 static const std::pair
<SISchedulerBlockCreatorVariant
,
1999 SISchedulerBlockSchedulerVariant
>
2001 // { LatenciesAlone, BlockRegUsageLatency },
2002 { LatenciesAlone
, BlockRegUsage
},
2003 // { LatenciesGrouped, BlockLatencyRegUsage },
2004 { LatenciesGrouped
, BlockRegUsageLatency
},
2005 { LatenciesGrouped
, BlockRegUsage
},
2006 // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
2007 { LatenciesAlonePlusConsecutive
, BlockRegUsageLatency
},
2008 { LatenciesAlonePlusConsecutive
, BlockRegUsage
}
2010 for (std::pair
<SISchedulerBlockCreatorVariant
, SISchedulerBlockSchedulerVariant
> v
: Variants
) {
2011 Temp
= Scheduler
.scheduleVariant(v
.first
, v
.second
);
2012 if (Temp
.MaxVGPRUsage
< Best
.MaxVGPRUsage
)
2017 ScheduledSUnits
= Best
.SUs
;
2018 ScheduledSUnitsInv
.resize(SUnits
.size());
2020 for (unsigned i
= 0, e
= (unsigned)SUnits
.size(); i
!= e
; ++i
) {
2021 ScheduledSUnitsInv
[ScheduledSUnits
[i
]] = i
;
2026 // Tell the outside world about the result of the scheduling.
2028 assert(TopRPTracker
.getPos() == RegionBegin
&& "bad initial Top tracker");
2029 TopRPTracker
.setPos(CurrentTop
);
2031 for (std::vector
<unsigned>::iterator I
= ScheduledSUnits
.begin(),
2032 E
= ScheduledSUnits
.end(); I
!= E
; ++I
) {
2033 SUnit
*SU
= &SUnits
[*I
];
2035 scheduleMI(SU
, true);
2037 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU
->NodeNum
<< ") "
2038 << *SU
->getInstr());
2041 assert(CurrentTop
== CurrentBottom
&& "Nonempty unscheduled zone.");
2046 dbgs() << "*** Final schedule for "
2047 << printMBBReference(*begin()->getParent()) << " ***\n";