//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"
// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler have some
// unpredictable behaviours. For example when register pressure becomes high,
// it can either manage to prevent register pressure from going too high, or
// it can increase register pressure even more than if it hadn't taken
// register pressure into account.
//
// Also some other bad behaviours are generated, like loading at the beginning
// of the shader a constant in a VGPR you won't need until the end of the
// shader.
//
// The scheduling problem for SI can distinguish three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact the HW can issue several instructions from different
// wavefronts if different types, etc)
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps control keeping register
// usage low.
//
// First the instructions are put into blocks.
//   We want the blocks to help control register usage and hide high
//   latencies later. To help control register usage, we typically want all
//   local computations, when for example you create a result that can be
//   consumed right away, to be contained in a block. Block inputs and
//   outputs would typically be important results that are needed in several
//   locations of the shader. Since we do want blocks to help hide high
//   latencies, we want the instructions inside the block to have a minimal
//   set of dependencies on high latencies. That will make it easy to pick
//   blocks to hide specific high latencies.
//   The block creation algorithm is divided into several steps, and several
//   variants can be tried during the scheduling process.
// Second the order of the instructions inside the blocks is chosen.
//   At that step we take into account only register usage and hiding
//   low latency instructions.
//
// Third the block order is chosen. There we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually when talking about 'low latency' or 'high latency' it includes
// both the latency for the cache (or global mem) data to get to the
// register, and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why even if the wavefront count is
// high, we have to try to have as many instructions hiding high latencies
// as possible.
// The OpenCL doc gives for example a latency of 400 cycles for a global mem
// access, which is hidden by 10 instructions if the wavefront count is 10.
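// (One way to read that figure: with wavefront instructions issuing every
// 4 cycles and 10 wavefronts interleaved, one instruction per wavefront
// covers 4 * 10 = 40 cycles, so 400 / 40 = 10 independent instructions
// suffice to cover the 400-cycle latency.)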
// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64 bytes
// per line.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated to the 3 other VALUs
// of the CU do texture sampling too. (Don't take these figures too seriously,
// as I'm not 100% sure of the computation.)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Units".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specially take that into account when scheduling,
// as we expect the driver to be able to preload the constants soon.
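// A rough sketch of how the pieces below fit together (see
// SIScheduler::scheduleVariant near the end of this file):
//   SIScheduleBlockCreator   - colors the SUnit graph and splits it into
//                              SIScheduleBlocks.
//   SIScheduleBlock          - schedules the instructions inside one block.
//   SIScheduleBlockScheduler - picks the order of the blocks.
//   SIScheduleDAGMI          - drives the above and runs moveLowLatencies()
//                              as a final pass.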
// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:      return "NOCAND";
  case RegUsage:    return "REGUSAGE";
  case Latency:     return "LATENCY";
  case Successor:   return "SUCCESSOR";
  case Depth:       return "DEPTH";
  case NodeOrder:   return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif
namespace llvm {
namespace SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {
  if (TryVal > CandVal) {
    TryCand.Reason = Reason;
    return true;
  }
  if (TryVal < CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;
    return true;
  }
  Cand.setRepeat(Reason);
  return false;
}
} // end namespace SISched
} // end namespace llvm
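// tryLess/tryGreater implement a tie-break cascade: the first criterion that
// strictly orders two candidates wins and the comparison stops; equal values
// return false and fall through to the next criterion. Illustrative
// (hypothetical values): with TryCand.SGPRUsage = 62 and Cand.SGPRUsage = 70,
// tryLess(..., RegUsage) succeeds and the latency-based criteria below are
// never consulted.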
// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {
  NodeNum2Index[SU->NodeNum] = SUnits.size();
  SUnits.push_back(SU);
}
#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif
void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  // If in the block there is a lot of constant loads, the SGPR usage
  // could go quite high, thus the arbitrary limit of 60 above encourages
  // using the already loaded constants (in order to release some SGPRs)
  // before loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}
SUnit* SIScheduleBlock::pickNode() {
  SISchedCandidate TopCand;

  for (SUnit* SU : TopReadySUs) {
    SISchedCandidate TryCand;
    std::vector<unsigned> pressure;
    std::vector<unsigned> MaxPressure;
    // Predict register usage after this instruction.
    TryCand.SU = SU;
    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
    TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
    TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
    TryCand.HasLowLatencyNonWaitedParent =
      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
    tryCandidateTopDown(TopCand, TryCand);
    if (TryCand.Reason != NoCand)
      TopCand.setBest(TryCand);
  }

  return TopCand.SU;
}
// Schedule something valid.
void SIScheduleBlock::fastSchedule() {
  TopReadySUs.clear();
  if (Scheduled)
    undoSchedule();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = TopReadySUs[0];
    ScheduledSUnits.push_back(SU);
    nodeScheduled(SU);
  }

  Scheduled = true;
}
// Returns if the register was set between first and last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {
  for (MachineRegisterInfo::def_instr_iterator
       UI = MRI->def_instr_begin(Reg),
       UE = MRI->def_instr_end(); UI != UE; ++UI) {
    const MachineInstr* MI = &*UI;
    if (MI->isDebugValue())
      continue;
    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
    if (InstSlot >= First && InstSlot <= Last)
      return true;
  }
  return false;
}
void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the
  // SUs to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not Track Physical Registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (Register::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7.
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to not make mistakes for
  // case 4.)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
  // The use of findDefBetween removes the case 4.
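  // Example (hypothetical register %v): if %v is defined before the block,
  // read inside it and rewritten by it (cases 5, 6, 7 above), %v shows up in
  // LiveInRegs; it is kept in LiveOutRegs only when isDefBetween() confirms
  // a def inside [BeginBlock, EndBlock] and the rewritten value is still
  // needed by a later block.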
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    Register Reg = RegMaskPair.RegUnit;
    if (Reg.isVirtual() &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bits registers
  // for example, but they actually correspond to 4 actual 32 bits registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}
void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}
void SIScheduleBlock::undoSchedule() {
  for (SUnit* SU : SUnits) {
    SU->isScheduled = false;
    for (SDep& Succ : SU->Succs) {
      if (BC->isSUInBlock(Succ.getSUnit(), ID))
        undoReleaseSucc(SU, &Succ);
    }
  }
  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
  ScheduledSUnits.clear();
  Scheduled = false;
}
void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    ++SuccSU->WeakPredsLeft;
    return;
  }
  ++SuccSU->NumPredsLeft;
}
void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}
/// Release Successors of the SU that are in the block or not.
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
  for (SDep& Succ : SU->Succs) {
    SUnit *SuccSU = Succ.getSUnit();

    if (SuccSU->NodeNum >= DAG->SUnits.size())
      continue;

    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
      continue;

    releaseSucc(SU, &Succ);
    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
      TopReadySUs.push_back(SuccSU);
  }
}
void SIScheduleBlock::nodeScheduled(SUnit *SU) {
  // Is in TopReadySUs
  assert (!SU->NumPredsLeft);
  std::vector<SUnit*>::iterator I = llvm::find(TopReadySUs, SU);
  if (I == TopReadySUs.end()) {
    dbgs() << "Data Structure Bug in SI Scheduler\n";
    llvm_unreachable(nullptr);
  }
  TopReadySUs.erase(I);

  releaseSuccessors(SU, true);
  // Scheduling this node will trigger a wait,
  // thus propagate to other instructions that they do not need to wait either.
  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);

  if (DAG->IsLowLatencySU[SU->NodeNum]) {
    for (SDep& Succ : SU->Succs) {
      std::map<unsigned, unsigned>::iterator I =
        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
      if (I != NodeNum2Index.end())
        HasLowLatencyNonWaitedParent[I->second] = 1;
    }
  }
  SU->isScheduled = true;
}
void SIScheduleBlock::finalizeUnits() {
  // We remove links from outside blocks to enable scheduling inside the block.
  for (SUnit* SU : SUnits) {
    releaseSuccessors(SU, false);
    if (DAG->IsHighLatencySU[SU->NodeNum])
      HighLatencyBlock = true;
  }
  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
}
// we maintain ascending order of IDs
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
  unsigned PredID = Pred->getID();

  // Check if not already predecessor.
  for (SIScheduleBlock* P : Preds) {
    if (PredID == P->getID())
      return;
  }
  Preds.push_back(Pred);

  assert(none_of(Succs,
                 [=](std::pair<SIScheduleBlock*,
                               SIScheduleBlockLinkKind> S) {
                   return PredID == S.first->getID();
                 }) &&
         "Loop in the Block Graph!");
}
void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {
  unsigned SuccID = Succ->getID();

  // Check if not already successor.
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
    if (SuccID == S.first->getID()) {
      if (S.second == SIScheduleBlockLinkKind::NoData &&
          Kind == SIScheduleBlockLinkKind::Data)
        S.second = Kind;
      return;
    }
  }
  if (Succ->isHighLatencyBlock())
    ++NumHighLatencySuccessors;
  Succs.push_back(std::make_pair(Succ, Kind));

  assert(none_of(Preds,
                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
         "Loop in the Block Graph!");
}
#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif
// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}
SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
    Blocks.find(BlockVariant);
  if (B == Blocks.end()) {
    SIScheduleBlocks Res;
    createBlocksForVariant(BlockVariant);
    topologicalSort();
    scheduleInsideBlocks();
    fillStats();
    Res.Blocks = CurrentBlocks;
    Res.TopDownIndex2Block = TopDownIndex2Block;
    Res.TopDownBlock2Index = TopDownBlock2Index;
    Blocks[BlockVariant] = Res;
    return Res;
  }

  return B->second;
}
bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
  if (SU->NodeNum >= DAG->SUnits.size())
    return false;
  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
}
void SIScheduleBlockCreator::colorHighLatenciesAlone() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum]) {
      CurrentColoring[SU->NodeNum] = NextReservedID++;
    }
  }
}
static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
  for (const auto &PredDep : SU.Preds) {
    if (PredDep.getSUnit() == &FromSU &&
        PredDep.getKind() == llvm::SDep::Data)
      return true;
  }
  return false;
}
void SIScheduleBlockCreator::colorHighLatenciesGroups() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned NumHighLatencies = 0;
  unsigned GroupSize;
  int Color = NextReservedID;
  unsigned Count = 0;
  std::set<unsigned> FormingGroup;

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    if (DAG->IsHighLatencySU[SU->NodeNum])
      ++NumHighLatencies;
  }

  if (NumHighLatencies == 0)
    return;

  if (NumHighLatencies <= 6)
    GroupSize = 2;
  else if (NumHighLatencies <= 12)
    GroupSize = 3;
  else
    GroupSize = 4;

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (DAG->IsHighLatencySU[SU.NodeNum]) {
      bool CompatibleGroup = true;
      int ProposedColor = Color;
      std::vector<int> AdditionalElements;

      // We don't want to put in the same block
      // two high latency instructions that depend
      // on each other.
      // One way would be to check canAddEdge
      // in both directions, but that currently is not
      // enough because there the high latency order is
      // enforced (via links).
      // Instead, look at the dependencies between the
      // high latency instructions and deduce if it is
      // a data dependency or not.
      for (unsigned j : FormingGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other
        if (SubGraph.size() > 5) {
          // Too many elements would be required to be added to the block.
          CompatibleGroup = false;
          break;
        }
        // Check the type of dependency
        for (unsigned k : SubGraph) {
          // If in the path to join the two instructions,
          // there is another high latency instruction,
          // or instructions colored for another block,
          // abort the merge.
          if (DAG->IsHighLatencySU[k] ||
              (CurrentColoring[k] != ProposedColor &&
               CurrentColoring[k] != 0)) {
            CompatibleGroup = false;
            break;
          }
          // If one of the SUs in the subgraph depends on the result of SU j,
          // there'll be a data dependency.
          if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
            CompatibleGroup = false;
            break;
          }
        }
        if (!CompatibleGroup)
          break;
        // Same check for the SU
        if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
          CompatibleGroup = false;
          break;
        }
        // Add all the required instructions to the block.
        // They cannot live in another block (because they
        // depend (order dependency) on one of the
        // instructions in the block, and are required for the
        // high latency instruction we add).
        llvm::append_range(AdditionalElements, SubGraph);
      }
      if (CompatibleGroup) {
        FormingGroup.insert(SU.NodeNum);
        for (unsigned j : AdditionalElements)
          CurrentColoring[j] = ProposedColor;
        CurrentColoring[SU.NodeNum] = ProposedColor;
        ++Count;
      }
      // Found one incompatible instruction,
      // or has filled a big enough group.
      // -> start a new one.
      if (!CompatibleGroup) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        FormingGroup.insert(SU.NodeNum);
        CurrentColoring[SU.NodeNum] = ProposedColor;
        Count = 0;
      } else if (Count == GroupSize) {
        FormingGroup.clear();
        Color = ++NextReservedID;
        ProposedColor = Color;
        Count = 0;
      }
    }
  }
}
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;

  // Every combination of colors given by the top down
  // and bottom up Reserved node dependency

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    std::pair<unsigned, unsigned> SUColors;

    // High latency instructions: already given.
    if (CurrentColoring[SU->NodeNum])
      continue;

    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];

    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
      ColorCombinations.find(SUColors);
    if (Pos != ColorCombinations.end()) {
      CurrentColoring[SU->NodeNum] = Pos->second;
    } else {
      CurrentColoring[SU->NodeNum] = NextNonReservedID;
      ColorCombinations[SUColors] = NextNonReservedID++;
    }
  }
}
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}
void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // No predecessor: Vgpr constant loading.
    // Low latency instructions usually have a predecessor (the address).
    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}
void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
  unsigned DAGSize = DAG->SUnits.size();

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
  }
}
void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned, unsigned> ColorCount;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    ++ColorCount[color];
  }

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    unsigned color = CurrentColoring[SU->NodeNum];
    std::set<unsigned> SUColors;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    if (ColorCount[color] > 1)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      SUColors.insert(CurrentColoring[Succ->NodeNum]);
    }
    if (SUColors.size() == 1 && *SUColors.begin() != color) {
      --ColorCount[color];
      CurrentColoring[SU->NodeNum] = *SUColors.begin();
      ++ColorCount[*SUColors.begin()];
    }
  }
}
void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}
void SIScheduleBlockCreator::regroupNoUserInstructions() {
  unsigned DAGSize = DAG->SUnits.size();
  int GroupID = NextNonReservedID++;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    bool hasSuccessor = false;

    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      hasSuccessor = true;
    }
    if (!hasSuccessor)
      CurrentColoring[SU->NodeNum] = GroupID;
  }
}
void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register as used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // ie without needing any other instruction.
      // The EXP is allowed to depend on other EXPs
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
        // in the parent graph of SU.
#ifndef NDEBUG
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Instructions other than EXP would be required in the group.
            // Abort the grouping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
               SIScheduleBlock *Block = CurrentBlocks[i];
               Block->printDebug(true);
             });
}
// Two functions taken from Codegen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {
  for (; I != End; ++I) {
    if (!I->isDebugInstr())
      break;
  }
  return I;
}
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->printDebug(true);
  });
}
void SIScheduleBlockCreator::fillStats() {
  unsigned DAGSize = CurrentBlocks.size();

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getPreds().empty())
      Block->Depth = 0;
    else {
      unsigned Depth = 0;
      for (SIScheduleBlock *Pred : Block->getPreds()) {
        if (Depth < Pred->Depth + Pred->getCost())
          Depth = Pred->Depth + Pred->getCost();
      }
      Block->Depth = Depth;
    }
  }

  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = BottomUpIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    if (Block->getSuccs().empty())
      Block->Height = 0;
    else {
      unsigned Height = 0;
      for (const auto &Succ : Block->getSuccs())
        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
      Block->Height = Height;
    }
  }
}
// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
  // Fill the usage of every output
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }
  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }
#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);
  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }
  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred : Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled,
                       TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}
bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}
SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block
                  : ReadyBlocks) dbgs()
             << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg
                  : LiveRegs) dbgs()
             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}
// Tracking of currently alive registers to determine VGPR Usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
  for (Register Reg : Regs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    // If not already in the live set, then add it.
    (void) LiveRegs.insert(Reg);
  }
}
void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {
  for (unsigned Reg : Regs) {
    // For now only track virtual registers.
    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    assert (Pos != LiveRegs.end() && // Reg must be live.
            LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
            LiveRegsConsumers[Reg] >= 1);
    --LiveRegsConsumers[Reg];
    if (LiveRegsConsumers[Reg] == 0)
      LiveRegs.erase(Pos);
  }
}
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
  for (const auto &Block : Parent->getSuccs()) {
    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
      ReadyBlocks.push_back(Block.first);

    if (Parent->isHighLatencyBlock() &&
        Block.second == SIScheduleBlockLinkKind::Data)
      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
  }
}
void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
  decreaseLiveRegs(Block, Block->getInRegs());
  addLiveRegs(Block->getOutRegs());
  releaseBlockSuccs(Block);
  for (std::map<unsigned, unsigned>::iterator RegI =
       LiveOutRegsNumUsages[Block->getID()].begin(),
       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    std::pair<unsigned, unsigned> RegP = *RegI;
    // We produce this register, thus it must not be previously alive.
    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
           LiveRegsConsumers[RegP.first] == 0);
    LiveRegsConsumers[RegP.first] += RegP.second;
  }
  if (LastPosHighLatencyParentScheduled[Block->getID()] >
        (unsigned)LastPosWaitedHighLatency)
    LastPosWaitedHighLatency =
      LastPosHighLatencyParentScheduled[Block->getID()];
  ++NumBlockScheduled;
}
std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {
  std::vector<int> DiffSetPressure;
  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);

  for (Register Reg : InRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    if (LiveRegsConsumers[Reg] > 1)
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] -= PSetI.getWeight();
    }
  }

  for (Register Reg : OutRegs) {
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      DiffSetPressure[*PSetI] += PSetI.getWeight();
    }
  }

  return DiffSetPressure;
}
// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
  std::vector<SIScheduleBlock*> ScheduledBlocks;
  struct SIScheduleBlockResult Res;

  ScheduledBlocks = Scheduler.getBlocks();

  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    SIScheduleBlock *Block = ScheduledBlocks[b];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs)
      Res.SUs.push_back(SU->NodeNum);
  }

  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
  return Res;
}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
  SITII = static_cast<const SIInstrInfo *>(TII);
  SITRI = static_cast<const SIRegisterInfo *>(TRI);
}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from scheduleDAG.cpp
// Does a topological sort over the SUs.
// Both TopDown and BottomUp orderings are produced.
void SIScheduleDAGMI::topologicalSort() {
  Topo.InitDAGTopologicalSorting();

  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
}

// Move low latencies further from their user without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would
// take into account SGPR usage (based on VGPR usage
// and the corresponding wavefront count), and that would
// try to merge groups of loads if it makes sense, etc.
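//
// As an illustration (hypothetical sequence, mnemonics only for flavor):
// if a low latency load such as an s_buffer_load_dword ends up immediately
// before its first user, the loop below slides it earlier in the schedule,
// to just after the previous low latency user, so that the independent
// instructions in between help cover its latency.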
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        // Shift the intervening instructions down by one slot and keep the
        // inverse map in sync.
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Also moves the COPY instructions that the low latency
    // instructions depend on.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
          continue;
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}
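
// Restore the dependency counters and isScheduled flags saved in
// SUnitsLinksBackup, so that the same DAG can be scheduled again by another
// variant.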
void SIScheduleDAGMI::restoreSULinksLeft() {
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnits[i].isScheduled = false;
    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
  }
}
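
// Note: VGPR_32 and SReg_32 are the two pressure sets this helper tracks,
// so usage is reported in units of 32-bit VGPRs and SGPRs.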
// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
template<typename _Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {
  VgprUsage = 0;
  SgprUsage = 0;
  for (_Iterator RegI = First; RegI != End; ++RegI) {
    Register Reg = *RegI;
    // For now only track virtual registers.
    if (!Reg.isVirtual())
      continue;
    PSetIterator PSetI = MRI.getPressureSets(Reg);
    for (; PSetI.isValid(); ++PSetI) {
      if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
        VgprUsage += PSetI.getWeight();
      else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
        SgprUsage += PSetI.getWeight();
    }
  }
}

void SIScheduleDAGMI::schedule()
{
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  LLVM_DEBUG(dump());

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it).
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    const MachineOperand *BaseLatOp;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      bool OffsetIsScalable;
      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
                                         OffsetIsScalable, TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
           SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
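
  // (The thresholds below are heuristics: the hardware provides 256 VGPRs
  // per lane, and the closer usage gets to that limit, the fewer wavefronts
  // can run concurrently, until eventually spilling becomes necessary.)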
  // If VGPR usage is extremely high, try other well performing variants
  // which could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  // If VGPR usage is still extremely high, we may spill. Try other variants
  // which perform less well, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}