1 //===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
11 // This SMS implementation is a target-independent back-end pass. When enabled,
12 // the pass runs just prior to the register allocation pass, while the machine
13 // IR is in SSA form. If software pipelining is successful, then the original
14 // loop is replaced by the optimized loop. The optimized loop contains one or
15 // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16 // the instructions cannot be scheduled in a given MII, we increase the MII by
19 // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20 // represent loop carried dependences in the DAG as order edges to the Phi
21 // nodes. We also perform several passes over the DAG to eliminate unnecessary
22 // edges that inhibit the ability to pipeline. The implementation uses the
23 // DFAPacketizer class to compute the minimum initiation interval and the check
24 // where an instruction may be inserted in the pipelined schedule.
26 // In order for the SMS pass to work, several target specific hooks need to be
27 // implemented to get information about the loop structure and to rewrite
30 //===----------------------------------------------------------------------===//
32 #include "llvm/CodeGen/MachinePipeliner.h"
33 #include "llvm/ADT/ArrayRef.h"
34 #include "llvm/ADT/BitVector.h"
35 #include "llvm/ADT/DenseMap.h"
36 #include "llvm/ADT/MapVector.h"
37 #include "llvm/ADT/PriorityQueue.h"
38 #include "llvm/ADT/STLExtras.h"
39 #include "llvm/ADT/SetOperations.h"
40 #include "llvm/ADT/SetVector.h"
41 #include "llvm/ADT/SmallPtrSet.h"
42 #include "llvm/ADT/SmallSet.h"
43 #include "llvm/ADT/SmallVector.h"
44 #include "llvm/ADT/Statistic.h"
45 #include "llvm/ADT/iterator_range.h"
46 #include "llvm/Analysis/AliasAnalysis.h"
47 #include "llvm/Analysis/CycleAnalysis.h"
48 #include "llvm/Analysis/MemoryLocation.h"
49 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
50 #include "llvm/Analysis/ValueTracking.h"
51 #include "llvm/CodeGen/DFAPacketizer.h"
52 #include "llvm/CodeGen/LiveIntervals.h"
53 #include "llvm/CodeGen/MachineBasicBlock.h"
54 #include "llvm/CodeGen/MachineDominators.h"
55 #include "llvm/CodeGen/MachineFunction.h"
56 #include "llvm/CodeGen/MachineFunctionPass.h"
57 #include "llvm/CodeGen/MachineInstr.h"
58 #include "llvm/CodeGen/MachineInstrBuilder.h"
59 #include "llvm/CodeGen/MachineLoopInfo.h"
60 #include "llvm/CodeGen/MachineMemOperand.h"
61 #include "llvm/CodeGen/MachineOperand.h"
62 #include "llvm/CodeGen/MachineRegisterInfo.h"
63 #include "llvm/CodeGen/ModuloSchedule.h"
64 #include "llvm/CodeGen/Register.h"
65 #include "llvm/CodeGen/RegisterClassInfo.h"
66 #include "llvm/CodeGen/RegisterPressure.h"
67 #include "llvm/CodeGen/ScheduleDAG.h"
68 #include "llvm/CodeGen/ScheduleDAGMutation.h"
69 #include "llvm/CodeGen/TargetInstrInfo.h"
70 #include "llvm/CodeGen/TargetOpcodes.h"
71 #include "llvm/CodeGen/TargetPassConfig.h"
72 #include "llvm/CodeGen/TargetRegisterInfo.h"
73 #include "llvm/CodeGen/TargetSubtargetInfo.h"
74 #include "llvm/Config/llvm-config.h"
75 #include "llvm/IR/Attributes.h"
76 #include "llvm/IR/Function.h"
77 #include "llvm/MC/LaneBitmask.h"
78 #include "llvm/MC/MCInstrDesc.h"
79 #include "llvm/MC/MCInstrItineraries.h"
80 #include "llvm/MC/MCRegisterInfo.h"
81 #include "llvm/Pass.h"
82 #include "llvm/Support/CommandLine.h"
83 #include "llvm/Support/Compiler.h"
84 #include "llvm/Support/Debug.h"
85 #include "llvm/Support/MathExtras.h"
86 #include "llvm/Support/raw_ostream.h"
102 using namespace llvm
;
104 #define DEBUG_TYPE "pipeliner"
106 STATISTIC(NumTrytoPipeline
, "Number of loops that we attempt to pipeline");
107 STATISTIC(NumPipelined
, "Number of loops software pipelined");
108 STATISTIC(NumNodeOrderIssues
, "Number of node order issues found");
109 STATISTIC(NumFailBranch
, "Pipeliner abort due to unknown branch");
110 STATISTIC(NumFailLoop
, "Pipeliner abort due to unsupported loop");
111 STATISTIC(NumFailPreheader
, "Pipeliner abort due to missing preheader");
112 STATISTIC(NumFailLargeMaxMII
, "Pipeliner abort due to MaxMII too large");
113 STATISTIC(NumFailZeroMII
, "Pipeliner abort due to zero MII");
114 STATISTIC(NumFailNoSchedule
, "Pipeliner abort due to no schedule found");
115 STATISTIC(NumFailZeroStage
, "Pipeliner abort due to zero stage");
116 STATISTIC(NumFailLargeMaxStage
, "Pipeliner abort due to too many stages");
118 /// A command line option to turn software pipelining on or off.
119 static cl::opt
<bool> EnableSWP("enable-pipeliner", cl::Hidden
, cl::init(true),
120 cl::desc("Enable Software Pipelining"));
122 /// A command line option to enable SWP at -Os.
123 static cl::opt
<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
124 cl::desc("Enable SWP at Os."), cl::Hidden
,
127 /// A command line argument to limit minimum initial interval for pipelining.
128 static cl::opt
<int> SwpMaxMii("pipeliner-max-mii",
129 cl::desc("Size limit for the MII."),
130 cl::Hidden
, cl::init(27));
132 /// A command line argument to force pipeliner to use specified initial
134 static cl::opt
<int> SwpForceII("pipeliner-force-ii",
135 cl::desc("Force pipeliner to use specified II."),
136 cl::Hidden
, cl::init(-1));
138 /// A command line argument to limit the number of stages in the pipeline.
140 SwpMaxStages("pipeliner-max-stages",
141 cl::desc("Maximum stages allowed in the generated scheduled."),
142 cl::Hidden
, cl::init(3));
144 /// A command line option to disable the pruning of chain dependences due to
145 /// an unrelated Phi.
147 SwpPruneDeps("pipeliner-prune-deps",
148 cl::desc("Prune dependences between unrelated Phi nodes."),
149 cl::Hidden
, cl::init(true));
151 /// A command line option to disable the pruning of loop carried order
154 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
155 cl::desc("Prune loop carried order dependences."),
156 cl::Hidden
, cl::init(true));
159 static cl::opt
<int> SwpLoopLimit("pipeliner-max", cl::Hidden
, cl::init(-1));
162 static cl::opt
<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
164 cl::desc("Ignore RecMII"));
166 static cl::opt
<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden
,
168 static cl::opt
<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden
,
171 static cl::opt
<bool> EmitTestAnnotations(
172 "pipeliner-annotate-for-testing", cl::Hidden
, cl::init(false),
173 cl::desc("Instead of emitting the pipelined code, annotate instructions "
174 "with the generated schedule for feeding into the "
175 "-modulo-schedule-test pass"));
177 static cl::opt
<bool> ExperimentalCodeGen(
178 "pipeliner-experimental-cg", cl::Hidden
, cl::init(false),
180 "Use the experimental peeling code generator for software pipelining"));
182 static cl::opt
<int> SwpIISearchRange("pipeliner-ii-search-range",
183 cl::desc("Range to search for II"),
184 cl::Hidden
, cl::init(10));
187 LimitRegPressure("pipeliner-register-pressure", cl::Hidden
, cl::init(false),
188 cl::desc("Limit register pressure of scheduled loop"));
191 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden
,
193 cl::desc("Margin representing the unused percentage of "
194 "the register pressure limit"));
197 MVECodeGen("pipeliner-mve-cg", cl::Hidden
, cl::init(false),
198 cl::desc("Use the MVE code generator for software pipelining"));
202 // A command line option to enable the CopyToPhi DAG mutation.
203 cl::opt
<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden
,
205 cl::desc("Enable CopyToPhi DAG Mutation"));
207 /// A command line argument to force pipeliner to use specified issue
209 cl::opt
<int> SwpForceIssueWidth(
210 "pipeliner-force-issue-width",
211 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden
,
214 /// A command line argument to set the window scheduling option.
215 cl::opt
<WindowSchedulingFlag
> WindowSchedulingOption(
216 "window-sched", cl::Hidden
, cl::init(WindowSchedulingFlag::WS_On
),
217 cl::desc("Set how to use window scheduling algorithm."),
218 cl::values(clEnumValN(WindowSchedulingFlag::WS_Off
, "off",
219 "Turn off window algorithm."),
220 clEnumValN(WindowSchedulingFlag::WS_On
, "on",
221 "Use window algorithm after SMS algorithm fails."),
222 clEnumValN(WindowSchedulingFlag::WS_Force
, "force",
223 "Use window algorithm instead of SMS algorithm.")));
225 } // end namespace llvm
227 unsigned SwingSchedulerDAG::Circuits::MaxPaths
= 5;
228 char MachinePipeliner::ID
= 0;
230 int MachinePipeliner::NumTries
= 0;
232 char &llvm::MachinePipelinerID
= MachinePipeliner::ID
;
234 INITIALIZE_PASS_BEGIN(MachinePipeliner
, DEBUG_TYPE
,
235 "Modulo Software Pipelining", false, false)
236 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass
)
237 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass
)
238 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass
)
239 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass
)
240 INITIALIZE_PASS_END(MachinePipeliner
, DEBUG_TYPE
,
241 "Modulo Software Pipelining", false, false)
243 /// The "main" function for implementing Swing Modulo Scheduling.
244 bool MachinePipeliner::runOnMachineFunction(MachineFunction
&mf
) {
245 if (skipFunction(mf
.getFunction()))
251 if (mf
.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize
) &&
252 !EnableSWPOptSize
.getPosition())
255 if (!mf
.getSubtarget().enableMachinePipeliner())
258 // Cannot pipeline loops without instruction itineraries if we are using
259 // DFA for the pipeliner.
260 if (mf
.getSubtarget().useDFAforSMS() &&
261 (!mf
.getSubtarget().getInstrItineraryData() ||
262 mf
.getSubtarget().getInstrItineraryData()->isEmpty()))
266 MLI
= &getAnalysis
<MachineLoopInfoWrapperPass
>().getLI();
267 MDT
= &getAnalysis
<MachineDominatorTreeWrapperPass
>().getDomTree();
268 ORE
= &getAnalysis
<MachineOptimizationRemarkEmitterPass
>().getORE();
269 TII
= MF
->getSubtarget().getInstrInfo();
270 RegClassInfo
.runOnMachineFunction(*MF
);
272 for (const auto &L
: *MLI
)
278 /// Attempt to perform the SMS algorithm on the specified loop. This function is
279 /// the main entry point for the algorithm. The function identifies candidate
280 /// loops, calculates the minimum initiation interval, and attempts to schedule
282 bool MachinePipeliner::scheduleLoop(MachineLoop
&L
) {
283 bool Changed
= false;
284 for (const auto &InnerLoop
: L
)
285 Changed
|= scheduleLoop(*InnerLoop
);
288 // Stop trying after reaching the limit (if any).
289 int Limit
= SwpLoopLimit
;
291 if (NumTries
>= SwpLoopLimit
)
297 setPragmaPipelineOptions(L
);
298 if (!canPipelineLoop(L
)) {
299 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
301 return MachineOptimizationRemarkMissed(DEBUG_TYPE
, "canPipelineLoop",
302 L
.getStartLoc(), L
.getHeader())
303 << "Failed to pipeline loop";
306 LI
.LoopPipelinerInfo
.reset();
311 if (useSwingModuloScheduler())
312 Changed
= swingModuloScheduler(L
);
314 if (useWindowScheduler(Changed
))
315 Changed
= runWindowScheduler(L
);
317 LI
.LoopPipelinerInfo
.reset();
321 void MachinePipeliner::setPragmaPipelineOptions(MachineLoop
&L
) {
322 // Reset the pragma for the next loop in iteration.
323 disabledByPragma
= false;
326 MachineBasicBlock
*LBLK
= L
.getTopBlock();
331 const BasicBlock
*BBLK
= LBLK
->getBasicBlock();
335 const Instruction
*TI
= BBLK
->getTerminator();
339 MDNode
*LoopID
= TI
->getMetadata(LLVMContext::MD_loop
);
340 if (LoopID
== nullptr)
343 assert(LoopID
->getNumOperands() > 0 && "requires atleast one operand");
344 assert(LoopID
->getOperand(0) == LoopID
&& "invalid loop");
346 for (const MDOperand
&MDO
: llvm::drop_begin(LoopID
->operands())) {
347 MDNode
*MD
= dyn_cast
<MDNode
>(MDO
);
352 MDString
*S
= dyn_cast
<MDString
>(MD
->getOperand(0));
357 if (S
->getString() == "llvm.loop.pipeline.initiationinterval") {
358 assert(MD
->getNumOperands() == 2 &&
359 "Pipeline initiation interval hint metadata should have two operands.");
361 mdconst::extract
<ConstantInt
>(MD
->getOperand(1))->getZExtValue();
362 assert(II_setByPragma
>= 1 && "Pipeline initiation interval must be positive.");
363 } else if (S
->getString() == "llvm.loop.pipeline.disable") {
364 disabledByPragma
= true;
369 /// Return true if the loop can be software pipelined. The algorithm is
370 /// restricted to loops with a single basic block. Make sure that the
371 /// branch in the loop can be analyzed.
372 bool MachinePipeliner::canPipelineLoop(MachineLoop
&L
) {
373 if (L
.getNumBlocks() != 1) {
375 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE
, "canPipelineLoop",
376 L
.getStartLoc(), L
.getHeader())
377 << "Not a single basic block: "
378 << ore::NV("NumBlocks", L
.getNumBlocks());
383 if (disabledByPragma
) {
385 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE
, "canPipelineLoop",
386 L
.getStartLoc(), L
.getHeader())
387 << "Disabled by Pragma.";
392 // Check if the branch can't be understood because we can't do pipelining
393 // if that's the case.
397 if (TII
->analyzeBranch(*L
.getHeader(), LI
.TBB
, LI
.FBB
, LI
.BrCond
)) {
398 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
401 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE
, "canPipelineLoop",
402 L
.getStartLoc(), L
.getHeader())
403 << "The branch can't be understood";
408 LI
.LoopInductionVar
= nullptr;
409 LI
.LoopCompare
= nullptr;
410 LI
.LoopPipelinerInfo
= TII
->analyzeLoopForPipelining(L
.getTopBlock());
411 if (!LI
.LoopPipelinerInfo
) {
412 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
415 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE
, "canPipelineLoop",
416 L
.getStartLoc(), L
.getHeader())
417 << "The loop structure is not supported";
422 if (!L
.getLoopPreheader()) {
423 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
426 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE
, "canPipelineLoop",
427 L
.getStartLoc(), L
.getHeader())
428 << "No loop preheader found";
433 // Remove any subregisters from inputs to phi nodes.
434 preprocessPhiNodes(*L
.getHeader());
438 void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock
&B
) {
439 MachineRegisterInfo
&MRI
= MF
->getRegInfo();
441 *getAnalysis
<LiveIntervalsWrapperPass
>().getLIS().getSlotIndexes();
443 for (MachineInstr
&PI
: B
.phis()) {
444 MachineOperand
&DefOp
= PI
.getOperand(0);
445 assert(DefOp
.getSubReg() == 0);
446 auto *RC
= MRI
.getRegClass(DefOp
.getReg());
448 for (unsigned i
= 1, n
= PI
.getNumOperands(); i
!= n
; i
+= 2) {
449 MachineOperand
&RegOp
= PI
.getOperand(i
);
450 if (RegOp
.getSubReg() == 0)
453 // If the operand uses a subregister, replace it with a new register
454 // without subregisters, and generate a copy to the new register.
455 Register NewReg
= MRI
.createVirtualRegister(RC
);
456 MachineBasicBlock
&PredB
= *PI
.getOperand(i
+1).getMBB();
457 MachineBasicBlock::iterator At
= PredB
.getFirstTerminator();
458 const DebugLoc
&DL
= PredB
.findDebugLoc(At
);
459 auto Copy
= BuildMI(PredB
, At
, DL
, TII
->get(TargetOpcode::COPY
), NewReg
)
460 .addReg(RegOp
.getReg(), getRegState(RegOp
),
462 Slots
.insertMachineInstrInMaps(*Copy
);
463 RegOp
.setReg(NewReg
);
469 /// The SMS algorithm consists of the following main steps:
470 /// 1. Computation and analysis of the dependence graph.
471 /// 2. Ordering of the nodes (instructions).
472 /// 3. Attempt to Schedule the loop.
473 bool MachinePipeliner::swingModuloScheduler(MachineLoop
&L
) {
474 assert(L
.getBlocks().size() == 1 && "SMS works on single blocks only.");
476 SwingSchedulerDAG
SMS(
477 *this, L
, getAnalysis
<LiveIntervalsWrapperPass
>().getLIS(), RegClassInfo
,
478 II_setByPragma
, LI
.LoopPipelinerInfo
.get());
480 MachineBasicBlock
*MBB
= L
.getHeader();
481 // The kernel should not include any terminator instructions. These
482 // will be added back later.
485 // Compute the number of 'real' instructions in the basic block by
486 // ignoring terminators.
487 unsigned size
= MBB
->size();
488 for (MachineBasicBlock::iterator I
= MBB
->getFirstTerminator(),
489 E
= MBB
->instr_end();
493 SMS
.enterRegion(MBB
, MBB
->begin(), MBB
->getFirstTerminator(), size
);
498 return SMS
.hasNewSchedule();
501 void MachinePipeliner::getAnalysisUsage(AnalysisUsage
&AU
) const {
502 AU
.addRequired
<AAResultsWrapperPass
>();
503 AU
.addPreserved
<AAResultsWrapperPass
>();
504 AU
.addRequired
<MachineLoopInfoWrapperPass
>();
505 AU
.addRequired
<MachineDominatorTreeWrapperPass
>();
506 AU
.addRequired
<LiveIntervalsWrapperPass
>();
507 AU
.addRequired
<MachineOptimizationRemarkEmitterPass
>();
508 AU
.addRequired
<TargetPassConfig
>();
509 MachineFunctionPass::getAnalysisUsage(AU
);
512 bool MachinePipeliner::runWindowScheduler(MachineLoop
&L
) {
513 MachineSchedContext Context
;
517 Context
.PassConfig
= &getAnalysis
<TargetPassConfig
>();
518 Context
.AA
= &getAnalysis
<AAResultsWrapperPass
>().getAAResults();
519 Context
.LIS
= &getAnalysis
<LiveIntervalsWrapperPass
>().getLIS();
520 Context
.RegClassInfo
->runOnMachineFunction(*MF
);
521 WindowScheduler
WS(&Context
, L
);
525 bool MachinePipeliner::useSwingModuloScheduler() {
526 // SwingModuloScheduler does not work when WindowScheduler is forced.
527 return WindowSchedulingOption
!= WindowSchedulingFlag::WS_Force
;
530 bool MachinePipeliner::useWindowScheduler(bool Changed
) {
531 // WindowScheduler does not work for following cases:
532 // 1. when it is off.
533 // 2. when SwingModuloScheduler is successfully scheduled.
534 // 3. when pragma II is enabled.
535 if (II_setByPragma
) {
536 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
537 "llvm.loop.pipeline.initiationinterval is set.\n");
541 return WindowSchedulingOption
== WindowSchedulingFlag::WS_Force
||
542 (WindowSchedulingOption
== WindowSchedulingFlag::WS_On
&& !Changed
);
545 void SwingSchedulerDAG::setMII(unsigned ResMII
, unsigned RecMII
) {
548 else if (II_setByPragma
> 0)
549 MII
= II_setByPragma
;
551 MII
= std::max(ResMII
, RecMII
);
554 void SwingSchedulerDAG::setMAX_II() {
557 else if (II_setByPragma
> 0)
558 MAX_II
= II_setByPragma
;
560 MAX_II
= MII
+ SwpIISearchRange
;
563 /// We override the schedule function in ScheduleDAGInstrs to implement the
564 /// scheduling part of the Swing Modulo Scheduling algorithm.
565 void SwingSchedulerDAG::schedule() {
566 AliasAnalysis
*AA
= &Pass
.getAnalysis
<AAResultsWrapperPass
>().getAAResults();
568 addLoopCarriedDependences(AA
);
569 updatePhiDependences();
570 Topo
.InitDAGTopologicalSorting();
575 NodeSetType NodeSets
;
576 findCircuits(NodeSets
);
577 NodeSetType Circuits
= NodeSets
;
579 // Calculate the MII.
580 unsigned ResMII
= calculateResMII();
581 unsigned RecMII
= calculateRecMII(NodeSets
);
585 // This flag is used for testing and can cause correctness problems.
589 setMII(ResMII
, RecMII
);
592 LLVM_DEBUG(dbgs() << "MII = " << MII
<< " MAX_II = " << MAX_II
593 << " (rec=" << RecMII
<< ", res=" << ResMII
<< ")\n");
595 // Can't schedule a loop without a valid MII.
597 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
599 Pass
.ORE
->emit([&]() {
600 return MachineOptimizationRemarkAnalysis(
601 DEBUG_TYPE
, "schedule", Loop
.getStartLoc(), Loop
.getHeader())
602 << "Invalid Minimal Initiation Interval: 0";
607 // Don't pipeline large loops.
608 if (SwpMaxMii
!= -1 && (int)MII
> SwpMaxMii
) {
609 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
610 << ", we don't pipeline large loops\n");
611 NumFailLargeMaxMII
++;
612 Pass
.ORE
->emit([&]() {
613 return MachineOptimizationRemarkAnalysis(
614 DEBUG_TYPE
, "schedule", Loop
.getStartLoc(), Loop
.getHeader())
615 << "Minimal Initiation Interval too large: "
616 << ore::NV("MII", (int)MII
) << " > "
617 << ore::NV("SwpMaxMii", SwpMaxMii
) << "."
618 << "Refer to -pipeliner-max-mii.";
623 computeNodeFunctions(NodeSets
);
625 registerPressureFilter(NodeSets
);
627 colocateNodeSets(NodeSets
);
629 checkNodeSets(NodeSets
);
632 for (auto &I
: NodeSets
) {
633 dbgs() << " Rec NodeSet ";
638 llvm::stable_sort(NodeSets
, std::greater
<NodeSet
>());
640 groupRemainingNodes(NodeSets
);
642 removeDuplicateNodes(NodeSets
);
645 for (auto &I
: NodeSets
) {
646 dbgs() << " NodeSet ";
651 computeNodeOrder(NodeSets
);
653 // check for node order issues
654 checkValidNodeOrder(Circuits
);
656 SMSchedule
Schedule(Pass
.MF
, this);
657 Scheduled
= schedulePipeline(Schedule
);
660 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
662 Pass
.ORE
->emit([&]() {
663 return MachineOptimizationRemarkAnalysis(
664 DEBUG_TYPE
, "schedule", Loop
.getStartLoc(), Loop
.getHeader())
665 << "Unable to find schedule";
670 unsigned numStages
= Schedule
.getMaxStageCount();
671 // No need to generate pipeline if there are no overlapped iterations.
672 if (numStages
== 0) {
673 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
675 Pass
.ORE
->emit([&]() {
676 return MachineOptimizationRemarkAnalysis(
677 DEBUG_TYPE
, "schedule", Loop
.getStartLoc(), Loop
.getHeader())
678 << "No need to pipeline - no overlapped iterations in schedule.";
682 // Check that the maximum stage count is less than user-defined limit.
683 if (SwpMaxStages
> -1 && (int)numStages
> SwpMaxStages
) {
684 LLVM_DEBUG(dbgs() << "numStages:" << numStages
<< ">" << SwpMaxStages
685 << " : too many stages, abort\n");
686 NumFailLargeMaxStage
++;
687 Pass
.ORE
->emit([&]() {
688 return MachineOptimizationRemarkAnalysis(
689 DEBUG_TYPE
, "schedule", Loop
.getStartLoc(), Loop
.getHeader())
690 << "Too many stages in schedule: "
691 << ore::NV("numStages", (int)numStages
) << " > "
692 << ore::NV("SwpMaxStages", SwpMaxStages
)
693 << ". Refer to -pipeliner-max-stages.";
698 Pass
.ORE
->emit([&]() {
699 return MachineOptimizationRemark(DEBUG_TYPE
, "schedule", Loop
.getStartLoc(),
701 << "Pipelined succesfully!";
704 // Generate the schedule as a ModuloSchedule.
705 DenseMap
<MachineInstr
*, int> Cycles
, Stages
;
706 std::vector
<MachineInstr
*> OrderedInsts
;
707 for (int Cycle
= Schedule
.getFirstCycle(); Cycle
<= Schedule
.getFinalCycle();
709 for (SUnit
*SU
: Schedule
.getInstructions(Cycle
)) {
710 OrderedInsts
.push_back(SU
->getInstr());
711 Cycles
[SU
->getInstr()] = Cycle
;
712 Stages
[SU
->getInstr()] = Schedule
.stageScheduled(SU
);
715 DenseMap
<MachineInstr
*, std::pair
<unsigned, int64_t>> NewInstrChanges
;
716 for (auto &KV
: NewMIs
) {
717 Cycles
[KV
.first
] = Cycles
[KV
.second
];
718 Stages
[KV
.first
] = Stages
[KV
.second
];
719 NewInstrChanges
[KV
.first
] = InstrChanges
[getSUnit(KV
.first
)];
722 ModuloSchedule
MS(MF
, &Loop
, std::move(OrderedInsts
), std::move(Cycles
),
724 if (EmitTestAnnotations
) {
725 assert(NewInstrChanges
.empty() &&
726 "Cannot serialize a schedule with InstrChanges!");
727 ModuloScheduleTestAnnotater
MSTI(MF
, MS
);
731 // The experimental code generator can't work if there are InstChanges.
732 if (ExperimentalCodeGen
&& NewInstrChanges
.empty()) {
733 PeelingModuloScheduleExpander
MSE(MF
, MS
, &LIS
);
735 } else if (MVECodeGen
&& NewInstrChanges
.empty() &&
736 LoopPipelinerInfo
->isMVEExpanderSupported() &&
737 ModuloScheduleExpanderMVE::canApply(Loop
)) {
738 ModuloScheduleExpanderMVE
MSE(MF
, MS
, LIS
);
741 ModuloScheduleExpander
MSE(MF
, MS
, LIS
, std::move(NewInstrChanges
));
748 /// Clean up after the software pipeliner runs.
749 void SwingSchedulerDAG::finishBlock() {
750 for (auto &KV
: NewMIs
)
751 MF
.deleteMachineInstr(KV
.second
);
754 // Call the superclass.
755 ScheduleDAGInstrs::finishBlock();
758 /// Return the register values for the operands of a Phi instruction.
759 /// This function assume the instruction is a Phi.
760 static void getPhiRegs(MachineInstr
&Phi
, MachineBasicBlock
*Loop
,
761 unsigned &InitVal
, unsigned &LoopVal
) {
762 assert(Phi
.isPHI() && "Expecting a Phi.");
766 for (unsigned i
= 1, e
= Phi
.getNumOperands(); i
!= e
; i
+= 2)
767 if (Phi
.getOperand(i
+ 1).getMBB() != Loop
)
768 InitVal
= Phi
.getOperand(i
).getReg();
770 LoopVal
= Phi
.getOperand(i
).getReg();
772 assert(InitVal
!= 0 && LoopVal
!= 0 && "Unexpected Phi structure.");
775 /// Return the Phi register value that comes the loop block.
776 static unsigned getLoopPhiReg(const MachineInstr
&Phi
,
777 const MachineBasicBlock
*LoopBB
) {
778 for (unsigned i
= 1, e
= Phi
.getNumOperands(); i
!= e
; i
+= 2)
779 if (Phi
.getOperand(i
+ 1).getMBB() == LoopBB
)
780 return Phi
.getOperand(i
).getReg();
784 /// Return true if SUb can be reached from SUa following the chain edges.
785 static bool isSuccOrder(SUnit
*SUa
, SUnit
*SUb
) {
786 SmallPtrSet
<SUnit
*, 8> Visited
;
787 SmallVector
<SUnit
*, 8> Worklist
;
788 Worklist
.push_back(SUa
);
789 while (!Worklist
.empty()) {
790 const SUnit
*SU
= Worklist
.pop_back_val();
791 for (const auto &SI
: SU
->Succs
) {
792 SUnit
*SuccSU
= SI
.getSUnit();
793 if (SI
.getKind() == SDep::Order
) {
794 if (Visited
.count(SuccSU
))
798 Worklist
.push_back(SuccSU
);
799 Visited
.insert(SuccSU
);
806 /// Return true if the instruction causes a chain between memory
807 /// references before and after it.
808 static bool isDependenceBarrier(MachineInstr
&MI
) {
809 return MI
.isCall() || MI
.mayRaiseFPException() ||
810 MI
.hasUnmodeledSideEffects() ||
811 (MI
.hasOrderedMemoryRef() &&
812 (!MI
.mayLoad() || !MI
.isDereferenceableInvariantLoad()));
815 /// Return the underlying objects for the memory references of an instruction.
816 /// This function calls the code in ValueTracking, but first checks that the
817 /// instruction has a memory operand.
818 static void getUnderlyingObjects(const MachineInstr
*MI
,
819 SmallVectorImpl
<const Value
*> &Objs
) {
820 if (!MI
->hasOneMemOperand())
822 MachineMemOperand
*MM
= *MI
->memoperands_begin();
825 getUnderlyingObjects(MM
->getValue(), Objs
);
826 for (const Value
*V
: Objs
) {
827 if (!isIdentifiedObject(V
)) {
834 /// Add a chain edge between a load and store if the store can be an
835 /// alias of the load on a subsequent iteration, i.e., a loop carried
836 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
837 /// but that code doesn't create loop carried dependences.
838 void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis
*AA
) {
839 MapVector
<const Value
*, SmallVector
<SUnit
*, 4>> PendingLoads
;
840 Value
*UnknownValue
=
841 UndefValue::get(Type::getVoidTy(MF
.getFunction().getContext()));
842 for (auto &SU
: SUnits
) {
843 MachineInstr
&MI
= *SU
.getInstr();
844 if (isDependenceBarrier(MI
))
845 PendingLoads
.clear();
846 else if (MI
.mayLoad()) {
847 SmallVector
<const Value
*, 4> Objs
;
848 ::getUnderlyingObjects(&MI
, Objs
);
850 Objs
.push_back(UnknownValue
);
851 for (const auto *V
: Objs
) {
852 SmallVector
<SUnit
*, 4> &SUs
= PendingLoads
[V
];
855 } else if (MI
.mayStore()) {
856 SmallVector
<const Value
*, 4> Objs
;
857 ::getUnderlyingObjects(&MI
, Objs
);
859 Objs
.push_back(UnknownValue
);
860 for (const auto *V
: Objs
) {
861 MapVector
<const Value
*, SmallVector
<SUnit
*, 4>>::iterator I
=
862 PendingLoads
.find(V
);
863 if (I
== PendingLoads
.end())
865 for (auto *Load
: I
->second
) {
866 if (isSuccOrder(Load
, &SU
))
868 MachineInstr
&LdMI
= *Load
->getInstr();
869 // First, perform the cheaper check that compares the base register.
870 // If they are the same and the load offset is less than the store
871 // offset, then mark the dependence as loop carried potentially.
872 const MachineOperand
*BaseOp1
, *BaseOp2
;
873 int64_t Offset1
, Offset2
;
874 bool Offset1IsScalable
, Offset2IsScalable
;
875 if (TII
->getMemOperandWithOffset(LdMI
, BaseOp1
, Offset1
,
876 Offset1IsScalable
, TRI
) &&
877 TII
->getMemOperandWithOffset(MI
, BaseOp2
, Offset2
,
878 Offset2IsScalable
, TRI
)) {
879 if (BaseOp1
->isIdenticalTo(*BaseOp2
) &&
880 Offset1IsScalable
== Offset2IsScalable
&&
881 (int)Offset1
< (int)Offset2
) {
882 assert(TII
->areMemAccessesTriviallyDisjoint(LdMI
, MI
) &&
883 "What happened to the chain edge?");
884 SDep
Dep(Load
, SDep::Barrier
);
890 // Second, the more expensive check that uses alias analysis on the
891 // base registers. If they alias, and the load offset is less than
892 // the store offset, the mark the dependence as loop carried.
894 SDep
Dep(Load
, SDep::Barrier
);
899 MachineMemOperand
*MMO1
= *LdMI
.memoperands_begin();
900 MachineMemOperand
*MMO2
= *MI
.memoperands_begin();
901 if (!MMO1
->getValue() || !MMO2
->getValue()) {
902 SDep
Dep(Load
, SDep::Barrier
);
907 if (MMO1
->getValue() == MMO2
->getValue() &&
908 MMO1
->getOffset() <= MMO2
->getOffset()) {
909 SDep
Dep(Load
, SDep::Barrier
);
915 MemoryLocation::getAfter(MMO1
->getValue(), MMO1
->getAAInfo()),
916 MemoryLocation::getAfter(MMO2
->getValue(),
917 MMO2
->getAAInfo()))) {
918 SDep
Dep(Load
, SDep::Barrier
);
928 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
929 /// processes dependences for PHIs. This function adds true dependences
930 /// from a PHI to a use, and a loop carried dependence from the use to the
931 /// PHI. The loop carried dependence is represented as an anti dependence
932 /// edge. This function also removes chain dependences between unrelated
934 void SwingSchedulerDAG::updatePhiDependences() {
935 SmallVector
<SDep
, 4> RemoveDeps
;
936 const TargetSubtargetInfo
&ST
= MF
.getSubtarget
<TargetSubtargetInfo
>();
938 // Iterate over each DAG node.
939 for (SUnit
&I
: SUnits
) {
941 // Set to true if the instruction has an operand defined by a Phi.
942 unsigned HasPhiUse
= 0;
943 unsigned HasPhiDef
= 0;
944 MachineInstr
*MI
= I
.getInstr();
945 // Iterate over each operand, and we process the definitions.
946 for (const MachineOperand
&MO
: MI
->operands()) {
949 Register Reg
= MO
.getReg();
951 // If the register is used by a Phi, then create an anti dependence.
952 for (MachineRegisterInfo::use_instr_iterator
953 UI
= MRI
.use_instr_begin(Reg
),
954 UE
= MRI
.use_instr_end();
956 MachineInstr
*UseMI
= &*UI
;
957 SUnit
*SU
= getSUnit(UseMI
);
958 if (SU
!= nullptr && UseMI
->isPHI()) {
960 SDep
Dep(SU
, SDep::Anti
, Reg
);
965 // Add a chain edge to a dependent Phi that isn't an existing
967 if (SU
->NodeNum
< I
.NodeNum
&& !I
.isPred(SU
))
968 I
.addPred(SDep(SU
, SDep::Barrier
));
972 } else if (MO
.isUse()) {
973 // If the register is defined by a Phi, then create a true dependence.
974 MachineInstr
*DefMI
= MRI
.getUniqueVRegDef(Reg
);
975 if (DefMI
== nullptr)
977 SUnit
*SU
= getSUnit(DefMI
);
978 if (SU
!= nullptr && DefMI
->isPHI()) {
980 SDep
Dep(SU
, SDep::Data
, Reg
);
982 ST
.adjustSchedDependency(SU
, 0, &I
, MO
.getOperandNo(), Dep
,
987 // Add a chain edge to a dependent Phi that isn't an existing
989 if (SU
->NodeNum
< I
.NodeNum
&& !I
.isPred(SU
))
990 I
.addPred(SDep(SU
, SDep::Barrier
));
995 // Remove order dependences from an unrelated Phi.
998 for (auto &PI
: I
.Preds
) {
999 MachineInstr
*PMI
= PI
.getSUnit()->getInstr();
1000 if (PMI
->isPHI() && PI
.getKind() == SDep::Order
) {
1001 if (I
.getInstr()->isPHI()) {
1002 if (PMI
->getOperand(0).getReg() == HasPhiUse
)
1004 if (getLoopPhiReg(*PMI
, PMI
->getParent()) == HasPhiDef
)
1007 RemoveDeps
.push_back(PI
);
1010 for (const SDep
&D
: RemoveDeps
)
1015 /// Iterate over each DAG node and see if we can change any dependences
1016 /// in order to reduce the recurrence MII.
1017 void SwingSchedulerDAG::changeDependences() {
1018 // See if an instruction can use a value from the previous iteration.
1019 // If so, we update the base and offset of the instruction and change
1021 for (SUnit
&I
: SUnits
) {
1022 unsigned BasePos
= 0, OffsetPos
= 0, NewBase
= 0;
1023 int64_t NewOffset
= 0;
1024 if (!canUseLastOffsetValue(I
.getInstr(), BasePos
, OffsetPos
, NewBase
,
1028 // Get the MI and SUnit for the instruction that defines the original base.
1029 Register OrigBase
= I
.getInstr()->getOperand(BasePos
).getReg();
1030 MachineInstr
*DefMI
= MRI
.getUniqueVRegDef(OrigBase
);
1033 SUnit
*DefSU
= getSUnit(DefMI
);
1036 // Get the MI and SUnit for the instruction that defins the new base.
1037 MachineInstr
*LastMI
= MRI
.getUniqueVRegDef(NewBase
);
1040 SUnit
*LastSU
= getSUnit(LastMI
);
1044 if (Topo
.IsReachable(&I
, LastSU
))
1047 // Remove the dependence. The value now depends on a prior iteration.
1048 SmallVector
<SDep
, 4> Deps
;
1049 for (const SDep
&P
: I
.Preds
)
1050 if (P
.getSUnit() == DefSU
)
1052 for (const SDep
&D
: Deps
) {
1053 Topo
.RemovePred(&I
, D
.getSUnit());
1056 // Remove the chain dependence between the instructions.
1058 for (auto &P
: LastSU
->Preds
)
1059 if (P
.getSUnit() == &I
&& P
.getKind() == SDep::Order
)
1061 for (const SDep
&D
: Deps
) {
1062 Topo
.RemovePred(LastSU
, D
.getSUnit());
1063 LastSU
->removePred(D
);
1066 // Add a dependence between the new instruction and the instruction
1067 // that defines the new base.
1068 SDep
Dep(&I
, SDep::Anti
, NewBase
);
1069 Topo
.AddPred(LastSU
, &I
);
1070 LastSU
->addPred(Dep
);
1072 // Remember the base and offset information so that we can update the
1073 // instruction during code generation.
1074 InstrChanges
[&I
] = std::make_pair(NewBase
, NewOffset
);
1078 /// Create an instruction stream that represents a single iteration and stage of
1079 /// each instruction. This function differs from SMSchedule::finalizeSchedule in
1080 /// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1081 /// function is an approximation of SMSchedule::finalizeSchedule with all
1082 /// non-const operations removed.
1083 static void computeScheduledInsts(const SwingSchedulerDAG
*SSD
,
1084 SMSchedule
&Schedule
,
1085 std::vector
<MachineInstr
*> &OrderedInsts
,
1086 DenseMap
<MachineInstr
*, unsigned> &Stages
) {
1087 DenseMap
<int, std::deque
<SUnit
*>> Instrs
;
1089 // Move all instructions to the first stage from the later stages.
1090 for (int Cycle
= Schedule
.getFirstCycle(); Cycle
<= Schedule
.getFinalCycle();
1092 for (int Stage
= 0, LastStage
= Schedule
.getMaxStageCount();
1093 Stage
<= LastStage
; ++Stage
) {
1094 for (SUnit
*SU
: llvm::reverse(Schedule
.getInstructions(
1095 Cycle
+ Stage
* Schedule
.getInitiationInterval()))) {
1096 Instrs
[Cycle
].push_front(SU
);
1101 for (int Cycle
= Schedule
.getFirstCycle(); Cycle
<= Schedule
.getFinalCycle();
1103 std::deque
<SUnit
*> &CycleInstrs
= Instrs
[Cycle
];
1104 CycleInstrs
= Schedule
.reorderInstructions(SSD
, CycleInstrs
);
1105 for (SUnit
*SU
: CycleInstrs
) {
1106 MachineInstr
*MI
= SU
->getInstr();
1107 OrderedInsts
.push_back(MI
);
1108 Stages
[MI
] = Schedule
.stageScheduled(SU
);
1115 // FuncUnitSorter - Comparison operator used to sort instructions by
1116 // the number of functional unit choices.
1117 struct FuncUnitSorter
{
1118 const InstrItineraryData
*InstrItins
;
1119 const MCSubtargetInfo
*STI
;
1120 DenseMap
<InstrStage::FuncUnits
, unsigned> Resources
;
1122 FuncUnitSorter(const TargetSubtargetInfo
&TSI
)
1123 : InstrItins(TSI
.getInstrItineraryData()), STI(&TSI
) {}
1125 // Compute the number of functional unit alternatives needed
1126 // at each stage, and take the minimum value. We prioritize the
1127 // instructions by the least number of choices first.
1128 unsigned minFuncUnits(const MachineInstr
*Inst
,
1129 InstrStage::FuncUnits
&F
) const {
1130 unsigned SchedClass
= Inst
->getDesc().getSchedClass();
1131 unsigned min
= UINT_MAX
;
1132 if (InstrItins
&& !InstrItins
->isEmpty()) {
1133 for (const InstrStage
&IS
:
1134 make_range(InstrItins
->beginStage(SchedClass
),
1135 InstrItins
->endStage(SchedClass
))) {
1136 InstrStage::FuncUnits funcUnits
= IS
.getUnits();
1137 unsigned numAlternatives
= llvm::popcount(funcUnits
);
1138 if (numAlternatives
< min
) {
1139 min
= numAlternatives
;
1145 if (STI
&& STI
->getSchedModel().hasInstrSchedModel()) {
1146 const MCSchedClassDesc
*SCDesc
=
1147 STI
->getSchedModel().getSchedClassDesc(SchedClass
);
1148 if (!SCDesc
->isValid())
1149 // No valid Schedule Class Desc for schedClass, should be
1150 // Pseudo/PostRAPseudo
1153 for (const MCWriteProcResEntry
&PRE
:
1154 make_range(STI
->getWriteProcResBegin(SCDesc
),
1155 STI
->getWriteProcResEnd(SCDesc
))) {
1156 if (!PRE
.ReleaseAtCycle
)
1158 const MCProcResourceDesc
*ProcResource
=
1159 STI
->getSchedModel().getProcResource(PRE
.ProcResourceIdx
);
1160 unsigned NumUnits
= ProcResource
->NumUnits
;
1161 if (NumUnits
< min
) {
1163 F
= PRE
.ProcResourceIdx
;
1168 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1171 // Compute the critical resources needed by the instruction. This
1172 // function records the functional units needed by instructions that
1173 // must use only one functional unit. We use this as a tie breaker
1174 // for computing the resource MII. The instrutions that require
1175 // the same, highly used, functional unit have high priority.
1176 void calcCriticalResources(MachineInstr
&MI
) {
1177 unsigned SchedClass
= MI
.getDesc().getSchedClass();
1178 if (InstrItins
&& !InstrItins
->isEmpty()) {
1179 for (const InstrStage
&IS
:
1180 make_range(InstrItins
->beginStage(SchedClass
),
1181 InstrItins
->endStage(SchedClass
))) {
1182 InstrStage::FuncUnits FuncUnits
= IS
.getUnits();
1183 if (llvm::popcount(FuncUnits
) == 1)
1184 Resources
[FuncUnits
]++;
1188 if (STI
&& STI
->getSchedModel().hasInstrSchedModel()) {
1189 const MCSchedClassDesc
*SCDesc
=
1190 STI
->getSchedModel().getSchedClassDesc(SchedClass
);
1191 if (!SCDesc
->isValid())
1192 // No valid Schedule Class Desc for schedClass, should be
1193 // Pseudo/PostRAPseudo
1196 for (const MCWriteProcResEntry
&PRE
:
1197 make_range(STI
->getWriteProcResBegin(SCDesc
),
1198 STI
->getWriteProcResEnd(SCDesc
))) {
1199 if (!PRE
.ReleaseAtCycle
)
1201 Resources
[PRE
.ProcResourceIdx
]++;
1205 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1208 /// Return true if IS1 has less priority than IS2.
1209 bool operator()(const MachineInstr
*IS1
, const MachineInstr
*IS2
) const {
1210 InstrStage::FuncUnits F1
= 0, F2
= 0;
1211 unsigned MFUs1
= minFuncUnits(IS1
, F1
);
1212 unsigned MFUs2
= minFuncUnits(IS2
, F2
);
1214 return Resources
.lookup(F1
) < Resources
.lookup(F2
);
1215 return MFUs1
> MFUs2
;
1219 /// Calculate the maximum register pressure of the scheduled instructions stream
1220 class HighRegisterPressureDetector
{
1221 MachineBasicBlock
*OrigMBB
;
1222 const MachineFunction
&MF
;
1223 const MachineRegisterInfo
&MRI
;
1224 const TargetRegisterInfo
*TRI
;
1226 const unsigned PSetNum
;
1228 // Indexed by PSet ID
1229 // InitSetPressure takes into account the register pressure of live-in
1230 // registers. It's not depend on how the loop is scheduled, so it's enough to
1231 // calculate them once at the beginning.
1232 std::vector
<unsigned> InitSetPressure
;
1234 // Indexed by PSet ID
1235 // Upper limit for each register pressure set
1236 std::vector
<unsigned> PressureSetLimit
;
1238 DenseMap
<MachineInstr
*, RegisterOperands
> ROMap
;
1240 using Instr2LastUsesTy
= DenseMap
<MachineInstr
*, SmallDenseSet
<Register
, 4>>;
1243 using OrderedInstsTy
= std::vector
<MachineInstr
*>;
1244 using Instr2StageTy
= DenseMap
<MachineInstr
*, unsigned>;
1247 static void dumpRegisterPressures(const std::vector
<unsigned> &Pressures
) {
1248 if (Pressures
.size() == 0) {
1252 for (unsigned P
: Pressures
) {
1253 dbgs() << Prefix
<< P
;
1260 void dumpPSet(Register Reg
) const {
1261 dbgs() << "Reg=" << printReg(Reg
, TRI
, 0, &MRI
) << " PSet=";
1262 for (auto PSetIter
= MRI
.getPressureSets(Reg
); PSetIter
.isValid();
1264 dbgs() << *PSetIter
<< ' ';
1269 void increaseRegisterPressure(std::vector
<unsigned> &Pressure
,
1270 Register Reg
) const {
1271 auto PSetIter
= MRI
.getPressureSets(Reg
);
1272 unsigned Weight
= PSetIter
.getWeight();
1273 for (; PSetIter
.isValid(); ++PSetIter
)
1274 Pressure
[*PSetIter
] += Weight
;
1277 void decreaseRegisterPressure(std::vector
<unsigned> &Pressure
,
1278 Register Reg
) const {
1279 auto PSetIter
= MRI
.getPressureSets(Reg
);
1280 unsigned Weight
= PSetIter
.getWeight();
1281 for (; PSetIter
.isValid(); ++PSetIter
) {
1282 auto &P
= Pressure
[*PSetIter
];
1283 assert(P
>= Weight
&&
1284 "register pressure must be greater than or equal weight");
1289 // Return true if Reg is fixed one, for example, stack pointer
1290 bool isFixedRegister(Register Reg
) const {
1291 return Reg
.isPhysical() && TRI
->isFixedRegister(MF
, Reg
.asMCReg());
1294 bool isDefinedInThisLoop(Register Reg
) const {
1295 return Reg
.isVirtual() && MRI
.getVRegDef(Reg
)->getParent() == OrigMBB
;
1298 // Search for live-in variables. They are factored into the register pressure
1299 // from the begining. Live-in variables used by every iteration should be
1300 // considered as alive throughout the loop. For example, the variable `c` in
1301 // following code. \code
1303 // for (int i = 0; i < n; i++)
1304 // a[i] += b[i] + c;
1306 void computeLiveIn() {
1307 DenseSet
<Register
> Used
;
1308 for (auto &MI
: *OrigMBB
) {
1309 if (MI
.isDebugInstr())
1311 for (auto &Use
: ROMap
[&MI
].Uses
) {
1312 auto Reg
= Use
.RegUnit
;
1313 // Ignore the variable that appears only on one side of phi instruction
1314 // because it's used only at the first iteration.
1315 if (MI
.isPHI() && Reg
!= getLoopPhiReg(MI
, OrigMBB
))
1317 if (isFixedRegister(Reg
))
1319 if (isDefinedInThisLoop(Reg
))
1325 for (auto LiveIn
: Used
)
1326 increaseRegisterPressure(InitSetPressure
, LiveIn
);
1329 // Calculate the upper limit of each pressure set
1330 void computePressureSetLimit(const RegisterClassInfo
&RCI
) {
1331 for (unsigned PSet
= 0; PSet
< PSetNum
; PSet
++)
1332 PressureSetLimit
[PSet
] = TRI
->getRegPressureSetLimit(MF
, PSet
);
1334 // We assume fixed registers, such as stack pointer, are already in use.
1335 // Therefore subtracting the weight of the fixed registers from the limit of
1336 // each pressure set in advance.
1337 SmallDenseSet
<Register
, 8> FixedRegs
;
1338 for (const TargetRegisterClass
*TRC
: TRI
->regclasses()) {
1339 for (const MCPhysReg Reg
: *TRC
)
1340 if (isFixedRegister(Reg
))
1341 FixedRegs
.insert(Reg
);
1345 for (auto Reg
: FixedRegs
) {
1346 dbgs() << printReg(Reg
, TRI
, 0, &MRI
) << ": [";
1347 const int *Sets
= TRI
->getRegUnitPressureSets(Reg
);
1348 for (; *Sets
!= -1; Sets
++) {
1349 dbgs() << TRI
->getRegPressureSetName(*Sets
) << ", ";
1355 for (auto Reg
: FixedRegs
) {
1356 LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg
, TRI
, 0, &MRI
)
1358 auto PSetIter
= MRI
.getPressureSets(Reg
);
1359 unsigned Weight
= PSetIter
.getWeight();
1360 for (; PSetIter
.isValid(); ++PSetIter
) {
1361 unsigned &Limit
= PressureSetLimit
[*PSetIter
];
1362 assert(Limit
>= Weight
&&
1363 "register pressure limit must be greater than or equal weight");
1365 LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter
<< " Limit=" << Limit
1366 << " (decreased by " << Weight
<< ")\n");
1371 // There are two patterns of last-use.
1372 // - by an instruction of the current iteration
1373 // - by a phi instruction of the next iteration (loop carried value)
1375 // Furthermore, following two groups of instructions are executed
1377 // - next iteration's phi instructions in i-th stage
1378 // - current iteration's instructions in i+1-th stage
1380 // This function calculates the last-use of each register while taking into
1381 // account the above two patterns.
1382 Instr2LastUsesTy
computeLastUses(const OrderedInstsTy
&OrderedInsts
,
1383 Instr2StageTy
&Stages
) const {
1384 // We treat virtual registers that are defined and used in this loop.
1385 // Following virtual register will be ignored
1387 // - defined but not used in the loop (potentially live-out)
1388 DenseSet
<Register
> TargetRegs
;
1389 const auto UpdateTargetRegs
= [this, &TargetRegs
](Register Reg
) {
1390 if (isDefinedInThisLoop(Reg
))
1391 TargetRegs
.insert(Reg
);
1393 for (MachineInstr
*MI
: OrderedInsts
) {
1395 Register Reg
= getLoopPhiReg(*MI
, OrigMBB
);
1396 UpdateTargetRegs(Reg
);
1398 for (auto &Use
: ROMap
.find(MI
)->getSecond().Uses
)
1399 UpdateTargetRegs(Use
.RegUnit
);
1403 const auto InstrScore
= [&Stages
](MachineInstr
*MI
) {
1404 return Stages
[MI
] + MI
->isPHI();
1407 DenseMap
<Register
, MachineInstr
*> LastUseMI
;
1408 for (MachineInstr
*MI
: llvm::reverse(OrderedInsts
)) {
1409 for (auto &Use
: ROMap
.find(MI
)->getSecond().Uses
) {
1410 auto Reg
= Use
.RegUnit
;
1411 if (!TargetRegs
.contains(Reg
))
1413 auto Ite
= LastUseMI
.find(Reg
);
1414 if (Ite
== LastUseMI
.end()) {
1415 LastUseMI
[Reg
] = MI
;
1417 MachineInstr
*Orig
= Ite
->second
;
1418 MachineInstr
*New
= MI
;
1419 if (InstrScore(Orig
) < InstrScore(New
))
1420 LastUseMI
[Reg
] = New
;
1425 Instr2LastUsesTy LastUses
;
1426 for (auto &Entry
: LastUseMI
)
1427 LastUses
[Entry
.second
].insert(Entry
.first
);
1431 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1432 // iterations and check the register pressure at the point where all stages
1435 // An example of unrolled loop where #Stage is 4..
1436 // Iter i+0 i+1 i+2 i+3
1437 // ------------------------
1441 // Stage 3 2 1 0 <- All stages overlap
1443 std::vector
<unsigned>
1444 computeMaxSetPressure(const OrderedInstsTy
&OrderedInsts
,
1445 Instr2StageTy
&Stages
,
1446 const unsigned StageCount
) const {
1447 using RegSetTy
= SmallDenseSet
<Register
, 16>;
1449 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1450 // manage the liveness of the registers independently by iterations.
1451 SmallVector
<RegSetTy
> LiveRegSets(StageCount
);
1453 auto CurSetPressure
= InitSetPressure
;
1454 auto MaxSetPressure
= InitSetPressure
;
1455 auto LastUses
= computeLastUses(OrderedInsts
, Stages
);
1458 dbgs() << "Ordered instructions:\n";
1459 for (MachineInstr
*MI
: OrderedInsts
) {
1460 dbgs() << "Stage " << Stages
[MI
] << ": ";
1465 const auto InsertReg
= [this, &CurSetPressure
](RegSetTy
&RegSet
,
1467 if (!Reg
.isValid() || isFixedRegister(Reg
))
1470 bool Inserted
= RegSet
.insert(Reg
).second
;
1474 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg
, TRI
, 0, &MRI
) << "\n");
1475 increaseRegisterPressure(CurSetPressure
, Reg
);
1476 LLVM_DEBUG(dumpPSet(Reg
));
1479 const auto EraseReg
= [this, &CurSetPressure
](RegSetTy
&RegSet
,
1481 if (!Reg
.isValid() || isFixedRegister(Reg
))
1485 if (!RegSet
.contains(Reg
))
1488 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg
, TRI
, 0, &MRI
) << "\n");
1490 decreaseRegisterPressure(CurSetPressure
, Reg
);
1491 LLVM_DEBUG(dumpPSet(Reg
));
1494 for (unsigned I
= 0; I
< StageCount
; I
++) {
1495 for (MachineInstr
*MI
: OrderedInsts
) {
1496 const auto Stage
= Stages
[MI
];
1500 const unsigned Iter
= I
- Stage
;
1502 for (auto &Def
: ROMap
.find(MI
)->getSecond().Defs
)
1503 InsertReg(LiveRegSets
[Iter
], Def
.RegUnit
);
1505 for (auto LastUse
: LastUses
[MI
]) {
1508 EraseReg(LiveRegSets
[Iter
- 1], LastUse
);
1510 EraseReg(LiveRegSets
[Iter
], LastUse
);
1514 for (unsigned PSet
= 0; PSet
< PSetNum
; PSet
++)
1515 MaxSetPressure
[PSet
] =
1516 std::max(MaxSetPressure
[PSet
], CurSetPressure
[PSet
]);
1519 dbgs() << "CurSetPressure=";
1520 dumpRegisterPressures(CurSetPressure
);
1521 dbgs() << " iter=" << Iter
<< " stage=" << Stage
<< ":";
1527 return MaxSetPressure
;
1531 HighRegisterPressureDetector(MachineBasicBlock
*OrigMBB
,
1532 const MachineFunction
&MF
)
1533 : OrigMBB(OrigMBB
), MF(MF
), MRI(MF
.getRegInfo()),
1534 TRI(MF
.getSubtarget().getRegisterInfo()),
1535 PSetNum(TRI
->getNumRegPressureSets()), InitSetPressure(PSetNum
, 0),
1536 PressureSetLimit(PSetNum
, 0) {}
1538 // Used to calculate register pressure, which is independent of loop
1540 void init(const RegisterClassInfo
&RCI
) {
1541 for (MachineInstr
&MI
: *OrigMBB
) {
1542 if (MI
.isDebugInstr())
1544 ROMap
[&MI
].collect(MI
, *TRI
, MRI
, false, true);
1548 computePressureSetLimit(RCI
);
1551 // Calculate the maximum register pressures of the loop and check if they
1553 bool detect(const SwingSchedulerDAG
*SSD
, SMSchedule
&Schedule
,
1554 const unsigned MaxStage
) const {
1555 assert(0 <= RegPressureMargin
&& RegPressureMargin
<= 100 &&
1556 "the percentage of the margin must be between 0 to 100");
1558 OrderedInstsTy OrderedInsts
;
1559 Instr2StageTy Stages
;
1560 computeScheduledInsts(SSD
, Schedule
, OrderedInsts
, Stages
);
1561 const auto MaxSetPressure
=
1562 computeMaxSetPressure(OrderedInsts
, Stages
, MaxStage
+ 1);
1565 dbgs() << "Dump MaxSetPressure:\n";
1566 for (unsigned I
= 0; I
< MaxSetPressure
.size(); I
++) {
1567 dbgs() << format("MaxSetPressure[%d]=%d\n", I
, MaxSetPressure
[I
]);
1572 for (unsigned PSet
= 0; PSet
< PSetNum
; PSet
++) {
1573 unsigned Limit
= PressureSetLimit
[PSet
];
1574 unsigned Margin
= Limit
* RegPressureMargin
/ 100;
1575 LLVM_DEBUG(dbgs() << "PSet=" << PSet
<< " Limit=" << Limit
1576 << " Margin=" << Margin
<< "\n");
1577 if (Limit
< MaxSetPressure
[PSet
] + Margin
) {
1580 << "Rejected the schedule because of too high register pressure\n");
1588 } // end anonymous namespace
1590 /// Calculate the resource constrained minimum initiation interval for the
1591 /// specified loop. We use the DFA to model the resources needed for
1592 /// each instruction, and we ignore dependences. A different DFA is created
1593 /// for each cycle that is required. When adding a new instruction, we attempt
1594 /// to add it to each existing DFA, until a legal space is found. If the
1595 /// instruction cannot be reserved in an existing DFA, we create a new one.
1596 unsigned SwingSchedulerDAG::calculateResMII() {
1597 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1598 ResourceManager
RM(&MF
.getSubtarget(), this);
1599 return RM
.calculateResMII();
1602 /// Calculate the recurrence-constrainted minimum initiation interval.
1603 /// Iterate over each circuit. Compute the delay(c) and distance(c)
1604 /// for each circuit. The II needs to satisfy the inequality
1605 /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1606 /// II that satisfies the inequality, and the RecMII is the maximum
1607 /// of those values.
1608 unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType
&NodeSets
) {
1609 unsigned RecMII
= 0;
1611 for (NodeSet
&Nodes
: NodeSets
) {
1615 unsigned Delay
= Nodes
.getLatency();
1616 unsigned Distance
= 1;
1618 // ii = ceil(delay / distance)
1619 unsigned CurMII
= (Delay
+ Distance
- 1) / Distance
;
1620 Nodes
.setRecMII(CurMII
);
1621 if (CurMII
> RecMII
)
1628 /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1629 /// but we do this to find the circuits, and then change them back.
1630 static void swapAntiDependences(std::vector
<SUnit
> &SUnits
) {
1631 SmallVector
<std::pair
<SUnit
*, SDep
>, 8> DepsAdded
;
1632 for (SUnit
&SU
: SUnits
) {
1633 for (SDep
&Pred
: SU
.Preds
)
1634 if (Pred
.getKind() == SDep::Anti
)
1635 DepsAdded
.push_back(std::make_pair(&SU
, Pred
));
1637 for (std::pair
<SUnit
*, SDep
> &P
: DepsAdded
) {
1638 // Remove this anti dependency and add one in the reverse direction.
1639 SUnit
*SU
= P
.first
;
1641 SUnit
*TargetSU
= D
.getSUnit();
1642 unsigned Reg
= D
.getReg();
1643 unsigned Lat
= D
.getLatency();
1645 SDep
Dep(SU
, SDep::Anti
, Reg
);
1646 Dep
.setLatency(Lat
);
1647 TargetSU
->addPred(Dep
);
1651 /// Create the adjacency structure of the nodes in the graph.
1652 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1653 SwingSchedulerDAG
*DAG
) {
1654 BitVector
Added(SUnits
.size());
1655 DenseMap
<int, int> OutputDeps
;
1656 for (int i
= 0, e
= SUnits
.size(); i
!= e
; ++i
) {
1658 // Add any successor to the adjacency matrix and exclude duplicates.
1659 for (auto &SI
: SUnits
[i
].Succs
) {
1660 // Only create a back-edge on the first and last nodes of a dependence
1661 // chain. This records any chains and adds them later.
1662 if (SI
.getKind() == SDep::Output
) {
1663 int N
= SI
.getSUnit()->NodeNum
;
1665 auto Dep
= OutputDeps
.find(BackEdge
);
1666 if (Dep
!= OutputDeps
.end()) {
1667 BackEdge
= Dep
->second
;
1668 OutputDeps
.erase(Dep
);
1670 OutputDeps
[N
] = BackEdge
;
1672 // Do not process a boundary node, an artificial node.
1673 // A back-edge is processed only if it goes to a Phi.
1674 if (SI
.getSUnit()->isBoundaryNode() || SI
.isArtificial() ||
1675 (SI
.getKind() == SDep::Anti
&& !SI
.getSUnit()->getInstr()->isPHI()))
1677 int N
= SI
.getSUnit()->NodeNum
;
1678 if (!Added
.test(N
)) {
1679 AdjK
[i
].push_back(N
);
1683 // A chain edge between a store and a load is treated as a back-edge in the
1684 // adjacency matrix.
1685 for (auto &PI
: SUnits
[i
].Preds
) {
1686 if (!SUnits
[i
].getInstr()->mayStore() ||
1687 !DAG
->isLoopCarriedDep(&SUnits
[i
], PI
, false))
1689 if (PI
.getKind() == SDep::Order
&& PI
.getSUnit()->getInstr()->mayLoad()) {
1690 int N
= PI
.getSUnit()->NodeNum
;
1691 if (!Added
.test(N
)) {
1692 AdjK
[i
].push_back(N
);
1698 // Add back-edges in the adjacency matrix for the output dependences.
1699 for (auto &OD
: OutputDeps
)
1700 if (!Added
.test(OD
.second
)) {
1701 AdjK
[OD
.first
].push_back(OD
.second
);
1702 Added
.set(OD
.second
);
1706 /// Identify an elementary circuit in the dependence graph starting at the
1708 bool SwingSchedulerDAG::Circuits::circuit(int V
, int S
, NodeSetType
&NodeSets
,
1710 SUnit
*SV
= &SUnits
[V
];
1715 for (auto W
: AdjK
[V
]) {
1716 if (NumPaths
> MaxPaths
)
1722 NodeSets
.push_back(NodeSet(Stack
.begin(), Stack
.end()));
1726 } else if (!Blocked
.test(W
)) {
1727 if (circuit(W
, S
, NodeSets
,
1728 Node2Idx
->at(W
) < Node2Idx
->at(V
) ? true : HasBackedge
))
1736 for (auto W
: AdjK
[V
]) {
1746 /// Unblock a node in the circuit finding algorithm.
1747 void SwingSchedulerDAG::Circuits::unblock(int U
) {
1749 SmallPtrSet
<SUnit
*, 4> &BU
= B
[U
];
1750 while (!BU
.empty()) {
1751 SmallPtrSet
<SUnit
*, 4>::iterator SI
= BU
.begin();
1752 assert(SI
!= BU
.end() && "Invalid B set.");
1755 if (Blocked
.test(W
->NodeNum
))
1756 unblock(W
->NodeNum
);
1760 /// Identify all the elementary circuits in the dependence graph using
1761 /// Johnson's circuit algorithm.
1762 void SwingSchedulerDAG::findCircuits(NodeSetType
&NodeSets
) {
1763 // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1764 // but we do this to find the circuits, and then change them back.
1765 swapAntiDependences(SUnits
);
1767 Circuits
Cir(SUnits
, Topo
);
1768 // Create the adjacency structure.
1769 Cir
.createAdjacencyStructure(this);
1770 for (int i
= 0, e
= SUnits
.size(); i
!= e
; ++i
) {
1772 Cir
.circuit(i
, i
, NodeSets
);
1775 // Change the dependences back so that we've created a DAG again.
1776 swapAntiDependences(SUnits
);
1779 // Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1780 // is loop-carried to the USE in next iteration. This will help pipeliner avoid
1781 // additional copies that are needed across iterations. An artificial dependence
1782 // edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1784 // PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1785 // SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1786 // PHI-------True-Dep------> USEOfPhi
1788 // The mutation creates
1789 // USEOfPHI -------Artificial-Dep---> SRCOfCopy
1791 // This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1792 // (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1793 // late to avoid additional copies across iterations. The possible scheduling
1795 // USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1797 void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs
*DAG
) {
1798 for (SUnit
&SU
: DAG
->SUnits
) {
1799 // Find the COPY/REG_SEQUENCE instruction.
1800 if (!SU
.getInstr()->isCopy() && !SU
.getInstr()->isRegSequence())
1803 // Record the loop carried PHIs.
1804 SmallVector
<SUnit
*, 4> PHISUs
;
1805 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1806 SmallVector
<SUnit
*, 4> SrcSUs
;
1808 for (auto &Dep
: SU
.Preds
) {
1809 SUnit
*TmpSU
= Dep
.getSUnit();
1810 MachineInstr
*TmpMI
= TmpSU
->getInstr();
1811 SDep::Kind DepKind
= Dep
.getKind();
1812 // Save the loop carried PHI.
1813 if (DepKind
== SDep::Anti
&& TmpMI
->isPHI())
1814 PHISUs
.push_back(TmpSU
);
1815 // Save the source of COPY/REG_SEQUENCE.
1816 // If the source has no pre-decessors, we will end up creating cycles.
1817 else if (DepKind
== SDep::Data
&& !TmpMI
->isPHI() && TmpSU
->NumPreds
> 0)
1818 SrcSUs
.push_back(TmpSU
);
1821 if (PHISUs
.size() == 0 || SrcSUs
.size() == 0)
1824 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1825 // SUnit to the container.
1826 SmallVector
<SUnit
*, 8> UseSUs
;
1827 // Do not use iterator based loop here as we are updating the container.
1828 for (size_t Index
= 0; Index
< PHISUs
.size(); ++Index
) {
1829 for (auto &Dep
: PHISUs
[Index
]->Succs
) {
1830 if (Dep
.getKind() != SDep::Data
)
1833 SUnit
*TmpSU
= Dep
.getSUnit();
1834 MachineInstr
*TmpMI
= TmpSU
->getInstr();
1835 if (TmpMI
->isPHI() || TmpMI
->isRegSequence()) {
1836 PHISUs
.push_back(TmpSU
);
1839 UseSUs
.push_back(TmpSU
);
1843 if (UseSUs
.size() == 0)
1846 SwingSchedulerDAG
*SDAG
= cast
<SwingSchedulerDAG
>(DAG
);
1847 // Add the artificial dependencies if it does not form a cycle.
1848 for (auto *I
: UseSUs
) {
1849 for (auto *Src
: SrcSUs
) {
1850 if (!SDAG
->Topo
.IsReachable(I
, Src
) && Src
!= I
) {
1851 Src
->addPred(SDep(I
, SDep::Artificial
));
1852 SDAG
->Topo
.AddPred(Src
, I
);
1859 /// Return true for DAG nodes that we ignore when computing the cost functions.
1860 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
1861 /// in the calculation of the ASAP, ALAP, etc functions.
1862 static bool ignoreDependence(const SDep
&D
, bool isPred
) {
1863 if (D
.isArtificial() || D
.getSUnit()->isBoundaryNode())
1865 return D
.getKind() == SDep::Anti
&& isPred
;
1868 /// Compute several functions need to order the nodes for scheduling.
1869 /// ASAP - Earliest time to schedule a node.
1870 /// ALAP - Latest time to schedule a node.
1871 /// MOV - Mobility function, difference between ALAP and ASAP.
1872 /// D - Depth of each node.
1873 /// H - Height of each node.
1874 void SwingSchedulerDAG::computeNodeFunctions(NodeSetType
&NodeSets
) {
1875 ScheduleInfo
.resize(SUnits
.size());
1878 for (int I
: Topo
) {
1879 const SUnit
&SU
= SUnits
[I
];
1885 // Compute ASAP and ZeroLatencyDepth.
1886 for (int I
: Topo
) {
1888 int zeroLatencyDepth
= 0;
1889 SUnit
*SU
= &SUnits
[I
];
1890 for (const SDep
&P
: SU
->Preds
) {
1891 SUnit
*pred
= P
.getSUnit();
1892 if (P
.getLatency() == 0)
1894 std::max(zeroLatencyDepth
, getZeroLatencyDepth(pred
) + 1);
1895 if (ignoreDependence(P
, true))
1897 asap
= std::max(asap
, (int)(getASAP(pred
) + P
.getLatency() -
1898 getDistance(pred
, SU
, P
) * MII
));
1900 maxASAP
= std::max(maxASAP
, asap
);
1901 ScheduleInfo
[I
].ASAP
= asap
;
1902 ScheduleInfo
[I
].ZeroLatencyDepth
= zeroLatencyDepth
;
1905 // Compute ALAP, ZeroLatencyHeight, and MOV.
1906 for (int I
: llvm::reverse(Topo
)) {
1908 int zeroLatencyHeight
= 0;
1909 SUnit
*SU
= &SUnits
[I
];
1910 for (const SDep
&S
: SU
->Succs
) {
1911 SUnit
*succ
= S
.getSUnit();
1912 if (succ
->isBoundaryNode())
1914 if (S
.getLatency() == 0)
1916 std::max(zeroLatencyHeight
, getZeroLatencyHeight(succ
) + 1);
1917 if (ignoreDependence(S
, true))
1919 alap
= std::min(alap
, (int)(getALAP(succ
) - S
.getLatency() +
1920 getDistance(SU
, succ
, S
) * MII
));
1923 ScheduleInfo
[I
].ALAP
= alap
;
1924 ScheduleInfo
[I
].ZeroLatencyHeight
= zeroLatencyHeight
;
1927 // After computing the node functions, compute the summary for each node set.
1928 for (NodeSet
&I
: NodeSets
)
1929 I
.computeNodeSetInfo(this);
1932 for (unsigned i
= 0; i
< SUnits
.size(); i
++) {
1933 dbgs() << "\tNode " << i
<< ":\n";
1934 dbgs() << "\t ASAP = " << getASAP(&SUnits
[i
]) << "\n";
1935 dbgs() << "\t ALAP = " << getALAP(&SUnits
[i
]) << "\n";
1936 dbgs() << "\t MOV = " << getMOV(&SUnits
[i
]) << "\n";
1937 dbgs() << "\t D = " << getDepth(&SUnits
[i
]) << "\n";
1938 dbgs() << "\t H = " << getHeight(&SUnits
[i
]) << "\n";
1939 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits
[i
]) << "\n";
1940 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits
[i
]) << "\n";
1945 /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1946 /// as the predecessors of the elements of NodeOrder that are not also in
1948 static bool pred_L(SetVector
<SUnit
*> &NodeOrder
,
1949 SmallSetVector
<SUnit
*, 8> &Preds
,
1950 const NodeSet
*S
= nullptr) {
1952 for (const SUnit
*SU
: NodeOrder
) {
1953 for (const SDep
&Pred
: SU
->Preds
) {
1954 if (S
&& S
->count(Pred
.getSUnit()) == 0)
1956 if (ignoreDependence(Pred
, true))
1958 if (NodeOrder
.count(Pred
.getSUnit()) == 0)
1959 Preds
.insert(Pred
.getSUnit());
1961 // Back-edges are predecessors with an anti-dependence.
1962 for (const SDep
&Succ
: SU
->Succs
) {
1963 if (Succ
.getKind() != SDep::Anti
)
1965 if (S
&& S
->count(Succ
.getSUnit()) == 0)
1967 if (NodeOrder
.count(Succ
.getSUnit()) == 0)
1968 Preds
.insert(Succ
.getSUnit());
1971 return !Preds
.empty();
1974 /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1975 /// as the successors of the elements of NodeOrder that are not also in
1977 static bool succ_L(SetVector
<SUnit
*> &NodeOrder
,
1978 SmallSetVector
<SUnit
*, 8> &Succs
,
1979 const NodeSet
*S
= nullptr) {
1981 for (const SUnit
*SU
: NodeOrder
) {
1982 for (const SDep
&Succ
: SU
->Succs
) {
1983 if (S
&& S
->count(Succ
.getSUnit()) == 0)
1985 if (ignoreDependence(Succ
, false))
1987 if (NodeOrder
.count(Succ
.getSUnit()) == 0)
1988 Succs
.insert(Succ
.getSUnit());
1990 for (const SDep
&Pred
: SU
->Preds
) {
1991 if (Pred
.getKind() != SDep::Anti
)
1993 if (S
&& S
->count(Pred
.getSUnit()) == 0)
1995 if (NodeOrder
.count(Pred
.getSUnit()) == 0)
1996 Succs
.insert(Pred
.getSUnit());
1999 return !Succs
.empty();
2002 /// Return true if there is a path from the specified node to any of the nodes
2003 /// in DestNodes. Keep track and return the nodes in any path.
2004 static bool computePath(SUnit
*Cur
, SetVector
<SUnit
*> &Path
,
2005 SetVector
<SUnit
*> &DestNodes
,
2006 SetVector
<SUnit
*> &Exclude
,
2007 SmallPtrSet
<SUnit
*, 8> &Visited
) {
2008 if (Cur
->isBoundaryNode())
2010 if (Exclude
.contains(Cur
))
2012 if (DestNodes
.contains(Cur
))
2014 if (!Visited
.insert(Cur
).second
)
2015 return Path
.contains(Cur
);
2016 bool FoundPath
= false;
2017 for (auto &SI
: Cur
->Succs
)
2018 if (!ignoreDependence(SI
, false))
2020 computePath(SI
.getSUnit(), Path
, DestNodes
, Exclude
, Visited
);
2021 for (auto &PI
: Cur
->Preds
)
2022 if (PI
.getKind() == SDep::Anti
)
2024 computePath(PI
.getSUnit(), Path
, DestNodes
, Exclude
, Visited
);
2030 /// Compute the live-out registers for the instructions in a node-set.
2031 /// The live-out registers are those that are defined in the node-set,
2032 /// but not used. Except for use operands of Phis.
2033 static void computeLiveOuts(MachineFunction
&MF
, RegPressureTracker
&RPTracker
,
2035 const TargetRegisterInfo
*TRI
= MF
.getSubtarget().getRegisterInfo();
2036 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
2037 SmallVector
<RegisterMaskPair
, 8> LiveOutRegs
;
2038 SmallSet
<unsigned, 4> Uses
;
2039 for (SUnit
*SU
: NS
) {
2040 const MachineInstr
*MI
= SU
->getInstr();
2043 for (const MachineOperand
&MO
: MI
->all_uses()) {
2044 Register Reg
= MO
.getReg();
2045 if (Reg
.isVirtual())
2047 else if (MRI
.isAllocatable(Reg
))
2048 for (MCRegUnit Unit
: TRI
->regunits(Reg
.asMCReg()))
2052 for (SUnit
*SU
: NS
)
2053 for (const MachineOperand
&MO
: SU
->getInstr()->all_defs())
2055 Register Reg
= MO
.getReg();
2056 if (Reg
.isVirtual()) {
2057 if (!Uses
.count(Reg
))
2058 LiveOutRegs
.push_back(RegisterMaskPair(Reg
,
2059 LaneBitmask::getNone()));
2060 } else if (MRI
.isAllocatable(Reg
)) {
2061 for (MCRegUnit Unit
: TRI
->regunits(Reg
.asMCReg()))
2062 if (!Uses
.count(Unit
))
2063 LiveOutRegs
.push_back(
2064 RegisterMaskPair(Unit
, LaneBitmask::getNone()));
2067 RPTracker
.addLiveRegs(LiveOutRegs
);
2070 /// A heuristic to filter nodes in recurrent node-sets if the register
2071 /// pressure of a set is too high.
2072 void SwingSchedulerDAG::registerPressureFilter(NodeSetType
&NodeSets
) {
2073 for (auto &NS
: NodeSets
) {
2074 // Skip small node-sets since they won't cause register pressure problems.
2077 IntervalPressure RecRegPressure
;
2078 RegPressureTracker
RecRPTracker(RecRegPressure
);
2079 RecRPTracker
.init(&MF
, &RegClassInfo
, &LIS
, BB
, BB
->end(), false, true);
2080 computeLiveOuts(MF
, RecRPTracker
, NS
);
2081 RecRPTracker
.closeBottom();
2083 std::vector
<SUnit
*> SUnits(NS
.begin(), NS
.end());
2084 llvm::sort(SUnits
, [](const SUnit
*A
, const SUnit
*B
) {
2085 return A
->NodeNum
> B
->NodeNum
;
2088 for (auto &SU
: SUnits
) {
2089 // Since we're computing the register pressure for a subset of the
2090 // instructions in a block, we need to set the tracker for each
2091 // instruction in the node-set. The tracker is set to the instruction
2092 // just after the one we're interested in.
2093 MachineBasicBlock::const_iterator CurInstI
= SU
->getInstr();
2094 RecRPTracker
.setPos(std::next(CurInstI
));
2096 RegPressureDelta RPDelta
;
2097 ArrayRef
<PressureChange
> CriticalPSets
;
2098 RecRPTracker
.getMaxUpwardPressureDelta(SU
->getInstr(), nullptr, RPDelta
,
2100 RecRegPressure
.MaxSetPressure
);
2101 if (RPDelta
.Excess
.isValid()) {
2103 dbgs() << "Excess register pressure: SU(" << SU
->NodeNum
<< ") "
2104 << TRI
->getRegPressureSetName(RPDelta
.Excess
.getPSet())
2105 << ":" << RPDelta
.Excess
.getUnitInc() << "\n");
2106 NS
.setExceedPressure(SU
);
2109 RecRPTracker
.recede();
2114 /// A heuristic to colocate node sets that have the same set of
2116 void SwingSchedulerDAG::colocateNodeSets(NodeSetType
&NodeSets
) {
2117 unsigned Colocate
= 0;
2118 for (int i
= 0, e
= NodeSets
.size(); i
< e
; ++i
) {
2119 NodeSet
&N1
= NodeSets
[i
];
2120 SmallSetVector
<SUnit
*, 8> S1
;
2121 if (N1
.empty() || !succ_L(N1
, S1
))
2123 for (int j
= i
+ 1; j
< e
; ++j
) {
2124 NodeSet
&N2
= NodeSets
[j
];
2125 if (N1
.compareRecMII(N2
) != 0)
2127 SmallSetVector
<SUnit
*, 8> S2
;
2128 if (N2
.empty() || !succ_L(N2
, S2
))
2130 if (llvm::set_is_subset(S1
, S2
) && S1
.size() == S2
.size()) {
2131 N1
.setColocate(++Colocate
);
2132 N2
.setColocate(Colocate
);
2139 /// Check if the existing node-sets are profitable. If not, then ignore the
2140 /// recurrent node-sets, and attempt to schedule all nodes together. This is
2141 /// a heuristic. If the MII is large and all the recurrent node-sets are small,
2142 /// then it's best to try to schedule all instructions together instead of
2143 /// starting with the recurrent node-sets.
2144 void SwingSchedulerDAG::checkNodeSets(NodeSetType
&NodeSets
) {
2145 // Look for loops with a large MII.
2148 // Check if the node-set contains only a simple add recurrence.
2149 for (auto &NS
: NodeSets
) {
2150 if (NS
.getRecMII() > 2)
2152 if (NS
.getMaxDepth() > MII
)
2156 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2159 /// Add the nodes that do not belong to a recurrence set into groups
2160 /// based upon connected components.
2161 void SwingSchedulerDAG::groupRemainingNodes(NodeSetType
&NodeSets
) {
2162 SetVector
<SUnit
*> NodesAdded
;
2163 SmallPtrSet
<SUnit
*, 8> Visited
;
2164 // Add the nodes that are on a path between the previous node sets and
2165 // the current node set.
2166 for (NodeSet
&I
: NodeSets
) {
2167 SmallSetVector
<SUnit
*, 8> N
;
2168 // Add the nodes from the current node set to the previous node set.
2170 SetVector
<SUnit
*> Path
;
2171 for (SUnit
*NI
: N
) {
2173 computePath(NI
, Path
, NodesAdded
, I
, Visited
);
2176 I
.insert(Path
.begin(), Path
.end());
2178 // Add the nodes from the previous node set to the current node set.
2180 if (succ_L(NodesAdded
, N
)) {
2181 SetVector
<SUnit
*> Path
;
2182 for (SUnit
*NI
: N
) {
2184 computePath(NI
, Path
, I
, NodesAdded
, Visited
);
2187 I
.insert(Path
.begin(), Path
.end());
2189 NodesAdded
.insert(I
.begin(), I
.end());
2192 // Create a new node set with the connected nodes of any successor of a node
2193 // in a recurrent set.
2195 SmallSetVector
<SUnit
*, 8> N
;
2196 if (succ_L(NodesAdded
, N
))
2198 addConnectedNodes(I
, NewSet
, NodesAdded
);
2199 if (!NewSet
.empty())
2200 NodeSets
.push_back(NewSet
);
2202 // Create a new node set with the connected nodes of any predecessor of a node
2203 // in a recurrent set.
2205 if (pred_L(NodesAdded
, N
))
2207 addConnectedNodes(I
, NewSet
, NodesAdded
);
2208 if (!NewSet
.empty())
2209 NodeSets
.push_back(NewSet
);
2211 // Create new nodes sets with the connected nodes any remaining node that
2212 // has no predecessor.
2213 for (SUnit
&SU
: SUnits
) {
2214 if (NodesAdded
.count(&SU
) == 0) {
2216 addConnectedNodes(&SU
, NewSet
, NodesAdded
);
2217 if (!NewSet
.empty())
2218 NodeSets
.push_back(NewSet
);
2223 /// Add the node to the set, and add all of its connected nodes to the set.
2224 void SwingSchedulerDAG::addConnectedNodes(SUnit
*SU
, NodeSet
&NewSet
,
2225 SetVector
<SUnit
*> &NodesAdded
) {
2227 NodesAdded
.insert(SU
);
2228 for (auto &SI
: SU
->Succs
) {
2229 SUnit
*Successor
= SI
.getSUnit();
2230 if (!SI
.isArtificial() && !Successor
->isBoundaryNode() &&
2231 NodesAdded
.count(Successor
) == 0)
2232 addConnectedNodes(Successor
, NewSet
, NodesAdded
);
2234 for (auto &PI
: SU
->Preds
) {
2235 SUnit
*Predecessor
= PI
.getSUnit();
2236 if (!PI
.isArtificial() && NodesAdded
.count(Predecessor
) == 0)
2237 addConnectedNodes(Predecessor
, NewSet
, NodesAdded
);
2241 /// Return true if Set1 contains elements in Set2. The elements in common
2242 /// are returned in a different container.
2243 static bool isIntersect(SmallSetVector
<SUnit
*, 8> &Set1
, const NodeSet
&Set2
,
2244 SmallSetVector
<SUnit
*, 8> &Result
) {
2246 for (SUnit
*SU
: Set1
) {
2247 if (Set2
.count(SU
) != 0)
2250 return !Result
.empty();
2253 /// Merge the recurrence node sets that have the same initial node.
2254 void SwingSchedulerDAG::fuseRecs(NodeSetType
&NodeSets
) {
2255 for (NodeSetType::iterator I
= NodeSets
.begin(), E
= NodeSets
.end(); I
!= E
;
2258 for (NodeSetType::iterator J
= I
+ 1; J
!= E
;) {
2260 if (NI
.getNode(0)->NodeNum
== NJ
.getNode(0)->NodeNum
) {
2261 if (NJ
.compareRecMII(NI
) > 0)
2262 NI
.setRecMII(NJ
.getRecMII());
2263 for (SUnit
*SU
: *J
)
2274 /// Remove nodes that have been scheduled in previous NodeSets.
2275 void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType
&NodeSets
) {
2276 for (NodeSetType::iterator I
= NodeSets
.begin(), E
= NodeSets
.end(); I
!= E
;
2278 for (NodeSetType::iterator J
= I
+ 1; J
!= E
;) {
2279 J
->remove_if([&](SUnit
*SUJ
) { return I
->count(SUJ
); });
2290 /// Compute an ordered list of the dependence graph nodes, which
2291 /// indicates the order that the nodes will be scheduled. This is a
2292 /// two-level algorithm. First, a partial order is created, which
2293 /// consists of a list of sets ordered from highest to lowest priority.
2294 void SwingSchedulerDAG::computeNodeOrder(NodeSetType
&NodeSets
) {
2295 SmallSetVector
<SUnit
*, 8> R
;
2298 for (auto &Nodes
: NodeSets
) {
2299 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes
.size() << "\n");
2301 SmallSetVector
<SUnit
*, 8> N
;
2302 if (pred_L(NodeOrder
, N
) && llvm::set_is_subset(N
, Nodes
)) {
2303 R
.insert(N
.begin(), N
.end());
2305 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2306 } else if (succ_L(NodeOrder
, N
) && llvm::set_is_subset(N
, Nodes
)) {
2307 R
.insert(N
.begin(), N
.end());
2309 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2310 } else if (isIntersect(N
, Nodes
, R
)) {
2311 // If some of the successors are in the existing node-set, then use the
2312 // top-down ordering.
2314 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2315 } else if (NodeSets
.size() == 1) {
2316 for (const auto &N
: Nodes
)
2317 if (N
->Succs
.size() == 0)
2320 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2322 // Find the node with the highest ASAP.
2323 SUnit
*maxASAP
= nullptr;
2324 for (SUnit
*SU
: Nodes
) {
2325 if (maxASAP
== nullptr || getASAP(SU
) > getASAP(maxASAP
) ||
2326 (getASAP(SU
) == getASAP(maxASAP
) && SU
->NodeNum
> maxASAP
->NodeNum
))
2331 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2334 while (!R
.empty()) {
2335 if (Order
== TopDown
) {
2336 // Choose the node with the maximum height. If more than one, choose
2337 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2338 // choose the node with the lowest MOV.
2339 while (!R
.empty()) {
2340 SUnit
*maxHeight
= nullptr;
2341 for (SUnit
*I
: R
) {
2342 if (maxHeight
== nullptr || getHeight(I
) > getHeight(maxHeight
))
2344 else if (getHeight(I
) == getHeight(maxHeight
) &&
2345 getZeroLatencyHeight(I
) > getZeroLatencyHeight(maxHeight
))
2347 else if (getHeight(I
) == getHeight(maxHeight
) &&
2348 getZeroLatencyHeight(I
) ==
2349 getZeroLatencyHeight(maxHeight
) &&
2350 getMOV(I
) < getMOV(maxHeight
))
2353 NodeOrder
.insert(maxHeight
);
2354 LLVM_DEBUG(dbgs() << maxHeight
->NodeNum
<< " ");
2355 R
.remove(maxHeight
);
2356 for (const auto &I
: maxHeight
->Succs
) {
2357 if (Nodes
.count(I
.getSUnit()) == 0)
2359 if (NodeOrder
.contains(I
.getSUnit()))
2361 if (ignoreDependence(I
, false))
2363 R
.insert(I
.getSUnit());
2365 // Back-edges are predecessors with an anti-dependence.
2366 for (const auto &I
: maxHeight
->Preds
) {
2367 if (I
.getKind() != SDep::Anti
)
2369 if (Nodes
.count(I
.getSUnit()) == 0)
2371 if (NodeOrder
.contains(I
.getSUnit()))
2373 R
.insert(I
.getSUnit());
2377 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2378 SmallSetVector
<SUnit
*, 8> N
;
2379 if (pred_L(NodeOrder
, N
, &Nodes
))
2380 R
.insert(N
.begin(), N
.end());
2382 // Choose the node with the maximum depth. If more than one, choose
2383 // the node with the maximum ZeroLatencyDepth. If still more than one,
2384 // choose the node with the lowest MOV.
2385 while (!R
.empty()) {
2386 SUnit
*maxDepth
= nullptr;
2387 for (SUnit
*I
: R
) {
2388 if (maxDepth
== nullptr || getDepth(I
) > getDepth(maxDepth
))
2390 else if (getDepth(I
) == getDepth(maxDepth
) &&
2391 getZeroLatencyDepth(I
) > getZeroLatencyDepth(maxDepth
))
2393 else if (getDepth(I
) == getDepth(maxDepth
) &&
2394 getZeroLatencyDepth(I
) == getZeroLatencyDepth(maxDepth
) &&
2395 getMOV(I
) < getMOV(maxDepth
))
2398 NodeOrder
.insert(maxDepth
);
2399 LLVM_DEBUG(dbgs() << maxDepth
->NodeNum
<< " ");
2401 if (Nodes
.isExceedSU(maxDepth
)) {
2404 R
.insert(Nodes
.getNode(0));
2407 for (const auto &I
: maxDepth
->Preds
) {
2408 if (Nodes
.count(I
.getSUnit()) == 0)
2410 if (NodeOrder
.contains(I
.getSUnit()))
2412 R
.insert(I
.getSUnit());
2414 // Back-edges are predecessors with an anti-dependence.
2415 for (const auto &I
: maxDepth
->Succs
) {
2416 if (I
.getKind() != SDep::Anti
)
2418 if (Nodes
.count(I
.getSUnit()) == 0)
2420 if (NodeOrder
.contains(I
.getSUnit()))
2422 R
.insert(I
.getSUnit());
2426 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2427 SmallSetVector
<SUnit
*, 8> N
;
2428 if (succ_L(NodeOrder
, N
, &Nodes
))
2429 R
.insert(N
.begin(), N
.end());
2432 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2436 dbgs() << "Node order: ";
2437 for (SUnit
*I
: NodeOrder
)
2438 dbgs() << " " << I
->NodeNum
<< " ";
2443 /// Process the nodes in the computed order and create the pipelined schedule
2444 /// of the instructions, if possible. Return true if a schedule is found.
2445 bool SwingSchedulerDAG::schedulePipeline(SMSchedule
&Schedule
) {
2447 if (NodeOrder
.empty()){
2448 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2452 bool scheduleFound
= false;
2453 std::unique_ptr
<HighRegisterPressureDetector
> HRPDetector
;
2454 if (LimitRegPressure
) {
2456 std::make_unique
<HighRegisterPressureDetector
>(Loop
.getHeader(), MF
);
2457 HRPDetector
->init(RegClassInfo
);
2459 // Keep increasing II until a valid schedule is found.
2460 for (unsigned II
= MII
; II
<= MAX_II
&& !scheduleFound
; ++II
) {
2462 Schedule
.setInitiationInterval(II
);
2463 LLVM_DEBUG(dbgs() << "Try to schedule with " << II
<< "\n");
2465 SetVector
<SUnit
*>::iterator NI
= NodeOrder
.begin();
2466 SetVector
<SUnit
*>::iterator NE
= NodeOrder
.end();
2470 // Compute the schedule time for the instruction, which is based
2471 // upon the scheduled time for any predecessors/successors.
2472 int EarlyStart
= INT_MIN
;
2473 int LateStart
= INT_MAX
;
2474 Schedule
.computeStart(SU
, &EarlyStart
, &LateStart
, II
, this);
2477 dbgs() << "Inst (" << SU
->NodeNum
<< ") ";
2478 SU
->getInstr()->dump();
2482 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart
, LateStart
));
2484 if (EarlyStart
> LateStart
)
2485 scheduleFound
= false;
2486 else if (EarlyStart
!= INT_MIN
&& LateStart
== INT_MAX
)
2488 Schedule
.insert(SU
, EarlyStart
, EarlyStart
+ (int)II
- 1, II
);
2489 else if (EarlyStart
== INT_MIN
&& LateStart
!= INT_MAX
)
2491 Schedule
.insert(SU
, LateStart
, LateStart
- (int)II
+ 1, II
);
2492 else if (EarlyStart
!= INT_MIN
&& LateStart
!= INT_MAX
) {
2493 LateStart
= std::min(LateStart
, EarlyStart
+ (int)II
- 1);
2494 // When scheduling a Phi it is better to start at the late cycle and
2495 // go backwards. The default order may insert the Phi too far away
2496 // from its first dependence.
2497 // Also, do backward search when all scheduled predecessors are
2498 // loop-carried output/order dependencies. Empirically, there are also
2499 // cases where scheduling becomes possible with backward search.
2500 if (SU
->getInstr()->isPHI() ||
2501 Schedule
.onlyHasLoopCarriedOutputOrOrderPreds(SU
, this))
2502 scheduleFound
= Schedule
.insert(SU
, LateStart
, EarlyStart
, II
);
2504 scheduleFound
= Schedule
.insert(SU
, EarlyStart
, LateStart
, II
);
2506 int FirstCycle
= Schedule
.getFirstCycle();
2507 scheduleFound
= Schedule
.insert(SU
, FirstCycle
+ getASAP(SU
),
2508 FirstCycle
+ getASAP(SU
) + II
- 1, II
);
2511 // Even if we find a schedule, make sure the schedule doesn't exceed the
2512 // allowable number of stages. We keep trying if this happens.
2514 if (SwpMaxStages
> -1 &&
2515 Schedule
.getMaxStageCount() > (unsigned)SwpMaxStages
)
2516 scheduleFound
= false;
2520 dbgs() << "\tCan't schedule\n";
2522 } while (++NI
!= NE
&& scheduleFound
);
2524 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2527 Schedule
.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo
);
2529 // If a schedule is found, check if it is a valid schedule too.
2531 scheduleFound
= Schedule
.isValidSchedule(this);
2533 // If a schedule was found and the option is enabled, check if the schedule
2534 // might generate additional register spills/fills.
2535 if (scheduleFound
&& LimitRegPressure
)
2537 !HRPDetector
->detect(this, Schedule
, Schedule
.getMaxStageCount());
2540 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2541 << " (II=" << Schedule
.getInitiationInterval()
2544 if (scheduleFound
) {
2545 scheduleFound
= LoopPipelinerInfo
->shouldUseSchedule(*this, Schedule
);
2547 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2550 if (scheduleFound
) {
2551 Schedule
.finalizeSchedule(this);
2552 Pass
.ORE
->emit([&]() {
2553 return MachineOptimizationRemarkAnalysis(
2554 DEBUG_TYPE
, "schedule", Loop
.getStartLoc(), Loop
.getHeader())
2555 << "Schedule found with Initiation Interval: "
2556 << ore::NV("II", Schedule
.getInitiationInterval())
2557 << ", MaxStageCount: "
2558 << ore::NV("MaxStageCount", Schedule
.getMaxStageCount());
2563 return scheduleFound
&& Schedule
.getMaxStageCount() > 0;
2566 /// Return true if we can compute the amount the instruction changes
2567 /// during each iteration. Set Delta to the amount of the change.
2568 bool SwingSchedulerDAG::computeDelta(MachineInstr
&MI
, unsigned &Delta
) {
2569 const TargetRegisterInfo
*TRI
= MF
.getSubtarget().getRegisterInfo();
2570 const MachineOperand
*BaseOp
;
2572 bool OffsetIsScalable
;
2573 if (!TII
->getMemOperandWithOffset(MI
, BaseOp
, Offset
, OffsetIsScalable
, TRI
))
2576 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2577 if (OffsetIsScalable
)
2580 if (!BaseOp
->isReg())
2583 Register BaseReg
= BaseOp
->getReg();
2585 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
2586 // Check if there is a Phi. If so, get the definition in the loop.
2587 MachineInstr
*BaseDef
= MRI
.getVRegDef(BaseReg
);
2588 if (BaseDef
&& BaseDef
->isPHI()) {
2589 BaseReg
= getLoopPhiReg(*BaseDef
, MI
.getParent());
2590 BaseDef
= MRI
.getVRegDef(BaseReg
);
2596 if (!TII
->getIncrementValue(*BaseDef
, D
) && D
>= 0)
2603 /// Check if we can change the instruction to use an offset value from the
2604 /// previous iteration. If so, return true and set the base and offset values
2605 /// so that we can rewrite the load, if necessary.
2606 /// v1 = Phi(v0, v3)
2608 /// v3 = post_store v1, 4, x
2609 /// This function enables the load to be rewritten as v2 = load v3, 4.
2610 bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr
*MI
,
2612 unsigned &OffsetPos
,
2615 // Get the load instruction.
2616 if (TII
->isPostIncrement(*MI
))
2618 unsigned BasePosLd
, OffsetPosLd
;
2619 if (!TII
->getBaseAndOffsetPosition(*MI
, BasePosLd
, OffsetPosLd
))
2621 Register BaseReg
= MI
->getOperand(BasePosLd
).getReg();
2623 // Look for the Phi instruction.
2624 MachineRegisterInfo
&MRI
= MI
->getMF()->getRegInfo();
2625 MachineInstr
*Phi
= MRI
.getVRegDef(BaseReg
);
2626 if (!Phi
|| !Phi
->isPHI())
2628 // Get the register defined in the loop block.
2629 unsigned PrevReg
= getLoopPhiReg(*Phi
, MI
->getParent());
2633 // Check for the post-increment load/store instruction.
2634 MachineInstr
*PrevDef
= MRI
.getVRegDef(PrevReg
);
2635 if (!PrevDef
|| PrevDef
== MI
)
2638 if (!TII
->isPostIncrement(*PrevDef
))
2641 unsigned BasePos1
= 0, OffsetPos1
= 0;
2642 if (!TII
->getBaseAndOffsetPosition(*PrevDef
, BasePos1
, OffsetPos1
))
2645 // Make sure that the instructions do not access the same memory location in
2646 // the next iteration.
2647 int64_t LoadOffset
= MI
->getOperand(OffsetPosLd
).getImm();
2648 int64_t StoreOffset
= PrevDef
->getOperand(OffsetPos1
).getImm();
2649 MachineInstr
*NewMI
= MF
.CloneMachineInstr(MI
);
2650 NewMI
->getOperand(OffsetPosLd
).setImm(LoadOffset
+ StoreOffset
);
2651 bool Disjoint
= TII
->areMemAccessesTriviallyDisjoint(*NewMI
, *PrevDef
);
2652 MF
.deleteMachineInstr(NewMI
);
2656 // Set the return value once we determine that we return true.
2657 BasePos
= BasePosLd
;
2658 OffsetPos
= OffsetPosLd
;
2660 Offset
= StoreOffset
;
2664 /// Apply changes to the instruction if needed. The changes are need
2665 /// to improve the scheduling and depend up on the final schedule.
2666 void SwingSchedulerDAG::applyInstrChange(MachineInstr
*MI
,
2667 SMSchedule
&Schedule
) {
2668 SUnit
*SU
= getSUnit(MI
);
2669 DenseMap
<SUnit
*, std::pair
<unsigned, int64_t>>::iterator It
=
2670 InstrChanges
.find(SU
);
2671 if (It
!= InstrChanges
.end()) {
2672 std::pair
<unsigned, int64_t> RegAndOffset
= It
->second
;
2673 unsigned BasePos
, OffsetPos
;
2674 if (!TII
->getBaseAndOffsetPosition(*MI
, BasePos
, OffsetPos
))
2676 Register BaseReg
= MI
->getOperand(BasePos
).getReg();
2677 MachineInstr
*LoopDef
= findDefInLoop(BaseReg
);
2678 int DefStageNum
= Schedule
.stageScheduled(getSUnit(LoopDef
));
2679 int DefCycleNum
= Schedule
.cycleScheduled(getSUnit(LoopDef
));
2680 int BaseStageNum
= Schedule
.stageScheduled(SU
);
2681 int BaseCycleNum
= Schedule
.cycleScheduled(SU
);
2682 if (BaseStageNum
< DefStageNum
) {
2683 MachineInstr
*NewMI
= MF
.CloneMachineInstr(MI
);
2684 int OffsetDiff
= DefStageNum
- BaseStageNum
;
2685 if (DefCycleNum
< BaseCycleNum
) {
2686 NewMI
->getOperand(BasePos
).setReg(RegAndOffset
.first
);
2691 MI
->getOperand(OffsetPos
).getImm() + RegAndOffset
.second
* OffsetDiff
;
2692 NewMI
->getOperand(OffsetPos
).setImm(NewOffset
);
2693 SU
->setInstr(NewMI
);
2694 MISUnitMap
[NewMI
] = SU
;
2700 /// Return the instruction in the loop that defines the register.
2701 /// If the definition is a Phi, then follow the Phi operand to
2702 /// the instruction in the loop.
2703 MachineInstr
*SwingSchedulerDAG::findDefInLoop(Register Reg
) {
2704 SmallPtrSet
<MachineInstr
*, 8> Visited
;
2705 MachineInstr
*Def
= MRI
.getVRegDef(Reg
);
2706 while (Def
->isPHI()) {
2707 if (!Visited
.insert(Def
).second
)
2709 for (unsigned i
= 1, e
= Def
->getNumOperands(); i
< e
; i
+= 2)
2710 if (Def
->getOperand(i
+ 1).getMBB() == BB
) {
2711 Def
= MRI
.getVRegDef(Def
->getOperand(i
).getReg());
2718 /// Return true for an order or output dependence that is loop carried
2719 /// potentially. A dependence is loop carried if the destination defines a value
2720 /// that may be used or defined by the source in a subsequent iteration.
2721 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit
*Source
, const SDep
&Dep
,
2723 if ((Dep
.getKind() != SDep::Order
&& Dep
.getKind() != SDep::Output
) ||
2724 Dep
.isArtificial() || Dep
.getSUnit()->isBoundaryNode())
2727 if (!SwpPruneLoopCarried
)
2730 if (Dep
.getKind() == SDep::Output
)
2733 MachineInstr
*SI
= Source
->getInstr();
2734 MachineInstr
*DI
= Dep
.getSUnit()->getInstr();
2737 assert(SI
!= nullptr && DI
!= nullptr && "Expecting SUnit with an MI.");
2739 // Assume ordered loads and stores may have a loop carried dependence.
2740 if (SI
->hasUnmodeledSideEffects() || DI
->hasUnmodeledSideEffects() ||
2741 SI
->mayRaiseFPException() || DI
->mayRaiseFPException() ||
2742 SI
->hasOrderedMemoryRef() || DI
->hasOrderedMemoryRef())
2745 if (!DI
->mayLoadOrStore() || !SI
->mayLoadOrStore())
2748 // The conservative assumption is that a dependence between memory operations
2749 // may be loop carried. The following code checks when it can be proved that
2750 // there is no loop carried dependence.
2751 unsigned DeltaS
, DeltaD
;
2752 if (!computeDelta(*SI
, DeltaS
) || !computeDelta(*DI
, DeltaD
))
2755 const MachineOperand
*BaseOpS
, *BaseOpD
;
2756 int64_t OffsetS
, OffsetD
;
2757 bool OffsetSIsScalable
, OffsetDIsScalable
;
2758 const TargetRegisterInfo
*TRI
= MF
.getSubtarget().getRegisterInfo();
2759 if (!TII
->getMemOperandWithOffset(*SI
, BaseOpS
, OffsetS
, OffsetSIsScalable
,
2761 !TII
->getMemOperandWithOffset(*DI
, BaseOpD
, OffsetD
, OffsetDIsScalable
,
2765 assert(!OffsetSIsScalable
&& !OffsetDIsScalable
&&
2766 "Expected offsets to be byte offsets");
2768 MachineInstr
*DefS
= MRI
.getVRegDef(BaseOpS
->getReg());
2769 MachineInstr
*DefD
= MRI
.getVRegDef(BaseOpD
->getReg());
2770 if (!DefS
|| !DefD
|| !DefS
->isPHI() || !DefD
->isPHI())
2773 unsigned InitValS
= 0;
2774 unsigned LoopValS
= 0;
2775 unsigned InitValD
= 0;
2776 unsigned LoopValD
= 0;
2777 getPhiRegs(*DefS
, BB
, InitValS
, LoopValS
);
2778 getPhiRegs(*DefD
, BB
, InitValD
, LoopValD
);
2779 MachineInstr
*InitDefS
= MRI
.getVRegDef(InitValS
);
2780 MachineInstr
*InitDefD
= MRI
.getVRegDef(InitValD
);
2782 if (!InitDefS
->isIdenticalTo(*InitDefD
))
2785 // Check that the base register is incremented by a constant value for each
2787 MachineInstr
*LoopDefS
= MRI
.getVRegDef(LoopValS
);
2789 if (!LoopDefS
|| !TII
->getIncrementValue(*LoopDefS
, D
))
2792 LocationSize AccessSizeS
= (*SI
->memoperands_begin())->getSize();
2793 LocationSize AccessSizeD
= (*DI
->memoperands_begin())->getSize();
2795 // This is the main test, which checks the offset values and the loop
2796 // increment value to determine if the accesses may be loop carried.
2797 if (!AccessSizeS
.hasValue() || !AccessSizeD
.hasValue())
2800 if (DeltaS
!= DeltaD
|| DeltaS
< AccessSizeS
.getValue() ||
2801 DeltaD
< AccessSizeD
.getValue())
2804 return (OffsetS
+ (int64_t)AccessSizeS
.getValue() <
2805 OffsetD
+ (int64_t)AccessSizeD
.getValue());
2808 void SwingSchedulerDAG::postProcessDAG() {
2809 for (auto &M
: Mutations
)
2813 /// Try to schedule the node at the specified StartCycle and continue
2814 /// until the node is schedule or the EndCycle is reached. This function
2815 /// returns true if the node is scheduled. This routine may search either
2816 /// forward or backward for a place to insert the instruction based upon
2817 /// the relative values of StartCycle and EndCycle.
2818 bool SMSchedule::insert(SUnit
*SU
, int StartCycle
, int EndCycle
, int II
) {
2819 bool forward
= true;
2821 dbgs() << "Trying to insert node between " << StartCycle
<< " and "
2822 << EndCycle
<< " II: " << II
<< "\n";
2824 if (StartCycle
> EndCycle
)
2827 // The terminating condition depends on the direction.
2828 int termCycle
= forward
? EndCycle
+ 1 : EndCycle
- 1;
2829 for (int curCycle
= StartCycle
; curCycle
!= termCycle
;
2830 forward
? ++curCycle
: --curCycle
) {
2832 if (ST
.getInstrInfo()->isZeroCost(SU
->getInstr()->getOpcode()) ||
2833 ProcItinResources
.canReserveResources(*SU
, curCycle
)) {
2835 dbgs() << "\tinsert at cycle " << curCycle
<< " ";
2836 SU
->getInstr()->dump();
2839 if (!ST
.getInstrInfo()->isZeroCost(SU
->getInstr()->getOpcode()))
2840 ProcItinResources
.reserveResources(*SU
, curCycle
);
2841 ScheduledInstrs
[curCycle
].push_back(SU
);
2842 InstrToCycle
.insert(std::make_pair(SU
, curCycle
));
2843 if (curCycle
> LastCycle
)
2844 LastCycle
= curCycle
;
2845 if (curCycle
< FirstCycle
)
2846 FirstCycle
= curCycle
;
2850 dbgs() << "\tfailed to insert at cycle " << curCycle
<< " ";
2851 SU
->getInstr()->dump();
2857 // Return the cycle of the earliest scheduled instruction in the chain.
2858 int SMSchedule::earliestCycleInChain(const SDep
&Dep
) {
2859 SmallPtrSet
<SUnit
*, 8> Visited
;
2860 SmallVector
<SDep
, 8> Worklist
;
2861 Worklist
.push_back(Dep
);
2862 int EarlyCycle
= INT_MAX
;
2863 while (!Worklist
.empty()) {
2864 const SDep
&Cur
= Worklist
.pop_back_val();
2865 SUnit
*PrevSU
= Cur
.getSUnit();
2866 if (Visited
.count(PrevSU
))
2868 std::map
<SUnit
*, int>::const_iterator it
= InstrToCycle
.find(PrevSU
);
2869 if (it
== InstrToCycle
.end())
2871 EarlyCycle
= std::min(EarlyCycle
, it
->second
);
2872 for (const auto &PI
: PrevSU
->Preds
)
2873 if (PI
.getKind() == SDep::Order
|| PI
.getKind() == SDep::Output
)
2874 Worklist
.push_back(PI
);
2875 Visited
.insert(PrevSU
);
2880 // Return the cycle of the latest scheduled instruction in the chain.
2881 int SMSchedule::latestCycleInChain(const SDep
&Dep
) {
2882 SmallPtrSet
<SUnit
*, 8> Visited
;
2883 SmallVector
<SDep
, 8> Worklist
;
2884 Worklist
.push_back(Dep
);
2885 int LateCycle
= INT_MIN
;
2886 while (!Worklist
.empty()) {
2887 const SDep
&Cur
= Worklist
.pop_back_val();
2888 SUnit
*SuccSU
= Cur
.getSUnit();
2889 if (Visited
.count(SuccSU
) || SuccSU
->isBoundaryNode())
2891 std::map
<SUnit
*, int>::const_iterator it
= InstrToCycle
.find(SuccSU
);
2892 if (it
== InstrToCycle
.end())
2894 LateCycle
= std::max(LateCycle
, it
->second
);
2895 for (const auto &SI
: SuccSU
->Succs
)
2896 if (SI
.getKind() == SDep::Order
|| SI
.getKind() == SDep::Output
)
2897 Worklist
.push_back(SI
);
2898 Visited
.insert(SuccSU
);
2903 /// If an instruction has a use that spans multiple iterations, then
2904 /// return true. These instructions are characterized by having a back-ege
2905 /// to a Phi, which contains a reference to another Phi.
2906 static SUnit
*multipleIterations(SUnit
*SU
, SwingSchedulerDAG
*DAG
) {
2907 for (auto &P
: SU
->Preds
)
2908 if (DAG
->isBackedge(SU
, P
) && P
.getSUnit()->getInstr()->isPHI())
2909 for (auto &S
: P
.getSUnit()->Succs
)
2910 if (S
.getKind() == SDep::Data
&& S
.getSUnit()->getInstr()->isPHI())
2911 return P
.getSUnit();
2915 /// Compute the scheduling start slot for the instruction. The start slot
2916 /// depends on any predecessor or successor nodes scheduled already.
2917 void SMSchedule::computeStart(SUnit
*SU
, int *MaxEarlyStart
, int *MinLateStart
,
2918 int II
, SwingSchedulerDAG
*DAG
) {
2919 // Iterate over each instruction that has been scheduled already. The start
2920 // slot computation depends on whether the previously scheduled instruction
2921 // is a predecessor or successor of the specified instruction.
2922 for (int cycle
= getFirstCycle(); cycle
<= LastCycle
; ++cycle
) {
2924 // Iterate over each instruction in the current cycle.
2925 for (SUnit
*I
: getInstructions(cycle
)) {
2926 // Because we're processing a DAG for the dependences, we recognize
2927 // the back-edge in recurrences by anti dependences.
2928 for (unsigned i
= 0, e
= (unsigned)SU
->Preds
.size(); i
!= e
; ++i
) {
2929 const SDep
&Dep
= SU
->Preds
[i
];
2930 if (Dep
.getSUnit() == I
) {
2931 if (!DAG
->isBackedge(SU
, Dep
)) {
2932 int EarlyStart
= cycle
+ Dep
.getLatency() -
2933 DAG
->getDistance(Dep
.getSUnit(), SU
, Dep
) * II
;
2934 *MaxEarlyStart
= std::max(*MaxEarlyStart
, EarlyStart
);
2935 if (DAG
->isLoopCarriedDep(SU
, Dep
, false)) {
2936 int End
= earliestCycleInChain(Dep
) + (II
- 1);
2937 *MinLateStart
= std::min(*MinLateStart
, End
);
2940 int LateStart
= cycle
- Dep
.getLatency() +
2941 DAG
->getDistance(SU
, Dep
.getSUnit(), Dep
) * II
;
2942 *MinLateStart
= std::min(*MinLateStart
, LateStart
);
2945 // For instruction that requires multiple iterations, make sure that
2946 // the dependent instruction is not scheduled past the definition.
2947 SUnit
*BE
= multipleIterations(I
, DAG
);
2948 if (BE
&& Dep
.getSUnit() == BE
&& !SU
->getInstr()->isPHI() &&
2950 *MinLateStart
= std::min(*MinLateStart
, cycle
);
2952 for (unsigned i
= 0, e
= (unsigned)SU
->Succs
.size(); i
!= e
; ++i
) {
2953 if (SU
->Succs
[i
].getSUnit() == I
) {
2954 const SDep
&Dep
= SU
->Succs
[i
];
2955 if (!DAG
->isBackedge(SU
, Dep
)) {
2956 int LateStart
= cycle
- Dep
.getLatency() +
2957 DAG
->getDistance(SU
, Dep
.getSUnit(), Dep
) * II
;
2958 *MinLateStart
= std::min(*MinLateStart
, LateStart
);
2959 if (DAG
->isLoopCarriedDep(SU
, Dep
)) {
2960 int Start
= latestCycleInChain(Dep
) + 1 - II
;
2961 *MaxEarlyStart
= std::max(*MaxEarlyStart
, Start
);
2964 int EarlyStart
= cycle
+ Dep
.getLatency() -
2965 DAG
->getDistance(Dep
.getSUnit(), SU
, Dep
) * II
;
2966 *MaxEarlyStart
= std::max(*MaxEarlyStart
, EarlyStart
);
2974 /// Order the instructions within a cycle so that the definitions occur
2975 /// before the uses. Returns true if the instruction is added to the start
2976 /// of the list, or false if added to the end.
2977 void SMSchedule::orderDependence(const SwingSchedulerDAG
*SSD
, SUnit
*SU
,
2978 std::deque
<SUnit
*> &Insts
) const {
2979 MachineInstr
*MI
= SU
->getInstr();
2980 bool OrderBeforeUse
= false;
2981 bool OrderAfterDef
= false;
2982 bool OrderBeforeDef
= false;
2983 unsigned MoveDef
= 0;
2984 unsigned MoveUse
= 0;
2985 int StageInst1
= stageScheduled(SU
);
2988 for (std::deque
<SUnit
*>::iterator I
= Insts
.begin(), E
= Insts
.end(); I
!= E
;
2990 for (MachineOperand
&MO
: MI
->operands()) {
2991 if (!MO
.isReg() || !MO
.getReg().isVirtual())
2994 Register Reg
= MO
.getReg();
2995 unsigned BasePos
, OffsetPos
;
2996 if (ST
.getInstrInfo()->getBaseAndOffsetPosition(*MI
, BasePos
, OffsetPos
))
2997 if (MI
->getOperand(BasePos
).getReg() == Reg
)
2998 if (unsigned NewReg
= SSD
->getInstrBaseReg(SU
))
3001 std::tie(Reads
, Writes
) =
3002 (*I
)->getInstr()->readsWritesVirtualRegister(Reg
);
3003 if (MO
.isDef() && Reads
&& stageScheduled(*I
) <= StageInst1
) {
3004 OrderBeforeUse
= true;
3007 } else if (MO
.isDef() && Reads
&& stageScheduled(*I
) > StageInst1
) {
3008 // Add the instruction after the scheduled instruction.
3009 OrderAfterDef
= true;
3011 } else if (MO
.isUse() && Writes
&& stageScheduled(*I
) == StageInst1
) {
3012 if (cycleScheduled(*I
) == cycleScheduled(SU
) && !(*I
)->isSucc(SU
)) {
3013 OrderBeforeUse
= true;
3017 OrderAfterDef
= true;
3020 } else if (MO
.isUse() && Writes
&& stageScheduled(*I
) > StageInst1
) {
3021 OrderBeforeUse
= true;
3025 OrderAfterDef
= true;
3028 } else if (MO
.isUse() && Writes
&& stageScheduled(*I
) < StageInst1
) {
3029 // Add the instruction before the scheduled instruction.
3030 OrderBeforeUse
= true;
3033 } else if (MO
.isUse() && stageScheduled(*I
) == StageInst1
&&
3034 isLoopCarriedDefOfUse(SSD
, (*I
)->getInstr(), MO
)) {
3036 OrderBeforeDef
= true;
3041 // Check for order dependences between instructions. Make sure the source
3042 // is ordered before the destination.
3043 for (auto &S
: SU
->Succs
) {
3044 if (S
.getSUnit() != *I
)
3046 if (S
.getKind() == SDep::Order
&& stageScheduled(*I
) == StageInst1
) {
3047 OrderBeforeUse
= true;
3051 // We did not handle HW dependences in previous for loop,
3052 // and we normally set Latency = 0 for Anti deps,
3053 // so may have nodes in same cycle with Anti denpendent on HW regs.
3054 else if (S
.getKind() == SDep::Anti
&& stageScheduled(*I
) == StageInst1
) {
3055 OrderBeforeUse
= true;
3056 if ((MoveUse
== 0) || (Pos
< MoveUse
))
3060 for (auto &P
: SU
->Preds
) {
3061 if (P
.getSUnit() != *I
)
3063 if (P
.getKind() == SDep::Order
&& stageScheduled(*I
) == StageInst1
) {
3064 OrderAfterDef
= true;
3070 // A circular dependence.
3071 if (OrderAfterDef
&& OrderBeforeUse
&& MoveUse
== MoveDef
)
3072 OrderBeforeUse
= false;
3074 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3075 // to a loop-carried dependence.
3077 OrderBeforeUse
= !OrderAfterDef
|| (MoveUse
> MoveDef
);
3079 // The uncommon case when the instruction order needs to be updated because
3080 // there is both a use and def.
3081 if (OrderBeforeUse
&& OrderAfterDef
) {
3082 SUnit
*UseSU
= Insts
.at(MoveUse
);
3083 SUnit
*DefSU
= Insts
.at(MoveDef
);
3084 if (MoveUse
> MoveDef
) {
3085 Insts
.erase(Insts
.begin() + MoveUse
);
3086 Insts
.erase(Insts
.begin() + MoveDef
);
3088 Insts
.erase(Insts
.begin() + MoveDef
);
3089 Insts
.erase(Insts
.begin() + MoveUse
);
3091 orderDependence(SSD
, UseSU
, Insts
);
3092 orderDependence(SSD
, SU
, Insts
);
3093 orderDependence(SSD
, DefSU
, Insts
);
3096 // Put the new instruction first if there is a use in the list. Otherwise,
3097 // put it at the end of the list.
3099 Insts
.push_front(SU
);
3101 Insts
.push_back(SU
);
3104 /// Return true if the scheduled Phi has a loop carried operand.
3105 bool SMSchedule::isLoopCarried(const SwingSchedulerDAG
*SSD
,
3106 MachineInstr
&Phi
) const {
3109 assert(Phi
.isPHI() && "Expecting a Phi.");
3110 SUnit
*DefSU
= SSD
->getSUnit(&Phi
);
3111 unsigned DefCycle
= cycleScheduled(DefSU
);
3112 int DefStage
= stageScheduled(DefSU
);
3114 unsigned InitVal
= 0;
3115 unsigned LoopVal
= 0;
3116 getPhiRegs(Phi
, Phi
.getParent(), InitVal
, LoopVal
);
3117 SUnit
*UseSU
= SSD
->getSUnit(MRI
.getVRegDef(LoopVal
));
3120 if (UseSU
->getInstr()->isPHI())
3122 unsigned LoopCycle
= cycleScheduled(UseSU
);
3123 int LoopStage
= stageScheduled(UseSU
);
3124 return (LoopCycle
> DefCycle
) || (LoopStage
<= DefStage
);
3127 /// Return true if the instruction is a definition that is loop carried
3128 /// and defines the use on the next iteration.
3129 /// v1 = phi(v2, v3)
3130 /// (Def) v3 = op v1
3132 /// If MO appears before Def, then v1 and v3 may get assigned to the same
3134 bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG
*SSD
,
3136 MachineOperand
&MO
) const {
3141 MachineInstr
*Phi
= MRI
.getVRegDef(MO
.getReg());
3142 if (!Phi
|| !Phi
->isPHI() || Phi
->getParent() != Def
->getParent())
3144 if (!isLoopCarried(SSD
, *Phi
))
3146 unsigned LoopReg
= getLoopPhiReg(*Phi
, Phi
->getParent());
3147 for (MachineOperand
&DMO
: Def
->all_defs()) {
3148 if (DMO
.getReg() == LoopReg
)
3154 /// Return true if all scheduled predecessors are loop-carried output/order
3156 bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds(
3157 SUnit
*SU
, SwingSchedulerDAG
*DAG
) const {
3158 for (const SDep
&Pred
: SU
->Preds
)
3159 if (InstrToCycle
.count(Pred
.getSUnit()) && !DAG
->isBackedge(SU
, Pred
))
3161 for (const SDep
&Succ
: SU
->Succs
)
3162 if (InstrToCycle
.count(Succ
.getSUnit()) && DAG
->isBackedge(SU
, Succ
))
3167 /// Determine transitive dependences of unpipelineable instructions
3168 SmallSet
<SUnit
*, 8> SMSchedule::computeUnpipelineableNodes(
3169 SwingSchedulerDAG
*SSD
, TargetInstrInfo::PipelinerLoopInfo
*PLI
) {
3170 SmallSet
<SUnit
*, 8> DoNotPipeline
;
3171 SmallVector
<SUnit
*, 8> Worklist
;
3173 for (auto &SU
: SSD
->SUnits
)
3174 if (SU
.isInstr() && PLI
->shouldIgnoreForPipelining(SU
.getInstr()))
3175 Worklist
.push_back(&SU
);
3177 while (!Worklist
.empty()) {
3178 auto SU
= Worklist
.pop_back_val();
3179 if (DoNotPipeline
.count(SU
))
3181 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU
->NodeNum
<< ")\n");
3182 DoNotPipeline
.insert(SU
);
3183 for (auto &Dep
: SU
->Preds
)
3184 Worklist
.push_back(Dep
.getSUnit());
3185 if (SU
->getInstr()->isPHI())
3186 for (auto &Dep
: SU
->Succs
)
3187 if (Dep
.getKind() == SDep::Anti
)
3188 Worklist
.push_back(Dep
.getSUnit());
3190 return DoNotPipeline
;
3193 // Determine all instructions upon which any unpipelineable instruction depends
3194 // and ensure that they are in stage 0. If unable to do so, return false.
3195 bool SMSchedule::normalizeNonPipelinedInstructions(
3196 SwingSchedulerDAG
*SSD
, TargetInstrInfo::PipelinerLoopInfo
*PLI
) {
3197 SmallSet
<SUnit
*, 8> DNP
= computeUnpipelineableNodes(SSD
, PLI
);
3199 int NewLastCycle
= INT_MIN
;
3200 for (SUnit
&SU
: SSD
->SUnits
) {
3203 if (!DNP
.contains(&SU
) || stageScheduled(&SU
) == 0) {
3204 NewLastCycle
= std::max(NewLastCycle
, InstrToCycle
[&SU
]);
3208 // Put the non-pipelined instruction as early as possible in the schedule
3209 int NewCycle
= getFirstCycle();
3210 for (auto &Dep
: SU
.Preds
)
3211 NewCycle
= std::max(InstrToCycle
[Dep
.getSUnit()], NewCycle
);
3213 int OldCycle
= InstrToCycle
[&SU
];
3214 if (OldCycle
!= NewCycle
) {
3215 InstrToCycle
[&SU
] = NewCycle
;
3216 auto &OldS
= getInstructions(OldCycle
);
3217 llvm::erase(OldS
, &SU
);
3218 getInstructions(NewCycle
).emplace_back(&SU
);
3219 LLVM_DEBUG(dbgs() << "SU(" << SU
.NodeNum
3220 << ") is not pipelined; moving from cycle " << OldCycle
3221 << " to " << NewCycle
<< " Instr:" << *SU
.getInstr());
3223 NewLastCycle
= std::max(NewLastCycle
, NewCycle
);
3225 LastCycle
= NewLastCycle
;
3229 // Check if the generated schedule is valid. This function checks if
3230 // an instruction that uses a physical register is scheduled in a
3231 // different stage than the definition. The pipeliner does not handle
3232 // physical register values that may cross a basic block boundary.
3233 // Furthermore, if a physical def/use pair is assigned to the same
3234 // cycle, orderDependence does not guarantee def/use ordering, so that
3235 // case should be considered invalid. (The test checks for both
3236 // earlier and same-cycle use to be more robust.)
3237 bool SMSchedule::isValidSchedule(SwingSchedulerDAG
*SSD
) {
3238 for (SUnit
&SU
: SSD
->SUnits
) {
3239 if (!SU
.hasPhysRegDefs
)
3241 int StageDef
= stageScheduled(&SU
);
3242 int CycleDef
= InstrToCycle
[&SU
];
3243 assert(StageDef
!= -1 && "Instruction should have been scheduled.");
3244 for (auto &SI
: SU
.Succs
)
3245 if (SI
.isAssignedRegDep() && !SI
.getSUnit()->isBoundaryNode())
3246 if (Register::isPhysicalRegister(SI
.getReg())) {
3247 if (stageScheduled(SI
.getSUnit()) != StageDef
)
3249 if (InstrToCycle
[SI
.getSUnit()] <= CycleDef
)
3256 /// A property of the node order in swing-modulo-scheduling is
3257 /// that for nodes outside circuits the following holds:
3258 /// none of them is scheduled after both a successor and a
3260 /// The method below checks whether the property is met.
3261 /// If not, debug information is printed and statistics information updated.
3262 /// Note that we do not use an assert statement.
3263 /// The reason is that although an invalid node oder may prevent
3264 /// the pipeliner from finding a pipelined schedule for arbitrary II,
3265 /// it does not lead to the generation of incorrect code.
3266 void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType
&Circuits
) const {
3268 // a sorted vector that maps each SUnit to its index in the NodeOrder
3269 typedef std::pair
<SUnit
*, unsigned> UnitIndex
;
3270 std::vector
<UnitIndex
> Indices(NodeOrder
.size(), std::make_pair(nullptr, 0));
3272 for (unsigned i
= 0, s
= NodeOrder
.size(); i
< s
; ++i
)
3273 Indices
.push_back(std::make_pair(NodeOrder
[i
], i
));
3275 auto CompareKey
= [](UnitIndex i1
, UnitIndex i2
) {
3276 return std::get
<0>(i1
) < std::get
<0>(i2
);
3279 // sort, so that we can perform a binary search
3280 llvm::sort(Indices
, CompareKey
);
3284 // for each SUnit in the NodeOrder, check whether
3285 // it appears after both a successor and a predecessor
3286 // of the SUnit. If this is the case, and the SUnit
3287 // is not part of circuit, then the NodeOrder is not
3289 for (unsigned i
= 0, s
= NodeOrder
.size(); i
< s
; ++i
) {
3290 SUnit
*SU
= NodeOrder
[i
];
3293 bool PredBefore
= false;
3294 bool SuccBefore
= false;
3301 for (SDep
&PredEdge
: SU
->Preds
) {
3302 SUnit
*PredSU
= PredEdge
.getSUnit();
3303 unsigned PredIndex
= std::get
<1>(
3304 *llvm::lower_bound(Indices
, std::make_pair(PredSU
, 0), CompareKey
));
3305 if (!PredSU
->getInstr()->isPHI() && PredIndex
< Index
) {
3312 for (SDep
&SuccEdge
: SU
->Succs
) {
3313 SUnit
*SuccSU
= SuccEdge
.getSUnit();
3314 // Do not process a boundary node, it was not included in NodeOrder,
3315 // hence not in Indices either, call to std::lower_bound() below will
3316 // return Indices.end().
3317 if (SuccSU
->isBoundaryNode())
3319 unsigned SuccIndex
= std::get
<1>(
3320 *llvm::lower_bound(Indices
, std::make_pair(SuccSU
, 0), CompareKey
));
3321 if (!SuccSU
->getInstr()->isPHI() && SuccIndex
< Index
) {
3328 if (PredBefore
&& SuccBefore
&& !SU
->getInstr()->isPHI()) {
3329 // instructions in circuits are allowed to be scheduled
3330 // after both a successor and predecessor.
3331 bool InCircuit
= llvm::any_of(
3332 Circuits
, [SU
](const NodeSet
&Circuit
) { return Circuit
.count(SU
); });
3334 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3337 NumNodeOrderIssues
++;
3338 LLVM_DEBUG(dbgs() << "Predecessor ";);
3340 LLVM_DEBUG(dbgs() << Pred
->NodeNum
<< " and successor " << Succ
->NodeNum
3341 << " are scheduled before node " << SU
->NodeNum
3348 dbgs() << "Invalid node order found!\n";
3352 /// Attempt to fix the degenerate cases when the instruction serialization
3353 /// causes the register lifetimes to overlap. For example,
3354 /// p' = store_pi(p, b)
3355 /// = load p, offset
3356 /// In this case p and p' overlap, which means that two registers are needed.
3357 /// Instead, this function changes the load to use p' and updates the offset.
3358 void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque
<SUnit
*> &Instrs
) {
3359 unsigned OverlapReg
= 0;
3360 unsigned NewBaseReg
= 0;
3361 for (SUnit
*SU
: Instrs
) {
3362 MachineInstr
*MI
= SU
->getInstr();
3363 for (unsigned i
= 0, e
= MI
->getNumOperands(); i
< e
; ++i
) {
3364 const MachineOperand
&MO
= MI
->getOperand(i
);
3365 // Look for an instruction that uses p. The instruction occurs in the
3366 // same cycle but occurs later in the serialized order.
3367 if (MO
.isReg() && MO
.isUse() && MO
.getReg() == OverlapReg
) {
3368 // Check that the instruction appears in the InstrChanges structure,
3369 // which contains instructions that can have the offset updated.
3370 DenseMap
<SUnit
*, std::pair
<unsigned, int64_t>>::iterator It
=
3371 InstrChanges
.find(SU
);
3372 if (It
!= InstrChanges
.end()) {
3373 unsigned BasePos
, OffsetPos
;
3374 // Update the base register and adjust the offset.
3375 if (TII
->getBaseAndOffsetPosition(*MI
, BasePos
, OffsetPos
)) {
3376 MachineInstr
*NewMI
= MF
.CloneMachineInstr(MI
);
3377 NewMI
->getOperand(BasePos
).setReg(NewBaseReg
);
3379 MI
->getOperand(OffsetPos
).getImm() - It
->second
.second
;
3380 NewMI
->getOperand(OffsetPos
).setImm(NewOffset
);
3381 SU
->setInstr(NewMI
);
3382 MISUnitMap
[NewMI
] = SU
;
3390 // Look for an instruction of the form p' = op(p), which uses and defines
3391 // two virtual registers that get allocated to the same physical register.
3392 unsigned TiedUseIdx
= 0;
3393 if (MI
->isRegTiedToUseOperand(i
, &TiedUseIdx
)) {
3394 // OverlapReg is p in the example above.
3395 OverlapReg
= MI
->getOperand(TiedUseIdx
).getReg();
3396 // NewBaseReg is p' in the example above.
3397 NewBaseReg
= MI
->getOperand(i
).getReg();
3405 SMSchedule::reorderInstructions(const SwingSchedulerDAG
*SSD
,
3406 const std::deque
<SUnit
*> &Instrs
) const {
3407 std::deque
<SUnit
*> NewOrderPhi
;
3408 for (SUnit
*SU
: Instrs
) {
3409 if (SU
->getInstr()->isPHI())
3410 NewOrderPhi
.push_back(SU
);
3412 std::deque
<SUnit
*> NewOrderI
;
3413 for (SUnit
*SU
: Instrs
) {
3414 if (!SU
->getInstr()->isPHI())
3415 orderDependence(SSD
, SU
, NewOrderI
);
3417 llvm::append_range(NewOrderPhi
, NewOrderI
);
3421 /// After the schedule has been formed, call this function to combine
3422 /// the instructions from the different stages/cycles. That is, this
3423 /// function creates a schedule that represents a single iteration.
3424 void SMSchedule::finalizeSchedule(SwingSchedulerDAG
*SSD
) {
3425 // Move all instructions to the first stage from later stages.
3426 for (int cycle
= getFirstCycle(); cycle
<= getFinalCycle(); ++cycle
) {
3427 for (int stage
= 1, lastStage
= getMaxStageCount(); stage
<= lastStage
;
3429 std::deque
<SUnit
*> &cycleInstrs
=
3430 ScheduledInstrs
[cycle
+ (stage
* InitiationInterval
)];
3431 for (SUnit
*SU
: llvm::reverse(cycleInstrs
))
3432 ScheduledInstrs
[cycle
].push_front(SU
);
3436 // Erase all the elements in the later stages. Only one iteration should
3437 // remain in the scheduled list, and it contains all the instructions.
3438 for (int cycle
= getFinalCycle() + 1; cycle
<= LastCycle
; ++cycle
)
3439 ScheduledInstrs
.erase(cycle
);
3441 // Change the registers in instruction as specified in the InstrChanges
3442 // map. We need to use the new registers to create the correct order.
3443 for (const SUnit
&SU
: SSD
->SUnits
)
3444 SSD
->applyInstrChange(SU
.getInstr(), *this);
3446 // Reorder the instructions in each cycle to fix and improve the
3448 for (int Cycle
= getFirstCycle(), E
= getFinalCycle(); Cycle
<= E
; ++Cycle
) {
3449 std::deque
<SUnit
*> &cycleInstrs
= ScheduledInstrs
[Cycle
];
3450 cycleInstrs
= reorderInstructions(SSD
, cycleInstrs
);
3451 SSD
->fixupRegisterOverlaps(cycleInstrs
);
3454 LLVM_DEBUG(dump(););
3457 void NodeSet::print(raw_ostream
&os
) const {
3458 os
<< "Num nodes " << size() << " rec " << RecMII
<< " mov " << MaxMOV
3459 << " depth " << MaxDepth
<< " col " << Colocate
<< "\n";
3460 for (const auto &I
: Nodes
)
3461 os
<< " SU(" << I
->NodeNum
<< ") " << *(I
->getInstr());
3465 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3466 /// Print the schedule information to the given output.
3467 void SMSchedule::print(raw_ostream
&os
) const {
3468 // Iterate over each cycle.
3469 for (int cycle
= getFirstCycle(); cycle
<= getFinalCycle(); ++cycle
) {
3470 // Iterate over each instruction in the cycle.
3471 const_sched_iterator cycleInstrs
= ScheduledInstrs
.find(cycle
);
3472 for (SUnit
*CI
: cycleInstrs
->second
) {
3473 os
<< "cycle " << cycle
<< " (" << stageScheduled(CI
) << ") ";
3474 os
<< "(" << CI
->NodeNum
<< ") ";
3475 CI
->getInstr()->print(os
);
3481 /// Utility function used for debugging to print the schedule.
3482 LLVM_DUMP_METHOD
void SMSchedule::dump() const { print(dbgs()); }
3483 LLVM_DUMP_METHOD
void NodeSet::dump() const { print(dbgs()); }
3485 void ResourceManager::dumpMRT() const {
3489 std::stringstream SS
;
3491 SS
<< std::setw(4) << "Slot";
3492 for (unsigned I
= 1, E
= SM
.getNumProcResourceKinds(); I
< E
; ++I
)
3493 SS
<< std::setw(3) << I
;
3494 SS
<< std::setw(7) << "#Mops"
3496 for (int Slot
= 0; Slot
< InitiationInterval
; ++Slot
) {
3497 SS
<< std::setw(4) << Slot
;
3498 for (unsigned I
= 1, E
= SM
.getNumProcResourceKinds(); I
< E
; ++I
)
3499 SS
<< std::setw(3) << MRT
[Slot
][I
];
3500 SS
<< std::setw(7) << NumScheduledMops
[Slot
] << "\n";
3507 void ResourceManager::initProcResourceVectors(
3508 const MCSchedModel
&SM
, SmallVectorImpl
<uint64_t> &Masks
) {
3509 unsigned ProcResourceID
= 0;
3511 // We currently limit the resource kinds to 64 and below so that we can use
3512 // uint64_t for Masks
3513 assert(SM
.getNumProcResourceKinds() < 64 &&
3514 "Too many kinds of resources, unsupported");
3515 // Create a unique bitmask for every processor resource unit.
3516 // Skip resource at index 0, since it always references 'InvalidUnit'.
3517 Masks
.resize(SM
.getNumProcResourceKinds());
3518 for (unsigned I
= 1, E
= SM
.getNumProcResourceKinds(); I
< E
; ++I
) {
3519 const MCProcResourceDesc
&Desc
= *SM
.getProcResource(I
);
3520 if (Desc
.SubUnitsIdxBegin
)
3522 Masks
[I
] = 1ULL << ProcResourceID
;
3525 // Create a unique bitmask for every processor resource group.
3526 for (unsigned I
= 1, E
= SM
.getNumProcResourceKinds(); I
< E
; ++I
) {
3527 const MCProcResourceDesc
&Desc
= *SM
.getProcResource(I
);
3528 if (!Desc
.SubUnitsIdxBegin
)
3530 Masks
[I
] = 1ULL << ProcResourceID
;
3531 for (unsigned U
= 0; U
< Desc
.NumUnits
; ++U
)
3532 Masks
[I
] |= Masks
[Desc
.SubUnitsIdxBegin
[U
]];
3536 if (SwpShowResMask
) {
3537 dbgs() << "ProcResourceDesc:\n";
3538 for (unsigned I
= 1, E
= SM
.getNumProcResourceKinds(); I
< E
; ++I
) {
3539 const MCProcResourceDesc
*ProcResource
= SM
.getProcResource(I
);
3540 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3541 ProcResource
->Name
, I
, Masks
[I
],
3542 ProcResource
->NumUnits
);
3544 dbgs() << " -----------------\n";
3549 bool ResourceManager::canReserveResources(SUnit
&SU
, int Cycle
) {
3551 if (SwpDebugResource
)
3552 dbgs() << "canReserveResources:\n";
3555 return DFAResources
[positiveModulo(Cycle
, InitiationInterval
)]
3556 ->canReserveResources(&SU
.getInstr()->getDesc());
3558 const MCSchedClassDesc
*SCDesc
= DAG
->getSchedClass(&SU
);
3559 if (!SCDesc
->isValid()) {
3561 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3562 dbgs() << "isPseudo:" << SU
.getInstr()->isPseudo() << "\n";
3567 reserveResources(SCDesc
, Cycle
);
3568 bool Result
= !isOverbooked();
3569 unreserveResources(SCDesc
, Cycle
);
3571 LLVM_DEBUG(if (SwpDebugResource
) dbgs() << "return " << Result
<< "\n\n";);
3575 void ResourceManager::reserveResources(SUnit
&SU
, int Cycle
) {
3577 if (SwpDebugResource
)
3578 dbgs() << "reserveResources:\n";
3581 return DFAResources
[positiveModulo(Cycle
, InitiationInterval
)]
3582 ->reserveResources(&SU
.getInstr()->getDesc());
3584 const MCSchedClassDesc
*SCDesc
= DAG
->getSchedClass(&SU
);
3585 if (!SCDesc
->isValid()) {
3587 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3588 dbgs() << "isPseudo:" << SU
.getInstr()->isPseudo() << "\n";
3593 reserveResources(SCDesc
, Cycle
);
3596 if (SwpDebugResource
) {
3598 dbgs() << "reserveResources: done!\n\n";
3603 void ResourceManager::reserveResources(const MCSchedClassDesc
*SCDesc
,
3606 for (const MCWriteProcResEntry
&PRE
: make_range(
3607 STI
->getWriteProcResBegin(SCDesc
), STI
->getWriteProcResEnd(SCDesc
)))
3608 for (int C
= Cycle
; C
< Cycle
+ PRE
.ReleaseAtCycle
; ++C
)
3609 ++MRT
[positiveModulo(C
, InitiationInterval
)][PRE
.ProcResourceIdx
];
3611 for (int C
= Cycle
; C
< Cycle
+ SCDesc
->NumMicroOps
; ++C
)
3612 ++NumScheduledMops
[positiveModulo(C
, InitiationInterval
)];
3615 void ResourceManager::unreserveResources(const MCSchedClassDesc
*SCDesc
,
3618 for (const MCWriteProcResEntry
&PRE
: make_range(
3619 STI
->getWriteProcResBegin(SCDesc
), STI
->getWriteProcResEnd(SCDesc
)))
3620 for (int C
= Cycle
; C
< Cycle
+ PRE
.ReleaseAtCycle
; ++C
)
3621 --MRT
[positiveModulo(C
, InitiationInterval
)][PRE
.ProcResourceIdx
];
3623 for (int C
= Cycle
; C
< Cycle
+ SCDesc
->NumMicroOps
; ++C
)
3624 --NumScheduledMops
[positiveModulo(C
, InitiationInterval
)];
3627 bool ResourceManager::isOverbooked() const {
3629 for (int Slot
= 0; Slot
< InitiationInterval
; ++Slot
) {
3630 for (unsigned I
= 1, E
= SM
.getNumProcResourceKinds(); I
< E
; ++I
) {
3631 const MCProcResourceDesc
*Desc
= SM
.getProcResource(I
);
3632 if (MRT
[Slot
][I
] > Desc
->NumUnits
)
3635 if (NumScheduledMops
[Slot
] > IssueWidth
)
3641 int ResourceManager::calculateResMIIDFA() const {
3644 // Sort the instructions by the number of available choices for scheduling,
3645 // least to most. Use the number of critical resources as the tie breaker.
3646 FuncUnitSorter FUS
= FuncUnitSorter(*ST
);
3647 for (SUnit
&SU
: DAG
->SUnits
)
3648 FUS
.calcCriticalResources(*SU
.getInstr());
3649 PriorityQueue
<MachineInstr
*, std::vector
<MachineInstr
*>, FuncUnitSorter
>
3652 for (SUnit
&SU
: DAG
->SUnits
)
3653 FuncUnitOrder
.push(SU
.getInstr());
3655 SmallVector
<std::unique_ptr
<DFAPacketizer
>, 8> Resources
;
3656 Resources
.push_back(
3657 std::unique_ptr
<DFAPacketizer
>(TII
->CreateTargetScheduleState(*ST
)));
3659 while (!FuncUnitOrder
.empty()) {
3660 MachineInstr
*MI
= FuncUnitOrder
.top();
3661 FuncUnitOrder
.pop();
3662 if (TII
->isZeroCost(MI
->getOpcode()))
3665 // Attempt to reserve the instruction in an existing DFA. At least one
3666 // DFA is needed for each cycle.
3667 unsigned NumCycles
= DAG
->getSUnit(MI
)->Latency
;
3668 unsigned ReservedCycles
= 0;
3669 auto *RI
= Resources
.begin();
3670 auto *RE
= Resources
.end();
3672 dbgs() << "Trying to reserve resource for " << NumCycles
3673 << " cycles for \n";
3676 for (unsigned C
= 0; C
< NumCycles
; ++C
)
3678 if ((*RI
)->canReserveResources(*MI
)) {
3679 (*RI
)->reserveResources(*MI
);
3685 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3686 << ", NumCycles:" << NumCycles
<< "\n");
3687 // Add new DFAs, if needed, to reserve resources.
3688 for (unsigned C
= ReservedCycles
; C
< NumCycles
; ++C
) {
3689 LLVM_DEBUG(if (SwpDebugResource
) dbgs()
3690 << "NewResource created to reserve resources"
3692 auto *NewResource
= TII
->CreateTargetScheduleState(*ST
);
3693 assert(NewResource
->canReserveResources(*MI
) && "Reserve error.");
3694 NewResource
->reserveResources(*MI
);
3695 Resources
.push_back(std::unique_ptr
<DFAPacketizer
>(NewResource
));
3699 int Resmii
= Resources
.size();
3700 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii
<< "\n");
3704 int ResourceManager::calculateResMII() const {
3706 return calculateResMIIDFA();
3708 // Count each resource consumption and divide it by the number of units.
3709 // ResMII is the max value among them.
3712 SmallVector
<uint64_t> ResourceCount(SM
.getNumProcResourceKinds());
3713 for (SUnit
&SU
: DAG
->SUnits
) {
3714 if (TII
->isZeroCost(SU
.getInstr()->getOpcode()))
3717 const MCSchedClassDesc
*SCDesc
= DAG
->getSchedClass(&SU
);
3718 if (!SCDesc
->isValid())
3722 if (SwpDebugResource
) {
3724 dbgs() << " #Mops: " << SCDesc
->NumMicroOps
<< "\n"
3725 << " WriteProcRes: ";
3728 NumMops
+= SCDesc
->NumMicroOps
;
3729 for (const MCWriteProcResEntry
&PRE
:
3730 make_range(STI
->getWriteProcResBegin(SCDesc
),
3731 STI
->getWriteProcResEnd(SCDesc
))) {
3733 if (SwpDebugResource
) {
3734 const MCProcResourceDesc
*Desc
=
3735 SM
.getProcResource(PRE
.ProcResourceIdx
);
3736 dbgs() << Desc
->Name
<< ": " << PRE
.ReleaseAtCycle
<< ", ";
3739 ResourceCount
[PRE
.ProcResourceIdx
] += PRE
.ReleaseAtCycle
;
3741 LLVM_DEBUG(if (SwpDebugResource
) dbgs() << "\n");
3744 int Result
= (NumMops
+ IssueWidth
- 1) / IssueWidth
;
3746 if (SwpDebugResource
)
3747 dbgs() << "#Mops: " << NumMops
<< ", "
3748 << "IssueWidth: " << IssueWidth
<< ", "
3749 << "Cycles: " << Result
<< "\n";
3753 if (SwpDebugResource
) {
3754 std::stringstream SS
;
3755 SS
<< std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3756 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3761 for (unsigned I
= 1, E
= SM
.getNumProcResourceKinds(); I
< E
; ++I
) {
3762 const MCProcResourceDesc
*Desc
= SM
.getProcResource(I
);
3763 int Cycles
= (ResourceCount
[I
] + Desc
->NumUnits
- 1) / Desc
->NumUnits
;
3765 if (SwpDebugResource
) {
3766 std::stringstream SS
;
3767 SS
<< std::setw(2) << I
<< std::setw(16) << Desc
->Name
<< std::setw(10)
3768 << Desc
->NumUnits
<< std::setw(10) << ResourceCount
[I
]
3769 << std::setw(10) << Cycles
<< "\n";
3773 if (Cycles
> Result
)
3779 void ResourceManager::init(int II
) {
3780 InitiationInterval
= II
;
3781 DFAResources
.clear();
3782 DFAResources
.resize(II
);
3783 for (auto &I
: DFAResources
)
3784 I
.reset(ST
->getInstrInfo()->CreateTargetScheduleState(*ST
));
3786 MRT
.resize(II
, SmallVector
<uint64_t>(SM
.getNumProcResourceKinds()));
3787 NumScheduledMops
.clear();
3788 NumScheduledMops
.resize(II
);