//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
//
// This SMS implementation is a target-independent back-end pass. When enabled,
// the pass runs just prior to the register allocation pass, while the machine
// IR is in SSA form. If software pipelining is successful, then the original
// loop is replaced by the optimized loop. The optimized loop contains one or
// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
// the instructions cannot be scheduled in a given MII, we increase the MII by
// one and try again.
//
// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
// represent loop carried dependences in the DAG as order edges to the Phi
// nodes. We also perform several passes over the DAG to eliminate unnecessary
// edges that inhibit the ability to pipeline. The implementation uses the
// DFAPacketizer class to compute the minimum initiation interval and to check
// where an instruction may be inserted in the pipelined schedule.
//
// In order for the SMS pass to work, several target specific hooks need to be
// implemented to get information about the loop structure and to rewrite
// instructions.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePipeliner.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "pipeliner"

STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
STATISTIC(NumPipelined, "Number of loops software pipelined");
STATISTIC(NumNodeOrderIssues, "Number of node order issues found");

/// A command line option to turn software pipelining on or off.
static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
                               cl::desc("Enable Software Pipelining"));

/// A command line option to enable SWP at -Os.
static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
                                      cl::desc("Enable SWP at Os."), cl::Hidden,
                                      cl::init(false));

/// A command line argument that caps the minimum initiation interval (MII);
/// loops whose MII exceeds this limit are not pipelined.
static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
                              cl::desc("Size limit for the MII."),
                              cl::Hidden, cl::init(27));

/// A command line argument to limit the number of stages in the pipeline.
static cl::opt<int>
    SwpMaxStages("pipeliner-max-stages",
                 cl::desc("Maximum stages allowed in the generated schedule."),
                 cl::Hidden, cl::init(3));

/// A command line option to disable the pruning of chain dependences due to
/// an unrelated Phi.
static cl::opt<bool>
    SwpPruneDeps("pipeliner-prune-deps",
                 cl::desc("Prune dependences between unrelated Phi nodes."),
                 cl::Hidden, cl::init(true));

/// A command line option to disable the pruning of loop carried order
/// dependences.
static cl::opt<bool>
    SwpPruneLoopCarried("pipeliner-prune-loop-carried",
                        cl::desc("Prune loop carried order dependences."),
                        cl::Hidden, cl::init(true));

static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));

static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
                                     cl::ReallyHidden, cl::init(false),
                                     cl::ZeroOrMore, cl::desc("Ignore RecMII"));

// A command line option to enable the CopyToPhi DAG mutation.
static cl::opt<bool>
    SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
                       cl::init(true), cl::ZeroOrMore,
                       cl::desc("Enable CopyToPhi DAG Mutation"));

} // end namespace llvm
unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
char MachinePipeliner::ID = 0;
int MachinePipeliner::NumTries = 0;
char &llvm::MachinePipelinerID = MachinePipeliner::ID;

INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
                      "Modulo Software Pipelining", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
                    "Modulo Software Pipelining", false, false)
/// The "main" function for implementing Swing Modulo Scheduling.
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
  if (skipFunction(mf.getFunction()))
    return false;

  if (mf.getFunction().getAttributes().hasAttribute(
          AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
      !EnableSWPOptSize.getPosition())
    return false;

  MF = &mf;
  MLI = &getAnalysis<MachineLoopInfo>();
  MDT = &getAnalysis<MachineDominatorTree>();
  TII = MF->getSubtarget().getInstrInfo();
  RegClassInfo.runOnMachineFunction(*MF);
/// Attempt to perform the SMS algorithm on the specified loop. This function
/// is the main entry point for the algorithm. The function identifies
/// candidate loops, calculates the minimum initiation interval, and attempts
/// to schedule the loop.
bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
  bool Changed = false;
  for (auto &InnerLoop : L)
    Changed |= scheduleLoop(*InnerLoop);

  // Stop trying after reaching the limit (if any).
  int Limit = SwpLoopLimit;
  if (NumTries >= SwpLoopLimit)
    return Changed;

  setPragmaPipelineOptions(L);
  if (!canPipelineLoop(L)) {
    LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
    return Changed;
  }

  Changed = swingModuloScheduler(L);
void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
  MachineBasicBlock *LBLK = L.getTopBlock();

  const BasicBlock *BBLK = LBLK->getBasicBlock();

  const Instruction *TI = BBLK->getTerminator();

  MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
  if (LoopID == nullptr)
    return;

  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop");

  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));

    if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
      assert(MD->getNumOperands() == 2 &&
             "Pipeline initiation interval hint metadata should have two operands.");
      II_setByPragma =
          mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
      assert(II_setByPragma >= 1 &&
             "Pipeline initiation interval must be positive.");
    } else if (S->getString() == "llvm.loop.pipeline.disable") {
      disabledByPragma = true;
/// Return true if the loop can be software pipelined. The algorithm is
/// restricted to loops with a single basic block. Make sure that the
/// branch in the loop can be analyzed.
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
  if (L.getNumBlocks() != 1)
    return false;

  if (disabledByPragma)
    return false;

  // Check that the branch can be analyzed; we can't do pipelining if the
  // branch can't be understood.
  if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
    return false;

  LI.LoopInductionVar = nullptr;
  LI.LoopCompare = nullptr;
  if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
    return false;

  if (!L.getLoopPreheader())
    return false;

  // Remove any subregisters from inputs to phi nodes.
  preprocessPhiNodes(*L.getHeader());
void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
  MachineRegisterInfo &MRI = MF->getRegInfo();
  SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();

  for (MachineInstr &PI : make_range(B.begin(), B.getFirstNonPHI())) {
    MachineOperand &DefOp = PI.getOperand(0);
    assert(DefOp.getSubReg() == 0);
    auto *RC = MRI.getRegClass(DefOp.getReg());

    for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
      MachineOperand &RegOp = PI.getOperand(i);
      if (RegOp.getSubReg() == 0)
        continue;

      // If the operand uses a subregister, replace it with a new register
      // without subregisters, and generate a copy to the new register.
      unsigned NewReg = MRI.createVirtualRegister(RC);
      MachineBasicBlock &PredB = *PI.getOperand(i + 1).getMBB();
      MachineBasicBlock::iterator At = PredB.getFirstTerminator();
      const DebugLoc &DL = PredB.findDebugLoc(At);
      auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
                      .addReg(RegOp.getReg(), getRegState(RegOp),
                              RegOp.getSubReg());
      Slots.insertMachineInstrInMaps(*Copy);
      RegOp.setReg(NewReg);
/// The SMS algorithm consists of the following main steps:
/// 1. Computation and analysis of the dependence graph.
/// 2. Ordering of the nodes (instructions).
/// 3. Attempt to schedule the loop.
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
  assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

  SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
                        II_setByPragma);

  MachineBasicBlock *MBB = L.getHeader();
  // The kernel should not include any terminator instructions. These
  // will be added back later.

  // Compute the number of 'real' instructions in the basic block by
  // ignoring terminators.
  unsigned size = MBB->size();
  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
                                   E = MBB->instr_end();
       I != E; ++I, --size)
    ;

  SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);

  return SMS.hasNewSchedule();
void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
  if (II_setByPragma > 0)
    MII = II_setByPragma;
  else
    MII = std::max(ResMII, RecMII);
}
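
// MAX_II is the largest initiation interval that schedulePipeline() will try
// before giving up; when a pragma pins the II, both MII and MAX_II are set to
// that value.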
void SwingSchedulerDAG::setMAX_II() {
  if (II_setByPragma > 0)
    MAX_II = II_setByPragma;
/// We override the schedule function in ScheduleDAGInstrs to implement the
/// scheduling part of the Swing Modulo Scheduling algorithm.
void SwingSchedulerDAG::schedule() {
  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
  addLoopCarriedDependences(AA);
  updatePhiDependences();
  Topo.InitDAGTopologicalSorting();

  NodeSetType NodeSets;
  findCircuits(NodeSets);
  NodeSetType Circuits = NodeSets;
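  // Keep a copy of the recurrence node-sets; checkValidNodeOrder() uses it
  // later to verify the final node order against the circuits.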
  // Calculate the MII.
  unsigned ResMII = calculateResMII();
  unsigned RecMII = calculateRecMII(NodeSets);

  // This flag is used for testing and can cause correctness problems.

  setMII(ResMII, RecMII);

  LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
                    << " (rec=" << RecMII << ", res=" << ResMII << ")\n");

  // Can't schedule a loop without a valid MII.

  // Don't pipeline large loops.
  if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)
    return;

  computeNodeFunctions(NodeSets);

  registerPressureFilter(NodeSets);

  colocateNodeSets(NodeSets);

  checkNodeSets(NodeSets);

  for (auto &I : NodeSets) {
    dbgs() << "  Rec NodeSet ";

  llvm::stable_sort(NodeSets, std::greater<NodeSet>());

  groupRemainingNodes(NodeSets);

  removeDuplicateNodes(NodeSets);

  for (auto &I : NodeSets) {
    dbgs() << "  NodeSet ";

  computeNodeOrder(NodeSets);

  // Check for node order issues.
  checkValidNodeOrder(Circuits);

  SMSchedule Schedule(Pass.MF);
  Scheduled = schedulePipeline(Schedule);

  unsigned numStages = Schedule.getMaxStageCount();
  // No need to generate pipeline if there are no overlapped iterations.

  // Check that the maximum stage count is less than user-defined limit.
  if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)
    return;

  generatePipelinedLoop(Schedule);
/// Clean up after the software pipeliner runs.
void SwingSchedulerDAG::finishBlock() {
  for (MachineInstr *I : NewMIs)
    MF.DeleteMachineInstr(I);

  // Call the superclass.
  ScheduleDAGInstrs::finishBlock();
/// Return the register values for the operands of a Phi instruction.
/// This function assumes the instruction is a Phi.
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
                       unsigned &InitVal, unsigned &LoopVal) {
  assert(Phi.isPHI() && "Expecting a Phi.");

  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != Loop)
      InitVal = Phi.getOperand(i).getReg();
    else
      LoopVal = Phi.getOperand(i).getReg();

  assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
}
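
// Note: machine PHI operands come in (value, predecessor block) pairs starting
// at operand 1; operand 0 is the destination. The helpers above and below rely
// on that layout.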
/// Return the Phi register value that comes from the incoming block.
static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() != LoopBB)
      return Phi.getOperand(i).getReg();
  return 0;
}

/// Return the Phi register value that comes from the loop block.
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
  for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
    if (Phi.getOperand(i + 1).getMBB() == LoopBB)
      return Phi.getOperand(i).getReg();
  return 0;
}
/// Return true if SUb can be reached from SUa following the chain edges.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
  SmallPtrSet<SUnit *, 8> Visited;
  SmallVector<SUnit *, 8> Worklist;
  Worklist.push_back(SUa);
  while (!Worklist.empty()) {
    const SUnit *SU = Worklist.pop_back_val();
    for (auto &SI : SU->Succs) {
      SUnit *SuccSU = SI.getSUnit();
      if (SI.getKind() == SDep::Order) {
        if (Visited.count(SuccSU))
          continue;

        Worklist.push_back(SuccSU);
        Visited.insert(SuccSU);
/// Return true if the instruction causes a chain between memory
/// references before and after it.
static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
  return MI.isCall() || MI.hasUnmodeledSideEffects() ||
         (MI.hasOrderedMemoryRef() &&
          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
/// Return the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
static void getUnderlyingObjects(const MachineInstr *MI,
                                 SmallVectorImpl<const Value *> &Objs,
                                 const DataLayout &DL) {
  if (!MI->hasOneMemOperand())
    return;
  MachineMemOperand *MM = *MI->memoperands_begin();

  GetUnderlyingObjects(MM->getValue(), Objs, DL);
  for (const Value *V : Objs) {
    if (!isIdentifiedObject(V)) {
/// Add a chain edge between a load and store if the store can be an
/// alias of the load on a subsequent iteration, i.e., a loop carried
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
/// but that code doesn't create loop carried dependences.
void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
  MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
  Value *UnknownValue =
      UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
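  // UnknownValue stands in for memory accesses whose underlying object cannot
  // be identified; such accesses are conservatively assumed to conflict with
  // any later store.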
  for (auto &SU : SUnits) {
    MachineInstr &MI = *SU.getInstr();
    if (isDependenceBarrier(MI, AA))
      PendingLoads.clear();
    else if (MI.mayLoad()) {
      SmallVector<const Value *, 4> Objs;
      getUnderlyingObjects(&MI, Objs, MF.getDataLayout());

      Objs.push_back(UnknownValue);
      for (auto V : Objs) {
        SmallVector<SUnit *, 4> &SUs = PendingLoads[V];

    } else if (MI.mayStore()) {
      SmallVector<const Value *, 4> Objs;
      getUnderlyingObjects(&MI, Objs, MF.getDataLayout());

      Objs.push_back(UnknownValue);
      for (auto V : Objs) {
        MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
            PendingLoads.find(V);
        if (I == PendingLoads.end())
          continue;
        for (auto Load : I->second) {
          if (isSuccOrder(Load, &SU))
            continue;
          MachineInstr &LdMI = *Load->getInstr();
          // First, perform the cheaper check that compares the base register.
          // If they are the same and the load offset is less than the store
          // offset, then mark the dependence as loop carried potentially.
          const MachineOperand *BaseOp1, *BaseOp2;
          int64_t Offset1, Offset2;
          if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) &&
              TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) {
            if (BaseOp1->isIdenticalTo(*BaseOp2) &&
                (int)Offset1 < (int)Offset2) {
              assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
                     "What happened to the chain edge?");
              SDep Dep(Load, SDep::Barrier);

          // Second, the more expensive check that uses alias analysis on the
          // base registers. If they alias, and the load offset is less than
          // the store offset, then mark the dependence as loop carried.

          SDep Dep(Load, SDep::Barrier);

          MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
          MachineMemOperand *MMO2 = *MI.memoperands_begin();
          if (!MMO1->getValue() || !MMO2->getValue()) {
            SDep Dep(Load, SDep::Barrier);

          if (MMO1->getValue() == MMO2->getValue() &&
              MMO1->getOffset() <= MMO2->getOffset()) {
            SDep Dep(Load, SDep::Barrier);

          AliasResult AAResult = AA->alias(
              MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
              MemoryLocation(MMO2->getValue(), LocationSize::unknown(),

          if (AAResult != NoAlias) {
            SDep Dep(Load, SDep::Barrier);
/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
/// processes dependences for PHIs. This function adds true dependences
/// from a PHI to a use, and a loop carried dependence from the use to the
/// PHI. The loop carried dependence is represented as an anti dependence
/// edge. This function also removes chain dependences between unrelated
/// Phis.
void SwingSchedulerDAG::updatePhiDependences() {
  SmallVector<SDep, 4> RemoveDeps;
  const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();

  // Iterate over each DAG node.
  for (SUnit &I : SUnits) {
    // Set to true if the instruction has an operand defined by a Phi.
    unsigned HasPhiUse = 0;
    unsigned HasPhiDef = 0;
    MachineInstr *MI = I.getInstr();
    // Iterate over each operand, and we process the definitions.
    for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
                                    MOE = MI->operands_end();

      unsigned Reg = MOI->getReg();

      // If the register is used by a Phi, then create an anti dependence.
      for (MachineRegisterInfo::use_instr_iterator
               UI = MRI.use_instr_begin(Reg),
               UE = MRI.use_instr_end();

        MachineInstr *UseMI = &*UI;
        SUnit *SU = getSUnit(UseMI);
        if (SU != nullptr && UseMI->isPHI()) {

          SDep Dep(SU, SDep::Anti, Reg);

          // Add a chain edge to a dependent Phi that isn't an existing
          // dependence.
          if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
            I.addPred(SDep(SU, SDep::Barrier));

    } else if (MOI->isUse()) {
      // If the register is defined by a Phi, then create a true dependence.
      MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
      if (DefMI == nullptr)
        continue;
      SUnit *SU = getSUnit(DefMI);
      if (SU != nullptr && DefMI->isPHI()) {

        SDep Dep(SU, SDep::Data, Reg);

        ST.adjustSchedDependency(SU, &I, Dep);

        // Add a chain edge to a dependent Phi that isn't an existing
        // dependence.
        if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
          I.addPred(SDep(SU, SDep::Barrier));

    // Remove order dependences from an unrelated Phi.

    for (auto &PI : I.Preds) {
      MachineInstr *PMI = PI.getSUnit()->getInstr();
      if (PMI->isPHI() && PI.getKind() == SDep::Order) {
        if (I.getInstr()->isPHI()) {
          if (PMI->getOperand(0).getReg() == HasPhiUse)
            continue;
          if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
            continue;
        }
        RemoveDeps.push_back(PI);

    for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
      I.removePred(RemoveDeps[i]);
/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
    int64_t NewOffset = 0;
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);

    SUnit *DefSU = getSUnit(DefMI);

    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);

    SUnit *LastSU = getSUnit(LastMI);

    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    SmallVector<SDep, 4> Deps;
    for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;

      if (P->getSUnit() == DefSU)

    for (int i = 0, e = Deps.size(); i != e; i++) {
      Topo.RemovePred(&I, Deps[i].getSUnit());
      I.removePred(Deps[i]);

    // Remove the chain dependence between the instructions.

    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)

    for (int i = 0, e = Deps.size(); i != e; i++) {
      Topo.RemovePred(LastSU, Deps[i].getSUnit());
      LastSU->removePred(Deps[i]);

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  DenseMap<unsigned, unsigned> Resources;

  FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
    unsigned schedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    for (const InstrStage *IS = InstrItins->beginStage(schedClass),
                          *IE = InstrItins->endStage(schedClass);

      unsigned funcUnits = IS->getUnits();
      unsigned numAlternatives = countPopulation(funcUnits);
      if (numAlternatives < min) {
        min = numAlternatives;

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
                          *IE = InstrItins->endStage(SchedClass);

      unsigned FuncUnits = IS->getUnits();
      if (countPopulation(FuncUnits) == 1)
        Resources[FuncUnits]++;

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    unsigned F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    if (MFUs1 == 1 && MFUs2 == 1)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;

} // end anonymous namespace
/// Calculate the resource constrained minimum initiation interval for the
/// specified loop. We use the DFA to model the resources needed for
/// each instruction, and we ignore dependences. A different DFA is created
/// for each cycle that is required. When adding a new instruction, we attempt
/// to add it to each existing DFA, until a legal space is found. If the
/// instruction cannot be reserved in an existing DFA, we create a new one.
unsigned SwingSchedulerDAG::calculateResMII() {
  SmallVector<DFAPacketizer *, 8> Resources;
  MachineBasicBlock *MBB = Loop.getHeader();
  Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));

  // Sort the instructions by the number of available choices for scheduling,
  // least to most. Use the number of critical resources as the tie breaker.
  FuncUnitSorter FUS =
      FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
  for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
                                   E = MBB->getFirstTerminator();

    FUS.calcCriticalResources(*I);
  PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>

  for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
                                   E = MBB->getFirstTerminator();

    FuncUnitOrder.push(&*I);

  while (!FuncUnitOrder.empty()) {
    MachineInstr *MI = FuncUnitOrder.top();

    if (TII->isZeroCost(MI->getOpcode()))
      continue;

    // Attempt to reserve the instruction in an existing DFA. At least one
    // DFA is needed for each cycle.
    unsigned NumCycles = getSUnit(MI)->Latency;
    unsigned ReservedCycles = 0;
    SmallVectorImpl<DFAPacketizer *>::iterator RI = Resources.begin();
    SmallVectorImpl<DFAPacketizer *>::iterator RE = Resources.end();
    for (unsigned C = 0; C < NumCycles; ++C)

      if ((*RI++)->canReserveResources(*MI)) {

    // Start reserving resources using existing DFAs.
    for (unsigned C = 0; C < ReservedCycles; ++C) {

      (*RI)->reserveResources(*MI);

    // Add new DFAs, if needed, to reserve resources.
    for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
      DFAPacketizer *NewResource =
          TII->CreateTargetScheduleState(MF.getSubtarget());
      assert(NewResource->canReserveResources(*MI) && "Reserve error.");
      NewResource->reserveResources(*MI);
      Resources.push_back(NewResource);

  int Resmii = Resources.size();
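  // Each DFA models the functional units of one issue cycle, so the number of
  // DFAs needed to reserve every instruction is the resource-constrained MII.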
  // Delete the memory for each of the DFAs that were created earlier.
  for (DFAPacketizer *RI : Resources) {
    DFAPacketizer *D = RI;
/// Calculate the recurrence-constrained minimum initiation interval.
/// Iterate over each circuit. Compute the delay(c) and distance(c)
/// for each circuit. The II needs to satisfy the inequality
/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
/// II that satisfies the inequality, and the RecMII is the maximum
/// of those II values.
unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {

  for (NodeSet &Nodes : NodeSets) {

    unsigned Delay = Nodes.getLatency();
    unsigned Distance = 1;

    // ii = ceil(delay / distance)
    unsigned CurMII = (Delay + Distance - 1) / Distance;
    Nodes.setRecMII(CurMII);
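    // With the loop-carried distance fixed at 1, the constraint
    // delay(c) - II * distance(c) <= 0 reduces to II >= Delay for this circuit.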
/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
/// but we do this to find the circuits, and then change them back.
static void swapAntiDependences(std::vector<SUnit> &SUnits) {
  SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();

      if (IP->getKind() != SDep::Anti)
        continue;
      DepsAdded.push_back(std::make_pair(SU, *IP));

  for (SmallVector<std::pair<SUnit *, SDep>, 8>::iterator I = DepsAdded.begin(),

    // Remove this anti dependency and add one in the reverse direction.
    SUnit *SU = I->first;

    SUnit *TargetSU = D.getSUnit();
    unsigned Reg = D.getReg();
    unsigned Lat = D.getLatency();

    SDep Dep(SU, SDep::Anti, Reg);

    TargetSU->addPred(Dep);
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {

    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &SI : SUnits[i].Succs) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (SI.getKind() == SDep::Output) {
        int N = SI.getSUnit()->NodeNum;
        int BackEdge = i;
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      // A back-edge is processed only if it goes to a Phi.
      if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
          (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
        continue;
      int N = SI.getSUnit()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);

    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &PI : SUnits[i].Preds) {
      if (!SUnits[i].getInstr()->mayStore() ||
          !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
        continue;
      if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
        int N = PI.getSUnit()->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);

  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,

  SUnit *SV = &SUnits[V];
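  // MaxPaths (5) bounds how many circuits are recorded from each start node,
  // keeping Johnson's circuit search from exploding on dense dependence graphs.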
  for (auto W : AdjK[V]) {
    if (NumPaths > MaxPaths)
      break;

      NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));

    } else if (!Blocked.test(W)) {
      if (circuit(W, S, NodeSets,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))

  for (auto W : AdjK[V]) {

    if (B[W].count(SV) == 0)
/// Unblock a node in the circuit finding algorithm.
void SwingSchedulerDAG::Circuits::unblock(int U) {

  SmallPtrSet<SUnit *, 4> &BU = B[U];
  while (!BU.empty()) {
    SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
    assert(SI != BU.end() && "Invalid B set.");

    if (Blocked.test(W->NodeNum))
      unblock(W->NodeNum);
/// Identify all the elementary circuits in the dependence graph using
/// Johnson's circuit algorithm.
void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
  // Swap all the anti dependences in the DAG. That means it is no longer a
  // DAG, but we do this to find the circuits, and then change them back.
  swapAntiDependences(SUnits);

  Circuits Cir(SUnits, Topo);
  // Create the adjacency structure.
  Cir.createAdjacencyStructure(this);
  for (int i = 0, e = SUnits.size(); i != e; ++i) {

    Cir.circuit(i, i, NodeSets);

  // Change the dependences back so that we've created a DAG again.
  swapAntiDependences(SUnits);
// Create artificial dependencies between the source of a COPY/REG_SEQUENCE
// that is loop-carried to the USE in the next iteration. This helps the
// pipeliner avoid additional copies that are needed across iterations. An
// artificial dependence edge is added from the USE to the SOURCE of the
// COPY/REG_SEQUENCE:
//
//   PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
//   SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
//   PHI-------True-Dep------> USEOfPhi
//
// The mutation creates
//   USEOfPHI -------Artificial-Dep---> SRCOfCopy
//
// Overall this ensures that USEOfPHI is scheduled before SRCOfCopy (since USE
// is now a predecessor), which in turn means the COPY/REG_SEQUENCE is
// scheduled late enough to avoid additional copies across iterations. The
// resulting scheduling order would be:
//   USEOfPHI --- SRCOfCopy --- COPY/REG_SEQUENCE.
void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Find the COPY/REG_SEQUENCE instruction.
    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
      continue;

    // Record the loop carried PHIs.
    SmallVector<SUnit *, 4> PHISUs;
    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
    SmallVector<SUnit *, 4> SrcSUs;

    for (auto &Dep : SU.Preds) {
      SUnit *TmpSU = Dep.getSUnit();
      MachineInstr *TmpMI = TmpSU->getInstr();
      SDep::Kind DepKind = Dep.getKind();
      // Save the loop carried PHI.
      if (DepKind == SDep::Anti && TmpMI->isPHI())
        PHISUs.push_back(TmpSU);
      // Save the source of COPY/REG_SEQUENCE.
      // If the source has no predecessors, we will end up creating cycles.
      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
        SrcSUs.push_back(TmpSU);

    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
      continue;

    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    // SUnit to the container.
    SmallVector<SUnit *, 8> UseSUs;
    for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) {
      for (auto &Dep : (*I)->Succs) {
        if (Dep.getKind() != SDep::Data)
          continue;

        SUnit *TmpSU = Dep.getSUnit();
        MachineInstr *TmpMI = TmpSU->getInstr();
        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
          PHISUs.push_back(TmpSU);

        UseSUs.push_back(TmpSU);

    if (UseSUs.size() == 0)
      continue;

    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    // Add the artificial dependencies if it does not form a cycle.
    for (auto I : UseSUs) {
      for (auto Src : SrcSUs) {
        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
          Src->addPred(SDep(I, SDep::Artificial));
          SDAG->Topo.AddPred(Src, I);
/// Return true for DAG nodes that we ignore when computing the cost functions.
/// We ignore the back-edge recurrence in order to avoid unbounded recursion
/// in the calculation of the ASAP, ALAP, etc functions.
static bool ignoreDependence(const SDep &D, bool isPred) {
  if (D.isArtificial())
    return true;
  return D.getKind() == SDep::Anti && isPred;
/// Compute several functions needed to order the nodes for scheduling.
///  ASAP - Earliest time to schedule a node.
///  ALAP - Latest time to schedule a node.
///  MOV - Mobility function, difference between ALAP and ASAP.
///  D - Depth of each node.
///  H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),

    const SUnit &SU = SUnits[*I];

  // Compute ASAP and ZeroLatencyDepth.
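  // In topological order: ASAP(u) = max over preds v of
  //   ASAP(v) + latency(v,u) - distance(v,u) * MII,
  // ignoring back-edge (anti) dependences so the recursion stays bounded.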
  for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),

    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[*I];
    for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
                                    EP = SU->Preds.end();

      SUnit *pred = IP->getSUnit();
      if (IP->getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
      if (ignoreDependence(*IP, true))
        continue;
      asap = std::max(asap, (int)(getASAP(pred) + IP->getLatency() -
                                  getDistance(pred, SU, *IP) * MII));

    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[*I].ASAP = asap;
    ScheduleInfo[*I].ZeroLatencyDepth = zeroLatencyDepth;

  // Compute ALAP, ZeroLatencyHeight, and MOV.
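  // In reverse topological order: ALAP(u) = min over succs w of
  //   ALAP(w) - latency(u,w) + distance(u,w) * MII,
  // and MOV(u) = ALAP(u) - ASAP(u) measures the node's scheduling freedom.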
  for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),

    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[*I];
    for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
                                    ES = SU->Succs.end();

      SUnit *succ = IS->getSUnit();
      if (IS->getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
      if (ignoreDependence(*IS, true))
        continue;
      alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() +
                                  getDistance(SU, succ, *IS) * MII));

    ScheduleInfo[*I].ALAP = alap;
    ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight;

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  for (unsigned i = 0; i < SUnits.size(); i++) {
    dbgs() << "\tNode " << i << ":\n";
    dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
    dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
    dbgs() << "\t MOV  = " << getMOV(&SUnits[i]) << "\n";
    dbgs() << "\t D    = " << getDepth(&SUnits[i]) << "\n";
    dbgs() << "\t H    = " << getHeight(&SUnits[i]) << "\n";
    dbgs() << "\t ZLD  = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
    dbgs() << "\t ZLH  = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
/// as the predecessors of the elements of NodeOrder that are not also in
/// NodeOrder.
static bool pred_L(SetVector<SUnit *> &NodeOrder,
                   SmallSetVector<SUnit *, 8> &Preds,
                   const NodeSet *S = nullptr) {

  for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();

    for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();

      if (S && S->count(PI->getSUnit()) == 0)
        continue;
      if (ignoreDependence(*PI, true))
        continue;
      if (NodeOrder.count(PI->getSUnit()) == 0)
        Preds.insert(PI->getSUnit());

    // Back-edges are predecessors with an anti-dependence.
    for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
                                    ES = (*I)->Succs.end();

      if (IS->getKind() != SDep::Anti)
        continue;
      if (S && S->count(IS->getSUnit()) == 0)
        continue;
      if (NodeOrder.count(IS->getSUnit()) == 0)
        Preds.insert(IS->getSUnit());

  return !Preds.empty();
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
/// as the successors of the elements of NodeOrder that are not also in
/// NodeOrder.
static bool succ_L(SetVector<SUnit *> &NodeOrder,
                   SmallSetVector<SUnit *, 8> &Succs,
                   const NodeSet *S = nullptr) {

  for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end();

    for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();

      if (S && S->count(SI->getSUnit()) == 0)
        continue;
      if (ignoreDependence(*SI, false))
        continue;
      if (NodeOrder.count(SI->getSUnit()) == 0)
        Succs.insert(SI->getSUnit());

    for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
                                    PE = (*I)->Preds.end();

      if (PI->getKind() != SDep::Anti)
        continue;
      if (S && S->count(PI->getSUnit()) == 0)
        continue;
      if (NodeOrder.count(PI->getSUnit()) == 0)
        Succs.insert(PI->getSUnit());

  return !Succs.empty();
/// Return true if there is a path from the specified node to any of the nodes
/// in DestNodes. Keep track and return the nodes in any path.
static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
                        SetVector<SUnit *> &DestNodes,
                        SetVector<SUnit *> &Exclude,
                        SmallPtrSet<SUnit *, 8> &Visited) {
  if (Cur->isBoundaryNode())
    return false;
  if (Exclude.count(Cur) != 0)
    return false;
  if (DestNodes.count(Cur) != 0)
    return true;
  if (!Visited.insert(Cur).second)
    return Path.count(Cur) != 0;
  bool FoundPath = false;
  for (auto &SI : Cur->Succs)
    FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
  for (auto &PI : Cur->Preds)
    if (PI.getKind() == SDep::Anti)
      FoundPath |=
          computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
/// Return true if Set1 is a subset of Set2.
template <class S1Ty, class S2Ty> static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
  for (typename S1Ty::iterator I = Set1.begin(), E = Set1.end(); I != E; ++I)
    if (Set2.count(*I) == 0)
      return false;
  return true;
}
/// Compute the live-out registers for the instructions in a node-set.
/// The live-out registers are those that are defined in the node-set,
/// but not used. Except for use operands of Phis.
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,

  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<RegisterMaskPair, 8> LiveOutRegs;
  SmallSet<unsigned, 4> Uses;
  for (SUnit *SU : NS) {
    const MachineInstr *MI = SU->getInstr();

    for (const MachineOperand &MO : MI->operands())
      if (MO.isReg() && MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (TargetRegisterInfo::isVirtualRegister(Reg))
          Uses.insert(Reg);
        else if (MRI.isAllocatable(Reg))
          for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
            Uses.insert(*Units);
      }
  }
  for (SUnit *SU : NS)
    for (const MachineOperand &MO : SU->getInstr()->operands())
      if (MO.isReg() && MO.isDef() && !MO.isDead()) {
        unsigned Reg = MO.getReg();
        if (TargetRegisterInfo::isVirtualRegister(Reg)) {
          if (!Uses.count(Reg))
            LiveOutRegs.push_back(RegisterMaskPair(Reg,
                                                   LaneBitmask::getNone()));
        } else if (MRI.isAllocatable(Reg)) {
          for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
            if (!Uses.count(*Units))
              LiveOutRegs.push_back(RegisterMaskPair(*Units,
                                                     LaneBitmask::getNone()));

  RPTracker.addLiveRegs(LiveOutRegs);
/// A heuristic to filter nodes in recurrent node-sets if the register
/// pressure of a set is too high.
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
  for (auto &NS : NodeSets) {
    // Skip small node-sets since they won't cause register pressure problems.

    IntervalPressure RecRegPressure;
    RegPressureTracker RecRPTracker(RecRegPressure);
    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    computeLiveOuts(MF, RecRPTracker, NS);
    RecRPTracker.closeBottom();

    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
      return A->NodeNum > B->NodeNum;
    });

    for (auto &SU : SUnits) {
      // Since we're computing the register pressure for a subset of the
      // instructions in a block, we need to set the tracker for each
      // instruction in the node-set. The tracker is set to the instruction
      // just after the one we're interested in.
      MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
      RecRPTracker.setPos(std::next(CurInstI));

      RegPressureDelta RPDelta;
      ArrayRef<PressureChange> CriticalPSets;
      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
                                             CriticalPSets,
                                             RecRegPressure.MaxSetPressure);
      if (RPDelta.Excess.isValid()) {
        LLVM_DEBUG(
            dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
                   << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
                   << ":" << RPDelta.Excess.getUnitInc());
        NS.setExceedPressure(SU);

      RecRPTracker.recede();
/// A heuristic to colocate node sets that have the same set of
/// successors.
void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
  unsigned Colocate = 0;
  for (int i = 0, e = NodeSets.size(); i < e; ++i) {
    NodeSet &N1 = NodeSets[i];
    SmallSetVector<SUnit *, 8> S1;
    if (N1.empty() || !succ_L(N1, S1))
      continue;
    for (int j = i + 1; j < e; ++j) {
      NodeSet &N2 = NodeSets[j];
      if (N1.compareRecMII(N2) != 0)
        continue;
      SmallSetVector<SUnit *, 8> S2;
      if (N2.empty() || !succ_L(N2, S2))
        continue;
      if (isSubset(S1, S2) && S1.size() == S2.size()) {
        N1.setColocate(++Colocate);
        N2.setColocate(Colocate);
/// Check if the existing node-sets are profitable. If not, then ignore the
/// recurrent node-sets, and attempt to schedule all nodes together. This is
/// a heuristic. If the MII is large and all the recurrent node-sets are small,
/// then it's best to try to schedule all instructions together instead of
/// starting with the recurrent node-sets.
void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
  // Look for loops with a large MII.

  // Check if the node-set contains only a simple add recurrence.
  for (auto &NS : NodeSets) {
    if (NS.getRecMII() > 2)
      return;
    if (NS.getMaxDepth() > MII)
      return;
  }

  LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
/// Add the nodes that do not belong to a recurrence set into groups
/// based upon connected components.
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
  SetVector<SUnit *> NodesAdded;
  SmallPtrSet<SUnit *, 8> Visited;
  // Add the nodes that are on a path between the previous node sets and
  // the current node set.
  for (NodeSet &I : NodeSets) {
    SmallSetVector<SUnit *, 8> N;
    // Add the nodes from the current node set to the previous node set.

      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {

        computePath(NI, Path, NodesAdded, I, Visited);

        I.insert(Path.begin(), Path.end());

    // Add the nodes from the previous node set to the current node set.

    if (succ_L(NodesAdded, N)) {
      SetVector<SUnit *> Path;
      for (SUnit *NI : N) {

        computePath(NI, Path, I, NodesAdded, Visited);

        I.insert(Path.begin(), Path.end());

    NodesAdded.insert(I.begin(), I.end());

  // Create a new node set with the connected nodes of any successor of a node
  // in a recurrent set.

  SmallSetVector<SUnit *, 8> N;
  if (succ_L(NodesAdded, N))

    addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create a new node set with the connected nodes of any predecessor of a
  // node in a recurrent set.

  if (pred_L(NodesAdded, N))

    addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create new node sets with the connected nodes of any remaining node that
  // has no predecessor.
  for (unsigned i = 0; i < SUnits.size(); ++i) {
    SUnit *SU = &SUnits[i];
    if (NodesAdded.count(SU) == 0) {

      addConnectedNodes(SU, NewSet, NodesAdded);
      if (!NewSet.empty())
        NodeSets.push_back(NewSet);
/// Add the node to the set, and add all of its connected nodes to the set.
void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
                                          SetVector<SUnit *> &NodesAdded) {
  NewSet.insert(SU);
  NodesAdded.insert(SU);
  for (auto &SI : SU->Succs) {
    SUnit *Successor = SI.getSUnit();
    if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
      addConnectedNodes(Successor, NewSet, NodesAdded);
  }
  for (auto &PI : SU->Preds) {
    SUnit *Predecessor = PI.getSUnit();
    if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
      addConnectedNodes(Predecessor, NewSet, NodesAdded);
/// Return true if Set1 contains elements in Set2. The elements in common
/// are returned in a different container.
static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
                        SmallSetVector<SUnit *, 8> &Result) {

  for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
    SUnit *SU = Set1[i];
    if (Set2.count(SU) != 0)
      Result.insert(SU);
  }
  return !Result.empty();
/// Merge the recurrence node sets that have the same initial node.
void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;

    for (NodeSetType::iterator J = I + 1; J != E;) {

      if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
        if (NJ.compareRecMII(NI) > 0)
          NI.setRecMII(NJ.getRecMII());
        for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
/// Remove nodes that have been scheduled in previous NodeSets.
void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
  for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;

    for (NodeSetType::iterator J = I + 1; J != E;) {
      J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
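/// Then the nodes in each set are ordered by alternating top-down and
/// bottom-up traversals: top-down picks the node with the maximum height,
/// bottom-up the node with the maximum depth, with ZeroLatencyHeight/Depth
/// and then mobility (MOV) as tie breakers.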
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  SmallSetVector<SUnit *, 8> R;

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");

    SmallSetVector<SUnit *, 8> N;
    if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
      R.insert(N.begin(), N.end());

      LLVM_DEBUG(dbgs() << "  Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
      R.insert(N.begin(), N.end());

      LLVM_DEBUG(dbgs() << "  Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.

      LLVM_DEBUG(dbgs() << "  Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      for (auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);

      LLVM_DEBUG(dbgs() << "  Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }

      LLVM_DEBUG(dbgs() << "  Bottom up (default) ");
    }

    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(I) ==
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          for (const auto &I : maxHeight->Succs) {
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.count(I.getSUnit()) != 0)
              continue;
            if (ignoreDependence(I, false))
              continue;
            R.insert(I.getSUnit());
          }
          // Back-edges are predecessors with an anti-dependence.
          for (const auto &I : maxHeight->Preds) {
            if (I.getKind() != SDep::Anti)
              continue;
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.count(I.getSUnit()) != 0)
              continue;
            R.insert(I.getSUnit());
          }
        }

        LLVM_DEBUG(dbgs() << "\n   Switching order to bottom up ");
        SmallSetVector<SUnit *, 8> N;
        if (pred_L(NodeOrder, N, &Nodes))
          R.insert(N.begin(), N.end());
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");

          if (Nodes.isExceedSU(maxDepth)) {

            R.insert(Nodes.getNode(0));

          for (const auto &I : maxDepth->Preds) {
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.count(I.getSUnit()) != 0)
              continue;
            R.insert(I.getSUnit());
          }
          // Back-edges are predecessors with an anti-dependence.
          for (const auto &I : maxDepth->Succs) {
            if (I.getKind() != SDep::Anti)
              continue;
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.count(I.getSUnit()) != 0)
              continue;
            R.insert(I.getSUnit());
          }
        }

        LLVM_DEBUG(dbgs() << "\n   Switching order to top down ");
        SmallSetVector<SUnit *, 8> N;
        if (succ_L(NodeOrder, N, &Nodes))
          R.insert(N.begin(), N.end());

    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");

    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
/// Process the nodes in the computed order and create the pipelined schedule
/// of the instructions, if possible. Return true if a schedule is found.
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
  if (NodeOrder.empty())
    return false;

  bool scheduleFound = false;
  // Keep increasing II until a valid schedule is found.
  for (II = MII; II <= MAX_II && !scheduleFound; ++II) {

    Schedule.setInitiationInterval(II);
    LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");

    SetVector<SUnit *>::iterator NI = NodeOrder.begin();
    SetVector<SUnit *>::iterator NE = NodeOrder.end();

      // Compute the schedule time for the instruction, which is based
      // upon the scheduled time for any predecessors/successors.
      int EarlyStart = INT_MIN;
      int LateStart = INT_MAX;
      // These values are set when the size of the schedule window is limited
      // due to chain dependences.
      int SchedEnd = INT_MAX;
      int SchedStart = INT_MIN;
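      // EarlyStart is derived from already-scheduled predecessors and
      // LateStart from already-scheduled successors; the node must be placed
      // in a window of at most II cycles that satisfies both bounds.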
1834 Schedule
.computeStart(SU
, &EarlyStart
, &LateStart
, &SchedEnd
, &SchedStart
,
1837 dbgs() << "Inst (" << SU
->NodeNum
<< ") ";
1838 SU
->getInstr()->dump();
1842 dbgs() << "\tes: " << EarlyStart
<< " ls: " << LateStart
1843 << " me: " << SchedEnd
<< " ms: " << SchedStart
<< "\n";
1846 if (EarlyStart
> LateStart
|| SchedEnd
< EarlyStart
||
1847 SchedStart
> LateStart
)
1848 scheduleFound
= false;
1849 else if (EarlyStart
!= INT_MIN
&& LateStart
== INT_MAX
) {
1850 SchedEnd
= std::min(SchedEnd
, EarlyStart
+ (int)II
- 1);
1851 scheduleFound
= Schedule
.insert(SU
, EarlyStart
, SchedEnd
, II
);
1852 } else if (EarlyStart
== INT_MIN
&& LateStart
!= INT_MAX
) {
1853 SchedStart
= std::max(SchedStart
, LateStart
- (int)II
+ 1);
1854 scheduleFound
= Schedule
.insert(SU
, LateStart
, SchedStart
, II
);
1855 } else if (EarlyStart
!= INT_MIN
&& LateStart
!= INT_MAX
) {
1857 std::min(SchedEnd
, std::min(LateStart
, EarlyStart
+ (int)II
- 1));
1858 // When scheduling a Phi it is better to start at the late cycle and go
1859 // backwards. The default order may insert the Phi too far away from
1860 // its first dependence.
1861 if (SU
->getInstr()->isPHI())
1862 scheduleFound
= Schedule
.insert(SU
, SchedEnd
, EarlyStart
, II
);
1864 scheduleFound
= Schedule
.insert(SU
, EarlyStart
, SchedEnd
, II
);
1866 int FirstCycle
= Schedule
.getFirstCycle();
1867 scheduleFound
= Schedule
.insert(SU
, FirstCycle
+ getASAP(SU
),
1868 FirstCycle
+ getASAP(SU
) + II
- 1, II
);
1870 // Even if we find a schedule, make sure the schedule doesn't exceed the
1871 // allowable number of stages. We keep trying if this happens.
1873 if (SwpMaxStages
> -1 &&
1874 Schedule
.getMaxStageCount() > (unsigned)SwpMaxStages
)
1875 scheduleFound
= false;
1879 dbgs() << "\tCan't schedule\n";
1881 } while (++NI
!= NE
&& scheduleFound
);
1883 // If a schedule is found, check if it is a valid schedule too.
1885 scheduleFound
= Schedule
.isValidSchedule(this);
1888 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
<< " (II=" << II
1892 Schedule
.finalizeSchedule(this);
1896 return scheduleFound
&& Schedule
.getMaxStageCount() > 0;
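
// Worked example (illustrative only, names as above): suppose the current
// attempt uses II = 3 and computeStart produces EarlyStart = 5 with no
// LateStart constraint. The insertion window becomes
//   [EarlyStart, min(SchedEnd, EarlyStart + II - 1)] = [5, 7],
// i.e. at most II consecutive cycles are probed before II is bumped and the
// whole node order is rescheduled from scratch.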
/// Given a schedule for the loop, generate a new version of the loop,
/// and replace the old version. This function generates a prolog
/// that contains the initial iterations in the pipeline, the kernel
/// loop, and the epilog that contains the code for the final
/// iterations.
void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
  // Create a new basic block for the kernel and add it to the CFG.
  MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());

  unsigned MaxStageCount = Schedule.getMaxStageCount();

  // Remember the registers that are used in different stages. The index is
  // the iteration, or stage, that the instruction is scheduled in. This is
  // a map between register names in the original block and the names created
  // in each stage of the pipelined loop.
  ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
  InstrMapTy InstrMap;

  SmallVector<MachineBasicBlock *, 4> PrologBBs;

  // Generate the prolog instructions that set up the pipeline.
  generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
  MF.insert(BB->getIterator(), KernelBB);

  // Rearrange the instructions to generate the new, pipelined loop,
  // and update register names as needed.
  for (int Cycle = Schedule.getFirstCycle(),
           LastCycle = Schedule.getFinalCycle();
       Cycle <= LastCycle; ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
    // This inner loop schedules each instruction in the cycle.
    for (SUnit *CI : CycleInstrs) {
      if (CI->getInstr()->isPHI())
        continue;
      unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
      MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
      updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
      KernelBB->push_back(NewMI);
      InstrMap[NewMI] = CI->getInstr();
    }
  }

  // Copy any terminator instructions to the new kernel, and update
  // names as needed.
  for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
                                   E = BB->instr_end();
       I != E; ++I) {
    MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
    updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
    KernelBB->push_back(NewMI);
    InstrMap[NewMI] = &*I;
  }

  KernelBB->transferSuccessors(BB);
  KernelBB->replaceSuccessor(BB, KernelBB);

  generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
                       VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
  generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
               InstrMap, MaxStageCount, MaxStageCount, false);

  LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););

  SmallVector<MachineBasicBlock *, 4> EpilogBBs;
  // Generate the epilog instructions to complete the pipeline.
  generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
                 PrologBBs);

  // We need this step because the register allocation doesn't handle some
  // situations well, so we insert copies to help out.
  splitLifetimes(KernelBB, EpilogBBs, Schedule);

  // Remove dead instructions due to loop induction variables.
  removeDeadInstructions(KernelBB, EpilogBBs);

  // Add branches between prolog and epilog blocks.
  addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);

  // Remove the original loop since it's no longer referenced.
  for (auto &I : *BB)
    LIS.RemoveMachineInstrFromMaps(I);
  BB->clear();
  BB->eraseFromParent();

  delete[] VRMap;
}
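
// Rough shape of the generated code (illustrative sketch, assuming a schedule
// with MaxStageCount == 2):
//
//   preheader -> prolog0 -> prolog1 -> kernel (loops) -> epilog0 -> epilog1
//             -> loop exit
//
// Each prolog block starts one more iteration, the kernel executes one
// instance of every stage per trip, and the epilog blocks drain the
// iterations still in flight when the kernel exits.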
/// Generate the pipeline prolog code.
void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
                                       MachineBasicBlock *KernelBB,
                                       ValueMapTy *VRMap,
                                       MBBVectorTy &PrologBBs) {
  MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
  assert(PreheaderBB != nullptr &&
         "Need to add code to handle loops w/o preheader");
  MachineBasicBlock *PredBB = PreheaderBB;
  InstrMapTy InstrMap;

  // Generate a basic block for each stage, not including the last stage,
  // which will be generated in the kernel. Each basic block may contain
  // instructions from multiple stages/iterations.
  for (unsigned i = 0; i < LastStage; ++i) {
    // Create and insert the prolog basic block prior to the original loop
    // basic block. The original loop is removed later.
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
    PrologBBs.push_back(NewBB);
    MF.insert(BB->getIterator(), NewBB);
    NewBB->transferSuccessors(PredBB);
    PredBB->addSuccessor(NewBB);
    PredBB = NewBB;

    // Generate instructions for each appropriate stage. Process instructions
    // in original program order.
    for (int StageNum = i; StageNum >= 0; --StageNum) {
      for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
                                       BBE = BB->getFirstTerminator();
           BBI != BBE; ++BBI) {
        if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
          if (BBI->isPHI())
            continue;
          MachineInstr *NewMI =
              cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
          updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
                            VRMap);
          NewBB->push_back(NewMI);
          InstrMap[NewMI] = &*BBI;
        }
      }
    }
    rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
    LLVM_DEBUG({
      dbgs() << "prolog:\n";
      NewBB->dump();
    });
  }

  PredBB->replaceSuccessor(BB, KernelBB);

  // Check if we need to remove the branch from the preheader to the original
  // loop, and replace it with a branch to the new loop.
  unsigned numBranches = TII->removeBranch(*PreheaderBB);
  if (numBranches) {
    SmallVector<MachineOperand, 0> Cond;
    TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
  }
}
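
// Illustrative sketch (added commentary): for LastStage == 2 the loop above
// creates two prolog blocks. Prolog block i holds the instructions scheduled
// at stages i, i-1, ..., 0, so prolog0 contains stage 0 of iteration 0 and
// prolog1 contains stage 1 of iteration 0 plus stage 0 of iteration 1.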
/// Generate the pipeline epilog code. The epilog code finishes the iterations
/// that were started in either the prolog or the kernel. We create a basic
/// block for each stage that needs to complete.
void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
                                       MachineBasicBlock *KernelBB,
                                       ValueMapTy *VRMap,
                                       MBBVectorTy &EpilogBBs,
                                       MBBVectorTy &PrologBBs) {
  // We need to change the branch from the kernel to the first epilog block, so
  // this call to analyze branch uses the kernel rather than the original BB.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
  assert(!checkBranch && "generateEpilog must be able to analyze the branch");
  if (checkBranch)
    return;

  MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
  if (*LoopExitI == KernelBB)
    ++LoopExitI;
  assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
  MachineBasicBlock *LoopExitBB = *LoopExitI;

  MachineBasicBlock *PredBB = KernelBB;
  MachineBasicBlock *EpilogStart = LoopExitBB;
  InstrMapTy InstrMap;

  // Generate a basic block for each stage, not including the last stage,
  // which was generated for the kernel. Each basic block may contain
  // instructions from multiple stages/iterations.
  int EpilogStage = LastStage + 1;
  for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
    MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
    EpilogBBs.push_back(NewBB);
    MF.insert(BB->getIterator(), NewBB);

    PredBB->replaceSuccessor(LoopExitBB, NewBB);
    NewBB->addSuccessor(LoopExitBB);

    if (EpilogStart == LoopExitBB)
      EpilogStart = NewBB;

    // Add instructions to the epilog depending on the current block.
    // Process instructions in original program order.
    for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
      for (auto &BBI : *BB) {
        if (BBI.isPHI())
          continue;
        MachineInstr *In = &BBI;
        if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
          // Instructions with memoperands in the epilog are updated with
          // conservative values.
          MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
          updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
          NewBB->push_back(NewMI);
          InstrMap[NewMI] = In;
        }
      }
    }
    generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
                         VRMap, InstrMap, LastStage, EpilogStage, i == 1);
    generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
                 InstrMap, LastStage, EpilogStage, i == 1);
    PredBB = NewBB;

    LLVM_DEBUG({
      dbgs() << "epilog:\n";
      NewBB->dump();
    });
  }

  // Fix any Phi nodes in the loop exit block.
  for (MachineInstr &MI : *LoopExitBB) {
    if (!MI.isPHI())
      break;
    for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
      MachineOperand &MO = MI.getOperand(i);
      if (MO.getMBB() == BB)
        MO.setMBB(PredBB);
    }
  }

  // Create a branch to the new epilog from the kernel.
  // Remove the original branch and add a new branch to the epilog.
  TII->removeBranch(*KernelBB);
  TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
  // Add a branch to the loop exit.
  if (EpilogBBs.size() > 0) {
    MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
    SmallVector<MachineOperand, 4> Cond1;
    TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
  }
}
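
// Illustrative sketch (added commentary): with LastStage == 2, the first
// epilog block generated above (i == 2) completes only the final stage of the
// oldest in-flight iteration, while the next one (i == 1) completes stages 1
// and 2 of the remaining iteration, mirroring the prolog blocks in reverse.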
/// Replace all uses of FromReg that appear outside the specified
/// basic block with ToReg.
static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
                                    MachineBasicBlock *MBB,
                                    MachineRegisterInfo &MRI,
                                    LiveIntervals &LIS) {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
                                         E = MRI.use_end();
       I != E;) {
    MachineOperand &O = *I;
    ++I;
    if (O.getParent()->getParent() != MBB)
      O.setReg(ToReg);
  }
  if (!LIS.hasInterval(ToReg))
    LIS.createEmptyInterval(ToReg);
}

/// Return true if the register has a use that occurs outside the
/// specified loop.
static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
                            MachineRegisterInfo &MRI) {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
                                         E = MRI.use_end();
       I != E; ++I)
    if (I->getParent()->getParent() != BB)
      return true;
  return false;
}
2169 /// Generate Phis for the specific block in the generated pipelined code.
2170 /// This function looks at the Phis from the original code to guide the
2171 /// creation of new Phis.
2172 void SwingSchedulerDAG::generateExistingPhis(
2173 MachineBasicBlock
*NewBB
, MachineBasicBlock
*BB1
, MachineBasicBlock
*BB2
,
2174 MachineBasicBlock
*KernelBB
, SMSchedule
&Schedule
, ValueMapTy
*VRMap
,
2175 InstrMapTy
&InstrMap
, unsigned LastStageNum
, unsigned CurStageNum
,
2177 // Compute the stage number for the initial value of the Phi, which
2178 // comes from the prolog. The prolog to use depends on to which kernel/
2179 // epilog that we're adding the Phi.
2180 unsigned PrologStage
= 0;
2181 unsigned PrevStage
= 0;
2182 bool InKernel
= (LastStageNum
== CurStageNum
);
2184 PrologStage
= LastStageNum
- 1;
2185 PrevStage
= CurStageNum
;
2187 PrologStage
= LastStageNum
- (CurStageNum
- LastStageNum
);
2188 PrevStage
= LastStageNum
+ (CurStageNum
- LastStageNum
) - 1;
2191 for (MachineBasicBlock::iterator BBI
= BB
->instr_begin(),
2192 BBE
= BB
->getFirstNonPHI();
2193 BBI
!= BBE
; ++BBI
) {
2194 unsigned Def
= BBI
->getOperand(0).getReg();
2196 unsigned InitVal
= 0;
2197 unsigned LoopVal
= 0;
2198 getPhiRegs(*BBI
, BB
, InitVal
, LoopVal
);
2200 unsigned PhiOp1
= 0;
2201 // The Phi value from the loop body typically is defined in the loop, but
2202 // not always. So, we need to check if the value is defined in the loop.
2203 unsigned PhiOp2
= LoopVal
;
2204 if (VRMap
[LastStageNum
].count(LoopVal
))
2205 PhiOp2
= VRMap
[LastStageNum
][LoopVal
];
2207 int StageScheduled
= Schedule
.stageScheduled(getSUnit(&*BBI
));
2209 Schedule
.stageScheduled(getSUnit(MRI
.getVRegDef(LoopVal
)));
2210 unsigned NumStages
= Schedule
.getStagesForReg(Def
, CurStageNum
);
2211 if (NumStages
== 0) {
2212 // We don't need to generate a Phi anymore, but we need to rename any uses
2213 // of the Phi value.
2214 unsigned NewReg
= VRMap
[PrevStage
][LoopVal
];
2215 rewriteScheduledInstr(NewBB
, Schedule
, InstrMap
, CurStageNum
, 0, &*BBI
,
2216 Def
, InitVal
, NewReg
);
2217 if (VRMap
[CurStageNum
].count(LoopVal
))
2218 VRMap
[CurStageNum
][Def
] = VRMap
[CurStageNum
][LoopVal
];
2220 // Adjust the number of Phis needed depending on the number of prologs left,
2221 // and the distance from where the Phi is first scheduled. The number of
2222 // Phis cannot exceed the number of prolog stages. Each stage can
2223 // potentially define two values.
2224 unsigned MaxPhis
= PrologStage
+ 2;
2225 if (!InKernel
&& (int)PrologStage
<= LoopValStage
)
2226 MaxPhis
= std::max((int)MaxPhis
- (int)LoopValStage
, 1);
2227 unsigned NumPhis
= std::min(NumStages
, MaxPhis
);
2229 unsigned NewReg
= 0;
2230 unsigned AccessStage
= (LoopValStage
!= -1) ? LoopValStage
: StageScheduled
;
2231 // In the epilog, we may need to look back one stage to get the correct
2232 // Phi name because the epilog and prolog blocks execute the same stage.
2233 // The correct name is from the previous block only when the Phi has
2234 // been completely scheduled prior to the epilog, and Phi value is not
2235 // needed in multiple stages.
2237 if (!InKernel
&& StageScheduled
>= LoopValStage
&& AccessStage
== 0 &&
2240 // Adjust the computations below when the phi and the loop definition
2241 // are scheduled in different stages.
2242 if (InKernel
&& LoopValStage
!= -1 && StageScheduled
> LoopValStage
)
2243 StageDiff
= StageScheduled
- LoopValStage
;
2244 for (unsigned np
= 0; np
< NumPhis
; ++np
) {
2245 // If the Phi hasn't been scheduled, then use the initial Phi operand
2246 // value. Otherwise, use the scheduled version of the instruction. This
2247 // is a little complicated when a Phi references another Phi.
2248 if (np
> PrologStage
|| StageScheduled
>= (int)LastStageNum
)
2250 // Check if the Phi has already been scheduled in a prolog stage.
2251 else if (PrologStage
>= AccessStage
+ StageDiff
+ np
&&
2252 VRMap
[PrologStage
- StageDiff
- np
].count(LoopVal
) != 0)
2253 PhiOp1
= VRMap
[PrologStage
- StageDiff
- np
][LoopVal
];
2254 // Check if the Phi has already been scheduled, but the loop instruction
2255 // is either another Phi, or doesn't occur in the loop.
2256 else if (PrologStage
>= AccessStage
+ StageDiff
+ np
) {
2257 // If the Phi references another Phi, we need to examine the other
2258 // Phi to get the correct value.
2260 MachineInstr
*InstOp1
= MRI
.getVRegDef(PhiOp1
);
2262 while (InstOp1
&& InstOp1
->isPHI() && InstOp1
->getParent() == BB
) {
2263 int PhiStage
= Schedule
.stageScheduled(getSUnit(InstOp1
));
2264 if ((int)(PrologStage
- StageDiff
- np
) < PhiStage
+ Indirects
)
2265 PhiOp1
= getInitPhiReg(*InstOp1
, BB
);
2267 PhiOp1
= getLoopPhiReg(*InstOp1
, BB
);
2268 InstOp1
= MRI
.getVRegDef(PhiOp1
);
2269 int PhiOpStage
= Schedule
.stageScheduled(getSUnit(InstOp1
));
2270 int StageAdj
= (PhiOpStage
!= -1 ? PhiStage
- PhiOpStage
: 0);
2271 if (PhiOpStage
!= -1 && PrologStage
- StageAdj
>= Indirects
+ np
&&
2272 VRMap
[PrologStage
- StageAdj
- Indirects
- np
].count(PhiOp1
)) {
2273 PhiOp1
= VRMap
[PrologStage
- StageAdj
- Indirects
- np
][PhiOp1
];
2280 // If this references a generated Phi in the kernel, get the Phi operand
2281 // from the incoming block.
2282 if (MachineInstr
*InstOp1
= MRI
.getVRegDef(PhiOp1
))
2283 if (InstOp1
->isPHI() && InstOp1
->getParent() == KernelBB
)
2284 PhiOp1
= getInitPhiReg(*InstOp1
, KernelBB
);
2286 MachineInstr
*PhiInst
= MRI
.getVRegDef(LoopVal
);
2287 bool LoopDefIsPhi
= PhiInst
&& PhiInst
->isPHI();
2288 // In the epilog, a map lookup is needed to get the value from the kernel,
2289 // or previous epilog block. How is does this depends on if the
2290 // instruction is scheduled in the previous block.
2292 int StageDiffAdj
= 0;
2293 if (LoopValStage
!= -1 && StageScheduled
> LoopValStage
)
2294 StageDiffAdj
= StageScheduled
- LoopValStage
;
2295 // Use the loop value defined in the kernel, unless the kernel
2296 // contains the last definition of the Phi.
2297 if (np
== 0 && PrevStage
== LastStageNum
&&
2298 (StageScheduled
!= 0 || LoopValStage
!= 0) &&
2299 VRMap
[PrevStage
- StageDiffAdj
].count(LoopVal
))
2300 PhiOp2
= VRMap
[PrevStage
- StageDiffAdj
][LoopVal
];
2301 // Use the value defined by the Phi. We add one because we switch
2302 // from looking at the loop value to the Phi definition.
2303 else if (np
> 0 && PrevStage
== LastStageNum
&&
2304 VRMap
[PrevStage
- np
+ 1].count(Def
))
2305 PhiOp2
= VRMap
[PrevStage
- np
+ 1][Def
];
2306 // Use the loop value defined in the kernel.
2307 else if (static_cast<unsigned>(LoopValStage
) > PrologStage
+ 1 &&
2308 VRMap
[PrevStage
- StageDiffAdj
- np
].count(LoopVal
))
2309 PhiOp2
= VRMap
[PrevStage
- StageDiffAdj
- np
][LoopVal
];
2310 // Use the value defined by the Phi, unless we're generating the first
2311 // epilog and the Phi refers to a Phi in a different stage.
2312 else if (VRMap
[PrevStage
- np
].count(Def
) &&
2313 (!LoopDefIsPhi
|| PrevStage
!= LastStageNum
))
2314 PhiOp2
= VRMap
[PrevStage
- np
][Def
];
2317 // Check if we can reuse an existing Phi. This occurs when a Phi
2318 // references another Phi, and the other Phi is scheduled in an
2319 // earlier stage. We can try to reuse an existing Phi up until the last
2320 // stage of the current Phi.
2322 if (static_cast<int>(PrologStage
- np
) >= StageScheduled
) {
2323 int LVNumStages
= Schedule
.getStagesForPhi(LoopVal
);
2324 int StageDiff
= (StageScheduled
- LoopValStage
);
2325 LVNumStages
-= StageDiff
;
2326 // Make sure the loop value Phi has been processed already.
2327 if (LVNumStages
> (int)np
&& VRMap
[CurStageNum
].count(LoopVal
)) {
2329 unsigned ReuseStage
= CurStageNum
;
2330 if (Schedule
.isLoopCarried(this, *PhiInst
))
2331 ReuseStage
-= LVNumStages
;
2332 // Check if the Phi to reuse has been generated yet. If not, then
2333 // there is nothing to reuse.
2334 if (VRMap
[ReuseStage
- np
].count(LoopVal
)) {
2335 NewReg
= VRMap
[ReuseStage
- np
][LoopVal
];
2337 rewriteScheduledInstr(NewBB
, Schedule
, InstrMap
, CurStageNum
, np
,
2338 &*BBI
, Def
, NewReg
);
2339 // Update the map with the new Phi name.
2340 VRMap
[CurStageNum
- np
][Def
] = NewReg
;
2342 if (VRMap
[LastStageNum
- np
- 1].count(LoopVal
))
2343 PhiOp2
= VRMap
[LastStageNum
- np
- 1][LoopVal
];
2345 if (IsLast
&& np
== NumPhis
- 1)
2346 replaceRegUsesAfterLoop(Def
, NewReg
, BB
, MRI
, LIS
);
2351 if (InKernel
&& StageDiff
> 0 &&
2352 VRMap
[CurStageNum
- StageDiff
- np
].count(LoopVal
))
2353 PhiOp2
= VRMap
[CurStageNum
- StageDiff
- np
][LoopVal
];
2356 const TargetRegisterClass
*RC
= MRI
.getRegClass(Def
);
2357 NewReg
= MRI
.createVirtualRegister(RC
);
2359 MachineInstrBuilder NewPhi
=
2360 BuildMI(*NewBB
, NewBB
->getFirstNonPHI(), DebugLoc(),
2361 TII
->get(TargetOpcode::PHI
), NewReg
);
2362 NewPhi
.addReg(PhiOp1
).addMBB(BB1
);
2363 NewPhi
.addReg(PhiOp2
).addMBB(BB2
);
2365 InstrMap
[NewPhi
] = &*BBI
;
2367 // We define the Phis after creating the new pipelined code, so
2368 // we need to rename the Phi values in scheduled instructions.
2370 unsigned PrevReg
= 0;
2371 if (InKernel
&& VRMap
[PrevStage
- np
].count(LoopVal
))
2372 PrevReg
= VRMap
[PrevStage
- np
][LoopVal
];
2373 rewriteScheduledInstr(NewBB
, Schedule
, InstrMap
, CurStageNum
, np
, &*BBI
,
2374 Def
, NewReg
, PrevReg
);
2375 // If the Phi has been scheduled, use the new name for rewriting.
2376 if (VRMap
[CurStageNum
- np
].count(Def
)) {
2377 unsigned R
= VRMap
[CurStageNum
- np
][Def
];
2378 rewriteScheduledInstr(NewBB
, Schedule
, InstrMap
, CurStageNum
, np
, &*BBI
,
2382 // Check if we need to rename any uses that occurs after the loop. The
2383 // register to replace depends on whether the Phi is scheduled in the
2385 if (IsLast
&& np
== NumPhis
- 1)
2386 replaceRegUsesAfterLoop(Def
, NewReg
, BB
, MRI
, LIS
);
2388 // In the kernel, a dependent Phi uses the value from this Phi.
2392 // Update the map with the new Phi name.
2393 VRMap
[CurStageNum
- np
][Def
] = NewReg
;
2396 while (NumPhis
++ < NumStages
) {
2397 rewriteScheduledInstr(NewBB
, Schedule
, InstrMap
, CurStageNum
, NumPhis
,
2398 &*BBI
, Def
, NewReg
, 0);
2401 // Check if we need to rename a Phi that has been eliminated due to
2403 if (NumStages
== 0 && IsLast
&& VRMap
[CurStageNum
].count(LoopVal
))
2404 replaceRegUsesAfterLoop(Def
, VRMap
[CurStageNum
][LoopVal
], BB
, MRI
, LIS
);
/// Generate Phis for the specified block in the generated pipelined code.
/// These are new Phis needed because the definition is scheduled after the
/// use in the pipelined sequence.
void SwingSchedulerDAG::generatePhis(
    MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
    MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
    InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
    bool IsLast) {
  // Compute the stage number that contains the initial Phi value, and
  // the Phi from the previous stage.
  unsigned PrologStage = 0;
  unsigned PrevStage = 0;
  unsigned StageDiff = CurStageNum - LastStageNum;
  bool InKernel = (StageDiff == 0);
  if (InKernel) {
    PrologStage = LastStageNum - 1;
    PrevStage = CurStageNum;
  } else {
    PrologStage = LastStageNum - StageDiff;
    PrevStage = LastStageNum + StageDiff - 1;
  }

  for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
                                   BBE = BB->instr_end();
       BBI != BBE; ++BBI) {
    for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
      MachineOperand &MO = BBI->getOperand(i);
      if (!MO.isReg() || !MO.isDef() ||
          !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
        continue;

      int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
      assert(StageScheduled != -1 && "Expecting scheduled instruction.");
      unsigned Def = MO.getReg();
      unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
      // An instruction scheduled in stage 0 that is used after the loop
      // requires a phi in the epilog for the last definition from either
      // the kernel or prolog.
      if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
          hasUseAfterLoop(Def, BB, MRI))
        NumPhis = 1;
      if (!InKernel && (unsigned)StageScheduled > PrologStage)
        continue;

      unsigned PhiOp2 = VRMap[PrevStage][Def];
      if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
        if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
          PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
      // The number of Phis can't exceed the number of prolog stages. The
      // prolog stage number is zero based.
      if (NumPhis > PrologStage + 1 - StageScheduled)
        NumPhis = PrologStage + 1 - StageScheduled;
      for (unsigned np = 0; np < NumPhis; ++np) {
        unsigned PhiOp1 = VRMap[PrologStage][Def];
        if (np <= PrologStage)
          PhiOp1 = VRMap[PrologStage - np][Def];
        if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
          if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
            PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
          if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
            PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
        }
        if (!InKernel)
          PhiOp2 = VRMap[PrevStage - np][Def];

        const TargetRegisterClass *RC = MRI.getRegClass(Def);
        unsigned NewReg = MRI.createVirtualRegister(RC);

        MachineInstrBuilder NewPhi =
            BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
                    TII->get(TargetOpcode::PHI), NewReg);
        NewPhi.addReg(PhiOp1).addMBB(BB1);
        NewPhi.addReg(PhiOp2).addMBB(BB2);
        if (np == 0)
          InstrMap[NewPhi] = &*BBI;

        // Rewrite uses and update the map. The actions depend upon whether
        // we're generating code for the kernel or epilog blocks.
        if (InKernel) {
          rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
                                &*BBI, PhiOp1, NewReg);
          rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
                                &*BBI, PhiOp2, NewReg);

          PhiOp2 = NewReg;
          VRMap[PrevStage - np - 1][Def] = NewReg;
        } else {
          VRMap[CurStageNum - np][Def] = NewReg;
          if (np == NumPhis - 1)
            rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
                                  &*BBI, Def, NewReg);
        }
        if (IsLast && np == NumPhis - 1)
          replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
      }
    }
  }
}
/// Remove instructions that generate values with no uses.
/// Typically, these are induction variable operations that generate values
/// used in the loop itself. A dead instruction has a definition with
/// no uses, or uses that occur in the original loop only.
void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
                                               MBBVectorTy &EpilogBBs) {
  // For each epilog block, check that the value defined by each instruction
  // is used. If not, delete it.
  for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
                                     MBE = EpilogBBs.rend();
       MBB != MBE; ++MBB)
    for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
                                                   ME = (*MBB)->instr_rend();
         MI != ME;) {
      // From DeadMachineInstructionElim. Don't delete inline assembly.
      if (MI->isInlineAsm()) {
        ++MI;
        continue;
      }
      bool SawStore = false;
      // Check if it's safe to remove the instruction due to side effects.
      // We can, and want to, remove Phis here.
      if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
        ++MI;
        continue;
      }
      bool used = true;
      for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
                                      MOE = MI->operands_end();
           MOI != MOE; ++MOI) {
        if (!MOI->isReg() || !MOI->isDef())
          continue;
        unsigned reg = MOI->getReg();
        // Assume physical registers are used, unless they are marked dead.
        if (TargetRegisterInfo::isPhysicalRegister(reg)) {
          used = !MOI->isDead();
          if (used)
            break;
          continue;
        }
        unsigned realUses = 0;
        for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
                                               EI = MRI.use_end();
             UI != EI; ++UI) {
          // Check if there are any uses that occur only in the original
          // loop. If so, that's not a real use.
          if (UI->getParent()->getParent() != BB) {
            realUses++;
            used = true;
            break;
          }
        }
        if (realUses > 0)
          break;
        used = false;
      }
      if (!used) {
        LIS.RemoveMachineInstrFromMaps(*MI);
        MI++->eraseFromParent();
        continue;
      }
      ++MI;
    }
  // In the kernel block, check if we can remove a Phi that generates a value
  // used in an instruction removed in the epilog block.
  for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
                                   BBE = KernelBB->getFirstNonPHI();
       BBI != BBE;) {
    MachineInstr *MI = &*BBI;
    ++BBI;
    unsigned reg = MI->getOperand(0).getReg();
    if (MRI.use_begin(reg) == MRI.use_end()) {
      LIS.RemoveMachineInstrFromMaps(*MI);
      MI->eraseFromParent();
    }
  }
}
/// For loop carried definitions, we split the lifetime of a virtual register
/// that has uses past the definition in the next iteration. A copy with a new
/// virtual register is inserted before the definition, which helps with
/// generating a better register assignment.
///
///   v1 = phi(a, v2)     v1 = phi(a, v2)
///   v2 = phi(b, v3)     v2 = phi(b, v3)
///   v3 = ..             v4 = copy v1
///   .. = v1             v3 = ..
///                       .. = v4
void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
                                       MBBVectorTy &EpilogBBs,
                                       SMSchedule &Schedule) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  for (auto &PHI : KernelBB->phis()) {
    unsigned Def = PHI.getOperand(0).getReg();
    // Check for any Phi definition that is used as an operand of another Phi
    // in the same block.
    for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
                                                 E = MRI.use_instr_end();
         I != E; ++I) {
      if (I->isPHI() && I->getParent() == KernelBB) {
        // Get the loop carried definition.
        unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
        if (!LCDef)
          continue;
        MachineInstr *MI = MRI.getVRegDef(LCDef);
        if (!MI || MI->getParent() != KernelBB || MI->isPHI())
          continue;
        // Search through the rest of the block looking for uses of the Phi
        // definition. If one occurs, then split the lifetime.
        unsigned SplitReg = 0;
        for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
                                    KernelBB->instr_end()))
          if (BBJ.readsRegister(Def)) {
            // We split the lifetime when we find the first use.
            if (SplitReg == 0) {
              SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
              BuildMI(*KernelBB, MI, MI->getDebugLoc(),
                      TII->get(TargetOpcode::COPY), SplitReg)
                  .addReg(Def);
            }
            BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
          }
        if (!SplitReg)
          continue;
        // Search through each of the epilog blocks for any uses to be renamed.
        for (auto &Epilog : EpilogBBs)
          for (auto &I : *Epilog)
            if (I.readsRegister(Def))
              I.substituteRegister(Def, SplitReg, 0, *TRI);
        break;
      }
    }
  }
}
/// Remove the incoming block from the Phis in a basic block.
static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
  for (MachineInstr &MI : *BB) {
    if (!MI.isPHI())
      break;
    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
      if (MI.getOperand(i + 1).getMBB() == Incoming) {
        MI.RemoveOperand(i + 1);
        MI.RemoveOperand(i);
        break;
      }
  }
}
/// Create branches from each prolog basic block to the appropriate epilog
/// block. These edges are needed if the loop ends before reaching the
/// kernel.
void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
                                    MachineBasicBlock *KernelBB,
                                    MBBVectorTy &EpilogBBs,
                                    SMSchedule &Schedule, ValueMapTy *VRMap) {
  assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
  MachineInstr *IndVar = Pass.LI.LoopInductionVar;
  MachineInstr *Cmp = Pass.LI.LoopCompare;
  MachineBasicBlock *LastPro = KernelBB;
  MachineBasicBlock *LastEpi = KernelBB;

  // Start from the blocks connected to the kernel and work "out"
  // to the first prolog and the last epilog blocks.
  SmallVector<MachineInstr *, 4> PrevInsts;
  unsigned MaxIter = PrologBBs.size() - 1;
  unsigned LC = UINT_MAX;
  unsigned LCMin = UINT_MAX;
  for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
    // Add branches to the prolog that go to the corresponding
    // epilog, and the fall-thru prolog/kernel block.
    MachineBasicBlock *Prolog = PrologBBs[j];
    MachineBasicBlock *Epilog = EpilogBBs[i];
    // We've executed one iteration, so decrement the loop count and check for
    // the loop end.
    SmallVector<MachineOperand, 4> Cond;
    // Check if the LOOP0 has already been removed. If so, then there is no
    // need to reduce the trip count.
    if (LC != 0)
      LC = TII->reduceLoopCount(*Prolog, IndVar, *Cmp, Cond, PrevInsts, j,
                                MaxIter);

    // Record the value of the first trip count, which is used to determine if
    // branches and blocks can be removed for constant trip counts.
    if (LCMin == UINT_MAX)
      LCMin = LC;

    unsigned numAdded = 0;
    if (TargetRegisterInfo::isVirtualRegister(LC)) {
      Prolog->addSuccessor(Epilog);
      numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
    } else if (j >= LCMin) {
      Prolog->addSuccessor(Epilog);
      Prolog->removeSuccessor(LastPro);
      LastEpi->removeSuccessor(Epilog);
      numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
      removePhis(Epilog, LastEpi);
      // Remove the blocks that are no longer referenced.
      if (LastPro != LastEpi) {
        LastEpi->clear();
        LastEpi->eraseFromParent();
      }
      LastPro->clear();
      LastPro->eraseFromParent();
    } else {
      numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
      removePhis(Epilog, Prolog);
    }
    LastPro = Prolog;
    LastEpi = Epilog;
    for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
                                                   E = Prolog->instr_rend();
         I != E && numAdded > 0; ++I, --numAdded)
      updateInstruction(&*I, false, j, 0, Schedule, VRMap);
  }
}
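
// Illustrative note (added commentary): with PrologBBs.size() == 2, the loop
// above pairs prolog1 with epilog0 and prolog0 with epilog1, so an early exit
// taken from a prolog block branches to the epilog that completes exactly the
// iterations started so far; for a known constant trip count, unreachable
// prolog/epilog pairs are deleted instead of getting a conditional branch.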
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  const MachineOperand *BaseOp;
  int64_t Offset;
  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
    return false;

  if (!BaseOp->isReg())
    return false;

  unsigned BaseReg = BaseOp->getReg();

  MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if there is a Phi. If so, get the definition in the loop.
  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
  if (BaseDef && BaseDef->isPHI()) {
    BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
    BaseDef = MRI.getVRegDef(BaseReg);
  }
  if (!BaseDef)
    return false;

  int D = 0;
  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
    return false;

  Delta = D;
  return true;
}
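
// Worked example (illustrative only): for a load whose base register is a Phi
// fed by "Rb = Rb + 8" inside the loop, getIncrementValue reports 8, so Delta
// is set to 8, i.e. the access moves forward 8 bytes per iteration.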
/// Update the memory operand with a new offset when the pipeliner
/// generates a new copy of the instruction that refers to a
/// different memory location.
void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
                                          MachineInstr &OldMI, unsigned Num) {
  if (Num == 0)
    return;
  // If the instruction has memory operands, then adjust the offset
  // when the instruction appears in different stages.
  if (NewMI.memoperands_empty())
    return;
  SmallVector<MachineMemOperand *, 2> NewMMOs;
  for (MachineMemOperand *MMO : NewMI.memoperands()) {
    // TODO: Figure out whether isAtomic is really necessary (see D57601).
    if (MMO->isVolatile() || MMO->isAtomic() ||
        (MMO->isInvariant() && MMO->isDereferenceable()) ||
        (!MMO->getValue())) {
      NewMMOs.push_back(MMO);
      continue;
    }
    unsigned Delta;
    if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
      int64_t AdjOffset = Delta * Num;
      NewMMOs.push_back(
          MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
    } else {
      NewMMOs.push_back(
          MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
    }
  }
  NewMI.setMemRefs(MF, NewMMOs);
}
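
// Worked example (illustrative only): if computeDelta determines the access
// advances by 4 bytes per iteration and the clone is Num == 2 stages away from
// its original position, the memory operand is re-described at offset
// 4 * 2 = 8 from the original; when no delta can be computed, the operand is
// widened to an unknown-size location instead.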
/// Clone the instruction for the new pipelined loop and update the
/// memory operands, if needed.
MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
                                            unsigned CurStageNum,
                                            unsigned InstStageNum) {
  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
  // Check for tied operands in inline asm instructions. This should be handled
  // elsewhere, but I'm not sure of the best solution.
  if (OldMI->isInlineAsm())
    for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
      const auto &MO = OldMI->getOperand(i);
      if (MO.isReg() && MO.isUse())
        break;
      unsigned UseIdx;
      if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
        NewMI->tieOperands(i, UseIdx);
    }
  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
  return NewMI;
}
/// Clone the instruction for the new pipelined loop. If needed, this
/// function updates the instruction using the values saved in the
/// InstrChanges structure.
MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
                                                     unsigned CurStageNum,
                                                     unsigned InstStageNum,
                                                     SMSchedule &Schedule) {
  MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
  DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
      InstrChanges.find(getSUnit(OldMI));
  if (It != InstrChanges.end()) {
    std::pair<unsigned, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
      return nullptr;
    int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
    MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
    if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
      NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
    NewMI->getOperand(OffsetPos).setImm(NewOffset);
  }
  updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
  return NewMI;
}
/// Update the machine instruction with new virtual registers. This
/// function may change the definitions and/or uses.
void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
                                          unsigned CurStageNum,
                                          unsigned InstrStageNum,
                                          SMSchedule &Schedule,
                                          ValueMapTy *VRMap) {
  for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
    MachineOperand &MO = NewMI->getOperand(i);
    if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
      continue;
    unsigned reg = MO.getReg();
    if (MO.isDef()) {
      // Create a new virtual register for the definition.
      const TargetRegisterClass *RC = MRI.getRegClass(reg);
      unsigned NewReg = MRI.createVirtualRegister(RC);
      MO.setReg(NewReg);
      VRMap[CurStageNum][reg] = NewReg;
      if (LastDef)
        replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
    } else if (MO.isUse()) {
      MachineInstr *Def = MRI.getVRegDef(reg);
      // Compute the stage that contains the last definition for instruction.
      int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
      unsigned StageNum = CurStageNum;
      if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
        // Compute the difference in stages between the definition and the use.
        unsigned StageDiff = (InstrStageNum - DefStageNum);
        // Make an adjustment to get the last definition.
        StageNum -= StageDiff;
      }
      if (VRMap[StageNum].count(reg))
        MO.setReg(VRMap[StageNum][reg]);
    }
  }
}
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
  SmallPtrSet<MachineInstr *, 8> Visited;
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    if (!Visited.insert(Def).second)
      break;
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
/// Return the new name for the value from the previous stage.
unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
                                          unsigned LoopVal, unsigned LoopStage,
                                          ValueMapTy *VRMap,
                                          MachineBasicBlock *BB) {
  unsigned PrevVal = 0;
  if (StageNum > PhiStage) {
    MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
    if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
      // The name is defined in the previous stage.
      PrevVal = VRMap[StageNum - 1][LoopVal];
    else if (VRMap[StageNum].count(LoopVal))
      // The previous name is defined in the current stage when the instruction
      // order is swapped.
      PrevVal = VRMap[StageNum][LoopVal];
    else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
      // The loop value hasn't yet been scheduled.
      PrevVal = LoopVal;
    else if (StageNum == PhiStage + 1)
      // The loop value is another phi, which has not been scheduled.
      PrevVal = getInitPhiReg(*LoopInst, BB);
    else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
      // The loop value is another phi, which has been scheduled.
      PrevVal =
          getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
                        LoopStage, VRMap, BB);
  }
  return PrevVal;
}
/// Rewrite the Phi values in the specified block to use the mappings
/// from the initial operand. Once the Phi is scheduled, we switch
/// to using the loop value instead of the Phi value, so those names
/// do not need to be rewritten.
void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
                                         unsigned StageNum,
                                         SMSchedule &Schedule,
                                         ValueMapTy *VRMap,
                                         InstrMapTy &InstrMap) {
  for (auto &PHI : BB->phis()) {
    unsigned InitVal = 0;
    unsigned LoopVal = 0;
    getPhiRegs(PHI, BB, InitVal, LoopVal);
    unsigned PhiDef = PHI.getOperand(0).getReg();

    unsigned PhiStage =
        (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
    unsigned LoopStage =
        (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
    unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
    if (NumPhis > StageNum)
      NumPhis = StageNum;
    for (unsigned np = 0; np <= NumPhis; ++np) {
      unsigned NewVal =
          getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
      if (!NewVal)
        NewVal = InitVal;
      rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
                            PhiDef, NewVal);
    }
  }
}
/// Rewrite a previously scheduled instruction to use the register value
/// from the new instruction. Make sure the instruction occurs in the
/// basic block, and we don't change the uses in the new instruction.
void SwingSchedulerDAG::rewriteScheduledInstr(
    MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
    unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
    unsigned NewReg, unsigned PrevReg) {
  bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
  int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
  // Rewrite uses that have been scheduled already to use the new
  // Phi register.
  for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
                                         EI = MRI.use_end();
       UI != EI;) {
    MachineOperand &UseOp = *UI;
    MachineInstr *UseMI = UseOp.getParent();
    ++UI;
    if (UseMI->getParent() != BB)
      continue;
    if (UseMI->isPHI()) {
      if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
        continue;
      if (getLoopPhiReg(*UseMI, BB) != OldReg)
        continue;
    }
    InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
    assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
    SUnit *OrigMISU = getSUnit(OrigInstr->second);
    int StageSched = Schedule.stageScheduled(OrigMISU);
    int CycleSched = Schedule.cycleScheduled(OrigMISU);
    unsigned ReplaceReg = 0;
    // This is the stage for the scheduled instruction.
    if (StagePhi == StageSched && Phi->isPHI()) {
      int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
      if (PrevReg && InProlog)
        ReplaceReg = PrevReg;
      else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
               (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
        ReplaceReg = PrevReg;
      else
        ReplaceReg = NewReg;
    }
    // The scheduled instruction occurs before the scheduled Phi, and the
    // Phi is not loop carried.
    if (!InProlog && StagePhi + 1 == StageSched &&
        !Schedule.isLoopCarried(this, *Phi))
      ReplaceReg = NewReg;
    if (StagePhi > StageSched && Phi->isPHI())
      ReplaceReg = NewReg;
    if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
      ReplaceReg = NewReg;
    if (ReplaceReg) {
      MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
      UseOp.setReg(ReplaceReg);
    }
  }
}
/// Check if we can change the instruction to use an offset value from the
/// previous iteration. If so, return true and set the base and offset values
/// so that we can rewrite the load, if necessary.
///   v1 = Phi(v0, v3)
///   v2 = load v1, 0
///   v3 = post_store v1, 4, x
/// This function enables the load to be rewritten as v2 = load v3, 4.
bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
                                              unsigned &BasePos,
                                              unsigned &OffsetPos,
                                              unsigned &NewBase,
                                              int64_t &Offset) {
  // Get the load instruction.
  if (TII->isPostIncrement(*MI))
    return false;
  unsigned BasePosLd, OffsetPosLd;
  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
    return false;
  unsigned BaseReg = MI->getOperand(BasePosLd).getReg();

  // Look for the Phi instruction.
  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
  if (!Phi || !Phi->isPHI())
    return false;
  // Get the register defined in the loop block.
  unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
  if (!PrevReg)
    return false;

  // Check for the post-increment load/store instruction.
  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
  if (!PrevDef || PrevDef == MI)
    return false;

  if (!TII->isPostIncrement(*PrevDef))
    return false;

  unsigned BasePos1 = 0, OffsetPos1 = 0;
  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
    return false;

  // Make sure that the instructions do not access the same memory location in
  // the next iteration.
  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
  MachineInstr *NewMI = MF.CloneMachineInstr(MI);
  NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
  bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
  MF.DeleteMachineInstr(NewMI);
  if (!Disjoint)
    return false;

  // Set the return value once we determine that we return true.
  BasePos = BasePosLd;
  OffsetPos = OffsetPosLd;
  NewBase = PrevReg;
  Offset = StoreOffset;
  return true;
}
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
                                         SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
  DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    std::pair<unsigned, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    unsigned BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    if (BaseStageNum < DefStageNum) {
      MachineInstr *NewMI = MF.CloneMachineInstr(MI);
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs.insert(NewMI);
    }
  }
}
/// Return true for an order or output dependence that is potentially loop
/// carried. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
                                         bool isSucc) {
  if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
      Dep.isArtificial())
    return false;

  if (!SwpPruneLoopCarried)
    return true;

  if (Dep.getKind() == SDep::Output)
    return true;

  MachineInstr *SI = Source->getInstr();
  MachineInstr *DI = Dep.getSUnit()->getInstr();
  if (!isSucc)
    std::swap(SI, DI);
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Only chain dependences between a load and store can be loop carried.
  if (!DI->mayStore() || !SI->mayLoad())
    return false;

  unsigned DeltaS, DeltaD;
  if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
    return true;

  const MachineOperand *BaseOpS, *BaseOpD;
  int64_t OffsetS, OffsetD;
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) ||
      !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, TRI))
    return true;

  if (!BaseOpS->isIdenticalTo(*BaseOpD))
    return true;

  // Check that the base register is incremented by a constant value for each
  // iteration.
  MachineInstr *Def = MRI.getVRegDef(BaseOpS->getReg());
  if (!Def || !Def->isPHI())
    return true;
  unsigned InitVal = 0;
  unsigned LoopVal = 0;
  getPhiRegs(*Def, BB, InitVal, LoopVal);
  MachineInstr *LoopDef = MRI.getVRegDef(LoopVal);
  int D = 0;
  if (!LoopDef || !TII->getIncrementValue(*LoopDef, D))
    return true;

  uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
  uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();

  // This is the main test, which checks the offset values and the loop
  // increment value to determine if the accesses may be loop carried.
  if (AccessSizeS == MemoryLocation::UnknownSize ||
      AccessSizeD == MemoryLocation::UnknownSize)
    return true;

  if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
    return true;

  return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
}
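
// Worked example (illustrative only): with DeltaS == DeltaD == 4 and 4-byte
// accesses, OffsetS == 0 and OffsetD == 0 gives 0 + 4 < 0 + 4 == false, so
// the chain edge is not treated as loop carried; if the store instead sat at
// OffsetD == 4, the final test returns true and the dependence is kept.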
void SwingSchedulerDAG::postprocessDAG() {
  for (auto &M : Mutations)
    M->apply(this);
}
/// Try to schedule the node at the specified StartCycle and continue
/// until the node is scheduled or the EndCycle is reached. This function
/// returns true if the node is scheduled. This routine may search either
/// forward or backward for a place to insert the instruction based upon
/// the relative values of StartCycle and EndCycle.
bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
  bool forward = true;
  if (StartCycle > EndCycle)
    forward = false;

  // The terminating condition depends on the direction.
  int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
  for (int curCycle = StartCycle; curCycle != termCycle;
       forward ? ++curCycle : --curCycle) {

    // Add the already scheduled instructions at the specified cycle to the
    // DFA.
    Resources->clearResources();
    for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
         checkCycle <= LastCycle; checkCycle += II) {
      std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];

      for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
                                         E = cycleInstrs.end();
           I != E; ++I) {
        if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
          continue;
        assert(Resources->canReserveResources(*(*I)->getInstr()) &&
               "These instructions have already been scheduled.");
        Resources->reserveResources(*(*I)->getInstr());
      }
    }
    if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
        Resources->canReserveResources(*SU->getInstr())) {
      LLVM_DEBUG({
        dbgs() << "\tinsert at cycle " << curCycle << " ";
        SU->getInstr()->dump();
      });

      ScheduledInstrs[curCycle].push_back(SU);
      InstrToCycle.insert(std::make_pair(SU, curCycle));
      if (curCycle > LastCycle)
        LastCycle = curCycle;
      if (curCycle < FirstCycle)
        FirstCycle = curCycle;
      return true;
    }
    LLVM_DEBUG({
      dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
      SU->getInstr()->dump();
    });
  }
  return false;
}
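
// Illustrative note (added commentary): insert() probes cycles toward
// EndCycle, so a call such as insert(SU, 7, 5, II) searches backwards
// (7, 6, 5) while insert(SU, 5, 7, II) searches forwards. Each probed cycle
// first replays the instructions already placed at congruent cycles
// (mod II) into the DFA before asking whether SU's resources still fit.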
// Return the cycle of the earliest scheduled instruction in the chain.
int SMSchedule::earliestCycleInChain(const SDep &Dep) {
  SmallPtrSet<SUnit *, 8> Visited;
  SmallVector<SDep, 8> Worklist;
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  while (!Worklist.empty()) {
    const SDep &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSUnit();
    if (Visited.count(PrevSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    for (const auto &PI : PrevSU->Preds)
      if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
        Worklist.push_back(PI);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
// Return the cycle of the latest scheduled instruction in the chain.
int SMSchedule::latestCycleInChain(const SDep &Dep) {
  SmallPtrSet<SUnit *, 8> Visited;
  SmallVector<SDep, 8> Worklist;
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  while (!Worklist.empty()) {
    const SDep &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getSUnit();
    if (Visited.count(SuccSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    for (const auto &SI : SuccSU->Succs)
      if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
        Worklist.push_back(SI);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
/// If an instruction has a use that spans multiple iterations, then
/// return true. These instructions are characterized by having a back-edge
/// to a Phi, which contains a reference to another Phi.
static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
  for (auto &P : SU->Preds)
    if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
/// Compute the scheduling start slot for the instruction. The start slot
/// depends on any predecessor or successor nodes scheduled already.
void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
                              int *MinEnd, int *MaxStart, int II,
                              SwingSchedulerDAG *DAG) {
  // Iterate over each instruction that has been scheduled already. The start
  // slot computation depends on whether the previously scheduled instruction
  // is a predecessor or successor of the specified instruction.
  for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {

    // Iterate over each instruction in the current cycle.
    for (SUnit *I : getInstructions(cycle)) {
      // Because we're processing a DAG for the dependences, we recognize
      // the back-edge in recurrences by anti dependences.
      for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
        const SDep &Dep = SU->Preds[i];
        if (Dep.getSUnit() == I) {
          if (!DAG->isBackedge(SU, Dep)) {
            int EarlyStart = cycle + Dep.getLatency() -
                             DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
            *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
            if (DAG->isLoopCarriedDep(SU, Dep, false)) {
              int End = earliestCycleInChain(Dep) + (II - 1);
              *MinEnd = std::min(*MinEnd, End);
            }
          } else {
            int LateStart = cycle - Dep.getLatency() +
                            DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
            *MinLateStart = std::min(*MinLateStart, LateStart);
          }
        }
        // For an instruction that requires multiple iterations, make sure that
        // the dependent instruction is not scheduled past the definition.
        SUnit *BE = multipleIterations(I, DAG);
        if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
            !SU->isPred(I))
          *MinLateStart = std::min(*MinLateStart, cycle);
      }
      for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
        if (SU->Succs[i].getSUnit() == I) {
          const SDep &Dep = SU->Succs[i];
          if (!DAG->isBackedge(SU, Dep)) {
            int LateStart = cycle - Dep.getLatency() +
                            DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
            *MinLateStart = std::min(*MinLateStart, LateStart);
            if (DAG->isLoopCarriedDep(SU, Dep)) {
              int Start = latestCycleInChain(Dep) + 1 - II;
              *MaxStart = std::max(*MaxStart, Start);
            }
          } else {
            int EarlyStart = cycle + Dep.getLatency() -
                             DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
            *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
          }
        }
      }
    }
  }
}
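
// Worked example (illustrative only): for a forward dependence from a node
// scheduled at cycle 2 with latency 3 and distance 1, and II == 4, the
// candidate early start is 2 + 3 - 1 * 4 = 1, and MaxEarlyStart keeps the
// largest such value over all already scheduled predecessors.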
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. The instruction is added to the start of the list if it
/// must be ordered before a use already in the list; otherwise it is added
/// to the end of the list.
void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
                                 std::deque<SUnit *> &Insts) {
  MachineInstr *MI = SU->getInstr();
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      MachineOperand &MO = MI->getOperand(i);
      if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
        continue;

      unsigned Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (unsigned NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &S : SU->Succs) {
      if (S.getSUnit() != *I)
        continue;
      if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
    }
    for (auto &P : SU->Preds) {
      if (P.getSUnit() != *I)
        continue;
      if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedence over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}

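// Illustrative example for the ordering decision above (the instructions are
// assumed, not taken from the original source): if the deque already holds
// "v2 = add v1, 4" and the instruction being placed defines v1 in the same
// stage, the existing entry reads the new definition, so OrderBeforeUse is set
// and the definition is pushed to the front of the deque; an instruction whose
// operands are only defined later in the list is instead appended at the end.
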
/// Return true if the scheduled Phi has a loop carried operand.
bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
  if (!Phi.isPHI())
    return false;
  assert(Phi.isPHI() && "Expecting a Phi.");
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  unsigned InitVal = 0;
  unsigned LoopVal = 0;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}

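// Illustrative example (assumed virtual registers): for "v1 = PHI(v0, v3)",
// the loop operand v3 comes from the previous iteration whenever its producer
// is scheduled in a later cycle than the Phi (LoopCycle > DefCycle) or in the
// same or an earlier stage (LoopStage <= DefStage); in either case the Phi is
// reported as having a loop carried operand.
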
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///        v1 = phi(v2, v3)
///  (Def) v3 = op v1
///  (MO)   = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
                                       MachineInstr *Def, MachineOperand &MO) {
  if (!MO.isReg())
    return false;
  if (Def->isPHI())
    return false;
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
    MachineOperand &DMO = Def->getOperand(i);
    if (!DMO.isReg() || !DMO.isDef())
      continue;
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}

// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
  for (int i = 0, e = SSD->SUnits.size(); i < e; ++i) {
    SUnit &SU = SSD->SUnits[i];
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &SI : SU.Succs)
      if (SI.isAssignedRegDep())
        if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
          if (stageScheduled(SI.getSUnit()) != StageDef)
            return false;
  }
  return true;
}

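// Illustrative example (assumed scenario): if an instruction defining a
// physical register (e.g. a status flag) lands in stage 0 while a consumer of
// that register is placed in stage 1, the value would have to live across the
// prolog/kernel block boundary, which the pipeliner does not model, so the
// schedule is rejected here.
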
/// A property of the node order in swing-modulo-scheduling is
/// that for nodes outside circuits the following holds:
/// none of them is scheduled after both a successor and a
/// predecessor.
/// The method below checks whether the property is met.
/// If not, debug information is printed and statistics information updated.
/// Note that we do not use an assert statement.
/// The reason is that although an invalid node order may prevent
/// the pipeliner from finding a pipelined schedule for arbitrary II,
/// it does not lead to the generation of incorrect code.
void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {

  // a sorted vector that maps each SUnit to its index in the NodeOrder
  typedef std::pair<SUnit *, unsigned> UnitIndex;
  std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));

  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
    Indices.push_back(std::make_pair(NodeOrder[i], i));

  auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
    return std::get<0>(i1) < std::get<0>(i2);
  };

  // sort, so that we can perform a binary search
  llvm::sort(Indices, CompareKey);

  bool Valid = true;
  (void)Valid;
  // for each SUnit in the NodeOrder, check whether
  // it appears after both a successor and a predecessor
  // of the SUnit. If this is the case, and the SUnit
  // is not part of circuit, then the NodeOrder is not
  // valid.
  for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
    SUnit *SU = NodeOrder[i];
    unsigned Index = i;

    bool PredBefore = false;
    bool SuccBefore = false;

    SUnit *Succ;
    SUnit *Pred;
    (void)Succ;
    (void)Pred;

    for (SDep &PredEdge : SU->Preds) {
      SUnit *PredSU = PredEdge.getSUnit();
      unsigned PredIndex =
          std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
                                        std::make_pair(PredSU, 0), CompareKey));
      if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
        PredBefore = true;
        Pred = PredSU;
        break;
      }
    }

    for (SDep &SuccEdge : SU->Succs) {
      SUnit *SuccSU = SuccEdge.getSUnit();
      unsigned SuccIndex =
          std::get<1>(*std::lower_bound(Indices.begin(), Indices.end(),
                                        std::make_pair(SuccSU, 0), CompareKey));
      if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
        SuccBefore = true;
        Succ = SuccSU;
        break;
      }
    }

    if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
      // instructions in circuits are allowed to be scheduled
      // after both a successor and predecessor.
      bool InCircuit = std::any_of(
          Circuits.begin(), Circuits.end(),
          [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
      if (InCircuit)
        LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
      else {
        Valid = false;
        NumNodeOrderIssues++;
        LLVM_DEBUG(dbgs() << "Predecessor ";);
      }
      LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
                        << " are scheduled before node " << SU->NodeNum
                        << "\n";);
    }
  }

  LLVM_DEBUG({
    if (!Valid)
      dbgs() << "Invalid node order found!\n";
  });
}

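// Illustrative example (assumed node order): with NodeOrder = {SU3, SU1, SU2},
// where SU1 is a predecessor of SU2 and SU3 is a successor of SU2, SU2 is
// scheduled after both a predecessor and a successor; unless SU2 belongs to
// one of the recurrence circuits, the order is flagged and
// NumNodeOrderIssues is incremented.
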
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  unsigned OverlapReg = 0;
  unsigned NewBaseReg = 0;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
        DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            MachineInstr *NewMI = MF.CloneMachineInstr(MI);
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs.insert(NewMI);
          }
        }
        OverlapReg = 0;
        NewBaseReg = 0;
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}

/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles. That is, this
/// function creates a schedule that represents a single iteration.
void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
  // Move all instructions to the first stage from later stages.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
         ++stage) {
      std::deque<SUnit *> &cycleInstrs =
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
      for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(),
                                                 E = cycleInstrs.rend();
           I != E; ++I)
        ScheduledInstrs[cycle].push_front(*I);
    }
  }
  // Iterate over the definitions in each instruction, and compute the
  // stage difference for each use. Keep the maximum value.
  for (auto &I : InstrToCycle) {
    int DefStage = stageScheduled(I.first);
    MachineInstr *MI = I.first->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      MachineOperand &Op = MI->getOperand(i);
      if (!Op.isReg() || !Op.isDef())
        continue;

      unsigned Reg = Op.getReg();
      unsigned MaxDiff = 0;
      bool PhiIsSwapped = false;
      for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
                                             EI = MRI.use_end();
           UI != EI; ++UI) {
        MachineOperand &UseOp = *UI;
        MachineInstr *UseMI = UseOp.getParent();
        SUnit *SUnitUse = SSD->getSUnit(UseMI);
        int UseStage = stageScheduled(SUnitUse);
        unsigned Diff = 0;
        if (UseStage != -1 && UseStage >= DefStage)
          Diff = UseStage - DefStage;
        if (MI->isPHI()) {
          if (isLoopCarried(SSD, *MI))
            ++Diff;
          else
            PhiIsSwapped = true;
        }
        MaxDiff = std::max(Diff, MaxDiff);
      }
      RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
    }
  }

  // Erase all the elements in the later stages. Only one iteration should
  // remain in the scheduled list, and it contains all the instructions.
  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    ScheduledInstrs.erase(cycle);

  // Change the registers in instruction as specified in the InstrChanges
  // map. We need to use the new registers to create the correct order.
  for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) {
    SUnit *SU = &SSD->SUnits[i];
    SSD->applyInstrChange(SU->getInstr(), *this);
  }

  // Reorder the instructions in each cycle to fix and improve the
  // generated code.
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    std::deque<SUnit *> newOrderPhi;
    for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
      SUnit *SU = cycleInstrs[i];
      if (SU->getInstr()->isPHI())
        newOrderPhi.push_back(SU);
    }
    std::deque<SUnit *> newOrderI;
    for (unsigned i = 0, e = cycleInstrs.size(); i < e; ++i) {
      SUnit *SU = cycleInstrs[i];
      if (!SU->getInstr()->isPHI())
        orderDependence(SSD, SU, newOrderI);
    }
    // Replace the old order with the new order.
    cycleInstrs.swap(newOrderPhi);
    cycleInstrs.insert(cycleInstrs.end(), newOrderI.begin(), newOrderI.end());
    SSD->fixupRegisterOverlaps(cycleInstrs);
  }

  LLVM_DEBUG(dump(););
}

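// Worked example for the stage folding above (illustrative; II is assumed):
// with InitiationInterval = 2, an instruction scheduled in cycle 1 of stage 1
// lives in ScheduledInstrs[1 + 1 * 2] = ScheduledInstrs[3] and is moved to the
// front of ScheduledInstrs[1]; after the later-stage entries are erased, each
// cycle of a single iteration holds all of its instructions.
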
void NodeSet::print(raw_ostream &os) const {
  os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
     << " depth " << MaxDepth << " col " << Colocate << "\n";
  for (const auto &I : Nodes)
    os << "   SU(" << I->NodeNum << ") " << *(I->getInstr());
  os << "\n";
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the schedule information to the given output.
void SMSchedule::print(raw_ostream &os) const {
  // Iterate over each cycle.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    // Iterate over each instruction in the cycle.
    const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
    for (SUnit *CI : cycleInstrs->second) {
      os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
      os << "(" << CI->NodeNum << ") ";
      CI->getInstr()->print(os);
      os << "\n";
    }
  }
}

/// Utility function used for debugging to print the schedule.
LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }