[Alignment][NFC] Convert StoreInst to MaybeAlign
[llvm-complete.git] / include / llvm / CodeGen / ModuloSchedule.h
blob81a9b63b64ca31394785a3ebbfd4d8e654650e21
1 //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Software pipelining (SWP) is an instruction scheduling technique for loops
10 // that overlaps loop iterations and exploits ILP via compiler transformations.
12 // There are multiple methods for analyzing a loop and creating a schedule.
13 // An example algorithm is Swing Modulo Scheduling (implemented by the
14 // MachinePipeliner). The details of how a schedule is arrived at are irrelevant
15 // for the task of actually rewriting a loop to adhere to the schedule, which
16 // is what this file does.
18 // A schedule is, for every instruction in a block, a Cycle and a Stage. Note
19 // that we only support single-block loops, so "block" and "loop" can be used
20 // interchangably.
22 // The Cycle of an instruction defines a partial order of the instructions in
23 // the remapped loop. Instructions within a cycle must not consume the output
24 // of any instruction in the same cycle. Cycle information is assumed to have
25 // been calculated such that the processor will execute instructions in
26 // lock-step (for example in a VLIW ISA).
28 // The Stage of an instruction defines the mapping between logical loop
29 // iterations and pipelined loop iterations. An example (unrolled) pipeline
30 // may look something like:
32 // I0[0] Execute instruction I0 of iteration 0
33 // I1[0], I0[1] Execute I0 of iteration 1 and I1 of iteration 1
34 // I1[1], I0[2]
35 // I1[2], I0[3]
37 // In the schedule for this unrolled sequence we would say that I0 was scheduled
38 // in stage 0 and I1 in stage 1:
40 // loop:
41 // [stage 0] x = I0
42 // [stage 1] I1 x (from stage 0)
44 // And to actually generate valid code we must insert a phi:
46 // loop:
47 // x' = phi(x)
48 // x = I0
49 // I1 x'
51 // This is a simple example; the rules for how to generate correct code given
52 // an arbitrary schedule containing loop-carried values are complex.
54 // Note that these examples only mention the steady-state kernel of the
55 // generated loop; prologs and epilogs must be generated also that prime and
56 // flush the pipeline. Doing so is nontrivial.
58 //===----------------------------------------------------------------------===//
60 #ifndef LLVM_LIB_CODEGEN_MODULOSCHEDULE_H
61 #define LLVM_LIB_CODEGEN_MODULOSCHEDULE_H
63 #include "llvm/CodeGen/MachineFunction.h"
64 #include "llvm/CodeGen/MachineLoopInfo.h"
65 #include "llvm/CodeGen/MachineLoopUtils.h"
66 #include "llvm/CodeGen/TargetInstrInfo.h"
67 #include "llvm/CodeGen/TargetSubtargetInfo.h"
68 #include <deque>
69 #include <vector>
71 namespace llvm {
72 class MachineBasicBlock;
73 class MachineInstr;
74 class LiveIntervals;
76 /// Represents a schedule for a single-block loop. For every instruction we
77 /// maintain a Cycle and Stage.
78 class ModuloSchedule {
79 private:
80 /// The block containing the loop instructions.
81 MachineLoop *Loop;
83 /// The instructions to be generated, in total order. Cycle provides a partial
84 /// order; the total order within cycles has been decided by the schedule
85 /// producer.
86 std::vector<MachineInstr *> ScheduledInstrs;
88 /// The cycle for each instruction.
89 DenseMap<MachineInstr *, int> Cycle;
91 /// The stage for each instruction.
92 DenseMap<MachineInstr *, int> Stage;
94 /// The number of stages in this schedule (Max(Stage) + 1).
95 int NumStages;
97 public:
98 /// Create a new ModuloSchedule.
99 /// \arg ScheduledInstrs The new loop instructions, in total resequenced
100 /// order.
101 /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
102 /// not need to start at zero. ScheduledInstrs must be partially ordered by
103 /// Cycle.
104 /// \arg Stage Stage index for all instructions in ScheduleInstrs.
105 ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
106 std::vector<MachineInstr *> ScheduledInstrs,
107 DenseMap<MachineInstr *, int> Cycle,
108 DenseMap<MachineInstr *, int> Stage)
109 : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
110 Stage(std::move(Stage)) {
111 NumStages = 0;
112 for (auto &KV : this->Stage)
113 NumStages = std::max(NumStages, KV.second);
114 ++NumStages;
117 /// Return the single-block loop being scheduled.
118 MachineLoop *getLoop() const { return Loop; }
120 /// Return the number of stages contained in this schedule, which is the
121 /// largest stage index + 1.
122 int getNumStages() const { return NumStages; }
124 /// Return the first cycle in the schedule, which is the cycle index of the
125 /// first instruction.
126 int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
128 /// Return the final cycle in the schedule, which is the cycle index of the
129 /// last instruction.
130 int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
132 /// Return the stage that MI is scheduled in, or -1.
133 int getStage(MachineInstr *MI) {
134 auto I = Stage.find(MI);
135 return I == Stage.end() ? -1 : I->second;
138 /// Return the cycle that MI is scheduled at, or -1.
139 int getCycle(MachineInstr *MI) {
140 auto I = Cycle.find(MI);
141 return I == Cycle.end() ? -1 : I->second;
144 /// Return the rescheduled instructions in order.
145 ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
147 void dump() { print(dbgs()); }
148 void print(raw_ostream &OS);
151 /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
152 /// rewriting the old loop and inserting prologs and epilogs as required.
153 class ModuloScheduleExpander {
154 public:
155 using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
157 private:
158 using ValueMapTy = DenseMap<unsigned, unsigned>;
159 using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
160 using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
162 ModuloSchedule &Schedule;
163 MachineFunction &MF;
164 const TargetSubtargetInfo &ST;
165 MachineRegisterInfo &MRI;
166 const TargetInstrInfo *TII;
167 LiveIntervals &LIS;
169 MachineBasicBlock *BB;
170 MachineBasicBlock *Preheader;
171 MachineBasicBlock *NewKernel = nullptr;
172 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
174 /// Map for each register and the max difference between its uses and def.
175 /// The first element in the pair is the max difference in stages. The
176 /// second is true if the register defines a Phi value and loop value is
177 /// scheduled before the Phi.
178 std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
180 /// Instructions to change when emitting the final schedule.
181 InstrChangesTy InstrChanges;
183 void generatePipelinedLoop();
184 void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
185 ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
186 void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
187 ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
188 MBBVectorTy &PrologBBs);
189 void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
190 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
191 ValueMapTy *VRMap, InstrMapTy &InstrMap,
192 unsigned LastStageNum, unsigned CurStageNum,
193 bool IsLast);
194 void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
195 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
196 ValueMapTy *VRMap, InstrMapTy &InstrMap,
197 unsigned LastStageNum, unsigned CurStageNum, bool IsLast);
198 void removeDeadInstructions(MachineBasicBlock *KernelBB,
199 MBBVectorTy &EpilogBBs);
200 void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
201 void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
202 MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
203 ValueMapTy *VRMap);
204 bool computeDelta(MachineInstr &MI, unsigned &Delta);
205 void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
206 unsigned Num);
207 MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
208 unsigned InstStageNum);
209 MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
210 unsigned InstStageNum);
211 void updateInstruction(MachineInstr *NewMI, bool LastDef,
212 unsigned CurStageNum, unsigned InstrStageNum,
213 ValueMapTy *VRMap);
214 MachineInstr *findDefInLoop(unsigned Reg);
215 unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
216 unsigned LoopStage, ValueMapTy *VRMap,
217 MachineBasicBlock *BB);
218 void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
219 ValueMapTy *VRMap, InstrMapTy &InstrMap);
220 void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
221 unsigned CurStageNum, unsigned PhiNum,
222 MachineInstr *Phi, unsigned OldReg,
223 unsigned NewReg, unsigned PrevReg = 0);
224 bool isLoopCarried(MachineInstr &Phi);
226 /// Return the max. number of stages/iterations that can occur between a
227 /// register definition and its uses.
228 unsigned getStagesForReg(int Reg, unsigned CurStage) {
229 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
230 if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
231 Stages.second)
232 return 1;
233 return Stages.first;
236 /// The number of stages for a Phi is a little different than other
237 /// instructions. The minimum value computed in RegToStageDiff is 1
238 /// because we assume the Phi is needed for at least 1 iteration.
239 /// This is not the case if the loop value is scheduled prior to the
240 /// Phi in the same stage. This function returns the number of stages
241 /// or iterations needed between the Phi definition and any uses.
242 unsigned getStagesForPhi(int Reg) {
243 std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
244 if (Stages.second)
245 return Stages.first;
246 return Stages.first - 1;
249 public:
250 /// Create a new ModuloScheduleExpander.
251 /// \arg InstrChanges Modifications to make to instructions with memory
252 /// operands.
253 /// FIXME: InstrChanges is opaque and is an implementation detail of an
254 /// optimization in MachinePipeliner that crosses abstraction boundaries.
255 ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
256 LiveIntervals &LIS, InstrChangesTy InstrChanges)
257 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
258 TII(ST.getInstrInfo()), LIS(LIS),
259 InstrChanges(std::move(InstrChanges)) {}
261 /// Performs the actual expansion.
262 void expand();
263 /// Performs final cleanup after expansion.
264 void cleanup();
266 /// Returns the newly rewritten kernel block, or nullptr if this was
267 /// optimized away.
268 MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
271 /// A reimplementation of ModuloScheduleExpander. It works by generating a
272 /// standalone kernel loop and peeling out the prologs and epilogs.
273 class PeelingModuloScheduleExpander {
274 ModuloSchedule &Schedule;
275 MachineFunction &MF;
276 const TargetSubtargetInfo &ST;
277 MachineRegisterInfo &MRI;
278 const TargetInstrInfo *TII;
279 LiveIntervals *LIS;
281 /// The original loop block that gets rewritten in-place.
282 MachineBasicBlock *BB;
283 /// The original loop preheader.
284 MachineBasicBlock *Preheader;
285 /// All prolog and epilog blocks.
286 SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
287 /// For every block, the stages that are produced.
288 DenseMap<MachineBasicBlock *, BitVector> LiveStages;
289 /// For every block, the stages that are available. A stage can be available
290 /// but not produced (in the epilog) or produced but not available (in the
291 /// prolog).
292 DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
294 /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
295 /// loop kernel clones.
296 DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
297 DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
298 BlockMIs;
300 /// State passed from peelKernel to peelPrologAndEpilogs().
301 std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
303 public:
304 PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
305 LiveIntervals *LIS)
306 : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
307 TII(ST.getInstrInfo()), LIS(LIS) {}
309 void expand();
311 /// Runs ModuloScheduleExpander and treats it as a golden input to validate
312 /// aspects of the code generated by PeelingModuloScheduleExpander.
313 void validateAgainstModuloScheduleExpander();
315 protected:
316 /// Converts BB from the original loop body to the rewritten, pipelined
317 /// steady-state.
318 void rewriteKernel();
320 private:
321 /// Peels one iteration of the rewritten kernel (BB) in the specified
322 /// direction.
323 MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
324 /// Peel the kernel forwards and backwards to produce prologs and epilogs,
325 /// and stitch them together.
326 void peelPrologAndEpilogs();
327 /// All prolog and epilog blocks are clones of the kernel, so any produced
328 /// register in one block has an corollary in all other blocks.
329 Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
330 /// Change all users of MI, if MI is predicated out
331 /// (LiveStages[MI->getParent()] == false).
332 void rewriteUsesOf(MachineInstr *MI);
333 /// Insert branches between prologs, kernel and epilogs.
334 void fixupBranches();
335 /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
336 /// to a block dominated by all prologs and epilogs. This allows us to treat
337 /// the loop exiting block as any other kernel clone.
338 MachineBasicBlock *CreateLCSSAExitingBlock();
339 /// Helper to get the stage of an instruction in the schedule.
340 unsigned getStage(MachineInstr *MI) {
341 if (CanonicalMIs.count(MI))
342 MI = CanonicalMIs[MI];
343 return Schedule.getStage(MI);
347 /// Expander that simply annotates each scheduled instruction with a post-instr
348 /// symbol that can be consumed by the ModuloScheduleTest pass.
350 /// The post-instr symbol is a way of annotating an instruction that can be
351 /// roundtripped in MIR. The syntax is:
352 /// MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
353 class ModuloScheduleTestAnnotater {
354 MachineFunction &MF;
355 ModuloSchedule &S;
357 public:
358 ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
359 : MF(MF), S(S) {}
361 /// Performs the annotation.
362 void annotate();
365 } // end namespace llvm
367 #endif // LLVM_LIB_CODEGEN_MODULOSCHEDULE_H