include/llvm/CodeGen/ModuloSchedule.h

   1 //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // Software pipelining (SWP) is an instruction scheduling technique for loops
  10 // that overlaps loop iterations and exploits ILP via compiler transformations.
  11 //
  12 // There are multiple methods for analyzing a loop and creating a schedule.
  13 // An example algorithm is Swing Modulo Scheduling (implemented by the
  14 // MachinePipeliner). The details of how a schedule is arrived at are irrelevant
  15 // for the task of actually rewriting a loop to adhere to the schedule, which
  16 // is what this file does.
  17 //
  18 // A schedule is, for every instruction in a block, a Cycle and a Stage. Note
  19 // that we only support single-block loops, so "block" and "loop" can be used
  20 // interchangably.
  21 //
  22 // The Cycle of an instruction defines a partial order of the instructions in
  23 // the remapped loop. Instructions within a cycle must not consume the output
  24 // of any instruction in the same cycle. Cycle information is assumed to have
  25 // been calculated such that the processor will execute instructions in
  26 // lock-step (for example in a VLIW ISA).
  27 //
  28 // The Stage of an instruction defines the mapping between logical loop
  29 // iterations and pipelined loop iterations. An example (unrolled) pipeline
  30 // may look something like:
  31 //
  32 //  I0[0]                      Execute instruction I0 of iteration 0
  33 //  I1[0], I0[1]               Execute I0 of iteration 1 and I1 of iteration 1
  34 //         I1[1], I0[2]
  35 //                I1[2], I0[3]
  36 //
  37 // In the schedule for this unrolled sequence we would say that I0 was scheduled
  38 // in stage 0 and I1 in stage 1:
  39 //
  40 //  loop:
  41 //    [stage 0] x = I0
  42 //    [stage 1] I1 x (from stage 0)
  43 //
  44 // And to actually generate valid code we must insert a phi:
  45 //
  46 //  loop:
  47 //    x' = phi(x)
  48 //    x = I0
  49 //    I1 x'
  50 //
  51 // This is a simple example; the rules for how to generate correct code given
  52 // an arbitrary schedule containing loop-carried values are complex.
  53 //
  54 // Note that these examples only mention the steady-state kernel of the
  55 // generated loop; prologs and epilogs must be generated also that prime and
  56 // flush the pipeline. Doing so is nontrivial.
  57 //
  58 //===----------------------------------------------------------------------===//
  59
  60 #ifndef LLVM_LIB_CODEGEN_MODULOSCHEDULE_H
  61 #define LLVM_LIB_CODEGEN_MODULOSCHEDULE_H
  62
  63 #include "llvm/CodeGen/MachineFunction.h"
  64 #include "llvm/CodeGen/MachineLoopInfo.h"
  65 #include "llvm/CodeGen/MachineLoopUtils.h"
  66 #include "llvm/CodeGen/TargetInstrInfo.h"
  67 #include "llvm/CodeGen/TargetSubtargetInfo.h"
  68 #include <deque>
  69 #include <vector>
  70
  71 namespace llvm {
  72 class MachineBasicBlock;
  73 class MachineInstr;
  74 class LiveIntervals;
  75
  76 /// Represents a schedule for a single-block loop. For every instruction we
  77 /// maintain a Cycle and Stage.
  78 class ModuloSchedule {
  79 private:
  80   /// The block containing the loop instructions.
  81   MachineLoop *Loop;
  82
  83   /// The instructions to be generated, in total order. Cycle provides a partial
  84   /// order; the total order within cycles has been decided by the schedule
  85   /// producer.
  86   std::vector<MachineInstr *> ScheduledInstrs;
  87
  88   /// The cycle for each instruction.
  89   DenseMap<MachineInstr *, int> Cycle;
  90
  91   /// The stage for each instruction.
  92   DenseMap<MachineInstr *, int> Stage;
  93
  94   /// The number of stages in this schedule (Max(Stage) + 1).
  95   int NumStages;
  96
  97 public:
  98   /// Create a new ModuloSchedule.
  99   /// \arg ScheduledInstrs The new loop instructions, in total resequenced
 100   ///    order.
 101   /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
 102   ///    not need to start at zero. ScheduledInstrs must be partially ordered by
 103   ///    Cycle.
 104   /// \arg Stage Stage index for all instructions in ScheduleInstrs.
 105   ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
 106                  std::vector<MachineInstr *> ScheduledInstrs,
 107                  DenseMap<MachineInstr *, int> Cycle,
 108                  DenseMap<MachineInstr *, int> Stage)
 109       : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
 110         Stage(std::move(Stage)) {
 111     NumStages = 0;
 112     for (auto &KV : this->Stage)
 113       NumStages = std::max(NumStages, KV.second);
 114     ++NumStages;
 115   }
 116
 117   /// Return the single-block loop being scheduled.
 118   MachineLoop *getLoop() const { return Loop; }
 119
 120   /// Return the number of stages contained in this schedule, which is the
 121   /// largest stage index + 1.
 122   int getNumStages() const { return NumStages; }
 123
 124   /// Return the first cycle in the schedule, which is the cycle index of the
 125   /// first instruction.
 126   int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
 127
 128   /// Return the final cycle in the schedule, which is the cycle index of the
 129   /// last instruction.
 130   int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
 131
 132   /// Return the stage that MI is scheduled in, or -1.
 133   int getStage(MachineInstr *MI) {
 134     auto I = Stage.find(MI);
 135     return I == Stage.end() ? -1 : I->second;
 136   }
 137
 138   /// Return the cycle that MI is scheduled at, or -1.
 139   int getCycle(MachineInstr *MI) {
 140     auto I = Cycle.find(MI);
 141     return I == Cycle.end() ? -1 : I->second;
 142   }
 143
 144   /// Return the rescheduled instructions in order.
 145   ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
 146
 147   void dump() { print(dbgs()); }
 148   void print(raw_ostream &OS);
 149 };
 150
 151 /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
 152 /// rewriting the old loop and inserting prologs and epilogs as required.
 153 class ModuloScheduleExpander {
 154 public:
 155   using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
 156
 157 private:
 158   using ValueMapTy = DenseMap<unsigned, unsigned>;
 159   using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
 160   using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
 161
 162   ModuloSchedule &Schedule;
 163   MachineFunction &MF;
 164   const TargetSubtargetInfo &ST;
 165   MachineRegisterInfo &MRI;
 166   const TargetInstrInfo *TII;
 167   LiveIntervals &LIS;
 168
 169   MachineBasicBlock *BB;
 170   MachineBasicBlock *Preheader;
 171   MachineBasicBlock *NewKernel = nullptr;
 172   std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
 173
 174   /// Map for each register and the max difference between its uses and def.
 175   /// The first element in the pair is the max difference in stages. The
 176   /// second is true if the register defines a Phi value and loop value is
 177   /// scheduled before the Phi.
 178   std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
 179
 180   /// Instructions to change when emitting the final schedule.
 181   InstrChangesTy InstrChanges;
 182
 183   void generatePipelinedLoop();
 184   void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
 185                       ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
 186   void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
 187                       ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
 188                       MBBVectorTy &PrologBBs);
 189   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
 190                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
 191                             ValueMapTy *VRMap, InstrMapTy &InstrMap,
 192                             unsigned LastStageNum, unsigned CurStageNum,
 193                             bool IsLast);
 194   void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
 195                     MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
 196                     ValueMapTy *VRMap, InstrMapTy &InstrMap,
 197                     unsigned LastStageNum, unsigned CurStageNum, bool IsLast);
 198   void removeDeadInstructions(MachineBasicBlock *KernelBB,
 199                               MBBVectorTy &EpilogBBs);
 200   void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
 201   void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
 202                    MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
 203                    ValueMapTy *VRMap);
 204   bool computeDelta(MachineInstr &MI, unsigned &Delta);
 205   void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
 206                          unsigned Num);
 207   MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
 208                            unsigned InstStageNum);
 209   MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
 210                                     unsigned InstStageNum);
 211   void updateInstruction(MachineInstr *NewMI, bool LastDef,
 212                          unsigned CurStageNum, unsigned InstrStageNum,
 213                          ValueMapTy *VRMap);
 214   MachineInstr *findDefInLoop(unsigned Reg);
 215   unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
 216                          unsigned LoopStage, ValueMapTy *VRMap,
 217                          MachineBasicBlock *BB);
 218   void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
 219                         ValueMapTy *VRMap, InstrMapTy &InstrMap);
 220   void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
 221                              unsigned CurStageNum, unsigned PhiNum,
 222                              MachineInstr *Phi, unsigned OldReg,
 223                              unsigned NewReg, unsigned PrevReg = 0);
 224   bool isLoopCarried(MachineInstr &Phi);
 225
 226   /// Return the max. number of stages/iterations that can occur between a
 227   /// register definition and its uses.
 228   unsigned getStagesForReg(int Reg, unsigned CurStage) {
 229     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
 230     if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
 231         Stages.second)
 232       return 1;
 233     return Stages.first;
 234   }
 235
 236   /// The number of stages for a Phi is a little different than other
 237   /// instructions. The minimum value computed in RegToStageDiff is 1
 238   /// because we assume the Phi is needed for at least 1 iteration.
 239   /// This is not the case if the loop value is scheduled prior to the
 240   /// Phi in the same stage.  This function returns the number of stages
 241   /// or iterations needed between the Phi definition and any uses.
 242   unsigned getStagesForPhi(int Reg) {
 243     std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
 244     if (Stages.second)
 245       return Stages.first;
 246     return Stages.first - 1;
 247   }
 248
 249 public:
 250   /// Create a new ModuloScheduleExpander.
 251   /// \arg InstrChanges Modifications to make to instructions with memory
 252   ///   operands.
 253   /// FIXME: InstrChanges is opaque and is an implementation detail of an
 254   ///   optimization in MachinePipeliner that crosses abstraction boundaries.
 255   ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
 256                          LiveIntervals &LIS, InstrChangesTy InstrChanges)
 257       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
 258         TII(ST.getInstrInfo()), LIS(LIS),
 259         InstrChanges(std::move(InstrChanges)) {}
 260
 261   /// Performs the actual expansion.
 262   void expand();
 263   /// Performs final cleanup after expansion.
 264   void cleanup();
 265
 266   /// Returns the newly rewritten kernel block, or nullptr if this was
 267   /// optimized away.
 268   MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
 269 };
 270
 271 /// A reimplementation of ModuloScheduleExpander. It works by generating a
 272 /// standalone kernel loop and peeling out the prologs and epilogs.
 273 class PeelingModuloScheduleExpander {
 274   ModuloSchedule &Schedule;
 275   MachineFunction &MF;
 276   const TargetSubtargetInfo &ST;
 277   MachineRegisterInfo &MRI;
 278   const TargetInstrInfo *TII;
 279   LiveIntervals *LIS;
 280
 281   /// The original loop block that gets rewritten in-place.
 282   MachineBasicBlock *BB;
 283   /// The original loop preheader.
 284   MachineBasicBlock *Preheader;
 285   /// All prolog and epilog blocks.
 286   SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
 287   /// For every block, the stages that are produced.
 288   DenseMap<MachineBasicBlock *, BitVector> LiveStages;
 289   /// For every block, the stages that are available. A stage can be available
 290   /// but not produced (in the epilog) or produced but not available (in the
 291   /// prolog).
 292   DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
 293
 294   /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
 295   /// loop kernel clones.
 296   DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
 297   DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
 298       BlockMIs;
 299
 300   /// State passed from peelKernel to peelPrologAndEpilogs().
 301   std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
 302
 303 public:
 304   PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
 305                                 LiveIntervals *LIS)
 306       : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
 307         TII(ST.getInstrInfo()), LIS(LIS) {}
 308
 309   void expand();
 310
 311   /// Runs ModuloScheduleExpander and treats it as a golden input to validate
 312   /// aspects of the code generated by PeelingModuloScheduleExpander.
 313   void validateAgainstModuloScheduleExpander();
 314
 315 protected:
 316   /// Converts BB from the original loop body to the rewritten, pipelined
 317   /// steady-state.
 318   void rewriteKernel();
 319
 320 private:
 321   /// Peels one iteration of the rewritten kernel (BB) in the specified
 322   /// direction.
 323   MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
 324   /// Peel the kernel forwards and backwards to produce prologs and epilogs,
 325   /// and stitch them together.
 326   void peelPrologAndEpilogs();
 327   /// All prolog and epilog blocks are clones of the kernel, so any produced
 328   /// register in one block has an corollary in all other blocks.
 329   Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
 330   /// Change all users of MI, if MI is predicated out
 331   /// (LiveStages[MI->getParent()] == false).
 332   void rewriteUsesOf(MachineInstr *MI);
 333   /// Insert branches between prologs, kernel and epilogs.
 334   void fixupBranches();
 335   /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
 336   /// to a block dominated by all prologs and epilogs. This allows us to treat
 337   /// the loop exiting block as any other kernel clone.
 338   MachineBasicBlock *CreateLCSSAExitingBlock();
 339   /// Helper to get the stage of an instruction in the schedule.
 340   unsigned getStage(MachineInstr *MI) {
 341     if (CanonicalMIs.count(MI))
 342       MI = CanonicalMIs[MI];
 343     return Schedule.getStage(MI);
 344   }
 345 };
 346
 347 /// Expander that simply annotates each scheduled instruction with a post-instr
 348 /// symbol that can be consumed by the ModuloScheduleTest pass.
 349 ///
 350 /// The post-instr symbol is a way of annotating an instruction that can be
 351 /// roundtripped in MIR. The syntax is:
 352 ///   MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
 353 class ModuloScheduleTestAnnotater {
 354   MachineFunction &MF;
 355   ModuloSchedule &S;
 356
 357 public:
 358   ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
 359       : MF(MF), S(S) {}
 360
 361   /// Performs the annotation.
 362   void annotate();
 363 };
 364
 365 } // end namespace llvm
 366
 367 #endif // LLVM_LIB_CODEGEN_MODULOSCHEDULE_H