llvm/lib/CodeGen/WindowScheduler.cpp

   1 //======----------- WindowScheduler.cpp - window scheduler -------------======//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // An implementation of the Window Scheduling software pipelining algorithm.
  10 //
  11 // The fundamental concept of the window scheduling algorithm involves folding
  12 // the original MBB at a specific position, followed by list scheduling on the
  13 // folded MIs. The optimal scheduling result is then chosen from various folding
  14 // positions as the final scheduling outcome.
  15 //
  16 // The primary challenge in this algorithm lies in generating the folded MIs and
  17 // establishing their dependencies. We have innovatively employed a new MBB,
  18 // created by copying the original MBB three times, known as TripleMBB. This
  19 // TripleMBB enables the convenient implementation of MI folding and dependency
  20 // establishment. To facilitate the algorithm's implementation, we have also
  21 // devised data structures such as OriMIs, TriMIs, TriToOri, and OriToCycle.
  22 //
  23 // Another challenge in the algorithm is the scheduling of phis. Semantically,
  24 // it is difficult to place the phis in the window and perform list scheduling.
  25 // Therefore, we schedule these phis separately after each list scheduling.
  26 //
  27 // The provided implementation is designed for use before the Register Allocator
  28 // (RA). If the target requires implementation after RA, it is recommended to
  29 // reimplement analyseII(), schedulePhi(), and expand(). Additionally,
  30 // target-specific logic can be added in initialize(), preProcess(), and
  31 // postProcess().
  32 //
  33 // Lastly, it is worth mentioning that getSearchIndexes() is an important
  34 // function. We have experimented with more complex heuristics on downstream
  35 // target and achieved favorable results.
  36 //
  37 //===----------------------------------------------------------------------===//
  38 #include "llvm/CodeGen/WindowScheduler.h"
  39 #include "llvm/ADT/Statistic.h"
  40 #include "llvm/CodeGen/LiveIntervals.h"
  41 #include "llvm/CodeGen/MachineLoopInfo.h"
  42 #include "llvm/CodeGen/MachinePipeliner.h"
  43 #include "llvm/CodeGen/ModuloSchedule.h"
  44 #include "llvm/CodeGen/TargetPassConfig.h"
  45 #include "llvm/Support/CommandLine.h"
  46 #include "llvm/Support/Debug.h"
  47 #include "llvm/Support/TimeProfiler.h"
  48
  49 using namespace llvm;
  50
  51 #define DEBUG_TYPE "pipeliner"
  52
  53 namespace {
  54 STATISTIC(NumTryWindowSchedule,
  55           "Number of loops that we attempt to use window scheduling");
  56 STATISTIC(NumTryWindowSearch,
  57           "Number of times that we run list schedule in the window scheduling");
  58 STATISTIC(NumWindowSchedule,
  59           "Number of loops that we successfully use window scheduling");
  60 STATISTIC(NumFailAnalyseII,
  61           "Window scheduling abort due to the failure of the II analysis");
  62
  63 cl::opt<unsigned>
  64     WindowSearchNum("window-search-num",
  65                     cl::desc("The number of searches per loop in the window "
  66                              "algorithm. 0 means no search number limit."),
  67                     cl::Hidden, cl::init(6));
  68
  69 cl::opt<unsigned> WindowSearchRatio(
  70     "window-search-ratio",
  71     cl::desc("The ratio of searches per loop in the window algorithm. 100 "
  72              "means search all positions in the loop, while 0 means not "
  73              "performing any search."),
  74     cl::Hidden, cl::init(40));
  75
  76 cl::opt<unsigned> WindowIICoeff(
  77     "window-ii-coeff",
  78     cl::desc(
  79         "The coefficient used when initializing II in the window algorithm."),
  80     cl::Hidden, cl::init(5));
  81
  82 cl::opt<unsigned> WindowRegionLimit(
  83     "window-region-limit",
  84     cl::desc(
  85         "The lower limit of the scheduling region in the window algorithm."),
  86     cl::Hidden, cl::init(3));
  87
  88 cl::opt<unsigned> WindowDiffLimit(
  89     "window-diff-limit",
  90     cl::desc("The lower limit of the difference between best II and base II in "
  91              "the window algorithm. If the difference is smaller than "
  92              "this lower limit, window scheduling will not be performed."),
  93     cl::Hidden, cl::init(2));
  94 } // namespace
  95
  96 // WindowIILimit serves as an indicator of abnormal scheduling results and could
  97 // potentially be referenced by the derived target window scheduler.
  98 cl::opt<unsigned>
  99     WindowIILimit("window-ii-limit",
 100                   cl::desc("The upper limit of II in the window algorithm."),
 101                   cl::Hidden, cl::init(1000));
 102
 103 WindowScheduler::WindowScheduler(MachineSchedContext *C, MachineLoop &ML)
 104     : Context(C), MF(C->MF), MBB(ML.getHeader()), Loop(ML),
 105       Subtarget(&MF->getSubtarget()), TII(Subtarget->getInstrInfo()),
 106       TRI(Subtarget->getRegisterInfo()), MRI(&MF->getRegInfo()) {
 107   TripleDAG = std::unique_ptr<ScheduleDAGInstrs>(
 108       createMachineScheduler(/*OnlyBuildGraph=*/true));
 109 }
 110
 111 bool WindowScheduler::run() {
 112   if (!initialize()) {
 113     LLVM_DEBUG(dbgs() << "The WindowScheduler failed to initialize!\n");
 114     return false;
 115   }
 116   // The window algorithm is time-consuming, and its compilation time should be
 117   // taken into consideration.
 118   TimeTraceScope Scope("WindowSearch");
 119   ++NumTryWindowSchedule;
 120   // Performing the relevant processing before window scheduling.
 121   preProcess();
 122   // The main window scheduling begins.
 123   std::unique_ptr<ScheduleDAGInstrs> SchedDAG(createMachineScheduler());
 124   auto SearchIndexes = getSearchIndexes(WindowSearchNum, WindowSearchRatio);
 125   for (unsigned Idx : SearchIndexes) {
 126     OriToCycle.clear();
 127     ++NumTryWindowSearch;
 128     // The scheduling starts with non-phi instruction, so SchedPhiNum needs to
 129     // be added to Idx.
 130     unsigned Offset = Idx + SchedPhiNum;
 131     auto Range = getScheduleRange(Offset, SchedInstrNum);
 132     SchedDAG->startBlock(MBB);
 133     SchedDAG->enterRegion(MBB, Range.begin(), Range.end(), SchedInstrNum);
 134     SchedDAG->schedule();
 135     LLVM_DEBUG(SchedDAG->dump());
 136     unsigned II = analyseII(*SchedDAG, Offset);
 137     if (II == WindowIILimit) {
 138       restoreTripleMBB();
 139       LLVM_DEBUG(dbgs() << "Can't find a valid II. Keep searching...\n");
 140       ++NumFailAnalyseII;
 141       continue;
 142     }
 143     schedulePhi(Offset, II);
 144     updateScheduleResult(Offset, II);
 145     restoreTripleMBB();
 146     LLVM_DEBUG(dbgs() << "Current window Offset is " << Offset << " and II is "
 147                       << II << ".\n");
 148   }
 149   // Performing the relevant processing after window scheduling.
 150   postProcess();
 151   // Check whether the scheduling result is valid.
 152   if (!isScheduleValid()) {
 153     LLVM_DEBUG(dbgs() << "Window scheduling is not needed!\n");
 154     return false;
 155   }
 156   LLVM_DEBUG(dbgs() << "\nBest window offset is " << BestOffset
 157                     << " and Best II is " << BestII << ".\n");
 158   // Expand the scheduling result to prologue, kernel, and epilogue.
 159   expand();
 160   ++NumWindowSchedule;
 161   return true;
 162 }
 163
 164 ScheduleDAGInstrs *
 165 WindowScheduler::createMachineScheduler(bool OnlyBuildGraph) {
 166   return OnlyBuildGraph
 167              ? new ScheduleDAGMI(
 168                    Context, std::make_unique<PostGenericScheduler>(Context),
 169                    true)
 170              : Context->PassConfig->createMachineScheduler(Context);
 171 }
 172
 173 bool WindowScheduler::initialize() {
 174   if (!Subtarget->enableWindowScheduler()) {
 175     LLVM_DEBUG(dbgs() << "Target disables the window scheduling!\n");
 176     return false;
 177   }
 178   // Initialized the member variables used by window algorithm.
 179   OriMIs.clear();
 180   TriMIs.clear();
 181   TriToOri.clear();
 182   OriToCycle.clear();
 183   SchedResult.clear();
 184   SchedPhiNum = 0;
 185   SchedInstrNum = 0;
 186   BestII = UINT_MAX;
 187   BestOffset = 0;
 188   BaseII = 0;
 189   // List scheduling used in the window algorithm depends on LiveIntervals.
 190   if (!Context->LIS) {
 191     LLVM_DEBUG(dbgs() << "There is no LiveIntervals information!\n");
 192     return false;
 193   }
 194   // Check each MI in MBB.
 195   SmallSet<Register, 8> PrevDefs;
 196   SmallSet<Register, 8> PrevUses;
 197   auto IsLoopCarried = [&](MachineInstr &Phi) {
 198     // Two cases are checked here: (1)The virtual register defined by the
 199     // preceding phi is used by the succeeding phi;(2)The preceding phi uses the
 200     // virtual register defined by the succeeding phi.
 201     if (PrevUses.count(Phi.getOperand(0).getReg()))
 202       return true;
 203     PrevDefs.insert(Phi.getOperand(0).getReg());
 204     for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
 205       if (PrevDefs.count(Phi.getOperand(I).getReg()))
 206         return true;
 207       PrevUses.insert(Phi.getOperand(I).getReg());
 208     }
 209     return false;
 210   };
 211   auto PLI = TII->analyzeLoopForPipelining(MBB);
 212   for (auto &MI : *MBB) {
 213     if (MI.isMetaInstruction() || MI.isTerminator())
 214       continue;
 215     if (MI.isPHI()) {
 216       if (IsLoopCarried(MI)) {
 217         LLVM_DEBUG(dbgs() << "Loop carried phis are not supported yet!\n");
 218         return false;
 219       }
 220       ++SchedPhiNum;
 221       ++BestOffset;
 222     } else
 223       ++SchedInstrNum;
 224     if (TII->isSchedulingBoundary(MI, MBB, *MF)) {
 225       LLVM_DEBUG(
 226           dbgs() << "Boundary MI is not allowed in window scheduling!\n");
 227       return false;
 228     }
 229     if (PLI->shouldIgnoreForPipelining(&MI)) {
 230       LLVM_DEBUG(dbgs() << "Special MI defined by target is not allowed in "
 231                            "window scheduling!\n");
 232       return false;
 233     }
 234     for (auto &Def : MI.all_defs())
 235       if (Def.isReg() && Def.getReg().isPhysical()) {
 236         LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
 237                              "window scheduling!\n");
 238         return false;
 239       }
 240   }
 241   if (SchedInstrNum <= WindowRegionLimit) {
 242     LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
 243     return false;
 244   }
 245   return true;
 246 }
 247
 248 void WindowScheduler::preProcess() {
 249   // Prior to window scheduling, it's necessary to backup the original MBB,
 250   // generate a new TripleMBB, and build a TripleDAG based on the TripleMBB.
 251   backupMBB();
 252   generateTripleMBB();
 253   TripleDAG->startBlock(MBB);
 254   TripleDAG->enterRegion(
 255       MBB, MBB->begin(), MBB->getFirstTerminator(),
 256       std::distance(MBB->begin(), MBB->getFirstTerminator()));
 257   TripleDAG->buildSchedGraph(Context->AA);
 258 }
 259
 260 void WindowScheduler::postProcess() {
 261   // After window scheduling, it's necessary to clear the TripleDAG and restore
 262   // to the original MBB.
 263   TripleDAG->exitRegion();
 264   TripleDAG->finishBlock();
 265   restoreMBB();
 266 }
 267
 268 void WindowScheduler::backupMBB() {
 269   for (auto &MI : MBB->instrs())
 270     OriMIs.push_back(&MI);
 271   // Remove MIs and the corresponding live intervals.
 272   for (auto &MI : make_early_inc_range(*MBB)) {
 273     Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
 274     MBB->remove(&MI);
 275   }
 276 }
 277
 278 void WindowScheduler::restoreMBB() {
 279   // Erase MIs and the corresponding live intervals.
 280   for (auto &MI : make_early_inc_range(*MBB)) {
 281     Context->LIS->getSlotIndexes()->removeMachineInstrFromMaps(MI, true);
 282     MI.eraseFromParent();
 283   }
 284   // Restore MBB to the state before window scheduling.
 285   for (auto *MI : OriMIs)
 286     MBB->push_back(MI);
 287   updateLiveIntervals();
 288 }
 289
 290 void WindowScheduler::generateTripleMBB() {
 291   const unsigned DuplicateNum = 3;
 292   TriMIs.clear();
 293   TriToOri.clear();
 294   assert(OriMIs.size() > 0 && "The Original MIs were not backed up!");
 295   // Step 1: Performing the first copy of MBB instructions, excluding
 296   // terminators. At the same time, we back up the anti-register of phis.
 297   // DefPairs hold the old and new define register pairs.
 298   DenseMap<Register, Register> DefPairs;
 299   for (auto *MI : OriMIs) {
 300     if (MI->isMetaInstruction() || MI->isTerminator())
 301       continue;
 302     if (MI->isPHI())
 303       if (Register AntiReg = getAntiRegister(MI))
 304         DefPairs[MI->getOperand(0).getReg()] = AntiReg;
 305     auto *NewMI = MF->CloneMachineInstr(MI);
 306     MBB->push_back(NewMI);
 307     TriMIs.push_back(NewMI);
 308     TriToOri[NewMI] = MI;
 309   }
 310   // Step 2: Performing the remaining two copies of MBB instructions excluding
 311   // phis, and the last one contains terminators. At the same time, registers
 312   // are updated accordingly.
 313   for (size_t Cnt = 1; Cnt < DuplicateNum; ++Cnt) {
 314     for (auto *MI : OriMIs) {
 315       if (MI->isPHI() || MI->isMetaInstruction() ||
 316           (MI->isTerminator() && Cnt < DuplicateNum - 1))
 317         continue;
 318       auto *NewMI = MF->CloneMachineInstr(MI);
 319       DenseMap<Register, Register> NewDefs;
 320       // New defines are updated.
 321       for (auto MO : NewMI->all_defs())
 322         if (MO.isReg() && MO.getReg().isVirtual()) {
 323           Register NewDef =
 324               MRI->createVirtualRegister(MRI->getRegClass(MO.getReg()));
 325           NewMI->substituteRegister(MO.getReg(), NewDef, 0, *TRI);
 326           NewDefs[MO.getReg()] = NewDef;
 327         }
 328       // New uses are updated.
 329       for (auto DefRegPair : DefPairs)
 330         if (NewMI->readsRegister(DefRegPair.first, TRI)) {
 331           Register NewUse = DefRegPair.second;
 332           // Note the update process for '%1 -> %9' in '%10 = sub i32 %9, %3':
 333           //
 334           // BB.3:                                  DefPairs
 335           // ==================================
 336           // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]  (%1,%7)
 337           // ...
 338           // ==================================
 339           // ...
 340           // %4 = sub i32 %1, %3
 341           // ...
 342           // %7 = add i32 %5, %6
 343           // ...
 344           // ----------------------------------
 345           // ...
 346           // %8 = sub i32 %7, %3                    (%1,%7),(%4,%8)
 347           // ...
 348           // %9 = add i32 %5, %6                    (%1,%7),(%4,%8),(%7,%9)
 349           // ...
 350           // ----------------------------------
 351           // ...
 352           // %10 = sub i32 %9, %3                   (%1,%7),(%4,%10),(%7,%9)
 353           // ...            ^
 354           // %11 = add i32 %5, %6                   (%1,%7),(%4,%10),(%7,%11)
 355           // ...
 356           // ==================================
 357           //          < Terminators >
 358           // ==================================
 359           if (DefPairs.count(NewUse))
 360             NewUse = DefPairs[NewUse];
 361           NewMI->substituteRegister(DefRegPair.first, NewUse, 0, *TRI);
 362         }
 363       // DefPairs is updated at last.
 364       for (auto &NewDef : NewDefs)
 365         DefPairs[NewDef.first] = NewDef.second;
 366       MBB->push_back(NewMI);
 367       TriMIs.push_back(NewMI);
 368       TriToOri[NewMI] = MI;
 369     }
 370   }
 371   // Step 3: The registers used by phis are updated, and they are generated in
 372   // the third copy of MBB.
 373   // In the privious example, the old phi is:
 374   // %1 = phi i32 [%2, %BB.1], [%7, %BB.3]
 375   // The new phi is:
 376   // %1 = phi i32 [%2, %BB.1], [%11, %BB.3]
 377   for (auto &Phi : MBB->phis()) {
 378     for (auto DefRegPair : DefPairs)
 379       if (Phi.readsRegister(DefRegPair.first, TRI))
 380         Phi.substituteRegister(DefRegPair.first, DefRegPair.second, 0, *TRI);
 381   }
 382   updateLiveIntervals();
 383 }
 384
 385 void WindowScheduler::restoreTripleMBB() {
 386   // After list scheduling, the MBB is restored in one traversal.
 387   for (size_t I = 0; I < TriMIs.size(); ++I) {
 388     auto *MI = TriMIs[I];
 389     auto OldPos = MBB->begin();
 390     std::advance(OldPos, I);
 391     auto CurPos = MI->getIterator();
 392     if (CurPos != OldPos) {
 393       MBB->splice(OldPos, MBB, CurPos);
 394       Context->LIS->handleMove(*MI, /*UpdateFlags=*/false);
 395     }
 396   }
 397 }
 398
 399 SmallVector<unsigned> WindowScheduler::getSearchIndexes(unsigned SearchNum,
 400                                                         unsigned SearchRatio) {
 401   // We use SearchRatio to get the index range, and then evenly get the indexes
 402   // according to the SearchNum. This is a simple huristic. Depending on the
 403   // characteristics of the target, more complex algorithms can be used for both
 404   // performance and compilation time.
 405   assert(SearchRatio <= 100 && "SearchRatio should be equal or less than 100!");
 406   unsigned MaxIdx = SchedInstrNum * SearchRatio / 100;
 407   unsigned Step = SearchNum > 0 && SearchNum <= MaxIdx ? MaxIdx / SearchNum : 1;
 408   SmallVector<unsigned> SearchIndexes;
 409   for (unsigned Idx = 0; Idx < MaxIdx; Idx += Step)
 410     SearchIndexes.push_back(Idx);
 411   return SearchIndexes;
 412 }
 413
 414 int WindowScheduler::getEstimatedII(ScheduleDAGInstrs &DAG) {
 415   // Sometimes MaxDepth is 0, so it should be limited to the minimum of 1.
 416   unsigned MaxDepth = 1;
 417   for (auto &SU : DAG.SUnits)
 418     MaxDepth = std::max(SU.getDepth() + SU.Latency, MaxDepth);
 419   return MaxDepth * WindowIICoeff;
 420 }
 421
 422 int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
 423                                        unsigned Offset) {
 424   int InitII = getEstimatedII(DAG);
 425   ResourceManager RM(Subtarget, &DAG);
 426   RM.init(InitII);
 427   // ResourceManager and DAG are used to calculate the maximum cycle for the
 428   // scheduled MIs. Since MIs in the Region have already been scheduled, the
 429   // emit cycles can be estimated in order here.
 430   int CurCycle = 0;
 431   auto Range = getScheduleRange(Offset, SchedInstrNum);
 432   for (auto &MI : Range) {
 433     auto *SU = DAG.getSUnit(&MI);
 434     int ExpectCycle = CurCycle;
 435     // The predecessors of current MI determine its earliest issue cycle.
 436     for (auto &Pred : SU->Preds) {
 437       if (Pred.isWeak())
 438         continue;
 439       auto *PredMI = Pred.getSUnit()->getInstr();
 440       int PredCycle = getOriCycle(PredMI);
 441       ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
 442     }
 443     // Zero cost instructions do not need to check resource.
 444     if (!TII->isZeroCost(MI.getOpcode())) {
 445       // ResourceManager can be used to detect resource conflicts between the
 446       // current MI and the previously inserted MIs.
 447       while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
 448         ++CurCycle;
 449         if (CurCycle == (int)WindowIILimit)
 450           return CurCycle;
 451       }
 452       RM.reserveResources(*SU, CurCycle);
 453     }
 454     OriToCycle[getOriMI(&MI)] = CurCycle;
 455     LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
 456                       << getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
 457   }
 458   LLVM_DEBUG(dbgs() << "MaxCycle is " << CurCycle << ".\n");
 459   return CurCycle;
 460 }
 461
 462 // By utilizing TripleDAG, we can easily establish dependencies between A and B.
 463 // Based on the MaxCycle and the issue cycle of A and B, we can determine
 464 // whether it is necessary to add a stall cycle. This is because, without
 465 // inserting the stall cycle, the latency constraint between A and B cannot be
 466 // satisfied. The details are as follows:
 467 //
 468 // New MBB:
 469 // ========================================
 470 //                 < Phis >
 471 // ========================================     (sliding direction)
 472 // MBB copy 1                                            |
 473 //                                                       V
 474 //
 475 // ~~~~~~~~~~~~~~~~~~~|~~~~~~~~~~~~~~~~~~~~  ----schedule window-----
 476 //                    |                                  |
 477 // ===================V====================              |
 478 // MBB copy 2      < MI B >                              |
 479 //                                                       |
 480 //                 < MI A >                              V
 481 // ~~~~~~~~~~~~~~~~~~~:~~~~~~~~~~~~~~~~~~~~  ------------------------
 482 //                    :
 483 // ===================V====================
 484 // MBB copy 3      < MI B'>
 485 //
 486 //
 487 //
 488 //
 489 // ========================================
 490 //              < Terminators >
 491 // ========================================
 492 int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
 493   int MaxStallCycle = 0;
 494   int CurrentII = MaxCycle + 1;
 495   auto Range = getScheduleRange(Offset, SchedInstrNum);
 496   for (auto &MI : Range) {
 497     auto *SU = TripleDAG->getSUnit(&MI);
 498     int DefCycle = getOriCycle(&MI);
 499     for (auto &Succ : SU->Succs) {
 500       if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
 501         continue;
 502       // If the expected cycle does not exceed CurrentII, no check is needed.
 503       if (DefCycle + (int)Succ.getLatency() <= CurrentII)
 504         continue;
 505       // If the cycle of the scheduled MI A is less than that of the scheduled
 506       // MI B, the scheduling will fail because the lifetime of the
 507       // corresponding register exceeds II.
 508       auto *SuccMI = Succ.getSUnit()->getInstr();
 509       int UseCycle = getOriCycle(SuccMI);
 510       if (DefCycle < UseCycle)
 511         return WindowIILimit;
 512       // Get the stall cycle introduced by the register between two trips.
 513       int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
 514       MaxStallCycle = std::max(MaxStallCycle, StallCycle);
 515     }
 516   }
 517   LLVM_DEBUG(dbgs() << "MaxStallCycle is " << MaxStallCycle << ".\n");
 518   return MaxStallCycle;
 519 }
 520
 521 unsigned WindowScheduler::analyseII(ScheduleDAGInstrs &DAG, unsigned Offset) {
 522   LLVM_DEBUG(dbgs() << "Start analyzing II:\n");
 523   int MaxCycle = calculateMaxCycle(DAG, Offset);
 524   if (MaxCycle == (int)WindowIILimit)
 525     return MaxCycle;
 526   int StallCycle = calculateStallCycle(Offset, MaxCycle);
 527   if (StallCycle == (int)WindowIILimit)
 528     return StallCycle;
 529   // The value of II is equal to the maximum execution cycle plus 1.
 530   return MaxCycle + StallCycle + 1;
 531 }
 532
 533 void WindowScheduler::schedulePhi(int Offset, unsigned &II) {
 534   LLVM_DEBUG(dbgs() << "Start scheduling Phis:\n");
 535   for (auto &Phi : MBB->phis()) {
 536     int LateCycle = INT_MAX;
 537     auto *SU = TripleDAG->getSUnit(&Phi);
 538     for (auto &Succ : SU->Succs) {
 539       // Phi doesn't have any Anti successors.
 540       if (Succ.getKind() != SDep::Data)
 541         continue;
 542       // Phi is scheduled before the successor of stage 0. The issue cycle of
 543       // phi is the latest cycle in this interval.
 544       auto *SuccMI = Succ.getSUnit()->getInstr();
 545       int Cycle = getOriCycle(SuccMI);
 546       if (getOriStage(getOriMI(SuccMI), Offset) == 0)
 547         LateCycle = std::min(LateCycle, Cycle);
 548     }
 549     // The anti-dependency of phi need to be handled separately in the same way.
 550     if (Register AntiReg = getAntiRegister(&Phi)) {
 551       auto *AntiMI = MRI->getVRegDef(AntiReg);
 552       // AntiReg may be defined outside the kernel MBB.
 553       if (AntiMI->getParent() == MBB) {
 554         auto AntiCycle = getOriCycle(AntiMI);
 555         if (getOriStage(getOriMI(AntiMI), Offset) == 0)
 556           LateCycle = std::min(LateCycle, AntiCycle);
 557       }
 558     }
 559     // If there is no limit to the late cycle, a default value is given.
 560     if (LateCycle == INT_MAX)
 561       LateCycle = (int)(II - 1);
 562     LLVM_DEBUG(dbgs() << "\tCycle range [0, " << LateCycle << "] " << Phi);
 563     // The issue cycle of phi is set to the latest cycle in the interval.
 564     auto *OriPhi = getOriMI(&Phi);
 565     OriToCycle[OriPhi] = LateCycle;
 566   }
 567 }
 568
 569 DenseMap<MachineInstr *, int> WindowScheduler::getIssueOrder(unsigned Offset,
 570                                                              unsigned II) {
 571   // At each issue cycle, phi is placed before MIs in stage 0. So the simplest
 572   // way is to put phi at the beginning of the current cycle.
 573   DenseMap<int, SmallVector<MachineInstr *>> CycleToMIs;
 574   auto Range = getScheduleRange(Offset, SchedInstrNum);
 575   for (auto &Phi : MBB->phis())
 576     CycleToMIs[getOriCycle(&Phi)].push_back(getOriMI(&Phi));
 577   for (auto &MI : Range)
 578     CycleToMIs[getOriCycle(&MI)].push_back(getOriMI(&MI));
 579   // Each MI is assigned a separate ordered Id, which is used as a sort marker
 580   // in the following expand process.
 581   DenseMap<MachineInstr *, int> IssueOrder;
 582   int Id = 0;
 583   for (int Cycle = 0; Cycle < (int)II; ++Cycle) {
 584     if (!CycleToMIs.count(Cycle))
 585       continue;
 586     for (auto *MI : CycleToMIs[Cycle])
 587       IssueOrder[MI] = Id++;
 588   }
 589   return IssueOrder;
 590 }
 591
 592 void WindowScheduler::updateScheduleResult(unsigned Offset, unsigned II) {
 593   // At the first update, Offset is equal to SchedPhiNum. At this time, only
 594   // BestII, BestOffset, and BaseII need to be updated.
 595   if (Offset == SchedPhiNum) {
 596     BestII = II;
 597     BestOffset = SchedPhiNum;
 598     BaseII = II;
 599     return;
 600   }
 601   // The update will only continue if the II is smaller than BestII and the II
 602   // is sufficiently small.
 603   if ((II >= BestII) || (II + WindowDiffLimit > BaseII))
 604     return;
 605   BestII = II;
 606   BestOffset = Offset;
 607   // Record the result of the current list scheduling, noting that each MI is
 608   // stored unordered in SchedResult.
 609   SchedResult.clear();
 610   auto IssueOrder = getIssueOrder(Offset, II);
 611   for (auto &Pair : OriToCycle) {
 612     assert(IssueOrder.count(Pair.first) && "Cannot find original MI!");
 613     SchedResult.push_back(std::make_tuple(Pair.first, Pair.second,
 614                                           getOriStage(Pair.first, Offset),
 615                                           IssueOrder[Pair.first]));
 616   }
 617 }
 618
 619 void WindowScheduler::expand() {
 620   // The MIs in the SchedResult are sorted by the issue order ID.
 621   llvm::stable_sort(SchedResult,
 622                     [](const std::tuple<MachineInstr *, int, int, int> &A,
 623                        const std::tuple<MachineInstr *, int, int, int> &B) {
 624                       return std::get<3>(A) < std::get<3>(B);
 625                     });
 626   // Use the scheduling infrastructure for expansion, noting that InstrChanges
 627   // is not supported here.
 628   DenseMap<MachineInstr *, int> Cycles, Stages;
 629   std::vector<MachineInstr *> OrderedInsts;
 630   for (auto &Info : SchedResult) {
 631     auto *MI = std::get<0>(Info);
 632     OrderedInsts.push_back(MI);
 633     Cycles[MI] = std::get<1>(Info);
 634     Stages[MI] = std::get<2>(Info);
 635     LLVM_DEBUG(dbgs() << "\tCycle " << Cycles[MI] << " [S." << Stages[MI]
 636                       << "]: " << *MI);
 637   }
 638   ModuloSchedule MS(*MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
 639                     std::move(Stages));
 640   ModuloScheduleExpander MSE(*MF, MS, *Context->LIS,
 641                              ModuloScheduleExpander::InstrChangesTy());
 642   MSE.expand();
 643   MSE.cleanup();
 644 }
 645
 646 void WindowScheduler::updateLiveIntervals() {
 647   SmallVector<Register, 128> UsedRegs;
 648   for (MachineInstr &MI : *MBB)
 649     for (const MachineOperand &MO : MI.operands()) {
 650       if (!MO.isReg() || MO.getReg() == 0)
 651         continue;
 652       Register Reg = MO.getReg();
 653       if (!is_contained(UsedRegs, Reg))
 654         UsedRegs.push_back(Reg);
 655     }
 656   Context->LIS->repairIntervalsInRange(MBB, MBB->begin(), MBB->end(), UsedRegs);
 657 }
 658
 659 iterator_range<MachineBasicBlock::iterator>
 660 WindowScheduler::getScheduleRange(unsigned Offset, unsigned Num) {
 661   auto RegionBegin = MBB->begin();
 662   std::advance(RegionBegin, Offset);
 663   auto RegionEnd = RegionBegin;
 664   std::advance(RegionEnd, Num);
 665   return make_range(RegionBegin, RegionEnd);
 666 }
 667
 668 int WindowScheduler::getOriCycle(MachineInstr *NewMI) {
 669   assert(TriToOri.count(NewMI) && "Cannot find original MI!");
 670   auto *OriMI = TriToOri[NewMI];
 671   assert(OriToCycle.count(OriMI) && "Cannot find schedule cycle!");
 672   return OriToCycle[OriMI];
 673 }
 674
 675 MachineInstr *WindowScheduler::getOriMI(MachineInstr *NewMI) {
 676   assert(TriToOri.count(NewMI) && "Cannot find original MI!");
 677   return TriToOri[NewMI];
 678 }
 679
 680 unsigned WindowScheduler::getOriStage(MachineInstr *OriMI, unsigned Offset) {
 681   assert(llvm::find(OriMIs, OriMI) != OriMIs.end() &&
 682          "Cannot find OriMI in OriMIs!");
 683   // If there is no instruction fold, all MI stages are 0.
 684   if (Offset == SchedPhiNum)
 685     return 0;
 686   // For those MIs with an ID less than the Offset, their stages are set to 0,
 687   // while the rest are set to 1.
 688   unsigned Id = 0;
 689   for (auto *MI : OriMIs) {
 690     if (MI->isMetaInstruction())
 691       continue;
 692     if (MI == OriMI)
 693       break;
 694     ++Id;
 695   }
 696   return Id >= (size_t)Offset ? 1 : 0;
 697 }
 698
 699 Register WindowScheduler::getAntiRegister(MachineInstr *Phi) {
 700   assert(Phi->isPHI() && "Expecting PHI!");
 701   Register AntiReg;
 702   for (auto MO : Phi->uses()) {
 703     if (MO.isReg())
 704       AntiReg = MO.getReg();
 705     else if (MO.isMBB() && MO.getMBB() == MBB)
 706       return AntiReg;
 707   }
 708   return 0;
 709 }