1 //===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "llvm/CodeGen/ModuloSchedule.h"
10 #include "llvm/ADT/StringExtras.h"
11 #include "llvm/Analysis/MemoryLocation.h"
12 #include "llvm/CodeGen/LiveIntervals.h"
13 #include "llvm/CodeGen/MachineInstrBuilder.h"
14 #include "llvm/CodeGen/MachineLoopInfo.h"
15 #include "llvm/CodeGen/MachineRegisterInfo.h"
16 #include "llvm/InitializePasses.h"
17 #include "llvm/MC/MCContext.h"
18 #include "llvm/Support/Debug.h"
19 #include "llvm/Support/ErrorHandling.h"
20 #include "llvm/Support/raw_ostream.h"
22 #define DEBUG_TYPE "pipeliner"
23 using namespace llvm;
25 static cl::opt<bool> SwapBranchTargetsMVE(
26 "pipeliner-swap-branch-targets-mve", cl::Hidden, cl::init(false),
27 cl::desc("Swap target blocks of a conditional branch for MVE expander"));
29 void ModuloSchedule::print(raw_ostream &OS) {
30 for (MachineInstr *MI : ScheduledInstrs)
31 OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
34 //===----------------------------------------------------------------------===//
35 // ModuloScheduleExpander implementation
36 //===----------------------------------------------------------------------===//
38 /// Return the register values for the operands of a Phi instruction.
39 /// This function assumes the instruction is a Phi.
40 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
41 unsigned &InitVal, unsigned &LoopVal) {
42 assert(Phi.isPHI() && "Expecting a Phi.");
44 InitVal = 0;
45 LoopVal = 0;
46 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
47 if (Phi.getOperand(i + 1).getMBB() != Loop)
48 InitVal = Phi.getOperand(i).getReg();
49 else
50 LoopVal = Phi.getOperand(i).getReg();
52 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
55 /// Return the Phi register value that comes from the incoming block.
56 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
57 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
58 if (Phi.getOperand(i + 1).getMBB() != LoopBB)
59 return Phi.getOperand(i).getReg();
60 return 0;
63 /// Return the Phi register value that comes from the loop block.
64 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
65 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
66 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
67 return Phi.getOperand(i).getReg();
68 return 0;
71 void ModuloScheduleExpander::expand() {
72 BB = Schedule.getLoop()->getTopBlock();
73 Preheader = *BB->pred_begin();
74 if (Preheader == BB)
75 Preheader = *std::next(BB->pred_begin());
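// The loop header has two predecessors: the preheader and the loop block
// itself (the latch edge). The code above picks the one that is not the
// loop block.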
77 // Iterate over the definitions in each instruction, and compute the
78 // stage difference for each use. Keep the maximum value.
79 for (MachineInstr *MI : Schedule.getInstructions()) {
80 int DefStage = Schedule.getStage(MI);
81 for (const MachineOperand &Op : MI->all_defs()) {
82 Register Reg = Op.getReg();
83 unsigned MaxDiff = 0;
84 bool PhiIsSwapped = false;
85 for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
86 MachineInstr *UseMI = UseOp.getParent();
87 int UseStage = Schedule.getStage(UseMI);
88 unsigned Diff = 0;
89 if (UseStage != -1 && UseStage >= DefStage)
90 Diff = UseStage - DefStage;
91 if (MI->isPHI()) {
92 if (isLoopCarried(*MI))
93 ++Diff;
94 else
95 PhiIsSwapped = true;
97 MaxDiff = std::max(Diff, MaxDiff);
99 RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
103 generatePipelinedLoop();
106 void ModuloScheduleExpander::generatePipelinedLoop() {
107 LoopInfo = TII->analyzeLoopForPipelining(BB);
108 assert(LoopInfo && "Must be able to analyze loop!");
110 // Create a new basic block for the kernel and add it to the CFG.
111 MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
113 unsigned MaxStageCount = Schedule.getNumStages() - 1;
115 // Remember the registers that are used in different stages. The index is
116 // the iteration, or stage, that the instruction is scheduled in. This is
117 // a map between register names in the original block and the names created
118 // in each stage of the pipelined loop.
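// The epilog stages index these maps with stage numbers up to twice the
// maximum stage (see generateEpilog), hence the doubled allocation below.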
119 ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
121 // The renaming destinations created by Phis for registers across stages.
122 // This map is updated during Phi generation to point to the most recent
123 // renaming destination.
124 ValueMapTy *VRMapPhi = new ValueMapTy[(MaxStageCount + 1) * 2];
126 InstrMapTy InstrMap;
128 SmallVector<MachineBasicBlock *, 4> PrologBBs;
130 // Generate the prolog instructions that set up the pipeline.
131 generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
132 MF.insert(BB->getIterator(), KernelBB);
133 LIS.insertMBBInMaps(KernelBB);
135 // Rearrange the instructions to generate the new, pipelined loop,
136 // and update register names as needed.
137 for (MachineInstr *CI : Schedule.getInstructions()) {
138 if (CI->isPHI())
139 continue;
140 unsigned StageNum = Schedule.getStage(CI);
141 MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
142 updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
143 KernelBB->push_back(NewMI);
144 InstrMap[NewMI] = CI;
147 // Copy any terminator instructions to the new kernel, and update
148 // names as needed.
149 for (MachineInstr &MI : BB->terminators()) {
150 MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
151 updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
152 KernelBB->push_back(NewMI);
153 InstrMap[NewMI] = &MI;
156 NewKernel = KernelBB;
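// The kernel inherits the original loop's successors; redirecting the back
// edge from BB to KernelBB makes the new kernel block loop on itself.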
157 KernelBB->transferSuccessors(BB);
158 KernelBB->replaceSuccessor(BB, KernelBB);
160 generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
161 InstrMap, MaxStageCount, MaxStageCount, false);
162 generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, VRMapPhi,
163 InstrMap, MaxStageCount, MaxStageCount, false);
165 LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
167 SmallVector<MachineBasicBlock *, 4> EpilogBBs;
168 // Generate the epilog instructions to complete the pipeline.
169 generateEpilog(MaxStageCount, KernelBB, BB, VRMap, VRMapPhi, EpilogBBs,
170 PrologBBs);
172 // We need this step because the register allocation doesn't handle some
173 // situations well, so we insert copies to help out.
174 splitLifetimes(KernelBB, EpilogBBs);
176 // Remove dead instructions due to loop induction variables.
177 removeDeadInstructions(KernelBB, EpilogBBs);
179 // Add branches between prolog and epilog blocks.
180 addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
182 delete[] VRMap;
183 delete[] VRMapPhi;
186 void ModuloScheduleExpander::cleanup() {
187 // Remove the original loop since it's no longer referenced.
188 for (auto &I : *BB)
189 LIS.RemoveMachineInstrFromMaps(I);
190 BB->clear();
191 BB->eraseFromParent();
194 /// Generate the pipeline prolog code.
195 void ModuloScheduleExpander::generateProlog(unsigned LastStage,
196 MachineBasicBlock *KernelBB,
197 ValueMapTy *VRMap,
198 MBBVectorTy &PrologBBs) {
199 MachineBasicBlock *PredBB = Preheader;
200 InstrMapTy InstrMap;
202 // Generate a basic block for each stage, not including the last stage,
203 // which will be generated in the kernel. Each basic block may contain
204 // instructions from multiple stages/iterations.
205 for (unsigned i = 0; i < LastStage; ++i) {
206 // Create and insert the prolog basic block prior to the original loop
207 // basic block. The original loop is removed later.
208 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
209 PrologBBs.push_back(NewBB);
210 MF.insert(BB->getIterator(), NewBB);
211 NewBB->transferSuccessors(PredBB);
212 PredBB->addSuccessor(NewBB);
213 PredBB = NewBB;
214 LIS.insertMBBInMaps(NewBB);
216 // Generate instructions for each appropriate stage. Process instructions
217 // in original program order.
218 for (int StageNum = i; StageNum >= 0; --StageNum) {
219 for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
220 BBE = BB->getFirstTerminator();
221 BBI != BBE; ++BBI) {
222 if (Schedule.getStage(&*BBI) == StageNum) {
223 if (BBI->isPHI())
224 continue;
225 MachineInstr *NewMI =
226 cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
227 updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
228 NewBB->push_back(NewMI);
229 InstrMap[NewMI] = &*BBI;
233 rewritePhiValues(NewBB, i, VRMap, InstrMap);
234 LLVM_DEBUG({
235 dbgs() << "prolog:\n";
236 NewBB->dump();
240 PredBB->replaceSuccessor(BB, KernelBB);
242 // Check if we need to remove the branch from the preheader to the original
243 // loop, and replace it with a branch to the new loop.
244 unsigned numBranches = TII->removeBranch(*Preheader);
245 if (numBranches) {
246 SmallVector<MachineOperand, 0> Cond;
247 TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
251 /// Generate the pipeline epilog code. The epilog code finishes the iterations
252 /// that were started in either the prolog or the kernel. We create a basic
253 /// block for each stage that needs to complete.
254 void ModuloScheduleExpander::generateEpilog(
255 unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
256 ValueMapTy *VRMap, ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
257 MBBVectorTy &PrologBBs) {
258 // We need to change the branch from the kernel to the first epilog block, so
259 // this call to analyze branch uses the kernel rather than the original BB.
260 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
261 SmallVector<MachineOperand, 4> Cond;
262 bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
263 assert(!checkBranch && "generateEpilog must be able to analyze the branch");
264 if (checkBranch)
265 return;
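// The kernel block has two successors: itself (the back edge) and the loop
// exit. Skip the self edge to find the exit block.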
267 MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
268 if (*LoopExitI == KernelBB)
269 ++LoopExitI;
270 assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
271 MachineBasicBlock *LoopExitBB = *LoopExitI;
273 MachineBasicBlock *PredBB = KernelBB;
274 MachineBasicBlock *EpilogStart = LoopExitBB;
275 InstrMapTy InstrMap;
277 // Generate a basic block for each stage, not including the last stage,
278 // which was generated for the kernel. Each basic block may contain
279 // instructions from multiple stages/iterations.
280 int EpilogStage = LastStage + 1;
281 for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
282 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
283 EpilogBBs.push_back(NewBB);
284 MF.insert(BB->getIterator(), NewBB);
286 PredBB->replaceSuccessor(LoopExitBB, NewBB);
287 NewBB->addSuccessor(LoopExitBB);
288 LIS.insertMBBInMaps(NewBB);
290 if (EpilogStart == LoopExitBB)
291 EpilogStart = NewBB;
293 // Add instructions to the epilog depending on the current block.
294 // Process instructions in original program order.
295 for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
296 for (auto &BBI : *BB) {
297 if (BBI.isPHI())
298 continue;
299 MachineInstr *In = &BBI;
300 if ((unsigned)Schedule.getStage(In) == StageNum) {
301 // Instructions with memoperands in the epilog are updated with
302 // conservative values.
303 MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
304 updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
305 NewBB->push_back(NewMI);
306 InstrMap[NewMI] = In;
310 generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
311 InstrMap, LastStage, EpilogStage, i == 1);
312 generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, VRMapPhi,
313 InstrMap, LastStage, EpilogStage, i == 1);
314 PredBB = NewBB;
316 LLVM_DEBUG({
317 dbgs() << "epilog:\n";
318 NewBB->dump();
322 // Fix any Phi nodes in the loop exit block.
323 LoopExitBB->replacePhiUsesWith(BB, PredBB);
325 // Create a branch to the new epilog from the kernel.
326 // Remove the original branch and add a new branch to the epilog.
327 TII->removeBranch(*KernelBB);
328 assert((OrigBB == TBB || OrigBB == FBB) &&
329 "Unable to determine looping branch direction");
330 if (OrigBB != TBB)
331 TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
332 else
333 TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
334 // Add a branch to the loop exit.
335 if (EpilogBBs.size() > 0) {
336 MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
337 SmallVector<MachineOperand, 4> Cond1;
338 TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
342 /// Replace all uses of FromReg that appear outside the specified
343 /// basic block with ToReg.
344 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
345 MachineBasicBlock *MBB,
346 MachineRegisterInfo &MRI,
347 LiveIntervals &LIS) {
348 for (MachineOperand &O :
349 llvm::make_early_inc_range(MRI.use_operands(FromReg)))
350 if (O.getParent()->getParent() != MBB)
351 O.setReg(ToReg);
352 if (!LIS.hasInterval(ToReg))
353 LIS.createEmptyInterval(ToReg);
356 /// Return true if the register has a use that occurs outside the
357 /// specified loop.
358 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
359 MachineRegisterInfo &MRI) {
360 for (const MachineOperand &MO : MRI.use_operands(Reg))
361 if (MO.getParent()->getParent() != BB)
362 return true;
363 return false;
366 /// Generate Phis for the specified block in the generated pipelined code.
367 /// This function looks at the Phis from the original code to guide the
368 /// creation of new Phis.
369 void ModuloScheduleExpander::generateExistingPhis(
370 MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
371 MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
372 unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
373 // Compute the stage number for the initial value of the Phi, which
374 // comes from the prolog. The prolog to use depends on which kernel or
375 // epilog block we're adding the Phi to.
376 unsigned PrologStage = 0;
377 unsigned PrevStage = 0;
378 bool InKernel = (LastStageNum == CurStageNum);
379 if (InKernel) {
380 PrologStage = LastStageNum - 1;
381 PrevStage = CurStageNum;
382 } else {
383 PrologStage = LastStageNum - (CurStageNum - LastStageNum);
384 PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
387 for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
388 BBE = BB->getFirstNonPHI();
389 BBI != BBE; ++BBI) {
390 Register Def = BBI->getOperand(0).getReg();
392 unsigned InitVal = 0;
393 unsigned LoopVal = 0;
394 getPhiRegs(*BBI, BB, InitVal, LoopVal);
396 unsigned PhiOp1 = 0;
397 // The Phi value from the loop body typically is defined in the loop, but
398 // not always. So, we need to check if the value is defined in the loop.
399 unsigned PhiOp2 = LoopVal;
400 if (auto It = VRMap[LastStageNum].find(LoopVal);
401 It != VRMap[LastStageNum].end())
402 PhiOp2 = It->second;
404 int StageScheduled = Schedule.getStage(&*BBI);
405 int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
406 unsigned NumStages = getStagesForReg(Def, CurStageNum);
407 if (NumStages == 0) {
408 // We don't need to generate a Phi anymore, but we need to rename any uses
409 // of the Phi value.
410 unsigned NewReg = VRMap[PrevStage][LoopVal];
411 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
412 InitVal, NewReg);
413 if (VRMap[CurStageNum].count(LoopVal))
414 VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
416 // Adjust the number of Phis needed depending on the number of prologs left,
417 // and the distance from where the Phi is first scheduled. The number of
418 // Phis cannot exceed the number of prolog stages. Each stage can
419 // potentially define two values.
420 unsigned MaxPhis = PrologStage + 2;
421 if (!InKernel && (int)PrologStage <= LoopValStage)
422 MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
423 unsigned NumPhis = std::min(NumStages, MaxPhis);
425 unsigned NewReg = 0;
426 unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
427 // In the epilog, we may need to look back one stage to get the correct
428 // Phi name, because the epilog and prolog blocks execute the same stage.
429 // The correct name is from the previous block only when the Phi has
430 // been completely scheduled prior to the epilog, and the Phi value is not
431 // needed in multiple stages.
432 int StageDiff = 0;
433 if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
434 NumPhis == 1)
435 StageDiff = 1;
436 // Adjust the computations below when the phi and the loop definition
437 // are scheduled in different stages.
438 if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
439 StageDiff = StageScheduled - LoopValStage;
440 for (unsigned np = 0; np < NumPhis; ++np) {
441 // If the Phi hasn't been scheduled, then use the initial Phi operand
442 // value. Otherwise, use the scheduled version of the instruction. This
443 // is a little complicated when a Phi references another Phi.
444 if (np > PrologStage || StageScheduled >= (int)LastStageNum)
445 PhiOp1 = InitVal;
446 // Check if the Phi has already been scheduled in a prolog stage.
447 else if (PrologStage >= AccessStage + StageDiff + np &&
448 VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
449 PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
450 // Check if the Phi has already been scheduled, but the loop instruction
451 // is either another Phi, or doesn't occur in the loop.
452 else if (PrologStage >= AccessStage + StageDiff + np) {
453 // If the Phi references another Phi, we need to examine the other
454 // Phi to get the correct value.
455 PhiOp1 = LoopVal;
456 MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
457 int Indirects = 1;
458 while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
459 int PhiStage = Schedule.getStage(InstOp1);
460 if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
461 PhiOp1 = getInitPhiReg(*InstOp1, BB);
462 else
463 PhiOp1 = getLoopPhiReg(*InstOp1, BB);
464 InstOp1 = MRI.getVRegDef(PhiOp1);
465 int PhiOpStage = Schedule.getStage(InstOp1);
466 int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
467 if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
468 VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
469 PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
470 break;
472 ++Indirects;
474 } else
475 PhiOp1 = InitVal;
476 // If this references a generated Phi in the kernel, get the Phi operand
477 // from the incoming block.
478 if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
479 if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
480 PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
482 MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
483 bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
484 // In the epilog, a map lookup is needed to get the value from the kernel,
485 // or previous epilog block. How this is done depends on whether the
486 // instruction is scheduled in the previous block.
487 if (!InKernel) {
488 int StageDiffAdj = 0;
489 if (LoopValStage != -1 && StageScheduled > LoopValStage)
490 StageDiffAdj = StageScheduled - LoopValStage;
491 // Use the loop value defined in the kernel, unless the kernel
492 // contains the last definition of the Phi.
493 if (np == 0 && PrevStage == LastStageNum &&
494 (StageScheduled != 0 || LoopValStage != 0) &&
495 VRMap[PrevStage - StageDiffAdj].count(LoopVal))
496 PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
497 // Use the value defined by the Phi. We add one because we switch
498 // from looking at the loop value to the Phi definition.
499 else if (np > 0 && PrevStage == LastStageNum &&
500 VRMap[PrevStage - np + 1].count(Def))
501 PhiOp2 = VRMap[PrevStage - np + 1][Def];
502 // Use the loop value defined in the kernel.
503 else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
504 VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
505 PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
506 // Use the value defined by the Phi, unless we're generating the first
507 // epilog and the Phi refers to a Phi in a different stage.
508 else if (VRMap[PrevStage - np].count(Def) &&
509 (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
510 (LoopValStage == StageScheduled)))
511 PhiOp2 = VRMap[PrevStage - np][Def];
514 // Check if we can reuse an existing Phi. This occurs when a Phi
515 // references another Phi, and the other Phi is scheduled in an
516 // earlier stage. We can try to reuse an existing Phi up until the last
517 // stage of the current Phi.
518 if (LoopDefIsPhi) {
519 if (static_cast<int>(PrologStage - np) >= StageScheduled) {
520 int LVNumStages = getStagesForPhi(LoopVal);
521 int StageDiff = (StageScheduled - LoopValStage);
522 LVNumStages -= StageDiff;
523 // Make sure the loop value Phi has been processed already.
524 if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
525 NewReg = PhiOp2;
526 unsigned ReuseStage = CurStageNum;
527 if (isLoopCarried(*PhiInst))
528 ReuseStage -= LVNumStages;
529 // Check if the Phi to reuse has been generated yet. If not, then
530 // there is nothing to reuse.
531 if (VRMap[ReuseStage - np].count(LoopVal)) {
532 NewReg = VRMap[ReuseStage - np][LoopVal];
534 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
535 Def, NewReg);
536 // Update the map with the new Phi name.
537 VRMap[CurStageNum - np][Def] = NewReg;
538 PhiOp2 = NewReg;
539 if (VRMap[LastStageNum - np - 1].count(LoopVal))
540 PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
542 if (IsLast && np == NumPhis - 1)
543 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
544 continue;
548 if (InKernel && StageDiff > 0 &&
549 VRMap[CurStageNum - StageDiff - np].count(LoopVal))
550 PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
553 const TargetRegisterClass *RC = MRI.getRegClass(Def);
554 NewReg = MRI.createVirtualRegister(RC);
556 MachineInstrBuilder NewPhi =
557 BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
558 TII->get(TargetOpcode::PHI), NewReg);
559 NewPhi.addReg(PhiOp1).addMBB(BB1);
560 NewPhi.addReg(PhiOp2).addMBB(BB2);
561 if (np == 0)
562 InstrMap[NewPhi] = &*BBI;
564 // We define the Phis after creating the new pipelined code, so
565 // we need to rename the Phi values in scheduled instructions.
567 unsigned PrevReg = 0;
568 if (InKernel && VRMap[PrevStage - np].count(LoopVal))
569 PrevReg = VRMap[PrevStage - np][LoopVal];
570 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
571 NewReg, PrevReg);
572 // If the Phi has been scheduled, use the new name for rewriting.
573 if (VRMap[CurStageNum - np].count(Def)) {
574 unsigned R = VRMap[CurStageNum - np][Def];
575 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
576 NewReg);
579 // Check if we need to rename any uses that occur after the loop. The
580 // register to replace depends on whether the Phi is scheduled in the
581 // epilog.
582 if (IsLast && np == NumPhis - 1)
583 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
585 // In the kernel, a dependent Phi uses the value from this Phi.
586 if (InKernel)
587 PhiOp2 = NewReg;
589 // Update the map with the new Phi name.
590 VRMap[CurStageNum - np][Def] = NewReg;
593 while (NumPhis++ < NumStages) {
594 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
595 NewReg, 0);
598 // Check if we need to rename a Phi that has been eliminated due to
599 // scheduling.
600 if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
601 replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
605 /// Generate Phis for the specified block in the generated pipelined code.
606 /// These are new Phis needed because the definition is scheduled after the
607 /// use in the pipelined sequence.
608 void ModuloScheduleExpander::generatePhis(
609 MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
610 MachineBasicBlock *KernelBB, ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
611 InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
612 bool IsLast) {
613 // Compute the stage number that contains the initial Phi value, and
614 // the Phi from the previous stage.
615 unsigned PrologStage = 0;
616 unsigned PrevStage = 0;
617 unsigned StageDiff = CurStageNum - LastStageNum;
618 bool InKernel = (StageDiff == 0);
619 if (InKernel) {
620 PrologStage = LastStageNum - 1;
621 PrevStage = CurStageNum;
622 } else {
623 PrologStage = LastStageNum - StageDiff;
624 PrevStage = LastStageNum + StageDiff - 1;
627 for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
628 BBE = BB->instr_end();
629 BBI != BBE; ++BBI) {
630 for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
631 MachineOperand &MO = BBI->getOperand(i);
632 if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
633 continue;
635 int StageScheduled = Schedule.getStage(&*BBI);
636 assert(StageScheduled != -1 && "Expecting scheduled instruction.");
637 Register Def = MO.getReg();
638 unsigned NumPhis = getStagesForReg(Def, CurStageNum);
639 // An instruction that is scheduled in stage 0 and used after the loop
640 // requires a phi in the epilog for the last definition from either
641 // the kernel or prolog.
642 if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
643 hasUseAfterLoop(Def, BB, MRI))
644 NumPhis = 1;
645 if (!InKernel && (unsigned)StageScheduled > PrologStage)
646 continue;
648 unsigned PhiOp2;
649 if (InKernel) {
650 PhiOp2 = VRMap[PrevStage][Def];
651 if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
652 if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
653 PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
655 // The number of Phis can't exceed the number of prolog stages. The
656 // prolog stage number is zero based.
657 if (NumPhis > PrologStage + 1 - StageScheduled)
658 NumPhis = PrologStage + 1 - StageScheduled;
659 for (unsigned np = 0; np < NumPhis; ++np) {
660 // Example for
661 // Org:
662 // %Org = ... (Scheduled at Stage#0, NumPhi = 2)
664 // Prolog0 (Stage0):
665 // %Clone0 = ...
666 // Prolog1 (Stage1):
667 // %Clone1 = ...
668 // Kernel (Stage2):
669 // %Phi0 = Phi %Clone1, Prolog1, %Clone2, Kernel
670 // %Phi1 = Phi %Clone0, Prolog1, %Phi0, Kernel
671 // %Clone2 = ...
672 // Epilog0 (Stage3):
673 // %Phi2 = Phi %Clone1, Prolog1, %Clone2, Kernel
674 // %Phi3 = Phi %Clone0, Prolog1, %Phi0, Kernel
675 // Epilog1 (Stage4):
676 // %Phi4 = Phi %Clone0, Prolog0, %Phi2, Epilog0
678 // VRMap = {0: %Clone0, 1: %Clone1, 2: %Clone2}
679 // VRMapPhi (after Kernel) = {0: %Phi1, 1: %Phi0}
680 // VRMapPhi (after Epilog0) = {0: %Phi3, 1: %Phi2}
682 unsigned PhiOp1 = VRMap[PrologStage][Def];
683 if (np <= PrologStage)
684 PhiOp1 = VRMap[PrologStage - np][Def];
685 if (!InKernel) {
686 if (PrevStage == LastStageNum && np == 0)
687 PhiOp2 = VRMap[LastStageNum][Def];
688 else
689 PhiOp2 = VRMapPhi[PrevStage - np][Def];
692 const TargetRegisterClass *RC = MRI.getRegClass(Def);
693 Register NewReg = MRI.createVirtualRegister(RC);
695 MachineInstrBuilder NewPhi =
696 BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
697 TII->get(TargetOpcode::PHI), NewReg);
698 NewPhi.addReg(PhiOp1).addMBB(BB1);
699 NewPhi.addReg(PhiOp2).addMBB(BB2);
700 if (np == 0)
701 InstrMap[NewPhi] = &*BBI;
703 // Rewrite uses and update the map. The actions depend upon whether
704 // we are generating code for the kernel or epilog blocks.
705 if (InKernel) {
706 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
707 NewReg);
708 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
709 NewReg);
711 PhiOp2 = NewReg;
712 VRMapPhi[PrevStage - np - 1][Def] = NewReg;
713 } else {
714 VRMapPhi[CurStageNum - np][Def] = NewReg;
715 if (np == NumPhis - 1)
716 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
717 NewReg);
719 if (IsLast && np == NumPhis - 1)
720 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
726 /// Remove instructions that generate values with no uses.
727 /// Typically, these are induction variable operations that generate values
728 /// used in the loop itself. A dead instruction has a definition with
729 /// no uses, or uses that occur in the original loop only.
730 void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
731 MBBVectorTy &EpilogBBs) {
732 // For each epilog block, check that the value defined by each instruction
733 // is used. If not, delete it.
734 for (MachineBasicBlock *MBB : llvm::reverse(EpilogBBs))
735 for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
736 ME = MBB->instr_rend();
737 MI != ME;) {
738 // From DeadMachineInstructionElim. Don't delete inline assembly.
739 if (MI->isInlineAsm()) {
740 ++MI;
741 continue;
743 bool SawStore = false;
744 // Check if it's safe to remove the instruction due to side effects.
745 // We can, and want to, remove Phis here.
746 if (!MI->isSafeToMove(SawStore) && !MI->isPHI()) {
747 ++MI;
748 continue;
750 bool used = true;
751 for (const MachineOperand &MO : MI->all_defs()) {
752 Register reg = MO.getReg();
753 // Assume physical registers are used, unless they are marked dead.
754 if (reg.isPhysical()) {
755 used = !MO.isDead();
756 if (used)
757 break;
758 continue;
760 unsigned realUses = 0;
761 for (const MachineOperand &U : MRI.use_operands(reg)) {
762 // Check if there are any uses that occur only in the original
763 // loop. If so, that's not a real use.
764 if (U.getParent()->getParent() != BB) {
765 realUses++;
766 used = true;
767 break;
770 if (realUses > 0)
771 break;
772 used = false;
774 if (!used) {
775 LIS.RemoveMachineInstrFromMaps(*MI);
776 MI++->eraseFromParent();
777 continue;
779 ++MI;
781 // In the kernel block, check if we can remove a Phi that generates a value
782 // used in an instruction removed in the epilog block.
783 for (MachineInstr &MI : llvm::make_early_inc_range(KernelBB->phis())) {
784 Register reg = MI.getOperand(0).getReg();
785 if (MRI.use_begin(reg) == MRI.use_end()) {
786 LIS.RemoveMachineInstrFromMaps(MI);
787 MI.eraseFromParent();
792 /// For loop carried definitions, we split the lifetime of a virtual register
793 /// that has uses past the definition in the next iteration. A copy with a new
794 /// virtual register is inserted before the definition, which helps with
795 /// generating a better register assignment.
797 ///   v1 = phi(a, v2)     v1 = phi(a, v2)
798 ///   v2 = phi(b, v3)     v2 = phi(b, v3)
799 ///   v3 = ..             v4 = copy v1
800 ///   .. = v1             v3 = ..
801 ///                       .. = v4
802 void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
803 MBBVectorTy &EpilogBBs) {
804 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
805 for (auto &PHI : KernelBB->phis()) {
806 Register Def = PHI.getOperand(0).getReg();
807 // Check for any Phi definition that is used as an operand of another Phi
808 // in the same block.
809 for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
810 E = MRI.use_instr_end();
811 I != E; ++I) {
812 if (I->isPHI() && I->getParent() == KernelBB) {
813 // Get the loop carried definition.
814 unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
815 if (!LCDef)
816 continue;
817 MachineInstr *MI = MRI.getVRegDef(LCDef);
818 if (!MI || MI->getParent() != KernelBB || MI->isPHI())
819 continue;
820 // Search through the rest of the block looking for uses of the Phi
821 // definition. If one occurs, then split the lifetime.
822 unsigned SplitReg = 0;
823 for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
824 KernelBB->instr_end()))
825 if (BBJ.readsRegister(Def, /*TRI=*/nullptr)) {
826 // We split the lifetime when we find the first use.
827 if (SplitReg == 0) {
828 SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
829 BuildMI(*KernelBB, MI, MI->getDebugLoc(),
830 TII->get(TargetOpcode::COPY), SplitReg)
831 .addReg(Def);
833 BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
835 if (!SplitReg)
836 continue;
837 // Search through each of the epilog blocks for any uses to be renamed.
838 for (auto &Epilog : EpilogBBs)
839 for (auto &I : *Epilog)
840 if (I.readsRegister(Def, /*TRI=*/nullptr))
841 I.substituteRegister(Def, SplitReg, 0, *TRI);
842 break;
848 /// Remove the incoming block from the Phis in a basic block.
849 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
850 for (MachineInstr &MI : *BB) {
851 if (!MI.isPHI())
852 break;
853 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
854 if (MI.getOperand(i + 1).getMBB() == Incoming) {
855 MI.removeOperand(i + 1);
856 MI.removeOperand(i);
857 break;
862 /// Create branches from each prolog basic block to the appropriate epilog
863 /// block. These edges are needed if the loop ends before reaching the
864 /// kernel.
865 void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
866 MBBVectorTy &PrologBBs,
867 MachineBasicBlock *KernelBB,
868 MBBVectorTy &EpilogBBs,
869 ValueMapTy *VRMap) {
870 assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
871 MachineBasicBlock *LastPro = KernelBB;
872 MachineBasicBlock *LastEpi = KernelBB;
874 // Start from the blocks connected to the kernel and work "out"
875 // to the first prolog and the last epilog blocks.
876 SmallVector<MachineInstr *, 4> PrevInsts;
877 unsigned MaxIter = PrologBBs.size() - 1;
878 for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
879 // Add branches to the prolog that go to the corresponding
880 // epilog, and the fall-thru prolog/kernel block.
881 MachineBasicBlock *Prolog = PrologBBs[j];
882 MachineBasicBlock *Epilog = EpilogBBs[i];
884 SmallVector<MachineOperand, 4> Cond;
885 std::optional<bool> StaticallyGreater =
886 LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
887 unsigned numAdded = 0;
888 if (!StaticallyGreater) {
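// The remaining trip count cannot be decided at compile time, so both
// targets stay reachable: a conditional branch selects between finishing
// in the epilog and continuing toward the kernel.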
889 Prolog->addSuccessor(Epilog);
890 numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
891 } else if (*StaticallyGreater == false) {
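// The loop is statically known to end before reaching this stage: branch
// straight to the epilog, drop the now-dead edges, and erase the
// unreachable prolog/epilog pair below.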
892 Prolog->addSuccessor(Epilog);
893 Prolog->removeSuccessor(LastPro);
894 LastEpi->removeSuccessor(Epilog);
895 numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
896 removePhis(Epilog, LastEpi);
897 // Remove the blocks that are no longer referenced.
898 if (LastPro != LastEpi) {
899 LastEpi->clear();
900 LastEpi->eraseFromParent();
902 if (LastPro == KernelBB) {
903 LoopInfo->disposed(&LIS);
904 NewKernel = nullptr;
906 LastPro->clear();
907 LastPro->eraseFromParent();
908 } else {
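// The trip count is statically known to be large enough: the branch to the
// epilog is never taken, so only the path toward the kernel is kept and the
// epilog's Phi entries for this prolog are removed.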
909 numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
910 removePhis(Epilog, Prolog);
912 LastPro = Prolog;
913 LastEpi = Epilog;
914 for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
915 E = Prolog->instr_rend();
916 I != E && numAdded > 0; ++I, --numAdded)
917 updateInstruction(&*I, false, j, 0, VRMap);
920 if (NewKernel) {
921 LoopInfo->setPreheader(PrologBBs[MaxIter]);
922 LoopInfo->adjustTripCount(-(MaxIter + 1));
926 /// Return true if we can compute the amount the instruction changes
927 /// during each iteration. Set Delta to the amount of the change.
928 bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
929 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
930 const MachineOperand *BaseOp;
931 int64_t Offset;
932 bool OffsetIsScalable;
933 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
934 return false;
936 // FIXME: This algorithm assumes instructions have fixed-size offsets.
937 if (OffsetIsScalable)
938 return false;
940 if (!BaseOp->isReg())
941 return false;
943 Register BaseReg = BaseOp->getReg();
945 MachineRegisterInfo &MRI = MF.getRegInfo();
946 // Check if there is a Phi. If so, get the definition in the loop.
947 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
948 if (BaseDef && BaseDef->isPHI()) {
949 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
950 BaseDef = MRI.getVRegDef(BaseReg);
952 if (!BaseDef)
953 return false;
955 int D = 0;
956 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
957 return false;
959 Delta = D;
960 return true;
963 /// Update the memory operand with a new offset when the pipeliner
964 /// generates a new copy of the instruction that refers to a
965 /// different memory location.
966 void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
967 MachineInstr &OldMI,
968 unsigned Num) {
969 if (Num == 0)
970 return;
971 // If the instruction has memory operands, then adjust the offset
972 // when the instruction appears in different stages.
973 if (NewMI.memoperands_empty())
974 return;
975 SmallVector<MachineMemOperand *, 2> NewMMOs;
976 for (MachineMemOperand *MMO : NewMI.memoperands()) {
977 // TODO: Figure out whether isAtomic is really necessary (see D57601).
978 if (MMO->isVolatile() || MMO->isAtomic() ||
979 (MMO->isInvariant() && MMO->isDereferenceable()) ||
980 (!MMO->getValue())) {
981 NewMMOs.push_back(MMO);
982 continue;
984 unsigned Delta;
985 if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
986 int64_t AdjOffset = Delta * Num;
987 NewMMOs.push_back(
988 MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
989 } else {
990 NewMMOs.push_back(MF.getMachineMemOperand(
991 MMO, 0, LocationSize::beforeOrAfterPointer()));
994 NewMI.setMemRefs(MF, NewMMOs);
997 /// Clone the instruction for the new pipelined loop and update the
998 /// memory operands, if needed.
999 MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
1000 unsigned CurStageNum,
1001 unsigned InstStageNum) {
1002 MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
1003 updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
1004 return NewMI;
1007 /// Clone the instruction for the new pipelined loop. If needed, this
1008 /// function updates the instruction using the values saved in the
1009 /// InstrChanges structure.
1010 MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
1011 MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
1012 MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
1013 auto It = InstrChanges.find(OldMI);
1014 if (It != InstrChanges.end()) {
1015 std::pair<unsigned, int64_t> RegAndOffset = It->second;
1016 unsigned BasePos, OffsetPos;
1017 if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
1018 return nullptr;
1019 int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
1020 MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
1021 if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
1022 NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
1023 NewMI->getOperand(OffsetPos).setImm(NewOffset);
1025 updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
1026 return NewMI;
1029 /// Update the machine instruction with new virtual registers. This
1030 /// function may change the definitions and/or uses.
1031 void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
1032 bool LastDef,
1033 unsigned CurStageNum,
1034 unsigned InstrStageNum,
1035 ValueMapTy *VRMap) {
1036 for (MachineOperand &MO : NewMI->operands()) {
1037 if (!MO.isReg() || !MO.getReg().isVirtual())
1038 continue;
1039 Register reg = MO.getReg();
1040 if (MO.isDef()) {
1041 // Create a new virtual register for the definition.
1042 const TargetRegisterClass *RC = MRI.getRegClass(reg);
1043 Register NewReg = MRI.createVirtualRegister(RC);
1044 MO.setReg(NewReg);
1045 VRMap[CurStageNum][reg] = NewReg;
1046 if (LastDef)
1047 replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
1048 } else if (MO.isUse()) {
1049 MachineInstr *Def = MRI.getVRegDef(reg);
1050 // Compute the stage that contains the last definition for the instruction.
1051 int DefStageNum = Schedule.getStage(Def);
1052 unsigned StageNum = CurStageNum;
1053 if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
1054 // Compute the difference in stages between the definition and the use.
1055 unsigned StageDiff = (InstrStageNum - DefStageNum);
1056 // Make an adjustment to get the last definition.
1057 StageNum -= StageDiff;
1059 if (auto It = VRMap[StageNum].find(reg); It != VRMap[StageNum].end())
1060 MO.setReg(It->second);
1065 /// Return the instruction in the loop that defines the register.
1066 /// If the definition is a Phi, then follow the Phi operand to
1067 /// the instruction in the loop.
1068 MachineInstr *ModuloScheduleExpander::findDefInLoop(unsigned Reg) {
1069 SmallPtrSet<MachineInstr *, 8> Visited;
1070 MachineInstr *Def = MRI.getVRegDef(Reg);
1071 while (Def->isPHI()) {
1072 if (!Visited.insert(Def).second)
1073 break;
1074 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
1075 if (Def->getOperand(i + 1).getMBB() == BB) {
1076 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
1077 break;
1080 return Def;
1083 /// Return the new name for the value from the previous stage.
1084 unsigned ModuloScheduleExpander::getPrevMapVal(
1085 unsigned StageNum, unsigned PhiStage, unsigned LoopVal, unsigned LoopStage,
1086 ValueMapTy *VRMap, MachineBasicBlock *BB) {
1087 unsigned PrevVal = 0;
1088 if (StageNum > PhiStage) {
1089 MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
1090 if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
1091 // The name is defined in the previous stage.
1092 PrevVal = VRMap[StageNum - 1][LoopVal];
1093 else if (VRMap[StageNum].count(LoopVal))
1094 // The previous name is defined in the current stage when the instruction
1095 // order is swapped.
1096 PrevVal = VRMap[StageNum][LoopVal];
1097 else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
1098 // The loop value hasn't yet been scheduled.
1099 PrevVal = LoopVal;
1100 else if (StageNum == PhiStage + 1)
1101 // The loop value is another phi, which has not been scheduled.
1102 PrevVal = getInitPhiReg(*LoopInst, BB);
1103 else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
1104 // The loop value is another phi, which has been scheduled.
1105 PrevVal =
1106 getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
1107 LoopStage, VRMap, BB);
1109 return PrevVal;
1112 /// Rewrite the Phi values in the specified block to use the mappings
1113 /// from the initial operand. Once the Phi is scheduled, we switch
1114 /// to using the loop value instead of the Phi value, so those names
1115 /// do not need to be rewritten.
1116 void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
1117 unsigned StageNum,
1118 ValueMapTy *VRMap,
1119 InstrMapTy &InstrMap) {
1120 for (auto &PHI : BB->phis()) {
1121 unsigned InitVal = 0;
1122 unsigned LoopVal = 0;
1123 getPhiRegs(PHI, BB, InitVal, LoopVal);
1124 Register PhiDef = PHI.getOperand(0).getReg();
1126 unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef));
1127 unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal));
1128 unsigned NumPhis = getStagesForPhi(PhiDef);
1129 if (NumPhis > StageNum)
1130 NumPhis = StageNum;
1131 for (unsigned np = 0; np <= NumPhis; ++np) {
1132 unsigned NewVal =
1133 getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
1134 if (!NewVal)
1135 NewVal = InitVal;
1136 rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef,
1137 NewVal);
1142 /// Rewrite a previously scheduled instruction to use the register value
1143 /// from the new instruction. Make sure the instruction occurs in the
1144 /// basic block, and that we don't change the uses in the new instruction.
1145 void ModuloScheduleExpander::rewriteScheduledInstr(
1146 MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
1147 unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, unsigned NewReg,
1148 unsigned PrevReg) {
1149 bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
1150 int StagePhi = Schedule.getStage(Phi) + PhiNum;
1151 // Rewrite uses that have been scheduled already to use the new
1152 // Phi register.
1153 for (MachineOperand &UseOp :
1154 llvm::make_early_inc_range(MRI.use_operands(OldReg))) {
1155 MachineInstr *UseMI = UseOp.getParent();
1156 if (UseMI->getParent() != BB)
1157 continue;
1158 if (UseMI->isPHI()) {
1159 if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
1160 continue;
1161 if (getLoopPhiReg(*UseMI, BB) != OldReg)
1162 continue;
1164 InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
1165 assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
1166 MachineInstr *OrigMI = OrigInstr->second;
1167 int StageSched = Schedule.getStage(OrigMI);
1168 int CycleSched = Schedule.getCycle(OrigMI);
1169 unsigned ReplaceReg = 0;
1170 // This is the stage for the scheduled instruction.
1171 if (StagePhi == StageSched && Phi->isPHI()) {
1172 int CyclePhi = Schedule.getCycle(Phi);
1173 if (PrevReg && InProlog)
1174 ReplaceReg = PrevReg;
1175 else if (PrevReg && !isLoopCarried(*Phi) &&
1176 (CyclePhi <= CycleSched || OrigMI->isPHI()))
1177 ReplaceReg = PrevReg;
1178 else
1179 ReplaceReg = NewReg;
1181 // The scheduled instruction occurs before the scheduled Phi, and the
1182 // Phi is not loop carried.
1183 if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
1184 ReplaceReg = NewReg;
1185 if (StagePhi > StageSched && Phi->isPHI())
1186 ReplaceReg = NewReg;
1187 if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
1188 ReplaceReg = NewReg;
1189 if (ReplaceReg) {
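// Constrain ReplaceReg to OldReg's register class if possible; otherwise
// insert a COPY into a compatible class and rewrite the use to the copy.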
1190 const TargetRegisterClass *NRC =
1191 MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
1192 if (NRC)
1193 UseOp.setReg(ReplaceReg);
1194 else {
1195 Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
1196 BuildMI(*BB, UseMI, UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY),
1197 SplitReg)
1198 .addReg(ReplaceReg);
1199 UseOp.setReg(SplitReg);
1205 bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
1206 if (!Phi.isPHI())
1207 return false;
1208 int DefCycle = Schedule.getCycle(&Phi);
1209 int DefStage = Schedule.getStage(&Phi);
1211 unsigned InitVal = 0;
1212 unsigned LoopVal = 0;
1213 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
1214 MachineInstr *Use = MRI.getVRegDef(LoopVal);
1215 if (!Use || Use->isPHI())
1216 return true;
1217 int LoopCycle = Schedule.getCycle(Use);
1218 int LoopStage = Schedule.getStage(Use);
1219 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
1222 //===----------------------------------------------------------------------===//
1223 // PeelingModuloScheduleExpander implementation
1224 //===----------------------------------------------------------------------===//
1225 // This is a reimplementation of ModuloScheduleExpander that works by creating
1226 // a fully correct steady-state kernel and peeling off the prolog and epilogs.
1227 //===----------------------------------------------------------------------===//
1229 namespace {
1230 // Remove any dead phis in MBB. Dead phis either have only one block as input
1231 // (in which case they are the identity) or have no uses.
1232 void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
1233 LiveIntervals *LIS, bool KeepSingleSrcPhi = false) {
1234 bool Changed = true;
1235 while (Changed) {
1236 Changed = false;
1237 for (MachineInstr &MI : llvm::make_early_inc_range(MBB->phis())) {
1238 assert(MI.isPHI());
1239 if (MRI.use_empty(MI.getOperand(0).getReg())) {
1240 if (LIS)
1241 LIS->RemoveMachineInstrFromMaps(MI);
1242 MI.eraseFromParent();
1243 Changed = true;
1244 } else if (!KeepSingleSrcPhi && MI.getNumExplicitOperands() == 3) {
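// A phi with a single incoming value (def plus one reg/block pair) is just
// a copy; forward the incoming register to all users of the def and delete
// the phi.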
1245 const TargetRegisterClass *ConstrainRegClass =
1246 MRI.constrainRegClass(MI.getOperand(1).getReg(),
1247 MRI.getRegClass(MI.getOperand(0).getReg()));
1248 assert(ConstrainRegClass &&
1249 "Expected a valid constrained register class!");
1250 (void)ConstrainRegClass;
1251 MRI.replaceRegWith(MI.getOperand(0).getReg(),
1252 MI.getOperand(1).getReg());
1253 if (LIS)
1254 LIS->RemoveMachineInstrFromMaps(MI);
1255 MI.eraseFromParent();
1256 Changed = true;
1262 /// Rewrites the kernel block in-place to adhere to the given schedule.
1263 /// KernelRewriter holds all of the state required to perform the rewriting.
1264 class KernelRewriter {
1265 ModuloSchedule &S;
1266 MachineBasicBlock *BB;
1267 MachineBasicBlock *PreheaderBB, *ExitBB;
1268 MachineRegisterInfo &MRI;
1269 const TargetInstrInfo *TII;
1270 LiveIntervals *LIS;
1272 // Map from register class to canonical undef register for that class.
1273 DenseMap<const TargetRegisterClass *, Register> Undefs;
1274 // Map from <LoopReg, InitReg> to phi register for all created phis. Note that
1275 // this map is only used when InitReg is non-undef.
1276 DenseMap<std::pair<unsigned, unsigned>, Register> Phis;
1277 // Map from LoopReg to phi register where the InitReg is undef.
1278 DenseMap<Register, Register> UndefPhis;
1280 // Reg is used by MI. Return the new register MI should use to adhere to the
1281 // schedule. Insert phis as necessary.
1282 Register remapUse(Register Reg, MachineInstr &MI);
1283 // Insert a phi that carries LoopReg from the loop body and InitReg otherwise.
1284 // If InitReg is not given it is chosen arbitrarily. It will either be undef
1285 // or will be chosen so as to share another phi.
1286 Register phi(Register LoopReg, std::optional<Register> InitReg = {},
1287 const TargetRegisterClass *RC = nullptr);
1288 // Create an undef register of the given register class.
1289 Register undef(const TargetRegisterClass *RC);
1291 public:
1292 KernelRewriter(MachineLoop &L, ModuloSchedule &S, MachineBasicBlock *LoopBB,
1293 LiveIntervals *LIS = nullptr);
1294 void rewrite();
1296 } // namespace
1298 KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
1299 MachineBasicBlock *LoopBB, LiveIntervals *LIS)
1300 : S(S), BB(LoopBB), PreheaderBB(L.getLoopPreheader()),
1301 ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
1302 TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
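// Override the preheader from MachineLoopInfo with the CFG predecessor that
// is not the loop block itself, matching ModuloScheduleExpander::expand().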
1303 PreheaderBB = *BB->pred_begin();
1304 if (PreheaderBB == BB)
1305 PreheaderBB = *std::next(BB->pred_begin());
1308 void KernelRewriter::rewrite() {
1309 // Rearrange the loop to be in schedule order. Note that the schedule may
1310 // contain instructions that are not owned by the loop block (InstrChanges and
1311 // friends), so we gracefully handle unowned instructions and delete any
1312 // instructions that weren't in the schedule.
1313 auto InsertPt = BB->getFirstTerminator();
1314 MachineInstr *FirstMI = nullptr;
1315 for (MachineInstr *MI : S.getInstructions()) {
1316 if (MI->isPHI())
1317 continue;
1318 if (MI->getParent())
1319 MI->removeFromParent();
1320 BB->insert(InsertPt, MI);
1321 if (!FirstMI)
1322 FirstMI = MI;
1324 assert(FirstMI && "Failed to find first MI in schedule");
1326 // At this point all of the scheduled instructions are between FirstMI
1327 // and the end of the block. Kill from the first non-phi to FirstMI.
1328 for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
1329 if (LIS)
1330 LIS->RemoveMachineInstrFromMaps(*I);
1331 (I++)->eraseFromParent();
1334 // Now remap every instruction in the loop.
1335 for (MachineInstr &MI : *BB) {
1336 if (MI.isPHI() || MI.isTerminator())
1337 continue;
1338 for (MachineOperand &MO : MI.uses()) {
1339 if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
1340 continue;
1341 Register Reg = remapUse(MO.getReg(), MI);
1342 MO.setReg(Reg);
1345 EliminateDeadPhis(BB, MRI, LIS);
1347 // Ensure a phi exists for all instructions that are either referenced by
1348 // an illegal phi or by an instruction outside the loop. This allows us to
1349 // treat remaps of these values the same as "normal" values that come from
1350 // loop-carried phis.
1351 for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
1352 if (MI->isPHI()) {
1353 Register R = MI->getOperand(0).getReg();
1354 phi(R);
1355 continue;
1358 for (MachineOperand &Def : MI->defs()) {
1359 for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) {
1360 if (MI.getParent() != BB) {
1361 phi(Def.getReg());
1362 break;
1369 Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
1370 MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
1371 if (!Producer)
1372 return Reg;
1374 int ConsumerStage = S.getStage(&MI);
1375 if (!Producer->isPHI()) {
1376 // Non-phi producers are simple to remap. Insert as many phis as the
1377 // difference between the consumer and producer stages.
1378 if (Producer->getParent() != BB)
1379 // Producer was not inside the loop. Use the register as-is.
1380 return Reg;
1381 int ProducerStage = S.getStage(Producer);
1382 assert(ConsumerStage != -1 &&
1383 "In-loop consumer should always be scheduled!");
1384 assert(ConsumerStage >= ProducerStage);
1385 unsigned StageDiff = ConsumerStage - ProducerStage;
1387 for (unsigned I = 0; I < StageDiff; ++I)
1388 Reg = phi(Reg);
1389 return Reg;
1392 // First, dive through the phi chain to find the defaults for the generated
1393 // phis.
1394 SmallVector<std::optional<Register>, 4> Defaults;
1395 Register LoopReg = Reg;
1396 auto LoopProducer = Producer;
1397 while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
1398 LoopReg = getLoopPhiReg(*LoopProducer, BB);
1399 Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB));
1400 LoopProducer = MRI.getUniqueVRegDef(LoopReg);
1401 assert(LoopProducer);
1403 int LoopProducerStage = S.getStage(LoopProducer);
1405 std::optional<Register> IllegalPhiDefault;
1407 if (LoopProducerStage == -1) {
1408 // Do nothing.
1409 } else if (LoopProducerStage > ConsumerStage) {
1410 // This schedule is only representable if ProducerStage == ConsumerStage+1.
1411 // In addition, Consumer's cycle must be scheduled after Producer in the
1412 // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
1413 // functions.
1414 #ifndef NDEBUG // Silence unused variables in non-asserts mode.
1415 int LoopProducerCycle = S.getCycle(LoopProducer);
1416 int ConsumerCycle = S.getCycle(&MI);
1417 #endif
1418 assert(LoopProducerCycle <= ConsumerCycle);
1419 assert(LoopProducerStage == ConsumerStage + 1);
1420 // Peel off the first phi from Defaults and insert a phi between producer
1421 // and consumer. This phi will not be at the front of the block so we
1422 // consider it illegal. It will only exist during the rewrite process; it
1423 // needs to exist while we peel off prologs because these could take the
1424 // default value. After that we can replace all uses with the loop producer
1425 // value.
1426 IllegalPhiDefault = Defaults.front();
1427 Defaults.erase(Defaults.begin());
1428 } else {
1429 assert(ConsumerStage >= LoopProducerStage);
1430 int StageDiff = ConsumerStage - LoopProducerStage;
1431 if (StageDiff > 0) {
1432 LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
1433 << " to " << (Defaults.size() + StageDiff) << "\n");
1434 // If we need more phis than we have defaults for, pad out with undefs for
1435 // the earliest phis, which are at the end of the defaults chain (the
1436 // chain is in reverse order).
1437 Defaults.resize(Defaults.size() + StageDiff,
1438 Defaults.empty() ? std::optional<Register>()
1439 : Defaults.back());
1443 // Now we know the number of stages to jump back, insert the phi chain.
1444 auto DefaultI = Defaults.rbegin();
1445 while (DefaultI != Defaults.rend())
1446 LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg));
1448 if (IllegalPhiDefault) {
1449 // The consumer optionally consumes LoopProducer in the same iteration
1450 // (because the producer is scheduled at an earlier cycle than the consumer)
1451 // or the initial value. To facilitate this we create an illegal block here
1452 // by embedding a phi in the middle of the block. We will fix this up
1453 // immediately prior to pruning.
1454 auto RC = MRI.getRegClass(Reg);
1455 Register R = MRI.createVirtualRegister(RC);
1456 MachineInstr *IllegalPhi =
1457 BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
1458 .addReg(*IllegalPhiDefault)
1459 .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
1460 .addReg(LoopReg)
1461 .addMBB(BB); // Block choice is arbitrary and has no effect.
1462 // Illegal phi should belong to the producer stage so that it can be
1463 // filtered correctly during peeling.
1464 S.setStage(IllegalPhi, LoopProducerStage);
1465 return R;
1468 return LoopReg;
1471 Register KernelRewriter::phi(Register LoopReg, std::optional<Register> InitReg,
1472 const TargetRegisterClass *RC) {
1473 // If the init register is not undef, try and find an existing phi.
1474 if (InitReg) {
1475 auto I = Phis.find({LoopReg, *InitReg});
1476 if (I != Phis.end())
1477 return I->second;
1478 } else {
1479 for (auto &KV : Phis) {
1480 if (KV.first.first == LoopReg)
1481 return KV.second;
1485 // InitReg is either undef or no existing phi takes InitReg as input. Try and
1486 // find a phi that takes undef as input.
1487 auto I = UndefPhis.find(LoopReg);
1488 if (I != UndefPhis.end()) {
1489 Register R = I->second;
1490 if (!InitReg)
1491 // Found a phi taking undef as input, and this input is undef so return
1492 // without any more changes.
1493 return R;
1494 // Found a phi taking undef as input, so rewrite it to take InitReg.
1495 MachineInstr *MI = MRI.getVRegDef(R);
1496 MI->getOperand(1).setReg(*InitReg);
1497 Phis.insert({{LoopReg, *InitReg}, R});
1498 const TargetRegisterClass *ConstrainRegClass =
1499 MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
1500 assert(ConstrainRegClass && "Expected a valid constrained register class!");
1501 (void)ConstrainRegClass;
1502 UndefPhis.erase(I);
1503 return R;
1506 // Failed to find any existing phi to reuse, so create a new one.
1507 if (!RC)
1508 RC = MRI.getRegClass(LoopReg);
1509 Register R = MRI.createVirtualRegister(RC);
1510 if (InitReg) {
1511 const TargetRegisterClass *ConstrainRegClass =
1512 MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
1513 assert(ConstrainRegClass && "Expected a valid constrained register class!");
1514 (void)ConstrainRegClass;
1516 BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R)
1517 .addReg(InitReg ? *InitReg : undef(RC))
1518 .addMBB(PreheaderBB)
1519 .addReg(LoopReg)
1520 .addMBB(BB);
1521 if (!InitReg)
1522 UndefPhis[LoopReg] = R;
1523 else
1524 Phis[{LoopReg, *InitReg}] = R;
1525 return R;
1528 Register KernelRewriter::undef(const TargetRegisterClass *RC) {
1529 Register &R = Undefs[RC];
1530 if (R == 0) {
1531 // Create an IMPLICIT_DEF that defines this register if we need it.
1532 // All uses of this should be removed by the time we have finished unrolling
1533 // prologs and epilogs.
1534 R = MRI.createVirtualRegister(RC);
1535 auto *InsertBB = &PreheaderBB->getParent()->front();
1536 BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(),
1537 TII->get(TargetOpcode::IMPLICIT_DEF), R);
1539 return R;
1542 namespace {
1543 /// Describes an operand in the kernel of a pipelined loop. Characteristics of
1544 /// the operand are discovered, such as how many in-loop PHIs it has to jump
1545 /// through and defaults for these phis.
1546 class KernelOperandInfo {
1547 MachineBasicBlock *BB;
1548 MachineRegisterInfo &MRI;
1549 SmallVector<Register, 4> PhiDefaults;
1550 MachineOperand *Source;
1551 MachineOperand *Target;
1553 public:
1554 KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
1555 const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
1556 : MRI(MRI) {
1557 Source = MO;
1558 BB = MO->getParent()->getParent();
1559 while (isRegInLoop(MO)) {
1560 MachineInstr *MI = MRI.getVRegDef(MO->getReg());
1561 if (MI->isFullCopy()) {
1562 MO = &MI->getOperand(1);
1563 continue;
1565 if (!MI->isPHI())
1566 break;
1567 // If this is an illegal phi, don't count it in distance.
1568 if (IllegalPhis.count(MI)) {
1569 MO = &MI->getOperand(3);
1570 continue;
1573 Register Default = getInitPhiReg(*MI, BB);
1574 MO = MI->getOperand(2).getMBB() == BB ? &MI->getOperand(1)
1575 : &MI->getOperand(3);
1576 PhiDefaults.push_back(Default);
1578 Target = MO;
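// Two operands compare equal when they have the same phi distance, i.e. both
// look through the same number of in-loop PHIs.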
1581 bool operator==(const KernelOperandInfo &Other) const {
1582 return PhiDefaults.size() == Other.PhiDefaults.size();
1585 void print(raw_ostream &OS) const {
1586 OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
1587 << *Source->getParent();
1590 private:
1591 bool isRegInLoop(MachineOperand *MO) {
1592 return MO->isReg() && MO->getReg().isVirtual() &&
1593 MRI.getVRegDef(MO->getReg())->getParent() == BB;
1596 } // namespace
1598 MachineBasicBlock *
1599 PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
1600 MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII);
1601 if (LPD == LPD_Front)
1602 PeeledFront.push_back(NewBB);
1603 else
1604 PeeledBack.push_front(NewBB);
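// Record, for both the original and the cloned instructions, which kernel
// instruction they are a copy of (CanonicalMIs), and which local copy each
// block holds for every kernel instruction (BlockMIs).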
1605 for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
1606 ++I, ++NI) {
1607 CanonicalMIs[&*I] = &*I;
1608 CanonicalMIs[&*NI] = &*I;
1609 BlockMIs[{NewBB, &*I}] = &*NI;
1610 BlockMIs[{BB, &*I}] = &*I;
1612 return NewBB;
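/// Erase from MB every instruction whose stage is known and below MinStage.
/// PHIs that used the erased values are rewritten to use the register that
/// corresponds to them in MB before the instructions are removed.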
1615 void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
1616 int MinStage) {
1617 for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
1618 I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
1619 MachineInstr *MI = &*I++;
1620 int Stage = getStage(MI);
1621 if (Stage == -1 || Stage >= MinStage)
1622 continue;
1624 for (MachineOperand &DefMO : MI->defs()) {
1625 SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1626 for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
1627 // Only PHIs can use values from this block by construction.
1628 // Match with the equivalent PHI in B.
1629 assert(UseMI.isPHI());
1630 Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
1631 MI->getParent());
1632 Subs.emplace_back(&UseMI, Reg);
1634 for (auto &Sub : Subs)
1635 Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
1636 *MRI.getTargetRegisterInfo());
1638 if (LIS)
1639 LIS->RemoveMachineInstrFromMaps(*MI);
1640 MI->eraseFromParent();
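/// Move all instructions of the given stage from SourceBB into DestBB,
/// creating or cloning PHIs where needed so that the moved instructions and
/// their users still see the correct values across the two blocks.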
1644 void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
1645 MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
1646 auto InsertPt = DestBB->getFirstNonPHI();
1647 DenseMap<Register, Register> Remaps;
1648 for (MachineInstr &MI : llvm::make_early_inc_range(
1649 llvm::make_range(SourceBB->getFirstNonPHI(), SourceBB->end()))) {
1650 if (MI.isPHI()) {
1651 // This is an illegal PHI. If we move any instructions using an illegal
1652 // PHI, we need to create a legal Phi.
1653 if (getStage(&MI) != Stage) {
1654 // A legal Phi is only needed if the illegal phi's own stage is not the
1655 // one being moved.
1656 Register PhiR = MI.getOperand(0).getReg();
1657 auto RC = MRI.getRegClass(PhiR);
1658 Register NR = MRI.createVirtualRegister(RC);
1659 MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(),
1660 DebugLoc(), TII->get(TargetOpcode::PHI), NR)
1661 .addReg(PhiR)
1662 .addMBB(SourceBB);
1663 BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NI;
1664 CanonicalMIs[NI] = CanonicalMIs[&MI];
1665 Remaps[PhiR] = NR;
1668 if (getStage(&MI) != Stage)
1669 continue;
1670 MI.removeFromParent();
1671 DestBB->insert(InsertPt, &MI);
1672 auto *KernelMI = CanonicalMIs[&MI];
1673 BlockMIs[{DestBB, KernelMI}] = &MI;
1674 BlockMIs.erase({SourceBB, KernelMI});
1676 SmallVector<MachineInstr *, 4> PhiToDelete;
1677 for (MachineInstr &MI : DestBB->phis()) {
1678 assert(MI.getNumOperands() == 3);
1679 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
1680 // If the instruction referenced by the phi is moved inside the block
1681 // we don't need the phi anymore.
1682 if (getStage(Def) == Stage) {
1683 Register PhiReg = MI.getOperand(0).getReg();
1684 assert(Def->findRegisterDefOperandIdx(MI.getOperand(1).getReg(),
1685 /*TRI=*/nullptr) != -1);
1686 MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
1687 MI.getOperand(0).setReg(PhiReg);
1688 PhiToDelete.push_back(&MI);
1691 for (auto *P : PhiToDelete)
1692 P->eraseFromParent();
1693 InsertPt = DestBB->getFirstNonPHI();
1694 // Helper to clone Phi instructions into the destination block. We clone Phis
1695 // greedily to avoid a combinatorial explosion of Phi instructions.
1696 auto clonePhi = [&](MachineInstr *Phi) {
1697 MachineInstr *NewMI = MF.CloneMachineInstr(Phi);
1698 DestBB->insert(InsertPt, NewMI);
1699 Register OrigR = Phi->getOperand(0).getReg();
1700 Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
1701 NewMI->getOperand(0).setReg(R);
1702 NewMI->getOperand(1).setReg(OrigR);
1703 NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
1704 Remaps[OrigR] = R;
1705 CanonicalMIs[NewMI] = CanonicalMIs[Phi];
1706 BlockMIs[{DestBB, CanonicalMIs[Phi]}] = NewMI;
1707 PhiNodeLoopIteration[NewMI] = PhiNodeLoopIteration[Phi];
1708 return R;
1710 for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I) {
1711 for (MachineOperand &MO : I->uses()) {
1712 if (!MO.isReg())
1713 continue;
1714 if (auto It = Remaps.find(MO.getReg()); It != Remaps.end())
1715 MO.setReg(It->second);
1716 else {
1717 // If we are using a phi from the source block we need to add a new phi
1718 // pointing to the old one.
1719 MachineInstr *Use = MRI.getUniqueVRegDef(MO.getReg());
1720 if (Use && Use->isPHI() && Use->getParent() == SourceBB) {
1721 Register R = clonePhi(Use);
1722 MO.setReg(R);
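/// Return the canonical register feeding Phi, found by following the
/// loop-carried operand of the canonical phi chain once per loop iteration
/// recorded for Phi in PhiNodeLoopIteration.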
1729 Register
1730 PeelingModuloScheduleExpander::getPhiCanonicalReg(MachineInstr *CanonicalPhi,
1731 MachineInstr *Phi) {
1732 unsigned distance = PhiNodeLoopIteration[Phi];
1733 MachineInstr *CanonicalUse = CanonicalPhi;
1734 Register CanonicalUseReg = CanonicalUse->getOperand(0).getReg();
1735 for (unsigned I = 0; I < distance; ++I) {
1736 assert(CanonicalUse->isPHI());
1737 assert(CanonicalUse->getNumOperands() == 5);
1738 unsigned LoopRegIdx = 3, InitRegIdx = 1;
1739 if (CanonicalUse->getOperand(2).getMBB() == CanonicalUse->getParent())
1740 std::swap(LoopRegIdx, InitRegIdx);
1741 CanonicalUseReg = CanonicalUse->getOperand(LoopRegIdx).getReg();
1742 CanonicalUse = MRI.getVRegDef(CanonicalUseReg);
1744 return CanonicalUseReg;
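/// Peel Schedule.getNumStages() - 1 prolog copies off the front of the kernel
/// and the same number of epilog copies off the back, then stitch them
/// together: stages are redistributed across the epilogs, prologs get direct
/// edges to their matching epilogs for short trip counts, and all remaining
/// uses are remapped.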
1747 void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
1748 BitVector LS(Schedule.getNumStages(), true);
1749 BitVector AS(Schedule.getNumStages(), true);
1750 LiveStages[BB] = LS;
1751 AvailableStages[BB] = AS;
1753 // Peel out the prologs.
1754 LS.reset();
1755 for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
1756 LS[I] = true;
1757 Prologs.push_back(peelKernel(LPD_Front));
1758 LiveStages[Prologs.back()] = LS;
1759 AvailableStages[Prologs.back()] = LS;
1762 // Create a block that will end up as the new loop exiting block (dominated by
1763 // all prologs and epilogs). It will only contain PHIs, in the same order as
1764 // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
1765 // that the exiting block is a (sub) clone of BB. This in turn gives us the
1766 // property that any value defined in BB but used outside of BB is used by a
1767 // PHI in the exiting block.
1768 MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
1769 EliminateDeadPhis(ExitingBB, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1770 // Push out the epilogs, again in reverse order.
1771 // We can't assume anything about the minimum loop trip count at this point,
1772 // so emit a fairly complex epilog.
1774 // We first peel off (number of stages - 1) epilogues. Then we remove dead
1775 // stages and reorder instructions based on their stage. If we have 3 stages
1776 // we generate first:
1777 // E0[3, 2, 1]
1778 // E1[3', 2']
1779 // E2[3'']
1780 // And then we move instructions based on their stages to have:
1781 // E0[3]
1782 // E1[2, 3']
1783 // E2[1, 2', 3'']
1784 // The transformation is legal because we only move instructions past
1785 // instructions of a previous loop iteration.
1786 for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
1787 Epilogs.push_back(peelKernel(LPD_Back));
1788 MachineBasicBlock *B = Epilogs.back();
1789 filterInstructions(B, Schedule.getNumStages() - I);
1790 // Keep track of which iteration each phi belongs to. We need it to know
1791 // what version of the variable to use during prologue/epilogue stitching.
1792 EliminateDeadPhis(B, MRI, LIS, /*KeepSingleSrcPhi=*/true);
1793 for (MachineInstr &Phi : B->phis())
1794 PhiNodeLoopIteration[&Phi] = Schedule.getNumStages() - I;
1796 for (size_t I = 0; I < Epilogs.size(); I++) {
1797 LS.reset();
1798 for (size_t J = I; J < Epilogs.size(); J++) {
1799 int Iteration = J;
1800 unsigned Stage = Schedule.getNumStages() - 1 + I - J;
1801 // Move stage one block at a time so that Phi nodes are updated correctly.
1802 for (size_t K = Iteration; K > I; K--)
1803 moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
1804 LS[Stage] = true;
1806 LiveStages[Epilogs[I]] = LS;
1807 AvailableStages[Epilogs[I]] = AS;
1810 // Now we've defined all the prolog and epilog blocks as a fallthrough
1811 // sequence, add the edges that will be followed if the loop trip count is
1812 // lower than the number of stages (connecting prologs directly with epilogs).
1813 auto PI = Prologs.begin();
1814 auto EI = Epilogs.begin();
1815 assert(Prologs.size() == Epilogs.size());
1816 for (; PI != Prologs.end(); ++PI, ++EI) {
1817 MachineBasicBlock *Pred = *(*EI)->pred_begin();
1818 (*PI)->addSuccessor(*EI);
1819 for (MachineInstr &MI : (*EI)->phis()) {
1820 Register Reg = MI.getOperand(1).getReg();
1821 MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
1822 if (Use && Use->getParent() == Pred) {
1823 MachineInstr *CanonicalUse = CanonicalMIs[Use];
1824 if (CanonicalUse->isPHI()) {
1825 // If the use comes from a phi we need to skip as many phis as the
1826 // distance between the epilogue and the kernel. Trace through the phi
1827 // chain to find the right value.
1828 Reg = getPhiCanonicalReg(CanonicalUse, Use);
1830 Reg = getEquivalentRegisterIn(Reg, *PI);
1832 MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
1833 MI.addOperand(MachineOperand::CreateMBB(*PI));
1837 // Create a list of all blocks in order.
1838 SmallVector<MachineBasicBlock *, 8> Blocks;
1839 llvm::copy(PeeledFront, std::back_inserter(Blocks));
1840 Blocks.push_back(BB);
1841 llvm::copy(PeeledBack, std::back_inserter(Blocks));
1843 // Iterate in reverse order over all instructions, remapping as we go.
1844 for (MachineBasicBlock *B : reverse(Blocks)) {
1845 for (auto I = B->instr_rbegin();
1846 I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
1847 MachineBasicBlock::reverse_instr_iterator MI = I++;
1848 rewriteUsesOf(&*MI);
1851 for (auto *MI : IllegalPhisToDelete) {
1852 if (LIS)
1853 LIS->RemoveMachineInstrFromMaps(*MI);
1854 MI->eraseFromParent();
1856 IllegalPhisToDelete.clear();
1858 // Now all remapping has been done, we're free to optimize the generated code.
1859 for (MachineBasicBlock *B : reverse(Blocks))
1860 EliminateDeadPhis(B, MRI, LIS);
1861 EliminateDeadPhis(ExitingBB, MRI, LIS);
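/// Create a new exiting block between BB and its exit that contains only PHIs
/// mirroring BB's PHIs, and reroute BB's exit edge through it. This gives a
/// lightweight LCSSA form: every value defined in BB and used outside of it is
/// now used via a PHI in the new block.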
1864 MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
1865 MachineFunction &MF = *BB->getParent();
1866 MachineBasicBlock *Exit = *BB->succ_begin();
1867 if (Exit == BB)
1868 Exit = *std::next(BB->succ_begin());
1870 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
1871 MF.insert(std::next(BB->getIterator()), NewBB);
1873 // Clone all phis in BB into NewBB and rewrite.
1874 for (MachineInstr &MI : BB->phis()) {
1875 auto RC = MRI.getRegClass(MI.getOperand(0).getReg());
1876 Register OldR = MI.getOperand(3).getReg();
1877 Register R = MRI.createVirtualRegister(RC);
1878 SmallVector<MachineInstr *, 4> Uses;
1879 for (MachineInstr &Use : MRI.use_instructions(OldR))
1880 if (Use.getParent() != BB)
1881 Uses.push_back(&Use);
1882 for (MachineInstr *Use : Uses)
1883 Use->substituteRegister(OldR, R, /*SubIdx=*/0,
1884 *MRI.getTargetRegisterInfo());
1885 MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R)
1886 .addReg(OldR)
1887 .addMBB(BB);
1888 BlockMIs[{NewBB, &MI}] = NI;
1889 CanonicalMIs[NI] = &MI;
1891 BB->replaceSuccessor(Exit, NewBB);
1892 Exit->replacePhiUsesWith(BB, NewBB);
1893 NewBB->addSuccessor(Exit);
1895 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
1896 SmallVector<MachineOperand, 4> Cond;
1897 bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond);
1898 (void)CanAnalyzeBr;
1899 assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
1900 TII->removeBranch(*BB);
1901 TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB,
1902 Cond, DebugLoc());
1903 TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc());
1904 return NewBB;
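/// Return the register in BB that plays the same role as Reg does in its own
/// block, by looking up BB's copy of Reg's defining instruction and reading
/// the corresponding def operand.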
1907 Register
1908 PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
1909 MachineBasicBlock *BB) {
1910 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
1911 unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr);
1912 return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg();
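/// Rewrite MI after peeling. Illegal PHIs are replaced by the value they
/// forward (the loop-carried value, or the default when that stage is not
/// available in this block) and queued for deletion; any other instruction
/// whose stage is not live in its block is erased after its PHI users have
/// been redirected.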
1915 void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
1916 if (MI->isPHI()) {
1917 // This is an illegal PHI. The loop-carried (desired) value is operand 3,
1918 // and it is produced by this block.
1919 Register PhiR = MI->getOperand(0).getReg();
1920 Register R = MI->getOperand(3).getReg();
1921 int RMIStage = getStage(MRI.getUniqueVRegDef(R));
1922 if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
1923 R = MI->getOperand(1).getReg();
1924 MRI.setRegClass(R, MRI.getRegClass(PhiR));
1925 MRI.replaceRegWith(PhiR, R);
1926 // Postpone deleting the Phi as it may be referenced by BlockMIs and used
1927 // later to figure out how to remap registers.
1928 MI->getOperand(0).setReg(PhiR);
1929 IllegalPhisToDelete.push_back(MI);
1930 return;
1933 int Stage = getStage(MI);
1934 if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
1935 LiveStages[MI->getParent()].test(Stage))
1936 // Instruction is live, no rewriting to do.
1937 return;
1939 for (MachineOperand &DefMO : MI->defs()) {
1940 SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1941 for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
1942 // Only PHIs can use values from this block by construction.
1943 // Match with the equivalent PHI in B.
1944 assert(UseMI.isPHI());
1945 Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
1946 MI->getParent());
1947 Subs.emplace_back(&UseMI, Reg);
1949 for (auto &Sub : Subs)
1950 Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
1951 *MRI.getTargetRegisterInfo());
1953 if (LIS)
1954 LIS->RemoveMachineInstrFromMaps(*MI);
1955 MI->eraseFromParent();
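/// Rewrite the branches of the peeled prologs. Working outwards from the
/// kernel, each prolog either falls through towards the kernel or branches
/// directly to its matching epilog, depending on whether the trip count is
/// known (statically or dynamically) to exceed the iterations already covered.
/// If the kernel becomes unreachable it is disposed of; otherwise the loop's
/// trip count is adjusted.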
1958 void PeelingModuloScheduleExpander::fixupBranches() {
1959 // Work outwards from the kernel.
1960 bool KernelDisposed = false;
1961 int TC = Schedule.getNumStages() - 1;
1962 for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
1963 ++PI, ++EI, --TC) {
1964 MachineBasicBlock *Prolog = *PI;
1965 MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
1966 MachineBasicBlock *Epilog = *EI;
1967 SmallVector<MachineOperand, 4> Cond;
1968 TII->removeBranch(*Prolog);
1969 std::optional<bool> StaticallyGreater =
1970 LoopInfo->createTripCountGreaterCondition(TC, *Prolog, Cond);
1971 if (!StaticallyGreater) {
1972 LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
1973 // Dynamically branch based on Cond.
1974 TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
1975 } else if (*StaticallyGreater == false) {
1976 LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
1977 // Prolog never falls through; branch to epilog and orphan interior
1978 // blocks. Leave it to unreachable-block-elim to clean up.
1979 Prolog->removeSuccessor(Fallthrough);
1980 for (MachineInstr &P : Fallthrough->phis()) {
1981 P.removeOperand(2);
1982 P.removeOperand(1);
1984 TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
1985 KernelDisposed = true;
1986 } else {
1987 LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
1988 // Prolog always falls through; remove incoming values in epilog.
1989 Prolog->removeSuccessor(Epilog);
1990 for (MachineInstr &P : Epilog->phis()) {
1991 P.removeOperand(4);
1992 P.removeOperand(3);
1997 if (!KernelDisposed) {
1998 LoopInfo->adjustTripCount(-(Schedule.getNumStages() - 1));
1999 LoopInfo->setPreheader(Prologs.back());
2000 } else {
2001 LoopInfo->disposed();
2005 void PeelingModuloScheduleExpander::rewriteKernel() {
2006 KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2007 KR.rewrite();
2010 void PeelingModuloScheduleExpander::expand() {
2011 BB = Schedule.getLoop()->getTopBlock();
2012 Preheader = Schedule.getLoop()->getLoopPreheader();
2013 LLVM_DEBUG(Schedule.dump());
2014 LoopInfo = TII->analyzeLoopForPipelining(BB);
2015 assert(LoopInfo);
2017 rewriteKernel();
2018 peelPrologAndEpilogs();
2019 fixupBranches();
2022 void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
2023 BB = Schedule.getLoop()->getTopBlock();
2024 Preheader = Schedule.getLoop()->getLoopPreheader();
2026 // Dump the schedule before we invalidate and remap all its instructions.
2027 // Stash it in a string so we can print it if we found an error.
2028 std::string ScheduleDump;
2029 raw_string_ostream OS(ScheduleDump);
2030 Schedule.print(OS);
2031 OS.flush();
2033 // First, run the normal ModuloScheduleExpander. We don't support any
2034 // InstrChanges.
2035 assert(LIS && "Requires LiveIntervals!");
2036 ModuloScheduleExpander MSE(MF, Schedule, *LIS,
2037 ModuloScheduleExpander::InstrChangesTy());
2038 MSE.expand();
2039 MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
2040 if (!ExpandedKernel) {
2041 // The expander optimized away the kernel. We can't do any useful checking.
2042 MSE.cleanup();
2043 return;
2045 // Before running the KernelRewriter, re-add BB into the CFG.
2046 Preheader->addSuccessor(BB);
2048 // Now run the new expansion algorithm.
2049 KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
2050 KR.rewrite();
2051 peelPrologAndEpilogs();
2053 // Collect all illegal phis that the new algorithm created. We'll give these
2054 // to KernelOperandInfo.
2055 SmallPtrSet<MachineInstr *, 4> IllegalPhis;
2056 for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
2057 if (NI->isPHI())
2058 IllegalPhis.insert(&*NI);
2061 // Co-iterate across both kernels. We expect them to be identical apart from
2062 // phis and full COPYs (we look through both).
2063 SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
2064 auto OI = ExpandedKernel->begin();
2065 auto NI = BB->begin();
2066 for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
2067 while (OI->isPHI() || OI->isFullCopy())
2068 ++OI;
2069 while (NI->isPHI() || NI->isFullCopy())
2070 ++NI;
2071 assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
2072 // Analyze every operand separately.
2073 for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
2074 OOpI != OI->operands_end(); ++OOpI, ++NOpI)
2075 KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
2076 KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
2079 bool Failed = false;
2080 for (auto &OldAndNew : KOIs) {
2081 if (OldAndNew.first == OldAndNew.second)
2082 continue;
2083 Failed = true;
2084 errs() << "Modulo kernel validation error: [\n";
2085 errs() << " [golden] ";
2086 OldAndNew.first.print(errs());
2087 errs() << " ";
2088 OldAndNew.second.print(errs());
2089 errs() << "]\n";
2092 if (Failed) {
2093 errs() << "Golden reference kernel:\n";
2094 ExpandedKernel->print(errs());
2095 errs() << "New kernel:\n";
2096 BB->print(errs());
2097 errs() << ScheduleDump;
2098 report_fatal_error(
2099 "Modulo kernel validation (-pipeliner-experimental-cg) failed");
2102 // Cleanup by removing BB from the CFG again as the original
2103 // ModuloScheduleExpander intended.
2104 Preheader->removeSuccessor(BB);
2105 MSE.cleanup();
2108 MachineInstr *ModuloScheduleExpanderMVE::cloneInstr(MachineInstr *OldMI) {
2109 MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
2111 // TODO: Offset information needs to be corrected.
2112 NewMI->dropMemRefs(MF);
2114 return NewMI;
2117 /// Create a dedicated exit for Loop. Exit is the original exit for Loop.
2118 /// If it is already a dedicated exit, return it. Otherwise, insert a new
2119 /// block between them and return the new block.
2120 static MachineBasicBlock *createDedicatedExit(MachineBasicBlock *Loop,
2121 MachineBasicBlock *Exit) {
2122 if (Exit->pred_size() == 1)
2123 return Exit;
2125 MachineFunction *MF = Loop->getParent();
2126 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
2128 MachineBasicBlock *NewExit =
2129 MF->CreateMachineBasicBlock(Loop->getBasicBlock());
2130 MF->insert(Loop->getIterator(), NewExit);
2132 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2133 SmallVector<MachineOperand, 4> Cond;
2134 TII->analyzeBranch(*Loop, TBB, FBB, Cond);
2135 if (TBB == Loop)
2136 FBB = NewExit;
2137 else if (FBB == Loop)
2138 TBB = NewExit;
2139 else
2140 llvm_unreachable("unexpected loop structure");
2141 TII->removeBranch(*Loop);
2142 TII->insertBranch(*Loop, TBB, FBB, Cond, DebugLoc());
2143 Loop->replaceSuccessor(Exit, NewExit);
2144 TII->insertUnconditionalBranch(*NewExit, Exit, DebugLoc());
2145 NewExit->addSuccessor(Exit);
2147 Exit->replacePhiUsesWith(Loop, NewExit);
2149 return NewExit;
2152 /// Insert branch code into the end of MBB. It branches to GreaterThan if the
2153 /// remaining trip count for instructions in LastStage0Insts is greater than
2154 /// RequiredTC, and to Otherwise if not.
2155 void ModuloScheduleExpanderMVE::insertCondBranch(MachineBasicBlock &MBB,
2156 int RequiredTC,
2157 InstrMapTy &LastStage0Insts,
2158 MachineBasicBlock &GreaterThan,
2159 MachineBasicBlock &Otherwise) {
2160 SmallVector<MachineOperand, 4> Cond;
2161 LoopInfo->createRemainingIterationsGreaterCondition(RequiredTC, MBB, Cond,
2162 LastStage0Insts);
2164 if (SwapBranchTargetsMVE) {
2165 // Set SwapBranchTargetsMVE to true if a target prefers to swap TBB and
2166 // FBB for optimal performance.
2167 if (TII->reverseBranchCondition(Cond))
2168 llvm_unreachable("can not reverse branch condition");
2169 TII->insertBranch(MBB, &Otherwise, &GreaterThan, Cond, DebugLoc());
2170 } else {
2171 TII->insertBranch(MBB, &GreaterThan, &Otherwise, Cond, DebugLoc());
2175 /// Generate a pipelined loop that is unrolled by using MVE algorithm and any
2176 /// other necessary blocks. The control flow is modified to execute the
2177 /// pipelined loop if the trip count satisfies the condition, otherwise the
2178 /// original loop. The original loop is also used to execute the remainder
2179 /// iterations which occur due to unrolling.
2180 void ModuloScheduleExpanderMVE::generatePipelinedLoop() {
2181 // The control flow for pipelining with MVE:
2183 // OrigPreheader:
2184 // // The block that is originally the loop preheader
2185 // goto Check
2187 // Check:
2188 // // Check whether the trip count satisfies the requirements to pipeline.
2189 // if (LoopCounter > NumStages + NumUnroll - 2)
2190 // // The minimum number of iterations to pipeline =
2191 // // iterations executed in prolog/epilog (NumStages-1) +
2192 // // iterations executed in one kernel run (NumUnroll)
2193 // goto Prolog
2194 // // fallback to the original loop
2195 // goto NewPreheader
2197 // Prolog:
2198 // // All prolog stages. There are no direct branches to the epilogue.
2199 // goto NewKernel
2201 // NewKernel:
2202 // // NumUnroll copies of the kernel
2203 // if (LoopCounter > MVE-1)
2204 // goto NewKernel
2205 // goto Epilog
2207 // Epilog:
2208 // // All epilog stages.
2209 // if (LoopCounter > 0)
2210 // // The remainder is executed in the original loop
2211 // goto NewPreheader
2212 // goto NewExit
2214 // NewPreheader:
2215 // // Newly created preheader for the original loop.
2216 // // The initial values of the phis in the loop are merged from two paths.
2217 // NewInitVal = Phi OrigInitVal, Check, PipelineLastVal, Epilog
2218 // goto OrigKernel
2220 // OrigKernel:
2221 // // The original loop block.
2222 // if (LoopCounter != 0)
2223 // goto OrigKernel
2224 // goto NewExit
2226 // NewExit:
2227 // // Newly created dedicated exit for the original loop.
2228 // // Merge values which are referenced after the loop
2229 // Merged = Phi OrigVal, OrigKernel, PipelineVal, Epilog
2230 // goto OrigExit
2232 // OrigExit:
2233 // // The block that is originally the loop exit.
2234 // // If it is already a dedicated exit, NewExit is not created.
2236 // An example of where each stage is executed:
2237 // Assume #Stages 3, #MVE 4, #Iterations 12
2238 // Iter 0 1 2 3 4 5 6 7 8 9 10-11
2239 // -------------------------------------------------
2240 // Stage 0 Prolog#0
2241 // Stage 1 0 Prolog#1
2242 // Stage 2 1 0 Kernel Unroll#0 Iter#0
2243 // Stage 2 1 0 Kernel Unroll#1 Iter#0
2244 // Stage 2 1 0 Kernel Unroll#2 Iter#0
2245 // Stage 2 1 0 Kernel Unroll#3 Iter#0
2246 // Stage 2 1 0 Kernel Unroll#0 Iter#1
2247 // Stage 2 1 0 Kernel Unroll#1 Iter#1
2248 // Stage 2 1 0 Kernel Unroll#2 Iter#1
2249 // Stage 2 1 0 Kernel Unroll#3 Iter#1
2250 // Stage 2 1 Epilog#0
2251 // Stage 2 Epilog#1
2252 // Stage 0-2 OrigKernel
2254 LoopInfo = TII->analyzeLoopForPipelining(OrigKernel);
2255 assert(LoopInfo && "Must be able to analyze loop!");
2257 calcNumUnroll();
2259 Check = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2260 Prolog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2261 NewKernel = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2262 Epilog = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2263 NewPreheader = MF.CreateMachineBasicBlock(OrigKernel->getBasicBlock());
2265 MF.insert(OrigKernel->getIterator(), Check);
2266 MF.insert(OrigKernel->getIterator(), Prolog);
2267 MF.insert(OrigKernel->getIterator(), NewKernel);
2268 MF.insert(OrigKernel->getIterator(), Epilog);
2269 MF.insert(OrigKernel->getIterator(), NewPreheader);
2271 NewExit = createDedicatedExit(OrigKernel, OrigExit);
2273 NewPreheader->transferSuccessorsAndUpdatePHIs(OrigPreheader);
2274 TII->insertUnconditionalBranch(*NewPreheader, OrigKernel, DebugLoc());
2276 OrigPreheader->addSuccessor(Check);
2277 TII->removeBranch(*OrigPreheader);
2278 TII->insertUnconditionalBranch(*OrigPreheader, Check, DebugLoc());
2280 Check->addSuccessor(Prolog);
2281 Check->addSuccessor(NewPreheader);
2283 Prolog->addSuccessor(NewKernel);
2285 NewKernel->addSuccessor(NewKernel);
2286 NewKernel->addSuccessor(Epilog);
2288 Epilog->addSuccessor(NewPreheader);
2289 Epilog->addSuccessor(NewExit);
2291 InstrMapTy LastStage0Insts;
2292 insertCondBranch(*Check, Schedule.getNumStages() + NumUnroll - 2,
2293 LastStage0Insts, *Prolog, *NewPreheader);
2295 // VRMaps map (prolog/kernel/epilog phase#, original register#) to new
2296 // register#
2297 SmallVector<ValueMapTy> PrologVRMap, KernelVRMap, EpilogVRMap;
2298 generateProlog(PrologVRMap);
2299 generateKernel(PrologVRMap, KernelVRMap, LastStage0Insts);
2300 generateEpilog(KernelVRMap, EpilogVRMap, LastStage0Insts);
2303 /// Replace MI's use operands according to the maps.
2304 void ModuloScheduleExpanderMVE::updateInstrUse(
2305 MachineInstr *MI, int StageNum, int PhaseNum,
2306 SmallVectorImpl<ValueMapTy> &CurVRMap,
2307 SmallVectorImpl<ValueMapTy> *PrevVRMap) {
2308 // If MI is in the prolog/kernel/epilog block, CurVRMap is
2309 // PrologVRMap/KernelVRMap/EpilogVRMap respectively.
2310 // PrevVRMap is nullptr/PhiVRMap/KernelVRMap respectively.
2311 // Refer to the appropriate map according to the stage difference between
2312 // MI and the definition of an operand.
2314 for (MachineOperand &UseMO : MI->uses()) {
2315 if (!UseMO.isReg() || !UseMO.getReg().isVirtual())
2316 continue;
2317 int DiffStage = 0;
2318 Register OrigReg = UseMO.getReg();
2319 MachineInstr *DefInst = MRI.getVRegDef(OrigReg);
2320 if (!DefInst || DefInst->getParent() != OrigKernel)
2321 continue;
2322 unsigned InitReg = 0;
2323 unsigned DefReg = OrigReg;
2324 if (DefInst->isPHI()) {
2325 ++DiffStage;
2326 unsigned LoopReg;
2327 getPhiRegs(*DefInst, OrigKernel, InitReg, LoopReg);
2328 // LoopReg is guaranteed to be defined within the loop by canApply()
2329 DefReg = LoopReg;
2330 DefInst = MRI.getVRegDef(LoopReg);
2332 unsigned DefStageNum = Schedule.getStage(DefInst);
2333 DiffStage += StageNum - DefStageNum;
2334 Register NewReg;
2335 if (PhaseNum >= DiffStage && CurVRMap[PhaseNum - DiffStage].count(DefReg))
2336 // NewReg is defined in a previous phase of the same block
2337 NewReg = CurVRMap[PhaseNum - DiffStage][DefReg];
2338 else if (!PrevVRMap)
2339 // Since this is the first iteration, refer to the initial register of the
2340 // loop
2341 NewReg = InitReg;
2342 else
2343 // Cases where DiffStage is larger than PhaseNum.
2344 // If MI is in the kernel block, the value is defined by the previous
2345 // iteration and PhiVRMap is referenced. If MI is in the epilog block, the
2346 // value is defined in the kernel block and KernelVRMap is referenced.
2347 NewReg = (*PrevVRMap)[PrevVRMap->size() - (DiffStage - PhaseNum)][DefReg];
2349 const TargetRegisterClass *NRC =
2350 MRI.constrainRegClass(NewReg, MRI.getRegClass(OrigReg));
2351 if (NRC)
2352 UseMO.setReg(NewReg);
2353 else {
2354 Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2355 BuildMI(*OrigKernel, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
2356 SplitReg)
2357 .addReg(NewReg);
2358 UseMO.setReg(SplitReg);
2363 /// Return the phi that uses Reg as its loop-carried value, if any.
2364 /// canApply() guarantees that at most one such phi exists.
2365 static MachineInstr *getLoopPhiUser(Register Reg, MachineBasicBlock *Loop) {
2366 for (MachineInstr &Phi : Loop->phis()) {
2367 unsigned InitVal, LoopVal;
2368 getPhiRegs(Phi, Loop, InitVal, LoopVal);
2369 if (LoopVal == Reg)
2370 return &Phi;
2372 return nullptr;
2375 /// Generate phis for registers defined by OrigMI.
2376 void ModuloScheduleExpanderMVE::generatePhi(
2377 MachineInstr *OrigMI, int UnrollNum,
2378 SmallVectorImpl<ValueMapTy> &PrologVRMap,
2379 SmallVectorImpl<ValueMapTy> &KernelVRMap,
2380 SmallVectorImpl<ValueMapTy> &PhiVRMap) {
2381 int StageNum = Schedule.getStage(OrigMI);
2382 bool UsePrologReg;
2383 if (Schedule.getNumStages() - NumUnroll + UnrollNum - 1 >= StageNum)
2384 UsePrologReg = true;
2385 else if (Schedule.getNumStages() - NumUnroll + UnrollNum == StageNum)
2386 UsePrologReg = false;
2387 else
2388 return;
2390 // Examples that show which stages are merged by phi.
2391 // Meaning of the symbol following the stage number:
2392 // a/b: Stages with the same letter are merged (UsePrologReg == true)
2393 // +: Merged with the initial value (UsePrologReg == false)
2394 // *: No phis required
2396 // #Stages 3, #MVE 4
2397 // Iter 0 1 2 3 4 5 6 7 8
2398 // -----------------------------------------
2399 // Stage 0a Prolog#0
2400 // Stage 1a 0b Prolog#1
2401 // Stage 2* 1* 0* Kernel Unroll#0
2402 // Stage 2* 1* 0+ Kernel Unroll#1
2403 // Stage 2* 1+ 0a Kernel Unroll#2
2404 // Stage 2+ 1a 0b Kernel Unroll#3
2406 // #Stages 3, #MVE 2
2407 // Iter 0 1 2 3 4 5 6 7 8
2408 // -----------------------------------------
2409 // Stage 0a Prolog#0
2410 // Stage 1a 0b Prolog#1
2411 // Stage 2* 1+ 0a Kernel Unroll#0
2412 // Stage 2+ 1a 0b Kernel Unroll#1
2414 // #Stages 3, #MVE 1
2415 // Iter 0 1 2 3 4 5 6 7 8
2416 // -----------------------------------------
2417 // Stage 0* Prolog#0
2418 // Stage 1a 0b Prolog#1
2419 // Stage 2+ 1a 0b Kernel Unroll#0
2421 for (MachineOperand &DefMO : OrigMI->defs()) {
2422 if (!DefMO.isReg() || DefMO.isDead())
2423 continue;
2424 Register OrigReg = DefMO.getReg();
2425 auto NewReg = KernelVRMap[UnrollNum].find(OrigReg);
2426 if (NewReg == KernelVRMap[UnrollNum].end())
2427 continue;
2428 Register CorrespondReg;
2429 if (UsePrologReg) {
2430 int PrologNum = Schedule.getNumStages() - NumUnroll + UnrollNum - 1;
2431 CorrespondReg = PrologVRMap[PrologNum][OrigReg];
2432 } else {
2433 MachineInstr *Phi = getLoopPhiUser(OrigReg, OrigKernel);
2434 if (!Phi)
2435 continue;
2436 CorrespondReg = getInitPhiReg(*Phi, OrigKernel);
2439 assert(CorrespondReg.isValid());
2440 Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2441 BuildMI(*NewKernel, NewKernel->getFirstNonPHI(), DebugLoc(),
2442 TII->get(TargetOpcode::PHI), PhiReg)
2443 .addReg(NewReg->second)
2444 .addMBB(NewKernel)
2445 .addReg(CorrespondReg)
2446 .addMBB(Prolog);
2447 PhiVRMap[UnrollNum][OrigReg] = PhiReg;
2451 static void replacePhiSrc(MachineInstr &Phi, Register OrigReg, Register NewReg,
2452 MachineBasicBlock *NewMBB) {
2453 for (unsigned Idx = 1; Idx < Phi.getNumOperands(); Idx += 2) {
2454 if (Phi.getOperand(Idx).getReg() == OrigReg) {
2455 Phi.getOperand(Idx).setReg(NewReg);
2456 Phi.getOperand(Idx + 1).setMBB(NewMBB);
2457 return;
2462 /// Generate phis that merge values from multiple routes
2463 void ModuloScheduleExpanderMVE::mergeRegUsesAfterPipeline(Register OrigReg,
2464 Register NewReg) {
2465 SmallVector<MachineOperand *> UsesAfterLoop;
2466 SmallVector<MachineInstr *> LoopPhis;
2467 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(OrigReg),
2468 E = MRI.use_end();
2469 I != E; ++I) {
2470 MachineOperand &O = *I;
2471 if (O.getParent()->getParent() != OrigKernel &&
2472 O.getParent()->getParent() != Prolog &&
2473 O.getParent()->getParent() != NewKernel &&
2474 O.getParent()->getParent() != Epilog)
2475 UsesAfterLoop.push_back(&O);
2476 if (O.getParent()->getParent() == OrigKernel && O.getParent()->isPHI())
2477 LoopPhis.push_back(O.getParent());
2480 // Merge the route that only executes the pipelined loop (when there are no
2481 // remaining iterations) with the route that executes the original loop.
2482 if (!UsesAfterLoop.empty()) {
2483 Register PhiReg = MRI.createVirtualRegister(MRI.getRegClass(OrigReg));
2484 BuildMI(*NewExit, NewExit->getFirstNonPHI(), DebugLoc(),
2485 TII->get(TargetOpcode::PHI), PhiReg)
2486 .addReg(OrigReg)
2487 .addMBB(OrigKernel)
2488 .addReg(NewReg)
2489 .addMBB(Epilog);
2491 for (MachineOperand *MO : UsesAfterLoop)
2492 MO->setReg(PhiReg);
2494 if (!LIS.hasInterval(PhiReg))
2495 LIS.createEmptyInterval(PhiReg);
2498 // Merge the value coming from the pipelined loop with the value from the
2499 // bypass route in front of the original loop.
2500 if (!LoopPhis.empty()) {
2501 for (MachineInstr *Phi : LoopPhis) {
2502 unsigned InitReg, LoopReg;
2503 getPhiRegs(*Phi, OrigKernel, InitReg, LoopReg);
2504 Register NewInit = MRI.createVirtualRegister(MRI.getRegClass(InitReg));
2505 BuildMI(*NewPreheader, NewPreheader->getFirstNonPHI(), Phi->getDebugLoc(),
2506 TII->get(TargetOpcode::PHI), NewInit)
2507 .addReg(InitReg)
2508 .addMBB(Check)
2509 .addReg(NewReg)
2510 .addMBB(Epilog);
2511 replacePhiSrc(*Phi, InitReg, NewInit, NewPreheader);
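/// Generate the prolog. For prolog copy N, clone every non-PHI kernel
/// instruction whose stage is at most N into the Prolog block, record the new
/// virtual registers in PrologVRMap[N], and then rewrite the clones' uses
/// against those maps.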
2516 void ModuloScheduleExpanderMVE::generateProlog(
2517 SmallVectorImpl<ValueMapTy> &PrologVRMap) {
2518 PrologVRMap.clear();
2519 PrologVRMap.resize(Schedule.getNumStages() - 1);
2520 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2521 for (int PrologNum = 0; PrologNum < Schedule.getNumStages() - 1;
2522 ++PrologNum) {
2523 for (MachineInstr *MI : Schedule.getInstructions()) {
2524 if (MI->isPHI())
2525 continue;
2526 int StageNum = Schedule.getStage(MI);
2527 if (StageNum > PrologNum)
2528 continue;
2529 MachineInstr *NewMI = cloneInstr(MI);
2530 updateInstrDef(NewMI, PrologVRMap[PrologNum], false);
2531 NewMIMap[NewMI] = {PrologNum, StageNum};
2532 Prolog->push_back(NewMI);
2536 for (auto I : NewMIMap) {
2537 MachineInstr *MI = I.first;
2538 int PrologNum = I.second.first;
2539 int StageNum = I.second.second;
2540 updateInstrUse(MI, StageNum, PrologNum, PrologVRMap, nullptr);
2543 LLVM_DEBUG({
2544 dbgs() << "prolog:\n";
2545 Prolog->dump();
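/// Generate the unrolled kernel: NumUnroll copies of the non-PHI kernel
/// instructions are emitted into NewKernel, PHIs are created to merge values
/// coming from the prolog or from earlier unrolled copies, the clones from the
/// last copy are remembered in LastStage0Insts for the trip-count checks, and
/// the block is terminated with a conditional back edge.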
2549 void ModuloScheduleExpanderMVE::generateKernel(
2550 SmallVectorImpl<ValueMapTy> &PrologVRMap,
2551 SmallVectorImpl<ValueMapTy> &KernelVRMap, InstrMapTy &LastStage0Insts) {
2552 KernelVRMap.clear();
2553 KernelVRMap.resize(NumUnroll);
2554 SmallVector<ValueMapTy> PhiVRMap;
2555 PhiVRMap.resize(NumUnroll);
2556 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2557 DenseMap<MachineInstr *, MachineInstr *> MIMapLastStage0;
2558 for (int UnrollNum = 0; UnrollNum < NumUnroll; ++UnrollNum) {
2559 for (MachineInstr *MI : Schedule.getInstructions()) {
2560 if (MI->isPHI())
2561 continue;
2562 int StageNum = Schedule.getStage(MI);
2563 MachineInstr *NewMI = cloneInstr(MI);
2564 if (UnrollNum == NumUnroll - 1)
2565 LastStage0Insts[MI] = NewMI;
2566 updateInstrDef(NewMI, KernelVRMap[UnrollNum],
2567 (UnrollNum == NumUnroll - 1 && StageNum == 0));
2568 generatePhi(MI, UnrollNum, PrologVRMap, KernelVRMap, PhiVRMap);
2569 NewMIMap[NewMI] = {UnrollNum, StageNum};
2570 NewKernel->push_back(NewMI);
2574 for (auto I : NewMIMap) {
2575 MachineInstr *MI = I.first;
2576 int UnrollNum = I.second.first;
2577 int StageNum = I.second.second;
2578 updateInstrUse(MI, StageNum, UnrollNum, KernelVRMap, &PhiVRMap);
2581 // If the remaining trip count is greater than NumUnroll-1, the loop continues.
2582 insertCondBranch(*NewKernel, NumUnroll - 1, LastStage0Insts, *NewKernel,
2583 *Epilog);
2585 LLVM_DEBUG({
2586 dbgs() << "kernel:\n";
2587 NewKernel->dump();
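/// Generate the epilog. For epilog copy N, clone every non-PHI kernel
/// instruction whose stage is greater than N, taking operand values from the
/// kernel copies, and finish with a branch that either re-enters the original
/// loop for any remaining iterations or exits.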
2591 void ModuloScheduleExpanderMVE::generateEpilog(
2592 SmallVectorImpl<ValueMapTy> &KernelVRMap,
2593 SmallVectorImpl<ValueMapTy> &EpilogVRMap, InstrMapTy &LastStage0Insts) {
2594 EpilogVRMap.clear();
2595 EpilogVRMap.resize(Schedule.getNumStages() - 1);
2596 DenseMap<MachineInstr *, std::pair<int, int>> NewMIMap;
2597 for (int EpilogNum = 0; EpilogNum < Schedule.getNumStages() - 1;
2598 ++EpilogNum) {
2599 for (MachineInstr *MI : Schedule.getInstructions()) {
2600 if (MI->isPHI())
2601 continue;
2602 int StageNum = Schedule.getStage(MI);
2603 if (StageNum <= EpilogNum)
2604 continue;
2605 MachineInstr *NewMI = cloneInstr(MI);
2606 updateInstrDef(NewMI, EpilogVRMap[EpilogNum], StageNum - 1 == EpilogNum);
2607 NewMIMap[NewMI] = {EpilogNum, StageNum};
2608 Epilog->push_back(NewMI);
2612 for (auto I : NewMIMap) {
2613 MachineInstr *MI = I.first;
2614 int EpilogNum = I.second.first;
2615 int StageNum = I.second.second;
2616 updateInstrUse(MI, StageNum, EpilogNum, EpilogVRMap, &KernelVRMap);
2619 // If there are remaining iterations, they are executed in the original loop.
2620 // Instructions related to loop control, such as loop counter comparison,
2621 // are indicated by shouldIgnoreForPipelining() and are assumed to be placed
2622 // in stage 0. Thus, the map refers to their clones in the last unrolled copy of the kernel.
2623 insertCondBranch(*Epilog, 0, LastStage0Insts, *NewPreheader, *NewExit);
2625 LLVM_DEBUG({
2626 dbgs() << "epilog:\n";
2627 Epilog->dump();
2631 /// Calculate the required kernel unroll count and store it in NumUnroll.
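/// The unroll count is chosen so that, within one unrolled kernel body, every
/// cross-iteration use can be fed by a definition emitted earlier in the same
/// body; it grows with the stage distance between a use and its definition,
/// with adjustments for loop-carried phis and for the relative order of use
/// and definition within the kernel.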
2632 void ModuloScheduleExpanderMVE::calcNumUnroll() {
2633 DenseMap<MachineInstr *, unsigned> Inst2Idx;
2634 NumUnroll = 1;
2635 for (unsigned I = 0; I < Schedule.getInstructions().size(); ++I)
2636 Inst2Idx[Schedule.getInstructions()[I]] = I;
2638 for (MachineInstr *MI : Schedule.getInstructions()) {
2639 if (MI->isPHI())
2640 continue;
2641 int StageNum = Schedule.getStage(MI);
2642 for (const MachineOperand &MO : MI->uses()) {
2643 if (!MO.isReg() || !MO.getReg().isVirtual())
2644 continue;
2645 MachineInstr *DefMI = MRI.getVRegDef(MO.getReg());
2646 if (DefMI->getParent() != OrigKernel)
2647 continue;
2649 int NumUnrollLocal = 1;
2650 if (DefMI->isPHI()) {
2651 ++NumUnrollLocal;
2652 // canApply() guarantees that DefMI is not a phi and is an instruction in
2653 // the loop
2654 DefMI = MRI.getVRegDef(getLoopPhiReg(*DefMI, OrigKernel));
2656 NumUnrollLocal += StageNum - Schedule.getStage(DefMI);
2657 if (Inst2Idx[MI] <= Inst2Idx[DefMI])
2658 --NumUnrollLocal;
2659 NumUnroll = std::max(NumUnroll, NumUnrollLocal);
2662 LLVM_DEBUG(dbgs() << "NumUnroll: " << NumUnroll << "\n");
2665 /// Create new virtual registers for definitions of NewMI and update NewMI.
2666 /// If the definitions are referenced after the pipelined loop, phis are
2667 /// created to merge with other routes.
2668 void ModuloScheduleExpanderMVE::updateInstrDef(MachineInstr *NewMI,
2669 ValueMapTy &VRMap,
2670 bool LastDef) {
2671 for (MachineOperand &MO : NewMI->all_defs()) {
2672 if (!MO.getReg().isVirtual())
2673 continue;
2674 Register Reg = MO.getReg();
2675 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
2676 Register NewReg = MRI.createVirtualRegister(RC);
2677 MO.setReg(NewReg);
2678 VRMap[Reg] = NewReg;
2679 if (LastDef)
2680 mergeRegUsesAfterPipeline(Reg, NewReg);
2684 void ModuloScheduleExpanderMVE::expand() {
2685 OrigKernel = Schedule.getLoop()->getTopBlock();
2686 OrigPreheader = Schedule.getLoop()->getLoopPreheader();
2687 OrigExit = Schedule.getLoop()->getExitBlock();
2689 LLVM_DEBUG(Schedule.dump());
2691 generatePipelinedLoop();
2694 /// Check if ModuloScheduleExpanderMVE can be applied to L
2695 bool ModuloScheduleExpanderMVE::canApply(MachineLoop &L) {
2696 if (!L.getExitBlock()) {
2697 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: No single exit block.\n");
2698 return false;
2701 MachineBasicBlock *BB = L.getTopBlock();
2702 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
2704 // Put some constraints on the operands of the phis to simplify the
2705 // transformation
2706 DenseSet<unsigned> UsedByPhi;
2707 for (MachineInstr &MI : BB->phis()) {
2708 // Registers defined by phis must be used only inside the loop and never be
2709 // used by phis.
2710 for (MachineOperand &MO : MI.defs())
2711 if (MO.isReg())
2712 for (MachineInstr &Ref : MRI.use_instructions(MO.getReg()))
2713 if (Ref.getParent() != BB || Ref.isPHI()) {
2714 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A phi result is "
2715 "referenced outside of the loop or by phi.\n");
2716 return false;
2719 // A source register from the loop block must be defined inside the loop.
2720 // A register defined inside the loop must be referenced by at most one
2721 // phi.
2722 unsigned InitVal, LoopVal;
2723 getPhiRegs(MI, MI.getParent(), InitVal, LoopVal);
2724 if (!Register(LoopVal).isVirtual() ||
2725 MRI.getVRegDef(LoopVal)->getParent() != BB) {
2726 LLVM_DEBUG(
2727 dbgs() << "Can not apply MVE expander: A phi source value coming "
2728 "from the loop is not defined in the loop.\n");
2729 return false;
2731 if (UsedByPhi.count(LoopVal)) {
2732 LLVM_DEBUG(dbgs() << "Can not apply MVE expander: A value defined in the "
2733 "loop is referenced by two or more phis.\n");
2734 return false;
2736 UsedByPhi.insert(LoopVal);
2739 return true;
2742 //===----------------------------------------------------------------------===//
2743 // ModuloScheduleTestPass implementation
2744 //===----------------------------------------------------------------------===//
2745 // This pass constructs a ModuloSchedule from the annotated machine function and runs
2746 // ModuloScheduleExpander.
2748 // The module is expected to contain a single-block analyzable loop.
2749 // The total order of instructions is taken from the loop as-is.
2750 // Instructions are expected to be annotated with a PostInstrSymbol.
2751 // This PostInstrSymbol must have the following format:
2752 // "Stage=%d Cycle=%d".
2753 //===----------------------------------------------------------------------===//
2755 namespace {
2756 class ModuloScheduleTest : public MachineFunctionPass {
2757 public:
2758 static char ID;
2760 ModuloScheduleTest() : MachineFunctionPass(ID) {
2761 initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
2764 bool runOnMachineFunction(MachineFunction &MF) override;
2765 void runOnLoop(MachineFunction &MF, MachineLoop &L);
2767 void getAnalysisUsage(AnalysisUsage &AU) const override {
2768 AU.addRequired<MachineLoopInfoWrapperPass>();
2769 AU.addRequired<LiveIntervalsWrapperPass>();
2770 MachineFunctionPass::getAnalysisUsage(AU);
2773 } // namespace
2775 char ModuloScheduleTest::ID = 0;
2777 INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
2778 "Modulo Schedule test pass", false, false)
2779 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
2780 INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
2781 INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
2782 "Modulo Schedule test pass", false, false)
2784 bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
2785 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2786 for (auto *L : MLI) {
2787 if (L->getTopBlock() != L->getBottomBlock())
2788 continue;
2789 runOnLoop(MF, *L);
2790 return false;
2792 return false;
2795 static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
2796 std::pair<StringRef, StringRef> StageAndCycle = getToken(S, "_");
2797 std::pair<StringRef, StringRef> StageTokenAndValue =
2798 getToken(StageAndCycle.first, "-");
2799 std::pair<StringRef, StringRef> CycleTokenAndValue =
2800 getToken(StageAndCycle.second, "-");
2801 if (StageTokenAndValue.first != "Stage" ||
2802 CycleTokenAndValue.first != "_Cycle") {
2803 llvm_unreachable(
2804 "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
2805 return;
2808 StageTokenAndValue.second.drop_front().getAsInteger(10, Stage);
2809 CycleTokenAndValue.second.drop_front().getAsInteger(10, Cycle);
2811 dbgs() << " Stage=" << Stage << ", Cycle=" << Cycle << "\n";
2814 void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
2815 LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
2816 MachineBasicBlock *BB = L.getTopBlock();
2817 dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
2819 DenseMap<MachineInstr *, int> Cycle, Stage;
2820 std::vector<MachineInstr *> Instrs;
2821 for (MachineInstr &MI : *BB) {
2822 if (MI.isTerminator())
2823 continue;
2824 Instrs.push_back(&MI);
2825 if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
2826 dbgs() << "Parsing post-instr symbol for " << MI;
2827 parseSymbolString(Sym->getName(), Cycle[&MI], Stage[&MI]);
2831 ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
2832 std::move(Stage));
2833 ModuloScheduleExpander MSE(
2834 MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
2835 MSE.expand();
2836 MSE.cleanup();
2839 //===----------------------------------------------------------------------===//
2840 // ModuloScheduleTestAnnotater implementation
2841 //===----------------------------------------------------------------------===//
2843 void ModuloScheduleTestAnnotater::annotate() {
2844 for (MachineInstr *MI : S.getInstructions()) {
2845 SmallVector<char, 16> SV;
2846 raw_svector_ostream OS(SV);
2847 OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
2848 MCSymbol *Sym = MF.getContext().getOrCreateSymbol(OS.str());
2849 MI->setPostInstrSymbol(MF, Sym);