[MIPS GlobalISel] Select MSA vector generic and builtin add
[llvm-complete.git] / lib / CodeGen / ModuloSchedule.cpp
blob7ce3c5861801a51ea849752d5b5eac749e6b30b5
1 //===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "llvm/CodeGen/ModuloSchedule.h"
10 #include "llvm/ADT/StringExtras.h"
11 #include "llvm/CodeGen/LiveIntervals.h"
12 #include "llvm/CodeGen/MachineInstrBuilder.h"
13 #include "llvm/CodeGen/MachineLoopUtils.h"
14 #include "llvm/CodeGen/MachineRegisterInfo.h"
15 #include "llvm/CodeGen/TargetInstrInfo.h"
16 #include "llvm/MC/MCContext.h"
17 #include "llvm/Support/Debug.h"
18 #include "llvm/Support/ErrorHandling.h"
19 #include "llvm/Support/raw_ostream.h"
21 #define DEBUG_TYPE "pipeliner"
22 using namespace llvm;
24 void ModuloSchedule::print(raw_ostream &OS) {
25 for (MachineInstr *MI : ScheduledInstrs)
26 OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
29 //===----------------------------------------------------------------------===//
30 // ModuloScheduleExpander implementation
31 //===----------------------------------------------------------------------===//
33 /// Return the register values for the operands of a Phi instruction.
34 /// This function assume the instruction is a Phi.
35 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
36 unsigned &InitVal, unsigned &LoopVal) {
37 assert(Phi.isPHI() && "Expecting a Phi.");
39 InitVal = 0;
40 LoopVal = 0;
41 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
42 if (Phi.getOperand(i + 1).getMBB() != Loop)
43 InitVal = Phi.getOperand(i).getReg();
44 else
45 LoopVal = Phi.getOperand(i).getReg();
47 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
50 /// Return the Phi register value that comes from the incoming block.
51 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
52 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
53 if (Phi.getOperand(i + 1).getMBB() != LoopBB)
54 return Phi.getOperand(i).getReg();
55 return 0;
58 /// Return the Phi register value that comes the loop block.
59 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
60 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
61 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
62 return Phi.getOperand(i).getReg();
63 return 0;
66 void ModuloScheduleExpander::expand() {
67 BB = Schedule.getLoop()->getTopBlock();
68 Preheader = *BB->pred_begin();
69 if (Preheader == BB)
70 Preheader = *std::next(BB->pred_begin());
72 // Iterate over the definitions in each instruction, and compute the
73 // stage difference for each use. Keep the maximum value.
74 for (MachineInstr *MI : Schedule.getInstructions()) {
75 int DefStage = Schedule.getStage(MI);
76 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
77 MachineOperand &Op = MI->getOperand(i);
78 if (!Op.isReg() || !Op.isDef())
79 continue;
81 Register Reg = Op.getReg();
82 unsigned MaxDiff = 0;
83 bool PhiIsSwapped = false;
84 for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
85 EI = MRI.use_end();
86 UI != EI; ++UI) {
87 MachineOperand &UseOp = *UI;
88 MachineInstr *UseMI = UseOp.getParent();
89 int UseStage = Schedule.getStage(UseMI);
90 unsigned Diff = 0;
91 if (UseStage != -1 && UseStage >= DefStage)
92 Diff = UseStage - DefStage;
93 if (MI->isPHI()) {
94 if (isLoopCarried(*MI))
95 ++Diff;
96 else
97 PhiIsSwapped = true;
99 MaxDiff = std::max(Diff, MaxDiff);
101 RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
105 generatePipelinedLoop();
108 void ModuloScheduleExpander::generatePipelinedLoop() {
109 LoopInfo = TII->analyzeLoopForPipelining(BB);
110 assert(LoopInfo && "Must be able to analyze loop!");
112 // Create a new basic block for the kernel and add it to the CFG.
113 MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
115 unsigned MaxStageCount = Schedule.getNumStages() - 1;
117 // Remember the registers that are used in different stages. The index is
118 // the iteration, or stage, that the instruction is scheduled in. This is
119 // a map between register names in the original block and the names created
120 // in each stage of the pipelined loop.
121 ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
122 InstrMapTy InstrMap;
124 SmallVector<MachineBasicBlock *, 4> PrologBBs;
126 // Generate the prolog instructions that set up the pipeline.
127 generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
128 MF.insert(BB->getIterator(), KernelBB);
130 // Rearrange the instructions to generate the new, pipelined loop,
131 // and update register names as needed.
132 for (MachineInstr *CI : Schedule.getInstructions()) {
133 if (CI->isPHI())
134 continue;
135 unsigned StageNum = Schedule.getStage(CI);
136 MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
137 updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
138 KernelBB->push_back(NewMI);
139 InstrMap[NewMI] = CI;
142 // Copy any terminator instructions to the new kernel, and update
143 // names as needed.
144 for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
145 E = BB->instr_end();
146 I != E; ++I) {
147 MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
148 updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
149 KernelBB->push_back(NewMI);
150 InstrMap[NewMI] = &*I;
153 NewKernel = KernelBB;
154 KernelBB->transferSuccessors(BB);
155 KernelBB->replaceSuccessor(BB, KernelBB);
157 generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
158 InstrMap, MaxStageCount, MaxStageCount, false);
159 generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, InstrMap,
160 MaxStageCount, MaxStageCount, false);
162 LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
164 SmallVector<MachineBasicBlock *, 4> EpilogBBs;
165 // Generate the epilog instructions to complete the pipeline.
166 generateEpilog(MaxStageCount, KernelBB, VRMap, EpilogBBs, PrologBBs);
168 // We need this step because the register allocation doesn't handle some
169 // situations well, so we insert copies to help out.
170 splitLifetimes(KernelBB, EpilogBBs);
172 // Remove dead instructions due to loop induction variables.
173 removeDeadInstructions(KernelBB, EpilogBBs);
175 // Add branches between prolog and epilog blocks.
176 addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
178 delete[] VRMap;
181 void ModuloScheduleExpander::cleanup() {
182 // Remove the original loop since it's no longer referenced.
183 for (auto &I : *BB)
184 LIS.RemoveMachineInstrFromMaps(I);
185 BB->clear();
186 BB->eraseFromParent();
189 /// Generate the pipeline prolog code.
190 void ModuloScheduleExpander::generateProlog(unsigned LastStage,
191 MachineBasicBlock *KernelBB,
192 ValueMapTy *VRMap,
193 MBBVectorTy &PrologBBs) {
194 MachineBasicBlock *PredBB = Preheader;
195 InstrMapTy InstrMap;
197 // Generate a basic block for each stage, not including the last stage,
198 // which will be generated in the kernel. Each basic block may contain
199 // instructions from multiple stages/iterations.
200 for (unsigned i = 0; i < LastStage; ++i) {
201 // Create and insert the prolog basic block prior to the original loop
202 // basic block. The original loop is removed later.
203 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
204 PrologBBs.push_back(NewBB);
205 MF.insert(BB->getIterator(), NewBB);
206 NewBB->transferSuccessors(PredBB);
207 PredBB->addSuccessor(NewBB);
208 PredBB = NewBB;
210 // Generate instructions for each appropriate stage. Process instructions
211 // in original program order.
212 for (int StageNum = i; StageNum >= 0; --StageNum) {
213 for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
214 BBE = BB->getFirstTerminator();
215 BBI != BBE; ++BBI) {
216 if (Schedule.getStage(&*BBI) == StageNum) {
217 if (BBI->isPHI())
218 continue;
219 MachineInstr *NewMI =
220 cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
221 updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
222 NewBB->push_back(NewMI);
223 InstrMap[NewMI] = &*BBI;
227 rewritePhiValues(NewBB, i, VRMap, InstrMap);
228 LLVM_DEBUG({
229 dbgs() << "prolog:\n";
230 NewBB->dump();
234 PredBB->replaceSuccessor(BB, KernelBB);
236 // Check if we need to remove the branch from the preheader to the original
237 // loop, and replace it with a branch to the new loop.
238 unsigned numBranches = TII->removeBranch(*Preheader);
239 if (numBranches) {
240 SmallVector<MachineOperand, 0> Cond;
241 TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
245 /// Generate the pipeline epilog code. The epilog code finishes the iterations
246 /// that were started in either the prolog or the kernel. We create a basic
247 /// block for each stage that needs to complete.
248 void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
249 MachineBasicBlock *KernelBB,
250 ValueMapTy *VRMap,
251 MBBVectorTy &EpilogBBs,
252 MBBVectorTy &PrologBBs) {
253 // We need to change the branch from the kernel to the first epilog block, so
254 // this call to analyze branch uses the kernel rather than the original BB.
255 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
256 SmallVector<MachineOperand, 4> Cond;
257 bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
258 assert(!checkBranch && "generateEpilog must be able to analyze the branch");
259 if (checkBranch)
260 return;
262 MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
263 if (*LoopExitI == KernelBB)
264 ++LoopExitI;
265 assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
266 MachineBasicBlock *LoopExitBB = *LoopExitI;
268 MachineBasicBlock *PredBB = KernelBB;
269 MachineBasicBlock *EpilogStart = LoopExitBB;
270 InstrMapTy InstrMap;
272 // Generate a basic block for each stage, not including the last stage,
273 // which was generated for the kernel. Each basic block may contain
274 // instructions from multiple stages/iterations.
275 int EpilogStage = LastStage + 1;
276 for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
277 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
278 EpilogBBs.push_back(NewBB);
279 MF.insert(BB->getIterator(), NewBB);
281 PredBB->replaceSuccessor(LoopExitBB, NewBB);
282 NewBB->addSuccessor(LoopExitBB);
284 if (EpilogStart == LoopExitBB)
285 EpilogStart = NewBB;
287 // Add instructions to the epilog depending on the current block.
288 // Process instructions in original program order.
289 for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
290 for (auto &BBI : *BB) {
291 if (BBI.isPHI())
292 continue;
293 MachineInstr *In = &BBI;
294 if ((unsigned)Schedule.getStage(In) == StageNum) {
295 // Instructions with memoperands in the epilog are updated with
296 // conservative values.
297 MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
298 updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
299 NewBB->push_back(NewMI);
300 InstrMap[NewMI] = In;
304 generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
305 InstrMap, LastStage, EpilogStage, i == 1);
306 generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, InstrMap,
307 LastStage, EpilogStage, i == 1);
308 PredBB = NewBB;
310 LLVM_DEBUG({
311 dbgs() << "epilog:\n";
312 NewBB->dump();
316 // Fix any Phi nodes in the loop exit block.
317 LoopExitBB->replacePhiUsesWith(BB, PredBB);
319 // Create a branch to the new epilog from the kernel.
320 // Remove the original branch and add a new branch to the epilog.
321 TII->removeBranch(*KernelBB);
322 TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
323 // Add a branch to the loop exit.
324 if (EpilogBBs.size() > 0) {
325 MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
326 SmallVector<MachineOperand, 4> Cond1;
327 TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
331 /// Replace all uses of FromReg that appear outside the specified
332 /// basic block with ToReg.
333 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
334 MachineBasicBlock *MBB,
335 MachineRegisterInfo &MRI,
336 LiveIntervals &LIS) {
337 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
338 E = MRI.use_end();
339 I != E;) {
340 MachineOperand &O = *I;
341 ++I;
342 if (O.getParent()->getParent() != MBB)
343 O.setReg(ToReg);
345 if (!LIS.hasInterval(ToReg))
346 LIS.createEmptyInterval(ToReg);
349 /// Return true if the register has a use that occurs outside the
350 /// specified loop.
351 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
352 MachineRegisterInfo &MRI) {
353 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
354 E = MRI.use_end();
355 I != E; ++I)
356 if (I->getParent()->getParent() != BB)
357 return true;
358 return false;
361 /// Generate Phis for the specific block in the generated pipelined code.
362 /// This function looks at the Phis from the original code to guide the
363 /// creation of new Phis.
364 void ModuloScheduleExpander::generateExistingPhis(
365 MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
366 MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
367 unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
368 // Compute the stage number for the initial value of the Phi, which
369 // comes from the prolog. The prolog to use depends on to which kernel/
370 // epilog that we're adding the Phi.
371 unsigned PrologStage = 0;
372 unsigned PrevStage = 0;
373 bool InKernel = (LastStageNum == CurStageNum);
374 if (InKernel) {
375 PrologStage = LastStageNum - 1;
376 PrevStage = CurStageNum;
377 } else {
378 PrologStage = LastStageNum - (CurStageNum - LastStageNum);
379 PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
382 for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
383 BBE = BB->getFirstNonPHI();
384 BBI != BBE; ++BBI) {
385 Register Def = BBI->getOperand(0).getReg();
387 unsigned InitVal = 0;
388 unsigned LoopVal = 0;
389 getPhiRegs(*BBI, BB, InitVal, LoopVal);
391 unsigned PhiOp1 = 0;
392 // The Phi value from the loop body typically is defined in the loop, but
393 // not always. So, we need to check if the value is defined in the loop.
394 unsigned PhiOp2 = LoopVal;
395 if (VRMap[LastStageNum].count(LoopVal))
396 PhiOp2 = VRMap[LastStageNum][LoopVal];
398 int StageScheduled = Schedule.getStage(&*BBI);
399 int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
400 unsigned NumStages = getStagesForReg(Def, CurStageNum);
401 if (NumStages == 0) {
402 // We don't need to generate a Phi anymore, but we need to rename any uses
403 // of the Phi value.
404 unsigned NewReg = VRMap[PrevStage][LoopVal];
405 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
406 InitVal, NewReg);
407 if (VRMap[CurStageNum].count(LoopVal))
408 VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
410 // Adjust the number of Phis needed depending on the number of prologs left,
411 // and the distance from where the Phi is first scheduled. The number of
412 // Phis cannot exceed the number of prolog stages. Each stage can
413 // potentially define two values.
414 unsigned MaxPhis = PrologStage + 2;
415 if (!InKernel && (int)PrologStage <= LoopValStage)
416 MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
417 unsigned NumPhis = std::min(NumStages, MaxPhis);
419 unsigned NewReg = 0;
420 unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
421 // In the epilog, we may need to look back one stage to get the correct
422 // Phi name because the epilog and prolog blocks execute the same stage.
423 // The correct name is from the previous block only when the Phi has
424 // been completely scheduled prior to the epilog, and Phi value is not
425 // needed in multiple stages.
426 int StageDiff = 0;
427 if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
428 NumPhis == 1)
429 StageDiff = 1;
430 // Adjust the computations below when the phi and the loop definition
431 // are scheduled in different stages.
432 if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
433 StageDiff = StageScheduled - LoopValStage;
434 for (unsigned np = 0; np < NumPhis; ++np) {
435 // If the Phi hasn't been scheduled, then use the initial Phi operand
436 // value. Otherwise, use the scheduled version of the instruction. This
437 // is a little complicated when a Phi references another Phi.
438 if (np > PrologStage || StageScheduled >= (int)LastStageNum)
439 PhiOp1 = InitVal;
440 // Check if the Phi has already been scheduled in a prolog stage.
441 else if (PrologStage >= AccessStage + StageDiff + np &&
442 VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
443 PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
444 // Check if the Phi has already been scheduled, but the loop instruction
445 // is either another Phi, or doesn't occur in the loop.
446 else if (PrologStage >= AccessStage + StageDiff + np) {
447 // If the Phi references another Phi, we need to examine the other
448 // Phi to get the correct value.
449 PhiOp1 = LoopVal;
450 MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
451 int Indirects = 1;
452 while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
453 int PhiStage = Schedule.getStage(InstOp1);
454 if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
455 PhiOp1 = getInitPhiReg(*InstOp1, BB);
456 else
457 PhiOp1 = getLoopPhiReg(*InstOp1, BB);
458 InstOp1 = MRI.getVRegDef(PhiOp1);
459 int PhiOpStage = Schedule.getStage(InstOp1);
460 int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
461 if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
462 VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
463 PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
464 break;
466 ++Indirects;
468 } else
469 PhiOp1 = InitVal;
470 // If this references a generated Phi in the kernel, get the Phi operand
471 // from the incoming block.
472 if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
473 if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
474 PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
476 MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
477 bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
478 // In the epilog, a map lookup is needed to get the value from the kernel,
479 // or previous epilog block. How is does this depends on if the
480 // instruction is scheduled in the previous block.
481 if (!InKernel) {
482 int StageDiffAdj = 0;
483 if (LoopValStage != -1 && StageScheduled > LoopValStage)
484 StageDiffAdj = StageScheduled - LoopValStage;
485 // Use the loop value defined in the kernel, unless the kernel
486 // contains the last definition of the Phi.
487 if (np == 0 && PrevStage == LastStageNum &&
488 (StageScheduled != 0 || LoopValStage != 0) &&
489 VRMap[PrevStage - StageDiffAdj].count(LoopVal))
490 PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
491 // Use the value defined by the Phi. We add one because we switch
492 // from looking at the loop value to the Phi definition.
493 else if (np > 0 && PrevStage == LastStageNum &&
494 VRMap[PrevStage - np + 1].count(Def))
495 PhiOp2 = VRMap[PrevStage - np + 1][Def];
496 // Use the loop value defined in the kernel.
497 else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
498 VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
499 PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
500 // Use the value defined by the Phi, unless we're generating the first
501 // epilog and the Phi refers to a Phi in a different stage.
502 else if (VRMap[PrevStage - np].count(Def) &&
503 (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
504 (LoopValStage == StageScheduled)))
505 PhiOp2 = VRMap[PrevStage - np][Def];
508 // Check if we can reuse an existing Phi. This occurs when a Phi
509 // references another Phi, and the other Phi is scheduled in an
510 // earlier stage. We can try to reuse an existing Phi up until the last
511 // stage of the current Phi.
512 if (LoopDefIsPhi) {
513 if (static_cast<int>(PrologStage - np) >= StageScheduled) {
514 int LVNumStages = getStagesForPhi(LoopVal);
515 int StageDiff = (StageScheduled - LoopValStage);
516 LVNumStages -= StageDiff;
517 // Make sure the loop value Phi has been processed already.
518 if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
519 NewReg = PhiOp2;
520 unsigned ReuseStage = CurStageNum;
521 if (isLoopCarried(*PhiInst))
522 ReuseStage -= LVNumStages;
523 // Check if the Phi to reuse has been generated yet. If not, then
524 // there is nothing to reuse.
525 if (VRMap[ReuseStage - np].count(LoopVal)) {
526 NewReg = VRMap[ReuseStage - np][LoopVal];
528 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
529 Def, NewReg);
530 // Update the map with the new Phi name.
531 VRMap[CurStageNum - np][Def] = NewReg;
532 PhiOp2 = NewReg;
533 if (VRMap[LastStageNum - np - 1].count(LoopVal))
534 PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
536 if (IsLast && np == NumPhis - 1)
537 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
538 continue;
542 if (InKernel && StageDiff > 0 &&
543 VRMap[CurStageNum - StageDiff - np].count(LoopVal))
544 PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
547 const TargetRegisterClass *RC = MRI.getRegClass(Def);
548 NewReg = MRI.createVirtualRegister(RC);
550 MachineInstrBuilder NewPhi =
551 BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
552 TII->get(TargetOpcode::PHI), NewReg);
553 NewPhi.addReg(PhiOp1).addMBB(BB1);
554 NewPhi.addReg(PhiOp2).addMBB(BB2);
555 if (np == 0)
556 InstrMap[NewPhi] = &*BBI;
558 // We define the Phis after creating the new pipelined code, so
559 // we need to rename the Phi values in scheduled instructions.
561 unsigned PrevReg = 0;
562 if (InKernel && VRMap[PrevStage - np].count(LoopVal))
563 PrevReg = VRMap[PrevStage - np][LoopVal];
564 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
565 NewReg, PrevReg);
566 // If the Phi has been scheduled, use the new name for rewriting.
567 if (VRMap[CurStageNum - np].count(Def)) {
568 unsigned R = VRMap[CurStageNum - np][Def];
569 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
570 NewReg);
573 // Check if we need to rename any uses that occurs after the loop. The
574 // register to replace depends on whether the Phi is scheduled in the
575 // epilog.
576 if (IsLast && np == NumPhis - 1)
577 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
579 // In the kernel, a dependent Phi uses the value from this Phi.
580 if (InKernel)
581 PhiOp2 = NewReg;
583 // Update the map with the new Phi name.
584 VRMap[CurStageNum - np][Def] = NewReg;
587 while (NumPhis++ < NumStages) {
588 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
589 NewReg, 0);
592 // Check if we need to rename a Phi that has been eliminated due to
593 // scheduling.
594 if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
595 replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
599 /// Generate Phis for the specified block in the generated pipelined code.
600 /// These are new Phis needed because the definition is scheduled after the
601 /// use in the pipelined sequence.
602 void ModuloScheduleExpander::generatePhis(
603 MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
604 MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
605 unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
606 // Compute the stage number that contains the initial Phi value, and
607 // the Phi from the previous stage.
608 unsigned PrologStage = 0;
609 unsigned PrevStage = 0;
610 unsigned StageDiff = CurStageNum - LastStageNum;
611 bool InKernel = (StageDiff == 0);
612 if (InKernel) {
613 PrologStage = LastStageNum - 1;
614 PrevStage = CurStageNum;
615 } else {
616 PrologStage = LastStageNum - StageDiff;
617 PrevStage = LastStageNum + StageDiff - 1;
620 for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
621 BBE = BB->instr_end();
622 BBI != BBE; ++BBI) {
623 for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
624 MachineOperand &MO = BBI->getOperand(i);
625 if (!MO.isReg() || !MO.isDef() ||
626 !Register::isVirtualRegister(MO.getReg()))
627 continue;
629 int StageScheduled = Schedule.getStage(&*BBI);
630 assert(StageScheduled != -1 && "Expecting scheduled instruction.");
631 Register Def = MO.getReg();
632 unsigned NumPhis = getStagesForReg(Def, CurStageNum);
633 // An instruction scheduled in stage 0 and is used after the loop
634 // requires a phi in the epilog for the last definition from either
635 // the kernel or prolog.
636 if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
637 hasUseAfterLoop(Def, BB, MRI))
638 NumPhis = 1;
639 if (!InKernel && (unsigned)StageScheduled > PrologStage)
640 continue;
642 unsigned PhiOp2 = VRMap[PrevStage][Def];
643 if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
644 if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
645 PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
646 // The number of Phis can't exceed the number of prolog stages. The
647 // prolog stage number is zero based.
648 if (NumPhis > PrologStage + 1 - StageScheduled)
649 NumPhis = PrologStage + 1 - StageScheduled;
650 for (unsigned np = 0; np < NumPhis; ++np) {
651 unsigned PhiOp1 = VRMap[PrologStage][Def];
652 if (np <= PrologStage)
653 PhiOp1 = VRMap[PrologStage - np][Def];
654 if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
655 if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
656 PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
657 if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
658 PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
660 if (!InKernel)
661 PhiOp2 = VRMap[PrevStage - np][Def];
663 const TargetRegisterClass *RC = MRI.getRegClass(Def);
664 Register NewReg = MRI.createVirtualRegister(RC);
666 MachineInstrBuilder NewPhi =
667 BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
668 TII->get(TargetOpcode::PHI), NewReg);
669 NewPhi.addReg(PhiOp1).addMBB(BB1);
670 NewPhi.addReg(PhiOp2).addMBB(BB2);
671 if (np == 0)
672 InstrMap[NewPhi] = &*BBI;
674 // Rewrite uses and update the map. The actions depend upon whether
675 // we generating code for the kernel or epilog blocks.
676 if (InKernel) {
677 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
678 NewReg);
679 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
680 NewReg);
682 PhiOp2 = NewReg;
683 VRMap[PrevStage - np - 1][Def] = NewReg;
684 } else {
685 VRMap[CurStageNum - np][Def] = NewReg;
686 if (np == NumPhis - 1)
687 rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
688 NewReg);
690 if (IsLast && np == NumPhis - 1)
691 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
697 /// Remove instructions that generate values with no uses.
698 /// Typically, these are induction variable operations that generate values
699 /// used in the loop itself. A dead instruction has a definition with
700 /// no uses, or uses that occur in the original loop only.
701 void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
702 MBBVectorTy &EpilogBBs) {
703 // For each epilog block, check that the value defined by each instruction
704 // is used. If not, delete it.
705 for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
706 MBE = EpilogBBs.rend();
707 MBB != MBE; ++MBB)
708 for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
709 ME = (*MBB)->instr_rend();
710 MI != ME;) {
711 // From DeadMachineInstructionElem. Don't delete inline assembly.
712 if (MI->isInlineAsm()) {
713 ++MI;
714 continue;
716 bool SawStore = false;
717 // Check if it's safe to remove the instruction due to side effects.
718 // We can, and want to, remove Phis here.
719 if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
720 ++MI;
721 continue;
723 bool used = true;
724 for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
725 MOE = MI->operands_end();
726 MOI != MOE; ++MOI) {
727 if (!MOI->isReg() || !MOI->isDef())
728 continue;
729 Register reg = MOI->getReg();
730 // Assume physical registers are used, unless they are marked dead.
731 if (Register::isPhysicalRegister(reg)) {
732 used = !MOI->isDead();
733 if (used)
734 break;
735 continue;
737 unsigned realUses = 0;
738 for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
739 EI = MRI.use_end();
740 UI != EI; ++UI) {
741 // Check if there are any uses that occur only in the original
742 // loop. If so, that's not a real use.
743 if (UI->getParent()->getParent() != BB) {
744 realUses++;
745 used = true;
746 break;
749 if (realUses > 0)
750 break;
751 used = false;
753 if (!used) {
754 LIS.RemoveMachineInstrFromMaps(*MI);
755 MI++->eraseFromParent();
756 continue;
758 ++MI;
760 // In the kernel block, check if we can remove a Phi that generates a value
761 // used in an instruction removed in the epilog block.
762 for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
763 BBE = KernelBB->getFirstNonPHI();
764 BBI != BBE;) {
765 MachineInstr *MI = &*BBI;
766 ++BBI;
767 Register reg = MI->getOperand(0).getReg();
768 if (MRI.use_begin(reg) == MRI.use_end()) {
769 LIS.RemoveMachineInstrFromMaps(*MI);
770 MI->eraseFromParent();
775 /// For loop carried definitions, we split the lifetime of a virtual register
776 /// that has uses past the definition in the next iteration. A copy with a new
777 /// virtual register is inserted before the definition, which helps with
778 /// generating a better register assignment.
780 /// v1 = phi(a, v2) v1 = phi(a, v2)
781 /// v2 = phi(b, v3) v2 = phi(b, v3)
782 /// v3 = .. v4 = copy v1
783 /// .. = V1 v3 = ..
784 /// .. = v4
785 void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
786 MBBVectorTy &EpilogBBs) {
787 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
788 for (auto &PHI : KernelBB->phis()) {
789 Register Def = PHI.getOperand(0).getReg();
790 // Check for any Phi definition that used as an operand of another Phi
791 // in the same block.
792 for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
793 E = MRI.use_instr_end();
794 I != E; ++I) {
795 if (I->isPHI() && I->getParent() == KernelBB) {
796 // Get the loop carried definition.
797 unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
798 if (!LCDef)
799 continue;
800 MachineInstr *MI = MRI.getVRegDef(LCDef);
801 if (!MI || MI->getParent() != KernelBB || MI->isPHI())
802 continue;
803 // Search through the rest of the block looking for uses of the Phi
804 // definition. If one occurs, then split the lifetime.
805 unsigned SplitReg = 0;
806 for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
807 KernelBB->instr_end()))
808 if (BBJ.readsRegister(Def)) {
809 // We split the lifetime when we find the first use.
810 if (SplitReg == 0) {
811 SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
812 BuildMI(*KernelBB, MI, MI->getDebugLoc(),
813 TII->get(TargetOpcode::COPY), SplitReg)
814 .addReg(Def);
816 BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
818 if (!SplitReg)
819 continue;
820 // Search through each of the epilog blocks for any uses to be renamed.
821 for (auto &Epilog : EpilogBBs)
822 for (auto &I : *Epilog)
823 if (I.readsRegister(Def))
824 I.substituteRegister(Def, SplitReg, 0, *TRI);
825 break;
831 /// Remove the incoming block from the Phis in a basic block.
832 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
833 for (MachineInstr &MI : *BB) {
834 if (!MI.isPHI())
835 break;
836 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
837 if (MI.getOperand(i + 1).getMBB() == Incoming) {
838 MI.RemoveOperand(i + 1);
839 MI.RemoveOperand(i);
840 break;
845 /// Create branches from each prolog basic block to the appropriate epilog
846 /// block. These edges are needed if the loop ends before reaching the
847 /// kernel.
848 void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
849 MBBVectorTy &PrologBBs,
850 MachineBasicBlock *KernelBB,
851 MBBVectorTy &EpilogBBs,
852 ValueMapTy *VRMap) {
853 assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
854 MachineBasicBlock *LastPro = KernelBB;
855 MachineBasicBlock *LastEpi = KernelBB;
857 // Start from the blocks connected to the kernel and work "out"
858 // to the first prolog and the last epilog blocks.
859 SmallVector<MachineInstr *, 4> PrevInsts;
860 unsigned MaxIter = PrologBBs.size() - 1;
861 for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
862 // Add branches to the prolog that go to the corresponding
863 // epilog, and the fall-thru prolog/kernel block.
864 MachineBasicBlock *Prolog = PrologBBs[j];
865 MachineBasicBlock *Epilog = EpilogBBs[i];
867 SmallVector<MachineOperand, 4> Cond;
868 Optional<bool> StaticallyGreater =
869 LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
870 unsigned numAdded = 0;
871 if (!StaticallyGreater.hasValue()) {
872 Prolog->addSuccessor(Epilog);
873 numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
874 } else if (*StaticallyGreater == false) {
875 Prolog->addSuccessor(Epilog);
876 Prolog->removeSuccessor(LastPro);
877 LastEpi->removeSuccessor(Epilog);
878 numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
879 removePhis(Epilog, LastEpi);
880 // Remove the blocks that are no longer referenced.
881 if (LastPro != LastEpi) {
882 LastEpi->clear();
883 LastEpi->eraseFromParent();
885 if (LastPro == KernelBB) {
886 LoopInfo->disposed();
887 NewKernel = nullptr;
889 LastPro->clear();
890 LastPro->eraseFromParent();
891 } else {
892 numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
893 removePhis(Epilog, Prolog);
895 LastPro = Prolog;
896 LastEpi = Epilog;
897 for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
898 E = Prolog->instr_rend();
899 I != E && numAdded > 0; ++I, --numAdded)
900 updateInstruction(&*I, false, j, 0, VRMap);
903 if (NewKernel) {
904 LoopInfo->setPreheader(PrologBBs[MaxIter]);
905 LoopInfo->adjustTripCount(-(MaxIter + 1));
909 /// Return true if we can compute the amount the instruction changes
910 /// during each iteration. Set Delta to the amount of the change.
911 bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
912 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
913 const MachineOperand *BaseOp;
914 int64_t Offset;
915 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
916 return false;
918 if (!BaseOp->isReg())
919 return false;
921 Register BaseReg = BaseOp->getReg();
923 MachineRegisterInfo &MRI = MF.getRegInfo();
924 // Check if there is a Phi. If so, get the definition in the loop.
925 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
926 if (BaseDef && BaseDef->isPHI()) {
927 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
928 BaseDef = MRI.getVRegDef(BaseReg);
930 if (!BaseDef)
931 return false;
933 int D = 0;
934 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
935 return false;
937 Delta = D;
938 return true;
941 /// Update the memory operand with a new offset when the pipeliner
942 /// generates a new copy of the instruction that refers to a
943 /// different memory location.
944 void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
945 MachineInstr &OldMI,
946 unsigned Num) {
947 if (Num == 0)
948 return;
949 // If the instruction has memory operands, then adjust the offset
950 // when the instruction appears in different stages.
951 if (NewMI.memoperands_empty())
952 return;
953 SmallVector<MachineMemOperand *, 2> NewMMOs;
954 for (MachineMemOperand *MMO : NewMI.memoperands()) {
955 // TODO: Figure out whether isAtomic is really necessary (see D57601).
956 if (MMO->isVolatile() || MMO->isAtomic() ||
957 (MMO->isInvariant() && MMO->isDereferenceable()) ||
958 (!MMO->getValue())) {
959 NewMMOs.push_back(MMO);
960 continue;
962 unsigned Delta;
963 if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
964 int64_t AdjOffset = Delta * Num;
965 NewMMOs.push_back(
966 MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
967 } else {
968 NewMMOs.push_back(
969 MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
972 NewMI.setMemRefs(MF, NewMMOs);
975 /// Clone the instruction for the new pipelined loop and update the
976 /// memory operands, if needed.
977 MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
978 unsigned CurStageNum,
979 unsigned InstStageNum) {
980 MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
981 // Check for tied operands in inline asm instructions. This should be handled
982 // elsewhere, but I'm not sure of the best solution.
983 if (OldMI->isInlineAsm())
984 for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
985 const auto &MO = OldMI->getOperand(i);
986 if (MO.isReg() && MO.isUse())
987 break;
988 unsigned UseIdx;
989 if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
990 NewMI->tieOperands(i, UseIdx);
992 updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
993 return NewMI;
996 /// Clone the instruction for the new pipelined loop. If needed, this
997 /// function updates the instruction using the values saved in the
998 /// InstrChanges structure.
999 MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
1000 MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
1001 MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
1002 auto It = InstrChanges.find(OldMI);
1003 if (It != InstrChanges.end()) {
1004 std::pair<unsigned, int64_t> RegAndOffset = It->second;
1005 unsigned BasePos, OffsetPos;
1006 if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
1007 return nullptr;
1008 int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
1009 MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
1010 if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
1011 NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
1012 NewMI->getOperand(OffsetPos).setImm(NewOffset);
1014 updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
1015 return NewMI;
1018 /// Update the machine instruction with new virtual registers. This
1019 /// function may change the defintions and/or uses.
1020 void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
1021 bool LastDef,
1022 unsigned CurStageNum,
1023 unsigned InstrStageNum,
1024 ValueMapTy *VRMap) {
1025 for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
1026 MachineOperand &MO = NewMI->getOperand(i);
1027 if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
1028 continue;
1029 Register reg = MO.getReg();
1030 if (MO.isDef()) {
1031 // Create a new virtual register for the definition.
1032 const TargetRegisterClass *RC = MRI.getRegClass(reg);
1033 Register NewReg = MRI.createVirtualRegister(RC);
1034 MO.setReg(NewReg);
1035 VRMap[CurStageNum][reg] = NewReg;
1036 if (LastDef)
1037 replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
1038 } else if (MO.isUse()) {
1039 MachineInstr *Def = MRI.getVRegDef(reg);
1040 // Compute the stage that contains the last definition for instruction.
1041 int DefStageNum = Schedule.getStage(Def);
1042 unsigned StageNum = CurStageNum;
1043 if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
1044 // Compute the difference in stages between the defintion and the use.
1045 unsigned StageDiff = (InstrStageNum - DefStageNum);
1046 // Make an adjustment to get the last definition.
1047 StageNum -= StageDiff;
1049 if (VRMap[StageNum].count(reg))
1050 MO.setReg(VRMap[StageNum][reg]);
1055 /// Return the instruction in the loop that defines the register.
1056 /// If the definition is a Phi, then follow the Phi operand to
1057 /// the instruction in the loop.
1058 MachineInstr *ModuloScheduleExpander::findDefInLoop(unsigned Reg) {
1059 SmallPtrSet<MachineInstr *, 8> Visited;
1060 MachineInstr *Def = MRI.getVRegDef(Reg);
1061 while (Def->isPHI()) {
1062 if (!Visited.insert(Def).second)
1063 break;
1064 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
1065 if (Def->getOperand(i + 1).getMBB() == BB) {
1066 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
1067 break;
1070 return Def;
1073 /// Return the new name for the value from the previous stage.
1074 unsigned ModuloScheduleExpander::getPrevMapVal(
1075 unsigned StageNum, unsigned PhiStage, unsigned LoopVal, unsigned LoopStage,
1076 ValueMapTy *VRMap, MachineBasicBlock *BB) {
1077 unsigned PrevVal = 0;
1078 if (StageNum > PhiStage) {
1079 MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
1080 if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
1081 // The name is defined in the previous stage.
1082 PrevVal = VRMap[StageNum - 1][LoopVal];
1083 else if (VRMap[StageNum].count(LoopVal))
1084 // The previous name is defined in the current stage when the instruction
1085 // order is swapped.
1086 PrevVal = VRMap[StageNum][LoopVal];
1087 else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
1088 // The loop value hasn't yet been scheduled.
1089 PrevVal = LoopVal;
1090 else if (StageNum == PhiStage + 1)
1091 // The loop value is another phi, which has not been scheduled.
1092 PrevVal = getInitPhiReg(*LoopInst, BB);
1093 else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
1094 // The loop value is another phi, which has been scheduled.
1095 PrevVal =
1096 getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
1097 LoopStage, VRMap, BB);
1099 return PrevVal;
1102 /// Rewrite the Phi values in the specified block to use the mappings
1103 /// from the initial operand. Once the Phi is scheduled, we switch
1104 /// to using the loop value instead of the Phi value, so those names
1105 /// do not need to be rewritten.
1106 void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
1107 unsigned StageNum,
1108 ValueMapTy *VRMap,
1109 InstrMapTy &InstrMap) {
1110 for (auto &PHI : BB->phis()) {
1111 unsigned InitVal = 0;
1112 unsigned LoopVal = 0;
1113 getPhiRegs(PHI, BB, InitVal, LoopVal);
1114 Register PhiDef = PHI.getOperand(0).getReg();
1116 unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef));
1117 unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal));
1118 unsigned NumPhis = getStagesForPhi(PhiDef);
1119 if (NumPhis > StageNum)
1120 NumPhis = StageNum;
1121 for (unsigned np = 0; np <= NumPhis; ++np) {
1122 unsigned NewVal =
1123 getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
1124 if (!NewVal)
1125 NewVal = InitVal;
1126 rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef,
1127 NewVal);
1132 /// Rewrite a previously scheduled instruction to use the register value
1133 /// from the new instruction. Make sure the instruction occurs in the
1134 /// basic block, and we don't change the uses in the new instruction.
1135 void ModuloScheduleExpander::rewriteScheduledInstr(
1136 MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
1137 unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, unsigned NewReg,
1138 unsigned PrevReg) {
1139 bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
1140 int StagePhi = Schedule.getStage(Phi) + PhiNum;
1141 // Rewrite uses that have been scheduled already to use the new
1142 // Phi register.
1143 for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
1144 EI = MRI.use_end();
1145 UI != EI;) {
1146 MachineOperand &UseOp = *UI;
1147 MachineInstr *UseMI = UseOp.getParent();
1148 ++UI;
1149 if (UseMI->getParent() != BB)
1150 continue;
1151 if (UseMI->isPHI()) {
1152 if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
1153 continue;
1154 if (getLoopPhiReg(*UseMI, BB) != OldReg)
1155 continue;
1157 InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
1158 assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
1159 MachineInstr *OrigMI = OrigInstr->second;
1160 int StageSched = Schedule.getStage(OrigMI);
1161 int CycleSched = Schedule.getCycle(OrigMI);
1162 unsigned ReplaceReg = 0;
1163 // This is the stage for the scheduled instruction.
1164 if (StagePhi == StageSched && Phi->isPHI()) {
1165 int CyclePhi = Schedule.getCycle(Phi);
1166 if (PrevReg && InProlog)
1167 ReplaceReg = PrevReg;
1168 else if (PrevReg && !isLoopCarried(*Phi) &&
1169 (CyclePhi <= CycleSched || OrigMI->isPHI()))
1170 ReplaceReg = PrevReg;
1171 else
1172 ReplaceReg = NewReg;
1174 // The scheduled instruction occurs before the scheduled Phi, and the
1175 // Phi is not loop carried.
1176 if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
1177 ReplaceReg = NewReg;
1178 if (StagePhi > StageSched && Phi->isPHI())
1179 ReplaceReg = NewReg;
1180 if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
1181 ReplaceReg = NewReg;
1182 if (ReplaceReg) {
1183 MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
1184 UseOp.setReg(ReplaceReg);
1189 bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
1190 if (!Phi.isPHI())
1191 return false;
1192 unsigned DefCycle = Schedule.getCycle(&Phi);
1193 int DefStage = Schedule.getStage(&Phi);
1195 unsigned InitVal = 0;
1196 unsigned LoopVal = 0;
1197 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
1198 MachineInstr *Use = MRI.getVRegDef(LoopVal);
1199 if (!Use || Use->isPHI())
1200 return true;
1201 unsigned LoopCycle = Schedule.getCycle(Use);
1202 int LoopStage = Schedule.getStage(Use);
1203 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
1206 //===----------------------------------------------------------------------===//
1207 // PeelingModuloScheduleExpander implementation
1208 //===----------------------------------------------------------------------===//
1209 // This is a reimplementation of ModuloScheduleExpander that works by creating
1210 // a fully correct steady-state kernel and peeling off the prolog and epilogs.
1211 //===----------------------------------------------------------------------===//
1213 namespace {
1214 // Remove any dead phis in MBB. Dead phis either have only one block as input
1215 // (in which case they are the identity) or have no uses.
1216 void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
1217 LiveIntervals *LIS) {
1218 bool Changed = true;
1219 while (Changed) {
1220 Changed = false;
1221 for (auto I = MBB->begin(); I != MBB->getFirstNonPHI();) {
1222 MachineInstr &MI = *I++;
1223 assert(MI.isPHI());
1224 if (MRI.use_empty(MI.getOperand(0).getReg())) {
1225 if (LIS)
1226 LIS->RemoveMachineInstrFromMaps(MI);
1227 MI.eraseFromParent();
1228 Changed = true;
1229 } else if (MI.getNumExplicitOperands() == 3) {
1230 MRI.constrainRegClass(MI.getOperand(1).getReg(),
1231 MRI.getRegClass(MI.getOperand(0).getReg()));
1232 MRI.replaceRegWith(MI.getOperand(0).getReg(),
1233 MI.getOperand(1).getReg());
1234 if (LIS)
1235 LIS->RemoveMachineInstrFromMaps(MI);
1236 MI.eraseFromParent();
1237 Changed = true;
1243 /// Rewrites the kernel block in-place to adhere to the given schedule.
1244 /// KernelRewriter holds all of the state required to perform the rewriting.
1245 class KernelRewriter {
1246 ModuloSchedule &S;
1247 MachineBasicBlock *BB;
1248 MachineBasicBlock *PreheaderBB, *ExitBB;
1249 MachineRegisterInfo &MRI;
1250 const TargetInstrInfo *TII;
1251 LiveIntervals *LIS;
1253 // Map from register class to canonical undef register for that class.
1254 DenseMap<const TargetRegisterClass *, Register> Undefs;
1255 // Map from <LoopReg, InitReg> to phi register for all created phis. Note that
1256 // this map is only used when InitReg is non-undef.
1257 DenseMap<std::pair<unsigned, unsigned>, Register> Phis;
1258 // Map from LoopReg to phi register where the InitReg is undef.
1259 DenseMap<Register, Register> UndefPhis;
1261 // Reg is used by MI. Return the new register MI should use to adhere to the
1262 // schedule. Insert phis as necessary.
1263 Register remapUse(Register Reg, MachineInstr &MI);
1264 // Insert a phi that carries LoopReg from the loop body and InitReg otherwise.
1265 // If InitReg is not given it is chosen arbitrarily. It will either be undef
1266 // or will be chosen so as to share another phi.
1267 Register phi(Register LoopReg, Optional<Register> InitReg = {},
1268 const TargetRegisterClass *RC = nullptr);
1269 // Create an undef register of the given register class.
1270 Register undef(const TargetRegisterClass *RC);
1272 public:
1273 KernelRewriter(MachineLoop &L, ModuloSchedule &S,
1274 LiveIntervals *LIS = nullptr);
1275 void rewrite();
1277 } // namespace
1279 KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
1280 LiveIntervals *LIS)
1281 : S(S), BB(L.getTopBlock()), PreheaderBB(L.getLoopPreheader()),
1282 ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
1283 TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
1284 PreheaderBB = *BB->pred_begin();
1285 if (PreheaderBB == BB)
1286 PreheaderBB = *std::next(BB->pred_begin());
1289 void KernelRewriter::rewrite() {
1290 // Rearrange the loop to be in schedule order. Note that the schedule may
1291 // contain instructions that are not owned by the loop block (InstrChanges and
1292 // friends), so we gracefully handle unowned instructions and delete any
1293 // instructions that weren't in the schedule.
1294 auto InsertPt = BB->getFirstTerminator();
1295 MachineInstr *FirstMI = nullptr;
1296 for (MachineInstr *MI : S.getInstructions()) {
1297 if (MI->isPHI())
1298 continue;
1299 if (MI->getParent())
1300 MI->removeFromParent();
1301 BB->insert(InsertPt, MI);
1302 if (!FirstMI)
1303 FirstMI = MI;
1305 assert(FirstMI && "Failed to find first MI in schedule");
1307 // At this point all of the scheduled instructions are between FirstMI
1308 // and the end of the block. Kill from the first non-phi to FirstMI.
1309 for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
1310 if (LIS)
1311 LIS->RemoveMachineInstrFromMaps(*I);
1312 (I++)->eraseFromParent();
1315 // Now remap every instruction in the loop.
1316 for (MachineInstr &MI : *BB) {
1317 if (MI.isPHI() || MI.isTerminator())
1318 continue;
1319 for (MachineOperand &MO : MI.uses()) {
1320 if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
1321 continue;
1322 Register Reg = remapUse(MO.getReg(), MI);
1323 MO.setReg(Reg);
1326 EliminateDeadPhis(BB, MRI, LIS);
1328 // Ensure a phi exists for all instructions that are either referenced by
1329 // an illegal phi or by an instruction outside the loop. This allows us to
1330 // treat remaps of these values the same as "normal" values that come from
1331 // loop-carried phis.
1332 for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
1333 if (MI->isPHI()) {
1334 Register R = MI->getOperand(0).getReg();
1335 phi(R);
1336 continue;
1339 for (MachineOperand &Def : MI->defs()) {
1340 for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) {
1341 if (MI.getParent() != BB) {
1342 phi(Def.getReg());
1343 break;
1350 Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
1351 MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
1352 if (!Producer)
1353 return Reg;
1355 int ConsumerStage = S.getStage(&MI);
1356 if (!Producer->isPHI()) {
1357 // Non-phi producers are simple to remap. Insert as many phis as the
1358 // difference between the consumer and producer stages.
1359 if (Producer->getParent() != BB)
1360 // Producer was not inside the loop. Use the register as-is.
1361 return Reg;
1362 int ProducerStage = S.getStage(Producer);
1363 assert(ConsumerStage != -1 &&
1364 "In-loop consumer should always be scheduled!");
1365 assert(ConsumerStage >= ProducerStage);
1366 unsigned StageDiff = ConsumerStage - ProducerStage;
1368 for (unsigned I = 0; I < StageDiff; ++I)
1369 Reg = phi(Reg);
1370 return Reg;
1373 // First, dive through the phi chain to find the defaults for the generated
1374 // phis.
1375 SmallVector<Optional<Register>, 4> Defaults;
1376 Register LoopReg = Reg;
1377 auto LoopProducer = Producer;
1378 while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
1379 LoopReg = getLoopPhiReg(*LoopProducer, BB);
1380 Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB));
1381 LoopProducer = MRI.getUniqueVRegDef(LoopReg);
1382 assert(LoopProducer);
1384 int LoopProducerStage = S.getStage(LoopProducer);
1386 Optional<Register> IllegalPhiDefault;
1388 if (LoopProducerStage == -1) {
1389 // Do nothing.
1390 } else if (LoopProducerStage > ConsumerStage) {
1391 // This schedule is only representable if ProducerStage == ConsumerStage+1.
1392 // In addition, Consumer's cycle must be scheduled after Producer in the
1393 // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
1394 // functions.
1395 #ifndef NDEBUG // Silence unused variables in non-asserts mode.
1396 int LoopProducerCycle = S.getCycle(LoopProducer);
1397 int ConsumerCycle = S.getCycle(&MI);
1398 #endif
1399 assert(LoopProducerCycle <= ConsumerCycle);
1400 assert(LoopProducerStage == ConsumerStage + 1);
1401 // Peel off the first phi from Defaults and insert a phi between producer
1402 // and consumer. This phi will not be at the front of the block so we
1403 // consider it illegal. It will only exist during the rewrite process; it
1404 // needs to exist while we peel off prologs because these could take the
1405 // default value. After that we can replace all uses with the loop producer
1406 // value.
1407 IllegalPhiDefault = Defaults.front();
1408 Defaults.erase(Defaults.begin());
1409 } else {
1410 assert(ConsumerStage >= LoopProducerStage);
1411 int StageDiff = ConsumerStage - LoopProducerStage;
1412 if (StageDiff > 0) {
1413 LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
1414 << " to " << (Defaults.size() + StageDiff) << "\n");
1415 // If we need more phis than we have defaults for, pad out with undefs for
1416 // the earliest phis, which are at the end of the defaults chain (the
1417 // chain is in reverse order).
1418 Defaults.resize(Defaults.size() + StageDiff, Defaults.empty()
1419 ? Optional<Register>()
1420 : Defaults.back());
1424 // Now we know the number of stages to jump back, insert the phi chain.
1425 auto DefaultI = Defaults.rbegin();
1426 while (DefaultI != Defaults.rend())
1427 LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg));
1429 if (IllegalPhiDefault.hasValue()) {
1430 // The consumer optionally consumes LoopProducer in the same iteration
1431 // (because the producer is scheduled at an earlier cycle than the consumer)
1432 // or the initial value. To facilitate this we create an illegal block here
1433 // by embedding a phi in the middle of the block. We will fix this up
1434 // immediately prior to pruning.
1435 auto RC = MRI.getRegClass(Reg);
1436 Register R = MRI.createVirtualRegister(RC);
1437 BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
1438 .addReg(IllegalPhiDefault.getValue())
1439 .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
1440 .addReg(LoopReg)
1441 .addMBB(BB); // Block choice is arbitrary and has no effect.
1442 return R;
1445 return LoopReg;
1448 Register KernelRewriter::phi(Register LoopReg, Optional<Register> InitReg,
1449 const TargetRegisterClass *RC) {
1450 // If the init register is not undef, try and find an existing phi.
1451 if (InitReg.hasValue()) {
1452 auto I = Phis.find({LoopReg, InitReg.getValue()});
1453 if (I != Phis.end())
1454 return I->second;
1455 } else {
1456 for (auto &KV : Phis) {
1457 if (KV.first.first == LoopReg)
1458 return KV.second;
1462 // InitReg is either undef or no existing phi takes InitReg as input. Try and
1463 // find a phi that takes undef as input.
1464 auto I = UndefPhis.find(LoopReg);
1465 if (I != UndefPhis.end()) {
1466 Register R = I->second;
1467 if (!InitReg.hasValue())
1468 // Found a phi taking undef as input, and this input is undef so return
1469 // without any more changes.
1470 return R;
1471 // Found a phi taking undef as input, so rewrite it to take InitReg.
1472 MachineInstr *MI = MRI.getVRegDef(R);
1473 MI->getOperand(1).setReg(InitReg.getValue());
1474 Phis.insert({{LoopReg, InitReg.getValue()}, R});
1475 MRI.constrainRegClass(R, MRI.getRegClass(InitReg.getValue()));
1476 UndefPhis.erase(I);
1477 return R;
1480 // Failed to find any existing phi to reuse, so create a new one.
1481 if (!RC)
1482 RC = MRI.getRegClass(LoopReg);
1483 Register R = MRI.createVirtualRegister(RC);
1484 if (InitReg.hasValue())
1485 MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
1486 BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R)
1487 .addReg(InitReg.hasValue() ? *InitReg : undef(RC))
1488 .addMBB(PreheaderBB)
1489 .addReg(LoopReg)
1490 .addMBB(BB);
1491 if (!InitReg.hasValue())
1492 UndefPhis[LoopReg] = R;
1493 else
1494 Phis[{LoopReg, *InitReg}] = R;
1495 return R;
1498 Register KernelRewriter::undef(const TargetRegisterClass *RC) {
1499 Register &R = Undefs[RC];
1500 if (R == 0) {
1501 // Create an IMPLICIT_DEF that defines this register if we need it.
1502 // All uses of this should be removed by the time we have finished unrolling
1503 // prologs and epilogs.
1504 R = MRI.createVirtualRegister(RC);
1505 auto *InsertBB = &PreheaderBB->getParent()->front();
1506 BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(),
1507 TII->get(TargetOpcode::IMPLICIT_DEF), R);
1509 return R;
1512 namespace {
1513 /// Describes an operand in the kernel of a pipelined loop. Characteristics of
1514 /// the operand are discovered, such as how many in-loop PHIs it has to jump
1515 /// through and defaults for these phis.
1516 class KernelOperandInfo {
1517 MachineBasicBlock *BB;
1518 MachineRegisterInfo &MRI;
1519 SmallVector<Register, 4> PhiDefaults;
1520 MachineOperand *Source;
1521 MachineOperand *Target;
1523 public:
1524 KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
1525 const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
1526 : MRI(MRI) {
1527 Source = MO;
1528 BB = MO->getParent()->getParent();
1529 while (isRegInLoop(MO)) {
1530 MachineInstr *MI = MRI.getVRegDef(MO->getReg());
1531 if (MI->isFullCopy()) {
1532 MO = &MI->getOperand(1);
1533 continue;
1535 if (!MI->isPHI())
1536 break;
1537 // If this is an illegal phi, don't count it in distance.
1538 if (IllegalPhis.count(MI)) {
1539 MO = &MI->getOperand(3);
1540 continue;
1543 Register Default = getInitPhiReg(*MI, BB);
1544 MO = MI->getOperand(2).getMBB() == BB ? &MI->getOperand(1)
1545 : &MI->getOperand(3);
1546 PhiDefaults.push_back(Default);
1548 Target = MO;
1551 bool operator==(const KernelOperandInfo &Other) const {
1552 return PhiDefaults.size() == Other.PhiDefaults.size();
1555 void print(raw_ostream &OS) const {
1556 OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
1557 << *Source->getParent();
1560 private:
1561 bool isRegInLoop(MachineOperand *MO) {
1562 return MO->isReg() && MO->getReg().isVirtual() &&
1563 MRI.getVRegDef(MO->getReg())->getParent() == BB;
1566 } // namespace
1568 MachineBasicBlock *
1569 PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
1570 MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII);
1571 if (LPD == LPD_Front)
1572 PeeledFront.push_back(NewBB);
1573 else
1574 PeeledBack.push_front(NewBB);
1575 for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
1576 ++I, ++NI) {
1577 CanonicalMIs[&*I] = &*I;
1578 CanonicalMIs[&*NI] = &*I;
1579 BlockMIs[{NewBB, &*I}] = &*NI;
1580 BlockMIs[{BB, &*I}] = &*I;
1582 return NewBB;
1585 void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
1586 BitVector LS(Schedule.getNumStages(), true);
1587 BitVector AS(Schedule.getNumStages(), true);
1588 LiveStages[BB] = LS;
1589 AvailableStages[BB] = AS;
1591 // Peel out the prologs.
1592 LS.reset();
1593 for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
1594 LS[I] = 1;
1595 Prologs.push_back(peelKernel(LPD_Front));
1596 LiveStages[Prologs.back()] = LS;
1597 AvailableStages[Prologs.back()] = LS;
1600 // Create a block that will end up as the new loop exiting block (dominated by
1601 // all prologs and epilogs). It will only contain PHIs, in the same order as
1602 // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
1603 // that the exiting block is a (sub) clone of BB. This in turn gives us the
1604 // property that any value deffed in BB but used outside of BB is used by a
1605 // PHI in the exiting block.
1606 MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
1608 // Push out the epilogs, again in reverse order.
1609 // We can't assume anything about the minumum loop trip count at this point,
1610 // so emit a fairly complex epilog:
1611 // K[0, 1, 2] // Kernel runs stages 0, 1, 2
1612 // E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0].
1613 // E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2.
1615 // This creates a single-successor single-predecessor sequence of blocks for
1616 // each epilog, which are kept this way for simplicity at this stage and
1617 // cleaned up by the optimizer later.
1618 for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
1619 Epilogs.push_back(nullptr);
1620 for (int J = Schedule.getNumStages() - 1; J >= I; --J) {
1621 LS.reset();
1622 LS[J] = 1;
1623 Epilogs.back() = peelKernel(LPD_Back);
1624 LiveStages[Epilogs.back()] = LS;
1625 AvailableStages[Epilogs.back()] = AS;
1629 // Now we've defined all the prolog and epilog blocks as a fallthrough
1630 // sequence, add the edges that will be followed if the loop trip count is
1631 // lower than the number of stages (connecting prologs directly with epilogs).
1632 auto PI = Prologs.begin();
1633 auto EI = Epilogs.begin();
1634 assert(Prologs.size() == Epilogs.size());
1635 for (; PI != Prologs.end(); ++PI, ++EI) {
1636 MachineBasicBlock *Pred = *(*EI)->pred_begin();
1637 (*PI)->addSuccessor(*EI);
1638 for (MachineInstr &MI : (*EI)->phis()) {
1639 Register Reg = MI.getOperand(1).getReg();
1640 MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
1641 if (Use && Use->getParent() == Pred)
1642 Reg = getEquivalentRegisterIn(Reg, *PI);
1643 MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
1644 MI.addOperand(MachineOperand::CreateMBB(*PI));
1648 // Create a list of all blocks in order.
1649 SmallVector<MachineBasicBlock *, 8> Blocks;
1650 llvm::copy(PeeledFront, std::back_inserter(Blocks));
1651 Blocks.push_back(BB);
1652 llvm::copy(PeeledBack, std::back_inserter(Blocks));
1654 // Iterate in reverse order over all instructions, remapping as we go.
1655 for (MachineBasicBlock *B : reverse(Blocks)) {
1656 for (auto I = B->getFirstInstrTerminator()->getReverseIterator();
1657 I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
1658 MachineInstr *MI = &*I++;
1659 rewriteUsesOf(MI);
1662 // Now all remapping has been done, we're free to optimize the generated code.
1663 for (MachineBasicBlock *B : reverse(Blocks))
1664 EliminateDeadPhis(B, MRI, LIS);
1665 EliminateDeadPhis(ExitingBB, MRI, LIS);
1668 MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
1669 MachineFunction &MF = *BB->getParent();
1670 MachineBasicBlock *Exit = *BB->succ_begin();
1671 if (Exit == BB)
1672 Exit = *std::next(BB->succ_begin());
1674 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
1675 MF.insert(std::next(BB->getIterator()), NewBB);
1677 // Clone all phis in BB into NewBB and rewrite.
1678 for (MachineInstr &MI : BB->phis()) {
1679 auto RC = MRI.getRegClass(MI.getOperand(0).getReg());
1680 Register OldR = MI.getOperand(3).getReg();
1681 Register R = MRI.createVirtualRegister(RC);
1682 SmallVector<MachineInstr *, 4> Uses;
1683 for (MachineInstr &Use : MRI.use_instructions(OldR))
1684 if (Use.getParent() != BB)
1685 Uses.push_back(&Use);
1686 for (MachineInstr *Use : Uses)
1687 Use->substituteRegister(OldR, R, /*SubIdx=*/0,
1688 *MRI.getTargetRegisterInfo());
1689 MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R)
1690 .addReg(OldR)
1691 .addMBB(BB);
1692 BlockMIs[{NewBB, &MI}] = NI;
1693 CanonicalMIs[NI] = &MI;
1695 BB->replaceSuccessor(Exit, NewBB);
1696 Exit->replacePhiUsesWith(BB, NewBB);
1697 NewBB->addSuccessor(Exit);
1699 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
1700 SmallVector<MachineOperand, 4> Cond;
1701 bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond);
1702 (void)CanAnalyzeBr;
1703 assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
1704 TII->removeBranch(*BB);
1705 TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB,
1706 Cond, DebugLoc());
1707 TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc());
1708 return NewBB;
1711 Register
1712 PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
1713 MachineBasicBlock *BB) {
1714 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
1715 unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg);
1716 return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg();
1719 void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
1720 if (MI->isPHI()) {
1721 // This is an illegal PHI. The loop-carried (desired) value is operand 3,
1722 // and it is produced by this block.
1723 Register PhiR = MI->getOperand(0).getReg();
1724 Register R = MI->getOperand(3).getReg();
1725 int RMIStage = getStage(MRI.getUniqueVRegDef(R));
1726 if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
1727 R = MI->getOperand(1).getReg();
1728 MRI.setRegClass(R, MRI.getRegClass(PhiR));
1729 MRI.replaceRegWith(PhiR, R);
1730 if (LIS)
1731 LIS->RemoveMachineInstrFromMaps(*MI);
1732 MI->eraseFromParent();
1733 return;
1736 int Stage = getStage(MI);
1737 if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
1738 LiveStages[MI->getParent()].test(Stage))
1739 // Instruction is live, no rewriting to do.
1740 return;
1742 for (MachineOperand &DefMO : MI->defs()) {
1743 SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
1744 for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
1745 // Only PHIs can use values from this block by construction.
1746 // Match with the equivalent PHI in B.
1747 assert(UseMI.isPHI());
1748 Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
1749 MI->getParent());
1750 Subs.emplace_back(&UseMI, Reg);
1752 for (auto &Sub : Subs)
1753 Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
1754 *MRI.getTargetRegisterInfo());
1756 if (LIS)
1757 LIS->RemoveMachineInstrFromMaps(*MI);
1758 MI->eraseFromParent();
1761 void PeelingModuloScheduleExpander::fixupBranches() {
1762 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> Info =
1763 TII->analyzeLoopForPipelining(BB);
1764 assert(Info);
1766 // Work outwards from the kernel.
1767 bool KernelDisposed = false;
1768 int TC = Schedule.getNumStages() - 1;
1769 for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
1770 ++PI, ++EI, --TC) {
1771 MachineBasicBlock *Prolog = *PI;
1772 MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
1773 MachineBasicBlock *Epilog = *EI;
1774 SmallVector<MachineOperand, 4> Cond;
1775 TII->removeBranch(*Prolog);
1776 Optional<bool> StaticallyGreater =
1777 Info->createTripCountGreaterCondition(TC, *Prolog, Cond);
1778 if (!StaticallyGreater.hasValue()) {
1779 LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
1780 // Dynamically branch based on Cond.
1781 TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
1782 } else if (*StaticallyGreater == false) {
1783 LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
1784 // Prolog never falls through; branch to epilog and orphan interior
1785 // blocks. Leave it to unreachable-block-elim to clean up.
1786 Prolog->removeSuccessor(Fallthrough);
1787 for (MachineInstr &P : Fallthrough->phis()) {
1788 P.RemoveOperand(2);
1789 P.RemoveOperand(1);
1791 TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
1792 KernelDisposed = true;
1793 } else {
1794 LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
1795 // Prolog always falls through; remove incoming values in epilog.
1796 Prolog->removeSuccessor(Epilog);
1797 for (MachineInstr &P : Epilog->phis()) {
1798 P.RemoveOperand(4);
1799 P.RemoveOperand(3);
1804 if (!KernelDisposed) {
1805 Info->adjustTripCount(-(Schedule.getNumStages() - 1));
1806 Info->setPreheader(Prologs.back());
1807 } else {
1808 Info->disposed();
1812 void PeelingModuloScheduleExpander::rewriteKernel() {
1813 KernelRewriter KR(*Schedule.getLoop(), Schedule);
1814 KR.rewrite();
1817 void PeelingModuloScheduleExpander::expand() {
1818 BB = Schedule.getLoop()->getTopBlock();
1819 Preheader = Schedule.getLoop()->getLoopPreheader();
1820 LLVM_DEBUG(Schedule.dump());
1822 rewriteKernel();
1823 peelPrologAndEpilogs();
1824 fixupBranches();
1827 void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
1828 BB = Schedule.getLoop()->getTopBlock();
1829 Preheader = Schedule.getLoop()->getLoopPreheader();
1831 // Dump the schedule before we invalidate and remap all its instructions.
1832 // Stash it in a string so we can print it if we found an error.
1833 std::string ScheduleDump;
1834 raw_string_ostream OS(ScheduleDump);
1835 Schedule.print(OS);
1836 OS.flush();
1838 // First, run the normal ModuleScheduleExpander. We don't support any
1839 // InstrChanges.
1840 assert(LIS && "Requires LiveIntervals!");
1841 ModuloScheduleExpander MSE(MF, Schedule, *LIS,
1842 ModuloScheduleExpander::InstrChangesTy());
1843 MSE.expand();
1844 MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
1845 if (!ExpandedKernel) {
1846 // The expander optimized away the kernel. We can't do any useful checking.
1847 MSE.cleanup();
1848 return;
1850 // Before running the KernelRewriter, re-add BB into the CFG.
1851 Preheader->addSuccessor(BB);
1853 // Now run the new expansion algorithm.
1854 KernelRewriter KR(*Schedule.getLoop(), Schedule);
1855 KR.rewrite();
1856 peelPrologAndEpilogs();
1858 // Collect all illegal phis that the new algorithm created. We'll give these
1859 // to KernelOperandInfo.
1860 SmallPtrSet<MachineInstr *, 4> IllegalPhis;
1861 for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
1862 if (NI->isPHI())
1863 IllegalPhis.insert(&*NI);
1866 // Co-iterate across both kernels. We expect them to be identical apart from
1867 // phis and full COPYs (we look through both).
1868 SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
1869 auto OI = ExpandedKernel->begin();
1870 auto NI = BB->begin();
1871 for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
1872 while (OI->isPHI() || OI->isFullCopy())
1873 ++OI;
1874 while (NI->isPHI() || NI->isFullCopy())
1875 ++NI;
1876 assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
1877 // Analyze every operand separately.
1878 for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
1879 OOpI != OI->operands_end(); ++OOpI, ++NOpI)
1880 KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
1881 KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
1884 bool Failed = false;
1885 for (auto &OldAndNew : KOIs) {
1886 if (OldAndNew.first == OldAndNew.second)
1887 continue;
1888 Failed = true;
1889 errs() << "Modulo kernel validation error: [\n";
1890 errs() << " [golden] ";
1891 OldAndNew.first.print(errs());
1892 errs() << " ";
1893 OldAndNew.second.print(errs());
1894 errs() << "]\n";
1897 if (Failed) {
1898 errs() << "Golden reference kernel:\n";
1899 ExpandedKernel->print(errs());
1900 errs() << "New kernel:\n";
1901 BB->print(errs());
1902 errs() << ScheduleDump;
1903 report_fatal_error(
1904 "Modulo kernel validation (-pipeliner-experimental-cg) failed");
1907 // Cleanup by removing BB from the CFG again as the original
1908 // ModuloScheduleExpander intended.
1909 Preheader->removeSuccessor(BB);
1910 MSE.cleanup();
1913 //===----------------------------------------------------------------------===//
1914 // ModuloScheduleTestPass implementation
1915 //===----------------------------------------------------------------------===//
1916 // This pass constructs a ModuloSchedule from its module and runs
1917 // ModuloScheduleExpander.
1919 // The module is expected to contain a single-block analyzable loop.
1920 // The total order of instructions is taken from the loop as-is.
1921 // Instructions are expected to be annotated with a PostInstrSymbol.
1922 // This PostInstrSymbol must have the following format:
1923 // "Stage=%d Cycle=%d".
1924 //===----------------------------------------------------------------------===//
1926 namespace {
1927 class ModuloScheduleTest : public MachineFunctionPass {
1928 public:
1929 static char ID;
1931 ModuloScheduleTest() : MachineFunctionPass(ID) {
1932 initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
1935 bool runOnMachineFunction(MachineFunction &MF) override;
1936 void runOnLoop(MachineFunction &MF, MachineLoop &L);
1938 void getAnalysisUsage(AnalysisUsage &AU) const override {
1939 AU.addRequired<MachineLoopInfo>();
1940 AU.addRequired<LiveIntervals>();
1941 MachineFunctionPass::getAnalysisUsage(AU);
1944 } // namespace
1946 char ModuloScheduleTest::ID = 0;
1948 INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
1949 "Modulo Schedule test pass", false, false)
1950 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
1951 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
1952 INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
1953 "Modulo Schedule test pass", false, false)
1955 bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
1956 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
1957 for (auto *L : MLI) {
1958 if (L->getTopBlock() != L->getBottomBlock())
1959 continue;
1960 runOnLoop(MF, *L);
1961 return false;
1963 return false;
1966 static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
1967 std::pair<StringRef, StringRef> StageAndCycle = getToken(S, "_");
1968 std::pair<StringRef, StringRef> StageTokenAndValue =
1969 getToken(StageAndCycle.first, "-");
1970 std::pair<StringRef, StringRef> CycleTokenAndValue =
1971 getToken(StageAndCycle.second, "-");
1972 if (StageTokenAndValue.first != "Stage" ||
1973 CycleTokenAndValue.first != "_Cycle") {
1974 llvm_unreachable(
1975 "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
1976 return;
1979 StageTokenAndValue.second.drop_front().getAsInteger(10, Stage);
1980 CycleTokenAndValue.second.drop_front().getAsInteger(10, Cycle);
1982 dbgs() << " Stage=" << Stage << ", Cycle=" << Cycle << "\n";
1985 void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
1986 LiveIntervals &LIS = getAnalysis<LiveIntervals>();
1987 MachineBasicBlock *BB = L.getTopBlock();
1988 dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
1990 DenseMap<MachineInstr *, int> Cycle, Stage;
1991 std::vector<MachineInstr *> Instrs;
1992 for (MachineInstr &MI : *BB) {
1993 if (MI.isTerminator())
1994 continue;
1995 Instrs.push_back(&MI);
1996 if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
1997 dbgs() << "Parsing post-instr symbol for " << MI;
1998 parseSymbolString(Sym->getName(), Cycle[&MI], Stage[&MI]);
2002 ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
2003 std::move(Stage));
2004 ModuloScheduleExpander MSE(
2005 MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
2006 MSE.expand();
2007 MSE.cleanup();
2010 //===----------------------------------------------------------------------===//
2011 // ModuloScheduleTestAnnotater implementation
2012 //===----------------------------------------------------------------------===//
2014 void ModuloScheduleTestAnnotater::annotate() {
2015 for (MachineInstr *MI : S.getInstructions()) {
2016 SmallVector<char, 16> SV;
2017 raw_svector_ostream OS(SV);
2018 OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
2019 MCSymbol *Sym = MF.getContext().getOrCreateSymbol(OS.str());
2020 MI->setPostInstrSymbol(MF, Sym);