[debug] Use poison instead of undef to set a killed dbg.assign address [NFC] (#119760)
[llvm-project.git] / llvm / lib / Transforms / Scalar / LoopInterchange.cpp
bloba0c0080c0bda1ce3194e2123704b0625d8fb7d9a
1 //===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This Pass handles loop interchange transform.
10 // This pass interchanges loops to provide a more cache-friendly memory access
11 // patterns.
13 //===----------------------------------------------------------------------===//
15 #include "llvm/Transforms/Scalar/LoopInterchange.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/StringSet.h"
21 #include "llvm/Analysis/DependenceAnalysis.h"
22 #include "llvm/Analysis/LoopCacheAnalysis.h"
23 #include "llvm/Analysis/LoopInfo.h"
24 #include "llvm/Analysis/LoopNestAnalysis.h"
25 #include "llvm/Analysis/LoopPass.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ScalarEvolution.h"
28 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
29 #include "llvm/IR/BasicBlock.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Dominators.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/InstrTypes.h"
34 #include "llvm/IR/Instruction.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/IR/User.h"
37 #include "llvm/IR/Value.h"
38 #include "llvm/Support/Casting.h"
39 #include "llvm/Support/CommandLine.h"
40 #include "llvm/Support/Debug.h"
41 #include "llvm/Support/ErrorHandling.h"
42 #include "llvm/Support/raw_ostream.h"
43 #include "llvm/Transforms/Scalar/LoopPassManager.h"
44 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
45 #include "llvm/Transforms/Utils/LoopUtils.h"
46 #include <cassert>
47 #include <utility>
48 #include <vector>
50 using namespace llvm;
52 #define DEBUG_TYPE "loop-interchange"
54 STATISTIC(LoopsInterchanged, "Number of loops interchanged");
56 static cl::opt<int> LoopInterchangeCostThreshold(
57 "loop-interchange-threshold", cl::init(0), cl::Hidden,
58 cl::desc("Interchange if you gain more than this number"));
60 namespace {
62 using LoopVector = SmallVector<Loop *, 8>;
64 // TODO: Check if we can use a sparse matrix here.
65 using CharMatrix = std::vector<std::vector<char>>;
67 } // end anonymous namespace
69 // Maximum number of dependencies that can be handled in the dependency matrix.
70 static const unsigned MaxMemInstrCount = 100;
72 // Maximum loop depth supported.
73 static const unsigned MaxLoopNestDepth = 10;
75 #ifndef NDEBUG
76 static void printDepMatrix(CharMatrix &DepMatrix) {
77 for (auto &Row : DepMatrix) {
78 for (auto D : Row)
79 LLVM_DEBUG(dbgs() << D << " ");
80 LLVM_DEBUG(dbgs() << "\n");
83 #endif
85 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
86 Loop *L, DependenceInfo *DI,
87 ScalarEvolution *SE) {
88 using ValueVector = SmallVector<Value *, 16>;
90 ValueVector MemInstr;
92 // For each block.
93 for (BasicBlock *BB : L->blocks()) {
94 // Scan the BB and collect legal loads and stores.
95 for (Instruction &I : *BB) {
96 if (!isa<Instruction>(I))
97 return false;
98 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
99 if (!Ld->isSimple())
100 return false;
101 MemInstr.push_back(&I);
102 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
103 if (!St->isSimple())
104 return false;
105 MemInstr.push_back(&I);
110 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
111 << " Loads and Stores to analyze\n");
113 ValueVector::iterator I, IE, J, JE;
114 StringSet<> Seen;
116 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
117 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
118 std::vector<char> Dep;
119 Instruction *Src = cast<Instruction>(*I);
120 Instruction *Dst = cast<Instruction>(*J);
121 // Ignore Input dependencies.
122 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
123 continue;
124 // Track Output, Flow, and Anti dependencies.
125 if (auto D = DI->depends(Src, Dst, true)) {
126 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
127 // If the direction vector is negative, normalize it to
128 // make it non-negative.
129 if (D->normalize(SE))
130 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
131 LLVM_DEBUG(StringRef DepType =
132 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
133 dbgs() << "Found " << DepType
134 << " dependency between Src and Dst\n"
135 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
136 unsigned Levels = D->getLevels();
137 char Direction;
138 for (unsigned II = 1; II <= Levels; ++II) {
139 if (D->isScalar(II)) {
140 Direction = 'S';
141 Dep.push_back(Direction);
142 } else {
143 unsigned Dir = D->getDirection(II);
144 if (Dir == Dependence::DVEntry::LT ||
145 Dir == Dependence::DVEntry::LE)
146 Direction = '<';
147 else if (Dir == Dependence::DVEntry::GT ||
148 Dir == Dependence::DVEntry::GE)
149 Direction = '>';
150 else if (Dir == Dependence::DVEntry::EQ)
151 Direction = '=';
152 else
153 Direction = '*';
154 Dep.push_back(Direction);
157 while (Dep.size() != Level) {
158 Dep.push_back('I');
161 // Make sure we only add unique entries to the dependency matrix.
162 if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
163 DepMatrix.push_back(Dep);
165 if (DepMatrix.size() > MaxMemInstrCount) {
166 LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
167 << " dependencies inside loop\n");
168 return false;
174 return true;
177 // A loop is moved from index 'from' to an index 'to'. Update the Dependence
178 // matrix by exchanging the two columns.
179 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
180 unsigned ToIndx) {
181 for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I)
182 std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]);
185 // After interchanging, check if the direction vector is valid.
186 // [Theorem] A permutation of the loops in a perfect nest is legal if and only
187 // if the direction matrix, after the same permutation is applied to its
188 // columns, has no ">" direction as the leftmost non-"=" direction in any row.
189 static bool isLexicographicallyPositive(std::vector<char> &DV) {
190 for (unsigned char Direction : DV) {
191 if (Direction == '<')
192 return true;
193 if (Direction == '>' || Direction == '*')
194 return false;
196 return true;
199 // Checks if it is legal to interchange 2 loops.
200 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
201 unsigned InnerLoopId,
202 unsigned OuterLoopId) {
203 unsigned NumRows = DepMatrix.size();
204 std::vector<char> Cur;
205 // For each row check if it is valid to interchange.
206 for (unsigned Row = 0; Row < NumRows; ++Row) {
207 // Create temporary DepVector check its lexicographical order
208 // before and after swapping OuterLoop vs InnerLoop
209 Cur = DepMatrix[Row];
210 if (!isLexicographicallyPositive(Cur))
211 return false;
212 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
213 if (!isLexicographicallyPositive(Cur))
214 return false;
216 return true;
219 static void populateWorklist(Loop &L, LoopVector &LoopList) {
220 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
221 << L.getHeader()->getParent()->getName() << " Loop: %"
222 << L.getHeader()->getName() << '\n');
223 assert(LoopList.empty() && "LoopList should initially be empty!");
224 Loop *CurrentLoop = &L;
225 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
226 while (!Vec->empty()) {
227 // The current loop has multiple subloops in it hence it is not tightly
228 // nested.
229 // Discard all loops above it added into Worklist.
230 if (Vec->size() != 1) {
231 LoopList = {};
232 return;
235 LoopList.push_back(CurrentLoop);
236 CurrentLoop = Vec->front();
237 Vec = &CurrentLoop->getSubLoops();
239 LoopList.push_back(CurrentLoop);
242 static bool hasMinimumLoopDepth(SmallVectorImpl<Loop *> &LoopList) {
243 unsigned LoopNestDepth = LoopList.size();
244 if (LoopNestDepth < 2) {
245 LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
246 return false;
248 return true;
250 namespace {
252 /// LoopInterchangeLegality checks if it is legal to interchange the loop.
253 class LoopInterchangeLegality {
254 public:
255 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
256 OptimizationRemarkEmitter *ORE)
257 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
259 /// Check if the loops can be interchanged.
260 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
261 CharMatrix &DepMatrix);
263 /// Discover induction PHIs in the header of \p L. Induction
264 /// PHIs are added to \p Inductions.
265 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
267 /// Check if the loop structure is understood. We do not handle triangular
268 /// loops for now.
269 bool isLoopStructureUnderstood();
271 bool currentLimitations();
273 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
274 return OuterInnerReductions;
277 const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
278 return InnerLoopInductions;
281 private:
282 bool tightlyNested(Loop *Outer, Loop *Inner);
283 bool containsUnsafeInstructions(BasicBlock *BB);
285 /// Discover induction and reduction PHIs in the header of \p L. Induction
286 /// PHIs are added to \p Inductions, reductions are added to
287 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
288 /// to be passed as \p InnerLoop.
289 bool findInductionAndReductions(Loop *L,
290 SmallVector<PHINode *, 8> &Inductions,
291 Loop *InnerLoop);
293 Loop *OuterLoop;
294 Loop *InnerLoop;
296 ScalarEvolution *SE;
298 /// Interface to emit optimization remarks.
299 OptimizationRemarkEmitter *ORE;
301 /// Set of reduction PHIs taking part of a reduction across the inner and
302 /// outer loop.
303 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
305 /// Set of inner loop induction PHIs
306 SmallVector<PHINode *, 8> InnerLoopInductions;
309 /// LoopInterchangeProfitability checks if it is profitable to interchange the
310 /// loop.
311 class LoopInterchangeProfitability {
312 public:
313 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
314 OptimizationRemarkEmitter *ORE)
315 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
317 /// Check if the loop interchange is profitable.
318 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
319 unsigned InnerLoopId, unsigned OuterLoopId,
320 CharMatrix &DepMatrix,
321 const DenseMap<const Loop *, unsigned> &CostMap,
322 std::unique_ptr<CacheCost> &CC);
324 private:
325 int getInstrOrderCost();
326 std::optional<bool> isProfitablePerLoopCacheAnalysis(
327 const DenseMap<const Loop *, unsigned> &CostMap,
328 std::unique_ptr<CacheCost> &CC);
329 std::optional<bool> isProfitablePerInstrOrderCost();
330 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
331 unsigned OuterLoopId,
332 CharMatrix &DepMatrix);
333 Loop *OuterLoop;
334 Loop *InnerLoop;
336 /// Scev analysis.
337 ScalarEvolution *SE;
339 /// Interface to emit optimization remarks.
340 OptimizationRemarkEmitter *ORE;
343 /// LoopInterchangeTransform interchanges the loop.
344 class LoopInterchangeTransform {
345 public:
346 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
347 LoopInfo *LI, DominatorTree *DT,
348 const LoopInterchangeLegality &LIL)
349 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
351 /// Interchange OuterLoop and InnerLoop.
352 bool transform();
353 void restructureLoops(Loop *NewInner, Loop *NewOuter,
354 BasicBlock *OrigInnerPreHeader,
355 BasicBlock *OrigOuterPreHeader);
356 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
358 private:
359 bool adjustLoopLinks();
360 bool adjustLoopBranches();
362 Loop *OuterLoop;
363 Loop *InnerLoop;
365 /// Scev analysis.
366 ScalarEvolution *SE;
368 LoopInfo *LI;
369 DominatorTree *DT;
371 const LoopInterchangeLegality &LIL;
374 struct LoopInterchange {
375 ScalarEvolution *SE = nullptr;
376 LoopInfo *LI = nullptr;
377 DependenceInfo *DI = nullptr;
378 DominatorTree *DT = nullptr;
379 std::unique_ptr<CacheCost> CC = nullptr;
381 /// Interface to emit optimization remarks.
382 OptimizationRemarkEmitter *ORE;
384 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
385 DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
386 OptimizationRemarkEmitter *ORE)
387 : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
389 bool run(Loop *L) {
390 if (L->getParentLoop())
391 return false;
392 SmallVector<Loop *, 8> LoopList;
393 populateWorklist(*L, LoopList);
394 return processLoopList(LoopList);
397 bool run(LoopNest &LN) {
398 SmallVector<Loop *, 8> LoopList(LN.getLoops());
399 for (unsigned I = 1; I < LoopList.size(); ++I)
400 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
401 return false;
402 return processLoopList(LoopList);
405 bool isComputableLoopNest(ArrayRef<Loop *> LoopList) {
406 for (Loop *L : LoopList) {
407 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
408 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
409 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
410 return false;
412 if (L->getNumBackEdges() != 1) {
413 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
414 return false;
416 if (!L->getExitingBlock()) {
417 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
418 return false;
421 return true;
424 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
425 // TODO: Add a better heuristic to select the loop to be interchanged based
426 // on the dependence matrix. Currently we select the innermost loop.
427 return LoopList.size() - 1;
430 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
431 bool Changed = false;
433 // Ensure minimum loop nest depth.
434 assert(hasMinimumLoopDepth(LoopList) && "Loop nest does not meet minimum depth.");
436 unsigned LoopNestDepth = LoopList.size();
437 if (LoopNestDepth > MaxLoopNestDepth) {
438 LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
439 << MaxLoopNestDepth << "\n");
440 return false;
442 if (!isComputableLoopNest(LoopList)) {
443 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
444 return false;
447 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
448 << "\n");
450 CharMatrix DependencyMatrix;
451 Loop *OuterMostLoop = *(LoopList.begin());
452 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
453 OuterMostLoop, DI, SE)) {
454 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
455 return false;
458 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
459 printDepMatrix(DependencyMatrix));
461 // Get the Outermost loop exit.
462 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
463 if (!LoopNestExit) {
464 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
465 return false;
468 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
469 // Obtain the loop vector returned from loop cache analysis beforehand,
470 // and put each <Loop, index> pair into a map for constant time query
471 // later. Indices in loop vector reprsent the optimal order of the
472 // corresponding loop, e.g., given a loopnest with depth N, index 0
473 // indicates the loop should be placed as the outermost loop and index N
474 // indicates the loop should be placed as the innermost loop.
476 // For the old pass manager CacheCost would be null.
477 DenseMap<const Loop *, unsigned> CostMap;
478 if (CC != nullptr) {
479 const auto &LoopCosts = CC->getLoopCosts();
480 for (unsigned i = 0; i < LoopCosts.size(); i++) {
481 CostMap[LoopCosts[i].first] = i;
484 // We try to achieve the globally optimal memory access for the loopnest,
485 // and do interchange based on a bubble-sort fasion. We start from
486 // the innermost loop, move it outwards to the best possible position
487 // and repeat this process.
488 for (unsigned j = SelecLoopId; j > 0; j--) {
489 bool ChangedPerIter = false;
490 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
491 bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
492 DependencyMatrix, CostMap);
493 if (!Interchanged)
494 continue;
495 // Loops interchanged, update LoopList accordingly.
496 std::swap(LoopList[i - 1], LoopList[i]);
497 // Update the DependencyMatrix
498 interChangeDependencies(DependencyMatrix, i, i - 1);
500 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
501 printDepMatrix(DependencyMatrix));
503 ChangedPerIter |= Interchanged;
504 Changed |= Interchanged;
506 // Early abort if there was no interchange during an entire round of
507 // moving loops outwards.
508 if (!ChangedPerIter)
509 break;
511 return Changed;
514 bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
515 unsigned OuterLoopId,
516 std::vector<std::vector<char>> &DependencyMatrix,
517 const DenseMap<const Loop *, unsigned> &CostMap) {
518 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
519 << " and OuterLoopId = " << OuterLoopId << "\n");
520 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
521 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
522 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
523 return false;
525 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
526 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
527 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
528 DependencyMatrix, CostMap, CC)) {
529 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
530 return false;
533 ORE->emit([&]() {
534 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
535 InnerLoop->getStartLoc(),
536 InnerLoop->getHeader())
537 << "Loop interchanged with enclosing loop.";
540 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
541 LIT.transform();
542 LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
543 LoopsInterchanged++;
545 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
546 return true;
550 } // end anonymous namespace
552 bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
553 return any_of(*BB, [](const Instruction &I) {
554 return I.mayHaveSideEffects() || I.mayReadFromMemory();
558 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
559 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
560 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
561 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
563 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
565 // A perfectly nested loop will not have any branch in between the outer and
566 // inner block i.e. outer header will branch to either inner preheader and
567 // outerloop latch.
568 BranchInst *OuterLoopHeaderBI =
569 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
570 if (!OuterLoopHeaderBI)
571 return false;
573 for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
574 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
575 Succ != OuterLoopLatch)
576 return false;
578 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
579 // We do not have any basic block in between now make sure the outer header
580 // and outer loop latch doesn't contain any unsafe instructions.
581 if (containsUnsafeInstructions(OuterLoopHeader) ||
582 containsUnsafeInstructions(OuterLoopLatch))
583 return false;
585 // Also make sure the inner loop preheader does not contain any unsafe
586 // instructions. Note that all instructions in the preheader will be moved to
587 // the outer loop header when interchanging.
588 if (InnerLoopPreHeader != OuterLoopHeader &&
589 containsUnsafeInstructions(InnerLoopPreHeader))
590 return false;
592 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
593 // Ensure the inner loop exit block flows to the outer loop latch possibly
594 // through empty blocks.
595 const BasicBlock &SuccInner =
596 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
597 if (&SuccInner != OuterLoopLatch) {
598 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
599 << " does not lead to the outer loop latch.\n";);
600 return false;
602 // The inner loop exit block does flow to the outer loop latch and not some
603 // other BBs, now make sure it contains safe instructions, since it will be
604 // moved into the (new) inner loop after interchange.
605 if (containsUnsafeInstructions(InnerLoopExit))
606 return false;
608 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
609 // We have a perfect loop nest.
610 return true;
613 bool LoopInterchangeLegality::isLoopStructureUnderstood() {
614 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
615 for (PHINode *InnerInduction : InnerLoopInductions) {
616 unsigned Num = InnerInduction->getNumOperands();
617 for (unsigned i = 0; i < Num; ++i) {
618 Value *Val = InnerInduction->getOperand(i);
619 if (isa<Constant>(Val))
620 continue;
621 Instruction *I = dyn_cast<Instruction>(Val);
622 if (!I)
623 return false;
624 // TODO: Handle triangular loops.
625 // e.g. for(int i=0;i<N;i++)
626 // for(int j=i;j<N;j++)
627 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
628 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
629 InnerLoopPreheader &&
630 !OuterLoop->isLoopInvariant(I)) {
631 return false;
636 // TODO: Handle triangular loops of another form.
637 // e.g. for(int i=0;i<N;i++)
638 // for(int j=0;j<i;j++)
639 // or,
640 // for(int i=0;i<N;i++)
641 // for(int j=0;j*i<N;j++)
642 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
643 BranchInst *InnerLoopLatchBI =
644 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
645 if (!InnerLoopLatchBI->isConditional())
646 return false;
647 if (CmpInst *InnerLoopCmp =
648 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
649 Value *Op0 = InnerLoopCmp->getOperand(0);
650 Value *Op1 = InnerLoopCmp->getOperand(1);
652 // LHS and RHS of the inner loop exit condition, e.g.,
653 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
654 Value *Left = nullptr;
655 Value *Right = nullptr;
657 // Check if V only involves inner loop induction variable.
658 // Return true if V is InnerInduction, or a cast from
659 // InnerInduction, or a binary operator that involves
660 // InnerInduction and a constant.
661 std::function<bool(Value *)> IsPathToInnerIndVar;
662 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
663 if (llvm::is_contained(InnerLoopInductions, V))
664 return true;
665 if (isa<Constant>(V))
666 return true;
667 const Instruction *I = dyn_cast<Instruction>(V);
668 if (!I)
669 return false;
670 if (isa<CastInst>(I))
671 return IsPathToInnerIndVar(I->getOperand(0));
672 if (isa<BinaryOperator>(I))
673 return IsPathToInnerIndVar(I->getOperand(0)) &&
674 IsPathToInnerIndVar(I->getOperand(1));
675 return false;
678 // In case of multiple inner loop indvars, it is okay if LHS and RHS
679 // are both inner indvar related variables.
680 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
681 return true;
683 // Otherwise we check if the cmp instruction compares an inner indvar
684 // related variable (Left) with a outer loop invariant (Right).
685 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
686 Left = Op0;
687 Right = Op1;
688 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
689 Left = Op1;
690 Right = Op0;
693 if (Left == nullptr)
694 return false;
696 const SCEV *S = SE->getSCEV(Right);
697 if (!SE->isLoopInvariant(S, OuterLoop))
698 return false;
701 return true;
704 // If SV is a LCSSA PHI node with a single incoming value, return the incoming
705 // value.
706 static Value *followLCSSA(Value *SV) {
707 PHINode *PHI = dyn_cast<PHINode>(SV);
708 if (!PHI)
709 return SV;
711 if (PHI->getNumIncomingValues() != 1)
712 return SV;
713 return followLCSSA(PHI->getIncomingValue(0));
716 // Check V's users to see if it is involved in a reduction in L.
717 static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
718 // Reduction variables cannot be constants.
719 if (isa<Constant>(V))
720 return nullptr;
722 for (Value *User : V->users()) {
723 if (PHINode *PHI = dyn_cast<PHINode>(User)) {
724 if (PHI->getNumIncomingValues() == 1)
725 continue;
726 RecurrenceDescriptor RD;
727 if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
728 // Detect floating point reduction only when it can be reordered.
729 if (RD.getExactFPMathInst() != nullptr)
730 return nullptr;
731 return PHI;
733 return nullptr;
737 return nullptr;
740 bool LoopInterchangeLegality::findInductionAndReductions(
741 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
742 if (!L->getLoopLatch() || !L->getLoopPredecessor())
743 return false;
744 for (PHINode &PHI : L->getHeader()->phis()) {
745 InductionDescriptor ID;
746 if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
747 Inductions.push_back(&PHI);
748 else {
749 // PHIs in inner loops need to be part of a reduction in the outer loop,
750 // discovered when checking the PHIs of the outer loop earlier.
751 if (!InnerLoop) {
752 if (!OuterInnerReductions.count(&PHI)) {
753 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
754 "across the outer loop.\n");
755 return false;
757 } else {
758 assert(PHI.getNumIncomingValues() == 2 &&
759 "Phis in loop header should have exactly 2 incoming values");
760 // Check if we have a PHI node in the outer loop that has a reduction
761 // result from the inner loop as an incoming value.
762 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
763 PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
764 if (!InnerRedPhi ||
765 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
766 LLVM_DEBUG(
767 dbgs()
768 << "Failed to recognize PHI as an induction or reduction.\n");
769 return false;
771 OuterInnerReductions.insert(&PHI);
772 OuterInnerReductions.insert(InnerRedPhi);
776 return true;
779 // This function indicates the current limitations in the transform as a result
780 // of which we do not proceed.
781 bool LoopInterchangeLegality::currentLimitations() {
782 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
784 // transform currently expects the loop latches to also be the exiting
785 // blocks.
786 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
787 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
788 !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
789 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
790 LLVM_DEBUG(
791 dbgs() << "Loops where the latch is not the exiting block are not"
792 << " supported currently.\n");
793 ORE->emit([&]() {
794 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
795 OuterLoop->getStartLoc(),
796 OuterLoop->getHeader())
797 << "Loops where the latch is not the exiting block cannot be"
798 " interchange currently.";
800 return true;
803 SmallVector<PHINode *, 8> Inductions;
804 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
805 LLVM_DEBUG(
806 dbgs() << "Only outer loops with induction or reduction PHI nodes "
807 << "are supported currently.\n");
808 ORE->emit([&]() {
809 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
810 OuterLoop->getStartLoc(),
811 OuterLoop->getHeader())
812 << "Only outer loops with induction or reduction PHI nodes can be"
813 " interchanged currently.";
815 return true;
818 Inductions.clear();
819 // For multi-level loop nests, make sure that all phi nodes for inner loops
820 // at all levels can be recognized as a induction or reduction phi. Bail out
821 // if a phi node at a certain nesting level cannot be properly recognized.
822 Loop *CurLevelLoop = OuterLoop;
823 while (!CurLevelLoop->getSubLoops().empty()) {
824 // We already made sure that the loop nest is tightly nested.
825 CurLevelLoop = CurLevelLoop->getSubLoops().front();
826 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
827 LLVM_DEBUG(
828 dbgs() << "Only inner loops with induction or reduction PHI nodes "
829 << "are supported currently.\n");
830 ORE->emit([&]() {
831 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
832 CurLevelLoop->getStartLoc(),
833 CurLevelLoop->getHeader())
834 << "Only inner loops with induction or reduction PHI nodes can be"
835 " interchange currently.";
837 return true;
841 // TODO: Triangular loops are not handled for now.
842 if (!isLoopStructureUnderstood()) {
843 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
844 ORE->emit([&]() {
845 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
846 InnerLoop->getStartLoc(),
847 InnerLoop->getHeader())
848 << "Inner loop structure not understood currently.";
850 return true;
853 return false;
856 bool LoopInterchangeLegality::findInductions(
857 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
858 for (PHINode &PHI : L->getHeader()->phis()) {
859 InductionDescriptor ID;
860 if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
861 Inductions.push_back(&PHI);
863 return !Inductions.empty();
866 // We currently only support LCSSA PHI nodes in the inner loop exit, if their
867 // users are either reduction PHIs or PHIs outside the outer loop (which means
868 // the we are only interested in the final value after the loop).
869 static bool
870 areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
871 SmallPtrSetImpl<PHINode *> &Reductions) {
872 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
873 for (PHINode &PHI : InnerExit->phis()) {
874 // Reduction lcssa phi will have only 1 incoming block that from loop latch.
875 if (PHI.getNumIncomingValues() > 1)
876 return false;
877 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
878 PHINode *PN = dyn_cast<PHINode>(U);
879 return !PN ||
880 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
881 })) {
882 return false;
885 return true;
888 // We currently support LCSSA PHI nodes in the outer loop exit, if their
889 // incoming values do not come from the outer loop latch or if the
890 // outer loop latch has a single predecessor. In that case, the value will
891 // be available if both the inner and outer loop conditions are true, which
892 // will still be true after interchanging. If we have multiple predecessor,
893 // that may not be the case, e.g. because the outer loop latch may be executed
894 // if the inner loop is not executed.
895 static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
896 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
897 for (PHINode &PHI : LoopNestExit->phis()) {
898 for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
899 Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
900 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
901 continue;
903 // The incoming value is defined in the outer loop latch. Currently we
904 // only support that in case the outer loop latch has a single predecessor.
905 // This guarantees that the outer loop latch is executed if and only if
906 // the inner loop is executed (because tightlyNested() guarantees that the
907 // outer loop header only branches to the inner loop or the outer loop
908 // latch).
909 // FIXME: We could weaken this logic and allow multiple predecessors,
910 // if the values are produced outside the loop latch. We would need
911 // additional logic to update the PHI nodes in the exit block as
912 // well.
913 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
914 return false;
917 return true;
920 // In case of multi-level nested loops, it may occur that lcssa phis exist in
921 // the latch of InnerLoop, i.e., when defs of the incoming values are further
922 // inside the loopnest. Sometimes those incoming values are not available
923 // after interchange, since the original inner latch will become the new outer
924 // latch which may have predecessor paths that do not include those incoming
925 // values.
926 // TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
927 // multi-level loop nests.
928 static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
929 if (InnerLoop->getSubLoops().empty())
930 return true;
931 // If the original outer latch has only one predecessor, then values defined
932 // further inside the looploop, e.g., in the innermost loop, will be available
933 // at the new outer latch after interchange.
934 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
935 return true;
937 // The outer latch has more than one predecessors, i.e., the inner
938 // exit and the inner header.
939 // PHI nodes in the inner latch are lcssa phis where the incoming values
940 // are defined further inside the loopnest. Check if those phis are used
941 // in the original inner latch. If that is the case then bail out since
942 // those incoming values may not be available at the new outer latch.
943 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
944 for (PHINode &PHI : InnerLoopLatch->phis()) {
945 for (auto *U : PHI.users()) {
946 Instruction *UI = cast<Instruction>(U);
947 if (InnerLoopLatch == UI->getParent())
948 return false;
951 return true;
954 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
955 unsigned OuterLoopId,
956 CharMatrix &DepMatrix) {
957 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
958 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
959 << " and OuterLoopId = " << OuterLoopId
960 << " due to dependence\n");
961 ORE->emit([&]() {
962 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
963 InnerLoop->getStartLoc(),
964 InnerLoop->getHeader())
965 << "Cannot interchange loops due to dependences.";
967 return false;
969 // Check if outer and inner loop contain legal instructions only.
970 for (auto *BB : OuterLoop->blocks())
971 for (Instruction &I : BB->instructionsWithoutDebug())
972 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
973 // readnone functions do not prevent interchanging.
974 if (CI->onlyWritesMemory())
975 continue;
976 LLVM_DEBUG(
977 dbgs() << "Loops with call instructions cannot be interchanged "
978 << "safely.");
979 ORE->emit([&]() {
980 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
981 CI->getDebugLoc(),
982 CI->getParent())
983 << "Cannot interchange loops due to call instruction.";
986 return false;
989 if (!findInductions(InnerLoop, InnerLoopInductions)) {
990 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
991 return false;
994 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
995 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
996 ORE->emit([&]() {
997 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
998 InnerLoop->getStartLoc(),
999 InnerLoop->getHeader())
1000 << "Cannot interchange loops because unsupported PHI nodes found "
1001 "in inner loop latch.";
1003 return false;
1006 // TODO: The loops could not be interchanged due to current limitations in the
1007 // transform module.
1008 if (currentLimitations()) {
1009 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1010 return false;
1013 // Check if the loops are tightly nested.
1014 if (!tightlyNested(OuterLoop, InnerLoop)) {
1015 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1016 ORE->emit([&]() {
1017 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1018 InnerLoop->getStartLoc(),
1019 InnerLoop->getHeader())
1020 << "Cannot interchange loops because they are not tightly "
1021 "nested.";
1023 return false;
1026 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
1027 OuterInnerReductions)) {
1028 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1029 ORE->emit([&]() {
1030 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1031 InnerLoop->getStartLoc(),
1032 InnerLoop->getHeader())
1033 << "Found unsupported PHI node in loop exit.";
1035 return false;
1038 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1039 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1040 ORE->emit([&]() {
1041 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1042 OuterLoop->getStartLoc(),
1043 OuterLoop->getHeader())
1044 << "Found unsupported PHI node in loop exit.";
1046 return false;
1049 return true;
1052 int LoopInterchangeProfitability::getInstrOrderCost() {
1053 unsigned GoodOrder, BadOrder;
1054 BadOrder = GoodOrder = 0;
1055 for (BasicBlock *BB : InnerLoop->blocks()) {
1056 for (Instruction &Ins : *BB) {
1057 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1058 unsigned NumOp = GEP->getNumOperands();
1059 bool FoundInnerInduction = false;
1060 bool FoundOuterInduction = false;
1061 for (unsigned i = 0; i < NumOp; ++i) {
1062 // Skip operands that are not SCEV-able.
1063 if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
1064 continue;
1066 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1067 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1068 if (!AR)
1069 continue;
1071 // If we find the inner induction after an outer induction e.g.
1072 // for(int i=0;i<N;i++)
1073 // for(int j=0;j<N;j++)
1074 // A[i][j] = A[i-1][j-1]+k;
1075 // then it is a good order.
1076 if (AR->getLoop() == InnerLoop) {
1077 // We found an InnerLoop induction after OuterLoop induction. It is
1078 // a good order.
1079 FoundInnerInduction = true;
1080 if (FoundOuterInduction) {
1081 GoodOrder++;
1082 break;
1085 // If we find the outer induction after an inner induction e.g.
1086 // for(int i=0;i<N;i++)
1087 // for(int j=0;j<N;j++)
1088 // A[j][i] = A[j-1][i-1]+k;
1089 // then it is a bad order.
1090 if (AR->getLoop() == OuterLoop) {
1091 // We found an OuterLoop induction after InnerLoop induction. It is
1092 // a bad order.
1093 FoundOuterInduction = true;
1094 if (FoundInnerInduction) {
1095 BadOrder++;
1096 break;
1103 return GoodOrder - BadOrder;
1106 std::optional<bool>
1107 LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1108 const DenseMap<const Loop *, unsigned> &CostMap,
1109 std::unique_ptr<CacheCost> &CC) {
1110 // This is the new cost model returned from loop cache analysis.
1111 // A smaller index means the loop should be placed an outer loop, and vice
1112 // versa.
1113 if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) {
1114 unsigned InnerIndex = 0, OuterIndex = 0;
1115 InnerIndex = CostMap.find(InnerLoop)->second;
1116 OuterIndex = CostMap.find(OuterLoop)->second;
1117 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1118 << ", OuterIndex = " << OuterIndex << "\n");
1119 if (InnerIndex < OuterIndex)
1120 return std::optional<bool>(true);
1121 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1122 "numbers to each loop");
1123 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1124 return std::nullopt;
1125 return std::optional<bool>(false);
1127 return std::nullopt;
1130 std::optional<bool>
1131 LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1132 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1133 // good and bad order of induction variables in the instruction and allows
1134 // reordering if number of bad orders is more than good.
1135 int Cost = getInstrOrderCost();
1136 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1137 if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
1138 return std::optional<bool>(true);
1140 return std::nullopt;
1143 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1144 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1145 for (auto &Row : DepMatrix) {
1146 // If the inner loop is loop independent or doesn't carry any dependency
1147 // it is not profitable to move this to outer position, since we are
1148 // likely able to do inner loop vectorization already.
1149 if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
1150 return std::optional<bool>(false);
1152 // If the outer loop is not loop independent it is not profitable to move
1153 // this to inner position, since doing so would not enable inner loop
1154 // parallelism.
1155 if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=')
1156 return std::optional<bool>(false);
1158 // If inner loop has dependence and outer loop is loop independent then it
1159 // is/ profitable to interchange to enable inner loop parallelism.
1160 // If there are no dependences, interchanging will not improve anything.
1161 return std::optional<bool>(!DepMatrix.empty());
1164 bool LoopInterchangeProfitability::isProfitable(
1165 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1166 unsigned OuterLoopId, CharMatrix &DepMatrix,
1167 const DenseMap<const Loop *, unsigned> &CostMap,
1168 std::unique_ptr<CacheCost> &CC) {
1169 // isProfitable() is structured to avoid endless loop interchange.
1170 // If loop cache analysis could decide the profitability then,
1171 // profitability check will stop and return the analysis result.
1172 // If cache analysis failed to analyze the loopnest (e.g.,
1173 // due to delinearization issues) then only check whether it is
1174 // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
1175 // analysis the profitability then only, isProfitableForVectorization
1176 // will decide.
1177 std::optional<bool> shouldInterchange =
1178 isProfitablePerLoopCacheAnalysis(CostMap, CC);
1179 if (!shouldInterchange.has_value()) {
1180 shouldInterchange = isProfitablePerInstrOrderCost();
1181 if (!shouldInterchange.has_value())
1182 shouldInterchange =
1183 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1185 if (!shouldInterchange.has_value()) {
1186 ORE->emit([&]() {
1187 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1188 InnerLoop->getStartLoc(),
1189 InnerLoop->getHeader())
1190 << "Insufficient information to calculate the cost of loop for "
1191 "interchange.";
1193 return false;
1194 } else if (!shouldInterchange.value()) {
1195 ORE->emit([&]() {
1196 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1197 InnerLoop->getStartLoc(),
1198 InnerLoop->getHeader())
1199 << "Interchanging loops is not considered to improve cache "
1200 "locality nor vectorization.";
1202 return false;
1204 return true;
1207 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1208 Loop *InnerLoop) {
1209 for (Loop *L : *OuterLoop)
1210 if (L == InnerLoop) {
1211 OuterLoop->removeChildLoop(L);
1212 return;
1214 llvm_unreachable("Couldn't find loop");
1217 /// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1218 /// new inner and outer loop after interchanging: NewInner is the original
1219 /// outer loop and NewOuter is the original inner loop.
1221 /// Before interchanging, we have the following structure
1222 /// Outer preheader
1223 // Outer header
1224 // Inner preheader
1225 // Inner header
1226 // Inner body
1227 // Inner latch
1228 // outer bbs
1229 // Outer latch
1231 // After interchanging:
1232 // Inner preheader
1233 // Inner header
1234 // Outer preheader
1235 // Outer header
1236 // Inner body
1237 // outer bbs
1238 // Outer latch
1239 // Inner latch
1240 void LoopInterchangeTransform::restructureLoops(
1241 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1242 BasicBlock *OrigOuterPreHeader) {
1243 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1244 // The original inner loop preheader moves from the new inner loop to
1245 // the parent loop, if there is one.
1246 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1247 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1249 // Switch the loop levels.
1250 if (OuterLoopParent) {
1251 // Remove the loop from its parent loop.
1252 removeChildLoop(OuterLoopParent, NewInner);
1253 removeChildLoop(NewInner, NewOuter);
1254 OuterLoopParent->addChildLoop(NewOuter);
1255 } else {
1256 removeChildLoop(NewInner, NewOuter);
1257 LI->changeTopLevelLoop(NewInner, NewOuter);
1259 while (!NewOuter->isInnermost())
1260 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1261 NewOuter->addChildLoop(NewInner);
1263 // BBs from the original inner loop.
1264 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1266 // Add BBs from the original outer loop to the original inner loop (excluding
1267 // BBs already in inner loop)
1268 for (BasicBlock *BB : NewInner->blocks())
1269 if (LI->getLoopFor(BB) == NewInner)
1270 NewOuter->addBlockEntry(BB);
1272 // Now remove inner loop header and latch from the new inner loop and move
1273 // other BBs (the loop body) to the new inner loop.
1274 BasicBlock *OuterHeader = NewOuter->getHeader();
1275 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1276 for (BasicBlock *BB : OrigInnerBBs) {
1277 // Nothing will change for BBs in child loops.
1278 if (LI->getLoopFor(BB) != NewOuter)
1279 continue;
1280 // Remove the new outer loop header and latch from the new inner loop.
1281 if (BB == OuterHeader || BB == OuterLatch)
1282 NewInner->removeBlockFromLoop(BB);
1283 else
1284 LI->changeLoopFor(BB, NewInner);
1287 // The preheader of the original outer loop becomes part of the new
1288 // outer loop.
1289 NewOuter->addBlockEntry(OrigOuterPreHeader);
1290 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1292 // Tell SE that we move the loops around.
1293 SE->forgetLoop(NewOuter);
1296 bool LoopInterchangeTransform::transform() {
1297 bool Transformed = false;
1299 if (InnerLoop->getSubLoops().empty()) {
1300 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1301 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1302 auto &InductionPHIs = LIL.getInnerLoopInductions();
1303 if (InductionPHIs.empty()) {
1304 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1305 return false;
1308 SmallVector<Instruction *, 8> InnerIndexVarList;
1309 for (PHINode *CurInductionPHI : InductionPHIs) {
1310 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1311 InnerIndexVarList.push_back(
1312 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1313 else
1314 InnerIndexVarList.push_back(
1315 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1318 // Create a new latch block for the inner loop. We split at the
1319 // current latch's terminator and then move the condition and all
1320 // operands that are not either loop-invariant or the induction PHI into the
1321 // new latch block.
1322 BasicBlock *NewLatch =
1323 SplitBlock(InnerLoop->getLoopLatch(),
1324 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1326 SmallSetVector<Instruction *, 4> WorkList;
1327 unsigned i = 0;
1328 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1329 for (; i < WorkList.size(); i++) {
1330 // Duplicate instruction and move it the new latch. Update uses that
1331 // have been moved.
1332 Instruction *NewI = WorkList[i]->clone();
1333 NewI->insertBefore(NewLatch->getFirstNonPHI());
1334 assert(!NewI->mayHaveSideEffects() &&
1335 "Moving instructions with side-effects may change behavior of "
1336 "the loop nest!");
1337 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1338 Instruction *UserI = cast<Instruction>(U.getUser());
1339 if (!InnerLoop->contains(UserI->getParent()) ||
1340 UserI->getParent() == NewLatch ||
1341 llvm::is_contained(InductionPHIs, UserI))
1342 U.set(NewI);
1344 // Add operands of moved instruction to the worklist, except if they are
1345 // outside the inner loop or are the induction PHI.
1346 for (Value *Op : WorkList[i]->operands()) {
1347 Instruction *OpI = dyn_cast<Instruction>(Op);
1348 if (!OpI ||
1349 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1350 llvm::is_contained(InductionPHIs, OpI))
1351 continue;
1352 WorkList.insert(OpI);
1357 // FIXME: Should we interchange when we have a constant condition?
1358 Instruction *CondI = dyn_cast<Instruction>(
1359 cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
1360 ->getCondition());
1361 if (CondI)
1362 WorkList.insert(CondI);
1363 MoveInstructions();
1364 for (Instruction *InnerIndexVar : InnerIndexVarList)
1365 WorkList.insert(cast<Instruction>(InnerIndexVar));
1366 MoveInstructions();
1369 // Ensure the inner loop phi nodes have a separate basic block.
1370 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1371 if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) {
1372 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1373 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1376 // Instructions in the original inner loop preheader may depend on values
1377 // defined in the outer loop header. Move them there, because the original
1378 // inner loop preheader will become the entry into the interchanged loop nest.
1379 // Currently we move all instructions and rely on LICM to move invariant
1380 // instructions outside the loop nest.
1381 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1382 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1383 if (InnerLoopPreHeader != OuterLoopHeader) {
1384 SmallPtrSet<Instruction *, 4> NeedsMoving;
1385 for (Instruction &I :
1386 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
1387 std::prev(InnerLoopPreHeader->end()))))
1388 I.moveBeforePreserving(OuterLoopHeader->getTerminator());
1391 Transformed |= adjustLoopLinks();
1392 if (!Transformed) {
1393 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1394 return false;
1397 return true;
1400 /// \brief Move all instructions except the terminator from FromBB right before
1401 /// InsertBefore
1402 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1403 BasicBlock *ToBB = InsertBefore->getParent();
1405 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
1406 FromBB->getTerminator()->getIterator());
1409 /// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1410 static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1411 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1412 // from BB1 afterwards.
1413 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
1414 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
1415 for (Instruction *I : TempInstrs)
1416 I->removeFromParent();
1418 // Move instructions from BB2 to BB1.
1419 moveBBContents(BB2, BB1->getTerminator());
1421 // Move instructions from TempInstrs to BB2.
1422 for (Instruction *I : TempInstrs)
1423 I->insertBefore(BB2->getTerminator());
1426 // Update BI to jump to NewBB instead of OldBB. Records updates to the
1427 // dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1428 // \p OldBB is exactly once in BI's successor list.
1429 static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1430 BasicBlock *NewBB,
1431 std::vector<DominatorTree::UpdateType> &DTUpdates,
1432 bool MustUpdateOnce = true) {
1433 assert((!MustUpdateOnce ||
1434 llvm::count_if(successors(BI),
1435 [OldBB](BasicBlock *BB) {
1436 return BB == OldBB;
1437 }) == 1) && "BI must jump to OldBB exactly once.");
1438 bool Changed = false;
1439 for (Use &Op : BI->operands())
1440 if (Op == OldBB) {
1441 Op.set(NewBB);
1442 Changed = true;
1445 if (Changed) {
1446 DTUpdates.push_back(
1447 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1448 DTUpdates.push_back(
1449 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1451 assert(Changed && "Expected a successor to be updated");
1454 // Move Lcssa PHIs to the right place.
1455 static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1456 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1457 BasicBlock *OuterLatch, BasicBlock *OuterExit,
1458 Loop *InnerLoop, LoopInfo *LI) {
1460 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1461 // defined either in the header or latch. Those blocks will become header and
1462 // latch of the new outer loop, and the only possible users can PHI nodes
1463 // in the exit block of the loop nest or the outer loop header (reduction
1464 // PHIs, in that case, the incoming value must be defined in the inner loop
1465 // header). We can just substitute the user with the incoming value and remove
1466 // the PHI.
1467 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1468 assert(P.getNumIncomingValues() == 1 &&
1469 "Only loops with a single exit are supported!");
1471 // Incoming values are guaranteed be instructions currently.
1472 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1473 // In case of multi-level nested loops, follow LCSSA to find the incoming
1474 // value defined from the innermost loop.
1475 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
1476 // Skip phis with incoming values from the inner loop body, excluding the
1477 // header and latch.
1478 if (IncIInnerMost->getParent() != InnerLatch &&
1479 IncIInnerMost->getParent() != InnerHeader)
1480 continue;
1482 assert(all_of(P.users(),
1483 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1484 return (cast<PHINode>(U)->getParent() == OuterHeader &&
1485 IncI->getParent() == InnerHeader) ||
1486 cast<PHINode>(U)->getParent() == OuterExit;
1487 }) &&
1488 "Can only replace phis iff the uses are in the loop nest exit or "
1489 "the incoming value is defined in the inner header (it will "
1490 "dominate all loop blocks after interchanging)");
1491 P.replaceAllUsesWith(IncI);
1492 P.eraseFromParent();
1495 SmallVector<PHINode *, 8> LcssaInnerExit;
1496 for (PHINode &P : InnerExit->phis())
1497 LcssaInnerExit.push_back(&P);
1499 SmallVector<PHINode *, 8> LcssaInnerLatch;
1500 for (PHINode &P : InnerLatch->phis())
1501 LcssaInnerLatch.push_back(&P);
1503 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1504 // If a PHI node has users outside of InnerExit, it has a use outside the
1505 // interchanged loop and we have to preserve it. We move these to
1506 // InnerLatch, which will become the new exit block for the innermost
1507 // loop after interchanging.
1508 for (PHINode *P : LcssaInnerExit)
1509 P->moveBefore(InnerLatch->getFirstNonPHI());
1511 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1512 // and we have to move them to the new inner latch.
1513 for (PHINode *P : LcssaInnerLatch)
1514 P->moveBefore(InnerExit->getFirstNonPHI());
1516 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1517 // incoming values defined in the outer loop, we have to add a new PHI
1518 // in the inner loop latch, which became the exit block of the outer loop,
1519 // after interchanging.
1520 if (OuterExit) {
1521 for (PHINode &P : OuterExit->phis()) {
1522 if (P.getNumIncomingValues() != 1)
1523 continue;
1524 // Skip Phis with incoming values defined in the inner loop. Those should
1525 // already have been updated.
1526 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1527 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
1528 continue;
1530 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
1531 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
1532 NewPhi->setIncomingBlock(0, OuterLatch);
1533 // We might have incoming edges from other BBs, i.e., the original outer
1534 // header.
1535 for (auto *Pred : predecessors(InnerLatch)) {
1536 if (Pred == OuterLatch)
1537 continue;
1538 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
1540 NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
1541 P.setIncomingValue(0, NewPhi);
1545 // Now adjust the incoming blocks for the LCSSA PHIs.
1546 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1547 // with the new latch.
1548 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1551 bool LoopInterchangeTransform::adjustLoopBranches() {
1552 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1553 std::vector<DominatorTree::UpdateType> DTUpdates;
1555 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1556 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1558 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1559 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1560 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1561 // Ensure that both preheaders do not contain PHI nodes and have single
1562 // predecessors. This allows us to move them easily. We use
1563 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1564 // preheaders do not satisfy those conditions.
1565 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1566 !OuterLoopPreHeader->getUniquePredecessor())
1567 OuterLoopPreHeader =
1568 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1569 if (InnerLoopPreHeader == OuterLoop->getHeader())
1570 InnerLoopPreHeader =
1571 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1573 // Adjust the loop preheader
1574 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1575 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1576 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1577 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1578 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1579 BasicBlock *InnerLoopLatchPredecessor =
1580 InnerLoopLatch->getUniquePredecessor();
1581 BasicBlock *InnerLoopLatchSuccessor;
1582 BasicBlock *OuterLoopLatchSuccessor;
1584 BranchInst *OuterLoopLatchBI =
1585 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1586 BranchInst *InnerLoopLatchBI =
1587 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1588 BranchInst *OuterLoopHeaderBI =
1589 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1590 BranchInst *InnerLoopHeaderBI =
1591 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1593 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1594 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1595 !InnerLoopHeaderBI)
1596 return false;
1598 BranchInst *InnerLoopLatchPredecessorBI =
1599 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1600 BranchInst *OuterLoopPredecessorBI =
1601 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1603 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1604 return false;
1605 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1606 if (!InnerLoopHeaderSuccessor)
1607 return false;
1609 // Adjust Loop Preheader and headers.
1610 // The branches in the outer loop predecessor and the outer loop header can
1611 // be unconditional branches or conditional branches with duplicates. Consider
1612 // this when updating the successors.
1613 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
1614 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
1615 // The outer loop header might or might not branch to the outer latch.
1616 // We are guaranteed to branch to the inner loop preheader.
1617 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
1618 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
1619 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
1620 DTUpdates,
1621 /*MustUpdateOnce=*/false);
1623 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
1624 InnerLoopHeaderSuccessor, DTUpdates,
1625 /*MustUpdateOnce=*/false);
1627 // Adjust reduction PHI's now that the incoming block has changed.
1628 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
1629 OuterLoopHeader);
1631 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
1632 OuterLoopPreHeader, DTUpdates);
1634 // -------------Adjust loop latches-----------
1635 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1636 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1637 else
1638 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1640 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
1641 InnerLoopLatchSuccessor, DTUpdates);
1643 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1644 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1645 else
1646 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1648 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
1649 OuterLoopLatchSuccessor, DTUpdates);
1650 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1651 DTUpdates);
1653 DT->applyUpdates(DTUpdates);
1654 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1655 OuterLoopPreHeader);
1657 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1658 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
1659 InnerLoop, LI);
1660 // For PHIs in the exit block of the outer loop, outer's latch has been
1661 // replaced by Inners'.
1662 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1664 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1665 // Now update the reduction PHIs in the inner and outer loop headers.
1666 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1667 for (PHINode &PHI : InnerLoopHeader->phis())
1668 if (OuterInnerReductions.contains(&PHI))
1669 InnerLoopPHIs.push_back(&PHI);
1671 for (PHINode &PHI : OuterLoopHeader->phis())
1672 if (OuterInnerReductions.contains(&PHI))
1673 OuterLoopPHIs.push_back(&PHI);
1675 // Now move the remaining reduction PHIs from outer to inner loop header and
1676 // vice versa. The PHI nodes must be part of a reduction across the inner and
1677 // outer loop and all the remains to do is and updating the incoming blocks.
1678 for (PHINode *PHI : OuterLoopPHIs) {
1679 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
1680 PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
1681 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1683 for (PHINode *PHI : InnerLoopPHIs) {
1684 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
1685 PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
1686 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1689 // Update the incoming blocks for moved PHI nodes.
1690 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
1691 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
1692 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
1693 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1695 // Values defined in the outer loop header could be used in the inner loop
1696 // latch. In that case, we need to create LCSSA phis for them, because after
1697 // interchanging they will be defined in the new inner loop and used in the
1698 // new outer loop.
1699 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
1700 for (Instruction &I :
1701 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
1702 MayNeedLCSSAPhis.push_back(&I);
1703 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
1705 return true;
1708 bool LoopInterchangeTransform::adjustLoopLinks() {
1709 // Adjust all branches in the inner and outer loop.
1710 bool Changed = adjustLoopBranches();
1711 if (Changed) {
1712 // We have interchanged the preheaders so we need to interchange the data in
1713 // the preheaders as well. This is because the content of the inner
1714 // preheader was previously executed inside the outer loop.
1715 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1716 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1717 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
1719 return Changed;
1722 PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
1723 LoopAnalysisManager &AM,
1724 LoopStandardAnalysisResults &AR,
1725 LPMUpdater &U) {
1726 Function &F = *LN.getParent();
1727 SmallVector<Loop *, 8> LoopList(LN.getLoops());
1728 // Ensure minimum depth of the loop nest to do the interchange.
1729 if (!hasMinimumLoopDepth(LoopList))
1730 return PreservedAnalyses::all();
1732 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1733 std::unique_ptr<CacheCost> CC =
1734 CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
1735 OptimizationRemarkEmitter ORE(&F);
1736 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
1737 return PreservedAnalyses::all();
1738 U.markLoopNestChanged(true);
1739 return getLoopPassPreservedAnalyses();