1 //===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements some loop unrolling utilities. It does not define any
10 // actual pass or policy, but provides a single function to perform loop
13 // The process of unrolling can produce extraneous basic blocks linked with
14 // unconditional branches. This will be corrected in the future.
16 //===----------------------------------------------------------------------===//
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/DenseMap.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/SetVector.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/Statistic.h"
24 #include "llvm/ADT/StringRef.h"
25 #include "llvm/ADT/Twine.h"
26 #include "llvm/ADT/ilist_iterator.h"
27 #include "llvm/Analysis/AssumptionCache.h"
28 #include "llvm/Analysis/DomTreeUpdater.h"
29 #include "llvm/Analysis/InstructionSimplify.h"
30 #include "llvm/Analysis/LoopInfo.h"
31 #include "llvm/Analysis/LoopIterator.h"
32 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
33 #include "llvm/Analysis/ScalarEvolution.h"
34 #include "llvm/IR/BasicBlock.h"
35 #include "llvm/IR/CFG.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/DebugInfoMetadata.h"
38 #include "llvm/IR/DebugLoc.h"
39 #include "llvm/IR/DiagnosticInfo.h"
40 #include "llvm/IR/Dominators.h"
41 #include "llvm/IR/Function.h"
42 #include "llvm/IR/Instruction.h"
43 #include "llvm/IR/Instructions.h"
44 #include "llvm/IR/IntrinsicInst.h"
45 #include "llvm/IR/Metadata.h"
46 #include "llvm/IR/Module.h"
47 #include "llvm/IR/PatternMatch.h"
48 #include "llvm/IR/Use.h"
49 #include "llvm/IR/User.h"
50 #include "llvm/IR/ValueHandle.h"
51 #include "llvm/IR/ValueMap.h"
52 #include "llvm/Support/Casting.h"
53 #include "llvm/Support/CommandLine.h"
54 #include "llvm/Support/Debug.h"
55 #include "llvm/Support/GenericDomTree.h"
56 #include "llvm/Support/MathExtras.h"
57 #include "llvm/Support/raw_ostream.h"
58 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
59 #include "llvm/Transforms/Utils/Cloning.h"
60 #include "llvm/Transforms/Utils/Local.h"
61 #include "llvm/Transforms/Utils/LoopSimplify.h"
62 #include "llvm/Transforms/Utils/LoopUtils.h"
63 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
64 #include "llvm/Transforms/Utils/UnrollLoop.h"
65 #include "llvm/Transforms/Utils/ValueMapper.h"
69 #include <type_traits>
79 #define DEBUG_TYPE "loop-unroll"
81 // TODO: Should these be here or in LoopUnroll?
82 STATISTIC(NumCompletelyUnrolled
, "Number of loops completely unrolled");
83 STATISTIC(NumUnrolled
, "Number of loops unrolled (completely or otherwise)");
84 STATISTIC(NumUnrolledNotLatch
, "Number of loops unrolled without a conditional "
85 "latch (completely or otherwise)");
88 UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden
,
89 cl::desc("Allow runtime unrolled loops to be unrolled "
90 "with epilog instead of prolog."));
93 UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden
,
94 cl::desc("Verify domtree after unrolling"),
95 #ifdef EXPENSIVE_CHECKS
103 UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden
,
104 cl::desc("Verify loopinfo after unrolling"),
105 #ifdef EXPENSIVE_CHECKS
113 /// Check if unrolling created a situation where we need to insert phi nodes to
114 /// preserve LCSSA form.
115 /// \param Blocks is a vector of basic blocks representing unrolled loop.
116 /// \param L is the outer loop.
117 /// It's possible that some of the blocks are in L, and some are not. In this
118 /// case, if there is a use is outside L, and definition is inside L, we need to
119 /// insert a phi-node, otherwise LCSSA will be broken.
120 /// The function is just a helper function for llvm::UnrollLoop that returns
121 /// true if this situation occurs, indicating that LCSSA needs to be fixed.
122 static bool needToInsertPhisForLCSSA(Loop
*L
,
123 const std::vector
<BasicBlock
*> &Blocks
,
125 for (BasicBlock
*BB
: Blocks
) {
126 if (LI
->getLoopFor(BB
) == L
)
128 for (Instruction
&I
: *BB
) {
129 for (Use
&U
: I
.operands()) {
130 if (const auto *Def
= dyn_cast
<Instruction
>(U
)) {
131 Loop
*DefLoop
= LI
->getLoopFor(Def
->getParent());
134 if (DefLoop
->contains(L
))
143 /// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
144 /// and adds a mapping from the original loop to the new loop to NewLoops.
145 /// Returns nullptr if no new loop was created and a pointer to the
146 /// original loop OriginalBB was part of otherwise.
147 const Loop
* llvm::addClonedBlockToLoopInfo(BasicBlock
*OriginalBB
,
148 BasicBlock
*ClonedBB
, LoopInfo
*LI
,
149 NewLoopsMap
&NewLoops
) {
150 // Figure out which loop New is in.
151 const Loop
*OldLoop
= LI
->getLoopFor(OriginalBB
);
152 assert(OldLoop
&& "Should (at least) be in the loop being unrolled!");
154 Loop
*&NewLoop
= NewLoops
[OldLoop
];
156 // Found a new sub-loop.
157 assert(OriginalBB
== OldLoop
->getHeader() &&
158 "Header should be first in RPO");
160 NewLoop
= LI
->AllocateLoop();
161 Loop
*NewLoopParent
= NewLoops
.lookup(OldLoop
->getParentLoop());
164 NewLoopParent
->addChildLoop(NewLoop
);
166 LI
->addTopLevelLoop(NewLoop
);
168 NewLoop
->addBasicBlockToLoop(ClonedBB
, *LI
);
171 NewLoop
->addBasicBlockToLoop(ClonedBB
, *LI
);
176 /// The function chooses which type of unroll (epilog or prolog) is more
178 /// Epilog unroll is more profitable when there is PHI that starts from
179 /// constant. In this case epilog will leave PHI start from constant,
180 /// but prolog will convert it to non-constant.
183 /// PN = PHI [I, Latch], [CI, PreHeader]
187 /// Epilog unroll case.
189 /// PN = PHI [I2, Latch], [CI, PreHeader]
193 /// Prolog unroll case.
194 /// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
196 /// PN = PHI [I2, Latch], [NewPN, PreHeader]
201 static bool isEpilogProfitable(Loop
*L
) {
202 BasicBlock
*PreHeader
= L
->getLoopPreheader();
203 BasicBlock
*Header
= L
->getHeader();
204 assert(PreHeader
&& Header
);
205 for (const PHINode
&PN
: Header
->phis()) {
206 if (isa
<ConstantInt
>(PN
.getIncomingValueForBlock(PreHeader
)))
212 /// Perform some cleanup and simplifications on loops after unrolling. It is
213 /// useful to simplify the IV's in the new loop, as well as do a quick
214 /// simplify/dce pass of the instructions.
215 void llvm::simplifyLoopAfterUnroll(Loop
*L
, bool SimplifyIVs
, LoopInfo
*LI
,
216 ScalarEvolution
*SE
, DominatorTree
*DT
,
218 const TargetTransformInfo
*TTI
) {
219 using namespace llvm::PatternMatch
;
221 // Simplify any new induction variables in the partially unrolled loop.
222 if (SE
&& SimplifyIVs
) {
223 SmallVector
<WeakTrackingVH
, 16> DeadInsts
;
224 simplifyLoopIVs(L
, SE
, DT
, LI
, TTI
, DeadInsts
);
226 // Aggressively clean up dead instructions that simplifyLoopIVs already
227 // identified. Any remaining should be cleaned up below.
228 while (!DeadInsts
.empty()) {
229 Value
*V
= DeadInsts
.pop_back_val();
230 if (Instruction
*Inst
= dyn_cast_or_null
<Instruction
>(V
))
231 RecursivelyDeleteTriviallyDeadInstructions(Inst
);
235 // At this point, the code is well formed. Perform constprop, instsimplify,
237 const DataLayout
&DL
= L
->getHeader()->getModule()->getDataLayout();
238 SmallVector
<WeakTrackingVH
, 16> DeadInsts
;
239 for (BasicBlock
*BB
: L
->getBlocks()) {
240 for (Instruction
&Inst
: llvm::make_early_inc_range(*BB
)) {
241 if (Value
*V
= simplifyInstruction(&Inst
, {DL
, nullptr, DT
, AC
}))
242 if (LI
->replacementPreservesLCSSAForm(&Inst
, V
))
243 Inst
.replaceAllUsesWith(V
);
244 if (isInstructionTriviallyDead(&Inst
))
245 DeadInsts
.emplace_back(&Inst
);
247 // Fold ((add X, C1), C2) to (add X, C1+C2). This is very common in
248 // unrolled loops, and handling this early allows following code to
249 // identify the IV as a "simple recurrence" without first folding away
250 // a long chain of adds.
253 const APInt
*C1
, *C2
;
254 if (match(&Inst
, m_Add(m_Add(m_Value(X
), m_APInt(C1
)), m_APInt(C2
)))) {
255 auto *InnerI
= dyn_cast
<Instruction
>(Inst
.getOperand(0));
256 auto *InnerOBO
= cast
<OverflowingBinaryOperator
>(Inst
.getOperand(0));
258 APInt NewC
= C1
->sadd_ov(*C2
, SignedOverflow
);
259 Inst
.setOperand(0, X
);
260 Inst
.setOperand(1, ConstantInt::get(Inst
.getType(), NewC
));
261 Inst
.setHasNoUnsignedWrap(Inst
.hasNoUnsignedWrap() &&
262 InnerOBO
->hasNoUnsignedWrap());
263 Inst
.setHasNoSignedWrap(Inst
.hasNoSignedWrap() &&
264 InnerOBO
->hasNoSignedWrap() &&
266 if (InnerI
&& isInstructionTriviallyDead(InnerI
))
267 DeadInsts
.emplace_back(InnerI
);
271 // We can't do recursive deletion until we're done iterating, as we might
272 // have a phi which (potentially indirectly) uses instructions later in
273 // the block we're iterating through.
274 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts
);
278 /// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
279 /// can only fail when the loop's latch block is not terminated by a conditional
280 /// branch instruction. However, if the trip count (and multiple) are not known,
281 /// loop unrolling will mostly produce more code that is no faster.
283 /// If Runtime is true then UnrollLoop will try to insert a prologue or
284 /// epilogue that ensures the latch has a trip multiple of Count. UnrollLoop
285 /// will not runtime-unroll the loop if computing the run-time trip count will
286 /// be expensive and AllowExpensiveTripCount is false.
288 /// The LoopInfo Analysis that is passed will be kept consistent.
290 /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
291 /// DominatorTree if they are non-null.
293 /// If RemainderLoop is non-null, it will receive the remainder loop (if
294 /// required and not fully unrolled).
295 LoopUnrollResult
llvm::UnrollLoop(Loop
*L
, UnrollLoopOptions ULO
, LoopInfo
*LI
,
296 ScalarEvolution
*SE
, DominatorTree
*DT
,
298 const TargetTransformInfo
*TTI
,
299 OptimizationRemarkEmitter
*ORE
,
300 bool PreserveLCSSA
, Loop
**RemainderLoop
) {
301 assert(DT
&& "DomTree is required");
303 if (!L
->getLoopPreheader()) {
304 LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
305 return LoopUnrollResult::Unmodified
;
308 if (!L
->getLoopLatch()) {
309 LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
310 return LoopUnrollResult::Unmodified
;
313 // Loops with indirectbr cannot be cloned.
314 if (!L
->isSafeToClone()) {
315 LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
316 return LoopUnrollResult::Unmodified
;
319 if (L
->getHeader()->hasAddressTaken()) {
320 // The loop-rotate pass can be helpful to avoid this in many cases.
322 dbgs() << " Won't unroll loop: address of header block is taken.\n");
323 return LoopUnrollResult::Unmodified
;
326 assert(ULO
.Count
> 0);
328 // All these values should be taken only after peeling because they might have
330 BasicBlock
*Preheader
= L
->getLoopPreheader();
331 BasicBlock
*Header
= L
->getHeader();
332 BasicBlock
*LatchBlock
= L
->getLoopLatch();
333 SmallVector
<BasicBlock
*, 4> ExitBlocks
;
334 L
->getExitBlocks(ExitBlocks
);
335 std::vector
<BasicBlock
*> OriginalLoopBlocks
= L
->getBlocks();
337 const unsigned MaxTripCount
= SE
->getSmallConstantMaxTripCount(L
);
338 const bool MaxOrZero
= SE
->isBackedgeTakenCountMaxOrZero(L
);
339 unsigned EstimatedLoopInvocationWeight
= 0;
340 std::optional
<unsigned> OriginalTripCount
=
341 llvm::getLoopEstimatedTripCount(L
, &EstimatedLoopInvocationWeight
);
343 // Effectively "DCE" unrolled iterations that are beyond the max tripcount
344 // and will never be executed.
345 if (MaxTripCount
&& ULO
.Count
> MaxTripCount
)
346 ULO
.Count
= MaxTripCount
;
350 unsigned TripMultiple
;
351 unsigned BreakoutTrip
;
353 BasicBlock
*FirstExitingBlock
= nullptr;
354 SmallVector
<BasicBlock
*> ExitingBlocks
;
356 DenseMap
<BasicBlock
*, ExitInfo
> ExitInfos
;
357 SmallVector
<BasicBlock
*, 4> ExitingBlocks
;
358 L
->getExitingBlocks(ExitingBlocks
);
359 for (auto *ExitingBlock
: ExitingBlocks
) {
360 // The folding code is not prepared to deal with non-branch instructions
362 auto *BI
= dyn_cast
<BranchInst
>(ExitingBlock
->getTerminator());
366 ExitInfo
&Info
= ExitInfos
.try_emplace(ExitingBlock
).first
->second
;
367 Info
.TripCount
= SE
->getSmallConstantTripCount(L
, ExitingBlock
);
368 Info
.TripMultiple
= SE
->getSmallConstantTripMultiple(L
, ExitingBlock
);
369 if (Info
.TripCount
!= 0) {
370 Info
.BreakoutTrip
= Info
.TripCount
% ULO
.Count
;
371 Info
.TripMultiple
= 0;
373 Info
.BreakoutTrip
= Info
.TripMultiple
=
374 (unsigned)std::gcd(ULO
.Count
, Info
.TripMultiple
);
376 Info
.ExitOnTrue
= !L
->contains(BI
->getSuccessor(0));
377 Info
.ExitingBlocks
.push_back(ExitingBlock
);
378 LLVM_DEBUG(dbgs() << " Exiting block %" << ExitingBlock
->getName()
379 << ": TripCount=" << Info
.TripCount
380 << ", TripMultiple=" << Info
.TripMultiple
381 << ", BreakoutTrip=" << Info
.BreakoutTrip
<< "\n");
384 // Are we eliminating the loop control altogether? Note that we can know
385 // we're eliminating the backedge without knowing exactly which iteration
386 // of the unrolled body exits.
387 const bool CompletelyUnroll
= ULO
.Count
== MaxTripCount
;
389 const bool PreserveOnlyFirst
= CompletelyUnroll
&& MaxOrZero
;
391 // There's no point in performing runtime unrolling if this unroll count
392 // results in a full unroll.
393 if (CompletelyUnroll
)
396 // Go through all exits of L and see if there are any phi-nodes there. We just
397 // conservatively assume that they're inserted to preserve LCSSA form, which
398 // means that complete unrolling might break this form. We need to either fix
399 // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
400 // now we just recompute LCSSA for the outer loop, but it should be possible
401 // to fix it in-place.
402 bool NeedToFixLCSSA
=
403 PreserveLCSSA
&& CompletelyUnroll
&&
405 [](const BasicBlock
*BB
) { return isa
<PHINode
>(BB
->begin()); });
407 // The current loop unroll pass can unroll loops that have
408 // (1) single latch; and
409 // (2a) latch is unconditional; or
410 // (2b) latch is conditional and is an exiting block
411 // FIXME: The implementation can be extended to work with more complicated
412 // cases, e.g. loops with multiple latches.
413 BranchInst
*LatchBI
= dyn_cast
<BranchInst
>(LatchBlock
->getTerminator());
415 // A conditional branch which exits the loop, which can be optimized to an
416 // unconditional branch in the unrolled loop in some cases.
417 bool LatchIsExiting
= L
->isLoopExiting(LatchBlock
);
418 if (!LatchBI
|| (LatchBI
->isConditional() && !LatchIsExiting
)) {
420 dbgs() << "Can't unroll; a conditional latch must exit the loop");
421 return LoopUnrollResult::Unmodified
;
424 // Loops containing convergent instructions cannot use runtime unrolling,
425 // as the prologue/epilogue may add additional control-dependencies to
426 // convergent operations.
429 bool HasConvergent
= false;
430 for (auto &BB
: L
->blocks())
432 if (auto *CB
= dyn_cast
<CallBase
>(&I
))
433 HasConvergent
|= CB
->isConvergent();
434 assert((!HasConvergent
|| !ULO
.Runtime
) &&
435 "Can't runtime unroll if loop contains a convergent operation.");
438 bool EpilogProfitability
=
439 UnrollRuntimeEpilog
.getNumOccurrences() ? UnrollRuntimeEpilog
440 : isEpilogProfitable(L
);
443 !UnrollRuntimeLoopRemainder(L
, ULO
.Count
, ULO
.AllowExpensiveTripCount
,
444 EpilogProfitability
, ULO
.UnrollRemainder
,
445 ULO
.ForgetAllSCEV
, LI
, SE
, DT
, AC
, TTI
,
446 PreserveLCSSA
, RemainderLoop
)) {
450 LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
451 "generated when assuming runtime trip count\n");
452 return LoopUnrollResult::Unmodified
;
457 // Report the unrolling decision.
458 if (CompletelyUnroll
) {
459 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header
->getName()
460 << " with trip count " << ULO
.Count
<< "!\n");
463 return OptimizationRemark(DEBUG_TYPE
, "FullyUnrolled", L
->getStartLoc(),
465 << "completely unrolled loop with "
466 << NV("UnrollCount", ULO
.Count
) << " iterations";
469 LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header
->getName() << " by "
472 LLVM_DEBUG(dbgs() << " with run-time trip count");
473 LLVM_DEBUG(dbgs() << "!\n");
477 OptimizationRemark
Diag(DEBUG_TYPE
, "PartialUnrolled", L
->getStartLoc(),
479 Diag
<< "unrolled loop by a factor of " << NV("UnrollCount", ULO
.Count
);
481 Diag
<< " with run-time trip count";
486 // We are going to make changes to this loop. SCEV may be keeping cached info
487 // about it, in particular about backedge taken count. The changes we make
488 // are guaranteed to invalidate this information for our loop. It is tempting
489 // to only invalidate the loop being unrolled, but it is incorrect as long as
490 // all exiting branches from all inner loops have impact on the outer loops,
491 // and if something changes inside them then any of outer loops may also
492 // change. When we forget outermost loop, we also forget all contained loops
493 // and this is what we need here.
495 if (ULO
.ForgetAllSCEV
)
496 SE
->forgetAllLoops();
498 SE
->forgetTopmostLoop(L
);
499 SE
->forgetBlockAndLoopDispositions();
504 ++NumUnrolledNotLatch
;
506 // For the first iteration of the loop, we should use the precloned values for
507 // PHI nodes. Insert associations now.
508 ValueToValueMapTy LastValueMap
;
509 std::vector
<PHINode
*> OrigPHINode
;
510 for (BasicBlock::iterator I
= Header
->begin(); isa
<PHINode
>(I
); ++I
) {
511 OrigPHINode
.push_back(cast
<PHINode
>(I
));
514 std::vector
<BasicBlock
*> Headers
;
515 std::vector
<BasicBlock
*> Latches
;
516 Headers
.push_back(Header
);
517 Latches
.push_back(LatchBlock
);
519 // The current on-the-fly SSA update requires blocks to be processed in
520 // reverse postorder so that LastValueMap contains the correct value at each
522 LoopBlocksDFS
DFS(L
);
525 // Stash the DFS iterators before adding blocks to the loop.
526 LoopBlocksDFS::RPOIterator BlockBegin
= DFS
.beginRPO();
527 LoopBlocksDFS::RPOIterator BlockEnd
= DFS
.endRPO();
529 std::vector
<BasicBlock
*> UnrolledLoopBlocks
= L
->getBlocks();
531 // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
532 // might break loop-simplified form for these loops (as they, e.g., would
533 // share the same exit blocks). We'll keep track of loops for which we can
534 // break this so that later we can re-simplify them.
535 SmallSetVector
<Loop
*, 4> LoopsToSimplify
;
536 for (Loop
*SubLoop
: *L
)
537 LoopsToSimplify
.insert(SubLoop
);
539 // When a FSDiscriminator is enabled, we don't need to add the multiply
540 // factors to the discriminators.
541 if (Header
->getParent()->shouldEmitDebugInfoForProfiling() &&
542 !EnableFSDiscriminator
)
543 for (BasicBlock
*BB
: L
->getBlocks())
544 for (Instruction
&I
: *BB
)
545 if (!I
.isDebugOrPseudoInst())
546 if (const DILocation
*DIL
= I
.getDebugLoc()) {
547 auto NewDIL
= DIL
->cloneByMultiplyingDuplicationFactor(ULO
.Count
);
549 I
.setDebugLoc(*NewDIL
);
552 << "Failed to create new discriminator: "
553 << DIL
->getFilename() << " Line: " << DIL
->getLine());
556 // Identify what noalias metadata is inside the loop: if it is inside the
557 // loop, the associated metadata must be cloned for each iteration.
558 SmallVector
<MDNode
*, 6> LoopLocalNoAliasDeclScopes
;
559 identifyNoAliasScopesToClone(L
->getBlocks(), LoopLocalNoAliasDeclScopes
);
561 // We place the unrolled iterations immediately after the original loop
562 // latch. This is a reasonable default placement if we don't have block
563 // frequencies, and if we do, well the layout will be adjusted later.
564 auto BlockInsertPt
= std::next(LatchBlock
->getIterator());
565 for (unsigned It
= 1; It
!= ULO
.Count
; ++It
) {
566 SmallVector
<BasicBlock
*, 8> NewBlocks
;
567 SmallDenseMap
<const Loop
*, Loop
*, 4> NewLoops
;
570 for (LoopBlocksDFS::RPOIterator BB
= BlockBegin
; BB
!= BlockEnd
; ++BB
) {
571 ValueToValueMapTy VMap
;
572 BasicBlock
*New
= CloneBasicBlock(*BB
, VMap
, "." + Twine(It
));
573 Header
->getParent()->insert(BlockInsertPt
, New
);
575 assert((*BB
!= Header
|| LI
->getLoopFor(*BB
) == L
) &&
576 "Header should not be in a sub-loop");
577 // Tell LI about New.
578 const Loop
*OldLoop
= addClonedBlockToLoopInfo(*BB
, New
, LI
, NewLoops
);
580 LoopsToSimplify
.insert(NewLoops
[OldLoop
]);
583 // Loop over all of the PHI nodes in the block, changing them to use
584 // the incoming values from the previous block.
585 for (PHINode
*OrigPHI
: OrigPHINode
) {
586 PHINode
*NewPHI
= cast
<PHINode
>(VMap
[OrigPHI
]);
587 Value
*InVal
= NewPHI
->getIncomingValueForBlock(LatchBlock
);
588 if (Instruction
*InValI
= dyn_cast
<Instruction
>(InVal
))
589 if (It
> 1 && L
->contains(InValI
))
590 InVal
= LastValueMap
[InValI
];
591 VMap
[OrigPHI
] = InVal
;
592 NewPHI
->eraseFromParent();
595 // Update our running map of newest clones
596 LastValueMap
[*BB
] = New
;
597 for (ValueToValueMapTy::iterator VI
= VMap
.begin(), VE
= VMap
.end();
599 LastValueMap
[VI
->first
] = VI
->second
;
601 // Add phi entries for newly created values to all exit blocks.
602 for (BasicBlock
*Succ
: successors(*BB
)) {
603 if (L
->contains(Succ
))
605 for (PHINode
&PHI
: Succ
->phis()) {
606 Value
*Incoming
= PHI
.getIncomingValueForBlock(*BB
);
607 ValueToValueMapTy::iterator It
= LastValueMap
.find(Incoming
);
608 if (It
!= LastValueMap
.end())
609 Incoming
= It
->second
;
610 PHI
.addIncoming(Incoming
, New
);
611 SE
->forgetValue(&PHI
);
614 // Keep track of new headers and latches as we create them, so that
615 // we can insert the proper branches later.
617 Headers
.push_back(New
);
618 if (*BB
== LatchBlock
)
619 Latches
.push_back(New
);
621 // Keep track of the exiting block and its successor block contained in
622 // the loop for the current iteration.
623 auto ExitInfoIt
= ExitInfos
.find(*BB
);
624 if (ExitInfoIt
!= ExitInfos
.end())
625 ExitInfoIt
->second
.ExitingBlocks
.push_back(New
);
627 NewBlocks
.push_back(New
);
628 UnrolledLoopBlocks
.push_back(New
);
630 // Update DomTree: since we just copy the loop body, and each copy has a
631 // dedicated entry block (copy of the header block), this header's copy
632 // dominates all copied blocks. That means, dominance relations in the
633 // copied body are the same as in the original body.
635 DT
->addNewBlock(New
, Latches
[It
- 1]);
637 auto BBDomNode
= DT
->getNode(*BB
);
638 auto BBIDom
= BBDomNode
->getIDom();
639 BasicBlock
*OriginalBBIDom
= BBIDom
->getBlock();
641 New
, cast
<BasicBlock
>(LastValueMap
[cast
<Value
>(OriginalBBIDom
)]));
645 // Remap all instructions in the most recent iteration
646 remapInstructionsInBlocks(NewBlocks
, LastValueMap
);
647 for (BasicBlock
*NewBlock
: NewBlocks
)
648 for (Instruction
&I
: *NewBlock
)
649 if (auto *II
= dyn_cast
<AssumeInst
>(&I
))
650 AC
->registerAssumption(II
);
653 // Identify what other metadata depends on the cloned version. After
654 // cloning, replace the metadata with the corrected version for both
655 // memory instructions and noalias intrinsics.
656 std::string ext
= (Twine("It") + Twine(It
)).str();
657 cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes
, NewBlocks
,
658 Header
->getContext(), ext
);
662 // Loop over the PHI nodes in the original block, setting incoming values.
663 for (PHINode
*PN
: OrigPHINode
) {
664 if (CompletelyUnroll
) {
665 PN
->replaceAllUsesWith(PN
->getIncomingValueForBlock(Preheader
));
666 PN
->eraseFromParent();
667 } else if (ULO
.Count
> 1) {
668 Value
*InVal
= PN
->removeIncomingValue(LatchBlock
, false);
669 // If this value was defined in the loop, take the value defined by the
670 // last iteration of the loop.
671 if (Instruction
*InValI
= dyn_cast
<Instruction
>(InVal
)) {
672 if (L
->contains(InValI
))
673 InVal
= LastValueMap
[InVal
];
675 assert(Latches
.back() == LastValueMap
[LatchBlock
] && "bad last latch");
676 PN
->addIncoming(InVal
, Latches
.back());
680 // Connect latches of the unrolled iterations to the headers of the next
681 // iteration. Currently they point to the header of the same iteration.
682 for (unsigned i
= 0, e
= Latches
.size(); i
!= e
; ++i
) {
683 unsigned j
= (i
+ 1) % e
;
684 Latches
[i
]->getTerminator()->replaceSuccessorWith(Headers
[i
], Headers
[j
]);
687 // Update dominators of blocks we might reach through exits.
688 // Immediate dominator of such block might change, because we add more
689 // routes which can lead to the exit: we can now reach it from the copied
692 for (auto *BB
: OriginalLoopBlocks
) {
693 auto *BBDomNode
= DT
->getNode(BB
);
694 SmallVector
<BasicBlock
*, 16> ChildrenToUpdate
;
695 for (auto *ChildDomNode
: BBDomNode
->children()) {
696 auto *ChildBB
= ChildDomNode
->getBlock();
697 if (!L
->contains(ChildBB
))
698 ChildrenToUpdate
.push_back(ChildBB
);
700 // The new idom of the block will be the nearest common dominator
701 // of all copies of the previous idom. This is equivalent to the
702 // nearest common dominator of the previous idom and the first latch,
703 // which dominates all copies of the previous idom.
704 BasicBlock
*NewIDom
= DT
->findNearestCommonDominator(BB
, LatchBlock
);
705 for (auto *ChildBB
: ChildrenToUpdate
)
706 DT
->changeImmediateDominator(ChildBB
, NewIDom
);
710 assert(!UnrollVerifyDomtree
||
711 DT
->verify(DominatorTree::VerificationLevel::Fast
));
713 SmallVector
<DominatorTree::UpdateType
> DTUpdates
;
714 auto SetDest
= [&](BasicBlock
*Src
, bool WillExit
, bool ExitOnTrue
) {
715 auto *Term
= cast
<BranchInst
>(Src
->getTerminator());
716 const unsigned Idx
= ExitOnTrue
^ WillExit
;
717 BasicBlock
*Dest
= Term
->getSuccessor(Idx
);
718 BasicBlock
*DeadSucc
= Term
->getSuccessor(1-Idx
);
720 // Remove predecessors from all non-Dest successors.
721 DeadSucc
->removePredecessor(Src
, /* KeepOneInputPHIs */ true);
723 // Replace the conditional branch with an unconditional one.
724 BranchInst::Create(Dest
, Term
);
725 Term
->eraseFromParent();
727 DTUpdates
.emplace_back(DominatorTree::Delete
, Src
, DeadSucc
);
730 auto WillExit
= [&](const ExitInfo
&Info
, unsigned i
, unsigned j
,
731 bool IsLatch
) -> std::optional
<bool> {
732 if (CompletelyUnroll
) {
733 if (PreserveOnlyFirst
) {
738 // Complete (but possibly inexact) unrolling
741 if (Info
.TripCount
&& j
!= Info
.TripCount
)
747 // If runtime unrolling inserts a prologue, information about non-latch
748 // exits may be stale.
749 if (IsLatch
&& j
!= 0)
754 if (j
!= Info
.BreakoutTrip
&&
755 (Info
.TripMultiple
== 0 || j
% Info
.TripMultiple
!= 0)) {
756 // If we know the trip count or a multiple of it, we can safely use an
757 // unconditional branch for some iterations.
763 // Fold branches for iterations where we know that they will exit or not
765 for (auto &Pair
: ExitInfos
) {
766 ExitInfo
&Info
= Pair
.second
;
767 for (unsigned i
= 0, e
= Info
.ExitingBlocks
.size(); i
!= e
; ++i
) {
768 // The branch destination.
769 unsigned j
= (i
+ 1) % e
;
770 bool IsLatch
= Pair
.first
== LatchBlock
;
771 std::optional
<bool> KnownWillExit
= WillExit(Info
, i
, j
, IsLatch
);
772 if (!KnownWillExit
) {
773 if (!Info
.FirstExitingBlock
)
774 Info
.FirstExitingBlock
= Info
.ExitingBlocks
[i
];
778 // We don't fold known-exiting branches for non-latch exits here,
779 // because this ensures that both all loop blocks and all exit blocks
780 // remain reachable in the CFG.
781 // TODO: We could fold these branches, but it would require much more
782 // sophisticated updates to LoopInfo.
783 if (*KnownWillExit
&& !IsLatch
) {
784 if (!Info
.FirstExitingBlock
)
785 Info
.FirstExitingBlock
= Info
.ExitingBlocks
[i
];
789 SetDest(Info
.ExitingBlocks
[i
], *KnownWillExit
, Info
.ExitOnTrue
);
793 DomTreeUpdater
DTU(DT
, DomTreeUpdater::UpdateStrategy::Lazy
);
794 DomTreeUpdater
*DTUToUse
= &DTU
;
795 if (ExitingBlocks
.size() == 1 && ExitInfos
.size() == 1) {
796 // Manually update the DT if there's a single exiting node. In that case
797 // there's a single exit node and it is sufficient to update the nodes
798 // immediately dominated by the original exiting block. They will become
799 // dominated by the first exiting block that leaves the loop after
800 // unrolling. Note that the CFG inside the loop does not change, so there's
801 // no need to update the DT inside the unrolled loop.
803 auto &[OriginalExit
, Info
] = *ExitInfos
.begin();
804 if (!Info
.FirstExitingBlock
)
805 Info
.FirstExitingBlock
= Info
.ExitingBlocks
.back();
806 for (auto *C
: to_vector(DT
->getNode(OriginalExit
)->children())) {
807 if (L
->contains(C
->getBlock()))
809 C
->setIDom(DT
->getNode(Info
.FirstExitingBlock
));
812 DTU
.applyUpdates(DTUpdates
);
815 // When completely unrolling, the last latch becomes unreachable.
816 if (!LatchIsExiting
&& CompletelyUnroll
) {
817 // There is no need to update the DT here, because there must be a unique
818 // latch. Hence if the latch is not exiting it must directly branch back to
819 // the original loop header and does not dominate any nodes.
820 assert(LatchBlock
->getSingleSuccessor() && "Loop with multiple latches?");
821 changeToUnreachable(Latches
.back()->getTerminator(), PreserveLCSSA
);
824 // Merge adjacent basic blocks, if possible.
825 for (BasicBlock
*Latch
: Latches
) {
826 BranchInst
*Term
= dyn_cast
<BranchInst
>(Latch
->getTerminator());
828 (CompletelyUnroll
&& !LatchIsExiting
&& Latch
== Latches
.back())) &&
829 "Need a branch as terminator, except when fully unrolling with "
830 "unconditional latch");
831 if (Term
&& Term
->isUnconditional()) {
832 BasicBlock
*Dest
= Term
->getSuccessor(0);
833 BasicBlock
*Fold
= Dest
->getUniquePredecessor();
834 if (MergeBlockIntoPredecessor(Dest
, /*DTU=*/DTUToUse
, LI
,
835 /*MSSAU=*/nullptr, /*MemDep=*/nullptr,
836 /*PredecessorWithTwoSuccessors=*/false,
837 DTUToUse
? nullptr : DT
)) {
838 // Dest has been folded into Fold. Update our worklists accordingly.
839 std::replace(Latches
.begin(), Latches
.end(), Dest
, Fold
);
840 llvm::erase(UnrolledLoopBlocks
, Dest
);
846 // Apply updates to the DomTree.
847 DT
= &DTU
.getDomTree();
849 assert(!UnrollVerifyDomtree
||
850 DT
->verify(DominatorTree::VerificationLevel::Fast
));
852 // At this point, the code is well formed. We now simplify the unrolled loop,
853 // doing constant propagation and dead code elimination as we go.
854 simplifyLoopAfterUnroll(L
, !CompletelyUnroll
&& ULO
.Count
> 1, LI
, SE
, DT
, AC
,
857 NumCompletelyUnrolled
+= CompletelyUnroll
;
860 Loop
*OuterL
= L
->getParentLoop();
861 // Update LoopInfo if the loop is completely removed.
862 if (CompletelyUnroll
) {
864 // We shouldn't try to use `L` anymore.
866 } else if (OriginalTripCount
) {
867 // Update the trip count. Note that the remainder has already logic
868 // computing it in `UnrollRuntimeLoopRemainder`.
869 setLoopEstimatedTripCount(L
, *OriginalTripCount
/ ULO
.Count
,
870 EstimatedLoopInvocationWeight
);
873 // LoopInfo should not be valid, confirm that.
874 if (UnrollVerifyLoopInfo
)
877 // After complete unrolling most of the blocks should be contained in OuterL.
878 // However, some of them might happen to be out of OuterL (e.g. if they
879 // precede a loop exit). In this case we might need to insert PHI nodes in
880 // order to preserve LCSSA form.
881 // We don't need to check this if we already know that we need to fix LCSSA
883 // TODO: For now we just recompute LCSSA for the outer loop in this case, but
884 // it should be possible to fix it in-place.
885 if (PreserveLCSSA
&& OuterL
&& CompletelyUnroll
&& !NeedToFixLCSSA
)
886 NeedToFixLCSSA
|= ::needToInsertPhisForLCSSA(OuterL
, UnrolledLoopBlocks
, LI
);
888 // Make sure that loop-simplify form is preserved. We want to simplify
889 // at least one layer outside of the loop that was unrolled so that any
890 // changes to the parent loop exposed by the unrolling are considered.
892 // OuterL includes all loops for which we can break loop-simplify, so
893 // it's sufficient to simplify only it (it'll recursively simplify inner
895 if (NeedToFixLCSSA
) {
896 // LCSSA must be performed on the outermost affected loop. The unrolled
897 // loop's last loop latch is guaranteed to be in the outermost loop
898 // after LoopInfo's been updated by LoopInfo::erase.
899 Loop
*LatchLoop
= LI
->getLoopFor(Latches
.back());
900 Loop
*FixLCSSALoop
= OuterL
;
901 if (!FixLCSSALoop
->contains(LatchLoop
))
902 while (FixLCSSALoop
->getParentLoop() != LatchLoop
)
903 FixLCSSALoop
= FixLCSSALoop
->getParentLoop();
905 formLCSSARecursively(*FixLCSSALoop
, *DT
, LI
, SE
);
906 } else if (PreserveLCSSA
) {
907 assert(OuterL
->isLCSSAForm(*DT
) &&
908 "Loops should be in LCSSA form after loop-unroll.");
911 // TODO: That potentially might be compile-time expensive. We should try
912 // to fix the loop-simplified form incrementally.
913 simplifyLoop(OuterL
, DT
, LI
, SE
, AC
, nullptr, PreserveLCSSA
);
915 // Simplify loops for which we might've broken loop-simplify form.
916 for (Loop
*SubLoop
: LoopsToSimplify
)
917 simplifyLoop(SubLoop
, DT
, LI
, SE
, AC
, nullptr, PreserveLCSSA
);
920 return CompletelyUnroll
? LoopUnrollResult::FullyUnrolled
921 : LoopUnrollResult::PartiallyUnrolled
;
924 /// Given an llvm.loop loop id metadata node, returns the loop hint metadata
925 /// node with the given name (for example, "llvm.loop.unroll.count"). If no
926 /// such metadata node exists, then nullptr is returned.
927 MDNode
*llvm::GetUnrollMetadata(MDNode
*LoopID
, StringRef Name
) {
928 // First operand should refer to the loop id itself.
929 assert(LoopID
->getNumOperands() > 0 && "requires at least one operand");
930 assert(LoopID
->getOperand(0) == LoopID
&& "invalid loop id");
932 for (unsigned i
= 1, e
= LoopID
->getNumOperands(); i
< e
; ++i
) {
933 MDNode
*MD
= dyn_cast
<MDNode
>(LoopID
->getOperand(i
));
937 MDString
*S
= dyn_cast
<MDString
>(MD
->getOperand(0));
941 if (Name
.equals(S
->getString()))