1 //===-- UnrollLoopPeel.cpp - Loop peeling utilities -----------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file implements some loop unrolling utilities for peeling loops
11 // with dynamically inferred (from PGO) trip counts. See LoopUnroll.cpp for
12 // unrolling loops with compile-time constant trip counts.
14 //===----------------------------------------------------------------------===//
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
#include <climits>
38 #define DEBUG_TYPE "loop-unroll"
39 STATISTIC(NumPeeled
, "Number of loops peeled");
41 static cl::opt
<unsigned> UnrollPeelMaxCount(
42 "unroll-peel-max-count", cl::init(7), cl::Hidden
,
43 cl::desc("Max average trip count which will cause loop peeling."));
45 static cl::opt
<unsigned> UnrollForcePeelCount(
46 "unroll-force-peel-count", cl::init(0), cl::Hidden
,
47 cl::desc("Force a peel count regardless of profiling information."));
49 // Designates that a Phi is estimated to become invariant after an "infinite"
50 // number of loop iterations (i.e. only may become an invariant if the loop is
52 static const unsigned InfiniteIterationsToInvariance
= UINT_MAX
;
54 // Check whether we are capable of peeling this loop.
55 static bool canPeel(Loop
*L
) {
56 // Make sure the loop is in simplified form
57 if (!L
->isLoopSimplifyForm())
60 // Only peel loops that contain a single exit
61 if (!L
->getExitingBlock() || !L
->getUniqueExitBlock())
64 // Don't try to peel loops where the latch is not the exiting block.
65 // This can be an indication of two different things:
66 // 1) The loop is not rotated.
67 // 2) The loop contains irreducible control flow that involves the latch.
68 if (L
->getLoopLatch() != L
->getExitingBlock())
74 // This function calculates the number of iterations after which the given Phi
75 // becomes an invariant. The pre-calculated values are memorized in the map. The
76 // function (shortcut is I) is calculated according to the following definition:
77 // Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge].
78 // If %y is a loop invariant, then I(%x) = 1.
79 // If %y is a Phi from the loop header, I(%x) = I(%y) + 1.
80 // Otherwise, I(%x) is infinite.
81 // TODO: Actually if %y is an expression that depends only on Phi %z and some
82 // loop invariants, we can estimate I(%x) = I(%z) + 1. The example
84 // %x = phi(0, %a), <-- becomes invariant starting from 3rd iteration.
87 static unsigned calculateIterationsToInvariance(
88 PHINode
*Phi
, Loop
*L
, BasicBlock
*BackEdge
,
89 SmallDenseMap
<PHINode
*, unsigned> &IterationsToInvariance
) {
90 assert(Phi
->getParent() == L
->getHeader() &&
91 "Non-loop Phi should not be checked for turning into invariant.");
92 assert(BackEdge
== L
->getLoopLatch() && "Wrong latch?");
93 // If we already know the answer, take it from the map.
94 auto I
= IterationsToInvariance
.find(Phi
);
95 if (I
!= IterationsToInvariance
.end())
98 // Otherwise we need to analyze the input from the back edge.
99 Value
*Input
= Phi
->getIncomingValueForBlock(BackEdge
);
100 // Place infinity to map to avoid infinite recursion for cycled Phis. Such
101 // cycles can never stop on an invariant.
102 IterationsToInvariance
[Phi
] = InfiniteIterationsToInvariance
;
103 unsigned ToInvariance
= InfiniteIterationsToInvariance
;
105 if (L
->isLoopInvariant(Input
))
107 else if (PHINode
*IncPhi
= dyn_cast
<PHINode
>(Input
)) {
108 // Only consider Phis in header block.
109 if (IncPhi
->getParent() != L
->getHeader())
110 return InfiniteIterationsToInvariance
;
111 // If the input becomes an invariant after X iterations, then our Phi
112 // becomes an invariant after X + 1 iterations.
113 unsigned InputToInvariance
= calculateIterationsToInvariance(
114 IncPhi
, L
, BackEdge
, IterationsToInvariance
);
115 if (InputToInvariance
!= InfiniteIterationsToInvariance
)
116 ToInvariance
= InputToInvariance
+ 1u;
119 // If we found that this Phi lies in an invariant chain, update the map.
120 if (ToInvariance
!= InfiniteIterationsToInvariance
)
121 IterationsToInvariance
[Phi
] = ToInvariance
;
125 // Return the number of iterations we want to peel off.
126 void llvm::computePeelCount(Loop
*L
, unsigned LoopSize
,
127 TargetTransformInfo::UnrollingPreferences
&UP
,
128 unsigned &TripCount
) {
129 assert(LoopSize
> 0 && "Zero loop size is not allowed!");
134 // Only try to peel innermost loops.
138 // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
139 // iterations of the loop. For this we compute the number for iterations after
140 // which every Phi is guaranteed to become an invariant, and try to peel the
141 // maximum number of iterations among these values, thus turning all those
142 // Phis into invariants.
143 // First, check that we can peel at least one iteration.
144 if (2 * LoopSize
<= UP
.Threshold
&& UnrollPeelMaxCount
> 0) {
145 // Store the pre-calculated values here.
146 SmallDenseMap
<PHINode
*, unsigned> IterationsToInvariance
;
147 // Now go through all Phis to calculate their the number of iterations they
148 // need to become invariants.
149 unsigned DesiredPeelCount
= 0;
150 BasicBlock
*BackEdge
= L
->getLoopLatch();
151 assert(BackEdge
&& "Loop is not in simplified form?");
152 for (auto BI
= L
->getHeader()->begin(); isa
<PHINode
>(&*BI
); ++BI
) {
153 PHINode
*Phi
= cast
<PHINode
>(&*BI
);
154 unsigned ToInvariance
= calculateIterationsToInvariance(
155 Phi
, L
, BackEdge
, IterationsToInvariance
);
156 if (ToInvariance
!= InfiniteIterationsToInvariance
)
157 DesiredPeelCount
= std::max(DesiredPeelCount
, ToInvariance
);
159 if (DesiredPeelCount
> 0) {
160 // Pay respect to limitations implied by loop size and the max peel count.
161 unsigned MaxPeelCount
= UnrollPeelMaxCount
;
162 MaxPeelCount
= std::min(MaxPeelCount
, UP
.Threshold
/ LoopSize
- 1);
163 DesiredPeelCount
= std::min(DesiredPeelCount
, MaxPeelCount
);
164 // Consider max peel count limitation.
165 assert(DesiredPeelCount
> 0 && "Wrong loop size estimation?");
166 DEBUG(dbgs() << "Peel " << DesiredPeelCount
<< " iteration(s) to turn"
167 << " some Phis into invariants.\n");
168 UP
.PeelCount
= DesiredPeelCount
;
173 // Bail if we know the statically calculated trip count.
174 // In this case we rather prefer partial unrolling.
178 // If the user provided a peel count, use that.
179 bool UserPeelCount
= UnrollForcePeelCount
.getNumOccurrences() > 0;
181 DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
182 << " iterations.\n");
183 UP
.PeelCount
= UnrollForcePeelCount
;
187 // If we don't know the trip count, but have reason to believe the average
188 // trip count is low, peeling should be beneficial, since we will usually
189 // hit the peeled section.
190 // We only do this in the presence of profile information, since otherwise
191 // our estimates of the trip count are not reliable enough.
192 if (UP
.AllowPeeling
&& L
->getHeader()->getParent()->getEntryCount()) {
193 Optional
<unsigned> PeelCount
= getLoopEstimatedTripCount(L
);
197 DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount
201 if ((*PeelCount
<= UnrollPeelMaxCount
) &&
202 (LoopSize
* (*PeelCount
+ 1) <= UP
.Threshold
)) {
203 DEBUG(dbgs() << "Peeling first " << *PeelCount
<< " iterations.\n");
204 UP
.PeelCount
= *PeelCount
;
207 DEBUG(dbgs() << "Requested peel count: " << *PeelCount
<< "\n");
208 DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount
<< "\n");
209 DEBUG(dbgs() << "Peel cost: " << LoopSize
* (*PeelCount
+ 1) << "\n");
210 DEBUG(dbgs() << "Max peel cost: " << UP
.Threshold
<< "\n");
217 /// \brief Update the branch weights of the latch of a peeled-off loop
219 /// This sets the branch weights for the latch of the recently peeled off loop
220 /// iteration correctly.
221 /// Our goal is to make sure that:
222 /// a) The total weight of all the copies of the loop body is preserved.
223 /// b) The total weight of the loop exit is preserved.
224 /// c) The body weight is reasonably distributed between the peeled iterations.
226 /// \param Header The copy of the header block that belongs to next iteration.
227 /// \param LatchBR The copy of the latch branch that belongs to this iteration.
228 /// \param IterNumber The serial number of the iteration that was just
230 /// \param AvgIters The average number of iterations we expect the loop to have.
231 /// \param[in,out] PeeledHeaderWeight The total number of dynamic loop
232 /// iterations that are unaccounted for. As an input, it represents the number
233 /// of times we expect to enter the header of the iteration currently being
234 /// peeled off. The output is the number of times we expect to enter the
235 /// header of the next iteration.
236 static void updateBranchWeights(BasicBlock
*Header
, BranchInst
*LatchBR
,
237 unsigned IterNumber
, unsigned AvgIters
,
238 uint64_t &PeeledHeaderWeight
) {
240 // FIXME: Pick a more realistic distribution.
241 // Currently the proportion of weight we assign to the fall-through
242 // side of the branch drops linearly with the iteration number, and we use
243 // a 0.9 fudge factor to make the drop-off less sharp...
244 if (PeeledHeaderWeight
) {
245 uint64_t FallThruWeight
=
246 PeeledHeaderWeight
* ((float)(AvgIters
- IterNumber
) / AvgIters
* 0.9);
247 uint64_t ExitWeight
= PeeledHeaderWeight
- FallThruWeight
;
248 PeeledHeaderWeight
-= ExitWeight
;
250 unsigned HeaderIdx
= (LatchBR
->getSuccessor(0) == Header
? 0 : 1);
251 MDBuilder
MDB(LatchBR
->getContext());
253 HeaderIdx
? MDB
.createBranchWeights(ExitWeight
, FallThruWeight
)
254 : MDB
.createBranchWeights(FallThruWeight
, ExitWeight
);
255 LatchBR
->setMetadata(LLVMContext::MD_prof
, WeightNode
);
259 /// \brief Clones the body of the loop L, putting it between \p InsertTop and \p
261 /// \param IterNumber The serial number of the iteration currently being
263 /// \param Exit The exit block of the original loop.
264 /// \param[out] NewBlocks A list of the the blocks in the newly created clone
265 /// \param[out] VMap The value map between the loop and the new clone.
266 /// \param LoopBlocks A helper for DFS-traversal of the loop.
267 /// \param LVMap A value-map that maps instructions from the original loop to
268 /// instructions in the last peeled-off iteration.
269 static void cloneLoopBlocks(Loop
*L
, unsigned IterNumber
, BasicBlock
*InsertTop
,
270 BasicBlock
*InsertBot
, BasicBlock
*Exit
,
271 SmallVectorImpl
<BasicBlock
*> &NewBlocks
,
272 LoopBlocksDFS
&LoopBlocks
, ValueToValueMapTy
&VMap
,
273 ValueToValueMapTy
&LVMap
, DominatorTree
*DT
,
276 BasicBlock
*Header
= L
->getHeader();
277 BasicBlock
*Latch
= L
->getLoopLatch();
278 BasicBlock
*PreHeader
= L
->getLoopPreheader();
280 Function
*F
= Header
->getParent();
281 LoopBlocksDFS::RPOIterator BlockBegin
= LoopBlocks
.beginRPO();
282 LoopBlocksDFS::RPOIterator BlockEnd
= LoopBlocks
.endRPO();
283 Loop
*ParentLoop
= L
->getParentLoop();
285 // For each block in the original loop, create a new copy,
286 // and update the value map with the newly created values.
287 for (LoopBlocksDFS::RPOIterator BB
= BlockBegin
; BB
!= BlockEnd
; ++BB
) {
288 BasicBlock
*NewBB
= CloneBasicBlock(*BB
, VMap
, ".peel", F
);
289 NewBlocks
.push_back(NewBB
);
292 ParentLoop
->addBasicBlockToLoop(NewBB
, *LI
);
296 // If dominator tree is available, insert nodes to represent cloned blocks.
299 DT
->addNewBlock(NewBB
, InsertTop
);
301 DomTreeNode
*IDom
= DT
->getNode(*BB
)->getIDom();
302 // VMap must contain entry for IDom, as the iteration order is RPO.
303 DT
->addNewBlock(NewBB
, cast
<BasicBlock
>(VMap
[IDom
->getBlock()]));
308 // Hook-up the control flow for the newly inserted blocks.
309 // The new header is hooked up directly to the "top", which is either
310 // the original loop preheader (for the first iteration) or the previous
311 // iteration's exiting block (for every other iteration)
312 InsertTop
->getTerminator()->setSuccessor(0, cast
<BasicBlock
>(VMap
[Header
]));
314 // Similarly, for the latch:
315 // The original exiting edge is still hooked up to the loop exit.
316 // The backedge now goes to the "bottom", which is either the loop's real
317 // header (for the last peeled iteration) or the copied header of the next
318 // iteration (for every other iteration)
319 BasicBlock
*NewLatch
= cast
<BasicBlock
>(VMap
[Latch
]);
320 BranchInst
*LatchBR
= cast
<BranchInst
>(NewLatch
->getTerminator());
321 unsigned HeaderIdx
= (LatchBR
->getSuccessor(0) == Header
? 0 : 1);
322 LatchBR
->setSuccessor(HeaderIdx
, InsertBot
);
323 LatchBR
->setSuccessor(1 - HeaderIdx
, Exit
);
325 DT
->changeImmediateDominator(InsertBot
, NewLatch
);
327 // The new copy of the loop body starts with a bunch of PHI nodes
328 // that pick an incoming value from either the preheader, or the previous
329 // loop iteration. Since this copy is no longer part of the loop, we
330 // resolve this statically:
331 // For the first iteration, we use the value from the preheader directly.
332 // For any other iteration, we replace the phi with the value generated by
333 // the immediately preceding clone of the loop body (which represents
334 // the previous iteration).
335 for (BasicBlock::iterator I
= Header
->begin(); isa
<PHINode
>(I
); ++I
) {
336 PHINode
*NewPHI
= cast
<PHINode
>(VMap
[&*I
]);
337 if (IterNumber
== 0) {
338 VMap
[&*I
] = NewPHI
->getIncomingValueForBlock(PreHeader
);
340 Value
*LatchVal
= NewPHI
->getIncomingValueForBlock(Latch
);
341 Instruction
*LatchInst
= dyn_cast
<Instruction
>(LatchVal
);
342 if (LatchInst
&& L
->contains(LatchInst
))
343 VMap
[&*I
] = LVMap
[LatchInst
];
345 VMap
[&*I
] = LatchVal
;
347 cast
<BasicBlock
>(VMap
[Header
])->getInstList().erase(NewPHI
);
350 // Fix up the outgoing values - we need to add a value for the iteration
351 // we've just created. Note that this must happen *after* the incoming
352 // values are adjusted, since the value going out of the latch may also be
353 // a value coming into the header.
354 for (BasicBlock::iterator I
= Exit
->begin(); isa
<PHINode
>(I
); ++I
) {
355 PHINode
*PHI
= cast
<PHINode
>(I
);
356 Value
*LatchVal
= PHI
->getIncomingValueForBlock(Latch
);
357 Instruction
*LatchInst
= dyn_cast
<Instruction
>(LatchVal
);
358 if (LatchInst
&& L
->contains(LatchInst
))
359 LatchVal
= VMap
[LatchVal
];
360 PHI
->addIncoming(LatchVal
, cast
<BasicBlock
>(VMap
[Latch
]));
363 // LastValueMap is updated with the values for the current loop
364 // which are used the next time this function is called.
365 for (const auto &KV
: VMap
)
366 LVMap
[KV
.first
] = KV
.second
;
369 /// \brief Peel off the first \p PeelCount iterations of loop \p L.
371 /// Note that this does not peel them off as a single straight-line block.
372 /// Rather, each iteration is peeled off separately, and needs to check the
374 /// For loops that dynamically execute \p PeelCount iterations or less
375 /// this provides a benefit, since the peeled off iterations, which account
376 /// for the bulk of dynamic execution, can be further simplified by scalar
378 bool llvm::peelLoop(Loop
*L
, unsigned PeelCount
, LoopInfo
*LI
,
379 ScalarEvolution
*SE
, DominatorTree
*DT
,
380 AssumptionCache
*AC
, bool PreserveLCSSA
) {
384 LoopBlocksDFS
LoopBlocks(L
);
385 LoopBlocks
.perform(LI
);
387 BasicBlock
*Header
= L
->getHeader();
388 BasicBlock
*PreHeader
= L
->getLoopPreheader();
389 BasicBlock
*Latch
= L
->getLoopLatch();
390 BasicBlock
*Exit
= L
->getUniqueExitBlock();
392 Function
*F
= Header
->getParent();
394 // Set up all the necessary basic blocks. It is convenient to split the
395 // preheader into 3 parts - two blocks to anchor the peeled copy of the loop
396 // body, and a new preheader for the "real" loop.
398 // Peeling the first iteration transforms.
404 // If (cond) goto Header
411 // If (!cond) goto Exit
417 // If (cond) goto Header
420 // Each following iteration will split the current bottom anchor in two,
421 // and put the new copy of the loop body between these two blocks. That is,
422 // after peeling another iteration from the example above, we'll split
423 // InsertBot, and get:
427 // If (!cond) goto Exit
430 // If (!cond) goto Exit
436 // If (cond) goto Header
439 BasicBlock
*InsertTop
= SplitEdge(PreHeader
, Header
, DT
, LI
);
440 BasicBlock
*InsertBot
=
441 SplitBlock(InsertTop
, InsertTop
->getTerminator(), DT
, LI
);
442 BasicBlock
*NewPreHeader
=
443 SplitBlock(InsertBot
, InsertBot
->getTerminator(), DT
, LI
);
445 InsertTop
->setName(Header
->getName() + ".peel.begin");
446 InsertBot
->setName(Header
->getName() + ".peel.next");
447 NewPreHeader
->setName(PreHeader
->getName() + ".peel.newph");
449 ValueToValueMapTy LVMap
;
451 // If we have branch weight information, we'll want to update it for the
452 // newly created branches.
453 BranchInst
*LatchBR
=
454 cast
<BranchInst
>(cast
<BasicBlock
>(Latch
)->getTerminator());
455 unsigned HeaderIdx
= (LatchBR
->getSuccessor(0) == Header
? 0 : 1);
457 uint64_t TrueWeight
, FalseWeight
;
458 uint64_t ExitWeight
= 0, CurHeaderWeight
= 0;
459 if (LatchBR
->extractProfMetadata(TrueWeight
, FalseWeight
)) {
460 ExitWeight
= HeaderIdx
? TrueWeight
: FalseWeight
;
461 // The # of times the loop body executes is the sum of the exit block
462 // weight and the # of times the backedges are taken.
463 CurHeaderWeight
= TrueWeight
+ FalseWeight
;
466 // For each peeled-off iteration, make a copy of the loop.
467 for (unsigned Iter
= 0; Iter
< PeelCount
; ++Iter
) {
468 SmallVector
<BasicBlock
*, 8> NewBlocks
;
469 ValueToValueMapTy VMap
;
471 // Subtract the exit weight from the current header weight -- the exit
472 // weight is exactly the weight of the previous iteration's header.
473 // FIXME: due to the way the distribution is constructed, we need a
474 // guard here to make sure we don't end up with non-positive weights.
475 if (ExitWeight
< CurHeaderWeight
)
476 CurHeaderWeight
-= ExitWeight
;
480 cloneLoopBlocks(L
, Iter
, InsertTop
, InsertBot
, Exit
,
481 NewBlocks
, LoopBlocks
, VMap
, LVMap
, DT
, LI
);
483 // Remap to use values from the current iteration instead of the
485 remapInstructionsInBlocks(NewBlocks
, VMap
);
488 // Latches of the cloned loops dominate over the loop exit, so idom of the
489 // latter is the first cloned loop body, as original PreHeader dominates
490 // the original loop body.
492 DT
->changeImmediateDominator(Exit
, cast
<BasicBlock
>(LVMap
[Latch
]));
499 updateBranchWeights(InsertBot
, cast
<BranchInst
>(VMap
[LatchBR
]), Iter
,
500 PeelCount
, ExitWeight
);
502 InsertTop
= InsertBot
;
503 InsertBot
= SplitBlock(InsertBot
, InsertBot
->getTerminator(), DT
, LI
);
504 InsertBot
->setName(Header
->getName() + ".peel.next");
506 F
->getBasicBlockList().splice(InsertTop
->getIterator(),
507 F
->getBasicBlockList(),
508 NewBlocks
[0]->getIterator(), F
->end());
511 // Now adjust the phi nodes in the loop header to get their initial values
512 // from the last peeled-off iteration instead of the preheader.
513 for (BasicBlock::iterator I
= Header
->begin(); isa
<PHINode
>(I
); ++I
) {
514 PHINode
*PHI
= cast
<PHINode
>(I
);
515 Value
*NewVal
= PHI
->getIncomingValueForBlock(Latch
);
516 Instruction
*LatchInst
= dyn_cast
<Instruction
>(NewVal
);
517 if (LatchInst
&& L
->contains(LatchInst
))
518 NewVal
= LVMap
[LatchInst
];
520 PHI
->setIncomingValue(PHI
->getBasicBlockIndex(NewPreHeader
), NewVal
);
523 // Adjust the branch weights on the loop exit.
525 // The backedge count is the difference of current header weight and
526 // current loop exit weight. If the current header weight is smaller than
527 // the current loop exit weight, we mark the loop backedge weight as 1.
528 uint64_t BackEdgeWeight
= 0;
529 if (ExitWeight
< CurHeaderWeight
)
530 BackEdgeWeight
= CurHeaderWeight
- ExitWeight
;
533 MDBuilder
MDB(LatchBR
->getContext());
535 HeaderIdx
? MDB
.createBranchWeights(ExitWeight
, BackEdgeWeight
)
536 : MDB
.createBranchWeights(BackEdgeWeight
, ExitWeight
);
537 LatchBR
->setMetadata(LLVMContext::MD_prof
, WeightNode
);
540 // If the loop is nested, we changed the parent loop, update SE.
541 if (Loop
*ParentLoop
= L
->getParentLoop()) {
542 SE
->forgetLoop(ParentLoop
);
544 // FIXME: Incrementally update loop-simplify
545 simplifyLoop(ParentLoop
, DT
, LI
, SE
, AC
, PreserveLCSSA
);
547 // FIXME: Incrementally update loop-simplify
548 simplifyLoop(L
, DT
, LI
, SE
, AC
, PreserveLCSSA
);