1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This file implements the loop fusion pass.
11 /// The implementation is largely based on the following document:
13 /// Code Transformations to Augment the Scope of Loop Fusion in a
14 /// Production Compiler
15 /// Christopher Mark Barton
17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
19 /// The general approach taken is to collect sets of control flow equivalent
20 /// loops and test whether they can be fused. The necessary conditions for
22 /// 1. The loops must be adjacent (there cannot be any statements between
24 /// 2. The loops must be conforming (they must execute the same number of
26 /// 3. The loops must be control flow equivalent (if one loop executes, the
27 /// other is guaranteed to execute).
28 /// 4. There cannot be any negative distance dependencies between the loops.
29 /// If all of these conditions are satisfied, it is safe to fuse the loops.
31 /// This implementation creates FusionCandidates that represent the loop and the
32 /// necessary information needed by fusion. It then operates on the fusion
33 /// candidates, first confirming that the candidate is eligible for fusion. The
34 /// candidates are then collected into control flow equivalent sets, sorted in
35 /// dominance order. Each set of control flow equivalent candidates is then
36 /// traversed, attempting to fuse pairs of candidates in the set. If all
37 /// requirements for fusion are met, the two candidates are fused, creating a
38 /// new (fused) candidate which is then added back into the set to consider for
39 /// additional fusion.
41 /// This implementation currently does not make any modifications to remove
42 /// conditions for fusion. Code transformations to make loops conform to each of
43 /// the conditions for fusion are discussed in more detail in the document
44 /// above. These can be added to the current implementation in the future.
45 //===----------------------------------------------------------------------===//
47 #include "llvm/Transforms/Scalar/LoopFuse.h"
48 #include "llvm/ADT/Statistic.h"
49 #include "llvm/Analysis/AssumptionCache.h"
50 #include "llvm/Analysis/DependenceAnalysis.h"
51 #include "llvm/Analysis/DomTreeUpdater.h"
52 #include "llvm/Analysis/LoopInfo.h"
53 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
54 #include "llvm/Analysis/PostDominators.h"
55 #include "llvm/Analysis/ScalarEvolution.h"
56 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
57 #include "llvm/Analysis/TargetTransformInfo.h"
58 #include "llvm/IR/Function.h"
59 #include "llvm/IR/Verifier.h"
60 #include "llvm/Support/CommandLine.h"
61 #include "llvm/Support/Debug.h"
62 #include "llvm/Support/raw_ostream.h"
63 #include "llvm/Transforms/Utils.h"
64 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
65 #include "llvm/Transforms/Utils/CodeMoverUtils.h"
66 #include "llvm/Transforms/Utils/LoopPeel.h"
67 #include "llvm/Transforms/Utils/LoopSimplify.h"
71 #define DEBUG_TYPE "loop-fusion"
73 STATISTIC(FuseCounter
, "Loops fused");
74 STATISTIC(NumFusionCandidates
, "Number of candidates for loop fusion");
75 STATISTIC(InvalidPreheader
, "Loop has invalid preheader");
76 STATISTIC(InvalidHeader
, "Loop has invalid header");
77 STATISTIC(InvalidExitingBlock
, "Loop has invalid exiting blocks");
78 STATISTIC(InvalidExitBlock
, "Loop has invalid exit block");
79 STATISTIC(InvalidLatch
, "Loop has invalid latch");
80 STATISTIC(InvalidLoop
, "Loop is invalid");
81 STATISTIC(AddressTakenBB
, "Basic block has address taken");
82 STATISTIC(MayThrowException
, "Loop may throw an exception");
83 STATISTIC(ContainsVolatileAccess
, "Loop contains a volatile access");
84 STATISTIC(NotSimplifiedForm
, "Loop is not in simplified form");
85 STATISTIC(InvalidDependencies
, "Dependencies prevent fusion");
86 STATISTIC(UnknownTripCount
, "Loop has unknown trip count");
87 STATISTIC(UncomputableTripCount
, "SCEV cannot compute trip count of loop");
88 STATISTIC(NonEqualTripCount
, "Loop trip counts are not the same");
89 STATISTIC(NonAdjacent
, "Loops are not adjacent");
92 "Loop has a non-empty preheader with instructions that cannot be moved");
93 STATISTIC(FusionNotBeneficial
, "Fusion is not beneficial");
94 STATISTIC(NonIdenticalGuards
, "Candidates have different guards");
95 STATISTIC(NonEmptyExitBlock
, "Candidate has a non-empty exit block with "
96 "instructions that cannot be moved");
97 STATISTIC(NonEmptyGuardBlock
, "Candidate has a non-empty guard block with "
98 "instructions that cannot be moved");
99 STATISTIC(NotRotated
, "Candidate is not rotated");
100 STATISTIC(OnlySecondCandidateIsGuarded
,
101 "The second candidate is guarded while the first one is not");
102 STATISTIC(NumHoistedInsts
, "Number of hoisted preheader instructions.");
103 STATISTIC(NumSunkInsts
, "Number of hoisted preheader instructions.");
105 enum FusionDependenceAnalysisChoice
{
106 FUSION_DEPENDENCE_ANALYSIS_SCEV
,
107 FUSION_DEPENDENCE_ANALYSIS_DA
,
108 FUSION_DEPENDENCE_ANALYSIS_ALL
,
111 static cl::opt
<FusionDependenceAnalysisChoice
> FusionDependenceAnalysis(
112 "loop-fusion-dependence-analysis",
113 cl::desc("Which dependence analysis should loop fusion use?"),
114 cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV
, "scev",
115 "Use the scalar evolution interface"),
116 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA
, "da",
117 "Use the dependence analysis interface"),
118 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL
, "all",
119 "Use all available analyses")),
120 cl::Hidden
, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL
));
122 static cl::opt
<unsigned> FusionPeelMaxCount(
123 "loop-fusion-peel-max-count", cl::init(0), cl::Hidden
,
124 cl::desc("Max number of iterations to be peeled from a loop, such that "
125 "fusion can take place"));
129 VerboseFusionDebugging("loop-fusion-verbose-debug",
130 cl::desc("Enable verbose debugging for Loop Fusion"),
131 cl::Hidden
, cl::init(false));
135 /// This class is used to represent a candidate for loop fusion. When it is
136 /// constructed, it checks the conditions for loop fusion to ensure that it
137 /// represents a valid candidate. It caches several parts of a loop that are
138 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
139 /// of continually querying the underlying Loop to retrieve these values. It is
140 /// assumed these will not change throughout loop fusion.
142 /// The invalidate method should be used to indicate that the FusionCandidate is
143 /// no longer a valid candidate for fusion. Similarly, the isValid() method can
144 /// be used to ensure that the FusionCandidate is still valid for fusion.
145 struct FusionCandidate
{
146 /// Cache of parts of the loop used throughout loop fusion. These should not
147 /// need to change throughout the analysis and transformation.
148 /// These parts are cached to avoid repeatedly looking up in the Loop class.
150 /// Preheader of the loop this candidate represents
151 BasicBlock
*Preheader
;
152 /// Header of the loop this candidate represents
154 /// Blocks in the loop that exit the loop
155 BasicBlock
*ExitingBlock
;
156 /// The successor block of this loop (where the exiting blocks go to)
157 BasicBlock
*ExitBlock
;
158 /// Latch of the loop
160 /// The loop that this fusion candidate represents
162 /// Vector of instructions in this loop that read from memory
163 SmallVector
<Instruction
*, 16> MemReads
;
164 /// Vector of instructions in this loop that write to memory
165 SmallVector
<Instruction
*, 16> MemWrites
;
166 /// Are all of the members of this fusion candidate still valid
168 /// Guard branch of the loop, if it exists
169 BranchInst
*GuardBranch
;
170 /// Peeling Paramaters of the Loop.
171 TTI::PeelingPreferences PP
;
172 /// Can you Peel this Loop?
174 /// Has this loop been Peeled
177 /// Dominator and PostDominator trees are needed for the
178 /// FusionCandidateCompare function, required by FusionCandidateSet to
179 /// determine where the FusionCandidate should be inserted into the set. These
180 /// are used to establish ordering of the FusionCandidates based on dominance.
182 const PostDominatorTree
*PDT
;
184 OptimizationRemarkEmitter
&ORE
;
186 FusionCandidate(Loop
*L
, DominatorTree
&DT
, const PostDominatorTree
*PDT
,
187 OptimizationRemarkEmitter
&ORE
, TTI::PeelingPreferences PP
)
188 : Preheader(L
->getLoopPreheader()), Header(L
->getHeader()),
189 ExitingBlock(L
->getExitingBlock()), ExitBlock(L
->getExitBlock()),
190 Latch(L
->getLoopLatch()), L(L
), Valid(true),
191 GuardBranch(L
->getLoopGuardBranch()), PP(PP
), AbleToPeel(canPeel(L
)),
192 Peeled(false), DT(DT
), PDT(PDT
), ORE(ORE
) {
194 // Walk over all blocks in the loop and check for conditions that may
195 // prevent fusion. For each block, walk over all instructions and collect
196 // the memory reads and writes If any instructions that prevent fusion are
197 // found, invalidate this object and return.
198 for (BasicBlock
*BB
: L
->blocks()) {
199 if (BB
->hasAddressTaken()) {
201 reportInvalidCandidate(AddressTakenBB
);
205 for (Instruction
&I
: *BB
) {
208 reportInvalidCandidate(MayThrowException
);
211 if (StoreInst
*SI
= dyn_cast
<StoreInst
>(&I
)) {
212 if (SI
->isVolatile()) {
214 reportInvalidCandidate(ContainsVolatileAccess
);
218 if (LoadInst
*LI
= dyn_cast
<LoadInst
>(&I
)) {
219 if (LI
->isVolatile()) {
221 reportInvalidCandidate(ContainsVolatileAccess
);
225 if (I
.mayWriteToMemory())
226 MemWrites
.push_back(&I
);
227 if (I
.mayReadFromMemory())
228 MemReads
.push_back(&I
);
233 /// Check if all members of the class are valid.
234 bool isValid() const {
235 return Preheader
&& Header
&& ExitingBlock
&& ExitBlock
&& Latch
&& L
&&
236 !L
->isInvalid() && Valid
;
239 /// Verify that all members are in sync with the Loop object.
240 void verify() const {
241 assert(isValid() && "Candidate is not valid!!");
242 assert(!L
->isInvalid() && "Loop is invalid!");
243 assert(Preheader
== L
->getLoopPreheader() && "Preheader is out of sync");
244 assert(Header
== L
->getHeader() && "Header is out of sync");
245 assert(ExitingBlock
== L
->getExitingBlock() &&
246 "Exiting Blocks is out of sync");
247 assert(ExitBlock
== L
->getExitBlock() && "Exit block is out of sync");
248 assert(Latch
== L
->getLoopLatch() && "Latch is out of sync");
251 /// Get the entry block for this fusion candidate.
253 /// If this fusion candidate represents a guarded loop, the entry block is the
254 /// loop guard block. If it represents an unguarded loop, the entry block is
255 /// the preheader of the loop.
256 BasicBlock
*getEntryBlock() const {
258 return GuardBranch
->getParent();
263 /// After Peeling the loop is modified quite a bit, hence all of the Blocks
264 /// need to be updated accordingly.
265 void updateAfterPeeling() {
266 Preheader
= L
->getLoopPreheader();
267 Header
= L
->getHeader();
268 ExitingBlock
= L
->getExitingBlock();
269 ExitBlock
= L
->getExitBlock();
270 Latch
= L
->getLoopLatch();
274 /// Given a guarded loop, get the successor of the guard that is not in the
277 /// This method returns the successor of the loop guard that is not located
278 /// within the loop (i.e., the successor of the guard that is not the
280 /// This method is only valid for guarded loops.
281 BasicBlock
*getNonLoopBlock() const {
282 assert(GuardBranch
&& "Only valid on guarded loops.");
283 assert(GuardBranch
->isConditional() &&
284 "Expecting guard to be a conditional branch.");
286 return GuardBranch
->getSuccessor(1);
287 return (GuardBranch
->getSuccessor(0) == Preheader
)
288 ? GuardBranch
->getSuccessor(1)
289 : GuardBranch
->getSuccessor(0);
292 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
293 LLVM_DUMP_METHOD
void dump() const {
294 dbgs() << "\tGuardBranch: ";
296 dbgs() << *GuardBranch
;
300 << (GuardBranch
? GuardBranch
->getName() : "nullptr") << "\n"
301 << "\tPreheader: " << (Preheader
? Preheader
->getName() : "nullptr")
303 << "\tHeader: " << (Header
? Header
->getName() : "nullptr") << "\n"
305 << (ExitingBlock
? ExitingBlock
->getName() : "nullptr") << "\n"
306 << "\tExitBB: " << (ExitBlock
? ExitBlock
->getName() : "nullptr")
308 << "\tLatch: " << (Latch
? Latch
->getName() : "nullptr") << "\n"
310 << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
315 /// Determine if a fusion candidate (representing a loop) is eligible for
316 /// fusion. Note that this only checks whether a single loop can be fused - it
317 /// does not check whether it is *legal* to fuse two loops together.
318 bool isEligibleForFusion(ScalarEvolution
&SE
) const {
320 LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
326 ++InvalidExitingBlock
;
337 // Require ScalarEvolution to be able to determine a trip count.
338 if (!SE
.hasLoopInvariantBackedgeTakenCount(L
)) {
339 LLVM_DEBUG(dbgs() << "Loop " << L
->getName()
340 << " trip count not computable!\n");
341 return reportInvalidCandidate(UnknownTripCount
);
344 if (!L
->isLoopSimplifyForm()) {
345 LLVM_DEBUG(dbgs() << "Loop " << L
->getName()
346 << " is not in simplified form!\n");
347 return reportInvalidCandidate(NotSimplifiedForm
);
350 if (!L
->isRotatedForm()) {
351 LLVM_DEBUG(dbgs() << "Loop " << L
->getName() << " is not rotated!\n");
352 return reportInvalidCandidate(NotRotated
);
359 // This is only used internally for now, to clear the MemWrites and MemReads
360 // list and setting Valid to false. I can't envision other uses of this right
361 // now, since once FusionCandidates are put into the FusionCandidateSet they
362 // are immutable. Thus, any time we need to change/update a FusionCandidate,
363 // we must create a new one and insert it into the FusionCandidateSet to
364 // ensure the FusionCandidateSet remains ordered correctly.
371 bool reportInvalidCandidate(llvm::Statistic
&Stat
) const {
373 assert(L
&& Preheader
&& "Fusion candidate not initialized properly!");
374 #if LLVM_ENABLE_STATS
376 ORE
.emit(OptimizationRemarkAnalysis(DEBUG_TYPE
, Stat
.getName(),
377 L
->getStartLoc(), Preheader
)
378 << "[" << Preheader
->getParent()->getName() << "]: "
379 << "Loop is not a candidate for fusion: " << Stat
.getDesc());
385 struct FusionCandidateCompare
{
386 /// Comparison functor to sort two Control Flow Equivalent fusion candidates
387 /// into dominance order.
388 /// If LHS dominates RHS and RHS post-dominates LHS, return true;
389 /// If RHS dominates LHS and LHS post-dominates RHS, return false;
390 /// If both LHS and RHS are not dominating each other then, non-strictly
391 /// post dominate check will decide the order of candidates. If RHS
392 /// non-strictly post dominates LHS then, return true. If LHS non-strictly
393 /// post dominates RHS then, return false. If both are non-strictly post
394 /// dominate each other then, level in the post dominator tree will decide
395 /// the order of candidates.
396 bool operator()(const FusionCandidate
&LHS
,
397 const FusionCandidate
&RHS
) const {
398 const DominatorTree
*DT
= &(LHS
.DT
);
400 BasicBlock
*LHSEntryBlock
= LHS
.getEntryBlock();
401 BasicBlock
*RHSEntryBlock
= RHS
.getEntryBlock();
403 // Do not save PDT to local variable as it is only used in asserts and thus
404 // will trigger an unused variable warning if building without asserts.
405 assert(DT
&& LHS
.PDT
&& "Expecting valid dominator tree");
407 // Do this compare first so if LHS == RHS, function returns false.
408 if (DT
->dominates(RHSEntryBlock
, LHSEntryBlock
)) {
410 // Verify LHS post-dominates RHS
411 assert(LHS
.PDT
->dominates(LHSEntryBlock
, RHSEntryBlock
));
415 if (DT
->dominates(LHSEntryBlock
, RHSEntryBlock
)) {
416 // Verify RHS Postdominates LHS
417 assert(LHS
.PDT
->dominates(RHSEntryBlock
, LHSEntryBlock
));
421 // If two FusionCandidates are in the same level of dominator tree,
422 // they will not dominate each other, but may still be control flow
423 // equivalent. To sort those FusionCandidates, nonStrictlyPostDominate()
424 // function is needed.
426 nonStrictlyPostDominate(LHSEntryBlock
, RHSEntryBlock
, DT
, LHS
.PDT
);
428 nonStrictlyPostDominate(RHSEntryBlock
, LHSEntryBlock
, DT
, LHS
.PDT
);
429 if (WrongOrder
&& RightOrder
) {
430 // If common predecessor of LHS and RHS post dominates both
431 // FusionCandidates then, Order of FusionCandidate can be
432 // identified by its level in post dominator tree.
433 DomTreeNode
*LNode
= LHS
.PDT
->getNode(LHSEntryBlock
);
434 DomTreeNode
*RNode
= LHS
.PDT
->getNode(RHSEntryBlock
);
435 return LNode
->getLevel() > RNode
->getLevel();
436 } else if (WrongOrder
)
441 // If LHS does not non-strict Postdominate RHS and RHS does not non-strict
442 // Postdominate LHS then, there is no dominance relationship between the
443 // two FusionCandidates. Thus, they should not be in the same set together.
445 "No dominance relationship between these fusion candidates!");
449 using LoopVector
= SmallVector
<Loop
*, 4>;
451 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
452 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
453 // dominates FC1 and FC1 post-dominates FC0.
454 // std::set was chosen because we want a sorted data structure with stable
455 // iterators. A subsequent patch to loop fusion will enable fusing non-adjacent
456 // loops by moving intervening code around. When this intervening code contains
457 // loops, those loops will be moved also. The corresponding FusionCandidates
458 // will also need to be moved accordingly. As this is done, having stable
459 // iterators will simplify the logic. Similarly, having an efficient insert that
460 // keeps the FusionCandidateSet sorted will also simplify the implementation.
461 using FusionCandidateSet
= std::set
<FusionCandidate
, FusionCandidateCompare
>;
462 using FusionCandidateCollection
= SmallVector
<FusionCandidateSet
, 4>;
465 static llvm::raw_ostream
&operator<<(llvm::raw_ostream
&OS
,
466 const FusionCandidate
&FC
) {
468 OS
<< FC
.Preheader
->getName();
475 static llvm::raw_ostream
&operator<<(llvm::raw_ostream
&OS
,
476 const FusionCandidateSet
&CandSet
) {
477 for (const FusionCandidate
&FC
: CandSet
)
484 printFusionCandidates(const FusionCandidateCollection
&FusionCandidates
) {
485 dbgs() << "Fusion Candidates: \n";
486 for (const auto &CandidateSet
: FusionCandidates
) {
487 dbgs() << "*** Fusion Candidate Set ***\n";
488 dbgs() << CandidateSet
;
489 dbgs() << "****************************\n";
494 /// Collect all loops in function at the same nest level, starting at the
497 /// This data structure collects all loops at the same nest level for a
498 /// given function (specified by the LoopInfo object). It starts at the
500 struct LoopDepthTree
{
501 using LoopsOnLevelTy
= SmallVector
<LoopVector
, 4>;
502 using iterator
= LoopsOnLevelTy::iterator
;
503 using const_iterator
= LoopsOnLevelTy::const_iterator
;
505 LoopDepthTree(LoopInfo
&LI
) : Depth(1) {
507 LoopsOnLevel
.emplace_back(LoopVector(LI
.rbegin(), LI
.rend()));
510 /// Test whether a given loop has been removed from the function, and thus is
512 bool isRemovedLoop(const Loop
*L
) const { return RemovedLoops
.count(L
); }
514 /// Record that a given loop has been removed from the function and is no
516 void removeLoop(const Loop
*L
) { RemovedLoops
.insert(L
); }
518 /// Descend the tree to the next (inner) nesting level
520 LoopsOnLevelTy LoopsOnNextLevel
;
522 for (const LoopVector
&LV
: *this)
524 if (!isRemovedLoop(L
) && L
->begin() != L
->end())
525 LoopsOnNextLevel
.emplace_back(LoopVector(L
->begin(), L
->end()));
527 LoopsOnLevel
= LoopsOnNextLevel
;
528 RemovedLoops
.clear();
532 bool empty() const { return size() == 0; }
533 size_t size() const { return LoopsOnLevel
.size() - RemovedLoops
.size(); }
534 unsigned getDepth() const { return Depth
; }
536 iterator
begin() { return LoopsOnLevel
.begin(); }
537 iterator
end() { return LoopsOnLevel
.end(); }
538 const_iterator
begin() const { return LoopsOnLevel
.begin(); }
539 const_iterator
end() const { return LoopsOnLevel
.end(); }
542 /// Set of loops that have been removed from the function and are no longer
544 SmallPtrSet
<const Loop
*, 8> RemovedLoops
;
546 /// Depth of the current level, starting at 1 (outermost loops).
549 /// Vector of loops at the current depth level that have the same parent loop
550 LoopsOnLevelTy LoopsOnLevel
;
554 static void printLoopVector(const LoopVector
&LV
) {
555 dbgs() << "****************************\n";
557 printLoop(*L
, dbgs());
558 dbgs() << "****************************\n";
564 // Sets of control flow equivalent fusion candidates for a given nest level.
565 FusionCandidateCollection FusionCandidates
;
574 PostDominatorTree
&PDT
;
575 OptimizationRemarkEmitter
&ORE
;
577 const TargetTransformInfo
&TTI
;
580 LoopFuser(LoopInfo
&LI
, DominatorTree
&DT
, DependenceInfo
&DI
,
581 ScalarEvolution
&SE
, PostDominatorTree
&PDT
,
582 OptimizationRemarkEmitter
&ORE
, const DataLayout
&DL
,
583 AssumptionCache
&AC
, const TargetTransformInfo
&TTI
)
584 : LDT(LI
), DTU(DT
, PDT
, DomTreeUpdater::UpdateStrategy::Lazy
), LI(LI
),
585 DT(DT
), DI(DI
), SE(SE
), PDT(PDT
), ORE(ORE
), AC(AC
), TTI(TTI
) {}
587 /// This is the main entry point for loop fusion. It will traverse the
588 /// specified function and collect candidate loops to fuse, starting at the
589 /// outermost nesting level and working inwards.
590 bool fuseLoops(Function
&F
) {
592 if (VerboseFusionDebugging
) {
597 LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F
.getName()
599 bool Changed
= false;
601 while (!LDT
.empty()) {
602 LLVM_DEBUG(dbgs() << "Got " << LDT
.size() << " loop sets for depth "
603 << LDT
.getDepth() << "\n";);
605 for (const LoopVector
&LV
: LDT
) {
606 assert(LV
.size() > 0 && "Empty loop set was build!");
608 // Skip singleton loop sets as they do not offer fusion opportunities on
613 if (VerboseFusionDebugging
) {
615 dbgs() << " Visit loop set (#" << LV
.size() << "):\n";
621 collectFusionCandidates(LV
);
622 Changed
|= fuseCandidates();
625 // Finished analyzing candidates at this level.
626 // Descend to the next level and clear all of the candidates currently
627 // collected. Note that it will not be possible to fuse any of the
628 // existing candidates with new candidates because the new candidates will
629 // be at a different nest level and thus not be control flow equivalent
630 // with all of the candidates collected so far.
631 LLVM_DEBUG(dbgs() << "Descend one level!\n");
633 FusionCandidates
.clear();
637 LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F
.dump(););
641 assert(PDT
.verify());
646 LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
651 /// Determine if two fusion candidates are control flow equivalent.
653 /// Two fusion candidates are control flow equivalent if when one executes,
654 /// the other is guaranteed to execute. This is determined using dominators
655 /// and post-dominators: if A dominates B and B post-dominates A then A and B
656 /// are control-flow equivalent.
657 bool isControlFlowEquivalent(const FusionCandidate
&FC0
,
658 const FusionCandidate
&FC1
) const {
659 assert(FC0
.Preheader
&& FC1
.Preheader
&& "Expecting valid preheaders");
661 return ::isControlFlowEquivalent(*FC0
.getEntryBlock(), *FC1
.getEntryBlock(),
665 /// Iterate over all loops in the given loop set and identify the loops that
666 /// are eligible for fusion. Place all eligible fusion candidates into Control
667 /// Flow Equivalent sets, sorted by dominance.
668 void collectFusionCandidates(const LoopVector
&LV
) {
670 TTI::PeelingPreferences PP
=
671 gatherPeelingPreferences(L
, SE
, TTI
, std::nullopt
, std::nullopt
);
672 FusionCandidate
CurrCand(L
, DT
, &PDT
, ORE
, PP
);
673 if (!CurrCand
.isEligibleForFusion(SE
))
676 // Go through each list in FusionCandidates and determine if L is control
677 // flow equivalent with the first loop in that list. If it is, append LV.
678 // If not, go to the next list.
679 // If no suitable list is found, start another list and add it to
681 bool FoundSet
= false;
683 for (auto &CurrCandSet
: FusionCandidates
) {
684 if (isControlFlowEquivalent(*CurrCandSet
.begin(), CurrCand
)) {
685 CurrCandSet
.insert(CurrCand
);
688 if (VerboseFusionDebugging
)
689 LLVM_DEBUG(dbgs() << "Adding " << CurrCand
690 << " to existing candidate set\n");
696 // No set was found. Create a new set and add to FusionCandidates
698 if (VerboseFusionDebugging
)
699 LLVM_DEBUG(dbgs() << "Adding " << CurrCand
<< " to new set\n");
701 FusionCandidateSet NewCandSet
;
702 NewCandSet
.insert(CurrCand
);
703 FusionCandidates
.push_back(NewCandSet
);
705 NumFusionCandidates
++;
709 /// Determine if it is beneficial to fuse two loops.
711 /// For now, this method simply returns true because we want to fuse as much
712 /// as possible (primarily to test the pass). This method will evolve, over
713 /// time, to add heuristics for profitability of fusion.
714 bool isBeneficialFusion(const FusionCandidate
&FC0
,
715 const FusionCandidate
&FC1
) {
719 /// Determine if two fusion candidates have the same trip count (i.e., they
720 /// execute the same number of iterations).
722 /// This function will return a pair of values. The first is a boolean,
723 /// stating whether or not the two candidates are known at compile time to
724 /// have the same TripCount. The second is the difference in the two
725 /// TripCounts. This information can be used later to determine whether or not
726 /// peeling can be performed on either one of the candidates.
727 std::pair
<bool, std::optional
<unsigned>>
728 haveIdenticalTripCounts(const FusionCandidate
&FC0
,
729 const FusionCandidate
&FC1
) const {
730 const SCEV
*TripCount0
= SE
.getBackedgeTakenCount(FC0
.L
);
731 if (isa
<SCEVCouldNotCompute
>(TripCount0
)) {
732 UncomputableTripCount
++;
733 LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
734 return {false, std::nullopt
};
737 const SCEV
*TripCount1
= SE
.getBackedgeTakenCount(FC1
.L
);
738 if (isa
<SCEVCouldNotCompute
>(TripCount1
)) {
739 UncomputableTripCount
++;
740 LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
741 return {false, std::nullopt
};
744 LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0
<< " & "
745 << *TripCount1
<< " are "
746 << (TripCount0
== TripCount1
? "identical" : "different")
749 if (TripCount0
== TripCount1
)
752 LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
753 "determining the difference between trip counts\n");
755 // Currently only considering loops with a single exit point
756 // and a non-constant trip count.
757 const unsigned TC0
= SE
.getSmallConstantTripCount(FC0
.L
);
758 const unsigned TC1
= SE
.getSmallConstantTripCount(FC1
.L
);
760 // If any of the tripcounts are zero that means that loop(s) do not have
761 // a single exit or a constant tripcount.
762 if (TC0
== 0 || TC1
== 0) {
763 LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
764 "have a constant number of iterations. Peeling "
765 "is not benefical\n");
766 return {false, std::nullopt
};
769 std::optional
<unsigned> Difference
;
770 int Diff
= TC0
- TC1
;
776 dbgs() << "Difference is less than 0. FC1 (second loop) has more "
777 "iterations than the first one. Currently not supported\n");
780 LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference
783 return {false, Difference
};
786 void peelFusionCandidate(FusionCandidate
&FC0
, const FusionCandidate
&FC1
,
787 unsigned PeelCount
) {
788 assert(FC0
.AbleToPeel
&& "Should be able to peel loop");
790 LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
791 << " iterations of the first loop. \n");
793 ValueToValueMapTy VMap
;
794 FC0
.Peeled
= peelLoop(FC0
.L
, PeelCount
, &LI
, &SE
, DT
, &AC
, true, VMap
);
796 LLVM_DEBUG(dbgs() << "Done Peeling\n");
799 auto IdenticalTripCount
= haveIdenticalTripCounts(FC0
, FC1
);
801 assert(IdenticalTripCount
.first
&& *IdenticalTripCount
.second
== 0 &&
802 "Loops should have identical trip counts after peeling");
805 FC0
.PP
.PeelCount
+= PeelCount
;
807 // Peeling does not update the PDT
808 PDT
.recalculate(*FC0
.Preheader
->getParent());
810 FC0
.updateAfterPeeling();
812 // In this case the iterations of the loop are constant, so the first
813 // loop will execute completely (will not jump from one of
814 // the peeled blocks to the second loop). Here we are updating the
815 // branch conditions of each of the peeled blocks, such that it will
816 // branch to its successor which is not the preheader of the second loop
817 // in the case of unguarded loops, or the succesors of the exit block of
818 // the first loop otherwise. Doing this update will ensure that the entry
819 // block of the first loop dominates the entry block of the second loop.
821 FC0
.GuardBranch
? FC0
.ExitBlock
->getUniqueSuccessor() : FC1
.Preheader
;
823 SmallVector
<DominatorTree::UpdateType
, 8> TreeUpdates
;
824 SmallVector
<Instruction
*, 8> WorkList
;
825 for (BasicBlock
*Pred
: predecessors(BB
)) {
826 if (Pred
!= FC0
.ExitBlock
) {
827 WorkList
.emplace_back(Pred
->getTerminator());
828 TreeUpdates
.emplace_back(
829 DominatorTree::UpdateType(DominatorTree::Delete
, Pred
, BB
));
832 // Cannot modify the predecessors inside the above loop as it will cause
833 // the iterators to be nullptrs, causing memory errors.
834 for (Instruction
*CurrentBranch
: WorkList
) {
835 BasicBlock
*Succ
= CurrentBranch
->getSuccessor(0);
837 Succ
= CurrentBranch
->getSuccessor(1);
838 ReplaceInstWithInst(CurrentBranch
, BranchInst::Create(Succ
));
841 DTU
.applyUpdates(TreeUpdates
);
845 dbgs() << "Sucessfully peeled " << FC0
.PP
.PeelCount
846 << " iterations from the first loop.\n"
847 "Both Loops have the same number of iterations now.\n");
851 /// Walk each set of control flow equivalent fusion candidates and attempt to
852 /// fuse them. This does a single linear traversal of all candidates in the
853 /// set. The conditions for legal fusion are checked at this point. If a pair
854 /// of fusion candidates passes all legality checks, they are fused together
855 /// and a new fusion candidate is created and added to the FusionCandidateSet.
856 /// The original fusion candidates are then removed, as they are no longer
858 bool fuseCandidates() {
860 LLVM_DEBUG(printFusionCandidates(FusionCandidates
));
861 for (auto &CandidateSet
: FusionCandidates
) {
862 if (CandidateSet
.size() < 2)
865 LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
866 << CandidateSet
<< "\n");
868 for (auto FC0
= CandidateSet
.begin(); FC0
!= CandidateSet
.end(); ++FC0
) {
869 assert(!LDT
.isRemovedLoop(FC0
->L
) &&
870 "Should not have removed loops in CandidateSet!");
872 for (++FC1
; FC1
!= CandidateSet
.end(); ++FC1
) {
873 assert(!LDT
.isRemovedLoop(FC1
->L
) &&
874 "Should not have removed loops in CandidateSet!");
876 LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0
->dump();
877 dbgs() << " with\n"; FC1
->dump(); dbgs() << "\n");
882 // Check if the candidates have identical tripcounts (first value of
883 // pair), and if not check the difference in the tripcounts between
884 // the loops (second value of pair). The difference is not equal to
885 // std::nullopt iff the loops iterate a constant number of times, and
886 // have a single exit.
887 std::pair
<bool, std::optional
<unsigned>> IdenticalTripCountRes
=
888 haveIdenticalTripCounts(*FC0
, *FC1
);
889 bool SameTripCount
= IdenticalTripCountRes
.first
;
890 std::optional
<unsigned> TCDifference
= IdenticalTripCountRes
.second
;
892 // Here we are checking that FC0 (the first loop) can be peeled, and
893 // both loops have different tripcounts.
894 if (FC0
->AbleToPeel
&& !SameTripCount
&& TCDifference
) {
895 if (*TCDifference
> FusionPeelMaxCount
) {
897 << "Difference in loop trip counts: " << *TCDifference
898 << " is greater than maximum peel count specificed: "
899 << FusionPeelMaxCount
<< "\n");
901 // Dependent on peeling being performed on the first loop, and
902 // assuming all other conditions for fusion return true.
903 SameTripCount
= true;
907 if (!SameTripCount
) {
908 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
909 "counts. Not fusing.\n");
910 reportLoopFusion
<OptimizationRemarkMissed
>(*FC0
, *FC1
,
915 if (!isAdjacent(*FC0
, *FC1
)) {
917 << "Fusion candidates are not adjacent. Not fusing.\n");
918 reportLoopFusion
<OptimizationRemarkMissed
>(*FC0
, *FC1
, NonAdjacent
);
922 if ((!FC0
->GuardBranch
&& FC1
->GuardBranch
) ||
923 (FC0
->GuardBranch
&& !FC1
->GuardBranch
)) {
924 LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the "
925 "another one is not. Not fusing.\n");
926 reportLoopFusion
<OptimizationRemarkMissed
>(
927 *FC0
, *FC1
, OnlySecondCandidateIsGuarded
);
931 // Ensure that FC0 and FC1 have identical guards.
932 // If one (or both) are not guarded, this check is not necessary.
933 if (FC0
->GuardBranch
&& FC1
->GuardBranch
&&
934 !haveIdenticalGuards(*FC0
, *FC1
) && !TCDifference
) {
935 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
936 "guards. Not Fusing.\n");
937 reportLoopFusion
<OptimizationRemarkMissed
>(*FC0
, *FC1
,
942 if (FC0
->GuardBranch
) {
943 assert(FC1
->GuardBranch
&& "Expecting valid FC1 guard branch");
945 if (!isSafeToMoveBefore(*FC0
->ExitBlock
,
946 *FC1
->ExitBlock
->getFirstNonPHIOrDbg(), DT
,
948 LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
949 "instructions in exit block. Not fusing.\n");
950 reportLoopFusion
<OptimizationRemarkMissed
>(*FC0
, *FC1
,
955 if (!isSafeToMoveBefore(
956 *FC1
->GuardBranch
->getParent(),
957 *FC0
->GuardBranch
->getParent()->getTerminator(), DT
, &PDT
,
960 << "Fusion candidate contains unsafe "
961 "instructions in guard block. Not fusing.\n");
962 reportLoopFusion
<OptimizationRemarkMissed
>(*FC0
, *FC1
,
968 // Check the dependencies across the loops and do not fuse if it would
970 if (!dependencesAllowFusion(*FC0
, *FC1
)) {
971 LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
972 reportLoopFusion
<OptimizationRemarkMissed
>(*FC0
, *FC1
,
973 InvalidDependencies
);
977 // If the second loop has instructions in the pre-header, attempt to
978 // hoist them up to the first loop's pre-header or sink them into the
979 // body of the second loop.
980 SmallVector
<Instruction
*, 4> SafeToHoist
;
981 SmallVector
<Instruction
*, 4> SafeToSink
;
982 // At this point, this is the last remaining legality check.
983 // Which means if we can make this pre-header empty, we can fuse
985 if (!isEmptyPreheader(*FC1
)) {
986 LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
989 // If it is not safe to hoist/sink all instructions in the
990 // pre-header, we cannot fuse these loops.
991 if (!collectMovablePreheaderInsts(*FC0
, *FC1
, SafeToHoist
,
993 LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
994 "Fusion Candidate Pre-header.\n"
996 reportLoopFusion
<OptimizationRemarkMissed
>(*FC0
, *FC1
,
1002 bool BeneficialToFuse
= isBeneficialFusion(*FC0
, *FC1
);
1004 << "\tFusion appears to be "
1005 << (BeneficialToFuse
? "" : "un") << "profitable!\n");
1006 if (!BeneficialToFuse
) {
1007 reportLoopFusion
<OptimizationRemarkMissed
>(*FC0
, *FC1
,
1008 FusionNotBeneficial
);
1011 // All analysis has completed and has determined that fusion is legal
1012 // and profitable. At this point, start transforming the code and
1015 // Execute the hoist/sink operations on preheader instructions
1016 movePreheaderInsts(*FC0
, *FC1
, SafeToHoist
, SafeToSink
);
1018 LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0
<< " and "
1021 FusionCandidate FC0Copy
= *FC0
;
1022 // Peel the loop after determining that fusion is legal. The Loops
1023 // will still be safe to fuse after the peeling is performed.
1024 bool Peel
= TCDifference
&& *TCDifference
> 0;
1026 peelFusionCandidate(FC0Copy
, *FC1
, *TCDifference
);
1028 // Report fusion to the Optimization Remarks.
1029 // Note this needs to be done *before* performFusion because
1030 // performFusion will change the original loops, making it not
1031 // possible to identify them after fusion is complete.
1032 reportLoopFusion
<OptimizationRemark
>((Peel
? FC0Copy
: *FC0
), *FC1
,
1035 FusionCandidate
FusedCand(
1036 performFusion((Peel
? FC0Copy
: *FC0
), *FC1
), DT
, &PDT
, ORE
,
1039 assert(FusedCand
.isEligibleForFusion(SE
) &&
1040 "Fused candidate should be eligible for fusion!");
1042 // Notify the loop-depth-tree that these loops are not valid objects
1043 LDT
.removeLoop(FC1
->L
);
1045 CandidateSet
.erase(FC0
);
1046 CandidateSet
.erase(FC1
);
1048 auto InsertPos
= CandidateSet
.insert(FusedCand
);
1050 assert(InsertPos
.second
&&
1051 "Unable to insert TargetCandidate in CandidateSet!");
1053 // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations
1054 // of the FC1 loop will attempt to fuse the new (fused) loop with the
1055 // remaining candidates in the current candidate set.
1056 FC0
= FC1
= InsertPos
.first
;
1058 LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
1068 // Returns true if the instruction \p I can be hoisted to the end of the
1069 // preheader of \p FC0. \p SafeToHoist contains the instructions that are
1070 // known to be safe to hoist. The instructions encountered that cannot be
1071 // hoisted are in \p NotHoisting.
1072 // TODO: Move functionality into CodeMoverUtils
1073 bool canHoistInst(Instruction
&I
,
1074 const SmallVector
<Instruction
*, 4> &SafeToHoist
,
1075 const SmallVector
<Instruction
*, 4> &NotHoisting
,
1076 const FusionCandidate
&FC0
) const {
1077 const BasicBlock
*FC0PreheaderTarget
= FC0
.Preheader
->getSingleSuccessor();
1078 assert(FC0PreheaderTarget
&&
1079 "Expected single successor for loop preheader.");
1081 for (Use
&Op
: I
.operands()) {
1082 if (auto *OpInst
= dyn_cast
<Instruction
>(Op
)) {
1083 bool OpHoisted
= is_contained(SafeToHoist
, OpInst
);
1084 // Check if we have already decided to hoist this operand. In this
1085 // case, it does not dominate FC0 *yet*, but will after we hoist it.
1086 if (!(OpHoisted
|| DT
.dominates(OpInst
, FC0PreheaderTarget
))) {
1092 // PHIs in FC1's header only have FC0 blocks as predecessors. PHIs
1093 // cannot be hoisted and should be sunk to the exit of the fused loop.
1094 if (isa
<PHINode
>(I
))
1097 // If this isn't a memory inst, hoisting is safe
1098 if (!I
.mayReadOrWriteMemory())
1101 LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n");
1102 for (Instruction
*NotHoistedInst
: NotHoisting
) {
1103 if (auto D
= DI
.depends(&I
, NotHoistedInst
, true)) {
1104 // Dependency is not read-before-write, write-before-read or
1105 // write-before-write
1106 if (D
->isFlow() || D
->isAnti() || D
->isOutput()) {
1107 LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's "
1108 "preheader that is not being hoisted.\n");
1114 for (Instruction
*ReadInst
: FC0
.MemReads
) {
1115 if (auto D
= DI
.depends(ReadInst
, &I
, true)) {
1116 // Dependency is not read-before-write
1118 LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n");
1124 for (Instruction
*WriteInst
: FC0
.MemWrites
) {
1125 if (auto D
= DI
.depends(WriteInst
, &I
, true)) {
1126 // Dependency is not write-before-read or write-before-write
1127 if (D
->isFlow() || D
->isOutput()) {
1128 LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n");
1136 // Returns true if the instruction \p I can be sunk to the top of the exit
1138 // TODO: Move functionality into CodeMoverUtils
1139 bool canSinkInst(Instruction
&I
, const FusionCandidate
&FC1
) const {
1140 for (User
*U
: I
.users()) {
1141 if (auto *UI
{dyn_cast
<Instruction
>(U
)}) {
1142 // Cannot sink if user in loop
1143 // If FC1 has phi users of this value, we cannot sink it into FC1.
1144 if (FC1
.L
->contains(UI
)) {
1145 // Cannot hoist or sink this instruction. No hoisting/sinking
1146 // should take place, loops should not fuse
1152 // If this isn't a memory inst, sinking is safe
1153 if (!I
.mayReadOrWriteMemory())
1156 for (Instruction
*ReadInst
: FC1
.MemReads
) {
1157 if (auto D
= DI
.depends(&I
, ReadInst
, true)) {
1158 // Dependency is not write-before-read
1160 LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n");
1166 for (Instruction
*WriteInst
: FC1
.MemWrites
) {
1167 if (auto D
= DI
.depends(&I
, WriteInst
, true)) {
1168 // Dependency is not write-before-write or read-before-write
1169 if (D
->isOutput() || D
->isAnti()) {
1170 LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n");
1179 /// Collect instructions in the \p FC1 Preheader that can be hoisted
1180 /// to the \p FC0 Preheader or sunk into the \p FC1 Body
1181 bool collectMovablePreheaderInsts(
1182 const FusionCandidate
&FC0
, const FusionCandidate
&FC1
,
1183 SmallVector
<Instruction
*, 4> &SafeToHoist
,
1184 SmallVector
<Instruction
*, 4> &SafeToSink
) const {
1185 BasicBlock
*FC1Preheader
= FC1
.Preheader
;
1186 // Save the instructions that are not being hoisted, so we know not to hoist
1187 // mem insts that they dominate.
1188 SmallVector
<Instruction
*, 4> NotHoisting
;
1190 for (Instruction
&I
: *FC1Preheader
) {
1191 // Can't move a branch
1192 if (&I
== FC1Preheader
->getTerminator())
1194 // If the instruction has side-effects, give up.
1195 // TODO: The case of mayReadFromMemory we can handle but requires
1196 // additional work with a dependence analysis so for now we give
1197 // up on memory reads.
1198 if (I
.mayThrow() || !I
.willReturn()) {
1199 LLVM_DEBUG(dbgs() << "Inst: " << I
<< " may throw or won't return.\n");
1203 LLVM_DEBUG(dbgs() << "Checking Inst: " << I
<< "\n");
1205 if (I
.isAtomic() || I
.isVolatile()) {
1207 dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n");
1211 if (canHoistInst(I
, SafeToHoist
, NotHoisting
, FC0
)) {
1212 SafeToHoist
.push_back(&I
);
1213 LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n");
1215 LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n");
1216 NotHoisting
.push_back(&I
);
1218 if (canSinkInst(I
, FC1
)) {
1219 SafeToSink
.push_back(&I
);
1220 LLVM_DEBUG(dbgs() << "\tSafe to sink.\n");
1222 LLVM_DEBUG(dbgs() << "\tCould not sink.\n");
1228 dbgs() << "All preheader instructions could be sunk or hoisted!\n");
1232 /// Rewrite all additive recurrences in a SCEV to use a new loop.
1233 class AddRecLoopReplacer
: public SCEVRewriteVisitor
<AddRecLoopReplacer
> {
1235 AddRecLoopReplacer(ScalarEvolution
&SE
, const Loop
&OldL
, const Loop
&NewL
,
1237 : SCEVRewriteVisitor(SE
), Valid(true), UseMax(UseMax
), OldL(OldL
),
1240 const SCEV
*visitAddRecExpr(const SCEVAddRecExpr
*Expr
) {
1241 const Loop
*ExprL
= Expr
->getLoop();
1242 SmallVector
<const SCEV
*, 2> Operands
;
1243 if (ExprL
== &OldL
) {
1244 append_range(Operands
, Expr
->operands());
1245 return SE
.getAddRecExpr(Operands
, &NewL
, Expr
->getNoWrapFlags());
1248 if (OldL
.contains(ExprL
)) {
1249 bool Pos
= SE
.isKnownPositive(Expr
->getStepRecurrence(SE
));
1250 if (!UseMax
|| !Pos
|| !Expr
->isAffine()) {
1254 return visit(Expr
->getStart());
1257 for (const SCEV
*Op
: Expr
->operands())
1258 Operands
.push_back(visit(Op
));
1259 return SE
.getAddRecExpr(Operands
, ExprL
, Expr
->getNoWrapFlags());
1262 bool wasValidSCEV() const { return Valid
; }
1266 const Loop
&OldL
, &NewL
;
1269 /// Return false if the access functions of \p I0 and \p I1 could cause
1270 /// a negative dependence.
1271 bool accessDiffIsPositive(const Loop
&L0
, const Loop
&L1
, Instruction
&I0
,
1272 Instruction
&I1
, bool EqualIsInvalid
) {
1273 Value
*Ptr0
= getLoadStorePointerOperand(&I0
);
1274 Value
*Ptr1
= getLoadStorePointerOperand(&I1
);
1278 const SCEV
*SCEVPtr0
= SE
.getSCEVAtScope(Ptr0
, &L0
);
1279 const SCEV
*SCEVPtr1
= SE
.getSCEVAtScope(Ptr1
, &L1
);
1281 if (VerboseFusionDebugging
)
1282 LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0
<< " vs "
1283 << *SCEVPtr1
<< "\n");
1285 AddRecLoopReplacer
Rewriter(SE
, L0
, L1
);
1286 SCEVPtr0
= Rewriter
.visit(SCEVPtr0
);
1288 if (VerboseFusionDebugging
)
1289 LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
1290 << " [Valid: " << Rewriter
.wasValidSCEV() << "]\n");
1292 if (!Rewriter
.wasValidSCEV())
1295 // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by
1296 // L0) and the other is not. We could check if it is monotone and test
1297 // the beginning and end value instead.
1299 BasicBlock
*L0Header
= L0
.getHeader();
1300 auto HasNonLinearDominanceRelation
= [&](const SCEV
*S
) {
1301 const SCEVAddRecExpr
*AddRec
= dyn_cast
<SCEVAddRecExpr
>(S
);
1304 return !DT
.dominates(L0Header
, AddRec
->getLoop()->getHeader()) &&
1305 !DT
.dominates(AddRec
->getLoop()->getHeader(), L0Header
);
1307 if (SCEVExprContains(SCEVPtr1
, HasNonLinearDominanceRelation
))
1310 ICmpInst::Predicate Pred
=
1311 EqualIsInvalid
? ICmpInst::ICMP_SGT
: ICmpInst::ICMP_SGE
;
1312 bool IsAlwaysGE
= SE
.isKnownPredicate(Pred
, SCEVPtr0
, SCEVPtr1
);
1314 if (VerboseFusionDebugging
)
1315 LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
1316 << (IsAlwaysGE
? " >= " : " may < ") << *SCEVPtr1
1322 /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
1323 /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
1324 /// specified by @p DepChoice are used to determine this.
1325 bool dependencesAllowFusion(const FusionCandidate
&FC0
,
1326 const FusionCandidate
&FC1
, Instruction
&I0
,
1327 Instruction
&I1
, bool AnyDep
,
1328 FusionDependenceAnalysisChoice DepChoice
) {
1330 if (VerboseFusionDebugging
) {
1331 LLVM_DEBUG(dbgs() << "Check dep: " << I0
<< " vs " << I1
<< " : "
1332 << DepChoice
<< "\n");
1335 switch (DepChoice
) {
1336 case FUSION_DEPENDENCE_ANALYSIS_SCEV
:
1337 return accessDiffIsPositive(*FC0
.L
, *FC1
.L
, I0
, I1
, AnyDep
);
1338 case FUSION_DEPENDENCE_ANALYSIS_DA
: {
1339 auto DepResult
= DI
.depends(&I0
, &I1
, true);
1343 if (VerboseFusionDebugging
) {
1344 LLVM_DEBUG(dbgs() << "DA res: "; DepResult
->dump(dbgs());
1345 dbgs() << " [#l: " << DepResult
->getLevels() << "][Ordered: "
1346 << (DepResult
->isOrdered() ? "true" : "false")
1348 LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult
->getLevels()
1353 if (DepResult
->getNextPredecessor() || DepResult
->getNextSuccessor())
1355 dbgs() << "TODO: Implement pred/succ dependence handling!\n");
1357 // TODO: Can we actually use the dependence info analysis here?
1361 case FUSION_DEPENDENCE_ANALYSIS_ALL
:
1362 return dependencesAllowFusion(FC0
, FC1
, I0
, I1
, AnyDep
,
1363 FUSION_DEPENDENCE_ANALYSIS_SCEV
) ||
1364 dependencesAllowFusion(FC0
, FC1
, I0
, I1
, AnyDep
,
1365 FUSION_DEPENDENCE_ANALYSIS_DA
);
1368 llvm_unreachable("Unknown fusion dependence analysis choice!");
1371 /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
1372 bool dependencesAllowFusion(const FusionCandidate
&FC0
,
1373 const FusionCandidate
&FC1
) {
1374 LLVM_DEBUG(dbgs() << "Check if " << FC0
<< " can be fused with " << FC1
1376 assert(FC0
.L
->getLoopDepth() == FC1
.L
->getLoopDepth());
1377 assert(DT
.dominates(FC0
.getEntryBlock(), FC1
.getEntryBlock()));
1379 for (Instruction
*WriteL0
: FC0
.MemWrites
) {
1380 for (Instruction
*WriteL1
: FC1
.MemWrites
)
1381 if (!dependencesAllowFusion(FC0
, FC1
, *WriteL0
, *WriteL1
,
1383 FusionDependenceAnalysis
)) {
1384 InvalidDependencies
++;
1387 for (Instruction
*ReadL1
: FC1
.MemReads
)
1388 if (!dependencesAllowFusion(FC0
, FC1
, *WriteL0
, *ReadL1
,
1390 FusionDependenceAnalysis
)) {
1391 InvalidDependencies
++;
1396 for (Instruction
*WriteL1
: FC1
.MemWrites
) {
1397 for (Instruction
*WriteL0
: FC0
.MemWrites
)
1398 if (!dependencesAllowFusion(FC0
, FC1
, *WriteL0
, *WriteL1
,
1400 FusionDependenceAnalysis
)) {
1401 InvalidDependencies
++;
1404 for (Instruction
*ReadL0
: FC0
.MemReads
)
1405 if (!dependencesAllowFusion(FC0
, FC1
, *ReadL0
, *WriteL1
,
1407 FusionDependenceAnalysis
)) {
1408 InvalidDependencies
++;
1413 // Walk through all uses in FC1. For each use, find the reaching def. If the
1414 // def is located in FC0 then it is not safe to fuse.
1415 for (BasicBlock
*BB
: FC1
.L
->blocks())
1416 for (Instruction
&I
: *BB
)
1417 for (auto &Op
: I
.operands())
1418 if (Instruction
*Def
= dyn_cast
<Instruction
>(Op
))
1419 if (FC0
.L
->contains(Def
->getParent())) {
1420 InvalidDependencies
++;
1427 /// Determine if two fusion candidates are adjacent in the CFG.
1429 /// This method will determine if there are additional basic blocks in the CFG
1430 /// between the exit of \p FC0 and the entry of \p FC1.
1431 /// If the two candidates are guarded loops, then it checks whether the
1432 /// non-loop successor of the \p FC0 guard branch is the entry block of \p
1433 /// FC1. If not, then the loops are not adjacent. If the two candidates are
1434 /// not guarded loops, then it checks whether the exit block of \p FC0 is the
1435 /// preheader of \p FC1.
1436 bool isAdjacent(const FusionCandidate
&FC0
,
1437 const FusionCandidate
&FC1
) const {
1438 // If the successor of the guard branch is FC1, then the loops are adjacent
1439 if (FC0
.GuardBranch
)
1440 return FC0
.getNonLoopBlock() == FC1
.getEntryBlock();
1442 return FC0
.ExitBlock
== FC1
.getEntryBlock();
1445 bool isEmptyPreheader(const FusionCandidate
&FC
) const {
1446 return FC
.Preheader
->size() == 1;
1449 /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader
1450 /// and sink others into the body of \p FC1.
1451 void movePreheaderInsts(const FusionCandidate
&FC0
,
1452 const FusionCandidate
&FC1
,
1453 SmallVector
<Instruction
*, 4> &HoistInsts
,
1454 SmallVector
<Instruction
*, 4> &SinkInsts
) const {
1455 // All preheader instructions except the branch must be hoisted or sunk
1456 assert(HoistInsts
.size() + SinkInsts
.size() == FC1
.Preheader
->size() - 1 &&
1457 "Attempting to sink and hoist preheader instructions, but not all "
1458 "the preheader instructions are accounted for.");
1460 NumHoistedInsts
+= HoistInsts
.size();
1461 NumSunkInsts
+= SinkInsts
.size();
1463 LLVM_DEBUG(if (VerboseFusionDebugging
) {
1464 if (!HoistInsts
.empty())
1465 dbgs() << "Hoisting: \n";
1466 for (Instruction
*I
: HoistInsts
)
1467 dbgs() << *I
<< "\n";
1468 if (!SinkInsts
.empty())
1469 dbgs() << "Sinking: \n";
1470 for (Instruction
*I
: SinkInsts
)
1471 dbgs() << *I
<< "\n";
1474 for (Instruction
*I
: HoistInsts
) {
1475 assert(I
->getParent() == FC1
.Preheader
);
1476 I
->moveBefore(*FC0
.Preheader
,
1477 FC0
.Preheader
->getTerminator()->getIterator());
1479 // insert instructions in reverse order to maintain dominance relationship
1480 for (Instruction
*I
: reverse(SinkInsts
)) {
1481 assert(I
->getParent() == FC1
.Preheader
);
1482 I
->moveBefore(*FC1
.ExitBlock
, FC1
.ExitBlock
->getFirstInsertionPt());
1486 /// Determine if two fusion candidates have identical guards
1488 /// This method will determine if two fusion candidates have the same guards.
1489 /// The guards are considered the same if:
1490 /// 1. The instructions to compute the condition used in the compare are
1492 /// 2. The successors of the guard have the same flow into/around the loop.
1493 /// If the compare instructions are identical, then the first successor of the
1494 /// guard must go to the same place (either the preheader of the loop or the
1495 /// NonLoopBlock). In other words, the first successor of both loops must
1496 /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
1497 /// the NonLoopBlock). The same must be true for the second successor.
1498 bool haveIdenticalGuards(const FusionCandidate
&FC0
,
1499 const FusionCandidate
&FC1
) const {
1500 assert(FC0
.GuardBranch
&& FC1
.GuardBranch
&&
1501 "Expecting FC0 and FC1 to be guarded loops.");
1503 if (auto FC0CmpInst
=
1504 dyn_cast
<Instruction
>(FC0
.GuardBranch
->getCondition()))
1505 if (auto FC1CmpInst
=
1506 dyn_cast
<Instruction
>(FC1
.GuardBranch
->getCondition()))
1507 if (!FC0CmpInst
->isIdenticalTo(FC1CmpInst
))
1510 // The compare instructions are identical.
1511 // Now make sure the successor of the guards have the same flow into/around
1513 if (FC0
.GuardBranch
->getSuccessor(0) == FC0
.Preheader
)
1514 return (FC1
.GuardBranch
->getSuccessor(0) == FC1
.Preheader
);
1516 return (FC1
.GuardBranch
->getSuccessor(1) == FC1
.Preheader
);
1519 /// Modify the latch branch of FC to be unconditional since successors of the
1520 /// branch are the same.
1521 void simplifyLatchBranch(const FusionCandidate
&FC
) const {
1522 BranchInst
*FCLatchBranch
= dyn_cast
<BranchInst
>(FC
.Latch
->getTerminator());
1523 if (FCLatchBranch
) {
1524 assert(FCLatchBranch
->isConditional() &&
1525 FCLatchBranch
->getSuccessor(0) == FCLatchBranch
->getSuccessor(1) &&
1526 "Expecting the two successors of FCLatchBranch to be the same");
1527 BranchInst
*NewBranch
=
1528 BranchInst::Create(FCLatchBranch
->getSuccessor(0));
1529 ReplaceInstWithInst(FCLatchBranch
, NewBranch
);
1533 /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique
1534 /// successor, then merge FC0.Latch with its unique successor.
1535 void mergeLatch(const FusionCandidate
&FC0
, const FusionCandidate
&FC1
) {
1536 moveInstructionsToTheBeginning(*FC0
.Latch
, *FC1
.Latch
, DT
, PDT
, DI
);
1537 if (BasicBlock
*Succ
= FC0
.Latch
->getUniqueSuccessor()) {
1538 MergeBlockIntoPredecessor(Succ
, &DTU
, &LI
);
1543 /// Fuse two fusion candidates, creating a new fused loop.
1545 /// This method contains the mechanics of fusing two loops, represented by \p
1546 /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
1547 /// postdominates \p FC0 (making them control flow equivalent). It also
1548 /// assumes that the other conditions for fusion have been met: adjacent,
1549 /// identical trip counts, and no negative distance dependencies exist that
1550 /// would prevent fusion. Thus, there is no checking for these conditions in
1553 /// Fusion is performed by rewiring the CFG to update successor blocks of the
1554 /// components of tho loop. Specifically, the following changes are done:
1556 /// 1. The preheader of \p FC1 is removed as it is no longer necessary
1557 /// (because it is currently only a single statement block).
1558 /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
1559 /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0.
1560 /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
1562 /// All of these modifications are done with dominator tree updates, thus
1563 /// keeping the dominator (and post dominator) information up-to-date.
1565 /// This can be improved in the future by actually merging blocks during
1566 /// fusion. For example, the preheader of \p FC1 can be merged with the
1567 /// preheader of \p FC0. This would allow loops with more than a single
1568 /// statement in the preheader to be fused. Similarly, the latch blocks of the
1569 /// two loops could also be fused into a single block. This will require
1570 /// analysis to prove it is safe to move the contents of the block past
1571 /// existing code, which currently has not been implemented.
1572 Loop
*performFusion(const FusionCandidate
&FC0
, const FusionCandidate
&FC1
) {
1573 assert(FC0
.isValid() && FC1
.isValid() &&
1574 "Expecting valid fusion candidates");
1576 LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0
.dump();
1577 dbgs() << "Fusion Candidate 1: \n"; FC1
.dump(););
1579 // Move instructions from the preheader of FC1 to the end of the preheader
1581 moveInstructionsToTheEnd(*FC1
.Preheader
, *FC0
.Preheader
, DT
, PDT
, DI
);
1583 // Fusing guarded loops is handled slightly differently than non-guarded
1584 // loops and has been broken out into a separate method instead of trying to
1585 // intersperse the logic within a single method.
1586 if (FC0
.GuardBranch
)
1587 return fuseGuardedLoops(FC0
, FC1
);
1589 assert(FC1
.Preheader
==
1590 (FC0
.Peeled
? FC0
.ExitBlock
->getUniqueSuccessor() : FC0
.ExitBlock
));
1591 assert(FC1
.Preheader
->size() == 1 &&
1592 FC1
.Preheader
->getSingleSuccessor() == FC1
.Header
);
1594 // Remember the phi nodes originally in the header of FC0 in order to rewire
1595 // them later. However, this is only necessary if the new loop carried
1596 // values might not dominate the exiting branch. While we do not generally
1597 // test if this is the case but simply insert intermediate phi nodes, we
1598 // need to make sure these intermediate phi nodes have different
1599 // predecessors. To this end, we filter the special case where the exiting
1600 // block is the latch block of the first loop. Nothing needs to be done
1601 // anyway as all loop carried values dominate the latch and thereby also the
1603 SmallVector
<PHINode
*, 8> OriginalFC0PHIs
;
1604 if (FC0
.ExitingBlock
!= FC0
.Latch
)
1605 for (PHINode
&PHI
: FC0
.Header
->phis())
1606 OriginalFC0PHIs
.push_back(&PHI
);
1608 // Replace incoming blocks for header PHIs first.
1609 FC1
.Preheader
->replaceSuccessorsPhiUsesWith(FC0
.Preheader
);
1610 FC0
.Latch
->replaceSuccessorsPhiUsesWith(FC1
.Latch
);
1612 // Then modify the control flow and update DT and PDT.
1613 SmallVector
<DominatorTree::UpdateType
, 8> TreeUpdates
;
1615 // The old exiting block of the first loop (FC0) has to jump to the header
1616 // of the second as we need to execute the code in the second header block
1617 // regardless of the trip count. That is, if the trip count is 0, so the
1618 // back edge is never taken, we still have to execute both loop headers,
1619 // especially (but not only!) if the second is a do-while style loop.
1620 // However, doing so might invalidate the phi nodes of the first loop as
1621 // the new values do only need to dominate their latch and not the exiting
1622 // predicate. To remedy this potential problem we always introduce phi
1623 // nodes in the header of the second loop later that select the loop carried
1624 // value, if the second header was reached through an old latch of the
1625 // first, or undef otherwise. This is sound as exiting the first implies the
1626 // second will exit too, __without__ taking the back-edge. [Their
1627 // trip-counts are equal after all.
1628 // KB: Would this sequence be simpler to just make FC0.ExitingBlock go
1629 // to FC1.Header? I think this is basically what the three sequences are
1630 // trying to accomplish; however, doing this directly in the CFG may mean
1631 // the DT/PDT becomes invalid
1633 FC0
.ExitingBlock
->getTerminator()->replaceUsesOfWith(FC1
.Preheader
,
1635 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1636 DominatorTree::Delete
, FC0
.ExitingBlock
, FC1
.Preheader
));
1637 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1638 DominatorTree::Insert
, FC0
.ExitingBlock
, FC1
.Header
));
1640 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1641 DominatorTree::Delete
, FC0
.ExitBlock
, FC1
.Preheader
));
1643 // Remove the ExitBlock of the first Loop (also not needed)
1644 FC0
.ExitingBlock
->getTerminator()->replaceUsesOfWith(FC0
.ExitBlock
,
1646 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1647 DominatorTree::Delete
, FC0
.ExitingBlock
, FC0
.ExitBlock
));
1648 FC0
.ExitBlock
->getTerminator()->eraseFromParent();
1649 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1650 DominatorTree::Insert
, FC0
.ExitingBlock
, FC1
.Header
));
1651 new UnreachableInst(FC0
.ExitBlock
->getContext(), FC0
.ExitBlock
);
1654 // The pre-header of L1 is not necessary anymore.
1655 assert(pred_empty(FC1
.Preheader
));
1656 FC1
.Preheader
->getTerminator()->eraseFromParent();
1657 new UnreachableInst(FC1
.Preheader
->getContext(), FC1
.Preheader
);
1658 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1659 DominatorTree::Delete
, FC1
.Preheader
, FC1
.Header
));
1661 // Moves the phi nodes from the second to the first loops header block.
1662 while (PHINode
*PHI
= dyn_cast
<PHINode
>(&FC1
.Header
->front())) {
1663 if (SE
.isSCEVable(PHI
->getType()))
1664 SE
.forgetValue(PHI
);
1665 if (PHI
->hasNUsesOrMore(1))
1666 PHI
->moveBefore(&*FC0
.Header
->getFirstInsertionPt());
1668 PHI
->eraseFromParent();
1671 // Introduce new phi nodes in the second loop header to ensure
1672 // exiting the first and jumping to the header of the second does not break
1673 // the SSA property of the phis originally in the first loop. See also the
1675 BasicBlock::iterator L1HeaderIP
= FC1
.Header
->begin();
1676 for (PHINode
*LCPHI
: OriginalFC0PHIs
) {
1677 int L1LatchBBIdx
= LCPHI
->getBasicBlockIndex(FC1
.Latch
);
1678 assert(L1LatchBBIdx
>= 0 &&
1679 "Expected loop carried value to be rewired at this point!");
1681 Value
*LCV
= LCPHI
->getIncomingValue(L1LatchBBIdx
);
1683 PHINode
*L1HeaderPHI
=
1684 PHINode::Create(LCV
->getType(), 2, LCPHI
->getName() + ".afterFC0");
1685 L1HeaderPHI
->insertBefore(L1HeaderIP
);
1686 L1HeaderPHI
->addIncoming(LCV
, FC0
.Latch
);
1687 L1HeaderPHI
->addIncoming(UndefValue::get(LCV
->getType()),
1690 LCPHI
->setIncomingValue(L1LatchBBIdx
, L1HeaderPHI
);
1693 // Replace latch terminator destinations.
1694 FC0
.Latch
->getTerminator()->replaceUsesOfWith(FC0
.Header
, FC1
.Header
);
1695 FC1
.Latch
->getTerminator()->replaceUsesOfWith(FC1
.Header
, FC0
.Header
);
1697 // Modify the latch branch of FC0 to be unconditional as both successors of
1698 // the branch are the same.
1699 simplifyLatchBranch(FC0
);
1701 // If FC0.Latch and FC0.ExitingBlock are the same then we have already
1702 // performed the updates above.
1703 if (FC0
.Latch
!= FC0
.ExitingBlock
)
1704 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1705 DominatorTree::Insert
, FC0
.Latch
, FC1
.Header
));
1707 TreeUpdates
.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete
,
1708 FC0
.Latch
, FC0
.Header
));
1709 TreeUpdates
.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert
,
1710 FC1
.Latch
, FC0
.Header
));
1711 TreeUpdates
.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete
,
1712 FC1
.Latch
, FC1
.Header
));
1715 DTU
.applyUpdates(TreeUpdates
);
1717 LI
.removeBlock(FC1
.Preheader
);
1718 DTU
.deleteBB(FC1
.Preheader
);
1720 LI
.removeBlock(FC0
.ExitBlock
);
1721 DTU
.deleteBB(FC0
.ExitBlock
);
1726 // Is there a way to keep SE up-to-date so we don't need to forget the loops
1727 // and rebuild the information in subsequent passes of fusion?
1728 // Note: Need to forget the loops before merging the loop latches, as
1729 // mergeLatch may remove the only block in FC1.
1730 SE
.forgetLoop(FC1
.L
);
1731 SE
.forgetLoop(FC0
.L
);
1732 SE
.forgetLoopDispositions();
1734 // Move instructions from FC0.Latch to FC1.Latch.
1735 // Note: mergeLatch requires an updated DT.
1736 mergeLatch(FC0
, FC1
);
1739 SmallVector
<BasicBlock
*, 8> Blocks(FC1
.L
->blocks());
1740 for (BasicBlock
*BB
: Blocks
) {
1741 FC0
.L
->addBlockEntry(BB
);
1742 FC1
.L
->removeBlockFromLoop(BB
);
1743 if (LI
.getLoopFor(BB
) != FC1
.L
)
1745 LI
.changeLoopFor(BB
, FC0
.L
);
1747 while (!FC1
.L
->isInnermost()) {
1748 const auto &ChildLoopIt
= FC1
.L
->begin();
1749 Loop
*ChildLoop
= *ChildLoopIt
;
1750 FC1
.L
->removeChildLoop(ChildLoopIt
);
1751 FC0
.L
->addChildLoop(ChildLoop
);
1754 // Delete the now empty loop L1.
1758 assert(!verifyFunction(*FC0
.Header
->getParent(), &errs()));
1759 assert(DT
.verify(DominatorTree::VerificationLevel::Fast
));
1760 assert(PDT
.verify());
1765 LLVM_DEBUG(dbgs() << "Fusion done:\n");
1770 /// Report details on loop fusion opportunities.
1772 /// This template function can be used to report both successful and missed
1773 /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
1775 /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
1776 /// given two valid fusion candidates.
1777 /// - OptimizationRemark to report successful fusion of two fusion
1779 /// The remarks will be printed using the form:
1780 /// <path/filename>:<line number>:<column number>: [<function name>]:
1781 /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
1782 template <typename RemarkKind
>
1783 void reportLoopFusion(const FusionCandidate
&FC0
, const FusionCandidate
&FC1
,
1784 llvm::Statistic
&Stat
) {
1785 assert(FC0
.Preheader
&& FC1
.Preheader
&&
1786 "Expecting valid fusion candidates");
1787 using namespace ore
;
1788 #if LLVM_ENABLE_STATS
1790 ORE
.emit(RemarkKind(DEBUG_TYPE
, Stat
.getName(), FC0
.L
->getStartLoc(),
1792 << "[" << FC0
.Preheader
->getParent()->getName()
1793 << "]: " << NV("Cand1", StringRef(FC0
.Preheader
->getName()))
1794 << " and " << NV("Cand2", StringRef(FC1
.Preheader
->getName()))
1795 << ": " << Stat
.getDesc());
1799 /// Fuse two guarded fusion candidates, creating a new fused loop.
1801 /// Fusing guarded loops is handled much the same way as fusing non-guarded
1802 /// loops. The rewiring of the CFG is slightly different though, because of
1803 /// the presence of the guards around the loops and the exit blocks after the
1804 /// loop body. As such, the new loop is rewired as follows:
1805 /// 1. Keep the guard branch from FC0 and use the non-loop block target
1806 /// from the FC1 guard branch.
1807 /// 2. Remove the exit block from FC0 (this exit block should be empty
1809 /// 3. Remove the guard branch for FC1
1810 /// 4. Remove the preheader for FC1.
1811 /// The exit block successor for the latch of FC0 is updated to be the header
1812 /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
1813 /// be the header of FC0, thus creating the fused loop.
1814 Loop
*fuseGuardedLoops(const FusionCandidate
&FC0
,
1815 const FusionCandidate
&FC1
) {
1816 assert(FC0
.GuardBranch
&& FC1
.GuardBranch
&& "Expecting guarded loops");
1818 BasicBlock
*FC0GuardBlock
= FC0
.GuardBranch
->getParent();
1819 BasicBlock
*FC1GuardBlock
= FC1
.GuardBranch
->getParent();
1820 BasicBlock
*FC0NonLoopBlock
= FC0
.getNonLoopBlock();
1821 BasicBlock
*FC1NonLoopBlock
= FC1
.getNonLoopBlock();
1822 BasicBlock
*FC0ExitBlockSuccessor
= FC0
.ExitBlock
->getUniqueSuccessor();
1824 // Move instructions from the exit block of FC0 to the beginning of the exit
1825 // block of FC1, in the case that the FC0 loop has not been peeled. In the
1826 // case that FC0 loop is peeled, then move the instructions of the successor
1827 // of the FC0 Exit block to the beginning of the exit block of FC1.
1828 moveInstructionsToTheBeginning(
1829 (FC0
.Peeled
? *FC0ExitBlockSuccessor
: *FC0
.ExitBlock
), *FC1
.ExitBlock
,
1832 // Move instructions from the guard block of FC1 to the end of the guard
1834 moveInstructionsToTheEnd(*FC1GuardBlock
, *FC0GuardBlock
, DT
, PDT
, DI
);
1836 assert(FC0NonLoopBlock
== FC1GuardBlock
&& "Loops are not adjacent");
1838 SmallVector
<DominatorTree::UpdateType
, 8> TreeUpdates
;
1840 ////////////////////////////////////////////////////////////////////////////
1841 // Update the Loop Guard
1842 ////////////////////////////////////////////////////////////////////////////
1843 // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
1844 // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
1845 // Thus, one path from the guard goes to the preheader for FC0 (and thus
1846 // executes the new fused loop) and the other path goes to the NonLoopBlock
1847 // for FC1 (where FC1 guard would have gone if FC1 was not executed).
1848 FC1NonLoopBlock
->replacePhiUsesWith(FC1GuardBlock
, FC0GuardBlock
);
1849 FC0
.GuardBranch
->replaceUsesOfWith(FC0NonLoopBlock
, FC1NonLoopBlock
);
1851 BasicBlock
*BBToUpdate
= FC0
.Peeled
? FC0ExitBlockSuccessor
: FC0
.ExitBlock
;
1852 BBToUpdate
->getTerminator()->replaceUsesOfWith(FC1GuardBlock
, FC1
.Header
);
1854 // The guard of FC1 is not necessary anymore.
1855 FC1
.GuardBranch
->eraseFromParent();
1856 new UnreachableInst(FC1GuardBlock
->getContext(), FC1GuardBlock
);
1858 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1859 DominatorTree::Delete
, FC1GuardBlock
, FC1
.Preheader
));
1860 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1861 DominatorTree::Delete
, FC1GuardBlock
, FC1NonLoopBlock
));
1862 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1863 DominatorTree::Delete
, FC0GuardBlock
, FC1GuardBlock
));
1864 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1865 DominatorTree::Insert
, FC0GuardBlock
, FC1NonLoopBlock
));
1868 // Remove the Block after the ExitBlock of FC0
1869 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1870 DominatorTree::Delete
, FC0ExitBlockSuccessor
, FC1GuardBlock
));
1871 FC0ExitBlockSuccessor
->getTerminator()->eraseFromParent();
1872 new UnreachableInst(FC0ExitBlockSuccessor
->getContext(),
1873 FC0ExitBlockSuccessor
);
1876 assert(pred_empty(FC1GuardBlock
) &&
1877 "Expecting guard block to have no predecessors");
1878 assert(succ_empty(FC1GuardBlock
) &&
1879 "Expecting guard block to have no successors");
1881 // Remember the phi nodes originally in the header of FC0 in order to rewire
1882 // them later. However, this is only necessary if the new loop carried
1883 // values might not dominate the exiting branch. While we do not generally
1884 // test if this is the case but simply insert intermediate phi nodes, we
1885 // need to make sure these intermediate phi nodes have different
1886 // predecessors. To this end, we filter the special case where the exiting
1887 // block is the latch block of the first loop. Nothing needs to be done
1888 // anyway as all loop carried values dominate the latch and thereby also the
1890 // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
1891 // (because the loops are rotated. Thus, nothing will ever be added to
1893 SmallVector
<PHINode
*, 8> OriginalFC0PHIs
;
1894 if (FC0
.ExitingBlock
!= FC0
.Latch
)
1895 for (PHINode
&PHI
: FC0
.Header
->phis())
1896 OriginalFC0PHIs
.push_back(&PHI
);
1898 assert(OriginalFC0PHIs
.empty() && "Expecting OriginalFC0PHIs to be empty!");
1900 // Replace incoming blocks for header PHIs first.
1901 FC1
.Preheader
->replaceSuccessorsPhiUsesWith(FC0
.Preheader
);
1902 FC0
.Latch
->replaceSuccessorsPhiUsesWith(FC1
.Latch
);
1904 // The old exiting block of the first loop (FC0) has to jump to the header
1905 // of the second as we need to execute the code in the second header block
1906 // regardless of the trip count. That is, if the trip count is 0, so the
1907 // back edge is never taken, we still have to execute both loop headers,
1908 // especially (but not only!) if the second is a do-while style loop.
1909 // However, doing so might invalidate the phi nodes of the first loop as
1910 // the new values do only need to dominate their latch and not the exiting
1911 // predicate. To remedy this potential problem we always introduce phi
1912 // nodes in the header of the second loop later that select the loop carried
1913 // value, if the second header was reached through an old latch of the
1914 // first, or undef otherwise. This is sound as exiting the first implies the
1915 // second will exit too, __without__ taking the back-edge (their
1916 // trip-counts are equal after all).
1917 FC0
.ExitingBlock
->getTerminator()->replaceUsesOfWith(FC0
.ExitBlock
,
1920 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1921 DominatorTree::Delete
, FC0
.ExitingBlock
, FC0
.ExitBlock
));
1922 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1923 DominatorTree::Insert
, FC0
.ExitingBlock
, FC1
.Header
));
1925 // Remove FC0 Exit Block
1926 // The exit block for FC0 is no longer needed since control will flow
1927 // directly to the header of FC1. Since it is an empty block, it can be
1928 // removed at this point.
1929 // TODO: In the future, we can handle non-empty exit blocks my merging any
1930 // instructions from FC0 exit block into FC1 exit block prior to removing
1932 assert(pred_empty(FC0
.ExitBlock
) && "Expecting exit block to be empty");
1933 FC0
.ExitBlock
->getTerminator()->eraseFromParent();
1934 new UnreachableInst(FC0
.ExitBlock
->getContext(), FC0
.ExitBlock
);
1936 // Remove FC1 Preheader
1937 // The pre-header of L1 is not necessary anymore.
1938 assert(pred_empty(FC1
.Preheader
));
1939 FC1
.Preheader
->getTerminator()->eraseFromParent();
1940 new UnreachableInst(FC1
.Preheader
->getContext(), FC1
.Preheader
);
1941 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1942 DominatorTree::Delete
, FC1
.Preheader
, FC1
.Header
));
1944 // Moves the phi nodes from the second to the first loops header block.
1945 while (PHINode
*PHI
= dyn_cast
<PHINode
>(&FC1
.Header
->front())) {
1946 if (SE
.isSCEVable(PHI
->getType()))
1947 SE
.forgetValue(PHI
);
1948 if (PHI
->hasNUsesOrMore(1))
1949 PHI
->moveBefore(&*FC0
.Header
->getFirstInsertionPt());
1951 PHI
->eraseFromParent();
1954 // Introduce new phi nodes in the second loop header to ensure
1955 // exiting the first and jumping to the header of the second does not break
1956 // the SSA property of the phis originally in the first loop. See also the
1958 BasicBlock::iterator L1HeaderIP
= FC1
.Header
->begin();
1959 for (PHINode
*LCPHI
: OriginalFC0PHIs
) {
1960 int L1LatchBBIdx
= LCPHI
->getBasicBlockIndex(FC1
.Latch
);
1961 assert(L1LatchBBIdx
>= 0 &&
1962 "Expected loop carried value to be rewired at this point!");
1964 Value
*LCV
= LCPHI
->getIncomingValue(L1LatchBBIdx
);
1966 PHINode
*L1HeaderPHI
=
1967 PHINode::Create(LCV
->getType(), 2, LCPHI
->getName() + ".afterFC0");
1968 L1HeaderPHI
->insertBefore(L1HeaderIP
);
1969 L1HeaderPHI
->addIncoming(LCV
, FC0
.Latch
);
1970 L1HeaderPHI
->addIncoming(UndefValue::get(LCV
->getType()),
1973 LCPHI
->setIncomingValue(L1LatchBBIdx
, L1HeaderPHI
);
1976 // Update the latches
1978 // Replace latch terminator destinations.
1979 FC0
.Latch
->getTerminator()->replaceUsesOfWith(FC0
.Header
, FC1
.Header
);
1980 FC1
.Latch
->getTerminator()->replaceUsesOfWith(FC1
.Header
, FC0
.Header
);
1982 // Modify the latch branch of FC0 to be unconditional as both successors of
1983 // the branch are the same.
1984 simplifyLatchBranch(FC0
);
1986 // If FC0.Latch and FC0.ExitingBlock are the same then we have already
1987 // performed the updates above.
1988 if (FC0
.Latch
!= FC0
.ExitingBlock
)
1989 TreeUpdates
.emplace_back(DominatorTree::UpdateType(
1990 DominatorTree::Insert
, FC0
.Latch
, FC1
.Header
));
1992 TreeUpdates
.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete
,
1993 FC0
.Latch
, FC0
.Header
));
1994 TreeUpdates
.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert
,
1995 FC1
.Latch
, FC0
.Header
));
1996 TreeUpdates
.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete
,
1997 FC1
.Latch
, FC1
.Header
));
2000 // Apply the updates to the Dominator Tree and cleanup.
2002 assert(succ_empty(FC1GuardBlock
) && "FC1GuardBlock has successors!!");
2003 assert(pred_empty(FC1GuardBlock
) && "FC1GuardBlock has predecessors!!");
2006 DTU
.applyUpdates(TreeUpdates
);
2008 LI
.removeBlock(FC1GuardBlock
);
2009 LI
.removeBlock(FC1
.Preheader
);
2010 LI
.removeBlock(FC0
.ExitBlock
);
2012 LI
.removeBlock(FC0ExitBlockSuccessor
);
2013 DTU
.deleteBB(FC0ExitBlockSuccessor
);
2015 DTU
.deleteBB(FC1GuardBlock
);
2016 DTU
.deleteBB(FC1
.Preheader
);
2017 DTU
.deleteBB(FC0
.ExitBlock
);
2020 // Is there a way to keep SE up-to-date so we don't need to forget the loops
2021 // and rebuild the information in subsequent passes of fusion?
2022 // Note: Need to forget the loops before merging the loop latches, as
2023 // mergeLatch may remove the only block in FC1.
2024 SE
.forgetLoop(FC1
.L
);
2025 SE
.forgetLoop(FC0
.L
);
2026 SE
.forgetLoopDispositions();
2028 // Move instructions from FC0.Latch to FC1.Latch.
2029 // Note: mergeLatch requires an updated DT.
2030 mergeLatch(FC0
, FC1
);
2033 SmallVector
<BasicBlock
*, 8> Blocks(FC1
.L
->blocks());
2034 for (BasicBlock
*BB
: Blocks
) {
2035 FC0
.L
->addBlockEntry(BB
);
2036 FC1
.L
->removeBlockFromLoop(BB
);
2037 if (LI
.getLoopFor(BB
) != FC1
.L
)
2039 LI
.changeLoopFor(BB
, FC0
.L
);
2041 while (!FC1
.L
->isInnermost()) {
2042 const auto &ChildLoopIt
= FC1
.L
->begin();
2043 Loop
*ChildLoop
= *ChildLoopIt
;
2044 FC1
.L
->removeChildLoop(ChildLoopIt
);
2045 FC0
.L
->addChildLoop(ChildLoop
);
2048 // Delete the now empty loop L1.
2052 assert(!verifyFunction(*FC0
.Header
->getParent(), &errs()));
2053 assert(DT
.verify(DominatorTree::VerificationLevel::Fast
));
2054 assert(PDT
.verify());
2059 LLVM_DEBUG(dbgs() << "Fusion done:\n");
2066 PreservedAnalyses
LoopFusePass::run(Function
&F
, FunctionAnalysisManager
&AM
) {
2067 auto &LI
= AM
.getResult
<LoopAnalysis
>(F
);
2068 auto &DT
= AM
.getResult
<DominatorTreeAnalysis
>(F
);
2069 auto &DI
= AM
.getResult
<DependenceAnalysis
>(F
);
2070 auto &SE
= AM
.getResult
<ScalarEvolutionAnalysis
>(F
);
2071 auto &PDT
= AM
.getResult
<PostDominatorTreeAnalysis
>(F
);
2072 auto &ORE
= AM
.getResult
<OptimizationRemarkEmitterAnalysis
>(F
);
2073 auto &AC
= AM
.getResult
<AssumptionAnalysis
>(F
);
2074 const TargetTransformInfo
&TTI
= AM
.getResult
<TargetIRAnalysis
>(F
);
2075 const DataLayout
&DL
= F
.getParent()->getDataLayout();
2077 // Ensure loops are in simplifed form which is a pre-requisite for loop fusion
2078 // pass. Added only for new PM since the legacy PM has already added
2079 // LoopSimplify pass as a dependency.
2080 bool Changed
= false;
2081 for (auto &L
: LI
) {
2083 simplifyLoop(L
, &DT
, &LI
, &SE
, &AC
, nullptr, false /* PreserveLCSSA */);
2088 LoopFuser
LF(LI
, DT
, DI
, SE
, PDT
, ORE
, DL
, AC
, TTI
);
2089 Changed
|= LF
.fuseLoops(F
);
2091 return PreservedAnalyses::all();
2093 PreservedAnalyses PA
;
2094 PA
.preserve
<DominatorTreeAnalysis
>();
2095 PA
.preserve
<PostDominatorTreeAnalysis
>();
2096 PA
.preserve
<ScalarEvolutionAnalysis
>();
2097 PA
.preserve
<LoopAnalysis
>();