1 //===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass does the inverse transformation of what LICM does.
10 // It traverses all of the instructions in the loop's preheader and sinks
11 // them to the loop body where frequency is lower than the loop's preheader.
12 // This pass is a reverse-transformation of LICM. It differs from the Sink
13 // pass in the following ways:
15 // * It only handles sinking of instructions from the loop's preheader to the
17 // * It uses alias set tracker to get more accurate alias info
18 // * It uses block frequency info to find the optimal sinking locations
22 // For I in Preheader:
23 // InsertBBs = BBs that uses I
24 // For BB in sorted(LoopBBs):
25 // DomBBs = BBs in InsertBBs that are dominated by BB
26 // if freq(DomBBs) > freq(BB)
27 // InsertBBs = UseBBs - DomBBs + BB
28 // For BB in InsertBBs:
29 // Insert I at BB's beginning
31 //===----------------------------------------------------------------------===//
33 #include "llvm/Transforms/Scalar/LoopSink.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/Analysis/AliasAnalysis.h"
36 #include "llvm/Analysis/AliasSetTracker.h"
37 #include "llvm/Analysis/BasicAliasAnalysis.h"
38 #include "llvm/Analysis/BlockFrequencyInfo.h"
39 #include "llvm/Analysis/Loads.h"
40 #include "llvm/Analysis/LoopInfo.h"
41 #include "llvm/Analysis/LoopPass.h"
42 #include "llvm/Analysis/ScalarEvolution.h"
43 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
44 #include "llvm/Transforms/Utils/Local.h"
45 #include "llvm/IR/Dominators.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/LLVMContext.h"
48 #include "llvm/IR/Metadata.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Transforms/Scalar.h"
51 #include "llvm/Transforms/Scalar/LoopPassManager.h"
52 #include "llvm/Transforms/Utils/LoopUtils.h"
55 #define DEBUG_TYPE "loopsink"
57 STATISTIC(NumLoopSunk
, "Number of instructions sunk into loop");
58 STATISTIC(NumLoopSunkCloned
, "Number of cloned instructions sunk into loop");
60 static cl::opt
<unsigned> SinkFrequencyPercentThreshold(
61 "sink-freq-percent-threshold", cl::Hidden
, cl::init(90),
62 cl::desc("Do not sink instructions that require cloning unless they "
63 "execute less than this percent of the time."));
65 static cl::opt
<unsigned> MaxNumberOfUseBBsForSinking(
66 "max-uses-for-sinking", cl::Hidden
, cl::init(30),
67 cl::desc("Do not sink instructions that have too many uses."));
69 /// Return adjusted total frequency of \p BBs.
71 /// * If there is only one BB, sinking instruction will not introduce code
72 /// size increase. Thus there is no need to adjust the frequency.
73 /// * If there are more than one BB, sinking would lead to code size increase.
74 /// In this case, we add some "tax" to the total frequency to make it harder
76 /// Freq(Preheader) = 100
77 /// Freq(BBs) = sum(50, 49) = 99
78 /// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to
79 /// BBs as the difference is too small to justify the code size increase.
80 /// To model this, The adjusted Freq(BBs) will be:
81 /// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
82 static BlockFrequency
adjustedSumFreq(SmallPtrSetImpl
<BasicBlock
*> &BBs
,
83 BlockFrequencyInfo
&BFI
) {
85 for (BasicBlock
*B
: BBs
)
86 T
+= BFI
.getBlockFreq(B
);
88 T
/= BranchProbability(SinkFrequencyPercentThreshold
, 100);
92 /// Return a set of basic blocks to insert sinked instructions.
94 /// The returned set of basic blocks (BBsToSinkInto) should satisfy:
96 /// * Inside the loop \p L
97 /// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
98 /// that domintates the UseBB
99 /// * Has minimum total frequency that is no greater than preheader frequency
101 /// The purpose of the function is to find the optimal sinking points to
102 /// minimize execution cost, which is defined as "sum of frequency of
104 /// As a result, the returned BBsToSinkInto needs to have minimum total
106 /// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
107 /// frequency, the optimal solution is not sinking (return empty set).
109 /// \p ColdLoopBBs is used to help find the optimal sinking locations.
110 /// It stores a list of BBs that is:
112 /// * Inside the loop \p L
113 /// * Has a frequency no larger than the loop's preheader
114 /// * Sorted by BB frequency
116 /// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
117 /// To avoid expensive computation, we cap the maximum UseBBs.size() in its
119 static SmallPtrSet
<BasicBlock
*, 2>
120 findBBsToSinkInto(const Loop
&L
, const SmallPtrSetImpl
<BasicBlock
*> &UseBBs
,
121 const SmallVectorImpl
<BasicBlock
*> &ColdLoopBBs
,
122 DominatorTree
&DT
, BlockFrequencyInfo
&BFI
) {
123 SmallPtrSet
<BasicBlock
*, 2> BBsToSinkInto
;
124 if (UseBBs
.size() == 0)
125 return BBsToSinkInto
;
127 BBsToSinkInto
.insert(UseBBs
.begin(), UseBBs
.end());
128 SmallPtrSet
<BasicBlock
*, 2> BBsDominatedByColdestBB
;
130 // For every iteration:
131 // * Pick the ColdestBB from ColdLoopBBs
132 // * Find the set BBsDominatedByColdestBB that satisfy:
133 // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
134 // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
135 // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
136 // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
138 for (BasicBlock
*ColdestBB
: ColdLoopBBs
) {
139 BBsDominatedByColdestBB
.clear();
140 for (BasicBlock
*SinkedBB
: BBsToSinkInto
)
141 if (DT
.dominates(ColdestBB
, SinkedBB
))
142 BBsDominatedByColdestBB
.insert(SinkedBB
);
143 if (BBsDominatedByColdestBB
.size() == 0)
145 if (adjustedSumFreq(BBsDominatedByColdestBB
, BFI
) >
146 BFI
.getBlockFreq(ColdestBB
)) {
147 for (BasicBlock
*DominatedBB
: BBsDominatedByColdestBB
) {
148 BBsToSinkInto
.erase(DominatedBB
);
150 BBsToSinkInto
.insert(ColdestBB
);
154 // Can't sink into blocks that have no valid insertion point.
155 for (BasicBlock
*BB
: BBsToSinkInto
) {
156 if (BB
->getFirstInsertionPt() == BB
->end()) {
157 BBsToSinkInto
.clear();
162 // If the total frequency of BBsToSinkInto is larger than preheader frequency,
164 if (adjustedSumFreq(BBsToSinkInto
, BFI
) >
165 BFI
.getBlockFreq(L
.getLoopPreheader()))
166 BBsToSinkInto
.clear();
167 return BBsToSinkInto
;
170 // Sinks \p I from the loop \p L's preheader to its uses. Returns true if
171 // sinking is successful.
172 // \p LoopBlockNumber is used to sort the insertion blocks to ensure
174 static bool sinkInstruction(Loop
&L
, Instruction
&I
,
175 const SmallVectorImpl
<BasicBlock
*> &ColdLoopBBs
,
176 const SmallDenseMap
<BasicBlock
*, int, 16> &LoopBlockNumber
,
177 LoopInfo
&LI
, DominatorTree
&DT
,
178 BlockFrequencyInfo
&BFI
) {
179 // Compute the set of blocks in loop L which contain a use of I.
180 SmallPtrSet
<BasicBlock
*, 2> BBs
;
181 for (auto &U
: I
.uses()) {
182 Instruction
*UI
= cast
<Instruction
>(U
.getUser());
183 // We cannot sink I to PHI-uses.
184 if (dyn_cast
<PHINode
>(UI
))
186 // We cannot sink I if it has uses outside of the loop.
187 if (!L
.contains(LI
.getLoopFor(UI
->getParent())))
189 BBs
.insert(UI
->getParent());
192 // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
193 // BBs.size() to avoid expensive computation.
194 // FIXME: Handle code size growth for min_size and opt_size.
195 if (BBs
.size() > MaxNumberOfUseBBsForSinking
)
198 // Find the set of BBs that we should insert a copy of I.
199 SmallPtrSet
<BasicBlock
*, 2> BBsToSinkInto
=
200 findBBsToSinkInto(L
, BBs
, ColdLoopBBs
, DT
, BFI
);
201 if (BBsToSinkInto
.empty())
204 // Return if any of the candidate blocks to sink into is non-cold.
205 if (BBsToSinkInto
.size() > 1) {
206 for (auto *BB
: BBsToSinkInto
)
207 if (!LoopBlockNumber
.count(BB
))
211 // Copy the final BBs into a vector and sort them using the total ordering
212 // of the loop block numbers as iterating the set doesn't give a useful
213 // order. No need to stable sort as the block numbers are a total ordering.
214 SmallVector
<BasicBlock
*, 2> SortedBBsToSinkInto
;
215 SortedBBsToSinkInto
.insert(SortedBBsToSinkInto
.begin(), BBsToSinkInto
.begin(),
216 BBsToSinkInto
.end());
217 llvm::sort(SortedBBsToSinkInto
, [&](BasicBlock
*A
, BasicBlock
*B
) {
218 return LoopBlockNumber
.find(A
)->second
< LoopBlockNumber
.find(B
)->second
;
221 BasicBlock
*MoveBB
= *SortedBBsToSinkInto
.begin();
222 // FIXME: Optimize the efficiency for cloned value replacement. The current
223 // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
224 for (BasicBlock
*N
: makeArrayRef(SortedBBsToSinkInto
).drop_front(1)) {
225 assert(LoopBlockNumber
.find(N
)->second
>
226 LoopBlockNumber
.find(MoveBB
)->second
&&
228 // Clone I and replace its uses.
229 Instruction
*IC
= I
.clone();
230 IC
->setName(I
.getName());
231 IC
->insertBefore(&*N
->getFirstInsertionPt());
232 // Replaces uses of I with IC in N
233 I
.replaceUsesWithIf(IC
, [N
](Use
&U
) {
234 return cast
<Instruction
>(U
.getUser())->getParent() == N
;
236 // Replaces uses of I with IC in blocks dominated by N
237 replaceDominatedUsesWith(&I
, IC
, DT
, N
);
238 LLVM_DEBUG(dbgs() << "Sinking a clone of " << I
<< " To: " << N
->getName()
242 LLVM_DEBUG(dbgs() << "Sinking " << I
<< " To: " << MoveBB
->getName() << '\n');
244 I
.moveBefore(&*MoveBB
->getFirstInsertionPt());
249 /// Sinks instructions from loop's preheader to the loop body if the
250 /// sum frequency of inserted copy is smaller than preheader's frequency.
251 static bool sinkLoopInvariantInstructions(Loop
&L
, AAResults
&AA
, LoopInfo
&LI
,
253 BlockFrequencyInfo
&BFI
,
254 ScalarEvolution
*SE
) {
255 BasicBlock
*Preheader
= L
.getLoopPreheader();
259 // Enable LoopSink only when runtime profile is available.
260 // With static profile, the sinking decision may be sub-optimal.
261 if (!Preheader
->getParent()->hasProfileData())
264 const BlockFrequency PreheaderFreq
= BFI
.getBlockFreq(Preheader
);
265 // If there are no basic blocks with lower frequency than the preheader then
266 // we can avoid the detailed analysis as we will never find profitable sinking
268 if (all_of(L
.blocks(), [&](const BasicBlock
*BB
) {
269 return BFI
.getBlockFreq(BB
) > PreheaderFreq
;
273 bool Changed
= false;
274 AliasSetTracker
CurAST(AA
);
276 // Compute alias set.
277 for (BasicBlock
*BB
: L
.blocks())
279 CurAST
.add(*Preheader
);
281 // Sort loop's basic blocks by frequency
282 SmallVector
<BasicBlock
*, 10> ColdLoopBBs
;
283 SmallDenseMap
<BasicBlock
*, int, 16> LoopBlockNumber
;
285 for (BasicBlock
*B
: L
.blocks())
286 if (BFI
.getBlockFreq(B
) < BFI
.getBlockFreq(L
.getLoopPreheader())) {
287 ColdLoopBBs
.push_back(B
);
288 LoopBlockNumber
[B
] = ++i
;
290 llvm::stable_sort(ColdLoopBBs
, [&](BasicBlock
*A
, BasicBlock
*B
) {
291 return BFI
.getBlockFreq(A
) < BFI
.getBlockFreq(B
);
294 // Traverse preheader's instructions in reverse order becaue if A depends
295 // on B (A appears after B), A needs to be sinked first before B can be
297 for (auto II
= Preheader
->rbegin(), E
= Preheader
->rend(); II
!= E
;) {
298 Instruction
*I
= &*II
++;
299 // No need to check for instruction's operands are loop invariant.
300 assert(L
.hasLoopInvariantOperands(I
) &&
301 "Insts in a loop's preheader should have loop invariant operands!");
302 if (!canSinkOrHoistInst(*I
, &AA
, &DT
, &L
, &CurAST
, nullptr, false))
304 if (sinkInstruction(L
, *I
, ColdLoopBBs
, LoopBlockNumber
, LI
, DT
, BFI
))
309 SE
->forgetLoopDispositions(&L
);
313 PreservedAnalyses
LoopSinkPass::run(Function
&F
, FunctionAnalysisManager
&FAM
) {
314 LoopInfo
&LI
= FAM
.getResult
<LoopAnalysis
>(F
);
315 // Nothing to do if there are no loops.
317 return PreservedAnalyses::all();
319 AAResults
&AA
= FAM
.getResult
<AAManager
>(F
);
320 DominatorTree
&DT
= FAM
.getResult
<DominatorTreeAnalysis
>(F
);
321 BlockFrequencyInfo
&BFI
= FAM
.getResult
<BlockFrequencyAnalysis
>(F
);
323 // We want to do a postorder walk over the loops. Since loops are a tree this
324 // is equivalent to a reversed preorder walk and preorder is easy to compute
325 // without recursion. Since we reverse the preorder, we will visit siblings
326 // in reverse program order. This isn't expected to matter at all but is more
327 // consistent with sinking algorithms which generally work bottom-up.
328 SmallVector
<Loop
*, 4> PreorderLoops
= LI
.getLoopsInPreorder();
330 bool Changed
= false;
332 Loop
&L
= *PreorderLoops
.pop_back_val();
334 // Note that we don't pass SCEV here because it is only used to invalidate
335 // loops in SCEV and we don't preserve (or request) SCEV at all making that
337 Changed
|= sinkLoopInvariantInstructions(L
, AA
, LI
, DT
, BFI
,
338 /*ScalarEvolution*/ nullptr);
339 } while (!PreorderLoops
.empty());
342 return PreservedAnalyses::all();
344 PreservedAnalyses PA
;
345 PA
.preserveSet
<CFGAnalyses
>();
350 struct LegacyLoopSinkPass
: public LoopPass
{
352 LegacyLoopSinkPass() : LoopPass(ID
) {
353 initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
356 bool runOnLoop(Loop
*L
, LPPassManager
&LPM
) override
{
360 auto *SE
= getAnalysisIfAvailable
<ScalarEvolutionWrapperPass
>();
361 return sinkLoopInvariantInstructions(
362 *L
, getAnalysis
<AAResultsWrapperPass
>().getAAResults(),
363 getAnalysis
<LoopInfoWrapperPass
>().getLoopInfo(),
364 getAnalysis
<DominatorTreeWrapperPass
>().getDomTree(),
365 getAnalysis
<BlockFrequencyInfoWrapperPass
>().getBFI(),
366 SE
? &SE
->getSE() : nullptr);
369 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
370 AU
.setPreservesCFG();
371 AU
.addRequired
<BlockFrequencyInfoWrapperPass
>();
372 getLoopAnalysisUsage(AU
);
377 char LegacyLoopSinkPass::ID
= 0;
378 INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass
, "loop-sink", "Loop Sink", false,
380 INITIALIZE_PASS_DEPENDENCY(LoopPass
)
381 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass
)
382 INITIALIZE_PASS_END(LegacyLoopSinkPass
, "loop-sink", "Loop Sink", false, false)
384 Pass
*llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }