1 //===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass implements an unroll and jam pass. Most of the work is done by
10 // Utils/UnrollLoopAndJam.cpp.
11 //===----------------------------------------------------------------------===//
13 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/PriorityWorklist.h"
16 #include "llvm/ADT/SmallPtrSet.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/Analysis/AssumptionCache.h"
19 #include "llvm/Analysis/CodeMetrics.h"
20 #include "llvm/Analysis/DependenceAnalysis.h"
21 #include "llvm/Analysis/LoopAnalysisManager.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/LoopNestAnalysis.h"
24 #include "llvm/Analysis/LoopPass.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/ScalarEvolution.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/IR/BasicBlock.h"
29 #include "llvm/IR/Constants.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/Function.h"
32 #include "llvm/IR/Instructions.h"
33 #include "llvm/IR/Metadata.h"
34 #include "llvm/IR/PassManager.h"
35 #include "llvm/Support/Casting.h"
36 #include "llvm/Support/CommandLine.h"
37 #include "llvm/Support/Compiler.h"
38 #include "llvm/Support/Debug.h"
39 #include "llvm/Support/raw_ostream.h"
40 #include "llvm/Transforms/Scalar/LoopPassManager.h"
41 #include "llvm/Transforms/Utils/LoopPeel.h"
42 #include "llvm/Transforms/Utils/LoopUtils.h"
43 #include "llvm/Transforms/Utils/UnrollLoop.h"
54 #define DEBUG_TYPE "loop-unroll-and-jam"
/// @{
/// Metadata attribute names
///
/// Names of the "followup" loop attributes. After the transformation they are
/// handed to makeFollowupLoopID() to build the loop IDs of the resulting loops.

/// Passed to every makeFollowupLoopID() call in this file.
static const char *const LLVMLoopUnrollAndJamFollowupAll =
    "llvm.loop.unroll_and_jam.followup_all";
/// Used for the inner loop once the transformation has run.
static const char *const LLVMLoopUnrollAndJamFollowupInner =
    "llvm.loop.unroll_and_jam.followup_inner";
/// Used for the outer loop when it was only partially unrolled.
static const char *const LLVMLoopUnrollAndJamFollowupOuter =
    "llvm.loop.unroll_and_jam.followup_outer";
/// Used for the inner loop before unrolling, so that it ends up on the inner
/// loops of the epilogue.
static const char *const LLVMLoopUnrollAndJamFollowupRemainderInner =
    "llvm.loop.unroll_and_jam.followup_remainder_inner";
/// Used for the outer loop of the epilogue, if one was created.
static const char *const LLVMLoopUnrollAndJamFollowupRemainderOuter =
    "llvm.loop.unroll_and_jam.followup_remainder_outer";
/// @}
71 AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden
,
72 cl::desc("Allows loops to be unroll-and-jammed."));
74 static cl::opt
<unsigned> UnrollAndJamCount(
75 "unroll-and-jam-count", cl::Hidden
,
76 cl::desc("Use this unroll count for all loops including those with "
77 "unroll_and_jam_count pragma values, for testing purposes"));
79 static cl::opt
<unsigned> UnrollAndJamThreshold(
80 "unroll-and-jam-threshold", cl::init(60), cl::Hidden
,
81 cl::desc("Threshold to use for inner loop when doing unroll and jam."));
83 static cl::opt
<unsigned> PragmaUnrollAndJamThreshold(
84 "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden
,
85 cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
86 "unroll_count pragma."));
88 // Returns the loop hint metadata node with the given name (for example,
89 // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
91 static MDNode
*getUnrollMetadataForLoop(const Loop
*L
, StringRef Name
) {
92 if (MDNode
*LoopID
= L
->getLoopID())
93 return GetUnrollMetadata(LoopID
, Name
);
97 // Returns true if the loop has any metadata starting with Prefix. For example a
98 // Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
99 static bool hasAnyUnrollPragma(const Loop
*L
, StringRef Prefix
) {
100 if (MDNode
*LoopID
= L
->getLoopID()) {
101 // First operand should refer to the loop id itself.
102 assert(LoopID
->getNumOperands() > 0 && "requires at least one operand");
103 assert(LoopID
->getOperand(0) == LoopID
&& "invalid loop id");
105 for (unsigned I
= 1, E
= LoopID
->getNumOperands(); I
< E
; ++I
) {
106 MDNode
*MD
= dyn_cast
<MDNode
>(LoopID
->getOperand(I
));
110 MDString
*S
= dyn_cast
<MDString
>(MD
->getOperand(0));
114 if (S
->getString().starts_with(Prefix
))
121 // Returns true if the loop has an unroll_and_jam(enable) pragma.
122 static bool hasUnrollAndJamEnablePragma(const Loop
*L
) {
123 return getUnrollMetadataForLoop(L
, "llvm.loop.unroll_and_jam.enable");
126 // If loop has an unroll_and_jam_count pragma return the (necessarily
127 // positive) value from the pragma. Otherwise return 0.
128 static unsigned unrollAndJamCountPragmaValue(const Loop
*L
) {
129 MDNode
*MD
= getUnrollMetadataForLoop(L
, "llvm.loop.unroll_and_jam.count");
131 assert(MD
->getNumOperands() == 2 &&
132 "Unroll count hint metadata should have two operands.");
134 mdconst::extract
<ConstantInt
>(MD
->getOperand(1))->getZExtValue();
135 assert(Count
>= 1 && "Unroll count must be positive.");
141 // Returns loop size estimation for unrolled loop.
143 getUnrollAndJammedLoopSize(unsigned LoopSize
,
144 TargetTransformInfo::UnrollingPreferences
&UP
) {
145 assert(LoopSize
>= UP
.BEInsns
&& "LoopSize should not be less than BEInsns!");
146 return static_cast<uint64_t>(LoopSize
- UP
.BEInsns
) * UP
.Count
+ UP
.BEInsns
;
149 // Calculates unroll and jam count and writes it to UP.Count. Returns true if
150 // unroll count was set explicitly.
151 static bool computeUnrollAndJamCount(
152 Loop
*L
, Loop
*SubLoop
, const TargetTransformInfo
&TTI
, DominatorTree
&DT
,
153 LoopInfo
*LI
, AssumptionCache
*AC
, ScalarEvolution
&SE
,
154 const SmallPtrSetImpl
<const Value
*> &EphValues
,
155 OptimizationRemarkEmitter
*ORE
, unsigned OuterTripCount
,
156 unsigned OuterTripMultiple
, const UnrollCostEstimator
&OuterUCE
,
157 unsigned InnerTripCount
, unsigned InnerLoopSize
,
158 TargetTransformInfo::UnrollingPreferences
&UP
,
159 TargetTransformInfo::PeelingPreferences
&PP
) {
160 unsigned OuterLoopSize
= OuterUCE
.getRolledLoopSize();
161 // First up use computeUnrollCount from the loop unroller to get a count
162 // for unrolling the outer loop, plus any loops requiring explicit
163 // unrolling we leave to the unroller. This uses UP.Threshold /
164 // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
165 // We have already checked that the loop has no unroll.* pragmas.
166 unsigned MaxTripCount
= 0;
167 bool UseUpperBound
= false;
168 bool ExplicitUnroll
= computeUnrollCount(
169 L
, TTI
, DT
, LI
, AC
, SE
, EphValues
, ORE
, OuterTripCount
, MaxTripCount
,
170 /*MaxOrZero*/ false, OuterTripMultiple
, OuterUCE
, UP
, PP
,
172 if (ExplicitUnroll
|| UseUpperBound
) {
173 // If the user explicitly set the loop as unrolled, dont UnJ it. Leave it
174 // for the unroller instead.
175 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; explicit count set by "
176 "computeUnrollCount\n");
181 // Override with any explicit Count from the "unroll-and-jam-count" option.
182 bool UserUnrollCount
= UnrollAndJamCount
.getNumOccurrences() > 0;
183 if (UserUnrollCount
) {
184 UP
.Count
= UnrollAndJamCount
;
186 if (UP
.AllowRemainder
&&
187 getUnrollAndJammedLoopSize(OuterLoopSize
, UP
) < UP
.Threshold
&&
188 getUnrollAndJammedLoopSize(InnerLoopSize
, UP
) <
189 UP
.UnrollAndJamInnerLoopThreshold
)
193 // Check for unroll_and_jam pragmas
194 unsigned PragmaCount
= unrollAndJamCountPragmaValue(L
);
195 if (PragmaCount
> 0) {
196 UP
.Count
= PragmaCount
;
199 if ((UP
.AllowRemainder
|| (OuterTripMultiple
% PragmaCount
== 0)) &&
200 getUnrollAndJammedLoopSize(OuterLoopSize
, UP
) < UP
.Threshold
&&
201 getUnrollAndJammedLoopSize(InnerLoopSize
, UP
) <
202 UP
.UnrollAndJamInnerLoopThreshold
)
206 bool PragmaEnableUnroll
= hasUnrollAndJamEnablePragma(L
);
207 bool ExplicitUnrollAndJamCount
= PragmaCount
> 0 || UserUnrollCount
;
208 bool ExplicitUnrollAndJam
= PragmaEnableUnroll
|| ExplicitUnrollAndJamCount
;
210 // If the loop has an unrolling pragma, we want to be more aggressive with
212 if (ExplicitUnrollAndJam
)
213 UP
.UnrollAndJamInnerLoopThreshold
= PragmaUnrollAndJamThreshold
;
215 if (!UP
.AllowRemainder
&& getUnrollAndJammedLoopSize(InnerLoopSize
, UP
) >=
216 UP
.UnrollAndJamInnerLoopThreshold
) {
217 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't create remainder and "
218 "inner loop too large\n");
223 // We have a sensible limit for the outer loop, now adjust it for the inner
224 // loop and UP.UnrollAndJamInnerLoopThreshold. If the outer limit was set
225 // explicitly, we want to stick to it.
226 if (!ExplicitUnrollAndJamCount
&& UP
.AllowRemainder
) {
227 while (UP
.Count
!= 0 && getUnrollAndJammedLoopSize(InnerLoopSize
, UP
) >=
228 UP
.UnrollAndJamInnerLoopThreshold
)
232 // If we are explicitly unroll and jamming, we are done. Otherwise there are a
233 // number of extra performance heuristics to check.
234 if (ExplicitUnrollAndJam
)
237 // If the inner loop count is known and small, leave the entire loop nest to
239 if (InnerTripCount
&& InnerLoopSize
* InnerTripCount
< UP
.Threshold
) {
240 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; small inner loop count is "
241 "being left for the unroller\n");
246 // Check for situations where UnJ is likely to be unprofitable. Including
247 // subloops with more than 1 block.
248 if (SubLoop
->getBlocks().size() != 1) {
250 dbgs() << "Won't unroll-and-jam; More than one inner loop block\n");
255 // Limit to loops where there is something to gain from unrolling and
256 // jamming the loop. In this case, look for loads that are invariant in the
257 // outer loop and can become shared.
258 unsigned NumInvariant
= 0;
259 for (BasicBlock
*BB
: SubLoop
->getBlocks()) {
260 for (Instruction
&I
: *BB
) {
261 if (auto *Ld
= dyn_cast
<LoadInst
>(&I
)) {
262 Value
*V
= Ld
->getPointerOperand();
263 const SCEV
*LSCEV
= SE
.getSCEVAtScope(V
, L
);
264 if (SE
.isLoopInvariant(LSCEV
, L
))
269 if (NumInvariant
== 0) {
270 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; No loop invariant loads\n");
278 static LoopUnrollResult
279 tryToUnrollAndJamLoop(Loop
*L
, DominatorTree
&DT
, LoopInfo
*LI
,
280 ScalarEvolution
&SE
, const TargetTransformInfo
&TTI
,
281 AssumptionCache
&AC
, DependenceInfo
&DI
,
282 OptimizationRemarkEmitter
&ORE
, int OptLevel
) {
283 TargetTransformInfo::UnrollingPreferences UP
= gatherUnrollingPreferences(
284 L
, SE
, TTI
, nullptr, nullptr, ORE
, OptLevel
, std::nullopt
, std::nullopt
,
285 std::nullopt
, std::nullopt
, std::nullopt
, std::nullopt
);
286 TargetTransformInfo::PeelingPreferences PP
=
287 gatherPeelingPreferences(L
, SE
, TTI
, std::nullopt
, std::nullopt
);
289 TransformationMode EnableMode
= hasUnrollAndJamTransformation(L
);
290 if (EnableMode
& TM_Disable
)
291 return LoopUnrollResult::Unmodified
;
292 if (EnableMode
& TM_ForcedByUser
)
293 UP
.UnrollAndJam
= true;
295 if (AllowUnrollAndJam
.getNumOccurrences() > 0)
296 UP
.UnrollAndJam
= AllowUnrollAndJam
;
297 if (UnrollAndJamThreshold
.getNumOccurrences() > 0)
298 UP
.UnrollAndJamInnerLoopThreshold
= UnrollAndJamThreshold
;
299 // Exit early if unrolling is disabled.
300 if (!UP
.UnrollAndJam
|| UP
.UnrollAndJamInnerLoopThreshold
== 0)
301 return LoopUnrollResult::Unmodified
;
303 LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
304 << L
->getHeader()->getParent()->getName() << "] Loop %"
305 << L
->getHeader()->getName() << "\n");
307 // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
308 // the unroller, so long as it does not explicitly have unroll_and_jam
309 // metadata. This means #pragma nounroll will disable unroll and jam as well
311 if (hasAnyUnrollPragma(L
, "llvm.loop.unroll.") &&
312 !hasAnyUnrollPragma(L
, "llvm.loop.unroll_and_jam.")) {
313 LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
314 return LoopUnrollResult::Unmodified
;
317 if (!isSafeToUnrollAndJam(L
, SE
, DT
, DI
, *LI
)) {
318 LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
319 return LoopUnrollResult::Unmodified
;
322 // Approximate the loop size and collect useful info
323 SmallPtrSet
<const Value
*, 32> EphValues
;
324 CodeMetrics::collectEphemeralValues(L
, &AC
, EphValues
);
325 Loop
*SubLoop
= L
->getSubLoops()[0];
326 UnrollCostEstimator
InnerUCE(SubLoop
, TTI
, EphValues
, UP
.BEInsns
);
327 UnrollCostEstimator
OuterUCE(L
, TTI
, EphValues
, UP
.BEInsns
);
329 if (!InnerUCE
.canUnroll() || !OuterUCE
.canUnroll()) {
330 LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions"
331 << " which cannot be duplicated or have invalid cost.\n");
332 return LoopUnrollResult::Unmodified
;
335 unsigned InnerLoopSize
= InnerUCE
.getRolledLoopSize();
336 LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterUCE
.getRolledLoopSize()
338 LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize
<< "\n");
340 if (InnerUCE
.NumInlineCandidates
!= 0 || OuterUCE
.NumInlineCandidates
!= 0) {
341 LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
342 return LoopUnrollResult::Unmodified
;
344 if (InnerUCE
.Convergent
|| OuterUCE
.Convergent
) {
346 dbgs() << " Not unrolling loop with convergent instructions.\n");
347 return LoopUnrollResult::Unmodified
;
350 // Save original loop IDs for after the transformation.
351 MDNode
*OrigOuterLoopID
= L
->getLoopID();
352 MDNode
*OrigSubLoopID
= SubLoop
->getLoopID();
354 // To assign the loop id of the epilogue, assign it before unrolling it so it
355 // is applied to every inner loop of the epilogue. We later apply the loop ID
356 // for the jammed inner loop.
357 std::optional
<MDNode
*> NewInnerEpilogueLoopID
= makeFollowupLoopID(
358 OrigOuterLoopID
, {LLVMLoopUnrollAndJamFollowupAll
,
359 LLVMLoopUnrollAndJamFollowupRemainderInner
});
360 if (NewInnerEpilogueLoopID
)
361 SubLoop
->setLoopID(*NewInnerEpilogueLoopID
);
363 // Find trip count and trip multiple
364 BasicBlock
*Latch
= L
->getLoopLatch();
365 BasicBlock
*SubLoopLatch
= SubLoop
->getLoopLatch();
366 unsigned OuterTripCount
= SE
.getSmallConstantTripCount(L
, Latch
);
367 unsigned OuterTripMultiple
= SE
.getSmallConstantTripMultiple(L
, Latch
);
368 unsigned InnerTripCount
= SE
.getSmallConstantTripCount(SubLoop
, SubLoopLatch
);
370 // Decide if, and by how much, to unroll
371 bool IsCountSetExplicitly
= computeUnrollAndJamCount(
372 L
, SubLoop
, TTI
, DT
, LI
, &AC
, SE
, EphValues
, &ORE
, OuterTripCount
,
373 OuterTripMultiple
, OuterUCE
, InnerTripCount
, InnerLoopSize
, UP
, PP
);
375 return LoopUnrollResult::Unmodified
;
376 // Unroll factor (Count) must be less or equal to TripCount.
377 if (OuterTripCount
&& UP
.Count
> OuterTripCount
)
378 UP
.Count
= OuterTripCount
;
380 Loop
*EpilogueOuterLoop
= nullptr;
381 LoopUnrollResult UnrollResult
= UnrollAndJamLoop(
382 L
, UP
.Count
, OuterTripCount
, OuterTripMultiple
, UP
.UnrollRemainder
, LI
,
383 &SE
, &DT
, &AC
, &TTI
, &ORE
, &EpilogueOuterLoop
);
385 // Assign new loop attributes.
386 if (EpilogueOuterLoop
) {
387 std::optional
<MDNode
*> NewOuterEpilogueLoopID
= makeFollowupLoopID(
388 OrigOuterLoopID
, {LLVMLoopUnrollAndJamFollowupAll
,
389 LLVMLoopUnrollAndJamFollowupRemainderOuter
});
390 if (NewOuterEpilogueLoopID
)
391 EpilogueOuterLoop
->setLoopID(*NewOuterEpilogueLoopID
);
394 std::optional
<MDNode
*> NewInnerLoopID
=
395 makeFollowupLoopID(OrigOuterLoopID
, {LLVMLoopUnrollAndJamFollowupAll
,
396 LLVMLoopUnrollAndJamFollowupInner
});
398 SubLoop
->setLoopID(*NewInnerLoopID
);
400 SubLoop
->setLoopID(OrigSubLoopID
);
402 if (UnrollResult
== LoopUnrollResult::PartiallyUnrolled
) {
403 std::optional
<MDNode
*> NewOuterLoopID
= makeFollowupLoopID(
405 {LLVMLoopUnrollAndJamFollowupAll
, LLVMLoopUnrollAndJamFollowupOuter
});
406 if (NewOuterLoopID
) {
407 L
->setLoopID(*NewOuterLoopID
);
409 // Do not setLoopAlreadyUnrolled if a followup was given.
414 // If loop has an unroll count pragma or unrolled by explicitly set count
415 // mark loop as unrolled to prevent unrolling beyond that requested.
416 if (UnrollResult
!= LoopUnrollResult::FullyUnrolled
&& IsCountSetExplicitly
)
417 L
->setLoopAlreadyUnrolled();
422 static bool tryToUnrollAndJamLoop(LoopNest
&LN
, DominatorTree
&DT
, LoopInfo
&LI
,
424 const TargetTransformInfo
&TTI
,
425 AssumptionCache
&AC
, DependenceInfo
&DI
,
426 OptimizationRemarkEmitter
&ORE
, int OptLevel
,
428 bool DidSomething
= false;
429 ArrayRef
<Loop
*> Loops
= LN
.getLoops();
430 Loop
*OutmostLoop
= &LN
.getOutermostLoop();
432 // Add the loop nests in the reverse order of LN. See method
434 SmallPriorityWorklist
<Loop
*, 4> Worklist
;
435 appendLoopsToWorklist(Loops
, Worklist
);
436 while (!Worklist
.empty()) {
437 Loop
*L
= Worklist
.pop_back_val();
438 std::string LoopName
= std::string(L
->getName());
439 LoopUnrollResult Result
=
440 tryToUnrollAndJamLoop(L
, DT
, &LI
, SE
, TTI
, AC
, DI
, ORE
, OptLevel
);
441 if (Result
!= LoopUnrollResult::Unmodified
)
443 if (L
== OutmostLoop
&& Result
== LoopUnrollResult::FullyUnrolled
)
444 U
.markLoopAsDeleted(*L
, LoopName
);
450 PreservedAnalyses
LoopUnrollAndJamPass::run(LoopNest
&LN
,
451 LoopAnalysisManager
&AM
,
452 LoopStandardAnalysisResults
&AR
,
454 Function
&F
= *LN
.getParent();
456 DependenceInfo
DI(&F
, &AR
.AA
, &AR
.SE
, &AR
.LI
);
457 OptimizationRemarkEmitter
ORE(&F
);
459 if (!tryToUnrollAndJamLoop(LN
, AR
.DT
, AR
.LI
, AR
.SE
, AR
.TTI
, AR
.AC
, DI
, ORE
,
461 return PreservedAnalyses::all();
463 auto PA
= getLoopPassPreservedAnalyses();
464 PA
.preserve
<LoopNestAnalysis
>();