//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/STLExtras.h"
70 #include "llvm/ADT/SmallPtrSet.h"
71 #include "llvm/ADT/SmallSet.h"
72 #include "llvm/ADT/SmallVector.h"
73 #include "llvm/ADT/Statistic.h"
74 #include "llvm/ADT/StringRef.h"
75 #include "llvm/ADT/Twine.h"
76 #include "llvm/ADT/iterator_range.h"
77 #include "llvm/Analysis/AssumptionCache.h"
78 #include "llvm/Analysis/BasicAliasAnalysis.h"
79 #include "llvm/Analysis/BlockFrequencyInfo.h"
80 #include "llvm/Analysis/CFG.h"
81 #include "llvm/Analysis/CodeMetrics.h"
82 #include "llvm/Analysis/DemandedBits.h"
83 #include "llvm/Analysis/GlobalsModRef.h"
84 #include "llvm/Analysis/LoopAccessAnalysis.h"
85 #include "llvm/Analysis/LoopAnalysisManager.h"
86 #include "llvm/Analysis/LoopInfo.h"
87 #include "llvm/Analysis/LoopIterator.h"
88 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
89 #include "llvm/Analysis/ProfileSummaryInfo.h"
90 #include "llvm/Analysis/ScalarEvolution.h"
91 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
92 #include "llvm/Analysis/TargetLibraryInfo.h"
93 #include "llvm/Analysis/TargetTransformInfo.h"
94 #include "llvm/Analysis/ValueTracking.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfo.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/MDBuilder.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/ProfDataUtils.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
147 #include <functional>
using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

const char VerboseDebug[] = DEBUG_TYPE "-verbose";
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists all
// options. I.e., the vectorizer will try to fold the tail loop (epilogue) into
// the vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(
            TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
            "data-and-control-without-rt-check",
            "Similar to data-and-control, but remove the runtime check")));
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));
static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

static cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
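// With branch weights {1, 127}, the bypass edge is treated as taken roughly
// 1/(1+127) of the time, i.e. the checks are expected to pass and the vector
// loop is expected to be reached about 99% of the time.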
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
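// For example, on typical data layouts i1 is irregular (type size 1 bit,
// alloc size 8 bits), while i32 is regular (32 bits for both), so an array of
// N x i1 is not bitcast-compatible with a <N x i1> vector.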
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
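// Callers typically divide a predicated block's cost by this value, so the
// return value of 2 models the block executing on about half of the header's
// iterations.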
429 /// Returns "best known" trip count for the specified loop \p L as defined by
430 /// the following procedure:
431 /// 1) Returns exact trip count if it is known.
432 /// 2) Returns expected trip count according to profile data if any.
433 /// 3) Returns upper bound estimate if it is known.
434 /// 4) Returns std::nullopt if all of the above failed.
435 static std::optional
<unsigned> getSmallBestKnownTC(ScalarEvolution
&SE
,
437 // Check if exact trip count is known.
438 if (unsigned ExpectedTC
= SE
.getSmallConstantTripCount(L
))
441 // Check if there is an expected trip count available from profile data.
442 if (LoopVectorizeWithBlockFrequency
)
443 if (auto EstimatedTC
= getLoopEstimatedTripCount(L
))
446 // Check if upper bound estimate is known.
447 if (unsigned ExpectedTC
= SE
.getSmallConstantMaxTripCount(L
))
/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(
        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
        /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}
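// Note on the fixed-length path above: for example, with Factor = 2 and two
// <4 x i32> inputs, createInterleaveMask(4, 2) yields <0, 4, 1, 5, 2, 6, 3, 7>,
// picking lanes alternately from the two concatenated input vectors.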
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the
    // profile of the original loop header may change as the transformation
    // happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops. \p ExpandedSCEVs is used to
  /// look up SCEV expansions for expressions needed during skeleton creation.
  virtual std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a sequence of scalar instances for each lane between \p
  /// MinLane and \p MaxLane, times each part between \p MinPart and \p
  /// MaxPart, inclusive. Uses the VPValue operands from \p RepRecipe instead
  /// of \p Instr's operands.
  void scalarizeInstruction(const Instruction *Instr,
                            VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance,
                            VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask, bool NeedsMaskForGaps);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we
  /// are able to vectorize with strict in-order reductions for the given
  /// RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a new phi node for the induction variable \p OrigPhi to resume
  /// iteration count in the scalar epilogue, from where the vectorized loop
  /// left off. \p Step is the SCEV-expanded induction step to use. In cases
  /// where the loop skeleton is more complicated (i.e., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the
  /// bypass block and the end value on the edge from bypass to this loop.
  PHINode *createInductionResumeValue(
      PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
      ArrayRef<BasicBlock *> BypassBlocks,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan, VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration
  /// count in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Collect poison-generating recipes that may generate a poison value that
  /// is used after vectorization, even when their operands are not poison.
  /// Those recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after
  ///    vectorization the generated instruction won't be predicated.
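  /// For example (illustrative, not taken from this file): a `getelementptr
  /// inbounds` that feeds the address of a load in a predicated block may
  /// execute unconditionally once the load is widened to a masked load, so its
  /// `inbounds` flag could yield a poison address for masked-off lanes and is
  /// therefore dropped.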
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
      const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm
/// Look for a meaningful debug location on the instruction or its operands.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I->getDebugLoc();

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst->getDebugLoc();
  }

  return I->getDebugLoc();
}
/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *OrigLoop) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();
  return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}
/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}

} // end namespace llvm
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
  }
  return Result;
}
void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {
  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. If the underlying instruction has poison-generating flags,
      // drop them directly.
      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
        RecWithFlags->dropPoisonGeneratingFlags();
      } else {
        Instruction *Instr = dyn_cast_or_null<Instruction>(
            CurRec->getVPSingleValue()->getUnderlyingValue());
        (void)Instr;
        assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
               "found instruction with poison generating flags not covered by "
               "VPRecipeWithIRFlags");
      }

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
          Worklist.push_back(OpDef);
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe or
  // VPInterleaveRecipe.
  auto Iter = vp_depth_first_deep(State.Plan->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
        }
      }
    }
  }
}
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

using InstructionVFPair = std::pair<Instruction *, ElementCount>;
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }
  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(I);
  }
  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(I))
      return false;

    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.contains(I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
1359 /// Decision that was taken during cost calculation for memory instruction.
1362 CM_Widen
, // For consecutive accesses with stride +1.
1363 CM_Widen_Reverse
, // For consecutive accesses with stride -1.
1371 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1372 /// instruction \p I and vector width \p VF.
1373 void setWideningDecision(Instruction
*I
, ElementCount VF
, InstWidening W
,
1374 InstructionCost Cost
) {
1375 assert(VF
.isVector() && "Expected VF >=2");
1376 WideningDecisions
[std::make_pair(I
, VF
)] = std::make_pair(W
, Cost
);
1379 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380 /// interleaving group \p Grp and vector width \p VF.
1381 void setWideningDecision(const InterleaveGroup
<Instruction
> *Grp
,
1382 ElementCount VF
, InstWidening W
,
1383 InstructionCost Cost
) {
1384 assert(VF
.isVector() && "Expected VF >=2");
1385 /// Broadcast this decicion to all instructions inside the group.
1386 /// But the cost will be assigned to one instruction only.
1387 for (unsigned i
= 0; i
< Grp
->getFactor(); ++i
) {
1388 if (auto *I
= Grp
->getMember(i
)) {
1389 if (Grp
->getInsertPos() == I
)
1390 WideningDecisions
[std::make_pair(I
, VF
)] = std::make_pair(W
, Cost
);
1392 WideningDecisions
[std::make_pair(I
, VF
)] = std::make_pair(W
, 0);
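  // Illustrative example (names are hypothetical): for an interleave group
  // {A[i], A[i+1]} with factor 2, setWideningDecision(Grp, VF, CM_Interleave,
  // Cost) records CM_Interleave for both members, but only the insert position
  // carries the full Cost; the other member gets cost 0 so the group is not
  // double-counted when per-instruction costs are summed.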
  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  struct CallWideningDecision {
    InstWidening Kind;
    Function *Variant;
    Intrinsic::ID IID;
    std::optional<unsigned> MaskPos;
    InstructionCost Cost;
  };

  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
                                                     MaskPos, Cost};
  }

  CallWideningDecision getCallWideningDecision(CallInst *CI,
                                               ElementCount VF) const {
    assert(!VF.isScalar() && "Expected vector VF");
    return CallWideningDecisions.at(std::make_pair(CI, VF));
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }
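  // Illustrative example (assumes an i64 primary induction with an i32 use):
  //   for (i64 i = 0; i < n; ++i)
  //     B[i] = (i32)i;          // trunc i64 %i to i32
  // The trunc is "optimizable": rather than widening the trunc itself, the
  // vectorizer can introduce a second, i32-typed induction variable and use it
  // directly in the vector loop.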
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.contains(VF))
      return;
    setCostBasedWideningDecision(VF);
    setVectorizedCallDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    if (VF.isVector())
      Ty = VectorType::get(Ty, VF);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }
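  // Illustrative example: an indexed access such as
  //   for (i = 0; i < n; ++i)
  //     sum += A[B[i]];
  // has no consecutive pointer for A[B[i]], so it can only be widened if the
  // target reports masked-gather legality for the widened type, e.g.
  //   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs,
  //            i32 4, <4 x i1> %mask, <4 x i32> poison)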
  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Given costs for both strategies, return true if the scalar predication
  /// lowering should be used for div/rem. This incorporates an override
  /// option so it is not simply a cost comparison.
  bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
                                     InstructionCost SafeDivisorCost) const {
    switch (ForceSafeDivisor) {
    case cl::BOU_UNSET:
      return ScalarCost < SafeDivisorCost;
    case cl::BOU_TRUE:
      return false;
    case cl::BOU_FALSE:
      return true;
    }
    llvm_unreachable("impossible case value");
  }

  /// Returns true if \p I is an instruction which requires predication and
  /// for which our chosen predication strategy is scalarization (i.e. we
  /// don't have an alternate strategy such as masking available).
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  bool isScalarWithPredication(Instruction *I, ElementCount VF) const;

  /// Returns true if \p I is an instruction that needs to be predicated
  /// at runtime. The result is independent of the predication mechanism.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) const;

  /// Return the costs for our two available strategies for lowering a
  /// div/rem operation which requires speculating at least one lane.
  /// First result is for scalarization (will be invalid for scalable
  /// vectors); second is for the safe-divisor strategy.
  std::pair<InstructionCost, InstructionCost>
  getDivRemSpeculationCost(Instruction *I, ElementCount VF) const;

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(bool IsVectorizing) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop for all VFs in \p Range.
  /// A scalar epilogue must either be required for all VFs in \p Range or for
  /// none.
  bool requiresScalarEpilogue(VFRange Range) const {
    auto RequiresScalarEpilogue = [this](ElementCount VF) {
      return requiresScalarEpilogue(VF.isVector());
    };
    bool IsRequired = all_of(Range, RequiresScalarEpilogue);
    assert(
        (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
        "all VFs in range must agree on whether a scalar epilogue is required");
    return IsRequired;
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns the TailFoldingStyle that is best for the current loop.
  TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
    if (!CanFoldTailByMasking)
      return TailFoldingStyle::None;

    if (ForceTailFoldingStyle.getNumOccurrences())
      return ForceTailFoldingStyle;

    return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const {
    return getTailFoldingStyle() != TailFoldingStyle::None;
  }
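  // Illustrative sketch of tail folding by masking: for a trip count of 10 and
  // VF = 4, the vector loop runs ceil(10/4) = 3 iterations and the last
  // iteration executes with a lane mask of <1, 1, 0, 0> (e.g. produced by
  // @llvm.get.active.lane.mask), so no scalar remainder loop is needed.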
  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductions.contains(Phi);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    CallWideningDecisions.clear();
  }

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  bool hasPredStores() const { return NumPredStores > 0; }

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the targets vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  std::optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind) const;

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
                                           TTI::TargetCostKind CostKind) const;

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to present after
  /// vectorization as a predicated block.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool CanFoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop.
  SmallPtrSet<PHINode *, 4> InLoopReductions;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  InstructionCost computePredInstDiscount(Instruction *PredInst,
                                          ScalarCostsTy &ScalarCosts,
                                          ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

  using CallDecisionList =
      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

  CallDecisionList CallWideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
  }

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) const {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }
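  // Illustrative example: when a predicated udiv is scalarized but its operand
  // %x is otherwise vectorized, each scalar copy needs a lane extract such as
  //   %x.lane0 = extractelement <4 x i32> %x.vec, i32 0
  // needsExtract()/filterExtractingOperands() identify exactly those operands
  // so the scalarization overhead can account for the extracts.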
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;
};
} // end namespace llvm
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;
  TargetTransformInfo *TTI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  bool CostTooHigh = false;
  const bool AddBranchWeights;

  Loop *OuterLoop = nullptr;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    TargetTransformInfo *TTI, const DataLayout &DL,
                    bool AddBranchWeights)
      : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {

    // Hard cutoff to limit compile-time increase in case a very large number of
    // runtime checks needs to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    CostTooHigh =
        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
    if (CostTooHigh)
      return;

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
            [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(
            MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
            MemCheckExp, VectorizerParams::HoistRuntimeChecks);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }

    // Outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }

  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    if (CostTooHigh) {
      InstructionCost Cost;
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C =
            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C =
            TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop
      // we should find out if these checks are outer loop invariant. If so,
      // the checks will likely be hoisted out and so the effective cost will
      // reduce according to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing every
        // individual memory check, since there could be a mixture of loop
        // variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
        if (SE->isLoopInvariant(Cond, OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // If exact trip count is known use that.
          if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
            BestTripCount = SmallTC;
          else if (LoopVectorizeWithBlockFrequency) {
            // Else use profile data if available.
            if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
              BestTripCount = *EstimatedTC;
          }

          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

          // Let's ensure the cost is always at least 1.
          NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
                                     (InstructionCost::CostType)1);

          LLVM_DEBUG(dbgs()
                     << "We expect runtime memory checks to be hoisted "
                     << "out of the outer loop. Cost reduced from "
                     << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }
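  // Worked example (illustrative numbers): if the memory checks cost 24 and
  // they are invariant in an outer loop with an estimated trip count of 8, the
  // effective cost computed above becomes max(24 / 8, 1) = 3.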
  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }

  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    if (!SCEVCheckCond)
      return nullptr;

    Value *Cond = SCEVCheckCond;
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    if (auto *C = dyn_cast<ConstantInt>(Cond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (OuterLoop)
      OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
    if (AddBranchWeights)
      setBranchWeights(BI, SCEVCheckBypassWeights);
    ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
    return SCEVCheckBlock;
  }

  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (OuterLoop)
      OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);

    BranchInst &BI =
        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
    if (AddBranchWeights) {
      setBranchWeights(BI, MemCheckBypassWeights);
    }
    ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};

static bool useActiveLaneMask(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::Data ||
         Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}

static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::DataAndControlFlow ||
         Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
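// Illustrative note: with TailFoldingStyle::DataAndControlFlow both the memory
// operations and the loop back-edge are governed by a lane mask, e.g.
//   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv, i64 %n)
// whereas TailFoldingStyle::Data only masks the loads/stores and keeps a
// scalar compare for the exit condition.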
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
static Value *
emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                     Value *Step,
                     InductionDescriptor::InductionKind InductionKind,
                     const BinaryOperator *InductionBinOp) {
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(Index, StepTy)
                           : B.CreateCast(Instruction::SIToFP, Index, StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *MulExp = B.CreateFMul(Step, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
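// Illustrative example: for an integer induction with start %start and step 4,
// emitTransformedIndex(B, %idx, %start, 4, IK_IntInduction, nullptr) emits the
// equivalent of
//   %offset = mul i64 %idx, 4
//   %result = add i64 %start, %offset
// with the trivial constant short-cuts above applied when Index or Step is a
// zero/one constant.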
std::optional<unsigned> getMaxVScale(const Function &F,
                                     const TargetTransformInfo &TTI) {
  if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
    return MaxVScale;

  if (F.hasFnAttribute(Attribute::VScaleRange))
    return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();

  return std::nullopt;
}
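// Illustrative example: a function carrying
//   attributes #0 = { vscale_range(1,16) }
// and no TTI-reported maximum would return 16 here, i.e. the largest factor by
// which a scalable VF's known minimum element count may be multiplied at
// runtime.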
/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  Type *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC =
          Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      std::optional<unsigned> MaxVScale =
          getMaxVScale(*Cost->TheFunction, Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
  }

  return false;
}
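// Worked example (illustrative numbers): with an i32 induction variable,
// MaxUIntTripCount is 2^32 - 1. For a known max trip count of 1000, VF = 8 and
// MaxUF = 4, (2^32 - 1) - 1000 is far greater than 8 * 4, so the overflow check
// is known false and can be elided.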
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ...                     // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask, bool NeedsMaskForGaps) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  Value *Idx;
  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse()) {
    Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
    Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
    Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
    Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
    Idx = Builder.CreateNeg(Idx);
  } else
    Idx = Builder.getInt32(-Index);

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    if (auto *I = dyn_cast<Instruction>(AddrPart))
      State.setDebugLocFrom(I->getDebugLoc());

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
    AddrParts.push_back(AddrPart);
  }

  State.setDebugLocFrom(Instr->getDebugLoc());
  Value *PoisonVec = PoisonValue::get(VecTy);

  auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
                             unsigned Part, Value *MaskForGaps) -> Value * {
    if (VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *BlockInMaskPart = State.get(BlockInMask, Part);
      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
      auto *MaskTy =
          VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
      return Builder.CreateIntrinsic(
          MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
          /*FMFSource=*/nullptr, "interleaved.mask");
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *BlockInMaskPart = State.get(BlockInMask, Part);
    Value *ShuffledMask = Builder.CreateShuffleVector(
        BlockInMaskPart,
        createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
        "interleaved.mask");
    return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                             MaskForGaps)
                       : ShuffledMask;
  };

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    Value *MaskForGaps = nullptr;
    if (NeedsMaskForGaps) {
      MaskForGaps =
          createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null");
    }

    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
        NewLoad =
            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
                                     GroupMask, PoisonVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    if (VecTy->isScalableTy()) {
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors");

      for (unsigned Part = 0; Part < UF; ++Part) {
        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
        // so must use intrinsics to deinterleave.
        Value *DI = Builder.CreateIntrinsic(
            Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
            /*FMFSource=*/nullptr, "strided.vec");
        unsigned J = 0;
        for (unsigned I = 0; I < InterleaveFactor; ++I) {
          Instruction *Member = Group->getMember(I);

          if (!Member)
            continue;

          Value *StridedVec = Builder.CreateExtractValue(DI, I);
          // If this member has different type, cast the result type.
          if (Member->getType() != ScalarTy) {
            VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
            StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
          }

          if (Group->isReverse())
            StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");

          State.set(VPDefs[J], StridedVec, Part);
          ++J;
        }
      }

      return;
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed.");
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    unsigned StoredIdx = 0;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      // Skip the gaps in the group.
      if (!Member) {
        Value *Undef = PoisonValue::get(SubVT);
        StoredVecs.push_back(Undef);
        continue;
      }

      Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
      ++StoredIdx;

      if (Group->isReverse())
        StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");

      // If this member has different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Interleave all the smaller vectors into one wider vector.
    Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                Group->getAlign(), GroupMask);
    } else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}
void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                               VPReplicateRecipe *RepRecipe,
                                               const VPIteration &Instance,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (!Instance.isFirstIteration())
      return;

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned");
#if !defined(NDEBUG)
    // Verify that VPlan type inference results agree with the type of the
    // generated values.
    assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
           "inferred type and type from generated instructions do not match");
#endif
  }

  RepRecipe->setFlags(Cloned);

  if (auto DL = Instr->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(RepRecipe->operands())) {
    auto InputInstance = Instance;
    VPValue *Operand = I.value();
    if (vputils::isUniformAfterVectorization(Operand))
      InputInstance.Lane = VPLane::getFirstLane();
    Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
  }
  State.addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(Cloned);

  State.set(RepRecipe, Cloned, Instance);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    AC->registerAssumption(II);

  // End if-block.
  bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}
Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getTripCount();
  IRBuilder<> Builder(InsertBlock->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(Builder, Ty, VF, UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
  // is accounted for in emitIterationCountCheck that adds an overflow check.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
    TC = Builder.CreateAdd(
        TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the
  // remainder loop. See the cost model for when this can happen. If the step
  // evenly divides the trip count, we set the remainder to be equal to the
  // step. If the step does not evenly divide the trip count, no adjustment is
  // necessary since there will already be scalar iterations. Note that the
  // minimum iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(VF.isVector())) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
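// Worked example (illustration only): with a trip count of 13 and
// Step = VF * UF = 4, n.mod.vf = 13 urem 4 = 1 and n.vec = 12, leaving one
// iteration for the remainder loop. If a scalar epilogue is required and the
// trip count is 12, the zero remainder is bumped up to Step, giving n.vec = 8.
// With tail folding, the count is first rounded up: 13 + 3 = 16 = n.vec.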
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto *DstFVTy = cast<VectorType>(DstVTy);
  auto VF = DstFVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstFVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstFVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}
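// Illustrative sketch (assuming 64-bit pointers; example names): casting
// V : <4 x double> to DstVTy : <4 x ptr> cannot be done in one step, so the
// code above emits
//   %v.int = bitcast <4 x double> %v to <4 x i64>
//   %v.ptr = inttoptr <4 x i64> %v.int to <4 x ptr>
// with CreateBitOrPointerCast picking bitcast or inttoptr/ptrtoint as needed.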
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
  Value *Count = getTripCount();
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
                                                       : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProfitableTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(Builder, CountTy, VF, UF);

    Value *MinProfTC =
        createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
    if (!VF.isScalable())
      return MinProfTC;
    return Builder.CreateBinaryIntrinsic(
        Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None)
    CheckMinIters =
        Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
  else if (VF.isScalable() &&
           !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
           Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
    // vscale is not necessarily a power-of-2, which means we cannot guarantee
    // an overflow to zero when updating induction variables and so an
    // additional overflow check is required before entering the vector loop.

    // Get the maximum unsigned value for the type.
    Value *MaxUIntTripCount =
        ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
    Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);

    // Don't execute the vector loop if (UMax - n) < (VF * UF).
    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
  }

  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit (if needed).
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  if (!Cost->requiresScalarEpilogue(VF.isVector()))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  BranchInst &BI =
      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(BI, MinItersBypassWeights);
  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
  LoopBypassBlocks.push_back(TCCheckBlock);
}
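// Illustrative IR shape of the emitted guard (example names, fixed VF): with
// VF = 4, UF = 2 and no tail folding, TCCheckBlock now ends in
//   %min.iters.check = icmp ult i64 %count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// i.e. the Bypass (scalar) path is taken whenever fewer than VF * UF
// iterations are available (ule rather than ult when a scalar epilogue is
// required).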
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
  if (!SCEVCheckBlock)
    return nullptr;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    if (!Cost->requiresScalarEpilogue(VF.isVector()))
      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
      // dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
  return SCEVCheckBlock;
}
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
  if (!MemCheckBlock)
    return nullptr;

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        OrigLoop->getStartLoc(),
                                        OrigLoop->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  return MemCheckBlock;
}
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst =
      Cost->requiresScalarEpilogue(VF.isVector())
          ? BranchInst::Create(LoopScalarPreHeader)
          : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                               Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // Update dominator for loop exit. During skeleton creation, only the vector
  // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
  if (!Cost->requiresScalarEpilogue(VF.isVector()))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}
PHINode *InnerLoopVectorizer::createInductionResumeValue(
    PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
    ArrayRef<BasicBlock *> BypassBlocks,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
  assert(VectorTripCount && "Expected valid arguments");

  Instruction *OldInduction = Legal->getPrimaryInduction();
  Value *&EndValue = IVEndValues[OrigPhi];
  Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
  if (OrigPhi == OldInduction) {
    // We know what the end value is.
    EndValue = VectorTripCount;
  } else {
    IRBuilder<> B(LoopVectorPreHeader->getTerminator());

    // Fast-math-flags propagate from the original induction instruction.
    if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
      B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

    EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
                                    Step, II.getKind(), II.getInductionBinOp());
    EndValue->setName("ind.end");

    // Compute the end value for the additional bypass (if applicable).
    if (AdditionalBypass.first) {
      B.SetInsertPoint(AdditionalBypass.first,
                       AdditionalBypass.first->getFirstInsertionPt());
      EndValueFromAdditionalBypass =
          emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
                               Step, II.getKind(), II.getInductionBinOp());
      EndValueFromAdditionalBypass->setName("ind.end");
    }
  }

  // Create phi nodes to merge from the backedge-taken check block.
  PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                                         LoopScalarPreHeader->getTerminator());
  // Copy original phi DL over to the new one.
  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());

  // The new PHI merges the original incoming value, in case of a bypass,
  // or the value at the end of the vectorized loop.
  BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

  // Fix the scalar body counter (PHI node).
  // The old induction's phi node in the scalar body needs the truncated value.
  for (BasicBlock *BB : BypassBlocks)
    BCResumeVal->addIncoming(II.getStartValue(), BB);

  if (AdditionalBypass.first)
    BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                          EndValueFromAdditionalBypass);
  return BCResumeVal;
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
/// expansion results.
static Value *getExpandedStep(const InductionDescriptor &ID,
                              const SCEV2ValueTy &ExpandedSCEVs) {
  const SCEV *Step = ID.getStep();
  if (auto *C = dyn_cast<SCEVConstant>(Step))
    return C->getValue();
  if (auto *U = dyn_cast<SCEVUnknown>(Step))
    return U->getValue();
  auto I = ExpandedSCEVs.find(Step);
  assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
  return I->second;
}
void InnerLoopVectorizer::createInductionResumeValues(
    const SCEV2ValueTy &ExpandedSCEVs,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (const auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    const InductionDescriptor &II = InductionEntry.second;
    PHINode *BCResumeVal = createInductionResumeValue(
        OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
        AdditionalBypass);
    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
  // The trip counts should be cached by now.
  Value *Count = getTripCount();
  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop. Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader. Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
      !Cost->foldTailByMasking()) {
    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
    // operands. Perform simplification directly on VPlan once the branch is
    // modeled there.
    IRBuilder<> B(LoopMiddleBlock->getTerminator());
    B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
    Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
    BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
    BI.setCondition(CmpN);
    if (hasBranchWeightMD(*ScalarLatchTerm)) {
      // Assume that `Count % VectorTripCount` is equally distributed.
      unsigned TripCount = UF * VF.getKnownMinValue();
      assert(TripCount > 0 && "trip count should not be zero");
      const uint32_t Weights[] = {1, TripCount - 1};
      setBranchWeights(BI, Weights);
    }
  }

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif

  return LoopVectorPreHeader;
}
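// Illustrative middle-block terminator (example names): when no scalar
// epilogue is required and the tail is not folded, the branch becomes
//   %cmp.n = icmp eq i64 %count, %n.vec
//   br i1 %cmp.n, label %exit.block, label %scalar.ph
// so the remainder loop is skipped exactly when the step divides the count.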
std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
     /  |      preheader are expanded here. Eventually all required SCEV
    /   |      expansion should happen here.
   |   [ ]    <-- vector loop bypass (may consist of multiple blocks).
   ||  [ ]    <-- vector pre header.
   |   [ ]_|  <-- vector loop (created during VPlan execution).
   \  -[ ]    <--- middle-block.
   | ->[ ]    <--- new preheader.
   (opt)      <-- edge from middle to exit iff epilogue is not required.
   |   [ ]_|  <-- old scalar loop to handle remainder (scalar epilogue).
      >[ ]    <-- exit block(s).
   ...
   */

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitIterationCountCheck(LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(LoopScalarPreHeader);

  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  emitMemRuntimeChecks(LoopScalarPreHeader);

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(ExpandedSCEVs);

  return {completeLoopSkeleton(), nullptr};
}
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *VectorTripCount, Value *EndValue,
                                       BasicBlock *MiddleBlock,
                                       BasicBlock *VectorHeader, VPlan &Plan,
                                       VPTransformState &State) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
      CountMinusOne->setName("cmo");

      VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
      assert(StepVPV && "step must have been expanded during VPlan execution");
      Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
                                        : State.get(StepVPV, {0, 0});
      Value *Escape =
          emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
                               II.getKind(), II.getInductionBinOp());
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
      PHI->addIncoming(I.second, MiddleBlock);
      Plan.removeLiveOut(PHI);
    }
  }
}
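// Worked example (illustration only): for a canonical induction starting at 0
// with step 1, an LCSSA phi that uses the pre-increment value receives
//   %cmo = sub i64 %n.vec, 1
//   %ind.escape = <start + %cmo * step, computed via emitTransformedIndex>
// as its incoming value from the middle block, while users of the
// post-increment value simply receive the cached EndValue.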
namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (Instruction &In : llvm::make_early_inc_range(*BB)) {
    if (!CSEDenseMapInfo::canHandle(&In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(&In)) {
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
      continue;
    }

    CSEMap[&In] = &In;
  }
}
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  Type *RetTy = CI->getType();
  if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
    if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
      return *RedCost;

  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    return std::min(ScalarCallCost, IntrinsicCost);
  }
  return ScalarCallCost;
}
static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
    return Elt;
  return VectorType::get(Elt, VF);
}
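// For example, MaybeVectorizeType(i32, /*VF=*/4) yields <4 x i32>, while a
// scalar VF or a non-integer/non-pointer/non-FP element type (e.g. void) is
// returned unchanged.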
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->args());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  SmallVector<Type *> ParamTys;
  std::transform(FTy->param_begin(), FTy->param_end(),
                 std::back_inserter(ParamTys),
                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  return TTI.getIntrinsicInstrCost(CostAttrs,
                                   TargetTransformInfo::TCK_RecipThroughput);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
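// For example, given <4 x i16> and <4 x i32>, smallestIntegerVectorType
// returns the <4 x i16> type and largestIntegerVectorType the <4 x i32> type;
// when both element widths are equal, both helpers return T2.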
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                            VPlan &Plan) {
  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (EnableVPlanNativePath)
    fixNonInductionPHIs(Plan, State);

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences. Note that fixing
  // reduction phis are already modeled in VPlan.
  // TODO: Also model fixing fixed-order recurrence phis in VPlan.
  VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
  for (VPRecipeBase &R : HeaderVPBB->phis()) {
    if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFixedOrderRecurrence(FOR, State);
  }

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);
  PSE.getSE()->forgetBlockAndLoopDispositions();

  // After vectorization, the exit blocks of the original loop will have
  // additional predecessors. Invalidate SCEVs for the exit phis in case SE
  // looked through single-entry phis.
  SmallVector<BasicBlock *> ExitBlocks;
  OrigLoop->getExitBlocks(ExitBlocks);
  for (BasicBlock *Exit : ExitBlocks)
    for (PHINode &PN : Exit->phis())
      PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);

  VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
  Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
  if (Cost->requiresScalarEpilogue(VF.isVector())) {
    // No edge from the middle block to the unique exit block has been inserted
    // and there is nothing to fix from vector loop; phis should have incoming
    // from scalar loop only.
  } else {
    // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
    // the cost model.

    // If we inserted an edge from the middle block to the unique exit block,
    // update uses outside the loop (phis) to account for the newly inserted
    // edge.

    // Fix-up external users of the induction variables.
    for (const auto &Entry : Legal->getInductionVars())
      fixupIVUsers(Entry.first, Entry.second,
                   getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
                   IVEndValues[Entry.first], LoopMiddleBlock,
                   VectorLoop->getHeader(), Plan, State);
  }

  // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
  // in the exit block, so update the builder.
  State.Builder.SetInsertPoint(State.CFG.ExitBB,
                               State.CFG.ExitBB->getFirstNonPHIIt());
  for (const auto &KV : Plan.getLiveOuts())
    KV.second->fixPhi(Plan, State);

  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(VectorLoop->getHeader());

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
                               LI->getLoopFor(LoopScalarBody),
                               VF.getKnownMinValue() * UF);
}
void InnerLoopVectorizer::fixFixedOrderRecurrence(
    VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body:
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  VPValue *PreviousDef = PhiR->getBackedgeValue();
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
  auto *IdxTy = Builder.getInt32Ty();
  Value *RuntimeVF = nullptr;
  if (VF.isVector()) {
    auto *One = ConstantInt::get(IdxTy, 1);
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar =
        Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
  }

  auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
  assert(PhiR->getNumUsers() == 1 &&
         RecurSplice->getOpcode() ==
             VPInstruction::FirstOrderRecurrenceSplice &&
         "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
  SmallVector<VPLiveOut *> LiveOuts;
  for (VPUser *U : RecurSplice->users())
    if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
      LiveOuts.push_back(LiveOut);

  if (!LiveOuts.empty()) {
    // Extract the second last element in the middle block if the
    // Phi is used outside the loop. We need to extract the phi itself
    // and not the last element (the phi update in the current iteration). This
    // will be the value when jumping to the exit block from the
    // LoopMiddleBlock, when the scalar loop is not run at all.
    Value *ExtractForPhiUsedOutsideLoop = nullptr;
    if (VF.isVector()) {
      auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
      ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
          Incoming, Idx, "vector.recur.extract.for.phi");
    } else {
      assert(UF > 1 && "VF and UF cannot both be 1");
      // When loop is unrolled without vectorizing, initialize
      // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
      // value of `Incoming`. This is analogous to the vectorized case above:
      // extracting the second last element when VF > 1.
      ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
    }

    for (VPLiveOut *LiveOut : LiveOuts) {
      assert(!Cost->requiresScalarEpilogue(VF.isVector()));
      PHINode *LCSSAPhi = LiveOut->getPhi();
      LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
      State.Plan->removeLiveOut(LCSSAPhi);
    }
  }

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");
}
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // may have side effects or may read from memory.
      // TODO: Could do more granular checking to allow sinking a load past
      // non-store instructions.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects() || I->mayReadFromMemory())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
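// Illustrative scenario (example only): if a store was scalarized and
// predicated into a pred.store.if block while the getelementptr computing its
// address still sits in the vector body, the GEP is moved to the top of the
// predicated block once all of its uses are shown to live there, and its own
// operands are then reconsidered for sinking on the next pass.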
void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
                                              VPTransformState &State) {
  auto Iter = vp_depth_first_deep(Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &P : VPBB->phis()) {
      VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
      if (!VPPhi)
        continue;
      PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
      // Make sure the builder has a valid insert point.
      Builder.SetInsertPoint(NewPhi);
      for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
        VPValue *Inc = VPPhi->getIncomingValue(i);
        VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
        NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
      }
    }
  }
}
bool InnerLoopVectorizer::useOrderedReductions(
    const RecurrenceDescriptor &RdxDesc) {
  return Cost->useOrderedReductions(RdxDesc);
}
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I
                        << "\n");
      Worklist.insert(I);
    }

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
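// Illustrative example (behaviour sketch, not exhaustive): a getelementptr
// whose only users are consecutive, widened loads/stores that consume the
// address scalarly lands in ScalarPtrs and hence in Scalars[VF], and a
// primary induction whose users are only such scalar GEPs and its own
// increment is kept scalar alongside its update instruction.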
bool LoopVectorizationCostModel::isScalarWithPredication(
    Instruction *I, ElementCount VF) const {
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    if (VF.isScalar())
      return true;
    return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
               .Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getLoadStoreType(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(Ty, VF);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedGather(VTy, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedScatter(VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  if (!blockNeedsPredicationForAnyReason(I->getParent()))
    return false;

  // Can we prove this instruction is safe to unconditionally execute?
  // If not, we must use some form of predication.
  switch(I->getOpcode()) {
  default:
    return false;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    // When we know the load's address is loop invariant and the instruction
    // in the original scalar loop was unconditionally executed then we
    // don't need to mark it as a predicated instruction. Tail folding may
    // introduce additional predication, but we're guaranteed to always have
    // at least one active lane. We call Legal->blockNeedsPredication here
    // because it doesn't query tail-folding. For stores, we need to prove
    // both speculation safety (which follows from the same argument as loads),
    // but also must prove the value being stored is correct. The easiest
    // form of the latter is to require that all values stored are the same.
    if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
        (isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
        !Legal->blockNeedsPredication(I->getParent()))
      return false;
    return true;
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // TODO: We can use the loop-preheader as context point here and get
    // context sensitive reasoning.
    return !isSafeToSpeculativelyExecute(I);
  case Instruction::Call:
    return Legal->isMaskRequired(I);
  }
}
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) const {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  assert(!isSafeToSpeculativelyExecute(I));

  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // Scalarization isn't legal for scalable vector types.
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost += VF.getKnownMinValue() *
        TTI.getCFInstrCost(Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost += VF.getKnownMinValue() *
        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
  }
  InstructionCost SafeDivisorCost = 0;

  auto *VecTy = ToVectorTy(I->getType(), VF);

  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost += TTI.getCmpSelInstrCost(
      Instruction::Select, VecTy,
      ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
      CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this are shifts on x86.
  Value *Op2 = I->getOperand(1);
  auto Op2Info = TTI.getOperandInfo(Op2);
  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
      Legal->isInvariant(Op2))
    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      I->getOpcode(), VecTy, CostKind,
      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
      Op2Info, Operands, I);
  return {ScalarizationCost, SafeDivisorCost};
}
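// Illustrative safe-divisor form (example IR, VF = 4): rather than scalarizing
// a predicated udiv, masked-off lanes first receive a benign divisor:
//   %safe.div = select <4 x i1> %mask, <4 x i32> %d,
//                      <4 x i32> <i32 1, i32 1, i32 1, i32 1>
//   %quot     = udiv <4 x i32> %x, %safe.div
// SafeDivisorCost above models this select plus the vector divide.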
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(ScalarTy, DL))
    return false;

  // If the group involves a non-integral pointer, we may not be able to
  // losslessly cast all values to a common type.
  unsigned InterleaveFactor = Group->getFactor();
  bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    Instruction *Member = Group->getMember(i);
    if (!Member)
      continue;
    auto *MemberTy = getLoadStoreType(Member);
    bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
    // Don't coerce non-integral pointers to integers or vice versa.
    if (MemberNI != ScalarNI) {
      // TODO: Consider adding special nullptr value case here
      return false;
    } else if (MemberNI && ScalarNI &&
               ScalarTy->getPointerAddressSpace() !=
                   MemberTy->getPointerAddressSpace()) {
      return false;
    }
  }

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(I->getParent()) &&
      Legal->isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  if (Group->isReverse())
    return false;

  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);
  auto *ScalarTy = getLoadStoreType(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I, VF))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.
  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that are scalar with predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I, VF)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  auto PrevVF = VF.divideCoefficientBy(2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto isUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(I))
        return false;
    }
    if (!Legal->isUniformMemOp(*I, VF))
      return false;
    if (isa<LoadInst>(I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
  };

  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (isUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
      return false;
    return getLoadStorePointerOperand(I) == Ptr &&
           (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(&I))
            addToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      // ExtractValue instructions must be uniform, because the operands are
      // known to be loop-invariant.
      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
        assert(isOutOfScope(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be loop invariant");
        addToWorklistIfAllowed(EVI);
        continue;
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      if (isUniformMemOpUse(&I))
        addToWorklistIfAllowed(&I);

      if (isVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(Ptr);
    }
  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (isOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(V);
    auto UsersAreMemAccesses =
        llvm::all_of(I->users(), [&](User *U) -> bool {
          return isVectorizedMemAccessUse(cast<Instruction>(U), V);
        });
    if (UsersAreMemAccesses)
      addToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFixedOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
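// For illustration (hypothetical canonical loop, not from a specific test):
// the latch compare `icmp ult i64 %iv.next, %n` is typically only used by the
// backedge branch, so it enters the worklist immediately; the induction
// handling above then keeps %iv and %iv.next uniform as well, because their
// only in-loop users (the compare and each other) are already uniform.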
bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getPredicate().isAlwaysTrue()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return ElementCount::getScalable(0);

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                            "ScalableVectorizationDisabled", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(MaxScalableVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo("Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  // Limit MaxScalableVF by the maximum safe dependence distance.
  if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
    MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
  else
    MaxScalableVF = ElementCount::getScalable(0);

  if (!MaxScalableVF)
    reportVectorizationInfo(
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}
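// Worked example with hypothetical numbers: if the maximum safe dependence
// distance allows 32 elements and the target's maximum vscale is 4, the
// clamp above yields MaxScalableVF = vscale x (32 / 4) = vscale x 8, i.e. a
// VF that stays within the safe distance even when vscale takes its largest
// possible value at runtime.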
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);

      return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "supported.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this since it's still likely to be
    // dynamically uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(MaxTC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    [[fallthrough]];
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                           "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration. This will
  // require a lane mask which varies through the vector loop body. (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(MaxTC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);

  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we choose.
  std::optional<unsigned> MaxPowerOf2RuntimeVF =
      MaxFactors.FixedVF.getFixedValue();
  if (MaxFactors.ScalableVF) {
    std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
    if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
      MaxPowerOf2RuntimeVF = std::max<unsigned>(
          *MaxPowerOf2RuntimeVF,
          *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
    } else
      MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
  }

  if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC =
        UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }
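  // Worked example with hypothetical values: if loop guards constrain the
  // trip count to a multiple of 16 and MaxVFtimesIC is 8, the URem above
  // folds to zero, so every chosen VF divides the trip count evenly, no tail
  // remains, and tail folding can be skipped.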
  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    CanFoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
    ElementCount MaxSafeVF, bool FoldTailByMasking) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
  if (MaxVectorElementCount.isScalable() &&
      TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
    auto Min = Attr.getVScaleRangeMin();
    WidestRegisterMinEC *= Min;
  }

  // When a scalar epilogue is required, at least one iteration of the scalar
  // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
  // max VF that results in a dead vector loop.
  if (MaxTripCount > 0 && requiresScalarEpilogue(true))
    MaxTripCount -= 1;

  if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
      (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
    // If upper bound loop trip count (TC) is known at compile time there is no
    // point in choosing VF greater than TC (as done in the loop below). Select
    // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
    // scalable, we only fall back on a fixed VF when the TC is less than or
    // equal to the known number of lanes.
    auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count: "
                      << ClampedUpperTripCount << "\n");
    return ElementCount::get(
        ClampedUpperTripCount,
        FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
  }

  TargetTransformInfo::RegisterKind RegKind =
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector;
  ElementCount MaxVF = MaxVectorElementCount;
  if (MaximizeBandwidth ||
      (MaximizeBandwidth.getNumOccurrences() == 0 &&
       (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
        (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }

    // Invalidate any widening decisions we might have made, in case the loop
    // requires prediction (decided later), but we have already made some
    // load/store widening decisions.
    invalidateCostModelingDecisions();
  }
  return MaxVF;
}
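// Worked example with hypothetical numbers: on a target with 256-bit fixed
// vector registers and a widest loop type of i32, MaxVectorElementCount is
// bit_floor(256 / 32) = 8 lanes. If the maximum trip count is known to be 6
// (and no scalar epilogue adjustment applies), the clamping path above
// returns bit_floor(6) = 4 lanes instead, so the vector loop actually runs.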
/// Convenience function that returns the value of vscale_range iff
/// vscale_range.min == vscale_range.max or otherwise returns the value
/// returned by the corresponding TTI method.
static std::optional<unsigned>
getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
  const Function *Fn = L->getHeader()->getParent();
  if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
    auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
    auto Min = Attr.getVScaleRangeMin();
    auto Max = Attr.getVScaleRangeMax();
    if (Max && Min == Max)
      return Max;
  }

  return TTI.getVScaleForTuning();
}
bool LoopVectorizationPlanner::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
    // If the trip count is a known (possibly small) constant, the trip count
    // will be rounded up to an integer number of iterations under
    // FoldTailByMasking. The total cost in that case will be
    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
    // some extra overheads, but for the purpose of comparing the costs of
    // different VFs we can use this to compare the total loop-body cost
    // expected after vectorization.
    auto GetCostForTC = [MaxTripCount, this](unsigned VF,
                                             InstructionCost VectorCost,
                                             InstructionCost ScalarCost) {
      return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
                                    : VectorCost * (MaxTripCount / VF) +
                                          ScalarCost * (MaxTripCount % VF);
    };
    auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
    auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);

    return RTCostA < RTCostB;
  }

  // Improve estimate for the vector width if it is scalable.
  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
  if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
    if (A.Width.isScalable())
      EstimatedWidthA *= *VScale;
    if (B.Width.isScalable())
      EstimatedWidthB *= *VScale;
  }

  // Assume vscale may be larger than 1 (or the value being tuned for),
  // so that scalable vectorization is slightly favorable over fixed-width
  // vectorization.
  if (A.Width.isScalable() && !B.Width.isScalable())
    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
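// Worked example with made-up costs: comparing A = {Width=4, Cost=20} against
// B = {Width=2, Cost=12} with no usable trip count cross-multiplies the
// per-lane costs: 20 * 2 = 40 versus 12 * 4 = 48, so A (cost 5 per lane) is
// reported as more profitable than B (cost 6 per lane) without any division.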
static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
                                   OptimizationRemarkEmitter *ORE,
                                   Loop *TheLoop) {
  if (InvalidCosts.empty())
    return;

  // Emit a report of VFs with invalid costs in the loop.

  // Group the remarks per instruction, keeping the instruction order from
  // InvalidCosts.
  std::map<Instruction *, unsigned> Numbering;
  unsigned I = 0;
  for (auto &Pair : InvalidCosts)
    if (!Numbering.count(Pair.first))
      Numbering[Pair.first] = I++;

  // Sort the list, first on instruction(number) then on VF.
  sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
    if (Numbering[A.first] != Numbering[B.first])
      return Numbering[A.first] < Numbering[B.first];
    ElementCountComparator ECC;
    return ECC(A.second, B.second);
  });

  // For a list of ordered instruction-vf pairs:
  //   [(load, vf1), (load, vf2), (store, vf1)]
  // Group the instructions together to emit separate remarks for:
  //   load  (vf1, vf2)
  //   store (vf1)
  auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
  auto Subset = ArrayRef<InstructionVFPair>();
  do {
    if (Subset.empty())
      Subset = Tail.take_front(1);

    Instruction *I = Subset.front().first;

    // If the next instruction is different, or if there are no other pairs,
    // emit a remark for the collated subset. e.g.
    //   [(load, vf1), (load, vf2)]
    // to emit:
    //   remark: invalid costs for 'load' at VF=(vf1, vf2)
    if (Subset == Tail || Tail[Subset.size()].first != I) {
      std::string OutString;
      raw_string_ostream OS(OutString);
      assert(!Subset.empty() && "Unexpected empty range");
      OS << "Instruction with invalid costs prevented vectorization at VF=(";
      for (const auto &Pair : Subset)
        OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
      OS << "):";
      if (auto *CI = dyn_cast<CallInst>(I))
        OS << " call to " << CI->getCalledFunction()->getName();
      else
        OS << " " << I->getOpcodeName();

      reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
      Tail = Tail.drop_front(Subset.size());
      Subset = {};
    } else
      // Grow the subset by one element
      Subset = Tail.take_front(Subset.size() + 1);
  } while (!Tail.empty());
}
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
    const ElementCountSet &VFCandidates) {
  InstructionCost ExpectedCost =
      CM.expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
                                       ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && VFCandidates.size() > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    LoopVectorizationCostModel::VectorizationCostTy C =
        CM.expectedCost(i, &InvalidCosts);
    VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);

#ifndef NDEBUG
    unsigned AssumedMinimumVscale =
        getVScaleForTuning(OrigLoop, TTI).value_or(1);
    unsigned Width =
        Candidate.Width.isScalable()
            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);

  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        "There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
    ElementCount VF) const {
  // Cross iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(OrigLoop->getHeader()->phis(),
             [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (const auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc =
        Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
    for (User *U : PostInc->users())
      if (!OrigLoop->contains(cast<Instruction>(U)))
        return false;
    // Look for uses of penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!OrigLoop->contains(cast<Instruction>(U)))
        return false;
  }

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
    return false;

  return true;
}
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out entirely.
  if (!TTI.preferEpilogueVectorization())
    return false;

  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (eg. MVE).
  if (TTI.getMaxInterleaveFactor(VF) <= 1)
    return false;

  unsigned Multiplier = 1;
  if (VF.isScalable())
    Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
  if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
    return true;
  return false;
}
VectorizationFactor
LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, unsigned IC) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0, 0};

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
                         "viable.\n");
    return Result;
  }

  if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
      OrigLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
    return Result;
  }

  if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop.\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = MainLoopVF;
  if (MainLoopVF.isScalable()) {
    EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
    if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
      EstimatedRuntimeVF *= *VScale;
  }

  ScalarEvolution &SE = *PSE.getSE();
  Type *TCType = Legal->getWidestInductionType();
  const SCEV *RemainingIterations = nullptr;
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.
    if (!hasPlanWithVF(NextVF.Width))
      continue;

    // Skip candidate VFs with widths >= the estimated runtime VF (scalable
    // vectors) or the VF of the main loop (fixed vectors).
    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
         ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
        ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
      continue;

    // If NextVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors.
    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
      // TODO: extend to support scalable VFs.
      if (!RemainingIterations) {
        const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
        RemainingIterations = SE.getURemExpr(
            TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
      }
      if (SE.isKnownPredicate(
              CmpInst::ICMP_UGT,
              SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
              RemainingIterations))
        continue;
    }

    if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
      Result = NextVF;
  }

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n");
  return Result;
}
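// Worked example with hypothetical values: for a fixed main-loop VF of 16,
// IC = 1 and a trip count of 100, RemainingIterations is 100 % 16 = 4, so a
// candidate epilogue VF of 8 would leave the epilogue vector loop dead and
// is skipped, while VF = 4 is still considered.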
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
  // For in-loop reductions, no element types are added to ElementTypesInLoop
  // if there are no loads/stores in the loop. In this case, check through the
  // reduction variables to determine the maximum width.
  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
    // Reset MaxWidth so that we can find the smallest type used by recurrences
    // in the loop.
    MaxWidth = -1U;
    for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
      // When finding the min width used by the recurrence we need to account
      // for casts on the input operands of the recurrence.
      MaxWidth = std::min<unsigned>(
          MaxWidth, std::min<unsigned>(
                        RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
                        RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
    }
  } else {
    for (Type *T : ElementTypesInLoop) {
      MinWidth = std::min<unsigned>(
          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
      MaxWidth = std::max<unsigned>(
          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
    }
  }
  return {MinWidth, MaxWidth};
}
void LoopVectorizationCostModel::collectElementTypesForWidening() {
  ElementTypesInLoop.clear();
  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        const RecurrenceDescriptor &RdxDesc =
            Legal->getReductionVars().find(PN)->second;
        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                      RdxDesc.getRecurrenceType(),
                                      TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      assert(T->isSized() &&
             "Expected the load/store/recurrence type to be sized");

      ElementTypesInLoop.insert(T);
    }
  }
}
unsigned
LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                  InstructionCost LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (!Legal->isSafeForAnyVectorWidth())
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF = 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    LoopCost = expectedCost(VF).first;
    assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
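  // Worked example with hypothetical numbers: given 32 registers in a class,
  // 2 of them held by loop-invariant values and a peak in-loop usage of 6
  // registers per instance, the computation below yields
  // bit_floor((32 - 2) / 6) = bit_floor(5) = 4 interleaved instances.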
  unsigned IC = UINT_MAX;

  for (auto &pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
                                     MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  unsigned EstimatedVF = VF.getKnownMinValue();
  if (VF.isScalable()) {
    if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
      EstimatedVF *= *VScale;
  }
  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");

  unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  if (KnownTC > 0) {
    // If trip count is known we select between two prospective ICs, where
    // 1) the aggressive IC is capped by the trip count divided by VF
    // 2) the conservative IC is capped by the trip count divided by (VF * 2)
    // The final IC is selected in a way that the epilogue loop trip count is
    // minimized while maximizing the IC itself, so that we either run the
    // vector loop at least once if it generates a small epilogue loop, or else
    // we run the vector loop at least twice.

    unsigned InterleaveCountUB = bit_floor(
        std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
    unsigned InterleaveCountLB = bit_floor(std::max(
        1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
    MaxInterleaveCount = InterleaveCountLB;

    if (InterleaveCountUB != InterleaveCountLB) {
      unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
      unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
      // If both produce same scalar tail, maximize the IC to do the same work
      // in fewer vector loop iterations
      if (TailTripCountUB == TailTripCountLB)
        MaxInterleaveCount = InterleaveCountUB;
    }
  } else if (BestKnownTC) {
    // If trip count is an estimated compile time constant, limit the
    // IC to be capped by the trip count divided by VF * 2, such that the vector
    // loop runs at least twice to make interleaving seem profitable when there
    // is an epilogue loop present. Since exact Trip count is not known we
    // choose to be conservative in our IC estimate.
    MaxInterleaveCount = bit_floor(std::max(
        1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
  }
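  // Worked example with hypothetical numbers for the known-trip-count branch
  // above: with KnownTC = 64, EstimatedVF = 8 and a target maximum of 8,
  // InterleaveCountUB = bit_floor(min(64 / 8, 8)) = 8 and InterleaveCountLB =
  // bit_floor(min(64 / 16, 8)) = 4. Both leave a scalar tail of zero
  // iterations, so the larger count (8) is kept to finish the same work in
  // fewer vector iterations.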

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
                                        SmallLoopCost / *LoopCost.getValue()));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RecurrenceDescriptor::isAnyOfRecurrenceKind(
              RdxDesc.getRecurrenceKind());
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order to
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
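  //
  // Illustration with a hypothetical schedule: if the value defined at
  // position #3 has its last in-loop use at #7, it contributes the interval
  // (3, 7); when the scan reaches #5, every interval covering #5 (say (3, 7)
  // and (4, 6)) is still open, so two registers of that class are counted as
  // live at that point.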
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are defined outside
  // the loop (not including non-instruction values such as arguments and
  // constants).
  SmallSetVector<Instruction *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If
        // for example an argument is used inside the loop it will increase the
        // register pressure (so shouldn't we add it to LoopInvariants).
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
      return 0;
    return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    collectInLoopReductions();

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of registers used, per register class, given all open
      // intervals.
      // Note that elements in this SmallMapVector will be default constructed
      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
      // there is no previous entry for ClassID.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            // FIXME: The target might use more than one register for the type
            // even in the scalar case.
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      for (auto &pair : RegUsage) {
        auto &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }
  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    // Note that elements in this SmallMapVector will be default constructed
    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
    // there is no previous entry for ClassID.
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto *Inst : LoopInvariants) {
      // FIXME: The target might use more than one register for the type
      // even in the scalar case.
      bool IsScalar = all_of(Inst->users(), [&](User *U) {
        auto *I = cast<Instruction>(U);
        return TheLoop != LI->getLoopFor(I->getParent()) ||
               isScalarAfterVectorization(I, VFs[i]);
      });

      ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
      unsigned ClassID =
          TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
      Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
                                                           ElementCount VF) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert((isPredicatedInst(I)) &&
         "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  PredicatedBBsAfterVectorization[VF].clear();

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount if scalable, because that would lead to
        // invalid scalarization costs.
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization[VF].insert(BB);
      }
  }
}
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(I))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
          /*Extract*/ false, CostKind);
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // extraction.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
              /*Extract*/ true, CostKind);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
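    // For example (figures purely illustrative), a VectorCost of 10 against a
    // probability-scaled ScalarCost of 6 adds 4 to the discount, pushing the
    // chain towards scalarization.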
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(
    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.count(&I) ||
          (VF.isVector() && VecValuesToIgnore.count(&I)))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (C.first.isValid() &&
          ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = InstructionCost(ForceTargetInstructionCost);

      // Keep a list of instructions with invalid costs.
      if (Invalid && !C.first.isValid())
        Invalid->emplace_back(&I, VF);

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as to not include all blocks in tail folded loops.
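    // For example, with the reciprocal predicated-block probability of 2 used
    // below, such a block is assumed to execute on roughly half of the scalar
    // iterations, so its scalar cost is halved.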
    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}
/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
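///
/// For illustration, given a GEP such as
///   %gep = getelementptr inbounds [256 x i32], ptr %A, i64 %inv, i64 %iv
/// where %inv is loop invariant and %iv is an induction variable, the SCEV of
/// the pointer is returned; if any index is neither loop invariant nor an
/// induction variable, nullptr is returned instead.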
static const SCEV *getAddressAccessSCEV(
    Value *Ptr,
    LoopVectorizationLegality *Legal,
    PredicatedScalarEvolution &PSE,
    const Loop *TheLoop) {

  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  //       that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
                                                      ValTy->getScalarType(),
                                                      Alignment, AS, CostKind);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF, CostKind);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
    Cost += TTI.getScalarizationOverhead(
        Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true, CostKind);
    Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                    ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const Align Alignment = getLoadStoreAlignment(I);
  InstructionCost Cost = 0;
  if (Legal->isMaskRequired(I)) {
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                      CostKind);
  } else {
    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                CostKind, OpInfo, I);
  }

  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
                               std::nullopt, CostKind, 0);
  return Cost;
}
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                ElementCount VF) {
  assert(Legal->isUniformMemOp(*I, VF));

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                               CostKind) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                             CostKind) +
         (isLoopInvariantStoreValue
              ? 0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                       CostKind, VF.getKnownMinValue() - 1));
}
InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
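  // For illustration: with VF = 4 and an interleave factor of 2, WideVecTy has
  // 8 elements and covers both members of the group; if only member 0 is
  // present, Indices below will contain just {0} and a store must mask the
  // resulting gaps.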
  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
                               std::nullopt, CostKind, 0);
  }
  return Cost;
}
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty,
    TTI::TargetCostKind CostKind) const {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return std::nullopt;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //   reduce(mul(ext(A), ext(B))) or
  //   reduce(mul(A, B)) or
  //   reduce(ext(A)) or
  //   reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // to be used.
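  //
  // Purely as an illustration, a chain of the form
  //   reduce.add(mul(sext(A), sext(B)))
  // can often be costed as a single multiply-accumulate reduction via
  // TTI::getMulAccReductionCost, rather than paying separately for the two
  // extends, the multiply and the add reduction.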
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return std::nullopt;
    RetI = RetI->user_back();
  }

  if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return std::nullopt;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce.add(ext(mul(ext(A), ext(B))))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Op0);
    auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
    auto *MulType = VectorType::get(Op0->getType(), VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
                             TTI::CastContextHint::None, CostKind, Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
                             TTI::CastContextHint::None, CostKind, RedOp);

    InstructionCost RedCost = TTI.getMulAccReductionCost(
        IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);

    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
             !TheLoop->isLoopInvariant(RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
    InstructionCost RedCost = TTI.getExtendedReductionCost(
        RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        RdxDesc.getFastMathFlags(), CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
                             TTI::CastContextHint::None, CostKind, RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
             match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
    if (match(Op0, m_ZExtOrSExt(m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Op0);
      Type *Op0Ty = Op0->getOperand(0)->getType();
      Type *Op1Ty = Op1->getOperand(0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(LargestOpTy, VectorTy);

      // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            ExtraExtOp->getOpcode(), ExtType,
            VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
            TTI::CastContextHint::None, CostKind, ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
      // Matched reduce.add(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
}
InstructionCost
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                     ElementCount VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
  if (VF.isScalar()) {
    Type *ValTy = getLoadStoreType(I);
    const Align Alignment = getLoadStoreAlignment(I);
    unsigned AS = getLoadStoreAddressSpace(I);

    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
                               TTI::TCK_RecipThroughput, OpInfo, I);
  }
  return getWideningCost(I, VF);
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(I))
      return VectorizationCostTy(
          (getInstructionCost(I, ElementCount::getFixed(1)).first *
           VF.getKnownMinValue()),
          false);
  }

  Type *VectorTy;
  InstructionCost C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized = false;
  if (VF.isVector() && VectorTy->isVectorTy()) {
    if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
      if (VF.isScalable())
        // <vscale x 1 x iN> is assumed to be profitable over iN because
        // scalable registers are a distinct register class from scalar ones.
        // If we ever find a target which wants to lower scalable vectors
        // back to scalars, we'll need to update this code to explicitly
        // ask TTI about the register class uses for each part.
        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
      else
        TypeNotScalarized = NumParts < VF.getKnownMinValue();
    } else
      C = InstructionCost::getInvalid();
  }
  return VectorizationCostTy(C, TypeNotScalarized);
}
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {

  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
        /*Insert*/ true,
        /*Extract*/ false, CostKind);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not
  // incur any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), Tys, CostKind);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto isLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(I))
            return true;

          // A uniform store isn't necessarily uniform-by-part
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(I);
          return TheLoop->isLoopInvariant(SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I, VF) ?
          getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost = isLegalToScalarize() ?
          getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();

        // Choose the better solution for the current VF. Note that Invalid
        // costs compare as maximally large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
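        // (An invalid InstructionCost compares greater than any valid cost, so
        // whenever exactly one of the two options below is valid it wins the
        // comparison.)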
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
        else
          setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I, VF)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;
6637 SmallPtrSet
<Instruction
*, 8> AddrDefs
;
6638 for (BasicBlock
*BB
: TheLoop
->blocks())
6639 for (Instruction
&I
: *BB
) {
6640 Instruction
*PtrDef
=
6641 dyn_cast_or_null
<Instruction
>(getLoadStorePointerOperand(&I
));
6642 if (PtrDef
&& TheLoop
->contains(PtrDef
) &&
6643 getWideningDecision(&I
, VF
) != CM_GatherScatter
)
6644 AddrDefs
.insert(PtrDef
);
6647 // Add all instructions used to generate the addresses.
6648 SmallVector
<Instruction
*, 4> Worklist
;
6649 append_range(Worklist
, AddrDefs
);
6650 while (!Worklist
.empty()) {
6651 Instruction
*I
= Worklist
.pop_back_val();
6652 for (auto &Op
: I
->operands())
6653 if (auto *InstOp
= dyn_cast
<Instruction
>(Op
))
6654 if ((InstOp
->getParent() == I
->getParent()) && !isa
<PHINode
>(InstOp
) &&
6655 AddrDefs
.insert(InstOp
).second
)
6656 Worklist
.push_back(InstOp
);
6659 for (auto *I
: AddrDefs
) {
6660 if (isa
<LoadInst
>(I
)) {
6661 // Setting the desired widening decision should ideally be handled in
6662 // by cost functions, but since this involves the task of finding out
6663 // if the loaded register is involved in an address computation, it is
6664 // instead changed here when we know this is the case.
6665 InstWidening Decision
= getWideningDecision(I
, VF
);
6666 if (Decision
== CM_Widen
|| Decision
== CM_Widen_Reverse
)
6667 // Scalarize a widened load of address.
6668 setWideningDecision(
6669 I
, VF
, CM_Scalarize
,
6670 (VF
.getKnownMinValue() *
6671 getMemoryInstructionCost(I
, ElementCount::getFixed(1))));
6672 else if (auto Group
= getInterleavedAccessGroup(I
)) {
6673 // Scalarize an interleave group of address loads.
6674 for (unsigned I
= 0; I
< Group
->getFactor(); ++I
) {
6675 if (Instruction
*Member
= Group
->getMember(I
))
6676 setWideningDecision(
6677 Member
, VF
, CM_Scalarize
,
6678 (VF
.getKnownMinValue() *
6679 getMemoryInstructionCost(Member
, ElementCount::getFixed(1))));
6683 // Make sure I gets scalarized and a cost estimate without
6684 // scalarization overhead.
6685 ForcedScalars
[VF
].insert(I
);
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
  assert(!VF.isScalar() &&
         "Trying to set a vectorization decision for a scalar VF");

  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      CallInst *CI = dyn_cast<CallInst>(&I);
      if (!CI)
        continue;

      InstructionCost ScalarCost = InstructionCost::getInvalid();
      InstructionCost VectorCost = InstructionCost::getInvalid();
      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

      Function *ScalarFunc = CI->getCalledFunction();
      Type *ScalarRetTy = CI->getType();
      SmallVector<Type *, 4> Tys, ScalarTys;
      bool MaskRequired = Legal->isMaskRequired(CI);
      for (auto &ArgOp : CI->args())
        ScalarTys.push_back(ArgOp->getType());

      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = ToVectorTy(ScalarRetTy, VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(ToVectorTy(ScalarTy, VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
        if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
          setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
                                  getVectorIntrinsicIDForCall(CI, TLI),
                                  std::nullopt, *RedCost);
          continue;
        }

      // Estimate cost of scalarized vector call. The source operands are
      // assumed to be vectors, so we need to extract individual elements from
      // there, execute VF scalar calls, and then gather the result into the
      // vector return value.
      InstructionCost ScalarCallCost =
          TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);

      // Compute costs of unpacking argument values for the scalar calls and
      // packing the return values to a vector.
      InstructionCost ScalarizationCost =
          getScalarizationOverhead(CI, VF, CostKind);

      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      bool UsesMask = false;
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
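      // (Illustrative only: with the vector-function ABI mangling, a mapping
      // such as _ZGVnN4v_foo, an unmasked 4-lane variant, would only be
      // considered here when VF is 4, and a masked variant is required
      // whenever the call itself needs a mask.)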
      for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required.
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported.
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
          case VFParamKind::Vector:
            break;
          case VFParamKind::OMP_Uniform: {
            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
                                              TheLoop))
              ParamsOk = false;
            break;
          }
          case VFParamKind::OMP_Linear: {
            Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
            ScalarEvolution *SE = PSE.getSE();
            const auto *SAR =
                dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));

            if (!SAR || SAR->getLoop() != TheLoop) {
              ParamsOk = false;
              break;
            }

            const SCEVConstant *Step =
                dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));

            if (!Step ||
                Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
              ParamsOk = false;

            break;
          }
          case VFParamKind::GlobalPredicate:
            UsesMask = true;
            break;
          default:
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Info.VectorName);
        FuncInfo = Info;
        break;
      }

      // Add in the cost of synthesizing a mask if one wasn't required.
      InstructionCost MaskCost = 0;
      if (VecFunc && UsesMask && !MaskRequired)
        MaskCost = TTI.getShuffleCost(
            TargetTransformInfo::SK_Broadcast,
            VectorType::get(IntegerType::getInt1Ty(
                                VecFunc->getFunctionType()->getContext()),
                            VF));

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost =
            TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
                              FuncInfo.getParamIndexForOptionalMask(), Cost);
    }
  }
}
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
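    // For example, at a fixed VF of 4 the splice mask built below is
    // <3, 4, 5, 6>: the last element of the previous iteration's vector
    // followed by the first three elements of the current one.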
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
                                cast<VectorType>(VectorTy), Mask, CostKind,
                                VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
                 CmpInst::BAD_ICMP_PREDICATE, CostKind);

    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    if (VF.isVector() && isPredicatedInst(I)) {
      const auto [ScalarCost, SafeDivisorCost] =
          getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost)
                 ? ScalarCost
                 : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away. We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        (PSE.getSCEV(I->getOperand(0))->isOne() ||
         PSE.getSCEV(I->getOperand(1))->isOne()))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    auto Op2Info = TTI.getOperandInfo(Op2);
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        Legal->isInvariant(Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    auto InstrCost = TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        Op2Info, Operands, I);

    // Some targets can replace frem with vector library calls.
    InstructionCost VecCallCost = InstructionCost::getInvalid();
    if (I->getOpcode() == Instruction::FRem) {
      LibFunc Func;
      if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
          TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
        SmallVector<Type *, 4> OpTypes;
        for (auto &Op : I->operands())
          OpTypes.push_back(Op->getType());
        VecCallCost =
            TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
      }
    }
    return std::min(InstrCost, VecCallCost);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind,
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
        I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
                        match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And,
          VectorTy, CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
                                  CostKind, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
                                  cast<CmpInst>(I)->getPredicate(), CostKind,
                                  I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(1);
    }
    VectorTy = ToVectorTy(getLoadStoreType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
                                        : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal("Instr has invalid widening decision");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (Opcode == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
        // Leave SrcVecTy unchanged - we only shrink the destination element
        // type.
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(cast<CallInst>(I), VF);
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    [[fallthrough]];
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  } // end of switch.
}
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Find all stores to invariant variables. Since they are going to sink
  // outside the loop we do not need to calculate cost for them.
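  // For example (illustrative only), a reduction whose result is stored to a
  // loop-invariant address on every iteration:
  //   for (i = 0; i < n; ++i) { sum += a[i]; *p = sum; }
  // The store to 'p' is sunk out of the vectorized loop, so its in-loop cost
  // is irrelevant here.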
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(&I)) &&
          Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
        ValuesToIgnore.insert(&I);
    }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}
void LoopVectorizationCostModel::collectInLoopReductions() {
  for (const auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such.
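    // Roughly speaking, an out-of-loop reduction keeps a vector accumulator
    // alive across iterations and reduces it to a scalar once after the loop,
    // whereas an in-loop reduction reduces the vector operand to a scalar on
    // every iteration (e.g. via a vector.reduce.* intrinsic), which some
    // targets prefer or require (ordered FP reductions).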
    unsigned Opcode = RdxDesc.getOpcode();
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Opcode, Phi->getType(),
                                   TargetTransformInfo::ReductionFlags()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();

    if (InLoop) {
      InLoopReductions.insert(Phi);
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}
VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
                               DebugLoc DL, const Twine &Name) {
  assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
         Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
  return tryInsertInstruction(
      new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
}
// This function will select a scalable VF if the target supports scalable
// vectors and a fixed one otherwise.
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
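// For example (schematically): with 128-bit fixed-width vector registers and
// a widest loop type of 32 bits this returns a fixed VF of 128/32 = 4; if the
// target enables scalable vectorization it returns vscale x 4 instead.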
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
                                     LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();

  TargetTransformInfo::RegisterKind RegKind =
      TTI.enableScalableVectorization()
          ? TargetTransformInfo::RGK_ScalableVector
          : TargetTransformInfo::RGK_FixedWidthVector;

  TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
  unsigned N = RegSize.getKnownMinValue() / WidestType;
  return ElementCount::get(N, RegSize.isScalable());
}
std::optional<VectorizationFactor>
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          "Scalable vectorization requested but not supported by the target",
          "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          "ScalableVFUnfeasible", ORE, OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return std::nullopt;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.collectInLoopReductions();
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
      buildVPlansWithVPRecipes(UserVF, UserVF);
      if (!hasPlanWithVF(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
                          << ".\n");
        return std::nullopt;
      }

      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0, 0}};
    }
    reportVectorizationInfo("UserVF ignored because of invalid costs.",
                            "InvalidCost", ORE, OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(VF);
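  // For example (schematically), with MaxFactors.FixedVF == 16 and
  // MaxFactors.ScalableVF == vscale x 4, the candidate set is
  // {1, 2, 4, 8, 16, vscale x 1, vscale x 2, vscale x 4}.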

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
  assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be non-zero.");
  if (!hasPlanWithVF(VF.Width)) {
    LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
                      << ".\n");
    return std::nullopt;
  }
  return VF;
}
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
  assert(count_if(VPlans,
                  [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
             1 &&
         "Best VF has not a single VPlan.");

  for (const VPlanPtr &Plan : VPlans) {
    if (Plan->hasVF(VF))
      return *Plan;
  }
  llvm_unreachable("No plan found!");
}
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata =
            S && S->getString().starts_with("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
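    // Schematically, the rewritten loop ID ends up looking like:
    //   !llvm.loop !0
    //   !0 = distinct !{!0, <existing operands...>, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}
    // (illustrative metadata numbering).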
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}
7483 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is
7484 // create a merge phi node for it and add it to \p ReductionResumeValues.
7485 static void createAndCollectMergePhiForReduction(
7486 VPInstruction
*RedResult
,
7487 DenseMap
<const RecurrenceDescriptor
*, Value
*> &ReductionResumeValues
,
7488 VPTransformState
&State
, Loop
*OrigLoop
, BasicBlock
*LoopMiddleBlock
) {
7490 RedResult
->getOpcode() != VPInstruction::ComputeReductionResult
)
7493 auto *PhiR
= cast
<VPReductionPHIRecipe
>(RedResult
->getOperand(0));
7494 const RecurrenceDescriptor
&RdxDesc
= PhiR
->getRecurrenceDescriptor();
7496 TrackingVH
<Value
> ReductionStartValue
= RdxDesc
.getRecurrenceStartValue();
7498 State
.get(RedResult
, VPIteration(State
.UF
- 1, VPLane::getFirstLane()));
7500 dyn_cast
<PHINode
>(PhiR
->getStartValue()->getUnderlyingValue());
7502 // TODO: bc.merge.rdx should not be created here, instead it should be
7503 // modeled in VPlan.
7504 BasicBlock
*LoopScalarPreHeader
= OrigLoop
->getLoopPreheader();
7505 // Create a phi node that merges control-flow from the backedge-taken check
7506 // block and the middle block.
7507 auto *BCBlockPhi
= PHINode::Create(FinalValue
->getType(), 2, "bc.merge.rdx",
7508 LoopScalarPreHeader
->getTerminator());
7510 // If we are fixing reductions in the epilogue loop then we should already
7511 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7512 // we carry over the incoming values correctly.
7513 for (auto *Incoming
: predecessors(LoopScalarPreHeader
)) {
7514 if (Incoming
== LoopMiddleBlock
)
7515 BCBlockPhi
->addIncoming(FinalValue
, Incoming
);
7516 else if (ResumePhi
&& is_contained(ResumePhi
->blocks(), Incoming
))
7517 BCBlockPhi
->addIncoming(ResumePhi
->getIncomingValueForBlock(Incoming
),
7520 BCBlockPhi
->addIncoming(ReductionStartValue
, Incoming
);
7523 auto *OrigPhi
= cast
<PHINode
>(PhiR
->getUnderlyingValue());
7524 // TODO: This fixup should instead be modeled in VPlan.
7525 // Fix the scalar loop reduction variable with the incoming reduction sum
7526 // from the vector body and from the backedge value.
7527 int IncomingEdgeBlockIdx
=
7528 OrigPhi
->getBasicBlockIndex(OrigLoop
->getLoopLatch());
7529 assert(IncomingEdgeBlockIdx
>= 0 && "Invalid block index");
7530 // Pick the other block.
7531 int SelfEdgeBlockIdx
= (IncomingEdgeBlockIdx
? 0 : 1);
7532 OrigPhi
->setIncomingValue(SelfEdgeBlockIdx
, BCBlockPhi
);
7533 Instruction
*LoopExitInst
= RdxDesc
.getLoopExitInstr();
7534 OrigPhi
->setIncomingValue(IncomingEdgeBlockIdx
, LoopExitInst
);
7536 ReductionResumeValues
[&RdxDesc
] = BCBlockPhi
;
7539 std::pair
<DenseMap
<const SCEV
*, Value
*>,
7540 DenseMap
<const RecurrenceDescriptor
*, Value
*>>
7541 LoopVectorizationPlanner::executePlan(
7542 ElementCount BestVF
, unsigned BestUF
, VPlan
&BestVPlan
,
7543 InnerLoopVectorizer
&ILV
, DominatorTree
*DT
, bool IsEpilogueVectorization
,
7544 const DenseMap
<const SCEV
*, Value
*> *ExpandedSCEVs
) {
7545 assert(BestVPlan
.hasVF(BestVF
) &&
7546 "Trying to execute plan with unsupported VF");
7547 assert(BestVPlan
.hasUF(BestUF
) &&
7548 "Trying to execute plan with unsupported UF");
7550 (IsEpilogueVectorization
|| !ExpandedSCEVs
) &&
7551 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7553 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
<< ", UF=" << BestUF
7556 if (!IsEpilogueVectorization
)
7557 VPlanTransforms::optimizeForVFAndUF(BestVPlan
, BestVF
, BestUF
, PSE
);
7559 // Perform the actual loop transformation.
7560 VPTransformState
State(BestVF
, BestUF
, LI
, DT
, ILV
.Builder
, &ILV
, &BestVPlan
,
7561 OrigLoop
->getHeader()->getContext());
7563 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7564 // before making any changes to the CFG.
7565 if (!BestVPlan
.getPreheader()->empty()) {
7566 State
.CFG
.PrevBB
= OrigLoop
->getLoopPreheader();
7567 State
.Builder
.SetInsertPoint(OrigLoop
->getLoopPreheader()->getTerminator());
7568 BestVPlan
.getPreheader()->execute(&State
);
7570 if (!ILV
.getTripCount())
7571 ILV
.setTripCount(State
.get(BestVPlan
.getTripCount(), {0, 0}));
7573 assert(IsEpilogueVectorization
&& "should only re-use the existing trip "
7574 "count during epilogue vectorization");
7576 // 1. Set up the skeleton for vectorization, including vector pre-header and
7577 // middle block. The vector loop is created during VPlan execution.
7578 Value
*CanonicalIVStartValue
;
7579 std::tie(State
.CFG
.PrevBB
, CanonicalIVStartValue
) =
7580 ILV
.createVectorizedLoopSkeleton(ExpandedSCEVs
? *ExpandedSCEVs
7581 : State
.ExpandedSCEVs
);
7583 // Only use noalias metadata when using memory checks guaranteeing no overlap
7584 // across all iterations.
7585 const LoopAccessInfo
*LAI
= ILV
.Legal
->getLAI();
7586 std::unique_ptr
<LoopVersioning
> LVer
= nullptr;
7587 if (LAI
&& !LAI
->getRuntimePointerChecking()->getChecks().empty() &&
7588 !LAI
->getRuntimePointerChecking()->getDiffChecks()) {
7590 // We currently don't use LoopVersioning for the actual loop cloning but we
7591 // still use it to add the noalias metadata.
7592 // TODO: Find a better way to re-use LoopVersioning functionality to add
7594 LVer
= std::make_unique
<LoopVersioning
>(
7595 *LAI
, LAI
->getRuntimePointerChecking()->getChecks(), OrigLoop
, LI
, DT
,
7597 State
.LVer
= &*LVer
;
7598 State
.LVer
->prepareNoAliasMetadata();
7601 ILV
.collectPoisonGeneratingRecipes(State
);
7603 ILV
.printDebugTracesAtStart();
7605 //===------------------------------------------------===//
7607 // Notice: any optimization or new instruction that go
7608 // into the code below should also be implemented in
7611 //===------------------------------------------------===//
7613 // 2. Copy and widen instructions from the old loop into the new loop.
7614 BestVPlan
.prepareToExecute(ILV
.getTripCount(),
7615 ILV
.getOrCreateVectorTripCount(nullptr),
7616 CanonicalIVStartValue
, State
);
7618 BestVPlan
.execute(&State
);
7620 // 2.5 Collect reduction resume values.
7621 DenseMap
<const RecurrenceDescriptor
*, Value
*> ReductionResumeValues
;
7623 cast
<VPBasicBlock
>(BestVPlan
.getVectorLoopRegion()->getSingleSuccessor());
7624 for (VPRecipeBase
&R
: *ExitVPBB
) {
7625 createAndCollectMergePhiForReduction(dyn_cast
<VPInstruction
>(&R
),
7626 ReductionResumeValues
, State
, OrigLoop
,
7627 State
.CFG
.VPBB2IRBB
[ExitVPBB
]);
7630 // 2.6. Maintain Loop Hints
7631 // Keep all loop hints from the original loop on the vector loop (we'll
7632 // replace the vectorizer-specific hints below).
7633 MDNode
*OrigLoopID
= OrigLoop
->getLoopID();
7635 std::optional
<MDNode
*> VectorizedLoopID
=
7636 makeFollowupLoopID(OrigLoopID
, {LLVMLoopVectorizeFollowupAll
,
7637 LLVMLoopVectorizeFollowupVectorized
});
7639 VPBasicBlock
*HeaderVPBB
=
7640 BestVPlan
.getVectorLoopRegion()->getEntryBasicBlock();
7641 Loop
*L
= LI
->getLoopFor(State
.CFG
.VPBB2IRBB
[HeaderVPBB
]);
7642 if (VectorizedLoopID
)
7643 L
->setLoopID(*VectorizedLoopID
);
7645 // Keep all loop hints from the original loop on the vector loop (we'll
7646 // replace the vectorizer-specific hints below).
7647 if (MDNode
*LID
= OrigLoop
->getLoopID())
7650 LoopVectorizeHints
Hints(L
, true, *ORE
);
7651 Hints
.setAlreadyVectorized();
7653 TargetTransformInfo::UnrollingPreferences UP
;
7654 TTI
.getUnrollingPreferences(L
, *PSE
.getSE(), UP
, ORE
);
7655 if (!UP
.UnrollVectorizedLoop
|| CanonicalIVStartValue
)
7656 AddRuntimeUnrollDisableMetaData(L
);
7658 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7659 // predication, updating analyses.
7660 ILV
.fixVectorizedLoop(State
, BestVPlan
);
7662 ILV
.printDebugTracesAtEnd();
7664 return {State
.ExpandedSCEVs
, ReductionResumeValues
};
7667 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7668 void LoopVectorizationPlanner::printPlans(raw_ostream
&O
) {
7669 for (const auto &Plan
: VPlans
)
7670 if (PrintVPlansInDotFormat
)
7677 //===--------------------------------------------------------------------===//
7678 // EpilogueVectorizerMainLoop
7679 //===--------------------------------------------------------------------===//
7681 /// This function is partially responsible for generating the control flow
7682 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7683 std::pair
<BasicBlock
*, Value
*>
7684 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7685 const SCEV2ValueTy
&ExpandedSCEVs
) {
7686 createVectorLoopSkeleton("");
7688 // Generate the code to check the minimum iteration count of the vector
7689 // epilogue (see below).
7690 EPI
.EpilogueIterationCountCheck
=
7691 emitIterationCountCheck(LoopScalarPreHeader
, true);
7692 EPI
.EpilogueIterationCountCheck
->setName("iter.check");
7694 // Generate the code to check any assumptions that we've made for SCEV
7696 EPI
.SCEVSafetyCheck
= emitSCEVChecks(LoopScalarPreHeader
);
7698 // Generate the code that checks at runtime if arrays overlap. We put the
7699 // checks into a separate block to make the more common case of few elements
7701 EPI
.MemSafetyCheck
= emitMemRuntimeChecks(LoopScalarPreHeader
);
7703 // Generate the iteration count check for the main loop, *after* the check
7704 // for the epilogue loop, so that the path-length is shorter for the case
7705 // that goes directly through the vector epilogue. The longer-path length for
7706 // the main loop is compensated for, by the gain from vectorizing the larger
7707 // trip count. Note: the branch will get updated later on when we vectorize
7709 EPI
.MainLoopIterationCountCheck
=
7710 emitIterationCountCheck(LoopScalarPreHeader
, false);
7712 // Generate the induction variable.
7713 EPI
.VectorTripCount
= getOrCreateVectorTripCount(LoopVectorPreHeader
);
7715 // Skip induction resume value creation here because they will be created in
7716 // the second pass for the scalar loop. The induction resume values for the
7717 // inductions in the epilogue loop are created before executing the plan for
7718 // the epilogue loop.
7720 return {completeLoopSkeleton(), nullptr};
7723 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7725 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7726 << "Main Loop VF:" << EPI
.MainLoopVF
7727 << ", Main Loop UF:" << EPI
.MainLoopUF
7728 << ", Epilogue Loop VF:" << EPI
.EpilogueVF
7729 << ", Epilogue Loop UF:" << EPI
.EpilogueUF
<< "\n";
7733 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7734 DEBUG_WITH_TYPE(VerboseDebug
, {
7735 dbgs() << "intermediate fn:\n"
7736 << *OrigLoop
->getHeader()->getParent() << "\n";
7741 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock
*Bypass
,
7743 assert(Bypass
&& "Expected valid bypass basic block.");
7744 ElementCount VFactor
= ForEpilogue
? EPI
.EpilogueVF
: VF
;
7745 unsigned UFactor
= ForEpilogue
? EPI
.EpilogueUF
: UF
;
7746 Value
*Count
= getTripCount();
7747 // Reuse existing vector loop preheader for TC checks.
7748 // Note that new preheader block is generated for vector loop.
7749 BasicBlock
*const TCCheckBlock
= LoopVectorPreHeader
;
7750 IRBuilder
<> Builder(TCCheckBlock
->getTerminator());
7752 // Generate code to check if the loop's trip count is less than VF * UF of the
7753 // main vector loop.
7754 auto P
= Cost
->requiresScalarEpilogue(ForEpilogue
? EPI
.EpilogueVF
.isVector()
7756 ? ICmpInst::ICMP_ULE
7757 : ICmpInst::ICMP_ULT
;
7759 Value
*CheckMinIters
= Builder
.CreateICmp(
7760 P
, Count
, createStepForVF(Builder
, Count
->getType(), VFactor
, UFactor
),
7764 TCCheckBlock
->setName("vector.main.loop.iter.check");
7766 // Create new preheader for vector loop.
7767 LoopVectorPreHeader
= SplitBlock(TCCheckBlock
, TCCheckBlock
->getTerminator(),
7768 DT
, LI
, nullptr, "vector.ph");
7771 assert(DT
->properlyDominates(DT
->getNode(TCCheckBlock
),
7772 DT
->getNode(Bypass
)->getIDom()) &&
7773 "TC check is expected to dominate Bypass");
7775 // Update dominator for Bypass & LoopExit.
7776 DT
->changeImmediateDominator(Bypass
, TCCheckBlock
);
7777 if (!Cost
->requiresScalarEpilogue(EPI
.EpilogueVF
.isVector()))
7778 // For loops with multiple exits, there's no edge from the middle block
7779 // to exit blocks (as the epilogue must run) and thus no need to update
7780 // the immediate dominator of the exit blocks.
7781 DT
->changeImmediateDominator(LoopExitBlock
, TCCheckBlock
);
7783 LoopBypassBlocks
.push_back(TCCheckBlock
);
7785 // Save the trip count so we don't have to regenerate it in the
7786 // vec.epilog.iter.check. This is safe to do because the trip count
7787 // generated here dominates the vector epilog iter check.
7788 EPI
.TripCount
= Count
;
7792 *BranchInst::Create(Bypass
, LoopVectorPreHeader
, CheckMinIters
);
7793 if (hasBranchWeightMD(*OrigLoop
->getLoopLatch()->getTerminator()))
7794 setBranchWeights(BI
, MinItersBypassWeights
);
7795 ReplaceInstWithInst(TCCheckBlock
->getTerminator(), &BI
);
7797 return TCCheckBlock
;
7800 //===--------------------------------------------------------------------===//
7801 // EpilogueVectorizerEpilogueLoop
7802 //===--------------------------------------------------------------------===//
7804 /// This function is partially responsible for generating the control flow
7805 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7806 std::pair
<BasicBlock
*, Value
*>
7807 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7808 const SCEV2ValueTy
&ExpandedSCEVs
) {
7809 createVectorLoopSkeleton("vec.epilog.");
7811 // Now, compare the remaining count and if there aren't enough iterations to
7812 // execute the vectorized epilogue skip to the scalar part.
7813 BasicBlock
*VecEpilogueIterationCountCheck
= LoopVectorPreHeader
;
7814 VecEpilogueIterationCountCheck
->setName("vec.epilog.iter.check");
7815 LoopVectorPreHeader
=
7816 SplitBlock(LoopVectorPreHeader
, LoopVectorPreHeader
->getTerminator(), DT
,
7817 LI
, nullptr, "vec.epilog.ph");
7818 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader
,
7819 VecEpilogueIterationCountCheck
);
7821 // Adjust the control flow taking the state info from the main loop
7822 // vectorization into account.
7823 assert(EPI
.MainLoopIterationCountCheck
&& EPI
.EpilogueIterationCountCheck
&&
7824 "expected this to be saved from the previous pass.");
7825 EPI
.MainLoopIterationCountCheck
->getTerminator()->replaceUsesOfWith(
7826 VecEpilogueIterationCountCheck
, LoopVectorPreHeader
);
7828 DT
->changeImmediateDominator(LoopVectorPreHeader
,
7829 EPI
.MainLoopIterationCountCheck
);
7831 EPI
.EpilogueIterationCountCheck
->getTerminator()->replaceUsesOfWith(
7832 VecEpilogueIterationCountCheck
, LoopScalarPreHeader
);
7834 if (EPI
.SCEVSafetyCheck
)
7835 EPI
.SCEVSafetyCheck
->getTerminator()->replaceUsesOfWith(
7836 VecEpilogueIterationCountCheck
, LoopScalarPreHeader
);
7837 if (EPI
.MemSafetyCheck
)
7838 EPI
.MemSafetyCheck
->getTerminator()->replaceUsesOfWith(
7839 VecEpilogueIterationCountCheck
, LoopScalarPreHeader
);
7841 DT
->changeImmediateDominator(
7842 VecEpilogueIterationCountCheck
,
7843 VecEpilogueIterationCountCheck
->getSinglePredecessor());
7845 DT
->changeImmediateDominator(LoopScalarPreHeader
,
7846 EPI
.EpilogueIterationCountCheck
);
7847 if (!Cost
->requiresScalarEpilogue(EPI
.EpilogueVF
.isVector()))
7848 // If there is an epilogue which must run, there's no edge from the
7849 // middle block to exit blocks and thus no need to update the immediate
7850 // dominator of the exit blocks.
7851 DT
->changeImmediateDominator(LoopExitBlock
,
7852 EPI
.EpilogueIterationCountCheck
);
7854 // Keep track of bypass blocks, as they feed start values to the induction and
7855 // reduction phis in the scalar loop preheader.
7856 if (EPI
.SCEVSafetyCheck
)
7857 LoopBypassBlocks
.push_back(EPI
.SCEVSafetyCheck
);
7858 if (EPI
.MemSafetyCheck
)
7859 LoopBypassBlocks
.push_back(EPI
.MemSafetyCheck
);
7860 LoopBypassBlocks
.push_back(EPI
.EpilogueIterationCountCheck
);
7862 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7863 // reductions which merge control-flow from the latch block and the middle
7864 // block. Update the incoming values here and move the Phi into the preheader.
7865 SmallVector
<PHINode
*, 4> PhisInBlock
;
7866 for (PHINode
&Phi
: VecEpilogueIterationCountCheck
->phis())
7867 PhisInBlock
.push_back(&Phi
);
7869 for (PHINode
*Phi
: PhisInBlock
) {
7870 Phi
->moveBefore(LoopVectorPreHeader
->getFirstNonPHI());
7871 Phi
->replaceIncomingBlockWith(
7872 VecEpilogueIterationCountCheck
->getSinglePredecessor(),
7873 VecEpilogueIterationCountCheck
);
7875 // If the phi doesn't have an incoming value from the
7876 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7877 // value and also those from other check blocks. This is needed for
7878 // reduction phis only.
7879 if (none_of(Phi
->blocks(), [&](BasicBlock
*IncB
) {
7880 return EPI
.EpilogueIterationCountCheck
== IncB
;
7883 Phi
->removeIncomingValue(EPI
.EpilogueIterationCountCheck
);
7884 if (EPI
.SCEVSafetyCheck
)
7885 Phi
->removeIncomingValue(EPI
.SCEVSafetyCheck
);
7886 if (EPI
.MemSafetyCheck
)
7887 Phi
->removeIncomingValue(EPI
.MemSafetyCheck
);
7890 // Generate a resume induction for the vector epilogue and put it in the
7891 // vector epilogue preheader
7892 Type
*IdxTy
= Legal
->getWidestInductionType();
7893 PHINode
*EPResumeVal
= PHINode::Create(IdxTy
, 2, "vec.epilog.resume.val");
7894 EPResumeVal
->insertBefore(LoopVectorPreHeader
->getFirstNonPHIIt());
7895 EPResumeVal
->addIncoming(EPI
.VectorTripCount
, VecEpilogueIterationCountCheck
);
7896 EPResumeVal
->addIncoming(ConstantInt::get(IdxTy
, 0),
7897 EPI
.MainLoopIterationCountCheck
);
7899 // Generate induction resume values. These variables save the new starting
7900 // indexes for the scalar loop. They are used to test if there are any tail
7901 // iterations left once the vector loop has completed.
7902 // Note that when the vectorized epilogue is skipped due to iteration count
7903 // check, then the resume value for the induction variable comes from
7904 // the trip count of the main vector loop, hence passing the AdditionalBypass
7906 createInductionResumeValues(ExpandedSCEVs
,
7907 {VecEpilogueIterationCountCheck
,
7908 EPI
.VectorTripCount
} /* AdditionalBypass */);
7910 return {completeLoopSkeleton(), EPResumeVal
};
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
               ? ICmpInst::ICMP_ULE
               : ICmpInst::ICMP_ULT;

  Value *CheckMinIters =
      Builder.CreateICmp(P, Count,
                         createStepForVF(Builder, Count->getType(),
                                         EPI.EpilogueVF, EPI.EpilogueUF),
                         "min.epilog.iters.check");

  BranchInst &BI =
      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
    unsigned MainLoopStep = UF * VF.getKnownMinValue();
    unsigned EpilogueLoopStep =
        EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
    // We assume the remaining `Count` is equally distributed in
    // [0, MainLoopStep), so the probability for `Count < EpilogueLoopStep`
    // should be min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
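    // For instance (illustrative numbers): a main loop with VF=8, UF=4 gives
    // MainLoopStep = 32; an epilogue with VF=4, UF=1 gives EpilogueLoopStep
    // = 4, so the weights below become {4, 28}, i.e. an estimated 4/32 chance
    // of skipping the vector epilogue.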
    unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
    const uint32_t Weights[] = {EstimatedSkipCount,
                                MainLoopStep - EstimatedSkipCount};
    setBranchWeights(BI, Weights);
  }
  ReplaceInstWithInst(Insert->getTerminator(), &BI);

  LoopBypassBlocks.push_back(Insert);
  return Insert;
}
7960 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7962 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7963 << "Epilogue Loop VF:" << EPI
.EpilogueVF
7964 << ", Epilogue Loop UF:" << EPI
.EpilogueUF
<< "\n";
7968 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7969 DEBUG_WITH_TYPE(VerboseDebug
, {
7970 dbgs() << "final fn:\n" << *OrigLoop
->getHeader()->getParent() << "\n";
bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);
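  // For example (illustrative): if Range is [2, 16) and the predicate holds
  // for VF=2 and VF=4 but flips at VF=8, the loop below clamps Range.End to 8
  // so the decision taken for Range.Start applies to all VFs left in Range.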
  for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}
/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    VPlans.push_back(buildVPlan(SubRange));
    VF = SubRange.End;
  }
}
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlan &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = getBlockInMask(Src);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask. Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
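    // Schematically:
    //   and i1 %SrcMask, %EdgeMask              ; poison whenever EdgeMask is
    //                                           ; poison, even if SrcMask is 0
    //   select i1 %SrcMask, i1 %EdgeMask, i1 0  ; yields 0 when SrcMask is 0,
    //                                           ; regardless of EdgeMask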
    VPValue *False = Plan.getVPValueOrAddLiveIn(
        ConstantInt::getFalse(BI->getCondition()->getType()));
    EdgeMask =
        Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}
void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
  BasicBlock *Header = OrigLoop->getHeader();

  // When not folding the tail, use nullptr to model all-true mask.
  if (!CM.foldTailByMasking()) {
    BlockMaskCache[Header] = nullptr;
    return;
  }

  // Introduce the early-exit compare IV <= BTC to form header block mask.
  // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
  // constructing the desired canonical IV in the header block as its first
  // non-phi instructions.
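  // Schematically, for VF=4 the mask for iteration i is
  //   icmp ule <i, i+1, i+2, i+3>, splat(BTC)
  // so lanes past the original trip count are masked off.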
  VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
  auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
  auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
  HeaderVPBB->insert(IV, NewInsertionPoint);

  VPBuilder::InsertPointGuard Guard(Builder);
  Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
  VPValue *BlockMask = nullptr;
  VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
  BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
  BlockMaskCache[Header] = BlockMask;
}
VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
  // Return the cached value.
  BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
  assert(BCEntryIt != BlockMaskCache.end() &&
         "Trying to access mask for block without one.");
  return BCEntryIt->second;
}
void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
  assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
  assert(OrigLoop->getHeader() != BB &&
         "Loop header must have cached block mask");

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;
  // This is the block mask. We OR all incoming edges.
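  // For example (illustrative), a block with two predecessors P1 and P2 gets
  // mask = or(edge-mask(P1 -> BB), edge-mask(P2 -> BB)); if any edge mask is
  // nullptr (all-ones), the block mask is nullptr (all-ones) as well.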
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
      BlockMaskCache[BB] = EdgeMask;
      return;
    }

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }

  BlockMaskCache[BB] = BlockMask;
}
8111 VPRecipeBase
*VPRecipeBuilder::tryToWidenMemory(Instruction
*I
,
8112 ArrayRef
<VPValue
*> Operands
,
8115 assert((isa
<LoadInst
>(I
) || isa
<StoreInst
>(I
)) &&
8116 "Must be called with either a load or store");
8118 auto willWiden
= [&](ElementCount VF
) -> bool {
8119 LoopVectorizationCostModel::InstWidening Decision
=
8120 CM
.getWideningDecision(I
, VF
);
8121 assert(Decision
!= LoopVectorizationCostModel::CM_Unknown
&&
8122 "CM decision should be taken at this point.");
8123 if (Decision
== LoopVectorizationCostModel::CM_Interleave
)
8125 if (CM
.isScalarAfterVectorization(I
, VF
) ||
8126 CM
.isProfitableToScalarize(I
, VF
))
8128 return Decision
!= LoopVectorizationCostModel::CM_Scalarize
;
8131 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden
, Range
))
8134 VPValue
*Mask
= nullptr;
8135 if (Legal
->isMaskRequired(I
))
8136 Mask
= getBlockInMask(I
->getParent());
8138 // Determine if the pointer operand of the access is either consecutive or
8139 // reverse consecutive.
8140 LoopVectorizationCostModel::InstWidening Decision
=
8141 CM
.getWideningDecision(I
, Range
.Start
);
8142 bool Reverse
= Decision
== LoopVectorizationCostModel::CM_Widen_Reverse
;
8144 Reverse
|| Decision
== LoopVectorizationCostModel::CM_Widen
;
8146 VPValue
*Ptr
= isa
<LoadInst
>(I
) ? Operands
[0] : Operands
[1];
8148 auto *GEP
= dyn_cast
<GetElementPtrInst
>(
8149 Ptr
->getUnderlyingValue()->stripPointerCasts());
8150 auto *VectorPtr
= new VPVectorPointerRecipe(
8151 Ptr
, getLoadStoreType(I
), Reverse
, GEP
? GEP
->isInBounds() : false,
8153 Builder
.getInsertBlock()->appendRecipe(VectorPtr
);
8156 if (LoadInst
*Load
= dyn_cast
<LoadInst
>(I
))
8157 return new VPWidenMemoryInstructionRecipe(*Load
, Ptr
, Mask
, Consecutive
,
8160 StoreInst
*Store
= cast
<StoreInst
>(I
);
8161 return new VPWidenMemoryInstructionRecipe(*Store
, Ptr
, Operands
[0], Mask
,
8162 Consecutive
, Reverse
);
8165 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
8166 /// insert a recipe to expand the step for the induction recipe.
8167 static VPWidenIntOrFpInductionRecipe
*
8168 createWidenInductionRecipes(PHINode
*Phi
, Instruction
*PhiOrTrunc
,
8169 VPValue
*Start
, const InductionDescriptor
&IndDesc
,
8170 VPlan
&Plan
, ScalarEvolution
&SE
, Loop
&OrigLoop
,
8172 assert(IndDesc
.getStartValue() ==
8173 Phi
->getIncomingValueForBlock(OrigLoop
.getLoopPreheader()));
8174 assert(SE
.isLoopInvariant(IndDesc
.getStep(), &OrigLoop
) &&
8175 "step must be loop invariant");
8178 vputils::getOrCreateVPValueForSCEVExpr(Plan
, IndDesc
.getStep(), SE
);
8179 if (auto *TruncI
= dyn_cast
<TruncInst
>(PhiOrTrunc
)) {
8180 return new VPWidenIntOrFpInductionRecipe(Phi
, Start
, Step
, IndDesc
, TruncI
);
8182 assert(isa
<PHINode
>(PhiOrTrunc
) && "must be a phi node here");
8183 return new VPWidenIntOrFpInductionRecipe(Phi
, Start
, Step
, IndDesc
);
8186 VPRecipeBase
*VPRecipeBuilder::tryToOptimizeInductionPHI(
8187 PHINode
*Phi
, ArrayRef
<VPValue
*> Operands
, VPlan
&Plan
, VFRange
&Range
) {
8189 // Check if this is an integer or fp induction. If so, build the recipe that
8190 // produces its scalar and vector values.
8191 if (auto *II
= Legal
->getIntOrFpInductionDescriptor(Phi
))
8192 return createWidenInductionRecipes(Phi
, Phi
, Operands
[0], *II
, Plan
,
8193 *PSE
.getSE(), *OrigLoop
, Range
);
8195 // Check if this is pointer induction. If so, build the recipe for it.
8196 if (auto *II
= Legal
->getPointerInductionDescriptor(Phi
)) {
8197 VPValue
*Step
= vputils::getOrCreateVPValueForSCEVExpr(Plan
, II
->getStep(),
8199 return new VPWidenPointerInductionRecipe(
8200 Phi
, Operands
[0], Step
, *II
,
8201 LoopVectorizationPlanner::getDecisionAndClampRange(
8202 [&](ElementCount VF
) {
8203 return CM
.isScalarAfterVectorization(Phi
, VF
);
8210 VPWidenIntOrFpInductionRecipe
*VPRecipeBuilder::tryToOptimizeInductionTruncate(
8211 TruncInst
*I
, ArrayRef
<VPValue
*> Operands
, VFRange
&Range
, VPlan
&Plan
) {
8212 // Optimize the special case where the source is a constant integer
8213 // induction variable. Notice that we can only optimize the 'trunc' case
8214 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8215 // (c) other casts depend on pointer size.
8217 // Determine whether \p K is a truncation based on an induction variable that
8218 // can be optimized.
8219 auto isOptimizableIVTruncate
=
8220 [&](Instruction
*K
) -> std::function
<bool(ElementCount
)> {
8221 return [=](ElementCount VF
) -> bool {
8222 return CM
.isOptimizableIVTruncate(K
, VF
);
8226 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8227 isOptimizableIVTruncate(I
), Range
)) {
8229 auto *Phi
= cast
<PHINode
>(I
->getOperand(0));
8230 const InductionDescriptor
&II
= *Legal
->getIntOrFpInductionDescriptor(Phi
);
8231 VPValue
*Start
= Plan
.getVPValueOrAddLiveIn(II
.getStartValue());
8232 return createWidenInductionRecipes(Phi
, I
, Start
, II
, Plan
, *PSE
.getSE(),
8238 VPRecipeOrVPValueTy
VPRecipeBuilder::tryToBlend(PHINode
*Phi
,
8239 ArrayRef
<VPValue
*> Operands
,
8241 // If all incoming values are equal, the incoming VPValue can be used directly
8242 // instead of creating a new VPBlendRecipe.
8243 if (llvm::all_equal(Operands
))
8246 unsigned NumIncoming
= Phi
->getNumIncomingValues();
8247 // For in-loop reductions, we do not need to create an additional select.
8248 VPValue
*InLoopVal
= nullptr;
8249 for (unsigned In
= 0; In
< NumIncoming
; In
++) {
8251 dyn_cast_or_null
<PHINode
>(Operands
[In
]->getUnderlyingValue());
8252 if (PhiOp
&& CM
.isInLoopReduction(PhiOp
)) {
8253 assert(!InLoopVal
&& "Found more than one in-loop reduction!");
8254 InLoopVal
= Operands
[In
];
8258 assert((!InLoopVal
|| NumIncoming
== 2) &&
8259 "Found an in-loop reduction for PHI with unexpected number of "
8262 return Operands
[Operands
[0] == InLoopVal
? 1 : 0];
8264 // We know that all PHIs in non-header blocks are converted into selects, so
8265 // we don't have to worry about the insertion order and we can just use the
8266 // builder. At this point we generate the predication tree. There may be
8267 // duplications since this is a simple recursive scan, but future
8268 // optimizations will clean it up.
8269 SmallVector
<VPValue
*, 2> OperandsWithMask
;
8271 for (unsigned In
= 0; In
< NumIncoming
; In
++) {
8273 createEdgeMask(Phi
->getIncomingBlock(In
), Phi
->getParent(), *Plan
);
8274 assert((EdgeMask
|| NumIncoming
== 1) &&
8275 "Multiple predecessors with one having a full mask");
8276 OperandsWithMask
.push_back(Operands
[In
]);
8278 OperandsWithMask
.push_back(EdgeMask
);
8280 return toVPRecipeResult(new VPBlendRecipe(Phi
, OperandsWithMask
));
8283 VPWidenCallRecipe
*VPRecipeBuilder::tryToWidenCall(CallInst
*CI
,
8284 ArrayRef
<VPValue
*> Operands
,
8287 bool IsPredicated
= LoopVectorizationPlanner::getDecisionAndClampRange(
8288 [this, CI
](ElementCount VF
) {
8289 return CM
.isScalarWithPredication(CI
, VF
);
8296 Intrinsic::ID ID
= getVectorIntrinsicIDForCall(CI
, TLI
);
8297 if (ID
&& (ID
== Intrinsic::assume
|| ID
== Intrinsic::lifetime_end
||
8298 ID
== Intrinsic::lifetime_start
|| ID
== Intrinsic::sideeffect
||
8299 ID
== Intrinsic::pseudoprobe
||
8300 ID
== Intrinsic::experimental_noalias_scope_decl
))
8303 SmallVector
<VPValue
*, 4> Ops(Operands
.take_front(CI
->arg_size()));
8305 // Is it beneficial to perform intrinsic call compared to lib call?
8306 bool ShouldUseVectorIntrinsic
=
8307 ID
&& LoopVectorizationPlanner::getDecisionAndClampRange(
8308 [&](ElementCount VF
) -> bool {
8309 return CM
.getCallWideningDecision(CI
, VF
).Kind
==
8310 LoopVectorizationCostModel::CM_IntrinsicCall
;
8313 if (ShouldUseVectorIntrinsic
)
8314 return new VPWidenCallRecipe(*CI
, make_range(Ops
.begin(), Ops
.end()), ID
,
8317 Function
*Variant
= nullptr;
8318 std::optional
<unsigned> MaskPos
;
8319 // Is better to call a vectorized version of the function than to to scalarize
8321 auto ShouldUseVectorCall
= LoopVectorizationPlanner::getDecisionAndClampRange(
8322 [&](ElementCount VF
) -> bool {
8323 // The following case may be scalarized depending on the VF.
8324 // The flag shows whether we can use a usual Call for vectorized
8325 // version of the instruction.
8327 // If we've found a variant at a previous VF, then stop looking. A
8328 // vectorized variant of a function expects input in a certain shape
8329 // -- basically the number of input registers, the number of lanes
8330 // per register, and whether there's a mask required.
8331 // We store a pointer to the variant in the VPWidenCallRecipe, so
8332 // once we have an appropriate variant it's only valid for that VF.
8333 // This will force a different vplan to be generated for each VF that
8334 // finds a valid variant.
8337 LoopVectorizationCostModel::CallWideningDecision Decision
=
8338 CM
.getCallWideningDecision(CI
, VF
);
8339 if (Decision
.Kind
== LoopVectorizationCostModel::CM_VectorCall
) {
8340 Variant
= Decision
.Variant
;
8341 MaskPos
= Decision
.MaskPos
;
8348 if (ShouldUseVectorCall
) {
8349 if (MaskPos
.has_value()) {
8350 // We have 2 cases that would require a mask:
8351 // 1) The block needs to be predicated, either due to a conditional
8352 // in the scalar loop or use of an active lane mask with
8353 // tail-folding, and we use the appropriate mask for the block.
8354 // 2) No mask is required for the block, but the only available
8355 // vector variant at this VF requires a mask, so we synthesize an
8357 VPValue
*Mask
= nullptr;
8358 if (Legal
->isMaskRequired(CI
))
8359 Mask
= getBlockInMask(CI
->getParent());
8361 Mask
= Plan
->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8362 IntegerType::getInt1Ty(Variant
->getFunctionType()->getContext())));
8364 Ops
.insert(Ops
.begin() + *MaskPos
, Mask
);
8367 return new VPWidenCallRecipe(*CI
, make_range(Ops
.begin(), Ops
.end()),
8368 Intrinsic::not_intrinsic
, CI
->getDebugLoc(),
8375 bool VPRecipeBuilder::shouldWiden(Instruction
*I
, VFRange
&Range
) const {
8376 assert(!isa
<BranchInst
>(I
) && !isa
<PHINode
>(I
) && !isa
<LoadInst
>(I
) &&
8377 !isa
<StoreInst
>(I
) && "Instruction should have been handled earlier");
8378 // Instruction should be widened, unless it is scalar after vectorization,
8379 // scalarization is profitable or it is predicated.
8380 auto WillScalarize
= [this, I
](ElementCount VF
) -> bool {
8381 return CM
.isScalarAfterVectorization(I
, VF
) ||
8382 CM
.isProfitableToScalarize(I
, VF
) ||
8383 CM
.isScalarWithPredication(I
, VF
);
8385 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize
,
8389 VPRecipeBase
*VPRecipeBuilder::tryToWiden(Instruction
*I
,
8390 ArrayRef
<VPValue
*> Operands
,
8391 VPBasicBlock
*VPBB
, VPlanPtr
&Plan
) {
8392 switch (I
->getOpcode()) {
8395 case Instruction::SDiv
:
8396 case Instruction::UDiv
:
8397 case Instruction::SRem
:
8398 case Instruction::URem
: {
8399 // If not provably safe, use a select to form a safe divisor before widening the
8400 // div/rem operation itself. Otherwise fall through to general handling below.
8401 if (CM
.isPredicatedInst(I
)) {
8402 SmallVector
<VPValue
*> Ops(Operands
.begin(), Operands
.end());
8403 VPValue
*Mask
= getBlockInMask(I
->getParent());
8404 VPValue
*One
= Plan
->getVPValueOrAddLiveIn(
8405 ConstantInt::get(I
->getType(), 1u, false));
8407 new VPInstruction(Instruction::Select
, {Mask
, Ops
[1], One
},
8409 VPBB
->appendRecipe(SafeRHS
);
8411 return new VPWidenRecipe(*I
, make_range(Ops
.begin(), Ops
.end()));
8415 case Instruction::Add
:
8416 case Instruction::And
:
8417 case Instruction::AShr
:
8418 case Instruction::FAdd
:
8419 case Instruction::FCmp
:
8420 case Instruction::FDiv
:
8421 case Instruction::FMul
:
8422 case Instruction::FNeg
:
8423 case Instruction::FRem
:
8424 case Instruction::FSub
:
8425 case Instruction::ICmp
:
8426 case Instruction::LShr
:
8427 case Instruction::Mul
:
8428 case Instruction::Or
:
8429 case Instruction::Select
:
8430 case Instruction::Shl
:
8431 case Instruction::Sub
:
8432 case Instruction::Xor
:
8433 case Instruction::Freeze
:
8434 return new VPWidenRecipe(*I
, make_range(Operands
.begin(), Operands
.end()));
8438 void VPRecipeBuilder::fixHeaderPhis() {
8439 BasicBlock
*OrigLatch
= OrigLoop
->getLoopLatch();
8440 for (VPHeaderPHIRecipe
*R
: PhisToFix
) {
8441 auto *PN
= cast
<PHINode
>(R
->getUnderlyingValue());
8442 VPRecipeBase
*IncR
=
8443 getRecipe(cast
<Instruction
>(PN
->getIncomingValueForBlock(OrigLatch
)));
8444 R
->addOperand(IncR
->getVPSingleValue());
8448 VPRecipeOrVPValueTy
VPRecipeBuilder::handleReplication(Instruction
*I
,
8451 bool IsUniform
= LoopVectorizationPlanner::getDecisionAndClampRange(
8452 [&](ElementCount VF
) { return CM
.isUniformAfterVectorization(I
, VF
); },
8455 bool IsPredicated
= CM
.isPredicatedInst(I
);
8457 // Even if the instruction is not marked as uniform, there are certain
8458 // intrinsic calls that can be effectively treated as such, so we check for
8459 // them here. Conservatively, we only do this for scalable vectors, since
8460 // for fixed-width VFs we can always fall back on full scalarization.
8461 if (!IsUniform
&& Range
.Start
.isScalable() && isa
<IntrinsicInst
>(I
)) {
8462 switch (cast
<IntrinsicInst
>(I
)->getIntrinsicID()) {
8463 case Intrinsic::assume
:
8464 case Intrinsic::lifetime_start
:
8465 case Intrinsic::lifetime_end
:
8466 // For scalable vectors if one of the operands is variant then we still
8467 // want to mark as uniform, which will generate one instruction for just
8468 // the first lane of the vector. We can't scalarize the call in the same
8469 // way as for fixed-width vectors because we don't know how many lanes
8472 // The reasons for doing it this way for scalable vectors are:
8473 // 1. For the assume intrinsic generating the instruction for the first
8474 // lane is still be better than not generating any at all. For
8475 // example, the input may be a splat across all lanes.
8476 // 2. For the lifetime start/end intrinsics the pointer operand only
8477 // does anything useful when the input comes from a stack object,
8478 // which suggests it should always be uniform. For non-stack objects
8479 // the effect is to poison the object, which still allows us to
8487 VPValue
*BlockInMask
= nullptr;
8488 if (!IsPredicated
) {
8489 // Finalize the recipe for Instr, first if it is not predicated.
8490 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I
<< "\n");
8492 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I
<< "\n");
8493 // Instructions marked for predication are replicated and a mask operand is
8494 // added initially. Masked replicate recipes will later be placed under an
8495 // if-then construct to prevent side-effects. Generate recipes to compute
8496 // the block mask for this region.
8497 BlockInMask
= getBlockInMask(I
->getParent());
8500 auto *Recipe
= new VPReplicateRecipe(I
, Plan
.mapToVPValues(I
->operands()),
8501 IsUniform
, BlockInMask
);
8502 return toVPRecipeResult(Recipe
);
8506 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction
*Instr
,
8507 ArrayRef
<VPValue
*> Operands
,
8508 VFRange
&Range
, VPBasicBlock
*VPBB
,
8510 // First, check for specific widening recipes that deal with inductions, Phi
8511 // nodes, calls and memory operations.
8512 VPRecipeBase
*Recipe
;
8513 if (auto Phi
= dyn_cast
<PHINode
>(Instr
)) {
8514 if (Phi
->getParent() != OrigLoop
->getHeader())
8515 return tryToBlend(Phi
, Operands
, Plan
);
8517 // Always record recipes for header phis. Later first-order recurrence phis
8518 // can have earlier phis as incoming values.
8519 recordRecipeOf(Phi
);
8521 if ((Recipe
= tryToOptimizeInductionPHI(Phi
, Operands
, *Plan
, Range
)))
8522 return toVPRecipeResult(Recipe
);
8524 VPHeaderPHIRecipe
*PhiRecipe
= nullptr;
8525 assert((Legal
->isReductionVariable(Phi
) ||
8526 Legal
->isFixedOrderRecurrence(Phi
)) &&
8527 "can only widen reductions and fixed-order recurrences here");
8528 VPValue
*StartV
= Operands
[0];
8529 if (Legal
->isReductionVariable(Phi
)) {
8530 const RecurrenceDescriptor
&RdxDesc
=
8531 Legal
->getReductionVars().find(Phi
)->second
;
8532 assert(RdxDesc
.getRecurrenceStartValue() ==
8533 Phi
->getIncomingValueForBlock(OrigLoop
->getLoopPreheader()));
8534 PhiRecipe
= new VPReductionPHIRecipe(Phi
, RdxDesc
, *StartV
,
8535 CM
.isInLoopReduction(Phi
),
8536 CM
.useOrderedReductions(RdxDesc
));
8538 // TODO: Currently fixed-order recurrences are modeled as chains of
8539 // first-order recurrences. If there are no users of the intermediate
8540 // recurrences in the chain, the fixed order recurrence should be modeled
8541 // directly, enabling more efficient codegen.
8542 PhiRecipe
= new VPFirstOrderRecurrencePHIRecipe(Phi
, *StartV
);
8545 // Record the incoming value from the backedge, so we can add the incoming
8546 // value from the backedge after all recipes have been created.
8547 auto *Inc
= cast
<Instruction
>(
8548 Phi
->getIncomingValueForBlock(OrigLoop
->getLoopLatch()));
8549 auto RecipeIter
= Ingredient2Recipe
.find(Inc
);
8550 if (RecipeIter
== Ingredient2Recipe
.end())
8551 recordRecipeOf(Inc
);
8553 PhisToFix
.push_back(PhiRecipe
);
8554 return toVPRecipeResult(PhiRecipe
);
8557 if (isa
<TruncInst
>(Instr
) &&
8558 (Recipe
= tryToOptimizeInductionTruncate(cast
<TruncInst
>(Instr
), Operands
,
8560 return toVPRecipeResult(Recipe
);
8562 // All widen recipes below deal only with VF > 1.
8563 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8564 [&](ElementCount VF
) { return VF
.isScalar(); }, Range
))
8567 if (auto *CI
= dyn_cast
<CallInst
>(Instr
))
8568 return toVPRecipeResult(tryToWidenCall(CI
, Operands
, Range
, Plan
));
8570 if (isa
<LoadInst
>(Instr
) || isa
<StoreInst
>(Instr
))
8571 return toVPRecipeResult(tryToWidenMemory(Instr
, Operands
, Range
, Plan
));
8573 if (!shouldWiden(Instr
, Range
))
8576 if (auto GEP
= dyn_cast
<GetElementPtrInst
>(Instr
))
8577 return toVPRecipeResult(new VPWidenGEPRecipe(
8578 GEP
, make_range(Operands
.begin(), Operands
.end())));
8580 if (auto *SI
= dyn_cast
<SelectInst
>(Instr
)) {
8581 return toVPRecipeResult(new VPWidenSelectRecipe(
8582 *SI
, make_range(Operands
.begin(), Operands
.end())));
8585 if (auto *CI
= dyn_cast
<CastInst
>(Instr
)) {
8586 return toVPRecipeResult(new VPWidenCastRecipe(CI
->getOpcode(), Operands
[0],
8587 CI
->getType(), *CI
));
8590 return toVPRecipeResult(tryToWiden(Instr
, Operands
, VPBB
, Plan
));
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
      // Now optimize the initial VPlan.
      if (!Plan->hasVF(ElementCount::getFixed(1)))
        VPlanTransforms::truncateToMinimalBitwidths(
            *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
      VPlanTransforms::optimize(*Plan, *PSE.getSE());
      assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(std::move(Plan));
    }
    VF = SubRange.End;
  }
}
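// Illustrative note (values hypothetical): with MinVF = 4 and MaxVF = 16 the
// loop above starts from the candidate range [4, 32). Each call to
// tryToBuildVPlanWithVPRecipes may clamp SubRange.End to the largest VF for
// which all widening decisions agree, so the result can be a single plan
// covering {4, 8, 16} or several plans, e.g. one for {4, 8} and another for
// {16}; VF then advances to SubRange.End and the next sub-range is tried.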
// Add the necessary canonical IV and branch recipes required to control the
// loop.
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
                                  DebugLoc DL) {
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);

  // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
  auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
  VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
  Header->insert(CanonicalIVPHI, Header->begin());

  // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
  // IV by VF * UF.
  auto *CanonicalIVIncrement =
      new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
                        {HasNUW, false}, DL, "index.next");
  CanonicalIVPHI->addOperand(CanonicalIVIncrement);

  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
  EB->appendRecipe(CanonicalIVIncrement);

  // Add the BranchOnCount VPInstruction to the latch.
  VPInstruction *BranchBack =
      new VPInstruction(VPInstruction::BranchOnCount,
                        {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
  EB->appendRecipe(BranchBack);
}
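// Illustrative sketch of the resulting skeleton, in VPlan debug-dump style
// (value names here are made up):
//
//   vector.body:
//     EMIT vp<%iv> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
//     ...
//   vector.latch:
//     EMIT vp<%index.next> = add nuw vp<%iv>, vp<%VFxUF>
//     EMIT branch-on-count vp<%index.next>, vp<%vector.trip.count>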
// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
// original exit block.
static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
                                VPlan &Plan) {
  BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
  BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
  // Only handle single-exit loops with unique exit blocks for now.
  if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
    return;

  // Introduce VPUsers modeling the exit values.
  for (PHINode &ExitPhi : ExitBB->phis()) {
    Value *IncomingValue =
        ExitPhi.getIncomingValueForBlock(ExitingBB);
    VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
    Plan.addLiveOut(&ExitPhi, V);
  }
}
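// Illustrative example (IR names hypothetical): for an exit block containing
// the LCSSA phi
//
//   exit:
//     %sum.lcssa = phi i32 [ %sum.next, %loop.latch ]
//
// the incoming value %sum.next is looked up in the plan (or added as a
// live-in) and recorded with addLiveOut, so the value fed into %sum.lcssa can
// later be fixed up to come from the vectorized loop's middle block.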
8663 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange
&Range
) {
8665 SmallPtrSet
<const InterleaveGroup
<Instruction
> *, 1> InterleaveGroups
;
8667 VPRecipeBuilder
RecipeBuilder(OrigLoop
, TLI
, Legal
, CM
, PSE
, Builder
);
8669 // ---------------------------------------------------------------------------
8670 // Pre-construction: record ingredients whose recipes we'll need to further
8671 // process after constructing the initial VPlan.
8672 // ---------------------------------------------------------------------------
8674 // For each interleave group which is relevant for this (possibly trimmed)
8675 // Range, add it to the set of groups to be later applied to the VPlan and add
8676 // placeholders for its members' Recipes which we'll be replacing with a
8677 // single VPInterleaveRecipe.
8678 for (InterleaveGroup
<Instruction
> *IG
: IAI
.getInterleaveGroups()) {
8679 auto applyIG
= [IG
, this](ElementCount VF
) -> bool {
8680 bool Result
= (VF
.isVector() && // Query is illegal for VF == 1
8681 CM
.getWideningDecision(IG
->getInsertPos(), VF
) ==
8682 LoopVectorizationCostModel::CM_Interleave
);
8683 // For scalable vectors, the only interleave factor currently supported
8684 // is 2 since we require the (de)interleave2 intrinsics instead of
8686 assert((!Result
|| !VF
.isScalable() || IG
->getFactor() == 2) &&
8687 "Unsupported interleave factor for scalable vectors");
8690 if (!getDecisionAndClampRange(applyIG
, Range
))
8692 InterleaveGroups
.insert(IG
);
8693 for (unsigned i
= 0; i
< IG
->getFactor(); i
++)
8694 if (Instruction
*Member
= IG
->getMember(i
))
8695 RecipeBuilder
.recordRecipeOf(Member
);
8698 // ---------------------------------------------------------------------------
8699 // Build initial VPlan: Scan the body of the loop in a topological order to
8700 // visit each basic block after having visited its predecessor basic blocks.
8701 // ---------------------------------------------------------------------------
8703 // Create initial VPlan skeleton, having a basic block for the pre-header
8704 // which contains SCEV expansions that need to happen before the CFG is
8705 // modified; a basic block for the vector pre-header, followed by a region for
8706 // the vector loop, followed by the middle basic block. The skeleton vector
8707 // loop region contains a header and latch basic blocks.
8708 VPlanPtr Plan
= VPlan::createInitialVPlan(
8709 createTripCountSCEV(Legal
->getWidestInductionType(), PSE
, OrigLoop
),
8711 VPBasicBlock
*HeaderVPBB
= new VPBasicBlock("vector.body");
8712 VPBasicBlock
*LatchVPBB
= new VPBasicBlock("vector.latch");
8713 VPBlockUtils::insertBlockAfter(LatchVPBB
, HeaderVPBB
);
8714 Plan
->getVectorLoopRegion()->setEntry(HeaderVPBB
);
8715 Plan
->getVectorLoopRegion()->setExiting(LatchVPBB
);
8717 // Don't use getDecisionAndClampRange here, because we don't know the UF
8718 // so this function is better to be conservative, rather than to split
8719 // it up into different VPlans.
8720 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8721 bool IVUpdateMayOverflow
= false;
8722 for (ElementCount VF
: Range
)
8723 IVUpdateMayOverflow
|= !isIndvarOverflowCheckKnownFalse(&CM
, VF
);
8725 DebugLoc DL
= getDebugLocFromInstOrOperands(Legal
->getPrimaryInduction());
8726 TailFoldingStyle Style
= CM
.getTailFoldingStyle(IVUpdateMayOverflow
);
8727 // When not folding the tail, we know that the induction increment will not
8729 bool HasNUW
= Style
== TailFoldingStyle::None
;
8730 addCanonicalIVRecipes(*Plan
, Legal
->getWidestInductionType(), HasNUW
, DL
);
8732 // Scan the body of the loop in a topological order to visit each basic block
8733 // after having visited its predecessor basic blocks.
8734 LoopBlocksDFS
DFS(OrigLoop
);
8737 VPBasicBlock
*VPBB
= HeaderVPBB
;
8738 bool NeedsMasks
= CM
.foldTailByMasking() ||
8739 any_of(OrigLoop
->blocks(), [this](BasicBlock
*BB
) {
8740 return Legal
->blockNeedsPredication(BB
);
8742 for (BasicBlock
*BB
: make_range(DFS
.beginRPO(), DFS
.endRPO())) {
8743 // Relevant instructions from basic block BB will be grouped into VPRecipe
8744 // ingredients and fill a new VPBasicBlock.
8745 if (VPBB
!= HeaderVPBB
)
8746 VPBB
->setName(BB
->getName());
8747 Builder
.setInsertPoint(VPBB
);
8749 if (VPBB
== HeaderVPBB
)
8750 RecipeBuilder
.createHeaderMask(*Plan
);
8751 else if (NeedsMasks
)
8752 RecipeBuilder
.createBlockInMask(BB
, *Plan
);
8754 // Introduce each ingredient into VPlan.
8755 // TODO: Model and preserve debug intrinsics in VPlan.
8756 for (Instruction
&I
: drop_end(BB
->instructionsWithoutDebug(false))) {
8757 Instruction
*Instr
= &I
;
8758 SmallVector
<VPValue
*, 4> Operands
;
8759 auto *Phi
= dyn_cast
<PHINode
>(Instr
);
8760 if (Phi
&& Phi
->getParent() == OrigLoop
->getHeader()) {
8761 Operands
.push_back(Plan
->getVPValueOrAddLiveIn(
8762 Phi
->getIncomingValueForBlock(OrigLoop
->getLoopPreheader())));
8764 auto OpRange
= Plan
->mapToVPValues(Instr
->operands());
8765 Operands
= {OpRange
.begin(), OpRange
.end()};
8768 // Invariant stores inside loop will be deleted and a single store
8769 // with the final reduction value will be added to the exit block
8771 if ((SI
= dyn_cast
<StoreInst
>(&I
)) &&
8772 Legal
->isInvariantAddressOfReduction(SI
->getPointerOperand()))
8775 auto RecipeOrValue
= RecipeBuilder
.tryToCreateWidenRecipe(
8776 Instr
, Operands
, Range
, VPBB
, Plan
);
8778 RecipeOrValue
= RecipeBuilder
.handleReplication(Instr
, Range
, *Plan
);
8779 // If Instr can be simplified to an existing VPValue, use it.
8780 if (isa
<VPValue
*>(RecipeOrValue
)) {
8781 auto *VPV
= cast
<VPValue
*>(RecipeOrValue
);
8782 Plan
->addVPValue(Instr
, VPV
);
8783 // If the re-used value is a recipe, register the recipe for the
8784 // instruction, in case the recipe for Instr needs to be recorded.
8785 if (VPRecipeBase
*R
= VPV
->getDefiningRecipe())
8786 RecipeBuilder
.setRecipe(Instr
, R
);
8789 // Otherwise, add the new recipe.
8790 VPRecipeBase
*Recipe
= cast
<VPRecipeBase
*>(RecipeOrValue
);
8791 for (auto *Def
: Recipe
->definedValues()) {
8792 auto *UV
= Def
->getUnderlyingValue();
8793 Plan
->addVPValue(UV
, Def
);
8796 RecipeBuilder
.setRecipe(Instr
, Recipe
);
8797 if (isa
<VPHeaderPHIRecipe
>(Recipe
)) {
8798 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8799 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8800 // recipes and need to be moved to the phi section of HeaderVPBB:
8801 // * tail-folding (non-phi recipes computing the header mask are
8802 // introduced earlier than regular header phi recipes, and should appear
8804 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8806 assert((HeaderVPBB
->getFirstNonPhi() == VPBB
->end() ||
8807 CM
.foldTailByMasking() || isa
<TruncInst
>(Instr
)) &&
8808 "unexpected recipe needs moving");
8809 Recipe
->insertBefore(*HeaderVPBB
, HeaderVPBB
->getFirstNonPhi());
8811 VPBB
->appendRecipe(Recipe
);
8814 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB
);
8815 VPBB
= cast
<VPBasicBlock
>(VPBB
->getSingleSuccessor());
8818 // After here, VPBB should not be used.
8821 if (CM
.requiresScalarEpilogue(Range
)) {
8822 // No edge from the middle block to the unique exit block has been inserted
8823 // and there is nothing to fix from vector loop; phis should have incoming
8824 // from scalar loop only.
8826 addUsersInExitBlock(HeaderVPBB
, OrigLoop
, *Plan
);
8828 assert(isa
<VPRegionBlock
>(Plan
->getVectorLoopRegion()) &&
8829 !Plan
->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8830 "entry block must be set to a VPRegionBlock having a non-empty entry "
8832 RecipeBuilder
.fixHeaderPhis();
8834 // ---------------------------------------------------------------------------
8835 // Transform initial VPlan: Apply previously taken decisions, in order, to
8836 // bring the VPlan to its final state.
8837 // ---------------------------------------------------------------------------
8839 // Adjust the recipes for any inloop reductions.
8840 adjustRecipesForReductions(LatchVPBB
, Plan
, RecipeBuilder
, Range
.Start
);
8842 // Interleave memory: for each Interleave Group we marked earlier as relevant
8843 // for this VPlan, replace the Recipes widening its memory instructions with a
8844 // single VPInterleaveRecipe at its insertion point.
8845 for (const auto *IG
: InterleaveGroups
) {
8846 auto *Recipe
= cast
<VPWidenMemoryInstructionRecipe
>(
8847 RecipeBuilder
.getRecipe(IG
->getInsertPos()));
8848 SmallVector
<VPValue
*, 4> StoredValues
;
8849 for (unsigned i
= 0; i
< IG
->getFactor(); ++i
)
8850 if (auto *SI
= dyn_cast_or_null
<StoreInst
>(IG
->getMember(i
))) {
8852 cast
<VPWidenMemoryInstructionRecipe
>(RecipeBuilder
.getRecipe(SI
));
8853 StoredValues
.push_back(StoreR
->getStoredValue());
8856 bool NeedsMaskForGaps
=
8857 IG
->requiresScalarEpilogue() && !CM
.isScalarEpilogueAllowed();
8858 auto *VPIG
= new VPInterleaveRecipe(IG
, Recipe
->getAddr(), StoredValues
,
8859 Recipe
->getMask(), NeedsMaskForGaps
);
8860 VPIG
->insertBefore(Recipe
);
8862 for (unsigned i
= 0; i
< IG
->getFactor(); ++i
)
8863 if (Instruction
*Member
= IG
->getMember(i
)) {
8864 VPRecipeBase
*MemberR
= RecipeBuilder
.getRecipe(Member
);
8865 if (!Member
->getType()->isVoidTy()) {
8866 VPValue
*OriginalV
= MemberR
->getVPSingleValue();
8867 OriginalV
->replaceAllUsesWith(VPIG
->getVPValue(J
));
8870 MemberR
->eraseFromParent();
8874 for (ElementCount VF
: Range
)
8876 Plan
->setName("Initial VPlan");
8878 // Replace VPValues for known constant strides guaranteed by predicate scalar
8880 for (auto [_
, Stride
] : Legal
->getLAI()->getSymbolicStrides()) {
8881 auto *StrideV
= cast
<SCEVUnknown
>(Stride
)->getValue();
8882 auto *ScevStride
= dyn_cast
<SCEVConstant
>(PSE
.getSCEV(StrideV
));
8883 // Only handle constant strides for now.
8886 Constant
*CI
= ConstantInt::get(Stride
->getType(), ScevStride
->getAPInt());
8888 auto *ConstVPV
= Plan
->getVPValueOrAddLiveIn(CI
);
8889 // The versioned value may not be used in the loop directly, so just add a
8890 // new live-in in those cases.
8891 Plan
->getVPValueOrAddLiveIn(StrideV
)->replaceAllUsesWith(ConstVPV
);
8894 // From this point onwards, VPlan-to-VPlan transformations may change the plan
8895 // in ways that accessing values using original IR values is incorrect.
8896 Plan
->disableValue2VPValue();
8898 // Sink users of fixed-order recurrence past the recipe defining the previous
8899 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8900 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan
, Builder
))
8903 if (useActiveLaneMask(Style
)) {
8904 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8905 // TailFoldingStyle is visible there.
8906 bool ForControlFlow
= useActiveLaneMaskForControlFlow(Style
);
8907 bool WithoutRuntimeCheck
=
8908 Style
== TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck
;
8909 VPlanTransforms::addActiveLaneMask(*Plan
, ForControlFlow
,
8910 WithoutRuntimeCheck
);
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = VPlan::createInitialVPlan(
      createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
      *PSE.getSE());

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF : Range)
    Plan->addVF(VF);

  VPlanTransforms::VPInstructionsToVPRecipes(
      Plan,
      [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
      *PSE.getSE(), *TLI);

  // Remove the existing terminator of the exiting block of the top-most region.
  // A BranchOnCount will be added instead when adding the canonical IV recipes.
  auto *Term =
      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  Term->eraseFromParent();

  // Tail folding is not supported for outer loops, so the induction increment
  // is guaranteed to not wrap.
  bool HasNUW = true;
  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
                        DebugLoc());
  return Plan;
}
8954 // Adjust the recipes for reductions. For in-loop reductions the chain of
8955 // instructions leading from the loop exit instr to the phi need to be converted
8956 // to reductions, with one operand being vector and the other being the scalar
8957 // reduction chain. For other reductions, a select is introduced between the phi
8958 // and live-out recipes when folding the tail.
8960 // A ComputeReductionResult recipe is added to the middle block, also for
8961 // in-loop reductions which compute their result in-loop, because generating
8962 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8963 void LoopVectorizationPlanner::adjustRecipesForReductions(
8964 VPBasicBlock
*LatchVPBB
, VPlanPtr
&Plan
, VPRecipeBuilder
&RecipeBuilder
,
8965 ElementCount MinVF
) {
8966 VPRegionBlock
*VectorLoopRegion
= Plan
->getVectorLoopRegion();
8967 VPBasicBlock
*Header
= VectorLoopRegion
->getEntryBasicBlock();
8968 // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
8969 // sank outside of the loop would keep the same order as they had in the
8971 SmallVector
<VPReductionPHIRecipe
*> ReductionPHIList
;
8972 for (VPRecipeBase
&R
: Header
->phis()) {
8973 if (auto *ReductionPhi
= dyn_cast
<VPReductionPHIRecipe
>(&R
))
8974 ReductionPHIList
.emplace_back(ReductionPhi
);
8976 bool HasIntermediateStore
= false;
8977 stable_sort(ReductionPHIList
,
8978 [this, &HasIntermediateStore
](const VPReductionPHIRecipe
*R1
,
8979 const VPReductionPHIRecipe
*R2
) {
8980 auto *IS1
= R1
->getRecurrenceDescriptor().IntermediateStore
;
8981 auto *IS2
= R2
->getRecurrenceDescriptor().IntermediateStore
;
8982 HasIntermediateStore
|= IS1
|| IS2
;
8984 // If neither of the recipes has an intermediate store, keep the
8989 // If only one of the recipes has an intermediate store, then
8990 // move it towards the beginning of the list.
8997 // If both recipes have an intermediate store, then the recipe
8998 // with the later store should be processed earlier. So it
8999 // should go to the beginning of the list.
9000 return DT
->dominates(IS2
, IS1
);
9003 if (HasIntermediateStore
&& ReductionPHIList
.size() > 1)
9004 for (VPRecipeBase
*R
: ReductionPHIList
)
9005 R
->moveBefore(*Header
, Header
->getFirstNonPhi());
9007 for (VPRecipeBase
&R
: Header
->phis()) {
9008 auto *PhiR
= dyn_cast
<VPReductionPHIRecipe
>(&R
);
9009 if (!PhiR
|| !PhiR
->isInLoop() || (MinVF
.isScalar() && !PhiR
->isOrdered()))
9012 const RecurrenceDescriptor
&RdxDesc
= PhiR
->getRecurrenceDescriptor();
9013 RecurKind Kind
= RdxDesc
.getRecurrenceKind();
9014 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind
) &&
9015 "AnyOf reductions are not allowed for in-loop reductions");
9017 // Collect the chain of "link" recipes for the reduction starting at PhiR.
9018 SetVector
<VPSingleDefRecipe
*> Worklist
;
9019 Worklist
.insert(PhiR
);
9020 for (unsigned I
= 0; I
!= Worklist
.size(); ++I
) {
9021 VPSingleDefRecipe
*Cur
= Worklist
[I
];
9022 for (VPUser
*U
: Cur
->users()) {
9023 auto *UserRecipe
= dyn_cast
<VPSingleDefRecipe
>(U
);
9025 assert(isa
<VPLiveOut
>(U
) &&
9026 "U must either be a VPSingleDef or VPLiveOut");
9029 Worklist
.insert(UserRecipe
);
9033 // Visit operation "Links" along the reduction chain top-down starting from
9034 // the phi until LoopExitValue. We keep track of the previous item
9035 // (PreviousLink) to tell which of the two operands of a Link will remain
9036 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9037 // the select instructions.
9038 VPSingleDefRecipe
*PreviousLink
= PhiR
; // Aka Worklist[0].
9039 for (VPSingleDefRecipe
*CurrentLink
: Worklist
.getArrayRef().drop_front()) {
9040 Instruction
*CurrentLinkI
= CurrentLink
->getUnderlyingInstr();
9042 // Index of the first operand which holds a non-mask vector operand.
9043 unsigned IndexOfFirstOperand
;
9044 // Recognize a call to the llvm.fmuladd intrinsic.
9045 bool IsFMulAdd
= (Kind
== RecurKind::FMulAdd
);
9047 VPBasicBlock
*LinkVPBB
= CurrentLink
->getParent();
9050 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI
) &&
9051 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9052 assert(((MinVF
.isScalar() && isa
<VPReplicateRecipe
>(CurrentLink
)) ||
9053 isa
<VPWidenCallRecipe
>(CurrentLink
)) &&
9054 CurrentLink
->getOperand(2) == PreviousLink
&&
9055 "expected a call where the previous link is the added operand");
9057 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9058 // need to create an fmul recipe (multiplying the first two operands of
9059 // the fmuladd together) to use as the vector operand for the fadd
9061 VPInstruction
*FMulRecipe
= new VPInstruction(
9063 {CurrentLink
->getOperand(0), CurrentLink
->getOperand(1)},
9064 CurrentLinkI
->getFastMathFlags());
9065 LinkVPBB
->insert(FMulRecipe
, CurrentLink
->getIterator());
9068 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind
)) {
9069 if (isa
<VPWidenRecipe
>(CurrentLink
)) {
9070 assert(isa
<CmpInst
>(CurrentLinkI
) &&
9071 "need to have the compare of the select");
9074 assert(isa
<VPWidenSelectRecipe
>(CurrentLink
) &&
9075 "must be a select recipe");
9076 IndexOfFirstOperand
= 1;
9078 assert((MinVF
.isScalar() || isa
<VPWidenRecipe
>(CurrentLink
)) &&
9079 "Expected to replace a VPWidenSC");
9080 IndexOfFirstOperand
= 0;
9082 // Note that for non-commutable operands (cmp-selects), the semantics of
9083 // the cmp-select are captured in the recurrence kind.
9085 CurrentLink
->getOperand(IndexOfFirstOperand
) == PreviousLink
9086 ? IndexOfFirstOperand
+ 1
9087 : IndexOfFirstOperand
;
9088 VecOp
= CurrentLink
->getOperand(VecOpId
);
9089 assert(VecOp
!= PreviousLink
&&
9090 CurrentLink
->getOperand(CurrentLink
->getNumOperands() - 1 -
9091 (VecOpId
- IndexOfFirstOperand
)) ==
9093 "PreviousLink must be the operand other than VecOp");
9096 BasicBlock
*BB
= CurrentLinkI
->getParent();
9097 VPValue
*CondOp
= nullptr;
9098 if (CM
.blockNeedsPredicationForAnyReason(BB
)) {
9099 VPBuilder::InsertPointGuard
Guard(Builder
);
9100 Builder
.setInsertPoint(CurrentLink
);
9101 CondOp
= RecipeBuilder
.getBlockInMask(BB
);
9104 VPReductionRecipe
*RedRecipe
= new VPReductionRecipe(
9105 RdxDesc
, CurrentLinkI
, PreviousLink
, VecOp
, CondOp
);
9106 // Append the recipe to the end of the VPBasicBlock because we need to
9107 // ensure that it comes after all of it's inputs, including CondOp.
9108 // Note that this transformation may leave over dead recipes (including
9109 // CurrentLink), which will be cleaned by a later VPlan transform.
9110 LinkVPBB
->appendRecipe(RedRecipe
);
9111 CurrentLink
->replaceAllUsesWith(RedRecipe
);
9112 PreviousLink
= RedRecipe
;
9115 Builder
.setInsertPoint(&*LatchVPBB
->begin());
9116 for (VPRecipeBase
&R
:
9117 Plan
->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9118 VPReductionPHIRecipe
*PhiR
= dyn_cast
<VPReductionPHIRecipe
>(&R
);
9122 const RecurrenceDescriptor
&RdxDesc
= PhiR
->getRecurrenceDescriptor();
9123 // If tail is folded by masking, introduce selects between the phi
9124 // and the live-out instruction of each reduction, at the beginning of the
9125 // dedicated latch block.
9126 auto *OrigExitingVPV
= PhiR
->getBackedgeValue();
9127 auto *NewExitingVPV
= PhiR
->getBackedgeValue();
9128 if (!PhiR
->isInLoop() && CM
.foldTailByMasking()) {
9129 VPValue
*Cond
= RecipeBuilder
.getBlockInMask(OrigLoop
->getHeader());
9130 assert(OrigExitingVPV
->getDefiningRecipe()->getParent() != LatchVPBB
&&
9131 "reduction recipe must be defined before latch");
9132 Type
*PhiTy
= PhiR
->getOperand(0)->getLiveInIRValue()->getType();
9133 std::optional
<FastMathFlags
> FMFs
=
9134 PhiTy
->isFloatingPointTy()
9135 ? std::make_optional(RdxDesc
.getFastMathFlags())
9138 Builder
.createSelect(Cond
, OrigExitingVPV
, PhiR
, {}, "", FMFs
);
9139 OrigExitingVPV
->replaceUsesWithIf(NewExitingVPV
, [](VPUser
&U
, unsigned) {
9140 return isa
<VPInstruction
>(&U
) &&
9141 cast
<VPInstruction
>(&U
)->getOpcode() ==
9142 VPInstruction::ComputeReductionResult
;
9144 if (PreferPredicatedReductionSelect
||
9145 TTI
.preferPredicatedReductionSelect(
9146 PhiR
->getRecurrenceDescriptor().getOpcode(), PhiTy
,
9147 TargetTransformInfo::ReductionFlags()))
9148 PhiR
->setOperand(1, NewExitingVPV
);
9151 // If the vector reduction can be performed in a smaller type, we truncate
9152 // then extend the loop exit value to enable InstCombine to evaluate the
9153 // entire expression in the smaller type.
9154 Type
*PhiTy
= PhiR
->getStartValue()->getLiveInIRValue()->getType();
9155 if (MinVF
.isVector() && PhiTy
!= RdxDesc
.getRecurrenceType()) {
9156 assert(!PhiR
->isInLoop() && "Unexpected truncated inloop reduction!");
9157 Type
*RdxTy
= RdxDesc
.getRecurrenceType();
9159 new VPWidenCastRecipe(Instruction::Trunc
, NewExitingVPV
, RdxTy
);
9162 ? new VPWidenCastRecipe(Instruction::SExt
, Trunc
, PhiTy
)
9163 : new VPWidenCastRecipe(Instruction::ZExt
, Trunc
, PhiTy
);
9165 Trunc
->insertAfter(NewExitingVPV
->getDefiningRecipe());
9166 Extnd
->insertAfter(Trunc
);
9167 if (PhiR
->getOperand(1) == NewExitingVPV
)
9168 PhiR
->setOperand(1, Extnd
->getVPSingleValue());
9169 NewExitingVPV
= Extnd
;
9172 // We want code in the middle block to appear to execute on the location of
9173 // the scalar loop's latch terminator because: (a) it is all compiler
9174 // generated, (b) these instructions are always executed after evaluating
9175 // the latch conditional branch, and (c) other passes may add new
9176 // predecessors which terminate on this line. This is the easiest way to
9177 // ensure we don't accidentally cause an extra step back into the loop while
9179 DebugLoc ExitDL
= OrigLoop
->getLoopLatch()->getTerminator()->getDebugLoc();
9181 // TODO: At the moment ComputeReductionResult also drives creation of the
9182 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9183 // even for in-loop reductions, until the reduction resume value handling is
9184 // also modeled in VPlan.
9185 auto *FinalReductionResult
= new VPInstruction(
9186 VPInstruction::ComputeReductionResult
, {PhiR
, NewExitingVPV
}, ExitDL
);
9187 cast
<VPBasicBlock
>(VectorLoopRegion
->getSingleSuccessor())
9188 ->appendRecipe(FinalReductionResult
);
9189 OrigExitingVPV
->replaceUsesWithIf(
9190 FinalReductionResult
,
9191 [](VPUser
&User
, unsigned) { return isa
<VPLiveOut
>(&User
); });
9194 VPlanTransforms::clearReductionWrapFlags(*Plan
);
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << "  store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << "  ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif
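// As a rough illustration (exact value numbering is made up), a factor-2
// group of two loads prints along the lines of:
//
//   INTERLEAVE-GROUP with factor 2 at %l0, ir<%gep>
//     vp<%4> = load from index 0
//     vp<%5> = load from index 1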
9228 void VPWidenPointerInductionRecipe::execute(VPTransformState
&State
) {
9229 assert(IndDesc
.getKind() == InductionDescriptor::IK_PtrInduction
&&
9230 "Not a pointer induction according to InductionDescriptor!");
9231 assert(cast
<PHINode
>(getUnderlyingInstr())->getType()->isPointerTy() &&
9232 "Unexpected type.");
9234 auto *IVR
= getParent()->getPlan()->getCanonicalIV();
9235 PHINode
*CanonicalIV
= cast
<PHINode
>(State
.get(IVR
, 0));
9237 if (onlyScalarsGenerated(State
.VF
)) {
9238 // This is the normalized GEP that starts counting at zero.
9239 Value
*PtrInd
= State
.Builder
.CreateSExtOrTrunc(
9240 CanonicalIV
, IndDesc
.getStep()->getType());
9241 // Determine the number of scalars we need to generate for each unroll
9242 // iteration. If the instruction is uniform, we only need to generate the
9243 // first lane. Otherwise, we generate all VF values.
9244 bool IsUniform
= vputils::onlyFirstLaneUsed(this);
9245 assert((IsUniform
|| !State
.VF
.isScalable()) &&
9246 "Cannot scalarize a scalable VF");
9247 unsigned Lanes
= IsUniform
? 1 : State
.VF
.getFixedValue();
9249 for (unsigned Part
= 0; Part
< State
.UF
; ++Part
) {
9251 createStepForVF(State
.Builder
, PtrInd
->getType(), State
.VF
, Part
);
9253 for (unsigned Lane
= 0; Lane
< Lanes
; ++Lane
) {
9254 Value
*Idx
= State
.Builder
.CreateAdd(
9255 PartStart
, ConstantInt::get(PtrInd
->getType(), Lane
));
9256 Value
*GlobalIdx
= State
.Builder
.CreateAdd(PtrInd
, Idx
);
9258 Value
*Step
= State
.get(getOperand(1), VPIteration(Part
, Lane
));
9259 Value
*SclrGep
= emitTransformedIndex(
9260 State
.Builder
, GlobalIdx
, IndDesc
.getStartValue(), Step
,
9261 IndDesc
.getKind(), IndDesc
.getInductionBinOp());
9262 SclrGep
->setName("next.gep");
9263 State
.set(this, SclrGep
, VPIteration(Part
, Lane
));
9269 Type
*PhiType
= IndDesc
.getStep()->getType();
9271 // Build a pointer phi
9272 Value
*ScalarStartValue
= getStartValue()->getLiveInIRValue();
9273 Type
*ScStValueType
= ScalarStartValue
->getType();
9274 PHINode
*NewPointerPhi
=
9275 PHINode::Create(ScStValueType
, 2, "pointer.phi", CanonicalIV
);
9277 BasicBlock
*VectorPH
= State
.CFG
.getPreheaderBBFor(this);
9278 NewPointerPhi
->addIncoming(ScalarStartValue
, VectorPH
);
9280 // A pointer induction, performed by using a gep
9281 Instruction
*InductionLoc
= &*State
.Builder
.GetInsertPoint();
9283 Value
*ScalarStepValue
= State
.get(getOperand(1), VPIteration(0, 0));
9284 Value
*RuntimeVF
= getRuntimeVF(State
.Builder
, PhiType
, State
.VF
);
9285 Value
*NumUnrolledElems
=
9286 State
.Builder
.CreateMul(RuntimeVF
, ConstantInt::get(PhiType
, State
.UF
));
9287 Value
*InductionGEP
= GetElementPtrInst::Create(
9288 State
.Builder
.getInt8Ty(), NewPointerPhi
,
9289 State
.Builder
.CreateMul(ScalarStepValue
, NumUnrolledElems
), "ptr.ind",
9291 // Add induction update using an incorrect block temporarily. The phi node
9292 // will be fixed after VPlan execution. Note that at this point the latch
9293 // block cannot be used, as it does not exist yet.
9294 // TODO: Model increment value in VPlan, by turning the recipe into a
9295 // multi-def and a subclass of VPHeaderPHIRecipe.
9296 NewPointerPhi
->addIncoming(InductionGEP
, VectorPH
);
9298 // Create UF many actual address geps that use the pointer
9299 // phi as base and a vectorized version of the step value
9300 // (<step*0, ..., step*N>) as offset.
9301 for (unsigned Part
= 0; Part
< State
.UF
; ++Part
) {
9302 Type
*VecPhiType
= VectorType::get(PhiType
, State
.VF
);
9303 Value
*StartOffsetScalar
=
9304 State
.Builder
.CreateMul(RuntimeVF
, ConstantInt::get(PhiType
, Part
));
9305 Value
*StartOffset
=
9306 State
.Builder
.CreateVectorSplat(State
.VF
, StartOffsetScalar
);
9307 // Create a vector of consecutive numbers from zero to VF.
9308 StartOffset
= State
.Builder
.CreateAdd(
9309 StartOffset
, State
.Builder
.CreateStepVector(VecPhiType
));
9311 assert(ScalarStepValue
== State
.get(getOperand(1), VPIteration(Part
, 0)) &&
9312 "scalar step must be the same across all parts");
9313 Value
*GEP
= State
.Builder
.CreateGEP(
9314 State
.Builder
.getInt8Ty(), NewPointerPhi
,
9315 State
.Builder
.CreateMul(
9317 State
.Builder
.CreateVectorSplat(State
.VF
, ScalarStepValue
),
9319 State
.set(this, GEP
, Part
);
void VPDerivedIVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "VPDerivedIVRecipe being replicated.");

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (FPBinOp)
    State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());

  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
  Value *DerivedIV = emitTransformedIndex(
      State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
      Kind, cast_if_present<BinaryOperator>(FPBinOp));
  DerivedIV->setName("offset.idx");
  if (TruncResultTy) {
    assert(TruncResultTy != DerivedIV->getType() &&
           Step->getType()->isIntegerTy() &&
           "Truncation requires an integer step");
    DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
  }
  assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");

  State.set(this, DerivedIV, VPIteration(0, 0));
}
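// In effect the recipe above computes DerivedIV = StartValue + CanonicalIV *
// Step, with the actual operation chosen by the induction kind (integer add,
// FP binop, or pointer arithmetic). For a hypothetical induction starting at
// 10 with step 4, canonical IV value i yields offset.idx = 10 + 4 * i.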
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask(),
                                      NeedsMaskForGaps);
}
9355 void VPReductionRecipe::execute(VPTransformState
&State
) {
9356 assert(!State
.Instance
&& "Reduction being replicated.");
9357 Value
*PrevInChain
= State
.get(getChainOp(), 0);
9358 RecurKind Kind
= RdxDesc
.getRecurrenceKind();
9359 bool IsOrdered
= State
.ILV
->useOrderedReductions(RdxDesc
);
9360 // Propagate the fast-math flags carried by the underlying instruction.
9361 IRBuilderBase::FastMathFlagGuard
FMFGuard(State
.Builder
);
9362 State
.Builder
.setFastMathFlags(RdxDesc
.getFastMathFlags());
9363 for (unsigned Part
= 0; Part
< State
.UF
; ++Part
) {
9364 Value
*NewVecOp
= State
.get(getVecOp(), Part
);
9365 if (VPValue
*Cond
= getCondOp()) {
9366 Value
*NewCond
= State
.VF
.isVector() ? State
.get(Cond
, Part
)
9367 : State
.get(Cond
, {Part
, 0});
9368 VectorType
*VecTy
= dyn_cast
<VectorType
>(NewVecOp
->getType());
9369 Type
*ElementTy
= VecTy
? VecTy
->getElementType() : NewVecOp
->getType();
9370 Value
*Iden
= RdxDesc
.getRecurrenceIdentity(Kind
, ElementTy
,
9371 RdxDesc
.getFastMathFlags());
9372 if (State
.VF
.isVector()) {
9374 State
.Builder
.CreateVectorSplat(VecTy
->getElementCount(), Iden
);
9377 Value
*Select
= State
.Builder
.CreateSelect(NewCond
, NewVecOp
, Iden
);
9383 if (State
.VF
.isVector())
9384 NewRed
= createOrderedReduction(State
.Builder
, RdxDesc
, NewVecOp
,
9387 NewRed
= State
.Builder
.CreateBinOp(
9388 (Instruction::BinaryOps
)RdxDesc
.getOpcode(Kind
), PrevInChain
,
9390 PrevInChain
= NewRed
;
9392 PrevInChain
= State
.get(getChainOp(), Part
);
9393 NewRed
= createTargetReduction(State
.Builder
, RdxDesc
, NewVecOp
);
9395 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind
)) {
9396 NextInChain
= createMinMaxOp(State
.Builder
, RdxDesc
.getRecurrenceKind(),
9397 NewRed
, PrevInChain
);
9398 } else if (IsOrdered
)
9399 NextInChain
= NewRed
;
9401 NextInChain
= State
.Builder
.CreateBinOp(
9402 (Instruction::BinaryOps
)RdxDesc
.getOpcode(Kind
), NewRed
, PrevInChain
);
9403 State
.set(this, NextInChain
, Part
);
9407 void VPReplicateRecipe::execute(VPTransformState
&State
) {
9408 Instruction
*UI
= getUnderlyingInstr();
9409 if (State
.Instance
) { // Generate a single instance.
9410 assert(!State
.VF
.isScalable() && "Can't scalarize a scalable vector");
9411 State
.ILV
->scalarizeInstruction(UI
, this, *State
.Instance
, State
);
9412 // Insert scalar instance packing it into a vector.
9413 if (State
.VF
.isVector() && shouldPack()) {
9414 // If we're constructing lane 0, initialize to start from poison.
9415 if (State
.Instance
->Lane
.isFirstLane()) {
9416 assert(!State
.VF
.isScalable() && "VF is assumed to be non scalable.");
9417 Value
*Poison
= PoisonValue::get(
9418 VectorType::get(UI
->getType(), State
.VF
));
9419 State
.set(this, Poison
, State
.Instance
->Part
);
9421 State
.packScalarIntoVectorValue(this, *State
.Instance
);
9427 // If the recipe is uniform across all parts (instead of just per VF), only
9428 // generate a single instance.
9429 if ((isa
<LoadInst
>(UI
) || isa
<StoreInst
>(UI
)) &&
9430 all_of(operands(), [](VPValue
*Op
) {
9431 return Op
->isDefinedOutsideVectorRegions();
9433 State
.ILV
->scalarizeInstruction(UI
, this, VPIteration(0, 0), State
);
9434 if (user_begin() != user_end()) {
9435 for (unsigned Part
= 1; Part
< State
.UF
; ++Part
)
9436 State
.set(this, State
.get(this, VPIteration(0, 0)),
9437 VPIteration(Part
, 0));
9442 // Uniform within VL means we need to generate lane 0 only for each
9444 for (unsigned Part
= 0; Part
< State
.UF
; ++Part
)
9445 State
.ILV
->scalarizeInstruction(UI
, this, VPIteration(Part
, 0), State
);
9449 // A store of a loop varying value to a uniform address only needs the last
9450 // copy of the store.
9451 if (isa
<StoreInst
>(UI
) &&
9452 vputils::isUniformAfterVectorization(getOperand(1))) {
9453 auto Lane
= VPLane::getLastLaneForVF(State
.VF
);
9454 State
.ILV
->scalarizeInstruction(UI
, this, VPIteration(State
.UF
- 1, Lane
),
9459 // Generate scalar instances for all VF lanes of all UF parts.
9460 assert(!State
.VF
.isScalable() && "Can't scalarize a scalable vector");
9461 const unsigned EndLane
= State
.VF
.getKnownMinValue();
9462 for (unsigned Part
= 0; Part
< State
.UF
; ++Part
)
9463 for (unsigned Lane
= 0; Lane
< EndLane
; ++Lane
)
9464 State
.ILV
->scalarizeInstruction(UI
, this, VPIteration(Part
, Lane
), State
);
9467 void VPWidenMemoryInstructionRecipe::execute(VPTransformState
&State
) {
9468 VPValue
*StoredValue
= isStore() ? getStoredValue() : nullptr;
9470 // Attempt to issue a wide load.
9471 LoadInst
*LI
= dyn_cast
<LoadInst
>(&Ingredient
);
9472 StoreInst
*SI
= dyn_cast
<StoreInst
>(&Ingredient
);
9474 assert((LI
|| SI
) && "Invalid Load/Store instruction");
9475 assert((!SI
|| StoredValue
) && "No stored value provided for widened store");
9476 assert((!LI
|| !StoredValue
) && "Stored value provided for widened load");
9478 Type
*ScalarDataTy
= getLoadStoreType(&Ingredient
);
9480 auto *DataTy
= VectorType::get(ScalarDataTy
, State
.VF
);
9481 const Align Alignment
= getLoadStoreAlignment(&Ingredient
);
9482 bool CreateGatherScatter
= !isConsecutive();
9484 auto &Builder
= State
.Builder
;
9485 InnerLoopVectorizer::VectorParts
BlockInMaskParts(State
.UF
);
9486 bool isMaskRequired
= getMask();
9487 if (isMaskRequired
) {
9488 // Mask reversal is only needed for non-all-one (null) masks, as reverse of
9489 // a null all-one mask is a null mask.
9490 for (unsigned Part
= 0; Part
< State
.UF
; ++Part
) {
9491 Value
*Mask
= State
.get(getMask(), Part
);
9493 Mask
= Builder
.CreateVectorReverse(Mask
, "reverse");
9494 BlockInMaskParts
[Part
] = Mask
;
9500 State
.setDebugLocFrom(SI
->getDebugLoc());
9502 for (unsigned Part
= 0; Part
< State
.UF
; ++Part
) {
9503 Instruction
*NewSI
= nullptr;
9504 Value
*StoredVal
= State
.get(StoredValue
, Part
);
9505 if (CreateGatherScatter
) {
9506 Value
*MaskPart
= isMaskRequired
? BlockInMaskParts
[Part
] : nullptr;
9507 Value
*VectorGep
= State
.get(getAddr(), Part
);
9508 NewSI
= Builder
.CreateMaskedScatter(StoredVal
, VectorGep
, Alignment
,
9512 // If we store to reverse consecutive memory locations, then we need
9513 // to reverse the order of elements in the stored value.
9514 StoredVal
= Builder
.CreateVectorReverse(StoredVal
, "reverse");
9515 // We don't want to update the value in the map as it might be used in
9516 // another expression. So don't call resetVectorValue(StoredVal).
9518 auto *VecPtr
= State
.get(getAddr(), Part
);
9520 NewSI
= Builder
.CreateMaskedStore(StoredVal
, VecPtr
, Alignment
,
9521 BlockInMaskParts
[Part
]);
9523 NewSI
= Builder
.CreateAlignedStore(StoredVal
, VecPtr
, Alignment
);
9525 State
.addMetadata(NewSI
, SI
);
9531 assert(LI
&& "Must have a load instruction");
9532 State
.setDebugLocFrom(LI
->getDebugLoc());
9533 for (unsigned Part
= 0; Part
< State
.UF
; ++Part
) {
9535 if (CreateGatherScatter
) {
9536 Value
*MaskPart
= isMaskRequired
? BlockInMaskParts
[Part
] : nullptr;
9537 Value
*VectorGep
= State
.get(getAddr(), Part
);
9538 NewLI
= Builder
.CreateMaskedGather(DataTy
, VectorGep
, Alignment
, MaskPart
,
9539 nullptr, "wide.masked.gather");
9540 State
.addMetadata(NewLI
, LI
);
9542 auto *VecPtr
= State
.get(getAddr(), Part
);
9544 NewLI
= Builder
.CreateMaskedLoad(
9545 DataTy
, VecPtr
, Alignment
, BlockInMaskParts
[Part
],
9546 PoisonValue::get(DataTy
), "wide.masked.load");
9549 Builder
.CreateAlignedLoad(DataTy
, VecPtr
, Alignment
, "wide.load");
9551 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9552 State
.addMetadata(NewLI
, LI
);
9554 NewLI
= Builder
.CreateVectorReverse(NewLI
, "reverse");
9557 State
.set(getVPSingleValue(), NewLI
, Part
);
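// Note on the reverse case above (scenario illustrative): when the access is
// consecutive with a negative stride, e.g. a loop walking an array backwards,
// the generated code loads a whole vector from the adjusted address and then
// applies a vector reverse so lane order matches the scalar loop; for stores
// (and for any mask) the reverse is applied before the memory operation
// instead.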
// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    }
  }

  // 3) If set, obey the hints
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  }

  // 4) if the TTI hook indicates this is profitable, request predication.
  TailFoldingInfo TFI(TLI, &LVL, IAI);
  if (TTI->preferPredicateOverEpilogue(&TFI))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}
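// Illustrative example of the precedence above (flags hypothetical): a
// function compiled for size (hasOptSize) whose loop also asks for predication
// via hints still gets CM_ScalarEpilogueNotAllowedOptSize, because step 1)
// wins; without the size attribute, and with no command-line override from
// step 2), the same hint is honoured in step 3) and yields
// CM_ScalarEpilogueNotNeededUsePredicate.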
9610 // Process the loop in the VPlan-native vectorization path. This path builds
9611 // VPlan upfront in the vectorization pipeline, which allows to apply
9612 // VPlan-to-VPlan transformations from the very beginning without modifying the
9614 static bool processLoopInVPlanNativePath(
9615 Loop
*L
, PredicatedScalarEvolution
&PSE
, LoopInfo
*LI
, DominatorTree
*DT
,
9616 LoopVectorizationLegality
*LVL
, TargetTransformInfo
*TTI
,
9617 TargetLibraryInfo
*TLI
, DemandedBits
*DB
, AssumptionCache
*AC
,
9618 OptimizationRemarkEmitter
*ORE
, BlockFrequencyInfo
*BFI
,
9619 ProfileSummaryInfo
*PSI
, LoopVectorizeHints
&Hints
,
9620 LoopVectorizationRequirements
&Requirements
) {
9622 if (isa
<SCEVCouldNotCompute
>(PSE
.getBackedgeTakenCount())) {
9623 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9626 assert(EnableVPlanNativePath
&& "VPlan-native path is disabled.");
9627 Function
*F
= L
->getHeader()->getParent();
9628 InterleavedAccessInfo
IAI(PSE
, L
, DT
, LI
, LVL
->getLAI());
9630 ScalarEpilogueLowering SEL
=
9631 getScalarEpilogueLowering(F
, L
, Hints
, PSI
, BFI
, TTI
, TLI
, *LVL
, &IAI
);
9633 LoopVectorizationCostModel
CM(SEL
, L
, PSE
, LI
, LVL
, *TTI
, TLI
, DB
, AC
, ORE
, F
,
9635 // Use the planner for outer loop vectorization.
9636 // TODO: CM is not used at this point inside the planner. Turn CM into an
9637 // optional argument if we don't need it in the future.
9638 LoopVectorizationPlanner
LVP(L
, LI
, DT
, TLI
, *TTI
, LVL
, CM
, IAI
, PSE
, Hints
,
9641 // Get user vectorization factor.
9642 ElementCount UserVF
= Hints
.getWidth();
9644 CM
.collectElementTypesForWidening();
9646 // Plan how to best vectorize, return the best VF and its cost.
9647 const VectorizationFactor VF
= LVP
.planInVPlanNativePath(UserVF
);
9649 // If we are stress testing VPlan builds, do not attempt to generate vector
9650 // code. Masked vector code generation support will follow soon.
9651 // Also, do not attempt to vectorize if no vector code will be produced.
9652 if (VPlanBuildStressTest
|| VectorizationFactor::Disabled() == VF
)
9655 VPlan
&BestPlan
= LVP
.getBestPlanFor(VF
.Width
);
9658 bool AddBranchWeights
=
9659 hasBranchWeightMD(*L
->getLoopLatch()->getTerminator());
9660 GeneratedRTChecks
Checks(*PSE
.getSE(), DT
, LI
, TTI
,
9661 F
->getParent()->getDataLayout(), AddBranchWeights
);
9662 InnerLoopVectorizer
LB(L
, PSE
, LI
, DT
, TLI
, TTI
, AC
, ORE
, VF
.Width
,
9663 VF
.Width
, 1, LVL
, &CM
, BFI
, PSI
, Checks
);
9664 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9665 << L
->getHeader()->getParent()->getName() << "\"\n");
9666 LVP
.executePlan(VF
.Width
, 1, BestPlan
, LB
, DT
, false);
9669 reportVectorization(ORE
, L
, VF
, 1);
9671 // Mark the loop as already vectorized to avoid vectorizing again.
9672 Hints
.setAlreadyVectorized();
9673 assert(!verifyFunction(*L
->getHeader()->getParent(), &dbgs()));
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}
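// A typical pattern flagged by the check above (C source illustrative only):
//
//   float *a;
//   for (int i = 0; i < n; ++i)
//     a[i] = a[i] + 1.0;   // 1.0 is a double constant
//
// Here a[i] is extended to double (fpext), added in double, truncated back and
// stored as float. Walking up from the float store finds the fpext and emits
// the remark, since the vector loop must operate at double width and pay for
// the conversions.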
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                       VectorizationFactor &VF,
                                       std::optional<unsigned> VScale, Loop *L,
                                       ScalarEvolution &SE,
                                       ScalarEpilogueLowering SEL) {
  InstructionCost CheckCost = Checks.getCost();
  if (!CheckCost.isValid())
    return false;

  // When interleaving only scalar and vector cost will be equal, which in turn
  // would lead to a divide by 0. Fall back to hard threshold.
  if (VF.Width.isScalar()) {
    if (CheckCost > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user-specified
  // VF/IC. In those cases, runtime checks should always be generated.
  double ScalarC = *VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  //  The total cost of the scalar loop is
  //   ScalarC * TC
  //  where
  //  * TC is the actual trip count of the loop.
  //  * ScalarC is the cost of a single scalar iteration.
  //
  //  The total cost of the vector loop is
  //    RtC + VecC * (TC / VF) + EpiC
  //  where
  //  * RtC is the cost of the generated runtime checks
  //  * VecC is the cost of a single vector iteration.
  //  * TC is the actual trip count of the loop
  //  * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
  // is rounded up, hence we get an upper estimate of the TC.
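  // Worked example with made-up costs: if RtC = 35, ScalarC = 4, VecC = 8 and
  // VF = 8, then VecC / VF = 1 and the bound above gives
  //   MinTC1 = 35 / (4 - 1) = 11.67,
  // i.e. the vector loop plus its checks only beats the scalar loop once the
  // trip count reaches roughly 12 (before the second, check-overhead bound
  // computed below is taken into account).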
  unsigned IntVF = VF.Width.getKnownMinValue();
  if (VF.Width.isScalable()) {
    unsigned AssumedMinimumVscale = 1;
    if (VScale)
      AssumedMinimumVscale = *VScale;
    IntVF *= AssumedMinimumVscale;
  }
  double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
  double RtC = *CheckCost.getValue();
  double MinTC1 = RtC / (ScalarC - VecCOverVF);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  double MinTC2 = RtC * 10 / ScalarC;

  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(MinTC, IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
    if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
                                VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");
      return false;
    }
  }
  return true;
}
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
9830 bool LoopVectorizePass::processLoop(Loop
*L
) {
9831 assert((EnableVPlanNativePath
|| L
->isInnermost()) &&
9832 "VPlan-native path is not enabled. Only process inner loops.");
9835 const std::string DebugLocStr
= getDebugLocString(L
);
9838 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9839 << L
->getHeader()->getParent()->getName() << "' from "
9840 << DebugLocStr
<< "\n");
9842 LoopVectorizeHints
Hints(L
, InterleaveOnlyWhenForced
, *ORE
, TTI
);
9845 dbgs() << "LV: Loop hints:"
9847 << (Hints
.getForce() == LoopVectorizeHints::FK_Disabled
9849 : (Hints
.getForce() == LoopVectorizeHints::FK_Enabled
9852 << " width=" << Hints
.getWidth()
9853 << " interleave=" << Hints
.getInterleave() << "\n");
9855 // Function containing loop
9856 Function
*F
= L
->getHeader()->getParent();
9858 // Looking at the diagnostic output is the only way to determine if a loop
9859 // was vectorized (other than looking at the IR or machine code), so it
9860 // is important to generate an optimization remark for each loop. Most of
9861 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9862 // generated as OptimizationRemark and OptimizationRemarkMissed are
9863 // less verbose reporting vectorized loops and unvectorized loops that may
9864 // benefit from vectorization, respectively.
9866 if (!Hints
.allowVectorization(F
, L
, VectorizeOnlyWhenForced
)) {
9867 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9871 PredicatedScalarEvolution
PSE(*SE
, *L
);
9873 // Check if it is legal to vectorize the loop.
9874 LoopVectorizationRequirements Requirements
;
9875 LoopVectorizationLegality
LVL(L
, PSE
, DT
, TTI
, TLI
, F
, *LAIs
, LI
, ORE
,
9876 &Requirements
, &Hints
, DB
, AC
, BFI
, PSI
);
9877 if (!LVL
.canVectorize(EnableVPlanNativePath
)) {
9878 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9879 Hints
.emitRemarkWithHints();
9883 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9884 // here. They may require CFG and instruction level transformations before
9885 // even evaluating whether vectorization is profitable. Since we cannot modify
9886 // the incoming IR, we need to build VPlan upfront in the vectorization
9888 if (!L
->isInnermost())
9889 return processLoopInVPlanNativePath(L
, PSE
, LI
, DT
, &LVL
, TTI
, TLI
, DB
, AC
,
9890 ORE
, BFI
, PSI
, Hints
, Requirements
);
9892 assert(L
->isInnermost() && "Inner loop expected.");
9894 InterleavedAccessInfo
IAI(PSE
, L
, DT
, LI
, LVL
.getLAI());
9895 bool UseInterleaved
= TTI
->enableInterleavedAccessVectorization();
9897 // If an override option has been passed in for interleaved accesses, use it.
9898 if (EnableInterleavedMemAccesses
.getNumOccurrences() > 0)
9899 UseInterleaved
= EnableInterleavedMemAccesses
;
9901 // Analyze interleaved memory accesses.
9903 IAI
.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI
));
9905 // Check the function attributes and profiles to find out if this function
9906 // should be optimized for size.
9907 ScalarEpilogueLowering SEL
=
9908 getScalarEpilogueLowering(F
, L
, Hints
, PSI
, BFI
, TTI
, TLI
, LVL
, &IAI
);
9910 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9911 // count by optimizing for size, to minimize overheads.
9912 auto ExpectedTC
= getSmallBestKnownTC(*SE
, L
);
9913 if (ExpectedTC
&& *ExpectedTC
< TinyTripCountVectorThreshold
) {
9914 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9915 << "This loop is worth vectorizing only if no scalar "
9916 << "iteration overheads are incurred.");
9917 if (Hints
.getForce() == LoopVectorizeHints::FK_Enabled
)
9918 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9920 if (*ExpectedTC
> TTI
->getMinTripCountTailFoldingThreshold()) {
9921 LLVM_DEBUG(dbgs() << "\n");
9922 // Predicate tail-folded loops are efficient even when the loop
9923 // iteration count is low. However, setting the epilogue policy to
9924 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9925 // with runtime checks. It's more effective to let
9926 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9928 if (SEL
!= CM_ScalarEpilogueNotNeededUsePredicate
)
9929 SEL
= CM_ScalarEpilogueNotAllowedLowTripLoop
;
9931 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9932 "small to consider vectorizing.\n");
9933 reportVectorizationFailure(
9934 "The trip count is below the minial threshold value.",
9935 "loop trip count is too low, avoiding vectorization",
9936 "LowTripCount", ORE
, L
);
9937 Hints
.emitRemarkWithHints();

  // Check the function attributes to see if implicit floats or vectors are
  // allowed.
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }
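  // Why reordering matters: vectorizing a reduction such as
  //   for (i = 0; i < n; i++) Sum += A[i];   // float Sum
  // reassociates the additions (partial sums are accumulated per vector lane
  // and combined at the end), which can change the rounded result. That is
  // only legal with suitable fast-math flags, or when the target supports
  // ordered (strictly in-order) reductions.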

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();
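  // These hints typically come from loop metadata emitted for pragmas such as
  //   #pragma clang loop vectorize_width(4) interleave_count(2)
  // A width/count of 0 means the user did not request a specific value (see
  // the UserIC > 0 check further down).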

  // Plan how to best vectorize, return the best VF and its cost.
  std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  bool AddBranchWeights =
      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                           F->getParent()->getDataLayout(), AddBranchWeights);
  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);

    unsigned SelectedIC = std::max(IC, UserIC);
    // Optimistically generate runtime checks if they are needed. Drop them if
    // they turn out to not be profitable.
    if (VF.Width.isVector() || SelectedIC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
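    // The generated checks fall into two groups: SCEV predicate checks (e.g.
    // that an induction variable does not wrap) coming from PSE.getPredicate(),
    // and memory runtime checks (pointer overlap/alias checks) derived from
    // the LoopAccessInfo. Generating them here is speculative; they are
    // dropped again if the profitability check below rejects them.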

    // Check if it is profitable to vectorize with runtime checks.
    bool ForceVectorization =
        Hints.getForce() == LoopVectorizeHints::FK_Enabled;
    if (!ForceVectorization &&
        !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
                                    *PSE.getSE(), SEL)) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
                   L->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return false;
    }
  }
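  // The profitability question above is essentially: the runtime checks add a
  // fixed cost on every entry to the loop and keep a scalar fallback path
  // alive, so they only pay off if the expected trip count is large enough to
  // amortize them. When the check cost dominates, we bail out here rather than
  // emit a vector loop that is rarely, if ever, taken.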

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
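        // EPI bundles the parameters of both passes. For example, if the cost
        // model picked VF.Width = 8 and IC = 2 with EpilogueVF.Width = 4, the
        // main loop is vectorized 8-wide and unrolled by 2, and the leftover
        // iterations are handled by a second vector loop that is 4-wide
        // (EpilogueUF is fixed to 1) before falling back to the scalar
        // remainder.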
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
            EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
        ++LoopsVectorized;

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
        VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
        VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
        Header->setName("vec.epilog.vector.body");

        // Re-use the trip count and steps expanded for the main loop, as
        // skeleton creation needs it as a value that dominates both the scalar
        // and vector epilogue loops.
        // TODO: This is a workaround needed for epilogue vectorization and it
        // should be removed once induction resume value creation is done
        // directly in VPlan.
        EpilogILV.setTripCount(MainILV.getTripCount());
        for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
          auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
          auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
              ExpandedSCEVs.find(ExpandR->getSCEV())->second);
          ExpandR->replaceAllUsesWith(ExpandedVal);
          ExpandR->eraseFromParent();
        }

        // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
        // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
        // before vectorizing the epilogue loop.
        for (VPRecipeBase &R : Header->phis()) {
          if (isa<VPCanonicalIVPHIRecipe>(&R))
            continue;

          Value *ResumeV = nullptr;
          // TODO: Move setting of resume values to prepareToExecute.
          if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
            ResumeV = ReductionResumeValues
                          .find(&ReductionPhi->getRecurrenceDescriptor())
                          ->second;
          } else {
            // Create induction resume values for both widened pointer and
            // integer/fp inductions and update the start value of the induction
            // recipes to use the resume value.
            PHINode *IndPhi = nullptr;
            const InductionDescriptor *ID;
            if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
              IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
              ID = &Ind->getInductionDescriptor();
            } else {
              auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
              IndPhi = WidenInd->getPHINode();
              ID = &WidenInd->getInductionDescriptor();
            }

            ResumeV = MainILV.createInductionResumeValue(
                IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
                {EPI.MainLoopIterationCountCheck});
          }
          assert(ResumeV && "Must have a resume value");
          VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
          cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
        }

        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT, true, &ExpandedSCEVs);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                               VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
                               PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      reportVectorization(ORE, L, VF, IC);
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  std::optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID) {
    L->setLoopID(*RemainderLoopID);
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
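    // In IR terms, setAlreadyVectorized() attaches loop metadata roughly of
    // the shape (shown for illustration only):
    //   !llvm.loop !0
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.isvectorized", i32 1}
    // so a later run of the vectorizer skips this remainder loop.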
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));

  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  // Stash the analyses in the pass members used throughout processLoop().
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = BFI_;
  TLI = TLI_;
  AC = &AC_;
  LAIs = &LAIs_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
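  // "Simplified form" here means loop-simplify form: each loop has a single
  // preheader, a single backedge (and thus a single latch), and dedicated
  // exit blocks. Much of processLoop() relies on that shape, for example when
  // splitting the preheader to emit runtime checks and the vector skeleton.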

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // loop.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);
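    // LCSSA ("loop-closed SSA") inserts a PHI in each exit block for every
    // value defined inside the loop and used outside it, which keeps the
    // exit-value bookkeeping local when the vectorizer reroutes the loop's
    // exits to the new vector and remainder loops.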

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other expensive
  // analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BlockFrequencyInfo *BFI = nullptr;
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
    PA.preserve<ScalarEvolutionAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}