1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
36 //===----------------------------------------------------------------------===//
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
54 //===----------------------------------------------------------------------===//
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanAnalysis.h"
61 #include "VPlanHCFGBuilder.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/STLExtras.h"
70 #include "llvm/ADT/SmallPtrSet.h"
71 #include "llvm/ADT/SmallSet.h"
72 #include "llvm/ADT/SmallVector.h"
73 #include "llvm/ADT/Statistic.h"
74 #include "llvm/ADT/StringRef.h"
75 #include "llvm/ADT/Twine.h"
76 #include "llvm/ADT/iterator_range.h"
77 #include "llvm/Analysis/AssumptionCache.h"
78 #include "llvm/Analysis/BasicAliasAnalysis.h"
79 #include "llvm/Analysis/BlockFrequencyInfo.h"
80 #include "llvm/Analysis/CFG.h"
81 #include "llvm/Analysis/CodeMetrics.h"
82 #include "llvm/Analysis/DemandedBits.h"
83 #include "llvm/Analysis/GlobalsModRef.h"
84 #include "llvm/Analysis/LoopAccessAnalysis.h"
85 #include "llvm/Analysis/LoopAnalysisManager.h"
86 #include "llvm/Analysis/LoopInfo.h"
87 #include "llvm/Analysis/LoopIterator.h"
88 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
89 #include "llvm/Analysis/ProfileSummaryInfo.h"
90 #include "llvm/Analysis/ScalarEvolution.h"
91 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
92 #include "llvm/Analysis/TargetLibraryInfo.h"
93 #include "llvm/Analysis/TargetTransformInfo.h"
94 #include "llvm/Analysis/ValueTracking.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfo.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/MDBuilder.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/ProfDataUtils.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cmath>
146 #include <cstdint>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <map>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
156 using namespace llvm;
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169 "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171 "llvm.loop.vectorize.followup_epilogue";
172 /// @}
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178 static cl::opt<bool> EnableEpilogueVectorization(
179 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180 cl::desc("Enable vectorization of epilogue loops."));
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184 cl::desc("When epilogue vectorization is enabled, and a value greater than "
185 "1 is specified, forces the given VF for all applicable epilogue "
186 "loops."));
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190 cl::desc("Only loops with vectorization factor equal to or larger than "
191 "the specified value are considered for epilogue vectorization."));
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197 cl::desc("Loops with a constant trip count that is smaller than this "
198 "value are vectorized only if no scalar iteration overheads "
199 "are incurred."));
201 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
202 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203 cl::desc("The maximum allowed number of runtime memory checks"));
205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
206 // and that predication is preferred; it lists the available options. I.e., the
207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
208 // and predicate the instructions accordingly. If tail-folding fails, there are
209 // different fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211 enum Option {
212 ScalarEpilogue = 0,
213 PredicateElseScalarEpilogue,
214 PredicateOrDontVectorize
216 } // namespace PreferPredicateTy
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219 "prefer-predicate-over-epilogue",
220 cl::init(PreferPredicateTy::ScalarEpilogue),
221 cl::Hidden,
222 cl::desc("Tail-folding and predication preferences over creating a scalar "
223 "epilogue loop."),
224 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225 "scalar-epilogue",
226 "Don't tail-predicate loops, create scalar epilogue"),
227 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228 "predicate-else-scalar-epilogue",
229 "prefer tail-folding, create scalar epilogue if tail "
230 "folding fails."),
231 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232 "predicate-dont-vectorize",
233 "prefers tail-folding, don't attempt vectorization if "
234 "tail-folding fails.")));
236 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
237 "force-tail-folding-style", cl::desc("Force the tail folding style"),
238 cl::init(TailFoldingStyle::None),
239 cl::values(
240 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
241 clEnumValN(
242 TailFoldingStyle::Data, "data",
243 "Create lane mask for data only, using active.lane.mask intrinsic"),
244 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245 "data-without-lane-mask",
246 "Create lane mask with compare/stepvector"),
247 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
248 "Create lane mask using active.lane.mask intrinsic, and use "
249 "it for both data and control flow"),
250 clEnumValN(
251 TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252 "data-and-control-without-rt-check",
253 "Similar to data-and-control, but remove the runtime check")));
255 static cl::opt<bool> MaximizeBandwidth(
256 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
257 cl::desc("Maximize bandwidth when selecting vectorization factor which "
258 "will be determined by the smallest type in loop."));
260 static cl::opt<bool> EnableInterleavedMemAccesses(
261 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
262 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
264 /// An interleave-group may need masking if it resides in a block that needs
265 /// predication, or in order to mask away gaps.
266 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
267 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
268 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
270 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
271 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
272     cl::desc("We don't interleave loops with an estimated constant trip count "
273 "below this number"));
275 static cl::opt<unsigned> ForceTargetNumScalarRegs(
276 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
277 cl::desc("A flag that overrides the target's number of scalar registers."));
279 static cl::opt<unsigned> ForceTargetNumVectorRegs(
280 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
281 cl::desc("A flag that overrides the target's number of vector registers."));
283 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
284 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
285 cl::desc("A flag that overrides the target's max interleave factor for "
286 "scalar loops."));
288 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
289 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
290 cl::desc("A flag that overrides the target's max interleave factor for "
291 "vectorized loops."));
293 static cl::opt<unsigned> ForceTargetInstructionCost(
294 "force-target-instruction-cost", cl::init(0), cl::Hidden,
295 cl::desc("A flag that overrides the target's expected cost for "
296 "an instruction to a single constant value. Mostly "
297 "useful for getting consistent testing."));
299 static cl::opt<bool> ForceTargetSupportsScalableVectors(
300 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
301 cl::desc(
302 "Pretend that scalable vectors are supported, even if the target does "
303 "not support them. This flag should only be used for testing."));
305 static cl::opt<unsigned> SmallLoopCost(
306 "small-loop-cost", cl::init(20), cl::Hidden,
307 cl::desc(
308 "The cost of a loop that is considered 'small' by the interleaver."));
310 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
311 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
312 cl::desc("Enable the use of the block frequency analysis to access PGO "
313 "heuristics minimizing code growth in cold regions and being more "
314 "aggressive in hot regions."));
316 // Runtime interleave loops for load/store throughput.
317 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
318 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
319 cl::desc(
320 "Enable runtime interleaving until load/store ports are saturated"));
322 /// Interleave small loops with scalar reductions.
323 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
324 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
325 cl::desc("Enable interleaving for loops with small iteration counts that "
326 "contain scalar reductions to expose ILP."));
328 /// The number of stores in a loop that are allowed to need predication.
329 static cl::opt<unsigned> NumberOfStoresToPredicate(
330 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
331 cl::desc("Max number of stores to be predicated behind an if."));
333 static cl::opt<bool> EnableIndVarRegisterHeur(
334 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
335 cl::desc("Count the induction variable only once when interleaving"));
337 static cl::opt<bool> EnableCondStoresVectorization(
338 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
339 cl::desc("Enable if predication of stores during vectorization."));
341 static cl::opt<unsigned> MaxNestedScalarReductionIC(
342 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
343 cl::desc("The maximum interleave count to use when interleaving a scalar "
344 "reduction in a nested loop."));
346 static cl::opt<bool>
347 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
348 cl::Hidden,
349 cl::desc("Prefer in-loop vector reductions, "
350                                         "overriding the target's preference."));
352 static cl::opt<bool> ForceOrderedReductions(
353 "force-ordered-reductions", cl::init(false), cl::Hidden,
354     cl::desc("Enable the vectorization of loops with in-order (strict) "
355 "FP reductions"));
357 static cl::opt<bool> PreferPredicatedReductionSelect(
358 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
359 cl::desc(
360 "Prefer predicating a reduction operation over an after loop select."));
362 namespace llvm {
363 cl::opt<bool> EnableVPlanNativePath(
364 "enable-vplan-native-path", cl::Hidden,
365 cl::desc("Enable VPlan-native vectorization path with "
366 "support for outer loop vectorization."));
369 // This flag enables the stress testing of the VPlan H-CFG construction in the
370 // VPlan-native vectorization path. It must be used in conjunction with
371 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
372 // verification of the H-CFGs built.
373 static cl::opt<bool> VPlanBuildStressTest(
374 "vplan-build-stress-test", cl::init(false), cl::Hidden,
375 cl::desc(
376 "Build VPlan for every supported loop nest in the function and bail "
377 "out right after the build (stress test the VPlan H-CFG construction "
378 "in the VPlan-native vectorization path)."));
380 cl::opt<bool> llvm::EnableLoopInterleaving(
381 "interleave-loops", cl::init(true), cl::Hidden,
382 cl::desc("Enable loop interleaving in Loop vectorization passes"));
383 cl::opt<bool> llvm::EnableLoopVectorization(
384 "vectorize-loops", cl::init(true), cl::Hidden,
385 cl::desc("Run the Loop vectorization passes"));
387 static cl::opt<bool> PrintVPlansInDotFormat(
388 "vplan-print-in-dot-format", cl::Hidden,
389 cl::desc("Use dot format instead of plain text when dumping VPlans"));
391 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
392 "force-widen-divrem-via-safe-divisor", cl::Hidden,
393 cl::desc(
394 "Override cost based safe divisor widening for div/rem instructions"));
396 static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
397 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
398 cl::Hidden,
399 cl::desc("Try wider VFs if they enable the use of vector variants"));
401 // Likelihood of bypassing the vectorized loop because assumptions about SCEV
402 // variables not overflowing do not hold. See `emitSCEVChecks`.
403 static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
404 // Likelihood of bypassing the vectorized loop because pointers overlap. See
405 // `emitMemRuntimeChecks`.
406 static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
407 // Likelihood of bypassing the vectorized loop because there are zero trips left
408 // after prolog. See `emitIterationCountCheck`.
409 static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
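// Note: with weights of {1, 127}, the bypass edge is assumed to be taken
// roughly once every 128 times; in the common case the checks pass and the
// vector loop is entered.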
411 /// A helper function that returns true if the given type is irregular. The
412 /// type is irregular if its allocated size doesn't equal the store size of an
413 /// element of the corresponding vector type.
414 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
415 // Determine if an array of N elements of type Ty is "bitcast compatible"
416 // with a <N x Ty> vector.
417 // This is only true if there is no padding between the array elements.
418 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
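// For example, on a typical x86 data layout x86_fp80 has a type size of 80
// bits but an allocation size of 96 or 128 bits, so it is considered
// irregular here.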
421 /// A helper function that returns the reciprocal of the block probability of
422 /// predicated blocks. If we return X, we are assuming the predicated block
423 /// will execute once for every X iterations of the loop header.
425 /// TODO: We should use actual block probability here, if available. Currently,
426 /// we always assume predicated blocks have a 50% chance of executing.
427 static unsigned getReciprocalPredBlockProb() { return 2; }
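// Note: the cost model divides the cost of a predicated block by this value,
// so with the current constant of 2 a predicated block is assumed to execute
// on roughly half of the loop iterations.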
429 /// Returns "best known" trip count for the specified loop \p L as defined by
430 /// the following procedure:
431 /// 1) Returns exact trip count if it is known.
432 /// 2) Returns expected trip count according to profile data if any.
433 /// 3) Returns upper bound estimate if it is known.
434 /// 4) Returns std::nullopt if all of the above failed.
435 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
436 Loop *L) {
437 // Check if exact trip count is known.
438 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
439 return ExpectedTC;
441 // Check if there is an expected trip count available from profile data.
442 if (LoopVectorizeWithBlockFrequency)
443 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
444 return *EstimatedTC;
446 // Check if upper bound estimate is known.
447 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
448 return ExpectedTC;
450 return std::nullopt;
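// For example, for a loop whose exact trip count is unknown to SCEV but whose
// profile data estimates roughly 100 iterations, step 2 returns 100; without
// profile data, the SCEV upper-bound estimate (if any) is used instead.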
453 /// Return a vector containing interleaved elements from multiple
454 /// smaller input vectors.
455 static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
456 const Twine &Name) {
457 unsigned Factor = Vals.size();
458 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
460 VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
461 #ifndef NDEBUG
462 for (Value *Val : Vals)
463 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
464 #endif
466 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
467 // must use intrinsics to interleave.
468 if (VecTy->isScalableTy()) {
469 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
470 return Builder.CreateIntrinsic(
471 WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
472 /*FMFSource=*/nullptr, Name);
475 // Fixed length. Start by concatenating all vectors into a wide vector.
476 Value *WideVec = concatenateVectors(Builder, Vals);
478 // Interleave the elements into the wide vector.
479 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
480 return Builder.CreateShuffleVector(
481 WideVec, createInterleaveMask(NumElts, Factor), Name);
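// For example, with two fixed <4 x i32> inputs A = <a0,a1,a2,a3> and
// B = <b0,b1,b2,b3>, the concatenation is <a0,a1,a2,a3,b0,b1,b2,b3> and
// createInterleaveMask(4, 2) yields <0,4,1,5,2,6,3,7>, so the shuffle
// produces <a0,b0,a1,b1,a2,b2,a3,b3>.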
484 namespace {
485 // Forward declare GeneratedRTChecks.
486 class GeneratedRTChecks;
488 using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
489 } // namespace
491 namespace llvm {
493 AnalysisKey ShouldRunExtraVectorPasses::Key;
495 /// InnerLoopVectorizer vectorizes loops which contain only one basic
496 /// block to a specified vectorization factor (VF).
497 /// This class performs the widening of scalars into vectors, or multiple
498 /// scalars. This class also implements the following features:
499 /// * It inserts an epilogue loop for handling loops that don't have iteration
500 /// counts that are known to be a multiple of the vectorization factor.
501 /// * It handles the code generation for reduction variables.
502 /// * Scalarization (implementation using scalars) of un-vectorizable
503 /// instructions.
504 /// InnerLoopVectorizer does not perform any vectorization-legality
505 /// checks, and relies on the caller to check for the different legality
506 /// aspects. The InnerLoopVectorizer relies on the
507 /// LoopVectorizationLegality class to provide information about the induction
508 /// and reduction variables that were found to a given vectorization factor.
509 class InnerLoopVectorizer {
510 public:
511 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
512 LoopInfo *LI, DominatorTree *DT,
513 const TargetLibraryInfo *TLI,
514 const TargetTransformInfo *TTI, AssumptionCache *AC,
515 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
516 ElementCount MinProfitableTripCount,
517 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
518 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
519 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
520 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
521 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
522 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
523 PSI(PSI), RTChecks(RTChecks) {
524 // Query this against the original loop and save it here because the profile
525 // of the original loop header may change as the transformation happens.
526 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
527 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
529 if (MinProfitableTripCount.isZero())
530 this->MinProfitableTripCount = VecWidth;
531 else
532 this->MinProfitableTripCount = MinProfitableTripCount;
535 virtual ~InnerLoopVectorizer() = default;
537 /// Create a new empty loop that will contain vectorized instructions later
538 /// on, while the old loop will be used as the scalar remainder. Control flow
539 /// is generated around the vectorized (and scalar epilogue) loops consisting
540 /// of various checks and bypasses. Return the pre-header block of the new
541 /// loop and the start value for the canonical induction, if it is != 0. The
542 /// latter is the case when vectorizing the epilogue loop. In the case of
543   /// epilogue vectorization, this function is overridden to handle the more
544 /// complex control flow around the loops. \p ExpandedSCEVs is used to
545 /// look up SCEV expansions for expressions needed during skeleton creation.
546 virtual std::pair<BasicBlock *, Value *>
547 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
549 /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
550 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
552 // Return true if any runtime check is added.
553 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
555 /// A type for vectorized values in the new loop. Each value from the
556 /// original loop, when vectorized, is represented by UF vector values in the
557 /// new unrolled loop, where UF is the unroll factor.
558 using VectorParts = SmallVector<Value *, 2>;
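// For example, with UF = 2 and VF = 4, a single i32 value from the original
// loop is represented by two <4 x i32> values in the vectorized loop.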
560 /// A helper function to scalarize a single Instruction in the innermost loop.
561 /// Generates a sequence of scalar instances for each lane between \p MinLane
562 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
563 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
564 /// Instr's operands.
565 void scalarizeInstruction(const Instruction *Instr,
566 VPReplicateRecipe *RepRecipe,
567 const VPIteration &Instance,
568 VPTransformState &State);
570 /// Try to vectorize interleaved access group \p Group with the base address
571 /// given in \p Addr, optionally masking the vector operations if \p
572 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
573 /// values in the vectorized loop.
574 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
575 ArrayRef<VPValue *> VPDefs,
576 VPTransformState &State, VPValue *Addr,
577 ArrayRef<VPValue *> StoredValues,
578 VPValue *BlockInMask, bool NeedsMaskForGaps);
580 /// Fix the non-induction PHIs in \p Plan.
581 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
583 /// Returns true if the reordering of FP operations is not allowed, but we are
584 /// able to vectorize with strict in-order reductions for the given RdxDesc.
585 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
587 /// Create a new phi node for the induction variable \p OrigPhi to resume
588 /// iteration count in the scalar epilogue, from where the vectorized loop
589 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
590 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
591 /// and the resume values can come from an additional bypass block, the \p
592 /// AdditionalBypass pair provides information about the bypass block and the
593 /// end value on the edge from bypass to this loop.
594 PHINode *createInductionResumeValue(
595 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
596 ArrayRef<BasicBlock *> BypassBlocks,
597 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
599 /// Returns the original loop trip count.
600 Value *getTripCount() const { return TripCount; }
602 /// Used to set the trip count after ILV's construction and after the
603 /// preheader block has been executed. Note that this always holds the trip
604 /// count of the original loop for both main loop and epilogue vectorization.
605 void setTripCount(Value *TC) { TripCount = TC; }
607 protected:
608 friend class LoopVectorizationPlanner;
610 /// A small list of PHINodes.
611 using PhiVector = SmallVector<PHINode *, 4>;
613 /// A type for scalarized values in the new loop. Each value from the
614 /// original loop, when scalarized, is represented by UF x VF scalar values
615 /// in the new unrolled loop, where UF is the unroll factor and VF is the
616 /// vectorization factor.
617 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
619 /// Set up the values of the IVs correctly when exiting the vector loop.
620 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
621 Value *VectorTripCount, Value *EndValue,
622 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
623 VPlan &Plan, VPTransformState &State);
625 /// Create the exit value of first order recurrences in the middle block and
626 /// update their users.
627 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
628 VPTransformState &State);
630 /// Create code for the loop exit value of the reduction.
631 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
633 /// Iteratively sink the scalarized operands of a predicated instruction into
634 /// the block that was created for it.
635 void sinkScalarOperands(Instruction *PredInst);
637 /// Returns (and creates if needed) the trip count of the widened loop.
638 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
640 /// Returns a bitcasted value to the requested vector type.
641 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
642 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
643 const DataLayout &DL);
645 /// Emit a bypass check to see if the vector trip count is zero, including if
646 /// it overflows.
647 void emitIterationCountCheck(BasicBlock *Bypass);
649 /// Emit a bypass check to see if all of the SCEV assumptions we've
650 /// had to make are correct. Returns the block containing the checks or
651 /// nullptr if no checks have been added.
652 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
654 /// Emit bypass checks to check any memory assumptions we may have made.
655 /// Returns the block containing the checks or nullptr if no checks have been
656 /// added.
657 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
659 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
660 /// vector loop preheader, middle block and scalar preheader.
661 void createVectorLoopSkeleton(StringRef Prefix);
663 /// Create new phi nodes for the induction variables to resume iteration count
664 /// in the scalar epilogue, from where the vectorized loop left off.
665 /// In cases where the loop skeleton is more complicated (eg. epilogue
666 /// vectorization) and the resume values can come from an additional bypass
667 /// block, the \p AdditionalBypass pair provides information about the bypass
668 /// block and the end value on the edge from bypass to this loop.
669 void createInductionResumeValues(
670 const SCEV2ValueTy &ExpandedSCEVs,
671 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
673 /// Complete the loop skeleton by adding debug MDs, creating appropriate
674 /// conditional branches in the middle block, preparing the builder and
675 /// running the verifier. Return the preheader of the completed vector loop.
676 BasicBlock *completeLoopSkeleton();
678 /// Collect poison-generating recipes that may generate a poison value that is
679 /// used after vectorization, even when their operands are not poison. Those
680 /// recipes meet the following conditions:
681 /// * Contribute to the address computation of a recipe generating a widen
682 /// memory load/store (VPWidenMemoryInstructionRecipe or
683 /// VPInterleaveRecipe).
684 /// * Such a widen memory load/store has at least one underlying Instruction
685 /// that is in a basic block that needs predication and after vectorization
686 /// the generated instruction won't be predicated.
687 void collectPoisonGeneratingRecipes(VPTransformState &State);
689 /// Allow subclasses to override and print debug traces before/after vplan
690 /// execution, when trace information is requested.
691 virtual void printDebugTracesAtStart(){};
692 virtual void printDebugTracesAtEnd(){};
694 /// The original loop.
695 Loop *OrigLoop;
697 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
698 /// dynamic knowledge to simplify SCEV expressions and converts them to a
699 /// more usable form.
700 PredicatedScalarEvolution &PSE;
702 /// Loop Info.
703 LoopInfo *LI;
705 /// Dominator Tree.
706 DominatorTree *DT;
708 /// Target Library Info.
709 const TargetLibraryInfo *TLI;
711 /// Target Transform Info.
712 const TargetTransformInfo *TTI;
714 /// Assumption Cache.
715 AssumptionCache *AC;
717 /// Interface to emit optimization remarks.
718 OptimizationRemarkEmitter *ORE;
720 /// The vectorization SIMD factor to use. Each vector will have this many
721 /// vector elements.
722 ElementCount VF;
724 ElementCount MinProfitableTripCount;
726 /// The vectorization unroll factor to use. Each scalar is vectorized to this
727 /// many different vector instructions.
728 unsigned UF;
730 /// The builder that we use
731 IRBuilder<> Builder;
733 // --- Vectorization state ---
735 /// The vector-loop preheader.
736 BasicBlock *LoopVectorPreHeader;
738 /// The scalar-loop preheader.
739 BasicBlock *LoopScalarPreHeader;
741 /// Middle Block between the vector and the scalar.
742 BasicBlock *LoopMiddleBlock;
744 /// The unique ExitBlock of the scalar loop if one exists. Note that
745 /// there can be multiple exiting edges reaching this block.
746 BasicBlock *LoopExitBlock;
748 /// The scalar loop body.
749 BasicBlock *LoopScalarBody;
751 /// A list of all bypass blocks. The first block is the entry of the loop.
752 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
754 /// Store instructions that were predicated.
755 SmallVector<Instruction *, 4> PredicatedInstructions;
757 /// Trip count of the original loop.
758 Value *TripCount = nullptr;
760 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
761 Value *VectorTripCount = nullptr;
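// For example, with an original trip count of 37, VF = 4 and UF = 2, the
// vector trip count is 37 - (37 % 8) = 32, leaving 5 iterations for the
// scalar remainder.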
763 /// The legality analysis.
764 LoopVectorizationLegality *Legal;
767   /// The profitability analysis.
767 LoopVectorizationCostModel *Cost;
769 // Record whether runtime checks are added.
770 bool AddedSafetyChecks = false;
772 // Holds the end values for each induction variable. We save the end values
773 // so we can later fix-up the external users of the induction variables.
774 DenseMap<PHINode *, Value *> IVEndValues;
776 /// BFI and PSI are used to check for profile guided size optimizations.
777 BlockFrequencyInfo *BFI;
778 ProfileSummaryInfo *PSI;
780   // Whether this loop should be optimized for size based on profile-guided size
781   // optimizations.
782 bool OptForSizeBasedOnProfile;
784 /// Structure to hold information about generated runtime checks, responsible
785 /// for cleaning the checks, if vectorization turns out unprofitable.
786 GeneratedRTChecks &RTChecks;
788 // Holds the resume values for reductions in the loops, used to set the
789 // correct start value of reduction PHIs when vectorizing the epilogue.
790 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
791 ReductionResumeValues;
794 class InnerLoopUnroller : public InnerLoopVectorizer {
795 public:
796 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
797 LoopInfo *LI, DominatorTree *DT,
798 const TargetLibraryInfo *TLI,
799 const TargetTransformInfo *TTI, AssumptionCache *AC,
800 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
801 LoopVectorizationLegality *LVL,
802 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
803 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
804 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
805 ElementCount::getFixed(1),
806 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
807 BFI, PSI, Check) {}
810 /// Encapsulate information regarding vectorization of a loop and its epilogue.
811 /// This information is meant to be updated and used across two stages of
812 /// epilogue vectorization.
813 struct EpilogueLoopVectorizationInfo {
814 ElementCount MainLoopVF = ElementCount::getFixed(0);
815 unsigned MainLoopUF = 0;
816 ElementCount EpilogueVF = ElementCount::getFixed(0);
817 unsigned EpilogueUF = 0;
818 BasicBlock *MainLoopIterationCountCheck = nullptr;
819 BasicBlock *EpilogueIterationCountCheck = nullptr;
820 BasicBlock *SCEVSafetyCheck = nullptr;
821 BasicBlock *MemSafetyCheck = nullptr;
822 Value *TripCount = nullptr;
823 Value *VectorTripCount = nullptr;
825 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
826 ElementCount EVF, unsigned EUF)
827 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
828 assert(EUF == 1 &&
829 "A high UF for the epilogue loop is likely not beneficial.");
833 /// An extension of the inner loop vectorizer that creates a skeleton for a
834 /// vectorized loop that has its epilogue (residual) also vectorized.
835 /// The idea is to run the vplan on a given loop twice, first to set up the
836 /// skeleton and vectorize the main loop, and then to complete the skeleton
837 /// from the first step and vectorize the epilogue. This is achieved by
838 /// deriving two concrete strategy classes from this base class and invoking
839 /// them in succession from the loop vectorizer planner.
840 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
841 public:
842 InnerLoopAndEpilogueVectorizer(
843 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
844 DominatorTree *DT, const TargetLibraryInfo *TLI,
845 const TargetTransformInfo *TTI, AssumptionCache *AC,
846 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
847 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
848 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
849 GeneratedRTChecks &Checks)
850 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
851 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
852 CM, BFI, PSI, Checks),
853 EPI(EPI) {}
855 // Override this function to handle the more complex control flow around the
856 // three loops.
857 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
858 const SCEV2ValueTy &ExpandedSCEVs) final {
859 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
862 /// The interface for creating a vectorized skeleton using one of two
863 /// different strategies, each corresponding to one execution of the vplan
864 /// as described above.
865 virtual std::pair<BasicBlock *, Value *>
866 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
868 /// Holds and updates state information required to vectorize the main loop
869 /// and its epilogue in two separate passes. This setup helps us avoid
870 /// regenerating and recomputing runtime safety checks. It also helps us to
871 /// shorten the iteration-count-check path length for the cases where the
872 /// iteration count of the loop is so small that the main vector loop is
873 /// completely skipped.
874 EpilogueLoopVectorizationInfo &EPI;
877 /// A specialized derived class of inner loop vectorizer that performs
878 /// vectorization of *main* loops in the process of vectorizing loops and their
879 /// epilogues.
880 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
881 public:
882 EpilogueVectorizerMainLoop(
883 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
884 DominatorTree *DT, const TargetLibraryInfo *TLI,
885 const TargetTransformInfo *TTI, AssumptionCache *AC,
886 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
887 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
888 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
889 GeneratedRTChecks &Check)
890 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
891 EPI, LVL, CM, BFI, PSI, Check) {}
892 /// Implements the interface for creating a vectorized skeleton using the
893   /// *main loop* strategy (i.e., the first pass of vplan execution).
894 std::pair<BasicBlock *, Value *>
895 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
897 protected:
898 /// Emits an iteration count bypass check once for the main loop (when \p
899 /// ForEpilogue is false) and once for the epilogue loop (when \p
900 /// ForEpilogue is true).
901 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
902 void printDebugTracesAtStart() override;
903 void printDebugTracesAtEnd() override;
906 // A specialized derived class of inner loop vectorizer that performs
907 // vectorization of *epilogue* loops in the process of vectorizing loops and
908 // their epilogues.
909 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
910 public:
911 EpilogueVectorizerEpilogueLoop(
912 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
913 DominatorTree *DT, const TargetLibraryInfo *TLI,
914 const TargetTransformInfo *TTI, AssumptionCache *AC,
915 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
916 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
917 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
918 GeneratedRTChecks &Checks)
919 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
920 EPI, LVL, CM, BFI, PSI, Checks) {
921 TripCount = EPI.TripCount;
923 /// Implements the interface for creating a vectorized skeleton using the
924   /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
925 std::pair<BasicBlock *, Value *>
926 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
928 protected:
929 /// Emits an iteration count bypass check after the main vector loop has
930 /// finished to see if there are any iterations left to execute by either
931 /// the vector epilogue or the scalar epilogue.
932 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
933 BasicBlock *Bypass,
934 BasicBlock *Insert);
935 void printDebugTracesAtStart() override;
936 void printDebugTracesAtEnd() override;
938 } // end namespace llvm
940 /// Look for a meaningful debug location on the instruction or its
941 /// operands.
942 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
943 if (!I)
944 return DebugLoc();
946 DebugLoc Empty;
947 if (I->getDebugLoc() != Empty)
948 return I->getDebugLoc();
950 for (Use &Op : I->operands()) {
951 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
952 if (OpInst->getDebugLoc() != Empty)
953 return OpInst->getDebugLoc();
956 return I->getDebugLoc();
959 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
960 /// is passed, the message relates to that particular instruction.
961 #ifndef NDEBUG
962 static void debugVectorizationMessage(const StringRef Prefix,
963 const StringRef DebugMsg,
964 Instruction *I) {
965 dbgs() << "LV: " << Prefix << DebugMsg;
966 if (I != nullptr)
967 dbgs() << " " << *I;
968 else
969 dbgs() << '.';
970 dbgs() << '\n';
972 #endif
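// For example, calling debugVectorizationMessage("Not vectorizing: ", Msg, I)
// prints "LV: Not vectorizing: " followed by Msg and the instruction I, or by
// Msg and a trailing '.' when no instruction is passed.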
974 /// Create an analysis remark that explains why vectorization failed.
976 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
977 /// RemarkName is the identifier for the remark. If \p I is passed it is an
978 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
979 /// the location of the remark. \return the remark object that can be
980 /// streamed to.
981 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
982 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
983 Value *CodeRegion = TheLoop->getHeader();
984 DebugLoc DL = TheLoop->getStartLoc();
986 if (I) {
987 CodeRegion = I->getParent();
988     // If there is no debug location attached to the instruction, fall back to
989 // using the loop's.
990 if (I->getDebugLoc())
991 DL = I->getDebugLoc();
994 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
997 namespace llvm {
999 /// Return a value for Step multiplied by VF.
1000 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1001 int64_t Step) {
1002 assert(Ty->isIntegerTy() && "Expected an integer step");
1003 return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
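// For example, for VF = <vscale x 4> and Step = 2 this emits a runtime value
// equivalent to vscale * 8, while for a fixed VF of 4 it folds to the
// constant 8.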
1006 /// Return the runtime value for VF.
1007 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1008 return B.CreateElementCount(Ty, VF);
1011 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
1012 Loop *OrigLoop) {
1013 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
1014 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
1016 ScalarEvolution &SE = *PSE.getSE();
1017 return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
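// Note: the trip count is derived from the backedge-taken count; e.g. a loop
// running i = 0 .. n-1 has a backedge-taken count of n-1 and a trip count
// SCEV of n (converted to IdxTy).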
1020 void reportVectorizationFailure(const StringRef DebugMsg,
1021 const StringRef OREMsg, const StringRef ORETag,
1022 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1023 Instruction *I) {
1024 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1025 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1026 ORE->emit(
1027 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1028 << "loop not vectorized: " << OREMsg);
1031 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1032 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1033 Instruction *I) {
1034 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1035 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1036 ORE->emit(
1037 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1038 << Msg);
1041 /// Report successful vectorization of the loop. In case an outer loop is
1042 /// vectorized, prepend "outer" to the vectorization remark.
1043 static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1044 VectorizationFactor VF, unsigned IC) {
1045 LLVM_DEBUG(debugVectorizationMessage(
1046 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1047 nullptr));
1048 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1049 ORE->emit([&]() {
1050 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1051 TheLoop->getHeader())
1052 << "vectorized " << LoopType << "loop (vectorization width: "
1053 << ore::NV("VectorizationFactor", VF.Width)
1054 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1058 } // end namespace llvm
1060 #ifndef NDEBUG
1061 /// \return string containing a file name and a line # for the given loop.
1062 static std::string getDebugLocString(const Loop *L) {
1063 std::string Result;
1064 if (L) {
1065 raw_string_ostream OS(Result);
1066 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1067 LoopDbgLoc.print(OS);
1068 else
1069 // Just print the module name.
1070 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1071 OS.flush();
1073 return Result;
1075 #endif
1077 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1078 VPTransformState &State) {
1080 // Collect recipes in the backward slice of `Root` that may generate a poison
1081 // value that is used after vectorization.
1082 SmallPtrSet<VPRecipeBase *, 16> Visited;
1083 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1084 SmallVector<VPRecipeBase *, 16> Worklist;
1085 Worklist.push_back(Root);
1087 // Traverse the backward slice of Root through its use-def chain.
1088 while (!Worklist.empty()) {
1089 VPRecipeBase *CurRec = Worklist.back();
1090 Worklist.pop_back();
1092 if (!Visited.insert(CurRec).second)
1093 continue;
1095 // Prune search if we find another recipe generating a widen memory
1096 // instruction. Widen memory instructions involved in address computation
1097 // will lead to gather/scatter instructions, which don't need to be
1098 // handled.
1099 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1100 isa<VPInterleaveRecipe>(CurRec) ||
1101 isa<VPScalarIVStepsRecipe>(CurRec) ||
1102 isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1103 isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1104 continue;
1106 // This recipe contributes to the address computation of a widen
1107 // load/store. If the underlying instruction has poison-generating flags,
1108 // drop them directly.
1109 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
1110 RecWithFlags->dropPoisonGeneratingFlags();
1111 } else {
1112 Instruction *Instr = dyn_cast_or_null<Instruction>(
1113 CurRec->getVPSingleValue()->getUnderlyingValue());
1114 (void)Instr;
1115 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
1116 "found instruction with poison generating flags not covered by "
1117 "VPRecipeWithIRFlags");
1120 // Add new definitions to the worklist.
1121 for (VPValue *operand : CurRec->operands())
1122 if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1123 Worklist.push_back(OpDef);
1127 // Traverse all the recipes in the VPlan and collect the poison-generating
1128 // recipes in the backward slice starting at the address of a VPWidenRecipe or
1129 // VPInterleaveRecipe.
1130 auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1131 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1132 for (VPRecipeBase &Recipe : *VPBB) {
1133 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1134 Instruction &UnderlyingInstr = WidenRec->getIngredient();
1135 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1136 if (AddrDef && WidenRec->isConsecutive() &&
1137 Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1138 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1139 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1140 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1141 if (AddrDef) {
1142 // Check if any member of the interleave group needs predication.
1143 const InterleaveGroup<Instruction> *InterGroup =
1144 InterleaveRec->getInterleaveGroup();
1145 bool NeedPredication = false;
1146 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1147 I < NumMembers; ++I) {
1148 Instruction *Member = InterGroup->getMember(I);
1149 if (Member)
1150 NeedPredication |=
1151 Legal->blockNeedsPredication(Member->getParent());
1154 if (NeedPredication)
1155 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1162 namespace llvm {
1164 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1165 // lowered.
1166 enum ScalarEpilogueLowering {
1168 // The default: allowing scalar epilogues.
1169 CM_ScalarEpilogueAllowed,
1171 // Vectorization with OptForSize: don't allow epilogues.
1172 CM_ScalarEpilogueNotAllowedOptSize,
1174   // A special case of vectorization with OptForSize: loops with a very small
1175 // trip count are considered for vectorization under OptForSize, thereby
1176 // making sure the cost of their loop body is dominant, free of runtime
1177 // guards and scalar iteration overheads.
1178 CM_ScalarEpilogueNotAllowedLowTripLoop,
1180 // Loop hint predicate indicating an epilogue is undesired.
1181 CM_ScalarEpilogueNotNeededUsePredicate,
1183 // Directive indicating we must either tail fold or not vectorize
1184 CM_ScalarEpilogueNotAllowedUsePredicate
1187 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1189 /// LoopVectorizationCostModel - estimates the expected speedups due to
1190 /// vectorization.
1191 /// In many cases vectorization is not profitable. This can happen because of
1192 /// a number of reasons. In this class we mainly attempt to predict the
1193 /// expected speedup/slowdowns due to the supported instruction set. We use the
1194 /// TargetTransformInfo to query the different backends for the cost of
1195 /// different operations.
1196 class LoopVectorizationCostModel {
1197 public:
1198 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1199 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1200 LoopVectorizationLegality *Legal,
1201 const TargetTransformInfo &TTI,
1202 const TargetLibraryInfo *TLI, DemandedBits *DB,
1203 AssumptionCache *AC,
1204 OptimizationRemarkEmitter *ORE, const Function *F,
1205 const LoopVectorizeHints *Hints,
1206 InterleavedAccessInfo &IAI)
1207 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1208 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1209 Hints(Hints), InterleaveInfo(IAI) {}
1211 /// \return An upper bound for the vectorization factors (both fixed and
1212 /// scalable). If the factors are 0, vectorization and interleaving should be
1213 /// avoided up front.
1214 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1216 /// \return True if runtime checks are required for vectorization, and false
1217 /// otherwise.
1218 bool runtimeChecksRequired();
1220 /// Setup cost-based decisions for user vectorization factor.
1221 /// \return true if the UserVF is a feasible VF to be chosen.
1222 bool selectUserVectorizationFactor(ElementCount UserVF) {
1223 collectUniformsAndScalars(UserVF);
1224 collectInstsToScalarize(UserVF);
1225 return expectedCost(UserVF).first.isValid();
1228 /// \return The size (in bits) of the smallest and widest types in the code
1229 /// that needs to be vectorized. We ignore values that remain scalar such as
1230 /// 64 bit loop indices.
1231 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1233 /// \return The desired interleave count.
1234 /// If interleave count has been specified by metadata it will be returned.
1235 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1236 /// are the selected vectorization factor and the cost of the selected VF.
1237 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1239   /// A memory access instruction may be vectorized in more than one way.
1240   /// The form of the instruction after vectorization depends on its cost.
1241 /// This function takes cost-based decisions for Load/Store instructions
1242 /// and collects them in a map. This decisions map is used for building
1243 /// the lists of loop-uniform and loop-scalar instructions.
1244 /// The calculated cost is saved with widening decision in order to
1245 /// avoid redundant calculations.
1246 void setCostBasedWideningDecision(ElementCount VF);
1248 /// A call may be vectorized in different ways depending on whether we have
1249 /// vectorized variants available and whether the target supports masking.
1250 /// This function analyzes all calls in the function at the supplied VF,
1251 /// makes a decision based on the costs of available options, and stores that
1252 /// decision in a map for use in planning and plan execution.
1253 void setVectorizedCallDecision(ElementCount VF);
1255 /// A struct that represents some properties of the register usage
1256 /// of a loop.
1257 struct RegisterUsage {
1258 /// Holds the number of loop invariant values that are used in the loop.
1259 /// The key is ClassID of target-provided register class.
1260 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1261 /// Holds the maximum number of concurrent live intervals in the loop.
1262 /// The key is ClassID of target-provided register class.
1263 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1266 /// \return Returns information about the register usages of the loop for the
1267 /// given vectorization factors.
1268 SmallVector<RegisterUsage, 8>
1269 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1271 /// Collect values we want to ignore in the cost model.
1272 void collectValuesToIgnore();
1274 /// Collect all element types in the loop for which widening is needed.
1275 void collectElementTypesForWidening();
1277 /// Split reductions into those that happen in the loop, and those that happen
1278   /// outside. In-loop reductions are collected into InLoopReductions.
1279 void collectInLoopReductions();
1281 /// Returns true if we should use strict in-order reductions for the given
1282 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1283 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1284 /// of FP operations.
1285 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1286 return !Hints->allowReordering() && RdxDesc.isOrdered();
1289 /// \returns The smallest bitwidth each instruction can be represented with.
1290 /// The vector equivalents of these instructions should be truncated to this
1291 /// type.
1292 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1293 return MinBWs;
1296 /// \returns True if it is more profitable to scalarize instruction \p I for
1297 /// vectorization factor \p VF.
1298 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1299 assert(VF.isVector() &&
1300 "Profitable to scalarize relevant only for VF > 1.");
1302 // Cost model is not run in the VPlan-native path - return conservative
1303 // result until this changes.
1304 if (EnableVPlanNativePath)
1305 return false;
1307 auto Scalars = InstsToScalarize.find(VF);
1308 assert(Scalars != InstsToScalarize.end() &&
1309 "VF not yet analyzed for scalarization profitability");
1310 return Scalars->second.contains(I);
1313 /// Returns true if \p I is known to be uniform after vectorization.
1314 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1315 // A pseudo probe needs to be duplicated for each unrolled iteration and
1316 // vector lane so that the profiled loop trip count can be accurately
1317 // accumulated instead of being undercounted.
1318 if (isa<PseudoProbeInst>(I))
1319 return false;
1321 if (VF.isScalar())
1322 return true;
1324 // Cost model is not run in the VPlan-native path - return conservative
1325 // result until this changes.
1326 if (EnableVPlanNativePath)
1327 return false;
1329 auto UniformsPerVF = Uniforms.find(VF);
1330 assert(UniformsPerVF != Uniforms.end() &&
1331 "VF not yet analyzed for uniformity");
1332 return UniformsPerVF->second.count(I);
1335 /// Returns true if \p I is known to be scalar after vectorization.
1336 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1337 if (VF.isScalar())
1338 return true;
1340 // Cost model is not run in the VPlan-native path - return conservative
1341 // result until this changes.
1342 if (EnableVPlanNativePath)
1343 return false;
1345 auto ScalarsPerVF = Scalars.find(VF);
1346 assert(ScalarsPerVF != Scalars.end() &&
1347 "Scalar values are not calculated for VF");
1348 return ScalarsPerVF->second.count(I);
1351 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1352 /// for vectorization factor \p VF.
1353 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1354 return VF.isVector() && MinBWs.contains(I) &&
1355 !isProfitableToScalarize(I, VF) &&
1356 !isScalarAfterVectorization(I, VF);
1359 /// Decision that was taken during cost calculation for a memory instruction.
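/// For example, a consecutive unit-stride load is typically CM_Widen, a load
/// that is part of a recognized interleave group is CM_Interleave, and an
/// access the target can only handle element-by-element is CM_Scalarize
/// (an illustrative mapping; the actual choice is cost-driven).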
1360 enum InstWidening {
1361 CM_Unknown,
1362 CM_Widen, // For consecutive accesses with stride +1.
1363 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1364 CM_Interleave,
1365 CM_GatherScatter,
1366 CM_Scalarize,
1367 CM_VectorCall,
1368 CM_IntrinsicCall
1371 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1372 /// instruction \p I and vector width \p VF.
1373 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1374 InstructionCost Cost) {
1375 assert(VF.isVector() && "Expected VF >=2");
1376 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1379 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1380 /// interleaving group \p Grp and vector width \p VF.
1381 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1382 ElementCount VF, InstWidening W,
1383 InstructionCost Cost) {
1384 assert(VF.isVector() && "Expected VF >=2");
1385 // Broadcast this decision to all instructions inside the group,
1386 // but the cost will be assigned to one instruction only.
1387 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1388 if (auto *I = Grp->getMember(i)) {
1389 if (Grp->getInsertPos() == I)
1390 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1391 else
1392 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1397 /// Return the cost model decision for the given instruction \p I and vector
1398 /// width \p VF. Return CM_Unknown if this instruction did not pass
1399 /// through the cost modeling.
1400 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1401 assert(VF.isVector() && "Expected VF to be a vector VF");
1402 // Cost model is not run in the VPlan-native path - return conservative
1403 // result until this changes.
1404 if (EnableVPlanNativePath)
1405 return CM_GatherScatter;
1407 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1408 auto Itr = WideningDecisions.find(InstOnVF);
1409 if (Itr == WideningDecisions.end())
1410 return CM_Unknown;
1411 return Itr->second.first;
1414 /// Return the vectorization cost for the given instruction \p I and vector
1415 /// width \p VF.
1416 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1417 assert(VF.isVector() && "Expected VF >=2");
1418 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1419 assert(WideningDecisions.contains(InstOnVF) &&
1420 "The cost is not calculated");
1421 return WideningDecisions[InstOnVF].second;
1424 struct CallWideningDecision {
1425 InstWidening Kind;
1426 Function *Variant;
1427 Intrinsic::ID IID;
1428 std::optional<unsigned> MaskPos;
1429 InstructionCost Cost;
1432 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1433 Function *Variant, Intrinsic::ID IID,
1434 std::optional<unsigned> MaskPos,
1435 InstructionCost Cost) {
1436 assert(!VF.isScalar() && "Expected vector VF");
1437 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1438 MaskPos, Cost};
1441 CallWideningDecision getCallWideningDecision(CallInst *CI,
1442 ElementCount VF) const {
1443 assert(!VF.isScalar() && "Expected vector VF");
1444 return CallWideningDecisions.at(std::make_pair(CI, VF));
1447 /// Return True if instruction \p I is an optimizable truncate whose operand
1448 /// is an induction variable. Such a truncate will be removed by adding a new
1449 /// induction variable with the destination type.
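/// For example, a 'trunc i64 %iv to i32' of the primary induction variable
/// that only feeds 32-bit address or arithmetic computations can be removed
/// by introducing a new i32 induction variable (a sketch of the common case).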
1450 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1451 // If the instruction is not a truncate, return false.
1452 auto *Trunc = dyn_cast<TruncInst>(I);
1453 if (!Trunc)
1454 return false;
1456 // Get the source and destination types of the truncate.
1457 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1458 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1460 // If the truncate is free for the given types, return false. Replacing a
1461 // free truncate with an induction variable would add an induction variable
1462 // update instruction to each iteration of the loop. We exclude from this
1463 // check the primary induction variable since it will need an update
1464 // instruction regardless.
1465 Value *Op = Trunc->getOperand(0);
1466 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1467 return false;
1469 // If the truncated value is not an induction variable, return false.
1470 return Legal->isInductionPhi(Op);
1473 /// Collects the instructions to scalarize for each predicated instruction in
1474 /// the loop.
1475 void collectInstsToScalarize(ElementCount VF);
1477 /// Collect Uniform and Scalar values for the given \p VF.
1478 /// The sets depend on the CM decisions for Load/Store instructions
1479 /// that may be vectorized as interleaved, gather-scatter, or scalarized.
1480 /// Also make a decision on what to do about call instructions in the loop
1481 /// at that VF -- scalarize, call a known vector routine, or call a
1482 /// vector intrinsic.
1483 void collectUniformsAndScalars(ElementCount VF) {
1484 // Do the analysis once.
1485 if (VF.isScalar() || Uniforms.contains(VF))
1486 return;
1487 setCostBasedWideningDecision(VF);
1488 setVectorizedCallDecision(VF);
1489 collectLoopUniforms(VF);
1490 collectLoopScalars(VF);
1493 /// Returns true if the target machine supports masked store operation
1494 /// for the given \p DataType and kind of access to \p Ptr.
1495 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1496 return Legal->isConsecutivePtr(DataType, Ptr) &&
1497 TTI.isLegalMaskedStore(DataType, Alignment);
1500 /// Returns true if the target machine supports masked load operation
1501 /// for the given \p DataType and kind of access to \p Ptr.
1502 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1503 return Legal->isConsecutivePtr(DataType, Ptr) &&
1504 TTI.isLegalMaskedLoad(DataType, Alignment);
1507 /// Returns true if the target machine can represent \p V as a masked gather
1508 /// or scatter operation.
1509 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1510 bool LI = isa<LoadInst>(V);
1511 bool SI = isa<StoreInst>(V);
1512 if (!LI && !SI)
1513 return false;
1514 auto *Ty = getLoadStoreType(V);
1515 Align Align = getLoadStoreAlignment(V);
1516 if (VF.isVector())
1517 Ty = VectorType::get(Ty, VF);
1518 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1519 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1522 /// Returns true if the target machine supports all of the reduction
1523 /// variables found for the given VF.
1524 bool canVectorizeReductions(ElementCount VF) const {
1525 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1526 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1527 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1528 }));
1531 /// Given costs for both strategies, return true if the scalar predication
1532 /// lowering should be used for div/rem. This incorporates an override
1533 /// option so it is not simply a cost comparison.
1534 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1535 InstructionCost SafeDivisorCost) const {
1536 switch (ForceSafeDivisor) {
1537 case cl::BOU_UNSET:
1538 return ScalarCost < SafeDivisorCost;
1539 case cl::BOU_TRUE:
1540 return false;
1541 case cl::BOU_FALSE:
1542 return true;
1544 llvm_unreachable("impossible case value");
1547 /// Returns true if \p I is an instruction which requires predication and
1548 /// for which our chosen predication strategy is scalarization (i.e. we
1549 /// don't have an alternate strategy such as masking available).
1550 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1551 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1553 /// Returns true if \p I is an instruction that needs to be predicated
1554 /// at runtime. The result is independent of the predication mechanism.
1555 /// Superset of instructions that return true for isScalarWithPredication.
1556 bool isPredicatedInst(Instruction *I) const;
1558 /// Return the costs for our two available strategies for lowering a
1559 /// div/rem operation which requires speculating at least one lane.
1560 /// First result is for scalarization (will be invalid for scalable
1561 /// vectors); second is for the safe-divisor strategy.
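/// As a rough sketch: scalarization executes the div/rem in a predicated
/// scalar block, while the safe-divisor strategy substitutes a known-safe
/// divisor (e.g. 1) into the masked-off lanes and performs the operation on
/// the whole vector.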
1562 std::pair<InstructionCost, InstructionCost>
1563 getDivRemSpeculationCost(Instruction *I,
1564 ElementCount VF) const;
1566 /// Returns true if \p I is a memory instruction with consecutive memory
1567 /// access that can be widened.
1568 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1570 /// Returns true if \p I is a memory instruction in an interleaved-group
1571 /// of memory accesses that can be vectorized with wide vector loads/stores
1572 /// and shuffles.
1573 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1575 /// Check if \p Instr belongs to any interleaved access group.
1576 bool isAccessInterleaved(Instruction *Instr) {
1577 return InterleaveInfo.isInterleaved(Instr);
1580 /// Get the interleaved access group that \p Instr belongs to.
1581 const InterleaveGroup<Instruction> *
1582 getInterleavedAccessGroup(Instruction *Instr) {
1583 return InterleaveInfo.getInterleaveGroup(Instr);
1586 /// Returns true if we're required to use a scalar epilogue for at least
1587 /// the final iteration of the original loop.
1588 bool requiresScalarEpilogue(bool IsVectorizing) const {
1589 if (!isScalarEpilogueAllowed())
1590 return false;
1591 // If we might exit from anywhere but the latch, we must run the exiting
1592 // iteration in scalar form.
1593 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1594 return true;
1595 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1598 /// Returns true if we're required to use a scalar epilogue for at least
1599 /// the final iteration of the original loop for all VFs in \p Range.
1600 /// A scalar epilogue must either be required for all VFs in \p Range or for
1601 /// none.
1602 bool requiresScalarEpilogue(VFRange Range) const {
1603 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1604 return requiresScalarEpilogue(VF.isVector());
1606 bool IsRequired = all_of(Range, RequiresScalarEpilogue);
1607 assert(
1608 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1609 "all VFs in range must agree on whether a scalar epilogue is required");
1610 return IsRequired;
1613 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1614 /// loop hint annotation.
1615 bool isScalarEpilogueAllowed() const {
1616 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1619 /// Returns the TailFoldingStyle that is best for the current loop.
1620 TailFoldingStyle
1621 getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1622 if (!CanFoldTailByMasking)
1623 return TailFoldingStyle::None;
1625 if (ForceTailFoldingStyle.getNumOccurrences())
1626 return ForceTailFoldingStyle;
1628 return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
1631 /// Returns true if all loop blocks should be masked to fold the tail loop.
1632 bool foldTailByMasking() const {
1633 return getTailFoldingStyle() != TailFoldingStyle::None;
1636 /// Returns true if the instructions in this block require predication
1637 /// for any reason, e.g. because tail folding now requires a predicate
1638 /// or because the block in the original loop was predicated.
1639 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1640 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1643 /// Returns true if the Phi is part of an inloop reduction.
1644 bool isInLoopReduction(PHINode *Phi) const {
1645 return InLoopReductions.contains(Phi);
1648 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1649 /// with factor VF. Return the cost of the instruction, including
1650 /// scalarization overhead if it's needed.
1651 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1653 /// Estimate cost of a call instruction CI if it were vectorized with factor
1654 /// VF. Return the cost of the instruction, including scalarization overhead
1655 /// if it's needed.
1656 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1658 /// Invalidates decisions already taken by the cost model.
1659 void invalidateCostModelingDecisions() {
1660 WideningDecisions.clear();
1661 CallWideningDecisions.clear();
1662 Uniforms.clear();
1663 Scalars.clear();
1666 /// The vectorization cost is a combination of the cost itself and a boolean
1667 /// indicating whether any of the contributing operations will actually
1668 /// operate on vector values after type legalization in the backend. If this
1669 /// latter value is false, then all operations will be scalarized (i.e. no
1670 /// vectorization has actually taken place).
1671 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1673 /// Returns the expected execution cost. The unit of the cost does
1674 /// not matter because we use the 'cost' units to compare different
1675 /// vector widths. The cost that is returned is *not* normalized by
1676 /// the factor width. If \p Invalid is not nullptr, this function
1677 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1678 /// each instruction that has an Invalid cost for the given VF.
1679 VectorizationCostTy
1680 expectedCost(ElementCount VF,
1681 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1683 bool hasPredStores() const { return NumPredStores > 0; }
1685 /// Returns true if epilogue vectorization is considered profitable, and
1686 /// false otherwise.
1687 /// \p VF is the vectorization factor chosen for the original loop.
1688 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1690 private:
1691 unsigned NumPredStores = 0;
1693 /// \return An upper bound for the vectorization factors for both
1694 /// fixed and scalable vectorization, where the minimum-known number of
1695 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1696 /// disabled or unsupported, then the scalable part will be equal to
1697 /// ElementCount::getScalable(0).
1698 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1699 ElementCount UserVF,
1700 bool FoldTailByMasking);
1702 /// \return the maximized element count based on the target's vector
1703 /// registers and the loop trip-count, but limited to a maximum safe VF.
1704 /// This is a helper function of computeFeasibleMaxVF.
1705 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1706 unsigned SmallestType,
1707 unsigned WidestType,
1708 ElementCount MaxSafeVF,
1709 bool FoldTailByMasking);
1711 /// \return the maximum legal scalable VF, based on the safe max number
1712 /// of elements.
1713 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1715 /// Returns the execution time cost of an instruction for a given vector
1716 /// width. Vector width of one means scalar.
1717 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1719 /// The cost-computation logic from getInstructionCost which provides
1720 /// the vector type as an output parameter.
1721 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1722 Type *&VectorTy);
1724 /// Return the cost of instructions in an inloop reduction pattern, if I is
1725 /// part of that pattern.
1726 std::optional<InstructionCost>
1727 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1728 TTI::TargetCostKind CostKind) const;
1730 /// Calculate vectorization cost of memory instruction \p I.
1731 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1733 /// The cost computation for scalarized memory instruction.
1734 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1736 /// The cost computation for interleaving group of memory instructions.
1737 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1739 /// The cost computation for Gather/Scatter instruction.
1740 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1742 /// The cost computation for widening instruction \p I with consecutive
1743 /// memory access.
1744 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1746 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1747 /// Load: scalar load + broadcast.
1748 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1749 /// element)
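/// For example, 'sum += A[0]' inside the loop is a load from a uniform
/// address: its cost is one scalar load plus a broadcast of the loaded value;
/// storing a loop-invariant value to a uniform address costs just the scalar
/// store.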
1750 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1752 /// Estimate the overhead of scalarizing an instruction. This is a
1753 /// convenience wrapper for the type-based getScalarizationOverhead API.
1754 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1755 TTI::TargetCostKind CostKind) const;
1757 /// Returns true if an artificially high cost for emulated masked memrefs
1758 /// should be used.
1759 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1761 /// Map of scalar integer values to the smallest bitwidth they can be legally
1762 /// represented as. The vector equivalents of these values should be truncated
1763 /// to this type.
1764 MapVector<Instruction *, uint64_t> MinBWs;
1766 /// A type representing the costs for instructions if they were to be
1767 /// scalarized rather than vectorized. The entries are Instruction-Cost
1768 /// pairs.
1769 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1771 /// A set containing all BasicBlocks that are known to be present after
1772 /// vectorization as predicated blocks.
1773 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1774 PredicatedBBsAfterVectorization;
1776 /// Records whether it is allowed to have the original scalar loop execute at
1777 /// least once. This may be needed as a fallback loop in case runtime
1778 /// aliasing/dependence checks fail, or to handle the tail/remainder
1779 /// iterations when the trip count is unknown or not divisible by the VF,
1780 /// or as a peel-loop to handle gaps in interleave-groups.
1781 /// Under optsize and when the trip count is very small we don't allow any
1782 /// iterations to execute in the scalar loop.
1783 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1785 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1786 bool CanFoldTailByMasking = false;
1788 /// A map holding scalar costs for different vectorization factors. The
1789 /// presence of a cost for an instruction in the mapping indicates that the
1790 /// instruction will be scalarized when vectorizing with the associated
1791 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1792 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1794 /// Holds the instructions known to be uniform after vectorization.
1795 /// The data is collected per VF.
1796 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1798 /// Holds the instructions known to be scalar after vectorization.
1799 /// The data is collected per VF.
1800 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1802 /// Holds the instructions (address computations) that are forced to be
1803 /// scalarized.
1804 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1806 /// PHINodes of the reductions that should be expanded in-loop.
1807 SmallPtrSet<PHINode *, 4> InLoopReductions;
1809 /// A Map of inloop reduction operations and their immediate chain operand.
1810 /// FIXME: This can be removed once reductions can be costed correctly in
1811 /// VPlan. This was added to allow quick lookup of the inloop operations.
1812 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1814 /// Returns the expected difference in cost from scalarizing the expression
1815 /// feeding a predicated instruction \p PredInst. The instructions to
1816 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1817 /// non-negative return value implies the expression will be scalarized.
1818 /// Currently, only single-use chains are considered for scalarization.
1819 InstructionCost computePredInstDiscount(Instruction *PredInst,
1820 ScalarCostsTy &ScalarCosts,
1821 ElementCount VF);
1823 /// Collect the instructions that are uniform after vectorization. An
1824 /// instruction is uniform if we represent it with a single scalar value in
1825 /// the vectorized loop corresponding to each vector iteration. Examples of
1826 /// uniform instructions include pointer operands of consecutive or
1827 /// interleaved memory accesses. Note that although uniformity implies an
1828 /// instruction will be scalar, the reverse is not true. In general, a
1829 /// scalarized instruction will be represented by VF scalar values in the
1830 /// vectorized loop, each corresponding to an iteration of the original
1831 /// scalar loop.
1832 void collectLoopUniforms(ElementCount VF);
1834 /// Collect the instructions that are scalar after vectorization. An
1835 /// instruction is scalar if it is known to be uniform or will be scalarized
1836 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1837 /// to the list if they are used by a load/store instruction that is marked as
1838 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1839 /// VF values in the vectorized loop, each corresponding to an iteration of
1840 /// the original scalar loop.
1841 void collectLoopScalars(ElementCount VF);
1843 /// Keeps cost model vectorization decision and cost for instructions.
1844 /// Right now it is used for memory instructions only.
1845 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1846 std::pair<InstWidening, InstructionCost>>;
1848 DecisionList WideningDecisions;
1850 using CallDecisionList =
1851 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1853 CallDecisionList CallWideningDecisions;
1855 /// Returns true if \p V is expected to be vectorized and it needs to be
1856 /// extracted.
1857 bool needsExtract(Value *V, ElementCount VF) const {
1858 Instruction *I = dyn_cast<Instruction>(V);
1859 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1860 TheLoop->isLoopInvariant(I))
1861 return false;
1863 // Assume we can vectorize V (and hence we need extraction) if the
1864 // scalars are not computed yet. This can happen, because it is called
1865 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1866 // the scalars are collected. That should be a safe assumption in most
1867 // cases, because we check if the operands have vectorizable types
1868 // beforehand in LoopVectorizationLegality.
1869 return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
1872 /// Returns a range containing only operands needing to be extracted.
1873 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1874 ElementCount VF) const {
1875 return SmallVector<Value *, 4>(make_filter_range(
1876 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1879 public:
1880 /// The loop that we evaluate.
1881 Loop *TheLoop;
1883 /// Predicated scalar evolution analysis.
1884 PredicatedScalarEvolution &PSE;
1886 /// Loop Info analysis.
1887 LoopInfo *LI;
1889 /// Vectorization legality.
1890 LoopVectorizationLegality *Legal;
1892 /// Vector target information.
1893 const TargetTransformInfo &TTI;
1895 /// Target Library Info.
1896 const TargetLibraryInfo *TLI;
1898 /// Demanded bits analysis.
1899 DemandedBits *DB;
1901 /// Assumption cache.
1902 AssumptionCache *AC;
1904 /// Interface to emit optimization remarks.
1905 OptimizationRemarkEmitter *ORE;
1907 const Function *TheFunction;
1909 /// Loop Vectorize Hint.
1910 const LoopVectorizeHints *Hints;
1912 /// The interleaved access information contains groups of interleaved accesses
1913 /// with the same stride that are close to each other.
1914 InterleavedAccessInfo &InterleaveInfo;
1916 /// Values to ignore in the cost model.
1917 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1919 /// Values to ignore in the cost model when VF > 1.
1920 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1922 /// All element types found in the loop.
1923 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1925 } // end namespace llvm
1927 namespace {
1928 /// Helper class to manage generating runtime checks for vectorization.
1930 /// The runtime checks are created up-front in temporary blocks, un-linked from
1931 /// the existing IR, to allow a better estimate of their cost. After deciding to
1932 /// vectorize, the checks are moved back into the IR. If we decide not to
1933 /// vectorize, the temporary blocks are removed completely.
1934 class GeneratedRTChecks {
1935 /// Basic block which contains the generated SCEV checks, if any.
1936 BasicBlock *SCEVCheckBlock = nullptr;
1938 /// The value representing the result of the generated SCEV checks. If it is
1939 /// nullptr, either no SCEV checks have been generated or they have been used.
1940 Value *SCEVCheckCond = nullptr;
1942 /// Basic block which contains the generated memory runtime checks, if any.
1943 BasicBlock *MemCheckBlock = nullptr;
1945 /// The value representing the result of the generated memory runtime checks.
1946 /// If it is nullptr, either no memory runtime checks have been generated or
1947 /// they have been used.
1948 Value *MemRuntimeCheckCond = nullptr;
1950 DominatorTree *DT;
1951 LoopInfo *LI;
1952 TargetTransformInfo *TTI;
1954 SCEVExpander SCEVExp;
1955 SCEVExpander MemCheckExp;
1957 bool CostTooHigh = false;
1958 const bool AddBranchWeights;
1960 Loop *OuterLoop = nullptr;
1962 public:
1963 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1964 TargetTransformInfo *TTI, const DataLayout &DL,
1965 bool AddBranchWeights)
1966 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1967 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1969 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1970 /// accurately estimate the cost of the runtime checks. The blocks are
1971 /// un-linked from the IR and are added back during vector code generation. If
1972 /// there is no vector code generation, the check blocks are removed
1973 /// completely.
1974 void Create(Loop *L, const LoopAccessInfo &LAI,
1975 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1977 // Hard cutoff to limit compile-time increase in case a very large number of
1978 // runtime checks needs to be generated.
1979 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1980 // profile info.
1981 CostTooHigh =
1982 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1983 if (CostTooHigh)
1984 return;
1986 BasicBlock *LoopHeader = L->getHeader();
1987 BasicBlock *Preheader = L->getLoopPreheader();
1989 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1990 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1991 // may be used by SCEVExpander. The blocks will be un-linked from their
1992 // predecessors and removed from LI & DT at the end of the function.
1993 if (!UnionPred.isAlwaysTrue()) {
1994 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1995 nullptr, "vector.scevcheck");
1997 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1998 &UnionPred, SCEVCheckBlock->getTerminator());
2001 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2002 if (RtPtrChecking.Need) {
2003 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2004 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2005 "vector.memcheck");
2007 auto DiffChecks = RtPtrChecking.getDiffChecks();
2008 if (DiffChecks) {
2009 Value *RuntimeVF = nullptr;
2010 MemRuntimeCheckCond = addDiffRuntimeChecks(
2011 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
2012 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
2013 if (!RuntimeVF)
2014 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
2015 return RuntimeVF;
2017 IC);
2018 } else {
2019 MemRuntimeCheckCond = addRuntimeChecks(
2020 MemCheckBlock->getTerminator(), L, RtPtrChecking.getChecks(),
2021 MemCheckExp, VectorizerParams::HoistRuntimeChecks);
2023 assert(MemRuntimeCheckCond &&
2024 "no RT checks generated although RtPtrChecking "
2025 "claimed checks are required");
2028 if (!MemCheckBlock && !SCEVCheckBlock)
2029 return;
2031 // Unhook the temporary blocks containing the checks and update various
2032 // places accordingly.
2033 if (SCEVCheckBlock)
2034 SCEVCheckBlock->replaceAllUsesWith(Preheader);
2035 if (MemCheckBlock)
2036 MemCheckBlock->replaceAllUsesWith(Preheader);
2038 if (SCEVCheckBlock) {
2039 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2040 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2041 Preheader->getTerminator()->eraseFromParent();
2043 if (MemCheckBlock) {
2044 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2045 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2046 Preheader->getTerminator()->eraseFromParent();
2049 DT->changeImmediateDominator(LoopHeader, Preheader);
2050 if (MemCheckBlock) {
2051 DT->eraseNode(MemCheckBlock);
2052 LI->removeBlock(MemCheckBlock);
2054 if (SCEVCheckBlock) {
2055 DT->eraseNode(SCEVCheckBlock);
2056 LI->removeBlock(SCEVCheckBlock);
2059 // Outer loop is used as part of the later cost calculations.
2060 OuterLoop = L->getParentLoop();
2063 InstructionCost getCost() {
2064 if (SCEVCheckBlock || MemCheckBlock)
2065 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2067 if (CostTooHigh) {
2068 InstructionCost Cost;
2069 Cost.setInvalid();
2070 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2071 return Cost;
2074 InstructionCost RTCheckCost = 0;
2075 if (SCEVCheckBlock)
2076 for (Instruction &I : *SCEVCheckBlock) {
2077 if (SCEVCheckBlock->getTerminator() == &I)
2078 continue;
2079 InstructionCost C =
2080 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2081 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2082 RTCheckCost += C;
2084 if (MemCheckBlock) {
2085 InstructionCost MemCheckCost = 0;
2086 for (Instruction &I : *MemCheckBlock) {
2087 if (MemCheckBlock->getTerminator() == &I)
2088 continue;
2089 InstructionCost C =
2090 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2091 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2092 MemCheckCost += C;
2095 // If the runtime memory checks are being created inside an outer loop
2096 // we should find out if these checks are outer loop invariant. If so,
2097 // the checks will likely be hoisted out, and so the effective cost will be
2098 // reduced according to the outer loop trip count.
2099 if (OuterLoop) {
2100 ScalarEvolution *SE = MemCheckExp.getSE();
2101 // TODO: If profitable, we could refine this further by analysing every
2102 // individual memory check, since there could be a mixture of loop
2103 // variant and invariant checks that mean the final condition is
2104 // variant.
2105 const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2106 if (SE->isLoopInvariant(Cond, OuterLoop)) {
2107 // It seems reasonable to assume that we can reduce the effective
2108 // cost of the checks even when we know nothing about the trip
2109 // count. Assume that the outer loop executes at least twice.
2110 unsigned BestTripCount = 2;
2112 // If exact trip count is known use that.
2113 if (unsigned SmallTC = SE->getSmallConstantTripCount(OuterLoop))
2114 BestTripCount = SmallTC;
2115 else if (LoopVectorizeWithBlockFrequency) {
2116 // Else use profile data if available.
2117 if (auto EstimatedTC = getLoopEstimatedTripCount(OuterLoop))
2118 BestTripCount = *EstimatedTC;
2121 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2123 // Let's ensure the cost is always at least 1.
2124 NewMemCheckCost = std::max(*NewMemCheckCost.getValue(),
2125 (InstructionCost::CostType)1);
2127 LLVM_DEBUG(dbgs()
2128 << "We expect runtime memory checks to be hoisted "
2129 << "out of the outer loop. Cost reduced from "
2130 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2132 MemCheckCost = NewMemCheckCost;
2136 RTCheckCost += MemCheckCost;
2139 if (SCEVCheckBlock || MemCheckBlock)
2140 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2141 << "\n");
2143 return RTCheckCost;
2146 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2147 /// unused.
2148 ~GeneratedRTChecks() {
2149 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2150 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2151 if (!SCEVCheckCond)
2152 SCEVCleaner.markResultUsed();
2154 if (!MemRuntimeCheckCond)
2155 MemCheckCleaner.markResultUsed();
2157 if (MemRuntimeCheckCond) {
2158 auto &SE = *MemCheckExp.getSE();
2159 // Memory runtime check generation creates compares that use expanded
2160 // values. Remove them before running the SCEVExpanderCleaners.
2161 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2162 if (MemCheckExp.isInsertedInstruction(&I))
2163 continue;
2164 SE.forgetValue(&I);
2165 I.eraseFromParent();
2168 MemCheckCleaner.cleanup();
2169 SCEVCleaner.cleanup();
2171 if (SCEVCheckCond)
2172 SCEVCheckBlock->eraseFromParent();
2173 if (MemRuntimeCheckCond)
2174 MemCheckBlock->eraseFromParent();
2177 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2178 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2179 /// depending on the generated condition.
2180 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2181 BasicBlock *LoopVectorPreHeader,
2182 BasicBlock *LoopExitBlock) {
2183 if (!SCEVCheckCond)
2184 return nullptr;
2186 Value *Cond = SCEVCheckCond;
2187 // Mark the check as used, to prevent it from being removed during cleanup.
2188 SCEVCheckCond = nullptr;
2189 if (auto *C = dyn_cast<ConstantInt>(Cond))
2190 if (C->isZero())
2191 return nullptr;
2193 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2195 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2196 // Create new preheader for vector loop.
2197 if (OuterLoop)
2198 OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2200 SCEVCheckBlock->getTerminator()->eraseFromParent();
2201 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2202 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2203 SCEVCheckBlock);
2205 DT->addNewBlock(SCEVCheckBlock, Pred);
2206 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2208 BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2209 if (AddBranchWeights)
2210 setBranchWeights(BI, SCEVCheckBypassWeights);
2211 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
2212 return SCEVCheckBlock;
2215 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2216 /// the branches to branch to the vector preheader or \p Bypass, depending on
2217 /// the generated condition.
2218 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2219 BasicBlock *LoopVectorPreHeader) {
2220 // Check if we generated code that checks at runtime whether arrays overlap.
2221 if (!MemRuntimeCheckCond)
2222 return nullptr;
2224 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2225 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2226 MemCheckBlock);
2228 DT->addNewBlock(MemCheckBlock, Pred);
2229 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2230 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2232 if (OuterLoop)
2233 OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
2235 BranchInst &BI =
2236 *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2237 if (AddBranchWeights) {
2238 setBranchWeights(BI, MemCheckBypassWeights);
2240 ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
2241 MemCheckBlock->getTerminator()->setDebugLoc(
2242 Pred->getTerminator()->getDebugLoc());
2244 // Mark the check as used, to prevent it from being removed during cleanup.
2245 MemRuntimeCheckCond = nullptr;
2246 return MemCheckBlock;
2249 } // namespace
2251 static bool useActiveLaneMask(TailFoldingStyle Style) {
2252 return Style == TailFoldingStyle::Data ||
2253 Style == TailFoldingStyle::DataAndControlFlow ||
2254 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2257 static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2258 return Style == TailFoldingStyle::DataAndControlFlow ||
2259 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2262 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2263 // vectorization. The loop needs to be annotated with #pragma omp simd
2264 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2265 // vector length information is not provided, vectorization is not considered
2266 // explicit. Interleave hints are not allowed either. These limitations will be
2267 // relaxed in the future.
2268 // Please note that we are currently forced to abuse the pragma 'clang
2269 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2270 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2271 // provides *explicit vectorization hints* (LV can bypass legal checks and
2272 // assume that vectorization is legal). However, both hints are implemented
2273 // using the same metadata (llvm.loop.vectorize, processed by
2274 // LoopVectorizeHints). This will be fixed in the future when the native IR
2275 // representation for pragma 'omp simd' is introduced.
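// For example, a loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// or '#pragma omp simd simdlen(4)' qualifies as an explicitly vectorized
// outer loop here (assuming no interleave hint is present).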
2276 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2277 OptimizationRemarkEmitter *ORE) {
2278 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2279 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2281 // Only outer loops with an explicit vectorization hint are supported.
2282 // Unannotated outer loops are ignored.
2283 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2284 return false;
2286 Function *Fn = OuterLp->getHeader()->getParent();
2287 if (!Hints.allowVectorization(Fn, OuterLp,
2288 true /*VectorizeOnlyWhenForced*/)) {
2289 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2290 return false;
2293 if (Hints.getInterleave() > 1) {
2294 // TODO: Interleave support is future work.
2295 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2296 "outer loops.\n");
2297 Hints.emitRemarkWithHints();
2298 return false;
2301 return true;
2304 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2305 OptimizationRemarkEmitter *ORE,
2306 SmallVectorImpl<Loop *> &V) {
2307 // Collect inner loops and outer loops without irreducible control flow. For
2308 // now, only collect outer loops that have explicit vectorization hints. If we
2309 // are stress testing the VPlan H-CFG construction, we collect the outermost
2310 // loop of every loop nest.
2311 if (L.isInnermost() || VPlanBuildStressTest ||
2312 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2313 LoopBlocksRPO RPOT(&L);
2314 RPOT.perform(LI);
2315 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2316 V.push_back(&L);
2317 // TODO: Collect inner loops inside marked outer loops in case
2318 // vectorization fails for the outer loop. Do not invoke
2319 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2320 // already known to be reducible. We can use an inherited attribute for
2321 // that.
2322 return;
2325 for (Loop *InnerL : L)
2326 collectSupportedLoops(*InnerL, LI, ORE, V);
2329 //===----------------------------------------------------------------------===//
2330 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
2331 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2332 //===----------------------------------------------------------------------===//
2334 /// Compute the transformed value of Index at offset StartValue using step
2335 /// StepValue.
2336 /// For integer induction, returns StartValue + Index * StepValue.
2337 /// For pointer induction, returns StartValue[Index * StepValue].
2338 /// FIXME: The newly created binary instructions should contain nsw/nuw
2339 /// flags, which can be found from the original scalar operations.
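/// For example, for an integer induction with StartValue %start and Step 4,
/// Index %i is transformed into '%start + %i * 4'; for a pointer induction
/// the result is a pointer add of StartValue and Index * StepValue.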
2340 static Value *
2341 emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2342 Value *Step,
2343 InductionDescriptor::InductionKind InductionKind,
2344 const BinaryOperator *InductionBinOp) {
2345 Type *StepTy = Step->getType();
2346 Value *CastedIndex = StepTy->isIntegerTy()
2347 ? B.CreateSExtOrTrunc(Index, StepTy)
2348 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2349 if (CastedIndex != Index) {
2350 CastedIndex->setName(CastedIndex->getName() + ".cast");
2351 Index = CastedIndex;
2354 // Note: the IR at this point is broken. We cannot use SE to create any new
2355 // SCEV and then expand it, hoping that SCEV's simplification will give us
2356 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2357 // lead to various SCEV crashes. So all we can do is use the builder and rely
2358 // on InstCombine for future simplifications. Here we handle some trivial
2359 // cases only.
2360 auto CreateAdd = [&B](Value *X, Value *Y) {
2361 assert(X->getType() == Y->getType() && "Types don't match!");
2362 if (auto *CX = dyn_cast<ConstantInt>(X))
2363 if (CX->isZero())
2364 return Y;
2365 if (auto *CY = dyn_cast<ConstantInt>(Y))
2366 if (CY->isZero())
2367 return X;
2368 return B.CreateAdd(X, Y);
2371 // We allow X to be a vector type, in which case Y will potentially be
2372 // splatted into a vector with the same element count.
2373 auto CreateMul = [&B](Value *X, Value *Y) {
2374 assert(X->getType()->getScalarType() == Y->getType() &&
2375 "Types don't match!");
2376 if (auto *CX = dyn_cast<ConstantInt>(X))
2377 if (CX->isOne())
2378 return Y;
2379 if (auto *CY = dyn_cast<ConstantInt>(Y))
2380 if (CY->isOne())
2381 return X;
2382 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2383 if (XVTy && !isa<VectorType>(Y->getType()))
2384 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2385 return B.CreateMul(X, Y);
2388 switch (InductionKind) {
2389 case InductionDescriptor::IK_IntInduction: {
2390 assert(!isa<VectorType>(Index->getType()) &&
2391 "Vector indices not supported for integer inductions yet");
2392 assert(Index->getType() == StartValue->getType() &&
2393 "Index type does not match StartValue type");
2394 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2395 return B.CreateSub(StartValue, Index);
2396 auto *Offset = CreateMul(Index, Step);
2397 return CreateAdd(StartValue, Offset);
2399 case InductionDescriptor::IK_PtrInduction:
2400 return B.CreatePtrAdd(StartValue, CreateMul(Index, Step));
2401 case InductionDescriptor::IK_FpInduction: {
2402 assert(!isa<VectorType>(Index->getType()) &&
2403 "Vector indices not supported for FP inductions yet");
2404 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2405 assert(InductionBinOp &&
2406 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2407 InductionBinOp->getOpcode() == Instruction::FSub) &&
2408 "Original bin op should be defined for FP induction");
2410 Value *MulExp = B.CreateFMul(Step, Index);
2411 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2412 "induction");
2414 case InductionDescriptor::IK_NoInduction:
2415 return nullptr;
2417 llvm_unreachable("invalid enum");
2420 std::optional<unsigned> getMaxVScale(const Function &F,
2421 const TargetTransformInfo &TTI) {
2422 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2423 return MaxVScale;
2425 if (F.hasFnAttribute(Attribute::VScaleRange))
2426 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2428 return std::nullopt;
2431 /// For the given VF and UF and maximum trip count computed for the loop, return
2432 /// whether the induction variable might overflow in the vectorized loop. If not,
2433 /// then we know a runtime overflow check always evaluates to false and can be
2434 /// removed.
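/// For example, with an i32 widest induction type, a constant max trip count
/// of 1000 and VF * UF == 16, the wide induction variable cannot wrap because
/// (2^32 - 1) - 1000 > 16, so the function returns true.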
2435 static bool isIndvarOverflowCheckKnownFalse(
2436 const LoopVectorizationCostModel *Cost,
2437 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2438 // Always be conservative if we don't know the exact unroll factor.
2439 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2441 Type *IdxTy = Cost->Legal->getWidestInductionType();
2442 APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();
2444 // The runtime overflow check is known to be false iff the (max) trip-count
2445 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2446 // the vector loop induction variable.
2447 if (unsigned TC =
2448 Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
2449 uint64_t MaxVF = VF.getKnownMinValue();
2450 if (VF.isScalable()) {
2451 std::optional<unsigned> MaxVScale =
2452 getMaxVScale(*Cost->TheFunction, Cost->TTI);
2453 if (!MaxVScale)
2454 return false;
2455 MaxVF *= *MaxVScale;
2458 return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
2461 return false;
2464 // Return whether we allow using masked interleave-groups (for dealing with
2465 // strided loads/stores that reside in predicated blocks, or for dealing
2466 // with gaps).
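// For example, an interleave group with a gap (a missing member) may need a
// mask so that the wide access does not touch memory that the original scalar
// loop never accessed.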
2467 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2468 // If an override option has been passed in for interleaved accesses, use it.
2469 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2470 return EnableMaskedInterleavedMemAccesses;
2472 return TTI.enableMaskedInterleavedAccessVectorization();
2475 // Try to vectorize the interleave group that \p Instr belongs to.
2477 // E.g. Translate following interleaved load group (factor = 3):
2478 // for (i = 0; i < N; i+=3) {
2479 // R = Pic[i]; // Member of index 0
2480 // G = Pic[i+1]; // Member of index 1
2481 // B = Pic[i+2]; // Member of index 2
2482 // ... // do something to R, G, B
2483 // }
2484 // To:
2485 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2486 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2487 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2488 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2490 // Or translate following interleaved store group (factor = 3):
2491 // for (i = 0; i < N; i+=3) {
2492 // ... do something to R, G, B
2493 // Pic[i] = R; // Member of index 0
2494 // Pic[i+1] = G; // Member of index 1
2495 // Pic[i+2] = B; // Member of index 2
2496 // }
2497 // To:
2498 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2499 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2500 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2501 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2502 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2503 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2504 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2505 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2506 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2507 Instruction *Instr = Group->getInsertPos();
2508 const DataLayout &DL = Instr->getModule()->getDataLayout();
2510 // Prepare for the vector type of the interleaved load/store.
2511 Type *ScalarTy = getLoadStoreType(Instr);
2512 unsigned InterleaveFactor = Group->getFactor();
2513 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2515 // Prepare for the new pointers.
2516 SmallVector<Value *, 2> AddrParts;
2517 unsigned Index = Group->getIndex(Instr);
2519 // TODO: extend the masked interleaved-group support to reversed access.
2520 assert((!BlockInMask || !Group->isReverse()) &&
2521 "Reversed masked interleave-group not supported.");
2523 Value *Idx;
2524 // If the group is reverse, adjust the index to refer to the last vector lane
2525 // instead of the first. We adjust the index from the first vector lane,
2526 // rather than directly getting the pointer for lane VF - 1, because the
2527 // pointer operand of the interleaved access is supposed to be uniform. For
2528 // uniform instructions, we're only required to generate a value for the
2529 // first vector lane in each unroll iteration.
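// For example, with VF = 4, an interleave factor of 2 and member index 1, the
// reverse case below computes Idx = -((4 - 1) * 2 + 1) = -7, while the
// forward case simply uses -1 to step back to the member of index 0.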
2530 if (Group->isReverse()) {
2531 Value *RuntimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2532 Idx = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
2533 Idx = Builder.CreateMul(Idx, Builder.getInt32(Group->getFactor()));
2534 Idx = Builder.CreateAdd(Idx, Builder.getInt32(Index));
2535 Idx = Builder.CreateNeg(Idx);
2536 } else
2537 Idx = Builder.getInt32(-Index);
2539 for (unsigned Part = 0; Part < UF; Part++) {
2540 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2541 if (auto *I = dyn_cast<Instruction>(AddrPart))
2542 State.setDebugLocFrom(I->getDebugLoc());
2544 // Note that the current instruction could be at any member index. We need to
2545 // adjust the address to that of the member at index 0.
2547 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2548 // b = A[i]; // Member of index 0
2549 // The current pointer points to A[i+1]; adjust it to A[i].
2551 // E.g. A[i+1] = a; // Member of index 1
2552 // A[i] = b; // Member of index 0
2553 // A[i+2] = c; // Member of index 2 (Current instruction)
2554 // The current pointer points to A[i+2]; adjust it to A[i].
2556 bool InBounds = false;
2557 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2558 InBounds = gep->isInBounds();
2559 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Idx, "", InBounds);
2560 AddrParts.push_back(AddrPart);
2563 State.setDebugLocFrom(Instr->getDebugLoc());
2564 Value *PoisonVec = PoisonValue::get(VecTy);
2566 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2567 unsigned Part, Value *MaskForGaps) -> Value * {
2568 if (VF.isScalable()) {
2569 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2570 assert(InterleaveFactor == 2 &&
2571 "Unsupported deinterleave factor for scalable vectors");
2572 auto *BlockInMaskPart = State.get(BlockInMask, Part);
2573 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2574 auto *MaskTy =
2575 VectorType::get(Builder.getInt1Ty(), VF.getKnownMinValue() * 2, true);
2576 return Builder.CreateIntrinsic(
2577 MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2578 /*FMFSource=*/nullptr, "interleaved.mask");
2581 if (!BlockInMask)
2582 return MaskForGaps;
2584 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2585 Value *ShuffledMask = Builder.CreateShuffleVector(
2586 BlockInMaskPart,
2587 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2588 "interleaved.mask");
2589 return MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2590 MaskForGaps)
2591 : ShuffledMask;
2594 // Vectorize the interleaved load group.
2595 if (isa<LoadInst>(Instr)) {
2596 Value *MaskForGaps = nullptr;
2597 if (NeedsMaskForGaps) {
2598 MaskForGaps =
2599 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2600 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2603 // For each unroll part, create a wide load for the group.
2604 SmallVector<Value *, 2> NewLoads;
2605 for (unsigned Part = 0; Part < UF; Part++) {
2606 Instruction *NewLoad;
2607 if (BlockInMask || MaskForGaps) {
2608 assert(useMaskedInterleavedAccesses(*TTI) &&
2609 "masked interleaved groups are not allowed.");
2610 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2611 NewLoad =
2612 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2613 GroupMask, PoisonVec, "wide.masked.vec");
2615 else
2616 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2617 Group->getAlign(), "wide.vec");
2618 Group->addMetadata(NewLoad);
2619 NewLoads.push_back(NewLoad);
2622 if (VecTy->isScalableTy()) {
2623 assert(InterleaveFactor == 2 &&
2624 "Unsupported deinterleave factor for scalable vectors");
2626 for (unsigned Part = 0; Part < UF; ++Part) {
2627 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2628 // so we must use intrinsics to deinterleave.
2629 Value *DI = Builder.CreateIntrinsic(
2630 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2631 /*FMFSource=*/nullptr, "strided.vec");
2632 unsigned J = 0;
2633 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2634 Instruction *Member = Group->getMember(I);
2636 if (!Member)
2637 continue;
2639 Value *StridedVec = Builder.CreateExtractValue(DI, I);
2640 // If this member has a different type, cast the result to it.
2641 if (Member->getType() != ScalarTy) {
2642 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2643 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2646 if (Group->isReverse())
2647 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2649 State.set(VPDefs[J], StridedVec, Part);
2650 ++J;
2654 return;
2657 // For each member in the group, shuffle out the appropriate data from the
2658 // wide loads.
2659 unsigned J = 0;
2660 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2661 Instruction *Member = Group->getMember(I);
2663 // Skip the gaps in the group.
2664 if (!Member)
2665 continue;
2667 auto StrideMask =
2668 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2669 for (unsigned Part = 0; Part < UF; Part++) {
2670 Value *StridedVec = Builder.CreateShuffleVector(
2671 NewLoads[Part], StrideMask, "strided.vec");
2673 // If this member has a different type, cast the result to it.
2674 if (Member->getType() != ScalarTy) {
2675 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2676 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2677 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2680 if (Group->isReverse())
2681 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2683 State.set(VPDefs[J], StridedVec, Part);
2685 ++J;
2687 return;
2690 // The subvector type for the current instruction.
2691 auto *SubVT = VectorType::get(ScalarTy, VF);
2693 // Vectorize the interleaved store group.
2694 Value *MaskForGaps =
2695 createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2696 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2697 "masked interleaved groups are not allowed.");
2698 assert((!MaskForGaps || !VF.isScalable()) &&
2699 "masking gaps for scalable vectors is not yet supported.");
2700 for (unsigned Part = 0; Part < UF; Part++) {
2701 // Collect the stored vector from each member.
2702 SmallVector<Value *, 4> StoredVecs;
2703 unsigned StoredIdx = 0;
2704 for (unsigned i = 0; i < InterleaveFactor; i++) {
2705 assert((Group->getMember(i) || MaskForGaps) &&
2706 "Fail to get a member from an interleaved store group");
2707 Instruction *Member = Group->getMember(i);
2709 // Skip the gaps in the group.
2710 if (!Member) {
2711 Value *Undef = PoisonValue::get(SubVT);
2712 StoredVecs.push_back(Undef);
2713 continue;
2716 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2717 ++StoredIdx;
2719 if (Group->isReverse())
2720 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2722 // If this member has a different type, cast it to a unified type.
2724 if (StoredVec->getType() != SubVT)
2725 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2727 StoredVecs.push_back(StoredVec);
2730 // Interleave all the smaller vectors into one wider vector.
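// e.g. (illustrative) for a factor-2 store group with VF = 4, interleaving
// <a0 a1 a2 a3> and <b0 b1 b2 b3> produces <a0 b0 a1 b1 a2 b2 a3 b3>, which
// is then written by a single wide (possibly masked) store below.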
2731 Value *IVec = interleaveVectors(Builder, StoredVecs, "interleaved.vec");
2732 Instruction *NewStoreInstr;
2733 if (BlockInMask || MaskForGaps) {
2734 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2735 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2736 Group->getAlign(), GroupMask);
2737 } else
2738 NewStoreInstr =
2739 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2741 Group->addMetadata(NewStoreInstr);
2745 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2746 VPReplicateRecipe *RepRecipe,
2747 const VPIteration &Instance,
2748 VPTransformState &State) {
2749 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2751 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2752 // the first lane and part.
2753 if (isa<NoAliasScopeDeclInst>(Instr))
2754 if (!Instance.isFirstIteration())
2755 return;
2757 // Does this instruction return a value?
2758 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2760 Instruction *Cloned = Instr->clone();
2761 if (!IsVoidRetTy) {
2762 Cloned->setName(Instr->getName() + ".cloned");
2763 #if !defined(NDEBUG)
2764 // Verify that VPlan type inference results agree with the type of the
2765 // generated values.
2766 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2767 "inferred type and type from generated instructions do not match");
2768 #endif
2771 RepRecipe->setFlags(Cloned);
2773 if (auto DL = Instr->getDebugLoc())
2774 State.setDebugLocFrom(DL);
2776 // Replace the operands of the cloned instructions with their scalar
2777 // equivalents in the new loop.
2778 for (const auto &I : enumerate(RepRecipe->operands())) {
2779 auto InputInstance = Instance;
2780 VPValue *Operand = I.value();
2781 if (vputils::isUniformAfterVectorization(Operand))
2782 InputInstance.Lane = VPLane::getFirstLane();
2783 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2785 State.addNewMetadata(Cloned, Instr);
2787 // Place the cloned scalar in the new loop.
2788 State.Builder.Insert(Cloned);
2790 State.set(RepRecipe, Cloned, Instance);
2792 // If we just cloned a new assumption, add it to the assumption cache.
2793 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2794 AC->registerAssumption(II);
2796 // End if-block.
2797 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2798 if (IfPredicateInstr)
2799 PredicatedInstructions.push_back(Cloned);
2802 Value *
2803 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2804 if (VectorTripCount)
2805 return VectorTripCount;
2807 Value *TC = getTripCount();
2808 IRBuilder<> Builder(InsertBlock->getTerminator());
2810 Type *Ty = TC->getType();
2811 // This is where we can make the step a runtime constant.
2812 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2814 // If the tail is to be folded by masking, round the number of iterations N
2815 // up to a multiple of Step instead of rounding down. This is done by first
2816 // adding Step-1 and then rounding down. Note that it's ok if this addition
2817 // overflows: the vector induction variable will eventually wrap to zero given
2818 // that it starts at zero and its Step is a power of two; the loop will then
2819 // exit, with the last early-exit vector comparison also producing all-true.
2820 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2821 // is accounted for in emitIterationCountCheck that adds an overflow check.
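// Worked example (illustrative): with Step = VF * UF = 8 and N = 13, tail
// folding computes n.rnd.up = 13 + 7 = 20, so the n.vec computed below is
// 20 - (20 % 8) = 16, i.e. 13 rounded up to the next multiple of 8.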
2822 if (Cost->foldTailByMasking()) {
2823 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2824 "VF*UF must be a power of 2 when folding tail by masking");
2825 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2826 TC = Builder.CreateAdd(
2827 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2830 // Now we need to generate the expression for the part of the loop that the
2831 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2832 // iterations are not required for correctness, or N - Step, otherwise. Step
2833 // is equal to the vectorization factor (number of SIMD elements) times the
2834 // unroll factor (number of SIMD instructions).
2835 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2837 // There are cases where we *must* run at least one iteration in the remainder
2838 // loop. See the cost model for when this can happen. If the step evenly
2839 // divides the trip count, we set the remainder to be equal to the step. If
2840 // the step does not evenly divide the trip count, no adjustment is necessary
2841 // since there will already be scalar iterations. Note that the minimum
2842 // iterations check ensures that N >= Step.
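// e.g. (illustrative) with N = 20 and Step = 8: R = 4 and n.vec = 16, leaving
// 4 scalar iterations. If a scalar epilogue is required and N = 16, R would be
// 0, so it is bumped up to Step and n.vec = 8, leaving a full 8 iterations for
// the scalar loop.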
2843 if (Cost->requiresScalarEpilogue(VF.isVector())) {
2844 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2845 R = Builder.CreateSelect(IsZero, Step, R);
2848 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2850 return VectorTripCount;
2853 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2854 const DataLayout &DL) {
2855 // Verify that V is a vector type with the same number of elements as DstVTy.
2856 auto *DstFVTy = cast<VectorType>(DstVTy);
2857 auto VF = DstFVTy->getElementCount();
2858 auto *SrcVecTy = cast<VectorType>(V->getType());
2859 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2860 Type *SrcElemTy = SrcVecTy->getElementType();
2861 Type *DstElemTy = DstFVTy->getElementType();
2862 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2863 "Vector elements must have same size");
2865 // Do a direct cast if element types are castable.
2866 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2867 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2869 // V cannot be directly cast to the desired vector type.
2870 // May happen when V is a floating point vector but DstVTy is a vector of
2871 // pointers or vice-versa. Handle this using a two-step bitcast using an
2872 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
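// Sketch (illustrative, assuming 64-bit pointers): casting <4 x double> to
// <4 x ptr> is emitted as a bitcast to <4 x i64> followed by an inttoptr to
// <4 x ptr>; the reverse direction uses ptrtoint followed by a bitcast.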
2873 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2874 "Only one type should be a pointer type");
2875 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2876 "Only one type should be a floating point type");
2877 Type *IntTy =
2878 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2879 auto *VecIntTy = VectorType::get(IntTy, VF);
2880 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2881 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2884 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2885 Value *Count = getTripCount();
2886 // Reuse existing vector loop preheader for TC checks.
2887 // Note that a new preheader block is generated for the vector loop.
2888 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2889 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2891 // Generate code to check if the loop's trip count is less than VF * UF, or
2892 // equal to it in case a scalar epilogue is required; this implies that the
2893 // vector trip count is zero. This check also covers the case where adding one
2894 // to the backedge-taken count overflowed, leading to an incorrect trip count
2895 // of zero. In this case we will also jump to the scalar loop.
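// e.g. (illustrative) with VF = 4, UF = 2 and no tail folding, the bypass
// below is taken when Count < 8 (ICMP_ULT), or when Count <= 8 (ICMP_ULE) if a
// scalar epilogue is required; MinProfitableTripCount may raise the threshold
// further.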
2896 auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2897 : ICmpInst::ICMP_ULT;
2899 // If tail is to be folded, vector loop takes care of all iterations.
2900 Type *CountTy = Count->getType();
2901 Value *CheckMinIters = Builder.getFalse();
2902 auto CreateStep = [&]() -> Value * {
2903 // Create step with max(MinProfitableTripCount, UF * VF).
2904 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2905 return createStepForVF(Builder, CountTy, VF, UF);
2907 Value *MinProfTC =
2908 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2909 if (!VF.isScalable())
2910 return MinProfTC;
2911 return Builder.CreateBinaryIntrinsic(
2912 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2915 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2916 if (Style == TailFoldingStyle::None)
2917 CheckMinIters =
2918 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2919 else if (VF.isScalable() &&
2920 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2921 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2922 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2923 // an overflow to zero when updating induction variables and so an
2924 // additional overflow check is required before entering the vector loop.
2926 // Get the maximum unsigned value for the type.
2927 Value *MaxUIntTripCount =
2928 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2929 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2931 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2932 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2935 // Create new preheader for vector loop.
2936 LoopVectorPreHeader =
2937 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2938 "vector.ph");
2940 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2941 DT->getNode(Bypass)->getIDom()) &&
2942 "TC check is expected to dominate Bypass");
2944 // Update dominator for Bypass & LoopExit (if needed).
2945 DT->changeImmediateDominator(Bypass, TCCheckBlock);
2946 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2947 // If there is an epilogue which must run, there's no edge from the
2948 // middle block to exit blocks and thus no need to update the immediate
2949 // dominator of the exit blocks.
2950 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2952 BranchInst &BI =
2953 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2954 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2955 setBranchWeights(BI, MinItersBypassWeights);
2956 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
2957 LoopBypassBlocks.push_back(TCCheckBlock);
2960 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2961 BasicBlock *const SCEVCheckBlock =
2962 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2963 if (!SCEVCheckBlock)
2964 return nullptr;
2966 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2967 (OptForSizeBasedOnProfile &&
2968 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2969 "Cannot SCEV check stride or overflow when optimizing for size");
2972 // Update dominator only if this is the first RT check.
2973 if (LoopBypassBlocks.empty()) {
2974 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2975 if (!Cost->requiresScalarEpilogue(VF.isVector()))
2976 // If there is an epilogue which must run, there's no edge from the
2977 // middle block to exit blocks and thus no need to update the immediate
2978 // dominator of the exit blocks.
2979 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2982 LoopBypassBlocks.push_back(SCEVCheckBlock);
2983 AddedSafetyChecks = true;
2984 return SCEVCheckBlock;
2987 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2988 // VPlan-native path does not do any analysis for runtime checks currently.
2989 if (EnableVPlanNativePath)
2990 return nullptr;
2992 BasicBlock *const MemCheckBlock =
2993 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2995 // Check if we generated code that checks at runtime whether arrays overlap. We put
2996 // the checks into a separate block to make the more common case of few
2997 // elements faster.
2998 if (!MemCheckBlock)
2999 return nullptr;
3001 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3002 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3003 "Cannot emit memory checks when optimizing for size, unless forced "
3004 "to vectorize.");
3005 ORE->emit([&]() {
3006 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3007 OrigLoop->getStartLoc(),
3008 OrigLoop->getHeader())
3009 << "Code-size may be reduced by not forcing "
3010 "vectorization, or by source-code modifications "
3011 "eliminating the need for runtime checks "
3012 "(e.g., adding 'restrict').";
3016 LoopBypassBlocks.push_back(MemCheckBlock);
3018 AddedSafetyChecks = true;
3020 return MemCheckBlock;
3023 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3024 LoopScalarBody = OrigLoop->getHeader();
3025 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3026 assert(LoopVectorPreHeader && "Invalid loop structure");
3027 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3028 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
3029 "multiple exit loop without required epilogue?");
3031 LoopMiddleBlock =
3032 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3033 LI, nullptr, Twine(Prefix) + "middle.block");
3034 LoopScalarPreHeader =
3035 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3036 nullptr, Twine(Prefix) + "scalar.ph");
3038 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3040 // Set up the middle block terminator. Two cases:
3041 // 1) If we know that we must execute the scalar epilogue, emit an
3042 // unconditional branch.
3043 // 2) Otherwise, we must have a single unique exit block (due to how we
3044 // implement the multiple exit case). In this case, set up a conditional
3045 // branch from the middle block to the loop scalar preheader, and the
3046 // exit block. completeLoopSkeleton will update the condition to use an
3047 // iteration check, if required to decide whether to execute the remainder.
3048 BranchInst *BrInst =
3049 Cost->requiresScalarEpilogue(VF.isVector())
3050 ? BranchInst::Create(LoopScalarPreHeader)
3051 : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3052 Builder.getTrue());
3053 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3054 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3056 // Update dominator for loop exit. During skeleton creation, only the vector
3057 // pre-header and the middle block are created. The vector loop is entirely
3058 // created during VPlan execution.
3059 if (!Cost->requiresScalarEpilogue(VF.isVector()))
3060 // If there is an epilogue which must run, there's no edge from the
3061 // middle block to exit blocks and thus no need to update the immediate
3062 // dominator of the exit blocks.
3063 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3066 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3067 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3068 ArrayRef<BasicBlock *> BypassBlocks,
3069 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3070 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3071 assert(VectorTripCount && "Expected valid arguments");
3073 Instruction *OldInduction = Legal->getPrimaryInduction();
3074 Value *&EndValue = IVEndValues[OrigPhi];
3075 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3076 if (OrigPhi == OldInduction) {
3077 // We know what the end value is.
3078 EndValue = VectorTripCount;
3079 } else {
3080 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3082 // Fast-math-flags propagate from the original induction instruction.
3083 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3084 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3086 EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
3087 Step, II.getKind(), II.getInductionBinOp());
3088 EndValue->setName("ind.end");
3090 // Compute the end value for the additional bypass (if applicable).
3091 if (AdditionalBypass.first) {
3092 B.SetInsertPoint(AdditionalBypass.first,
3093 AdditionalBypass.first->getFirstInsertionPt());
3094 EndValueFromAdditionalBypass =
3095 emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
3096 Step, II.getKind(), II.getInductionBinOp());
3097 EndValueFromAdditionalBypass->setName("ind.end");
3101 // Create phi nodes to merge from the backedge-taken check block.
3102 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3103 LoopScalarPreHeader->getTerminator());
3104 // Copy original phi DL over to the new one.
3105 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3107 // The new PHI merges the original incoming value, in case of a bypass,
3108 // or the value at the end of the vectorized loop.
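// Shorthand of the IR this produces (illustrative; block names are
// placeholders and vary per loop):
//   scalar.ph:
//     %bc.resume.val = phi i64 [ %ind.end, %middle.block ],
//                              [ %start, %min.iters.check.block ], ...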
3109 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3111 // Fix the scalar body counter (PHI node).
3112 // The old induction's phi node in the scalar body needs the truncated
3113 // value.
3114 for (BasicBlock *BB : BypassBlocks)
3115 BCResumeVal->addIncoming(II.getStartValue(), BB);
3117 if (AdditionalBypass.first)
3118 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3119 EndValueFromAdditionalBypass);
3120 return BCResumeVal;
3123 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3124 /// expansion results.
3125 static Value *getExpandedStep(const InductionDescriptor &ID,
3126 const SCEV2ValueTy &ExpandedSCEVs) {
3127 const SCEV *Step = ID.getStep();
3128 if (auto *C = dyn_cast<SCEVConstant>(Step))
3129 return C->getValue();
3130 if (auto *U = dyn_cast<SCEVUnknown>(Step))
3131 return U->getValue();
3132 auto I = ExpandedSCEVs.find(Step);
3133 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3134 return I->second;
3137 void InnerLoopVectorizer::createInductionResumeValues(
3138 const SCEV2ValueTy &ExpandedSCEVs,
3139 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3140 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3141 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3142 "Inconsistent information about additional bypass.");
3143 // We are going to resume the execution of the scalar loop.
3144 // Go over all of the induction variables that we found and fix the
3145 // PHIs that are left in the scalar version of the loop.
3146 // The starting values of PHI nodes depend on the counter of the last
3147 // iteration in the vectorized loop.
3148 // If we come from a bypass edge then we need to start from the original
3149 // start value.
3150 for (const auto &InductionEntry : Legal->getInductionVars()) {
3151 PHINode *OrigPhi = InductionEntry.first;
3152 const InductionDescriptor &II = InductionEntry.second;
3153 PHINode *BCResumeVal = createInductionResumeValue(
3154 OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
3155 AdditionalBypass);
3156 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3160 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3161 // The trip counts should be cached by now.
3162 Value *Count = getTripCount();
3163 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3165 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3167 // Add a check in the middle block to see if we have completed
3168 // all of the iterations in the first vector loop. Three cases:
3169 // 1) If we require a scalar epilogue, there is no conditional branch as
3170 // we unconditionally branch to the scalar preheader. Do nothing.
3171 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3172 // Thus if tail is to be folded, we know we don't need to run the
3173 // remainder and we can use the previous value for the condition (true).
3174 // 3) Otherwise, construct a runtime check.
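// For case 3 the middle block ends up with something like (illustrative):
//   %cmp.n = icmp eq i64 %N, %n.vec
//   br i1 %cmp.n, label %exit, label %scalar.ph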
3175 if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3176 !Cost->foldTailByMasking()) {
3177 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3178 // of the corresponding compare because they may have ended up with
3179 // different line numbers and we want to avoid awkward line stepping while
3180 // debugging. E.g. if the compare has a line number inside the loop.
3181 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3182 // operands. Perform simplification directly on VPlan once the branch is
3183 // modeled there.
3184 IRBuilder<> B(LoopMiddleBlock->getTerminator());
3185 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3186 Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3187 BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3188 BI.setCondition(CmpN);
3189 if (hasBranchWeightMD(*ScalarLatchTerm)) {
3190 // Assume that `Count % VectorTripCount` is equally distributed.
3191 unsigned TripCount = UF * VF.getKnownMinValue();
3192 assert(TripCount > 0 && "trip count should not be zero");
3193 const uint32_t Weights[] = {1, TripCount - 1};
3194 setBranchWeights(BI, Weights);
3198 #ifdef EXPENSIVE_CHECKS
3199 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3200 #endif
3202 return LoopVectorPreHeader;
3205 std::pair<BasicBlock *, Value *>
3206 InnerLoopVectorizer::createVectorizedLoopSkeleton(
3207 const SCEV2ValueTy &ExpandedSCEVs) {
3209  In this function we generate a new loop. The new loop will contain
3210  the vectorized instructions while the old loop will continue to run the
3211  scalar remainder.
3213       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3214     /  |      preheader are expanded here. Eventually all required SCEV
3215    /   |      expansion should happen here.
3217   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3218   |  /  |
3219   | /   v
3220   ||   [ ]     <-- vector pre header.
3221   |/    |
3223   |    [  ] \
3224   |    [  ]_|   <-- vector loop (created during VPlan execution).
3227   \   -[ ]   <--- middle-block.
3228    \/   |
3229    /\   v
3230   | ->[ ]     <--- new preheader.
3232  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3233   |   [ ] \
3234   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3237      >[ ]     <-- exit block(s).
3241 // Create an empty vector loop, and prepare basic blocks for the runtime
3242 // checks.
3243 createVectorLoopSkeleton("");
3245 // Now, compare the new count to zero. If it is zero, skip the vector loop and
3246 // jump to the scalar loop. This check also covers the case where the
3247 // backedge-taken count is uint##_max: adding one to it will overflow, leading
3248 // to an incorrect trip count of zero. In this (rare) case we will also jump
3249 // to the scalar loop.
3250 emitIterationCountCheck(LoopScalarPreHeader);
3252 // Generate the code to check any assumptions that we've made for SCEV
3253 // expressions.
3254 emitSCEVChecks(LoopScalarPreHeader);
3256 // Generate the code that checks at runtime whether arrays overlap. We put the
3257 // checks into a separate block to make the more common case of few elements
3258 // faster.
3259 emitMemRuntimeChecks(LoopScalarPreHeader);
3261 // Emit phis for the new starting index of the scalar loop.
3262 createInductionResumeValues(ExpandedSCEVs);
3264 return {completeLoopSkeleton(), nullptr};
3267 // Fix up external users of the induction variable. At this point, we are
3268 // in LCSSA form, with all external PHIs that use the IV having one input value,
3269 // coming from the remainder loop. We need those PHIs to also have a correct
3270 // value for the IV when arriving directly from the middle block.
3271 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3272 const InductionDescriptor &II,
3273 Value *VectorTripCount, Value *EndValue,
3274 BasicBlock *MiddleBlock,
3275 BasicBlock *VectorHeader, VPlan &Plan,
3276 VPTransformState &State) {
3277 // There are two kinds of external IV usages - those that use the value
3278 // computed in the last iteration (the PHI) and those that use the penultimate
3279 // value (the value that feeds into the phi from the loop latch).
3280 // We allow both, but they, obviously, have different values.
3282 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3284 DenseMap<Value *, Value *> MissingVals;
3286 // An external user of the last iteration's value should see the value that
3287 // the remainder loop uses to initialize its own IV.
3288 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3289 for (User *U : PostInc->users()) {
3290 Instruction *UI = cast<Instruction>(U);
3291 if (!OrigLoop->contains(UI)) {
3292 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3293 MissingVals[UI] = EndValue;
3297 // An external user of the penultimate value needs to see EndValue - Step.
3298 // The simplest way to get this is to recompute it from the constituent SCEVs,
3299 // that is Start + (Step * (CRD - 1)).
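// e.g. (illustrative) for a canonical IV with Start = 0 and Step = 1, the
// escape value materialized below is 0 + 1 * (%n.vec - 1), i.e. the value the
// phi held during the last iteration covered by the vector loop.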
3300 for (User *U : OrigPhi->users()) {
3301 auto *UI = cast<Instruction>(U);
3302 if (!OrigLoop->contains(UI)) {
3303 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3304 IRBuilder<> B(MiddleBlock->getTerminator());
3306 // Fast-math-flags propagate from the original induction instruction.
3307 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3308 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3310 Value *CountMinusOne = B.CreateSub(
3311 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3312 CountMinusOne->setName("cmo");
3314 VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
3315 assert(StepVPV && "step must have been expanded during VPlan execution");
3316 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3317 : State.get(StepVPV, {0, 0});
3318 Value *Escape =
3319 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step,
3320 II.getKind(), II.getInductionBinOp());
3321 Escape->setName("ind.escape");
3322 MissingVals[UI] = Escape;
3326 for (auto &I : MissingVals) {
3327 PHINode *PHI = cast<PHINode>(I.first);
3328 // One corner case we have to handle is two IVs "chasing" each-other,
3329 // that is %IV2 = phi [...], [ %IV1, %latch ]
3330 // In this case, if IV1 has an external use, we need to avoid adding both
3331 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3332 // don't already have an incoming value for the middle block.
3333 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3334 PHI->addIncoming(I.second, MiddleBlock);
3335 Plan.removeLiveOut(PHI);
3340 namespace {
3342 struct CSEDenseMapInfo {
3343 static bool canHandle(const Instruction *I) {
3344 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3345 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3348 static inline Instruction *getEmptyKey() {
3349 return DenseMapInfo<Instruction *>::getEmptyKey();
3352 static inline Instruction *getTombstoneKey() {
3353 return DenseMapInfo<Instruction *>::getTombstoneKey();
3356 static unsigned getHashValue(const Instruction *I) {
3357 assert(canHandle(I) && "Unknown instruction!");
3358 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3359 I->value_op_end()));
3362 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3363 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3364 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3365 return LHS == RHS;
3366 return LHS->isIdenticalTo(RHS);
3370 } // end anonymous namespace
3372 /// Perform CSE of induction variable instructions.
3373 static void cse(BasicBlock *BB) {
3374 // Perform simple cse.
3375 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3376 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3377 if (!CSEDenseMapInfo::canHandle(&In))
3378 continue;
3380 // Check if we can replace this instruction with any of the
3381 // visited instructions.
3382 if (Instruction *V = CSEMap.lookup(&In)) {
3383 In.replaceAllUsesWith(V);
3384 In.eraseFromParent();
3385 continue;
3388 CSEMap[&In] = &In;
3392 InstructionCost
3393 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3394 ElementCount VF) const {
3395 // We only need to calculate a cost if the VF is scalar; for actual vectors
3396 // we should already have a pre-calculated cost at each VF.
3397 if (!VF.isScalar())
3398 return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
3400 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3401 Type *RetTy = CI->getType();
3402 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
3403 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
3404 return *RedCost;
3406 SmallVector<Type *, 4> Tys;
3407 for (auto &ArgOp : CI->args())
3408 Tys.push_back(ArgOp->getType());
3410 InstructionCost ScalarCallCost =
3411 TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
3413 // If this is an intrinsic we may have a lower cost for it.
3414 if (getVectorIntrinsicIDForCall(CI, TLI)) {
3415 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3416 return std::min(ScalarCallCost, IntrinsicCost);
3418 return ScalarCallCost;
3421 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3422 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3423 return Elt;
3424 return VectorType::get(Elt, VF);
3427 InstructionCost
3428 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3429 ElementCount VF) const {
3430 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3431 assert(ID && "Expected intrinsic call!");
3432 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3433 FastMathFlags FMF;
3434 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3435 FMF = FPMO->getFastMathFlags();
3437 SmallVector<const Value *> Arguments(CI->args());
3438 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3439 SmallVector<Type *> ParamTys;
3440 std::transform(FTy->param_begin(), FTy->param_end(),
3441 std::back_inserter(ParamTys),
3442 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3444 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3445 dyn_cast<IntrinsicInst>(CI));
3446 return TTI.getIntrinsicInstrCost(CostAttrs,
3447 TargetTransformInfo::TCK_RecipThroughput);
3450 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3451 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3452 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3453 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3456 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3457 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3458 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3459 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3462 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3463 VPlan &Plan) {
3464 // Fix widened non-induction PHIs by setting up the PHI operands.
3465 if (EnableVPlanNativePath)
3466 fixNonInductionPHIs(Plan, State);
3468 // At this point every instruction in the original loop is widened to a
3469 // vector form. Now we need to fix the recurrences in the loop. These PHI
3470 // nodes are currently empty because we did not want to introduce cycles.
3471 // This is the second stage of vectorizing recurrences. Note that fixing
3472 // reduction phis are already modeled in VPlan.
3473 // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3474 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3475 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3476 for (VPRecipeBase &R : HeaderVPBB->phis()) {
3477 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3478 fixFixedOrderRecurrence(FOR, State);
3481 // Forget the original basic block.
3482 PSE.getSE()->forgetLoop(OrigLoop);
3483 PSE.getSE()->forgetBlockAndLoopDispositions();
3485 // After vectorization, the exit blocks of the original loop will have
3486 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3487 // looked through single-entry phis.
3488 SmallVector<BasicBlock *> ExitBlocks;
3489 OrigLoop->getExitBlocks(ExitBlocks);
3490 for (BasicBlock *Exit : ExitBlocks)
3491 for (PHINode &PN : Exit->phis())
3492 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
3494 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3495 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3496 if (Cost->requiresScalarEpilogue(VF.isVector())) {
3497 // No edge from the middle block to the unique exit block has been inserted
3498 // and there is nothing to fix from the vector loop; phis should have incoming
3499 // values from the scalar loop only.
3500 } else {
3501 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3502 // the cost model.
3504 // If we inserted an edge from the middle block to the unique exit block,
3505 // update uses outside the loop (phis) to account for the newly inserted
3506 // edge.
3508 // Fix-up external users of the induction variables.
3509 for (const auto &Entry : Legal->getInductionVars())
3510 fixupIVUsers(Entry.first, Entry.second,
3511 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3512 IVEndValues[Entry.first], LoopMiddleBlock,
3513 VectorLoop->getHeader(), Plan, State);
3516 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3517 // in the exit block, so update the builder.
3518 State.Builder.SetInsertPoint(State.CFG.ExitBB,
3519 State.CFG.ExitBB->getFirstNonPHIIt());
3520 for (const auto &KV : Plan.getLiveOuts())
3521 KV.second->fixPhi(Plan, State);
3523 for (Instruction *PI : PredicatedInstructions)
3524 sinkScalarOperands(&*PI);
3526 // Remove redundant induction instructions.
3527 cse(VectorLoop->getHeader());
3529 // Set/update profile weights for the vector and remainder loops as original
3530 // loop iterations are now distributed among them. Note that original loop
3531 // represented by LoopScalarBody becomes remainder loop after vectorization.
3533 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3534 // end up with a slightly less precise result, but that should be OK since the
3535 // profile is not inherently precise anyway. Note also that a possible bypass of
3536 // vector code caused by legality checks is ignored, assigning all the weight
3537 // to the vector loop, optimistically.
3539 // For scalable vectorization we can't know at compile time how many iterations
3540 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3541 // vscale of '1'.
3542 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3543 LI->getLoopFor(LoopScalarBody),
3544 VF.getKnownMinValue() * UF);
3547 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3548 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3549 // This is the second phase of vectorizing first-order recurrences. An
3550 // overview of the transformation is described below. Suppose we have the
3551 // following loop.
3553 // for (int i = 0; i < n; ++i)
3554 // b[i] = a[i] - a[i - 1];
3556 // There is a first-order recurrence on "a". For this loop, the shorthand
3557 // scalar IR looks like:
3559 // scalar.ph:
3560 // s_init = a[-1]
3561 // br scalar.body
3563 // scalar.body:
3564 // i = phi [0, scalar.ph], [i+1, scalar.body]
3565 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3566 // s2 = a[i]
3567 // b[i] = s2 - s1
3568 // br cond, scalar.body, ...
3570 // In this example, s1 is a recurrence because its value depends on the
3571 // previous iteration. In the first phase of vectorization, we created a
3572 // vector phi v1 for s1. We now complete the vectorization and produce the
3573 // shorthand vector IR shown below (for VF = 4, UF = 1).
3575 // vector.ph:
3576 // v_init = vector(..., ..., ..., a[-1])
3577 // br vector.body
3579 // vector.body
3580 // i = phi [0, vector.ph], [i+4, vector.body]
3581 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3582 // v2 = a[i, i+1, i+2, i+3];
3583 // v3 = vector(v1(3), v2(0, 1, 2))
3584 // b[i, i+1, i+2, i+3] = v2 - v3
3585 // br cond, vector.body, middle.block
3587 // middle.block:
3588 // x = v2(3)
3589 // br scalar.ph
3591 // scalar.ph:
3592 // s_init = phi [x, middle.block], [a[-1], otherwise]
3593 // br scalar.body
3595 // After the vector loop completes execution, we extract the next value of
3596 // the recurrence (x) to use as the initial value in the scalar loop.
3598 // Extract the last vector element in the middle block. This will be the
3599 // initial value for the recurrence when jumping to the scalar loop.
3600 VPValue *PreviousDef = PhiR->getBackedgeValue();
3601 Value *Incoming = State.get(PreviousDef, UF - 1);
3602 auto *ExtractForScalar = Incoming;
3603 auto *IdxTy = Builder.getInt32Ty();
3604 Value *RuntimeVF = nullptr;
3605 if (VF.isVector()) {
3606 auto *One = ConstantInt::get(IdxTy, 1);
3607 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3608 RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3609 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3610 ExtractForScalar =
3611 Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
3614 auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
3615 assert(PhiR->getNumUsers() == 1 &&
3616 RecurSplice->getOpcode() ==
3617 VPInstruction::FirstOrderRecurrenceSplice &&
3618 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3619 SmallVector<VPLiveOut *> LiveOuts;
3620 for (VPUser *U : RecurSplice->users())
3621 if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
3622 LiveOuts.push_back(LiveOut);
3624 if (!LiveOuts.empty()) {
3625 // Extract the second last element in the middle block if the
3626 // Phi is used outside the loop. We need to extract the phi itself
3627 // and not the last element (the phi update in the current iteration). This
3628 // will be the value when jumping to the exit block from the
3629 // LoopMiddleBlock, when the scalar loop is not run at all.
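// e.g. (illustrative) with VF = 4 this extracts lane RuntimeVF - 2 = 2; in the
// shorthand IR above that is v2(2), the value s1 held in the final iteration
// handled by the vector loop.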
3630 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3631 if (VF.isVector()) {
3632 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3633 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3634 Incoming, Idx, "vector.recur.extract.for.phi");
3635 } else {
3636 assert(UF > 1 && "VF and UF cannot both be 1");
3637 // When the loop is unrolled without vectorizing, initialize
3638 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to the
3639 // last value of `Incoming`. This is analogous to the vectorized case above:
3640 // extracting the second last element when VF > 1.
3641 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3644 for (VPLiveOut *LiveOut : LiveOuts) {
3645 assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3646 PHINode *LCSSAPhi = LiveOut->getPhi();
3647 LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3648 State.Plan->removeLiveOut(LCSSAPhi);
3652 // Fix the initial value of the original recurrence in the scalar loop.
3653 Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
3654 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3655 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3656 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3657 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3658 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3659 Start->addIncoming(Incoming, BB);
3662 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3663 Phi->setName("scalar.recur");
3666 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3667 // The basic block and loop containing the predicated instruction.
3668 auto *PredBB = PredInst->getParent();
3669 auto *VectorLoop = LI->getLoopFor(PredBB);
3671 // Initialize a worklist with the operands of the predicated instruction.
3672 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3674 // Holds instructions that we need to analyze again. An instruction may be
3675 // reanalyzed if we don't yet know if we can sink it or not.
3676 SmallVector<Instruction *, 8> InstsToReanalyze;
3678 // Returns true if a given use occurs in the predicated block. Phi nodes use
3679 // their operands in their corresponding predecessor blocks.
3680 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3681 auto *I = cast<Instruction>(U.getUser());
3682 BasicBlock *BB = I->getParent();
3683 if (auto *Phi = dyn_cast<PHINode>(I))
3684 BB = Phi->getIncomingBlock(
3685 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3686 return BB == PredBB;
3689 // Iteratively sink the scalarized operands of the predicated instruction
3690 // into the block we created for it. When an instruction is sunk, its
3691 // operands are then added to the worklist. The algorithm ends when a pass
3692 // through the worklist does not sink a single instruction.
3693 bool Changed;
3694 do {
3695 // Add the instructions that need to be reanalyzed to the worklist, and
3696 // reset the changed indicator.
3697 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3698 InstsToReanalyze.clear();
3699 Changed = false;
3701 while (!Worklist.empty()) {
3702 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3704 // We can't sink an instruction if it is a phi node, is not in the loop,
3705 // may have side effects or may read from memory.
3706 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3707 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
3708 I->mayHaveSideEffects() || I->mayReadFromMemory())
3709 continue;
3711 // If the instruction is already in PredBB, check if we can sink its
3712 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3713 // sinking the scalar instruction I, hence it appears in PredBB; but it
3714 // may have failed to sink I's operands (recursively), which we try
3715 // (again) here.
3716 if (I->getParent() == PredBB) {
3717 Worklist.insert(I->op_begin(), I->op_end());
3718 continue;
3721 // It's legal to sink the instruction if all its uses occur in the
3722 // predicated block. Otherwise, there's nothing to do yet, and we may
3723 // need to reanalyze the instruction.
3724 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3725 InstsToReanalyze.push_back(I);
3726 continue;
3729 // Move the instruction to the beginning of the predicated block, and add
3730 // its operands to the worklist.
3731 I->moveBefore(&*PredBB->getFirstInsertionPt());
3732 Worklist.insert(I->op_begin(), I->op_end());
3734 // The sinking may have enabled other instructions to be sunk, so we will
3735 // need to iterate.
3736 Changed = true;
3738 } while (Changed);
3741 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3742 VPTransformState &State) {
3743 auto Iter = vp_depth_first_deep(Plan.getEntry());
3744 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3745 for (VPRecipeBase &P : VPBB->phis()) {
3746 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
3747 if (!VPPhi)
3748 continue;
3749 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
3750 // Make sure the builder has a valid insert point.
3751 Builder.SetInsertPoint(NewPhi);
3752 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3753 VPValue *Inc = VPPhi->getIncomingValue(i);
3754 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
3755 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
3761 bool InnerLoopVectorizer::useOrderedReductions(
3762 const RecurrenceDescriptor &RdxDesc) {
3763 return Cost->useOrderedReductions(RdxDesc);
3766 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3767 // We should not collect Scalars more than once per VF. Right now, this
3768 // function is called from collectUniformsAndScalars(), which already does
3769 // this check. Collecting Scalars for VF=1 does not make any sense.
3770 assert(VF.isVector() && !Scalars.contains(VF) &&
3771 "This function should not be visited twice for the same VF");
3773 // This avoids any chances of creating a REPLICATE recipe during planning
3774 // since that would result in generation of scalarized code during execution,
3775 // which is not supported for scalable vectors.
3776 if (VF.isScalable()) {
3777 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
3778 return;
3781 SmallSetVector<Instruction *, 8> Worklist;
3783 // These sets are used to seed the analysis with pointers used by memory
3784 // accesses that will remain scalar.
3785 SmallSetVector<Instruction *, 8> ScalarPtrs;
3786 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3787 auto *Latch = TheLoop->getLoopLatch();
3789 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3790 // The pointer operands of loads and stores will be scalar as long as the
3791 // memory access is not a gather or scatter operation. The value operand of a
3792 // store will remain scalar if the store is scalarized.
3793 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3794 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
3795 assert(WideningDecision != CM_Unknown &&
3796 "Widening decision should be ready at this moment");
3797 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
3798 if (Ptr == Store->getValueOperand())
3799 return WideningDecision == CM_Scalarize;
3800 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3801 "Ptr is neither a value or pointer operand");
3802 return WideningDecision != CM_GatherScatter;
3805 // A helper that returns true if the given value is a bitcast or
3806 // getelementptr instruction contained in the loop.
3807 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3808 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
3809 isa<GetElementPtrInst>(V)) &&
3810 !TheLoop->isLoopInvariant(V);
3813 // A helper that evaluates a memory access's use of a pointer. If the use will
3814 // be a scalar use and the pointer is only used by memory accesses, we place
3815 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3816 // PossibleNonScalarPtrs.
3817 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3818 // We only care about bitcast and getelementptr instructions contained in
3819 // the loop.
3820 if (!isLoopVaryingBitCastOrGEP(Ptr))
3821 return;
3823 // If the pointer has already been identified as scalar (e.g., if it was
3824 // also identified as uniform), there's nothing to do.
3825 auto *I = cast<Instruction>(Ptr);
3826 if (Worklist.count(I))
3827 return;
3829 // If the use of the pointer will be a scalar use, and all users of the
3830 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3831 // place the pointer in PossibleNonScalarPtrs.
3832 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
3833 return isa<LoadInst>(U) || isa<StoreInst>(U);
3835 ScalarPtrs.insert(I);
3836 else
3837 PossibleNonScalarPtrs.insert(I);
3840 // We seed the scalars analysis with two classes of instructions: (1)
3841 // instructions marked uniform-after-vectorization and (2) bitcast,
3842 // getelementptr and (pointer) phi instructions used by memory accesses
3843 // requiring a scalar use.
3845 // (1) Add to the worklist all instructions that have been identified as
3846 // uniform-after-vectorization.
3847 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
3849 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3850 // memory accesses requiring a scalar use. The pointer operands of loads and
3851 // stores will be scalar as long as the memory access is not a gather or
3852 // scatter operation. The value operand of a store will remain scalar if the
3853 // store is scalarized.
3854 for (auto *BB : TheLoop->blocks())
3855 for (auto &I : *BB) {
3856 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3857 evaluatePtrUse(Load, Load->getPointerOperand());
3858 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3859 evaluatePtrUse(Store, Store->getPointerOperand());
3860 evaluatePtrUse(Store, Store->getValueOperand());
3863 for (auto *I : ScalarPtrs)
3864 if (!PossibleNonScalarPtrs.count(I)) {
3865 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3866 Worklist.insert(I);
3869 // Insert the forced scalars.
3870 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3871 // induction variable when the PHI user is scalarized.
3872 auto ForcedScalar = ForcedScalars.find(VF);
3873 if (ForcedScalar != ForcedScalars.end())
3874 for (auto *I : ForcedScalar->second) {
3875 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3876 Worklist.insert(I);
3879 // Expand the worklist by looking through any bitcasts and getelementptr
3880 // instructions we've already identified as scalar. This is similar to the
3881 // expansion step in collectLoopUniforms(); however, here we're only
3882 // expanding to include additional bitcasts and getelementptr instructions.
3883 unsigned Idx = 0;
3884 while (Idx != Worklist.size()) {
3885 Instruction *Dst = Worklist[Idx++];
3886 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
3887 continue;
3888 auto *Src = cast<Instruction>(Dst->getOperand(0));
3889 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
3890 auto *J = cast<Instruction>(U);
3891 return !TheLoop->contains(J) || Worklist.count(J) ||
3892 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3893 isScalarUse(J, Src));
3894 })) {
3895 Worklist.insert(Src);
3896 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3900 // An induction variable will remain scalar if all users of the induction
3901 // variable and induction variable update remain scalar.
3902 for (const auto &Induction : Legal->getInductionVars()) {
3903 auto *Ind = Induction.first;
3904 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3906 // If tail-folding is applied, the primary induction variable will be used
3907 // to feed a vector compare.
3908 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3909 continue;
3911 // Returns true if \p Indvar is a pointer induction that is used directly by
3912 // load/store instruction \p I.
3913 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3914 Instruction *I) {
3915 return Induction.second.getKind() ==
3916 InductionDescriptor::IK_PtrInduction &&
3917 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
3918 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
3921 // Determine if all users of the induction variable are scalar after
3922 // vectorization.
3923 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
3924 auto *I = cast<Instruction>(U);
3925 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3926 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3928 if (!ScalarInd)
3929 continue;
3931 // Determine if all users of the induction variable update instruction are
3932 // scalar after vectorization.
3933 auto ScalarIndUpdate =
3934 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
3935 auto *I = cast<Instruction>(U);
3936 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3937 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3939 if (!ScalarIndUpdate)
3940 continue;
3942 // The induction variable and its update instruction will remain scalar.
3943 Worklist.insert(Ind);
3944 Worklist.insert(IndUpdate);
3945 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3946 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3947 << "\n");
3950 Scalars[VF].insert(Worklist.begin(), Worklist.end());
3953 bool LoopVectorizationCostModel::isScalarWithPredication(
3954 Instruction *I, ElementCount VF) const {
3955 if (!isPredicatedInst(I))
3956 return false;
3958 // Do we have a non-scalar lowering for this predicated
3959 // instruction? No - it is scalar with predication.
3960 switch(I->getOpcode()) {
3961 default:
3962 return true;
3963 case Instruction::Call:
3964 if (VF.isScalar())
3965 return true;
3966 return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
3967 .Kind == CM_Scalarize;
3968 case Instruction::Load:
3969 case Instruction::Store: {
3970 auto *Ptr = getLoadStorePointerOperand(I);
3971 auto *Ty = getLoadStoreType(I);
3972 Type *VTy = Ty;
3973 if (VF.isVector())
3974 VTy = VectorType::get(Ty, VF);
3975 const Align Alignment = getLoadStoreAlignment(I);
3976 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
3977 TTI.isLegalMaskedGather(VTy, Alignment))
3978 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
3979 TTI.isLegalMaskedScatter(VTy, Alignment));
3981 case Instruction::UDiv:
3982 case Instruction::SDiv:
3983 case Instruction::SRem:
3984 case Instruction::URem: {
3985 // We have the option to use the safe-divisor idiom to avoid predication.
3986 // The cost based decision here will always select safe-divisor for
3987 // scalable vectors as scalarization isn't legal.
3988 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3989 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3994 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3995 if (!blockNeedsPredicationForAnyReason(I->getParent()))
3996 return false;
3998 // Can we prove this instruction is safe to unconditionally execute?
3999 // If not, we must use some form of predication.
4000 switch(I->getOpcode()) {
4001 default:
4002 return false;
4003 case Instruction::Load:
4004 case Instruction::Store: {
4005 if (!Legal->isMaskRequired(I))
4006 return false;
4007 // When we know the load's address is loop invariant and the instruction
4008 // in the original scalar loop was unconditionally executed then we
4009 // don't need to mark it as a predicated instruction. Tail folding may
4010 // introduce additional predication, but we're guaranteed to always have
4011 // at least one active lane. We call Legal->blockNeedsPredication here
4012 // because it doesn't query tail-folding. For stores, we need to prove both
4013 // speculation safety (which follows from the same argument as loads) and
4014 // that the value being stored is correct. The easiest form of the latter is
4015 // to require that all values stored are the same.
4016 if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
4017 (isa<LoadInst>(I) ||
4018 (isa<StoreInst>(I) &&
4019 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4020 !Legal->blockNeedsPredication(I->getParent()))
4021 return false;
4022 return true;
4024 case Instruction::UDiv:
4025 case Instruction::SDiv:
4026 case Instruction::SRem:
4027 case Instruction::URem:
4028 // TODO: We can use the loop-preheader as the context point here and get
4029 // context-sensitive reasoning.
4030 return !isSafeToSpeculativelyExecute(I);
4031 case Instruction::Call:
4032 return Legal->isMaskRequired(I);
4036 std::pair<InstructionCost, InstructionCost>
4037 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4038 ElementCount VF) const {
4039 assert(I->getOpcode() == Instruction::UDiv ||
4040 I->getOpcode() == Instruction::SDiv ||
4041 I->getOpcode() == Instruction::SRem ||
4042 I->getOpcode() == Instruction::URem);
4043 assert(!isSafeToSpeculativelyExecute(I));
4045 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4047 // Scalarization isn't legal for scalable vector types
4048 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4049 if (!VF.isScalable()) {
4050 // Get the scalarization cost and scale this amount by the probability of
4051 // executing the predicated block. If the instruction is not predicated,
4052 // we fall through to the next case.
4053 ScalarizationCost = 0;
4055 // These instructions have a non-void type, so account for the phi nodes
4056 // that we will create. This cost is likely to be zero. The phi node
4057 // cost, if any, should be scaled by the block probability because it
4058 // models a copy at the end of each predicated block.
4059 ScalarizationCost += VF.getKnownMinValue() *
4060 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4062 // The cost of the non-predicated instruction.
4063 ScalarizationCost += VF.getKnownMinValue() *
4064 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4066 // The cost of insertelement and extractelement instructions needed for
4067 // scalarization.
4068 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4070 // Scale the cost by the probability of executing the predicated blocks.
4071 // This assumes the predicated block for each vector lane is equally
4072 // likely.
4073 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
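// Illustrative sketch of the scaling above (all costs assumed, not from any
// particular target): with VF = 4, a PHI cost of 0, a scalar divide cost of 4
// and an insert/extract overhead of 4, the unscaled sum is 4*0 + 4*4 + 4 = 20.
// Assuming getReciprocalPredBlockProb() models a ~50% block probability (i.e.
// returns 2), the resulting ScalarizationCost is 20 / 2 = 10.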
4075 InstructionCost SafeDivisorCost = 0;
4077 auto *VecTy = ToVectorTy(I->getType(), VF);
4079 // The cost of the select guard to ensure all lanes are well defined
4080 // after we speculate above any internal control flow.
4081 SafeDivisorCost += TTI.getCmpSelInstrCost(
4082 Instruction::Select, VecTy,
4083 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4084 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4086 // Certain instructions can be cheaper to vectorize if they have a constant
4087 // second vector operand. One example of this are shifts on x86.
4088 Value *Op2 = I->getOperand(1);
4089 auto Op2Info = TTI.getOperandInfo(Op2);
4090 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4091 Legal->isInvariant(Op2))
4092 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4094 SmallVector<const Value *, 4> Operands(I->operand_values());
4095 SafeDivisorCost += TTI.getArithmeticInstrCost(
4096 I->getOpcode(), VecTy, CostKind,
4097 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4098 Op2Info, Operands, I);
4099 return {ScalarizationCost, SafeDivisorCost};
4102 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4103 Instruction *I, ElementCount VF) {
4104 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4105 assert(getWideningDecision(I, VF) == CM_Unknown &&
4106 "Decision should not be set yet.");
4107 auto *Group = getInterleavedAccessGroup(I);
4108 assert(Group && "Must have a group.");
4110 // If the instruction's allocated size doesn't equal its type size, it
4111 // requires padding and will be scalarized.
4112 auto &DL = I->getModule()->getDataLayout();
4113 auto *ScalarTy = getLoadStoreType(I);
4114 if (hasIrregularType(ScalarTy, DL))
4115 return false;
4117 // If the group involves a non-integral pointer, we may not be able to
4118 // losslessly cast all values to a common type.
4119 unsigned InterleaveFactor = Group->getFactor();
4120 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4121 for (unsigned i = 0; i < InterleaveFactor; i++) {
4122 Instruction *Member = Group->getMember(i);
4123 if (!Member)
4124 continue;
4125 auto *MemberTy = getLoadStoreType(Member);
4126 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4127 // Don't coerce non-integral pointers to integers or vice versa.
4128 if (MemberNI != ScalarNI) {
4129 // TODO: Consider adding special nullptr value case here
4130 return false;
4131 } else if (MemberNI && ScalarNI &&
4132 ScalarTy->getPointerAddressSpace() !=
4133 MemberTy->getPointerAddressSpace()) {
4134 return false;
4138 // Check if masking is required.
4139 // A Group may need masking for one of two reasons: it resides in a block that
4140 // needs predication, or it was decided to use masking to deal with gaps
4141 // (either a gap at the end of a load-access that may result in a speculative
4142 // load, or any gaps in a store-access).
4143 bool PredicatedAccessRequiresMasking =
4144 blockNeedsPredicationForAnyReason(I->getParent()) &&
4145 Legal->isMaskRequired(I);
4146 bool LoadAccessWithGapsRequiresEpilogMasking =
4147 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4148 !isScalarEpilogueAllowed();
4149 bool StoreAccessWithGapsRequiresMasking =
4150 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4151 if (!PredicatedAccessRequiresMasking &&
4152 !LoadAccessWithGapsRequiresEpilogMasking &&
4153 !StoreAccessWithGapsRequiresMasking)
4154 return true;
4156 // If masked interleaving is required, we expect that the user/target had
4157 // enabled it, because otherwise it either wouldn't have been created or
4158 // it should have been invalidated by the CostModel.
4159 assert(useMaskedInterleavedAccesses(TTI) &&
4160 "Masked interleave-groups for predicated accesses are not enabled.");
4162 if (Group->isReverse())
4163 return false;
4165 auto *Ty = getLoadStoreType(I);
4166 const Align Alignment = getLoadStoreAlignment(I);
4167 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4168 : TTI.isLegalMaskedStore(Ty, Alignment);
4171 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4172 Instruction *I, ElementCount VF) {
4173 // Get and ensure we have a valid memory instruction.
4174 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4176 auto *Ptr = getLoadStorePointerOperand(I);
4177 auto *ScalarTy = getLoadStoreType(I);
4179 // First of all, in order to be widened, the pointer must be consecutive.
4180 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4181 return false;
4183 // If the instruction is a store located in a predicated block, it will be
4184 // scalarized.
4185 if (isScalarWithPredication(I, VF))
4186 return false;
4188 // If the instruction's allocated size doesn't equal its type size, it
4189 // requires padding and will be scalarized.
4190 auto &DL = I->getModule()->getDataLayout();
4191 if (hasIrregularType(ScalarTy, DL))
4192 return false;
4194 return true;
4197 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4198 // We should not collect Uniforms more than once per VF. Right now,
4199 // this function is called from collectUniformsAndScalars(), which
4200 // already does this check. Collecting Uniforms for VF=1 does not make any
4201 // sense.
4203 assert(VF.isVector() && !Uniforms.contains(VF) &&
4204 "This function should not be visited twice for the same VF");
4206 // Create the entry for this VF up front. Even if we find no uniform
4207 // values, we will not analyze it again: Uniforms.count(VF) will return 1.
4208 Uniforms[VF].clear();
4210 // We now know that the loop is vectorizable!
4211 // Collect instructions inside the loop that will remain uniform after
4212 // vectorization.
4214 // Global values, params and instructions outside of the current loop are
4215 // out of scope.
4216 auto isOutOfScope = [&](Value *V) -> bool {
4217 Instruction *I = dyn_cast<Instruction>(V);
4218 return (!I || !TheLoop->contains(I));
4221 // Worklist containing uniform instructions demanding lane 0.
4222 SetVector<Instruction *> Worklist;
4223 BasicBlock *Latch = TheLoop->getLoopLatch();
4225 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4226 // that are scalar with predication must not be considered uniform after
4227 // vectorization, because that would create an erroneous replicating region
4228 // where only a single instance out of VF should be formed.
4229 // TODO: optimize such seldom cases if found important, see PR40816.
4230 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4231 if (isOutOfScope(I)) {
4232 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4233 << *I << "\n");
4234 return;
4236 if (isScalarWithPredication(I, VF)) {
4237 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4238 << *I << "\n");
4239 return;
4241 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4242 Worklist.insert(I);
4245 // Start with the conditional branch. If the branch condition is an
4246 // instruction contained in the loop that is only used by the branch, it is
4247 // uniform.
4248 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4249 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4250 addToWorklistIfAllowed(Cmp);
4252 auto PrevVF = VF.divideCoefficientBy(2);
4253 // Return true if all lanes perform the same memory operation, and we can
4254 // thus choose to execute only one.
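// For example (illustrative): a load `x = *p` where `p` is defined outside
// the loop produces the same value on every iteration, so one scalar load per
// vector iteration suffices. A store `*p = v` is only uniform in this sense
// when the stored value `v` is also loop invariant.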
4255 auto isUniformMemOpUse = [&](Instruction *I) {
4256 // If the value was already known to not be uniform for the previous
4257 // (smaller VF), it cannot be uniform for the larger VF.
4258 if (PrevVF.isVector()) {
4259 auto Iter = Uniforms.find(PrevVF);
4260 if (Iter != Uniforms.end() && !Iter->second.contains(I))
4261 return false;
4263 if (!Legal->isUniformMemOp(*I, VF))
4264 return false;
4265 if (isa<LoadInst>(I))
4266 // Loading the same address always produces the same result - at least
4267 // assuming aliasing and ordering which have already been checked.
4268 return true;
4269 // Storing the same value on every iteration.
4270 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4273 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4274 InstWidening WideningDecision = getWideningDecision(I, VF);
4275 assert(WideningDecision != CM_Unknown &&
4276 "Widening decision should be ready at this moment");
4278 if (isUniformMemOpUse(I))
4279 return true;
4281 return (WideningDecision == CM_Widen ||
4282 WideningDecision == CM_Widen_Reverse ||
4283 WideningDecision == CM_Interleave);
4286 // Returns true if Ptr is the pointer operand of a memory access instruction
4287 // I, I is known to not require scalarization, and the pointer is not also
4288 // stored.
4289 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4290 if (isa<StoreInst>(I) && I->getOperand(0) == Ptr)
4291 return false;
4292 return getLoadStorePointerOperand(I) == Ptr &&
4293 (isUniformDecision(I, VF) || Legal->isInvariant(Ptr));
4296 // Holds a list of values which are known to have at least one uniform use.
4297 // Note that there may be other uses which aren't uniform. A "uniform use"
4298 // here is something which only demands lane 0 of the unrolled iterations;
4299 // it does not imply that all lanes produce the same value (e.g. this is not
4300 // the usual meaning of uniform)
4301 SetVector<Value *> HasUniformUse;
4303 // Scan the loop for instructions which are either a) known to have only
4304 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4305 for (auto *BB : TheLoop->blocks())
4306 for (auto &I : *BB) {
4307 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4308 switch (II->getIntrinsicID()) {
4309 case Intrinsic::sideeffect:
4310 case Intrinsic::experimental_noalias_scope_decl:
4311 case Intrinsic::assume:
4312 case Intrinsic::lifetime_start:
4313 case Intrinsic::lifetime_end:
4314 if (TheLoop->hasLoopInvariantOperands(&I))
4315 addToWorklistIfAllowed(&I);
4316 break;
4317 default:
4318 break;
4322 // ExtractValue instructions must be uniform, because the operands are
4323 // known to be loop-invariant.
4324 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4325 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4326 "Expected aggregate value to be loop invariant");
4327 addToWorklistIfAllowed(EVI);
4328 continue;
4331 // If there's no pointer operand, there's nothing to do.
4332 auto *Ptr = getLoadStorePointerOperand(&I);
4333 if (!Ptr)
4334 continue;
4336 if (isUniformMemOpUse(&I))
4337 addToWorklistIfAllowed(&I);
4339 if (isVectorizedMemAccessUse(&I, Ptr))
4340 HasUniformUse.insert(Ptr);
4343 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4344 // demanding) users. Since loops are assumed to be in LCSSA form, this
4345 // disallows uses outside the loop as well.
4346 for (auto *V : HasUniformUse) {
4347 if (isOutOfScope(V))
4348 continue;
4349 auto *I = cast<Instruction>(V);
4350 auto UsersAreMemAccesses =
4351 llvm::all_of(I->users(), [&](User *U) -> bool {
4352 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4354 if (UsersAreMemAccesses)
4355 addToWorklistIfAllowed(I);
4358 // Expand Worklist in topological order: whenever a new instruction
4359 // is added, its users should already be inside the Worklist. This ensures
4360 // a uniform instruction will only be used by uniform instructions.
4361 unsigned idx = 0;
4362 while (idx != Worklist.size()) {
4363 Instruction *I = Worklist[idx++];
4365 for (auto *OV : I->operand_values()) {
4366 // isOutOfScope operands cannot be uniform instructions.
4367 if (isOutOfScope(OV))
4368 continue;
4369 // First order recurrence Phi's should typically be considered
4370 // non-uniform.
4371 auto *OP = dyn_cast<PHINode>(OV);
4372 if (OP && Legal->isFixedOrderRecurrence(OP))
4373 continue;
4374 // If all the users of the operand are uniform, then add the
4375 // operand into the uniform worklist.
4376 auto *OI = cast<Instruction>(OV);
4377 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4378 auto *J = cast<Instruction>(U);
4379 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4381 addToWorklistIfAllowed(OI);
4385 // For an instruction to be added into Worklist above, all its users inside
4386 // the loop should also be in Worklist. However, this condition cannot be
4387 // true for phi nodes that form a cyclic dependence. We must process phi
4388 // nodes separately. An induction variable will remain uniform if all users
4389 // of the induction variable and induction variable update remain uniform.
4390 // The code below handles both pointer and non-pointer induction variables.
4391 for (const auto &Induction : Legal->getInductionVars()) {
4392 auto *Ind = Induction.first;
4393 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4395 // Determine if all users of the induction variable are uniform after
4396 // vectorization.
4397 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4398 auto *I = cast<Instruction>(U);
4399 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4400 isVectorizedMemAccessUse(I, Ind);
4402 if (!UniformInd)
4403 continue;
4405 // Determine if all users of the induction variable update instruction are
4406 // uniform after vectorization.
4407 auto UniformIndUpdate =
4408 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4409 auto *I = cast<Instruction>(U);
4410 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4411 isVectorizedMemAccessUse(I, IndUpdate);
4413 if (!UniformIndUpdate)
4414 continue;
4416 // The induction variable and its update instruction will remain uniform.
4417 addToWorklistIfAllowed(Ind);
4418 addToWorklistIfAllowed(IndUpdate);
4421 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4424 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4425 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4427 if (Legal->getRuntimePointerChecking()->Need) {
4428 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4429 "runtime pointer checks needed. Enable vectorization of this "
4430 "loop with '#pragma clang loop vectorize(enable)' when "
4431 "compiling with -Os/-Oz",
4432 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4433 return true;
4436 if (!PSE.getPredicate().isAlwaysTrue()) {
4437 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4438 "runtime SCEV checks needed. Enable vectorization of this "
4439 "loop with '#pragma clang loop vectorize(enable)' when "
4440 "compiling with -Os/-Oz",
4441 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4442 return true;
4445 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4446 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4447 reportVectorizationFailure("Runtime stride check for small trip count",
4448 "runtime stride == 1 checks needed. Enable vectorization of "
4449 "this loop without such check by compiling with -Os/-Oz",
4450 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4451 return true;
4454 return false;
4457 ElementCount
4458 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4459 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4460 return ElementCount::getScalable(0);
4462 if (Hints->isScalableVectorizationDisabled()) {
4463 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4464 "ScalableVectorizationDisabled", ORE, TheLoop);
4465 return ElementCount::getScalable(0);
4468 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4470 auto MaxScalableVF = ElementCount::getScalable(
4471 std::numeric_limits<ElementCount::ScalarTy>::max());
4473 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4474 // FIXME: While for scalable vectors this is currently sufficient, this should
4475 // be replaced by a more detailed mechanism that filters out specific VFs,
4476 // instead of invalidating vectorization for a whole set of VFs based on the
4477 // MaxVF.
4479 // Disable scalable vectorization if the loop contains unsupported reductions.
4480 if (!canVectorizeReductions(MaxScalableVF)) {
4481 reportVectorizationInfo(
4482 "Scalable vectorization not supported for the reduction "
4483 "operations found in this loop.",
4484 "ScalableVFUnfeasible", ORE, TheLoop);
4485 return ElementCount::getScalable(0);
4488 // Disable scalable vectorization if the loop contains any instructions
4489 // with element types not supported for scalable vectors.
4490 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4491 return !Ty->isVoidTy() &&
4492 !this->TTI.isElementTypeLegalForScalableVector(Ty);
4493 })) {
4494 reportVectorizationInfo("Scalable vectorization is not supported "
4495 "for all element types found in this loop.",
4496 "ScalableVFUnfeasible", ORE, TheLoop);
4497 return ElementCount::getScalable(0);
4500 if (Legal->isSafeForAnyVectorWidth())
4501 return MaxScalableVF;
4503 // Limit MaxScalableVF by the maximum safe dependence distance.
4504 if (std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI))
4505 MaxScalableVF = ElementCount::getScalable(MaxSafeElements / *MaxVScale);
4506 else
4507 MaxScalableVF = ElementCount::getScalable(0);
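// Worked example (values assumed for illustration): with MaxSafeElements = 32
// and a maximum vscale of 16, the clamp above yields 32 / 16 = 2, i.e. at
// most a <vscale x 2 x eltty> access pattern is known to be safe.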
4509 if (!MaxScalableVF)
4510 reportVectorizationInfo(
4511 "Max legal vector width too small, scalable vectorization "
4512 "unfeasible.",
4513 "ScalableVFUnfeasible", ORE, TheLoop);
4515 return MaxScalableVF;
4518 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4519 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4520 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4521 unsigned SmallestType, WidestType;
4522 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4524 // Get the maximum safe dependence distance in bits computed by LAA.
4525 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4526 // the memory access that is most restrictive (involved in the smallest
4527 // dependence distance).
4528 unsigned MaxSafeElements =
4529 llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
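// Illustrative example (values assumed): if LAA reports a maximum safe vector
// width of 256 bits and the widest type in the loop is 32 bits, then
// MaxSafeElements = bit_floor(256 / 32) = 8 elements.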
4531 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4532 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4534 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4535 << ".\n");
4536 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4537 << ".\n");
4539 // First analyze the UserVF, fall back if the UserVF should be ignored.
4540 if (UserVF) {
4541 auto MaxSafeUserVF =
4542 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4544 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4545 // If `VF=vscale x N` is safe, then so is `VF=N`
4546 if (UserVF.isScalable())
4547 return FixedScalableVFPair(
4548 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4549 else
4550 return UserVF;
4553 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4555 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4556 // is better to ignore the hint and let the compiler choose a suitable VF.
4557 if (!UserVF.isScalable()) {
4558 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4559 << " is unsafe, clamping to max safe VF="
4560 << MaxSafeFixedVF << ".\n");
4561 ORE->emit([&]() {
4562 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4563 TheLoop->getStartLoc(),
4564 TheLoop->getHeader())
4565 << "User-specified vectorization factor "
4566 << ore::NV("UserVectorizationFactor", UserVF)
4567 << " is unsafe, clamping to maximum safe vectorization factor "
4568 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4570 return MaxSafeFixedVF;
4573 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4574 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4575 << " is ignored because scalable vectors are not "
4576 "available.\n");
4577 ORE->emit([&]() {
4578 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4579 TheLoop->getStartLoc(),
4580 TheLoop->getHeader())
4581 << "User-specified vectorization factor "
4582 << ore::NV("UserVectorizationFactor", UserVF)
4583 << " is ignored because the target does not support scalable "
4584 "vectors. The compiler will pick a more suitable value.";
4586 } else {
4587 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4588 << " is unsafe. Ignoring scalable UserVF.\n");
4589 ORE->emit([&]() {
4590 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4591 TheLoop->getStartLoc(),
4592 TheLoop->getHeader())
4593 << "User-specified vectorization factor "
4594 << ore::NV("UserVectorizationFactor", UserVF)
4595 << " is unsafe. Ignoring the hint to let the compiler pick a "
4596 "more suitable value.";
4601 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4602 << " / " << WidestType << " bits.\n");
4604 FixedScalableVFPair Result(ElementCount::getFixed(1),
4605 ElementCount::getScalable(0));
4606 if (auto MaxVF =
4607 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4608 MaxSafeFixedVF, FoldTailByMasking))
4609 Result.FixedVF = MaxVF;
4611 if (auto MaxVF =
4612 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4613 MaxSafeScalableVF, FoldTailByMasking))
4614 if (MaxVF.isScalable()) {
4615 Result.ScalableVF = MaxVF;
4616 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4617 << "\n");
4620 return Result;
4623 FixedScalableVFPair
4624 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4625 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4626 // TODO: It may be useful to insert the check anyway, since it is still
4627 // likely to be dynamically uniform if the target can skip it.
4628 reportVectorizationFailure(
4629 "Not inserting runtime ptr check for divergent target",
4630 "runtime pointer checks needed. Not enabled for divergent target",
4631 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4632 return FixedScalableVFPair::getNone();
4635 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4636 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
4637 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4638 if (TC == 1) {
4639 reportVectorizationFailure("Single iteration (non) loop",
4640 "loop trip count is one, irrelevant for vectorization",
4641 "SingleIterationLoop", ORE, TheLoop);
4642 return FixedScalableVFPair::getNone();
4645 switch (ScalarEpilogueStatus) {
4646 case CM_ScalarEpilogueAllowed:
4647 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4648 case CM_ScalarEpilogueNotAllowedUsePredicate:
4649 [[fallthrough]];
4650 case CM_ScalarEpilogueNotNeededUsePredicate:
4651 LLVM_DEBUG(
4652 dbgs() << "LV: vector predicate hint/switch found.\n"
4653 << "LV: Not allowing scalar epilogue, creating predicated "
4654 << "vector loop.\n");
4655 break;
4656 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4657 // fallthrough as a special case of OptForSize
4658 case CM_ScalarEpilogueNotAllowedOptSize:
4659 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4660 LLVM_DEBUG(
4661 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4662 else
4663 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4664 << "count.\n");
4666 // Bail if runtime checks are required, which are not good when optimising
4667 // for size.
4668 if (runtimeChecksRequired())
4669 return FixedScalableVFPair::getNone();
4671 break;
4674 // The only loops we can vectorize without a scalar epilogue, are loops with
4675 // a bottom-test and a single exiting block. We'd have to handle the fact
4676 // that not every instruction executes on the last iteration. This will
4677 // require a lane mask which varies through the vector loop body. (TODO)
4678 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4679 // If there was a tail-folding hint/switch, but we can't fold the tail by
4680 // masking, fallback to a vectorization with a scalar epilogue.
4681 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4682 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4683 "scalar epilogue instead.\n");
4684 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4685 return computeFeasibleMaxVF(MaxTC, UserVF, false);
4687 return FixedScalableVFPair::getNone();
4690 // Now try the tail folding
4692 // Invalidate interleave groups that require an epilogue if we can't mask
4693 // the interleave-group.
4694 if (!useMaskedInterleavedAccesses(TTI)) {
4695 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4696 "No decisions should have been taken at this point");
4697 // Note: There is no need to invalidate any cost modeling decisions here, as
4698 // none were taken so far.
4699 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4702 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
4704 // Avoid tail folding if the trip count is known to be a multiple of any VF
4705 // we choose.
4706 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4707 MaxFactors.FixedVF.getFixedValue();
4708 if (MaxFactors.ScalableVF) {
4709 std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
4710 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4711 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4712 *MaxPowerOf2RuntimeVF,
4713 *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4714 } else
4715 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4718 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4719 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4720 "MaxFixedVF must be a power of 2");
4721 unsigned MaxVFtimesIC =
4722 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4723 ScalarEvolution *SE = PSE.getSE();
4724 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4725 const SCEV *ExitCount = SE->getAddExpr(
4726 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
4727 const SCEV *Rem = SE->getURemExpr(
4728 SE->applyLoopGuards(ExitCount, TheLoop),
4729 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
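// Sketch of the check below with assumed numbers: for a trip count known to
// be 128 (backedge-taken count 127), MaxPowerOf2RuntimeVF = 8 and UserIC = 2,
// MaxVFtimesIC = 16 and Rem = 128 urem 16 = 0, so no scalar tail remains and
// tail folding is unnecessary.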
4730 if (Rem->isZero()) {
4731 // Accept MaxFixedVF if we do not have a tail.
4732 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4733 return MaxFactors;
4737 // If we don't know the precise trip count, or if the trip count that we
4738 // found modulo the vectorization factor is not zero, try to fold the tail
4739 // by masking.
4740 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4741 if (Legal->prepareToFoldTailByMasking()) {
4742 CanFoldTailByMasking = true;
4743 return MaxFactors;
4746 // If there was a tail-folding hint/switch, but we can't fold the tail by
4747 // masking, fallback to a vectorization with a scalar epilogue.
4748 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4749 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4750 "scalar epilogue instead.\n");
4751 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4752 return MaxFactors;
4755 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4756 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4757 return FixedScalableVFPair::getNone();
4760 if (TC == 0) {
4761 reportVectorizationFailure(
4762 "Unable to calculate the loop count due to complex control flow",
4763 "unable to calculate the loop count due to complex control flow",
4764 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4765 return FixedScalableVFPair::getNone();
4768 reportVectorizationFailure(
4769 "Cannot optimize for size and vectorize at the same time.",
4770 "cannot optimize for size and vectorize at the same time. "
4771 "Enable vectorization of this loop with '#pragma clang loop "
4772 "vectorize(enable)' when compiling with -Os/-Oz",
4773 "NoTailLoopWithOptForSize", ORE, TheLoop);
4774 return FixedScalableVFPair::getNone();
4777 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4778 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4779 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4780 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4781 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4782 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4783 : TargetTransformInfo::RGK_FixedWidthVector);
4785 // Convenience function to return the minimum of two ElementCounts.
4786 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4787 assert((LHS.isScalable() == RHS.isScalable()) &&
4788 "Scalable flags must match");
4789 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4792 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4793 // Note that both WidestRegister and WidestType may not be powers of 2.
4794 auto MaxVectorElementCount = ElementCount::get(
4795 llvm::bit_floor(WidestRegister.getKnownMinValue() / WidestType),
4796 ComputeScalableMaxVF);
4797 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4798 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4799 << (MaxVectorElementCount * WidestType) << " bits.\n");
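// Example of the clamp above (values assumed): a 256-bit fixed-width register
// with a widest loop type of 32 bits gives bit_floor(256 / 32) = 8 lanes,
// which is then further limited by MaxSafeVF.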
4801 if (!MaxVectorElementCount) {
4802 LLVM_DEBUG(dbgs() << "LV: The target has no "
4803 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4804 << " vector registers.\n");
4805 return ElementCount::getFixed(1);
4808 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4809 if (MaxVectorElementCount.isScalable() &&
4810 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4811 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4812 auto Min = Attr.getVScaleRangeMin();
4813 WidestRegisterMinEC *= Min;
4816 // When a scalar epilogue is required, at least one iteration of the scalar
4817 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4818 // max VF that results in a dead vector loop.
4819 if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4820 MaxTripCount -= 1;
4822 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4823 (!FoldTailByMasking || isPowerOf2_32(MaxTripCount))) {
4824 // If upper bound loop trip count (TC) is known at compile time there is no
4825 // point in choosing VF greater than TC (as done in the loop below). Select
4826 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4827 // scalable, we only fall back on a fixed VF when the TC is less than or
4828 // equal to the known number of lanes.
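// For instance (illustrative, assuming the register holds at least 7 lanes
// and the tail is not folded): a constant upper-bound trip count of 7 clamps
// the VF below to bit_floor(7) = 4 fixed lanes, so the vector loop runs at
// least once.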
4829 auto ClampedUpperTripCount = llvm::bit_floor(MaxTripCount);
4830 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4831 "exceeding the constant trip count: "
4832 << ClampedUpperTripCount << "\n");
4833 return ElementCount::get(
4834 ClampedUpperTripCount,
4835 FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4838 TargetTransformInfo::RegisterKind RegKind =
4839 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4840 : TargetTransformInfo::RGK_FixedWidthVector;
4841 ElementCount MaxVF = MaxVectorElementCount;
4842 if (MaximizeBandwidth ||
4843 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4844 (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
4845 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4846 auto MaxVectorElementCountMaxBW = ElementCount::get(
4847 llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
4848 ComputeScalableMaxVF);
4849 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4851 // Collect all viable vectorization factors larger than the default MaxVF
4852 // (i.e. MaxVectorElementCount).
4853 SmallVector<ElementCount, 8> VFs;
4854 for (ElementCount VS = MaxVectorElementCount * 2;
4855 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
4856 VFs.push_back(VS);
4858 // For each VF calculate its register usage.
4859 auto RUs = calculateRegisterUsage(VFs);
4861 // Select the largest VF which doesn't require more registers than existing
4862 // ones.
4863 for (int i = RUs.size() - 1; i >= 0; --i) {
4864 bool Selected = true;
4865 for (auto &pair : RUs[i].MaxLocalUsers) {
4866 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
4867 if (pair.second > TargetNumRegisters)
4868 Selected = false;
4870 if (Selected) {
4871 MaxVF = VFs[i];
4872 break;
4875 if (ElementCount MinVF =
4876 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
4877 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
4878 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4879 << ") with target's minimum: " << MinVF << '\n');
4880 MaxVF = MinVF;
4884 // Invalidate any widening decisions we might have made, in case the loop
4885 // requires prediction (decided later), but we have already made some
4886 // load/store widening decisions.
4887 invalidateCostModelingDecisions();
4889 return MaxVF;
4892 /// Convenience function that returns the value of vscale_range if
4893 /// vscale_range.min == vscale_range.max, and otherwise returns the value
4894 /// returned by the corresponding TTI method.
4895 static std::optional<unsigned>
4896 getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4897 const Function *Fn = L->getHeader()->getParent();
4898 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4899 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4900 auto Min = Attr.getVScaleRangeMin();
4901 auto Max = Attr.getVScaleRangeMax();
4902 if (Max && Min == Max)
4903 return Max;
4906 return TTI.getVScaleForTuning();
4909 bool LoopVectorizationPlanner::isMoreProfitable(
4910 const VectorizationFactor &A, const VectorizationFactor &B) const {
4911 InstructionCost CostA = A.Cost;
4912 InstructionCost CostB = B.Cost;
4914 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
4916 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4917 // If the trip count is a known (possibly small) constant, the trip count
4918 // will be rounded up to an integer number of iterations under
4919 // FoldTailByMasking. The total cost in that case will be
4920 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4921 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4922 // some extra overheads, but for the purpose of comparing the costs of
4923 // different VFs we can use this to compare the total loop-body cost
4924 // expected after vectorization.
4925 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4926 InstructionCost VectorCost,
4927 InstructionCost ScalarCost) {
4928 return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4929 : VectorCost * (MaxTripCount / VF) +
4930 ScalarCost * (MaxTripCount % VF);
4932 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4933 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
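// Worked comparison (costs assumed): MaxTripCount = 10, A = {VF=4, Cost=20},
// B = {VF=2, Cost=12}, scalar cost 3. Without tail folding,
// RTCostA = 20*2 + 3*2 = 46 and RTCostB = 12*5 + 3*0 = 60, so A is deemed
// more profitable below.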
4935 return RTCostA < RTCostB;
4938 // Improve estimate for the vector width if it is scalable.
4939 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4940 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4941 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4942 if (A.Width.isScalable())
4943 EstimatedWidthA *= *VScale;
4944 if (B.Width.isScalable())
4945 EstimatedWidthB *= *VScale;
4948 // Assume vscale may be larger than 1 (or the value being tuned for),
4949 // so that scalable vectorization is slightly favorable over fixed-width
4950 // vectorization.
4951 if (A.Width.isScalable() && !B.Width.isScalable())
4952 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4954 // To avoid the need for FP division:
4955 // (CostA / A.Width) < (CostB / B.Width)
4956 // <=> (CostA * B.Width) < (CostB * A.Width)
4957 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4960 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4961 OptimizationRemarkEmitter *ORE,
4962 Loop *TheLoop) {
4963 if (InvalidCosts.empty())
4964 return;
4966 // Emit a report of VFs with invalid costs in the loop.
4968 // Group the remarks per instruction, keeping the instruction order from
4969 // InvalidCosts.
4970 std::map<Instruction *, unsigned> Numbering;
4971 unsigned I = 0;
4972 for (auto &Pair : InvalidCosts)
4973 if (!Numbering.count(Pair.first))
4974 Numbering[Pair.first] = I++;
4976 // Sort the list, first on instruction(number) then on VF.
4977 sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4978 if (Numbering[A.first] != Numbering[B.first])
4979 return Numbering[A.first] < Numbering[B.first];
4980 ElementCountComparator ECC;
4981 return ECC(A.second, B.second);
4984 // For a list of ordered instruction-vf pairs:
4985 // [(load, vf1), (load, vf2), (store, vf1)]
4986 // Group the instructions together to emit separate remarks for:
4987 // load (vf1, vf2)
4988 // store (vf1)
4989 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4990 auto Subset = ArrayRef<InstructionVFPair>();
4991 do {
4992 if (Subset.empty())
4993 Subset = Tail.take_front(1);
4995 Instruction *I = Subset.front().first;
4997 // If the next instruction is different, or if there are no other pairs,
4998 // emit a remark for the collated subset. e.g.
4999 // [(load, vf1), (load, vf2)]
5000 // to emit:
5001 // remark: invalid costs for 'load' at VF=(vf1, vf2)
5002 if (Subset == Tail || Tail[Subset.size()].first != I) {
5003 std::string OutString;
5004 raw_string_ostream OS(OutString);
5005 assert(!Subset.empty() && "Unexpected empty range");
5006 OS << "Instruction with invalid costs prevented vectorization at VF=(";
5007 for (const auto &Pair : Subset)
5008 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
5009 OS << "):";
5010 if (auto *CI = dyn_cast<CallInst>(I))
5011 OS << " call to " << CI->getCalledFunction()->getName();
5012 else
5013 OS << " " << I->getOpcodeName();
5014 OS.flush();
5015 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5016 Tail = Tail.drop_front(Subset.size());
5017 Subset = {};
5018 } else
5019 // Grow the subset by one element
5020 Subset = Tail.take_front(Subset.size() + 1);
5021 } while (!Tail.empty());
5024 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
5025 const ElementCountSet &VFCandidates) {
5026 InstructionCost ExpectedCost =
5027 CM.expectedCost(ElementCount::getFixed(1)).first;
5028 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5029 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5030 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5031 "Expected Scalar VF to be a candidate");
5033 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5034 ExpectedCost);
5035 VectorizationFactor ChosenFactor = ScalarCost;
5037 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
5038 if (ForceVectorization && VFCandidates.size() > 1) {
5039 // Ignore scalar width, because the user explicitly wants vectorization.
5040 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5041 // evaluation.
5042 ChosenFactor.Cost = InstructionCost::getMax();
5045 SmallVector<InstructionVFPair> InvalidCosts;
5046 for (const auto &i : VFCandidates) {
5047 // The cost for scalar VF=1 is already calculated, so ignore it.
5048 if (i.isScalar())
5049 continue;
5051 LoopVectorizationCostModel::VectorizationCostTy C =
5052 CM.expectedCost(i, &InvalidCosts);
5053 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5055 #ifndef NDEBUG
5056 unsigned AssumedMinimumVscale =
5057 getVScaleForTuning(OrigLoop, TTI).value_or(1);
5058 unsigned Width =
5059 Candidate.Width.isScalable()
5060 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5061 : Candidate.Width.getFixedValue();
5062 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5063 << " costs: " << (Candidate.Cost / Width));
5064 if (i.isScalable())
5065 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5066 << AssumedMinimumVscale << ")");
5067 LLVM_DEBUG(dbgs() << ".\n");
5068 #endif
5070 if (!C.second && !ForceVectorization) {
5071 LLVM_DEBUG(
5072 dbgs() << "LV: Not considering vector loop of width " << i
5073 << " because it will not generate any vector instructions.\n");
5074 continue;
5077 // If profitable, add it to the ProfitableVFs list.
5078 if (isMoreProfitable(Candidate, ScalarCost))
5079 ProfitableVFs.push_back(Candidate);
5081 if (isMoreProfitable(Candidate, ChosenFactor))
5082 ChosenFactor = Candidate;
5085 emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop);
5087 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5088 reportVectorizationFailure(
5089 "There are conditional stores.",
5090 "store that is conditionally executed prevents vectorization",
5091 "ConditionalStore", ORE, OrigLoop);
5092 ChosenFactor = ScalarCost;
5095 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5096 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5097 << "LV: Vectorization seems to be not beneficial, "
5098 << "but was forced by a user.\n");
5099 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5100 return ChosenFactor;
5103 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5104 ElementCount VF) const {
5105 // Cross iteration phis such as reductions need special handling and are
5106 // currently unsupported.
5107 if (any_of(OrigLoop->getHeader()->phis(),
5108 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5109 return false;
5111 // Phis with uses outside of the loop require special handling and are
5112 // currently unsupported.
5113 for (const auto &Entry : Legal->getInductionVars()) {
5114 // Look for uses of the value of the induction at the last iteration.
5115 Value *PostInc =
5116 Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch());
5117 for (User *U : PostInc->users())
5118 if (!OrigLoop->contains(cast<Instruction>(U)))
5119 return false;
5120 // Look for uses of penultimate value of the induction.
5121 for (User *U : Entry.first->users())
5122 if (!OrigLoop->contains(cast<Instruction>(U)))
5123 return false;
5126 // Epilogue vectorization code has not been audited to ensure it handles
5127 // non-latch exits properly. It may be fine, but it needs to be audited
5128 // and tested.
5129 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5130 return false;
5132 return true;
5135 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5136 const ElementCount VF) const {
5137 // FIXME: We need a much better cost-model to take different parameters such
5138 // as register pressure, code size increase and cost of extra branches into
5139 // account. For now we apply a very crude heuristic and only consider loops
5140 // with vectorization factors larger than a certain value.
5142 // Allow the target to opt out entirely.
5143 if (!TTI.preferEpilogueVectorization())
5144 return false;
5146 // We also consider epilogue vectorization unprofitable for targets that don't
5147 // consider interleaving beneficial (e.g. MVE).
5148 if (TTI.getMaxInterleaveFactor(VF) <= 1)
5149 return false;
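// Minimal sketch of the check below (threshold assumed): with
// MainLoopVF = vscale x 4 and a tuning vscale of 4, the estimated width is
// 16, which meets an assumed minimum-VF threshold of 16, so epilogue
// vectorization is considered profitable.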
5151 unsigned Multiplier = 1;
5152 if (VF.isScalable())
5153 Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
5154 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5155 return true;
5156 return false;
5159 VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
5160 const ElementCount MainLoopVF, unsigned IC) {
5161 VectorizationFactor Result = VectorizationFactor::Disabled();
5162 if (!EnableEpilogueVectorization) {
5163 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5164 return Result;
5167 if (!CM.isScalarEpilogueAllowed()) {
5168 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5169 "epilogue is allowed.\n");
5170 return Result;
5173 // Not really a cost consideration, but check for unsupported cases here to
5174 // simplify the logic.
5175 if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
5176 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5177 "is not a supported candidate.\n");
5178 return Result;
5181 if (EpilogueVectorizationForceVF > 1) {
5182 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5183 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5184 if (hasPlanWithVF(ForcedEC))
5185 return {ForcedEC, 0, 0};
5186 else {
5187 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5188 "viable.\n");
5189 return Result;
5193 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5194 OrigLoop->getHeader()->getParent()->hasMinSize()) {
5195 LLVM_DEBUG(
5196 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5197 return Result;
5200 if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
5201 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5202 "this loop\n");
5203 return Result;
5206 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5207 // the main loop handles 8 lanes per iteration. We could still benefit from
5208 // vectorizing the epilogue loop with VF=4.
5209 ElementCount EstimatedRuntimeVF = MainLoopVF;
5210 if (MainLoopVF.isScalable()) {
5211 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5212 if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
5213 EstimatedRuntimeVF *= *VScale;
5216 ScalarEvolution &SE = *PSE.getSE();
5217 Type *TCType = Legal->getWidestInductionType();
5218 const SCEV *RemainingIterations = nullptr;
5219 for (auto &NextVF : ProfitableVFs) {
5220 // Skip candidate VFs without a corresponding VPlan.
5221 if (!hasPlanWithVF(NextVF.Width))
5222 continue;
5224 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5225 // vectors) or the VF of the main loop (fixed vectors).
5226 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5227 ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
5228 ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
5229 continue;
5231 // If NextVF is greater than the number of remaining iterations, the
5232 // epilogue loop would be dead. Skip such factors.
5233 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5234 // TODO: extend to support scalable VFs.
5235 if (!RemainingIterations) {
5236 const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
5237 RemainingIterations = SE.getURemExpr(
5238 TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
5240 if (SE.isKnownPredicate(
5241 CmpInst::ICMP_UGT,
5242 SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
5243 RemainingIterations))
5244 continue;
5247 if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
5248 Result = NextVF;
5251 if (Result != VectorizationFactor::Disabled())
5252 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5253 << Result.Width << "\n");
5254 return Result;
5257 std::pair<unsigned, unsigned>
5258 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5259 unsigned MinWidth = -1U;
5260 unsigned MaxWidth = 8;
5261 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5262 // For in-loop reductions, no element types are added to ElementTypesInLoop
5263 // if there are no loads/stores in the loop. In this case, check through the
5264 // reduction variables to determine the maximum width.
5265 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5266 // Reset MaxWidth so that we can find the smallest type used by recurrences
5267 // in the loop.
5268 MaxWidth = -1U;
5269 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5270 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5271 // When finding the min width used by the recurrence we need to account
5272 // for casts on the input operands of the recurrence.
5273 MaxWidth = std::min<unsigned>(
5274 MaxWidth, std::min<unsigned>(
5275 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5276 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5278 } else {
5279 for (Type *T : ElementTypesInLoop) {
5280 MinWidth = std::min<unsigned>(
5281 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5282 MaxWidth = std::max<unsigned>(
5283 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5286 return {MinWidth, MaxWidth};
5289 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5290 ElementTypesInLoop.clear();
5291 // For each block.
5292 for (BasicBlock *BB : TheLoop->blocks()) {
5293 // For each instruction in the loop.
5294 for (Instruction &I : BB->instructionsWithoutDebug()) {
5295 Type *T = I.getType();
5297 // Skip ignored values.
5298 if (ValuesToIgnore.count(&I))
5299 continue;
5301 // Only examine Loads, Stores and PHINodes.
5302 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5303 continue;
5305 // Examine PHI nodes that are reduction variables. Update the type to
5306 // account for the recurrence type.
5307 if (auto *PN = dyn_cast<PHINode>(&I)) {
5308 if (!Legal->isReductionVariable(PN))
5309 continue;
5310 const RecurrenceDescriptor &RdxDesc =
5311 Legal->getReductionVars().find(PN)->second;
5312 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5313 TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5314 RdxDesc.getRecurrenceType(),
5315 TargetTransformInfo::ReductionFlags()))
5316 continue;
5317 T = RdxDesc.getRecurrenceType();
5320 // Examine the stored values.
5321 if (auto *ST = dyn_cast<StoreInst>(&I))
5322 T = ST->getValueOperand()->getType();
5324 assert(T->isSized() &&
5325 "Expected the load/store/recurrence type to be sized");
5327 ElementTypesInLoop.insert(T);
5332 unsigned
5333 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5334 InstructionCost LoopCost) {
5335 // -- The interleave heuristics --
5336 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5337 // There are many micro-architectural considerations that we can't predict
5338 // at this level. For example, frontend pressure (on decode or fetch) due to
5339 // code size, or the number and capabilities of the execution ports.
5341 // We use the following heuristics to select the interleave count:
5342 // 1. If the code has reductions, then we interleave to break the cross
5343 // iteration dependency.
5344 // 2. If the loop is really small, then we interleave to reduce the loop
5345 // overhead.
5346 // 3. We don't interleave if we think that we will spill registers to memory
5347 // due to the increased register pressure.
5349 if (!isScalarEpilogueAllowed())
5350 return 1;
5352 // The dependence distance already bounds the elements processed per
5353 // iteration, so do not interleave.
5353 if (!Legal->isSafeForAnyVectorWidth())
5354 return 1;
5356 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5357 const bool HasReductions = !Legal->getReductionVars().empty();
5358 // Do not interleave loops with a relatively small known or estimated trip
5359 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5360 // enabled and the code has scalar reductions (HasReductions && VF == 1),
5361 // because with the above conditions interleaving can expose ILP and break
5362 // cross iteration dependences for reductions.
5363 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5364 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5365 return 1;
5367 // If we did not calculate the cost for VF (because the user selected the VF)
5368 // then we calculate the cost of VF here.
5369 if (LoopCost == 0) {
5370 LoopCost = expectedCost(VF).first;
5371 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5373 // Loop body is free and there is no need for interleaving.
5374 if (LoopCost == 0)
5375 return 1;
5378 RegisterUsage R = calculateRegisterUsage({VF})[0];
5379 // We divide by these constants so assume that we have at least one
5380 // instruction that uses at least one register.
5381 for (auto& pair : R.MaxLocalUsers) {
5382 pair.second = std::max(pair.second, 1U);
5385 // We calculate the interleave count using the following formula.
5386 // Subtract the number of loop invariants from the number of available
5387 // registers. These registers are used by all of the interleaved instances.
5388 // Next, divide the remaining registers by the number of registers that is
5389 // required by the loop, in order to estimate how many parallel instances
5390 // fit without causing spills. All of this is rounded down if necessary to be
5391 // a power of two. We want power of two interleave count to simplify any
5392 // addressing operations or alignment considerations.
5393 // We also want power of two interleave counts to ensure that the induction
5394 // variable of the vector loop wraps to zero, when tail is folded by masking;
5395 // this currently happens when OptForSize, in which case IC is set to 1 above.
5396 unsigned IC = UINT_MAX;
5398 for (auto& pair : R.MaxLocalUsers) {
5399 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5400 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5401 << " registers of "
5402 << TTI.getRegisterClassName(pair.first) << " register class\n");
5403 if (VF.isScalar()) {
5404 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5405 TargetNumRegisters = ForceTargetNumScalarRegs;
5406 } else {
5407 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5408 TargetNumRegisters = ForceTargetNumVectorRegs;
5410 unsigned MaxLocalUsers = pair.second;
5411 unsigned LoopInvariantRegs = 0;
5412 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5413 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5415 unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
5416 MaxLocalUsers);
5417 // Don't count the induction variable as interleaved.
5418 if (EnableIndVarRegisterHeur) {
5419 TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5420 std::max(1U, (MaxLocalUsers - 1)));
5423 IC = std::min(IC, TmpIC);
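// Example of the formula above (register counts assumed): with 32 vector
// registers, 2 loop-invariant values and 10 max local users, TmpIC is
// bit_floor((32 - 2) / 10) = 2; with the induction-variable heuristic it is
// bit_floor((32 - 2 - 1) / 9) = 2 as well.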
5426 // Clamp the interleave ranges to reasonable counts.
5427 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5429 // Check if the user has overridden the max.
5430 if (VF.isScalar()) {
5431 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5432 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5433 } else {
5434 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5435 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5438 unsigned EstimatedVF = VF.getKnownMinValue();
5439 if (VF.isScalable()) {
5440 if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
5441 EstimatedVF *= *VScale;
5443 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5445 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5446 if (KnownTC) {
5447 // If trip count is known we select between two prospective ICs, where
5448 // 1) the aggressive IC is capped by the trip count divided by VF
5449 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5450 // The final IC is selected in a way that the epilogue loop trip count is
5451 // minimized while maximizing the IC itself, so that we either run the
5452 // vector loop at least once if it generates a small epilogue loop, or else
5453 // we run the vector loop at least twice.
5455 unsigned InterleaveCountUB = bit_floor(
5456 std::max(1u, std::min(KnownTC / EstimatedVF, MaxInterleaveCount)));
5457 unsigned InterleaveCountLB = bit_floor(std::max(
5458 1u, std::min(KnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
5459 MaxInterleaveCount = InterleaveCountLB;
5461 if (InterleaveCountUB != InterleaveCountLB) {
5462 unsigned TailTripCountUB = (KnownTC % (EstimatedVF * InterleaveCountUB));
5463 unsigned TailTripCountLB = (KnownTC % (EstimatedVF * InterleaveCountLB));
5464 // If both produce the same scalar tail, maximize the IC to do the same
5465 // work in fewer vector loop iterations.
5466 if (TailTripCountUB == TailTripCountLB)
5467 MaxInterleaveCount = InterleaveCountUB;
5469 } else if (BestKnownTC) {
5470 // If the trip count is only an estimated compile-time constant, cap the
5471 // IC at the trip count divided by VF * 2, so that the vector
5472 // loop runs at least twice to make interleaving seem profitable when there
5473 // is an epilogue loop present. Since the exact trip count is not known, we
5474 // choose to be conservative in our IC estimate.
5475 MaxInterleaveCount = bit_floor(std::max(
5476 1u, std::min(*BestKnownTC / (EstimatedVF * 2), MaxInterleaveCount)));
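      // For example (hypothetical values): with an estimated trip count of 100,
      // EstimatedVF = 4 and a target maximum of 8, this gives
      // bit_floor(min(100 / 8, 8)) = bit_floor(8) = 8.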
5479 assert(MaxInterleaveCount > 0 &&
5480 "Maximum interleave count must be greater than 0");
5482 // Clamp the calculated IC to be between 1 and the max interleave count
5483 // that the target and trip count allow.
5484 if (IC > MaxInterleaveCount)
5485 IC = MaxInterleaveCount;
5486 else
5487 // Make sure IC is greater than 0.
5488 IC = std::max(1u, IC);
5490 assert(IC > 0 && "Interleave count must be greater than 0.");
5492 // Interleave if we vectorized this loop and there is a reduction that could
5493 // benefit from interleaving.
5494 if (VF.isVector() && HasReductions) {
5495 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5496 return IC;
5499 // For any scalar loop that either requires runtime checks or predication we
5500 // are better off leaving this to the unroller. Note that if we've already
5501 // vectorized the loop we will have done the runtime check and so interleaving
5502 // won't require further checks.
5503 bool ScalarInterleavingRequiresPredication =
5504 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5505 return Legal->blockNeedsPredication(BB);
5506 }));
5507 bool ScalarInterleavingRequiresRuntimePointerCheck =
5508 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5510 // We want to interleave small loops in order to reduce the loop overhead and
5511 // potentially expose ILP opportunities.
5512 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5513 << "LV: IC is " << IC << '\n'
5514 << "LV: VF is " << VF << '\n');
5515 const bool AggressivelyInterleaveReductions =
5516 TTI.enableAggressiveInterleaving(HasReductions);
5517 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5518 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5519 // We assume that the cost overhead is 1 and we use the cost model
5520 // to estimate the cost of the loop and interleave until the cost of the
5521 // loop overhead is about 5% of the cost of the loop.
5522 unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(
5523 SmallLoopCost / *LoopCost.getValue()));
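      // Illustrative numbers only: with LoopCost = 4 and SmallLoopCost = 20,
      // bit_floor(20 / 4) = bit_floor(5) = 4, so SmallIC = min(IC, 4) and the
      // assumed overhead of 1 is spread over roughly 4 * 4 = 16 units of loop
      // work per vector iteration.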
5525 // Interleave until store/load ports (estimated by max interleave count) are
5526 // saturated.
5527 unsigned NumStores = Legal->getNumStores();
5528 unsigned NumLoads = Legal->getNumLoads();
5529 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5530 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5532 // There is little point in interleaving for reductions containing selects
5533 // and compares when VF=1 since it may just create more overhead than it's
5534 // worth for loops with small trip counts. This is because we still have to
5535 // do the final reduction after the loop.
5536 bool HasSelectCmpReductions =
5537 HasReductions &&
5538 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5539 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5540 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5541 RdxDesc.getRecurrenceKind());
5543 if (HasSelectCmpReductions) {
5544 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5545 return 1;
5548 // If we have a scalar reduction (vector reductions are already dealt with
5549 // by this point), we can increase the critical path length if the loop
5550 // we're interleaving is inside another loop. For tree-wise reductions
5551 // set the limit to 2, and for ordered reductions it's best to disable
5552 // interleaving entirely.
5553 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5554 bool HasOrderedReductions =
5555 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5556 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5557 return RdxDesc.isOrdered();
5559 if (HasOrderedReductions) {
5560 LLVM_DEBUG(
5561 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5562 return 1;
5565 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5566 SmallIC = std::min(SmallIC, F);
5567 StoresIC = std::min(StoresIC, F);
5568 LoadsIC = std::min(LoadsIC, F);
5571 if (EnableLoadStoreRuntimeInterleave &&
5572 std::max(StoresIC, LoadsIC) > SmallIC) {
5573 LLVM_DEBUG(
5574 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5575 return std::max(StoresIC, LoadsIC);
5578 // If there are scalar reductions and TTI has enabled aggressive
5579 // interleaving for reductions, we will interleave to expose ILP.
5580 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5581 AggressivelyInterleaveReductions) {
5582 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5583 // Interleave no less than SmallIC, but not as aggressively as the normal IC,
5584 // to cover the rare situation where resources are too limited.
5585 return std::max(IC / 2, SmallIC);
5586 } else {
5587 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5588 return SmallIC;
5592 // Interleave if this is a large loop (small loops are already dealt with by
5593 // this point) that could benefit from interleaving.
5594 if (AggressivelyInterleaveReductions) {
5595 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5596 return IC;
5599 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5600 return 1;
5603 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5604 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5605 // This function calculates the register usage by measuring the highest number
5606 // of values that are alive at a single location. Obviously, this is a very
5607 // rough estimation. We scan the loop in topological order and
5608 // assign a number to each instruction. We use RPO to ensure that defs are
5609 // met before their users. We assume that each instruction that has in-loop
5610 // users starts an interval. We record every time that an in-loop value is
5611 // used, so we have a list of the first and last occurrences of each
5612 // instruction. Next, we transpose this data structure into a multi-map that
5613 // holds the list of intervals that *end* at a specific location. This multi-map
5614 // allows a single linear scan: we walk the instructions in order
5615 // and record each time that a new interval starts, by placing it in a set.
5616 // If we find this value in the multi-map then we remove it from the set.
5617 // The max register usage is the maximum size of the set.
5618 // We also search for instructions that are defined outside the loop, but are
5619 // used inside the loop. We need this number separately from the max-interval
5620 // usage number because when we unroll, loop-invariant values do not take
5621 // more registers.
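      // A minimal sketch of the idea with made-up values: if %a and %b are
      // defined early and both are last used by a later %c = add %a, %b, then
      // when %c is reached both intervals are still open, so two values are
      // live at once and the estimate for that register class is 2.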
5622 LoopBlocksDFS DFS(TheLoop);
5623 DFS.perform(LI);
5625 RegisterUsage RU;
5627 // Each 'key' in the map opens a new interval. The values
5628 // of the map are the index of the 'last seen' usage of the
5629 // instruction that is the key.
5630 using IntervalMap = DenseMap<Instruction *, unsigned>;
5632 // Maps instruction to its index.
5633 SmallVector<Instruction *, 64> IdxToInstr;
5634 // Marks the end of each interval.
5635 IntervalMap EndPoint;
5636 // Saves the set of instructions that are used in the loop.
5637 SmallPtrSet<Instruction *, 8> Ends;
5638 // Saves the list of values that are used in the loop but are defined outside
5639 // the loop (not including non-instruction values such as arguments and
5640 // constants).
5641 SmallSetVector<Instruction *, 8> LoopInvariants;
5643 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5644 for (Instruction &I : BB->instructionsWithoutDebug()) {
5645 IdxToInstr.push_back(&I);
5647 // Save the end location of each USE.
5648 for (Value *U : I.operands()) {
5649 auto *Instr = dyn_cast<Instruction>(U);
5651 // Ignore non-instruction values such as arguments, constants, etc.
5652 // FIXME: Might need some motivation why these values are ignored. If
5653 // for example an argument is used inside the loop it will increase the
5654 // register pressure (so shouldn't we add it to LoopInvariants?).
5655 if (!Instr)
5656 continue;
5658 // If this instruction is outside the loop then record it and continue.
5659 if (!TheLoop->contains(Instr)) {
5660 LoopInvariants.insert(Instr);
5661 continue;
5664 // Overwrite previous end points.
5665 EndPoint[Instr] = IdxToInstr.size();
5666 Ends.insert(Instr);
5671 // Saves the list of intervals that end with the index in 'key'.
5672 using InstrList = SmallVector<Instruction *, 2>;
5673 DenseMap<unsigned, InstrList> TransposeEnds;
5675 // Transpose the EndPoints to a list of values that end at each index.
5676 for (auto &Interval : EndPoint)
5677 TransposeEnds[Interval.second].push_back(Interval.first);
5679 SmallPtrSet<Instruction *, 8> OpenIntervals;
5680 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5681 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5683 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5685 const auto &TTICapture = TTI;
5686 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5687 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5688 return 0;
5689 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
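      // For instance, on a hypothetical target with 128-bit vector registers,
      // GetRegUsage(i32, ElementCount::getFixed(8)) queries
      // getRegUsageForType(<8 x i32>) and would typically report 2 registers.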
5692 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5693 Instruction *I = IdxToInstr[i];
5695 // Remove all of the instructions that end at this location.
5696 InstrList &List = TransposeEnds[i];
5697 for (Instruction *ToRemove : List)
5698 OpenIntervals.erase(ToRemove);
5700 // Ignore instructions that are never used within the loop.
5701 if (!Ends.count(I))
5702 continue;
5704 // Skip ignored values.
5705 if (ValuesToIgnore.count(I))
5706 continue;
5708 collectInLoopReductions();
5710 // For each VF find the maximum usage of registers.
5711 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5712 // Count the number of registers used, per register class, given all open
5713 // intervals.
5714 // Note that elements in this SmallMapVector will be default constructed
5715 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5716 // there is no previous entry for ClassID.
5717 SmallMapVector<unsigned, unsigned, 4> RegUsage;
5719 if (VFs[j].isScalar()) {
5720 for (auto *Inst : OpenIntervals) {
5721 unsigned ClassID =
5722 TTI.getRegisterClassForType(false, Inst->getType());
5723 // FIXME: The target might use more than one register for the type
5724 // even in the scalar case.
5725 RegUsage[ClassID] += 1;
5727 } else {
5728 collectUniformsAndScalars(VFs[j]);
5729 for (auto *Inst : OpenIntervals) {
5730 // Skip ignored values for VF > 1.
5731 if (VecValuesToIgnore.count(Inst))
5732 continue;
5733 if (isScalarAfterVectorization(Inst, VFs[j])) {
5734 unsigned ClassID =
5735 TTI.getRegisterClassForType(false, Inst->getType());
5736 // FIXME: The target might use more than one register for the type
5737 // even in the scalar case.
5738 RegUsage[ClassID] += 1;
5739 } else {
5740 unsigned ClassID =
5741 TTI.getRegisterClassForType(true, Inst->getType());
5742 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5747 for (auto& pair : RegUsage) {
5748 auto &Entry = MaxUsages[j][pair.first];
5749 Entry = std::max(Entry, pair.second);
5753 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5754 << OpenIntervals.size() << '\n');
5756 // Add the current instruction to the list of open intervals.
5757 OpenIntervals.insert(I);
5760 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5761 // Note that elements in this SmallMapVector will be default constructed
5762 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5763 // there is no previous entry for ClassID.
5764 SmallMapVector<unsigned, unsigned, 4> Invariant;
5766 for (auto *Inst : LoopInvariants) {
5767 // FIXME: The target might use more than one register for the type
5768 // even in the scalar case.
5769 bool IsScalar = all_of(Inst->users(), [&](User *U) {
5770 auto *I = cast<Instruction>(U);
5771 return TheLoop != LI->getLoopFor(I->getParent()) ||
5772 isScalarAfterVectorization(I, VFs[i]);
5775 ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i];
5776 unsigned ClassID =
5777 TTI.getRegisterClassForType(VF.isVector(), Inst->getType());
5778 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5781 LLVM_DEBUG({
5782 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5783 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5784 << " item\n";
5785 for (const auto &pair : MaxUsages[i]) {
5786 dbgs() << "LV(REG): RegisterClass: "
5787 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5788 << " registers\n";
5790 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5791 << " item\n";
5792 for (const auto &pair : Invariant) {
5793 dbgs() << "LV(REG): RegisterClass: "
5794 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5795 << " registers\n";
5799 RU.LoopInvariantRegs = Invariant;
5800 RU.MaxLocalUsers = MaxUsages[i];
5801 RUs[i] = RU;
5804 return RUs;
5807 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5808 ElementCount VF) {
5809 // TODO: Cost model for emulated masked load/store is completely
5810 // broken. This hack guides the cost model to use an artificially
5811 // high enough value to practically disable vectorization with such
5812 // operations, except where the previously deployed legality hack allowed
5813 // using very low cost values. This is to avoid regressions coming simply
5814 // from moving "masked load/store" check from legality to cost model.
5815 // Masked Load/Gather emulation was previously never allowed.
5816 // A limited amount of Masked Store/Scatter emulation was allowed.
5817 assert((isPredicatedInst(I)) &&
5818 "Expecting a scalar emulated instruction");
5819 return isa<LoadInst>(I) ||
5820 (isa<StoreInst>(I) &&
5821 NumPredStores > NumberOfStoresToPredicate);
5824 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5825 // If we aren't vectorizing the loop, or if we've already collected the
5826 // instructions to scalarize, there's nothing to do. Collection may already
5827 // have occurred if we have a user-selected VF and are now computing the
5828 // expected cost for interleaving.
5829 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF))
5830 return;
5832 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5833 // not profitable to scalarize any instructions, the presence of VF in the
5834 // map will indicate that we've analyzed it already.
5835 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5837 PredicatedBBsAfterVectorization[VF].clear();
5839 // Find all the instructions that are scalar with predication in the loop and
5840 // determine if it would be better to not if-convert the blocks they are in.
5841 // If so, we also record the instructions to scalarize.
5842 for (BasicBlock *BB : TheLoop->blocks()) {
5843 if (!blockNeedsPredicationForAnyReason(BB))
5844 continue;
5845 for (Instruction &I : *BB)
5846 if (isScalarWithPredication(&I, VF)) {
5847 ScalarCostsTy ScalarCosts;
5848 // Do not apply discount if scalable, because that would lead to
5849 // invalid scalarization costs.
5850 // Do not apply discount logic if hacked cost is needed
5851 // for emulated masked memrefs.
5852 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
5853 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5854 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5855 // Remember that BB will remain after vectorization.
5856 PredicatedBBsAfterVectorization[VF].insert(BB);
5861 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5862 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5863 assert(!isUniformAfterVectorization(PredInst, VF) &&
5864 "Instruction marked uniform-after-vectorization will be predicated");
5866 // Initialize the discount to zero, meaning that the scalar version and the
5867 // vector version cost the same.
5868 InstructionCost Discount = 0;
5870 // Holds instructions to analyze. The instructions we visit are mapped in
5871 // ScalarCosts. Those instructions are the ones that would be scalarized if
5872 // we find that the scalar version costs less.
5873 SmallVector<Instruction *, 8> Worklist;
5875 // Returns true if the given instruction can be scalarized.
5876 auto canBeScalarized = [&](Instruction *I) -> bool {
5877 // We only attempt to scalarize instructions forming a single-use chain
5878 // from the original predicated block that would otherwise be vectorized.
5879 // Although not strictly necessary, we give up on instructions we know will
5880 // already be scalar to avoid traversing chains that are unlikely to be
5881 // beneficial.
5882 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5883 isScalarAfterVectorization(I, VF))
5884 return false;
5886 // If the instruction is scalar with predication, it will be analyzed
5887 // separately. We ignore it within the context of PredInst.
5888 if (isScalarWithPredication(I, VF))
5889 return false;
5891 // If any of the instruction's operands are uniform after vectorization,
5892 // the instruction cannot be scalarized. This prevents, for example, a
5893 // masked load from being scalarized.
5895 // We assume we will only emit a value for lane zero of an instruction
5896 // marked uniform after vectorization, rather than VF identical values.
5897 // Thus, if we scalarize an instruction that uses a uniform, we would
5898 // create uses of values corresponding to the lanes we aren't emitting code
5899 // for. This behavior can be changed by allowing getScalarValue to clone
5900 // the lane zero values for uniforms rather than asserting.
5901 for (Use &U : I->operands())
5902 if (auto *J = dyn_cast<Instruction>(U.get()))
5903 if (isUniformAfterVectorization(J, VF))
5904 return false;
5906 // Otherwise, we can scalarize the instruction.
5907 return true;
5910 // Compute the expected cost discount from scalarizing the entire expression
5911 // feeding the predicated instruction. We currently only consider expressions
5912 // that are single-use instruction chains.
5913 Worklist.push_back(PredInst);
5914 while (!Worklist.empty()) {
5915 Instruction *I = Worklist.pop_back_val();
5917 // If we've already analyzed the instruction, there's nothing to do.
5918 if (ScalarCosts.contains(I))
5919 continue;
5921 // Compute the cost of the vector instruction. Note that this cost already
5922 // includes the scalarization overhead of the predicated instruction.
5923 InstructionCost VectorCost = getInstructionCost(I, VF).first;
5925 // Compute the cost of the scalarized instruction. This cost is the cost of
5926 // the instruction as if it wasn't if-converted and instead remained in the
5927 // predicated block. We will scale this cost by block probability after
5928 // computing the scalarization overhead.
5929 InstructionCost ScalarCost =
5930 VF.getFixedValue() *
5931 getInstructionCost(I, ElementCount::getFixed(1)).first;
5933 // Compute the scalarization overhead of needed insertelement instructions
5934 // and phi nodes.
5935 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5936 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5937 ScalarCost += TTI.getScalarizationOverhead(
5938 cast<VectorType>(ToVectorTy(I->getType(), VF)),
5939 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5940 /*Extract*/ false, CostKind);
5941 ScalarCost +=
5942 VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
5945 // Compute the scalarization overhead of needed extractelement
5946 // instructions. For each of the instruction's operands, if the operand can
5947 // be scalarized, add it to the worklist; otherwise, account for the
5948 // overhead.
5949 for (Use &U : I->operands())
5950 if (auto *J = dyn_cast<Instruction>(U.get())) {
5951 assert(VectorType::isValidElementType(J->getType()) &&
5952 "Instruction has non-scalar type");
5953 if (canBeScalarized(J))
5954 Worklist.push_back(J);
5955 else if (needsExtract(J, VF)) {
5956 ScalarCost += TTI.getScalarizationOverhead(
5957 cast<VectorType>(ToVectorTy(J->getType(), VF)),
5958 APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
5959 /*Extract*/ true, CostKind);
5963 // Scale the total scalar cost by block probability.
5964 ScalarCost /= getReciprocalPredBlockProb();
5966 // Compute the discount. A non-negative discount means the vector version
5967 // of the instruction costs more, and scalarizing would be beneficial.
5968 Discount += VectorCost - ScalarCost;
5969 ScalarCosts[I] = ScalarCost;
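      // Hypothetical costs to illustrate the bookkeeping: if VectorCost = 10
      // and the scaled ScalarCost = 8, Discount grows by 2, i.e. the
      // scalarized, predicated form is expected to be 2 units cheaper for this
      // instruction.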
5972 return Discount;
5975 LoopVectorizationCostModel::VectorizationCostTy
5976 LoopVectorizationCostModel::expectedCost(
5977 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5978 VectorizationCostTy Cost;
5980 // For each block.
5981 for (BasicBlock *BB : TheLoop->blocks()) {
5982 VectorizationCostTy BlockCost;
5984 // For each instruction in the old loop.
5985 for (Instruction &I : BB->instructionsWithoutDebug()) {
5986 // Skip ignored values.
5987 if (ValuesToIgnore.count(&I) ||
5988 (VF.isVector() && VecValuesToIgnore.count(&I)))
5989 continue;
5991 VectorizationCostTy C = getInstructionCost(&I, VF);
5993 // Check if we should override the cost.
5994 if (C.first.isValid() &&
5995 ForceTargetInstructionCost.getNumOccurrences() > 0)
5996 C.first = InstructionCost(ForceTargetInstructionCost);
5998 // Keep a list of instructions with invalid costs.
5999 if (Invalid && !C.first.isValid())
6000 Invalid->emplace_back(&I, VF);
6002 BlockCost.first += C.first;
6003 BlockCost.second |= C.second;
6004 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6005 << " for VF " << VF << " For instruction: " << I
6006 << '\n');
6009 // If we are vectorizing a predicated block, it will have been
6010 // if-converted. This means that the block's instructions (aside from
6011 // stores and instructions that may divide by zero) will now be
6012 // unconditionally executed. For the scalar case, we may not always execute
6013 // the predicated block, if it is an if-else block. Thus, scale the block's
6014 // cost by the probability of executing it. blockNeedsPredication from
6015 // Legal is used so as to not include all blocks in tail folded loops.
6016 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6017 BlockCost.first /= getReciprocalPredBlockProb();
6019 Cost.first += BlockCost.first;
6020 Cost.second |= BlockCost.second;
6023 return Cost;
6026 /// Gets Address Access SCEV after verifying that the access pattern
6027 /// is loop invariant except the induction variable dependence.
6029 /// This SCEV can be sent to the Target in order to estimate the address
6030 /// calculation cost.
6031 static const SCEV *getAddressAccessSCEV(
6032 Value *Ptr,
6033 LoopVectorizationLegality *Legal,
6034 PredicatedScalarEvolution &PSE,
6035 const Loop *TheLoop) {
6037 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6038 if (!Gep)
6039 return nullptr;
6041 // We are looking for a gep with all loop invariant indices except for one
6042 // which should be an induction variable.
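      // For example, a GEP of the form (%base, %inv0, %iv, %inv1), with
      // loop-invariant %inv0/%inv1 and induction variable %iv, is accepted;
      // any index that is neither loop-invariant nor an induction variable
      // makes this function return nullptr.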
6043 auto SE = PSE.getSE();
6044 unsigned NumOperands = Gep->getNumOperands();
6045 for (unsigned i = 1; i < NumOperands; ++i) {
6046 Value *Opd = Gep->getOperand(i);
6047 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6048 !Legal->isInductionVariable(Opd))
6049 return nullptr;
6052 // Now we know we have a GEP like ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6053 return PSE.getSCEV(Ptr);
6056 InstructionCost
6057 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6058 ElementCount VF) {
6059 assert(VF.isVector() &&
6060 "Scalarization cost of instruction implies vectorization.");
6061 if (VF.isScalable())
6062 return InstructionCost::getInvalid();
6064 Type *ValTy = getLoadStoreType(I);
6065 auto SE = PSE.getSE();
6067 unsigned AS = getLoadStoreAddressSpace(I);
6068 Value *Ptr = getLoadStorePointerOperand(I);
6069 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6070 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6071 // that it is being called from this specific place.
6073 // Figure out whether the access is strided and get the stride value
6074 // if it's known at compile time.
6075 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6077 // Get the cost of the scalar memory instruction and address computation.
6078 InstructionCost Cost =
6079 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6081 // Don't pass *I here, since it is scalar but will actually be part of a
6082 // vectorized loop where the user of it is a vectorized instruction.
6083 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6084 const Align Alignment = getLoadStoreAlignment(I);
6085 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6086 ValTy->getScalarType(),
6087 Alignment, AS, CostKind);
6089 // Get the overhead of the extractelement and insertelement instructions
6090 // we might create due to scalarization.
6091 Cost += getScalarizationOverhead(I, VF, CostKind);
6093 // If we have a predicated load/store, it will need extra i1 extracts and
6094 // conditional branches, but may not be executed for each vector lane. Scale
6095 // the cost by the probability of executing the predicated block.
6096 if (isPredicatedInst(I)) {
6097 Cost /= getReciprocalPredBlockProb();
6099 // Add the cost of an i1 extract and a branch
6100 auto *Vec_i1Ty =
6101 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6102 Cost += TTI.getScalarizationOverhead(
6103 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6104 /*Insert=*/false, /*Extract=*/true, CostKind);
6105 Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6107 if (useEmulatedMaskMemRefHack(I, VF))
6108 // Artificially setting to a high enough value to practically disable
6109 // vectorization with such operations.
6110 Cost = 3000000;
6113 return Cost;
6116 InstructionCost
6117 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6118 ElementCount VF) {
6119 Type *ValTy = getLoadStoreType(I);
6120 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6121 Value *Ptr = getLoadStorePointerOperand(I);
6122 unsigned AS = getLoadStoreAddressSpace(I);
6123 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6124 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6126 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6127 "Stride should be 1 or -1 for consecutive memory access");
6128 const Align Alignment = getLoadStoreAlignment(I);
6129 InstructionCost Cost = 0;
6130 if (Legal->isMaskRequired(I)) {
6131 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6132 CostKind);
6133 } else {
6134 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6135 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6136 CostKind, OpInfo, I);
6139 bool Reverse = ConsecutiveStride < 0;
6140 if (Reverse)
6141 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6142 std::nullopt, CostKind, 0);
6143 return Cost;
6146 InstructionCost
6147 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6148 ElementCount VF) {
6149 assert(Legal->isUniformMemOp(*I, VF));
6151 Type *ValTy = getLoadStoreType(I);
6152 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6153 const Align Alignment = getLoadStoreAlignment(I);
6154 unsigned AS = getLoadStoreAddressSpace(I);
6155 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6156 if (isa<LoadInst>(I)) {
6157 return TTI.getAddressComputationCost(ValTy) +
6158 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6159 CostKind) +
6160 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6162 StoreInst *SI = cast<StoreInst>(I);
6164 bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
6165 return TTI.getAddressComputationCost(ValTy) +
6166 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6167 CostKind) +
6168 (isLoopInvariantStoreValue
6170 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6171 CostKind, VF.getKnownMinValue() - 1));
6174 InstructionCost
6175 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6176 ElementCount VF) {
6177 Type *ValTy = getLoadStoreType(I);
6178 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6179 const Align Alignment = getLoadStoreAlignment(I);
6180 const Value *Ptr = getLoadStorePointerOperand(I);
6182 return TTI.getAddressComputationCost(VectorTy) +
6183 TTI.getGatherScatterOpCost(
6184 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6185 TargetTransformInfo::TCK_RecipThroughput, I);
6188 InstructionCost
6189 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6190 ElementCount VF) {
6191 Type *ValTy = getLoadStoreType(I);
6192 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6193 unsigned AS = getLoadStoreAddressSpace(I);
6194 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6196 auto Group = getInterleavedAccessGroup(I);
6197 assert(Group && "Fail to get an interleaved access group.");
6199 unsigned InterleaveFactor = Group->getFactor();
6200 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6202 // Holds the indices of existing members in the interleaved group.
6203 SmallVector<unsigned, 4> Indices;
6204 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6205 if (Group->getMember(IF))
6206 Indices.push_back(IF);
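      // For instance, a hypothetical factor-4 load group with members only at
      // positions 0 and 2 yields Indices = {0, 2}, and with VF = 4 the whole
      // group is costed on the wide type <16 x ValTy>.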
6208 // Calculate the cost of the whole interleaved group.
6209 bool UseMaskForGaps =
6210 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6211 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6212 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6213 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6214 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6216 if (Group->isReverse()) {
6217 // TODO: Add support for reversed masked interleaved access.
6218 assert(!Legal->isMaskRequired(I) &&
6219 "Reverse masked interleaved access not supported.");
6220 Cost += Group->getNumMembers() *
6221 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6222 std::nullopt, CostKind, 0);
6224 return Cost;
6227 std::optional<InstructionCost>
6228 LoopVectorizationCostModel::getReductionPatternCost(
6229 Instruction *I, ElementCount VF, Type *Ty,
6230 TTI::TargetCostKind CostKind) const {
6231 using namespace llvm::PatternMatch;
6232 // Early exit for no inloop reductions
6233 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6234 return std::nullopt;
6235 auto *VectorTy = cast<VectorType>(Ty);
6237 // We are looking for one of the following patterns, finding the minimal acceptable cost:
6238 // reduce(mul(ext(A), ext(B))) or
6239 // reduce(mul(A, B)) or
6240 // reduce(ext(A)) or
6241 // reduce(A).
6242 // The basic idea is that we walk down the tree to do that, finding the root
6243 // reduction instruction in InLoopReductionImmediateChains. From there we find
6244 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6245 // of the components. If the reduction cost is lower, we return it for the
6246 // reduction instruction and 0 for the other instructions in the pattern. If
6247 // it is not, we return an invalid cost specifying the original cost method
6248 // should be used.
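      // As a sketch with invented costs: for reduce.add(ext(mul(ext(A), ext(B)))),
      // if the fused multiply-accumulate reduction costs 4 while the separate
      // ext/mul/reduce components add up to 10, the reduction instruction is
      // reported at cost 4 and the other instructions in the pattern at cost 0.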
6249 Instruction *RetI = I;
6250 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6251 if (!RetI->hasOneUser())
6252 return std::nullopt;
6253 RetI = RetI->user_back();
6256 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6257 RetI->user_back()->getOpcode() == Instruction::Add) {
6258 RetI = RetI->user_back();
6261 // Test if the found instruction is a reduction, and if not return an invalid
6262 // cost specifying the parent to use the original cost modelling.
6263 if (!InLoopReductionImmediateChains.count(RetI))
6264 return std::nullopt;
6266 // Find the reduction this chain is a part of and calculate the basic cost of
6267 // the reduction on its own.
6268 Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
6269 Instruction *ReductionPhi = LastChain;
6270 while (!isa<PHINode>(ReductionPhi))
6271 ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
6273 const RecurrenceDescriptor &RdxDesc =
6274 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6276 InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6277 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6279 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6280 // normal fmul instruction to the cost of the fadd reduction.
6281 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6282 BaseCost +=
6283 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6285 // If we're using ordered reductions then we can just return the base cost
6286 // here, since getArithmeticReductionCost calculates the full ordered
6287 // reduction cost when FP reassociation is not allowed.
6288 if (useOrderedReductions(RdxDesc))
6289 return BaseCost;
6291 // Get the operand that was not the reduction chain and match it to one of the
6292 // patterns, returning the better cost if it is found.
6293 Instruction *RedOp = RetI->getOperand(1) == LastChain
6294 ? dyn_cast<Instruction>(RetI->getOperand(0))
6295 : dyn_cast<Instruction>(RetI->getOperand(1));
6297 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6299 Instruction *Op0, *Op1;
6300 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6301 match(RedOp,
6302 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6303 match(Op0, m_ZExtOrSExt(m_Value())) &&
6304 Op0->getOpcode() == Op1->getOpcode() &&
6305 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6306 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6307 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6309 // Matched reduce.add(ext(mul(ext(A), ext(B))))
6310 // Note that the extend opcodes need to all match, or if A==B they will have
6311 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6312 // which is equally fine.
6313 bool IsUnsigned = isa<ZExtInst>(Op0);
6314 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6315 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6317 InstructionCost ExtCost =
6318 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6319 TTI::CastContextHint::None, CostKind, Op0);
6320 InstructionCost MulCost =
6321 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6322 InstructionCost Ext2Cost =
6323 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6324 TTI::CastContextHint::None, CostKind, RedOp);
6326 InstructionCost RedCost = TTI.getMulAccReductionCost(
6327 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6329 if (RedCost.isValid() &&
6330 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6331 return I == RetI ? RedCost : 0;
6332 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6333 !TheLoop->isLoopInvariant(RedOp)) {
6334 // Matched reduce(ext(A))
6335 bool IsUnsigned = isa<ZExtInst>(RedOp);
6336 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6337 InstructionCost RedCost = TTI.getExtendedReductionCost(
6338 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6339 RdxDesc.getFastMathFlags(), CostKind);
6341 InstructionCost ExtCost =
6342 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6343 TTI::CastContextHint::None, CostKind, RedOp);
6344 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6345 return I == RetI ? RedCost : 0;
6346 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6347 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6348 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6349 Op0->getOpcode() == Op1->getOpcode() &&
6350 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6351 bool IsUnsigned = isa<ZExtInst>(Op0);
6352 Type *Op0Ty = Op0->getOperand(0)->getType();
6353 Type *Op1Ty = Op1->getOperand(0)->getType();
6354 Type *LargestOpTy =
6355 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6356 : Op0Ty;
6357 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6359 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6360 // different sizes. We take the largest type as the ext to reduce, and add
6361 // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
6362 InstructionCost ExtCost0 = TTI.getCastInstrCost(
6363 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6364 TTI::CastContextHint::None, CostKind, Op0);
6365 InstructionCost ExtCost1 = TTI.getCastInstrCost(
6366 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6367 TTI::CastContextHint::None, CostKind, Op1);
6368 InstructionCost MulCost =
6369 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6371 InstructionCost RedCost = TTI.getMulAccReductionCost(
6372 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6373 InstructionCost ExtraExtCost = 0;
6374 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6375 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6376 ExtraExtCost = TTI.getCastInstrCost(
6377 ExtraExtOp->getOpcode(), ExtType,
6378 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6379 TTI::CastContextHint::None, CostKind, ExtraExtOp);
6382 if (RedCost.isValid() &&
6383 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6384 return I == RetI ? RedCost : 0;
6385 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6386 // Matched reduce.add(mul())
6387 InstructionCost MulCost =
6388 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6390 InstructionCost RedCost = TTI.getMulAccReductionCost(
6391 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6393 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6394 return I == RetI ? RedCost : 0;
6398 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6401 InstructionCost
6402 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6403 ElementCount VF) {
6404 // Calculate scalar cost only. Vectorization cost should be ready at this
6405 // moment.
6406 if (VF.isScalar()) {
6407 Type *ValTy = getLoadStoreType(I);
6408 const Align Alignment = getLoadStoreAlignment(I);
6409 unsigned AS = getLoadStoreAddressSpace(I);
6411 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6412 return TTI.getAddressComputationCost(ValTy) +
6413 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6414 TTI::TCK_RecipThroughput, OpInfo, I);
6416 return getWideningCost(I, VF);
6419 LoopVectorizationCostModel::VectorizationCostTy
6420 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6421 ElementCount VF) {
6422 // If we know that this instruction will remain uniform, check the cost of
6423 // the scalar version.
6424 if (isUniformAfterVectorization(I, VF))
6425 VF = ElementCount::getFixed(1);
6427 if (VF.isVector() && isProfitableToScalarize(I, VF))
6428 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6430 // Forced scalars do not have any scalarization overhead.
6431 auto ForcedScalar = ForcedScalars.find(VF);
6432 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6433 auto InstSet = ForcedScalar->second;
6434 if (InstSet.count(I))
6435 return VectorizationCostTy(
6436 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6437 VF.getKnownMinValue()),
6438 false);
6441 Type *VectorTy;
6442 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6444 bool TypeNotScalarized = false;
6445 if (VF.isVector() && VectorTy->isVectorTy()) {
6446 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6447 if (VF.isScalable())
6448 // <vscale x 1 x iN> is assumed to be profitable over iN because
6449 // scalable registers are a distinct register class from scalar ones.
6450 // If we ever find a target which wants to lower scalable vectors
6451 // back to scalars, we'll need to update this code to explicitly
6452 // ask TTI about the register class uses for each part.
6453 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6454 else
6455 TypeNotScalarized = NumParts < VF.getKnownMinValue();
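      // E.g., on a hypothetical target with 128-bit vectors: an i32 result at
      // VF = 4 legalizes to 1 part (1 < 4, profitably vectorized), and an i64
      // result at VF = 8 legalizes to 4 parts, still fewer than 8.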
6456 } else
6457 C = InstructionCost::getInvalid();
6459 return VectorizationCostTy(C, TypeNotScalarized);
6462 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6463 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6465 // There is no mechanism yet to create a scalable scalarization loop,
6466 // so this is currently Invalid.
6467 if (VF.isScalable())
6468 return InstructionCost::getInvalid();
6470 if (VF.isScalar())
6471 return 0;
6473 InstructionCost Cost = 0;
6474 Type *RetTy = ToVectorTy(I->getType(), VF);
6475 if (!RetTy->isVoidTy() &&
6476 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6477 Cost += TTI.getScalarizationOverhead(
6478 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6479 /*Insert*/ true,
6480 /*Extract*/ false, CostKind);
6482 // Some targets keep addresses scalar.
6483 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6484 return Cost;
6486 // Some targets support efficient element stores.
6487 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6488 return Cost;
6490 // Collect operands to consider.
6491 CallInst *CI = dyn_cast<CallInst>(I);
6492 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6494 // Skip operands that do not require extraction/scalarization and do not incur
6495 // any overhead.
6496 SmallVector<Type *> Tys;
6497 for (auto *V : filterExtractingOperands(Ops, VF))
6498 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6499 return Cost + TTI.getOperandsScalarizationOverhead(
6500 filterExtractingOperands(Ops, VF), Tys, CostKind);
6503 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6504 if (VF.isScalar())
6505 return;
6506 NumPredStores = 0;
6507 for (BasicBlock *BB : TheLoop->blocks()) {
6508 // For each instruction in the old loop.
6509 for (Instruction &I : *BB) {
6510 Value *Ptr = getLoadStorePointerOperand(&I);
6511 if (!Ptr)
6512 continue;
6514 // TODO: We should generate better code and update the cost model for
6515 // predicated uniform stores. Today they are treated as any other
6516 // predicated store (see added test cases in
6517 // invariant-store-vectorization.ll).
6518 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6519 NumPredStores++;
6521 if (Legal->isUniformMemOp(I, VF)) {
6522 auto isLegalToScalarize = [&]() {
6523 if (!VF.isScalable())
6524 // Scalarization of fixed length vectors "just works".
6525 return true;
6527 // We have dedicated lowering for unpredicated uniform loads and
6528 // stores. Note that even with tail folding we know that at least
6529 // one lane is active (i.e. generalized predication is not possible
6530 // here), and the logic below depends on this fact.
6531 if (!foldTailByMasking())
6532 return true;
6534 // For scalable vectors, a uniform memop load is always
6535 // uniform-by-parts and we know how to scalarize that.
6536 if (isa<LoadInst>(I))
6537 return true;
6539 // A uniform store isn't necessarily uniform-by-parts,
6540 // and we can't assume scalarization.
6541 auto &SI = cast<StoreInst>(I);
6542 return TheLoop->isLoopInvariant(SI.getValueOperand());
6545 const InstructionCost GatherScatterCost =
6546 isLegalGatherOrScatter(&I, VF) ?
6547 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6549 // Load: Scalar load + broadcast
6550 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6551 // FIXME: This cost is a significant under-estimate for tail folded
6552 // memory ops.
6553 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6554 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6556 // Choose the better solution for the current VF. Note that Invalid
6557 // costs compare as maximally large. If both are invalid, we get
6558 // a scalable invalid cost, which signals a failure and a vectorization abort.
6559 if (GatherScatterCost < ScalarizationCost)
6560 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6561 else
6562 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6563 continue;
6566 // We assume that widening is the best solution when possible.
6567 if (memoryInstructionCanBeWidened(&I, VF)) {
6568 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6569 int ConsecutiveStride = Legal->isConsecutivePtr(
6570 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6571 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6572 "Expected consecutive stride.");
6573 InstWidening Decision =
6574 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6575 setWideningDecision(&I, VF, Decision, Cost);
6576 continue;
6579 // Choose between Interleaving, Gather/Scatter or Scalarization.
6580 InstructionCost InterleaveCost = InstructionCost::getInvalid();
6581 unsigned NumAccesses = 1;
6582 if (isAccessInterleaved(&I)) {
6583 auto Group = getInterleavedAccessGroup(&I);
6584 assert(Group && "Fail to get an interleaved access group.");
6586 // Make one decision for the whole group.
6587 if (getWideningDecision(&I, VF) != CM_Unknown)
6588 continue;
6590 NumAccesses = Group->getNumMembers();
6591 if (interleavedAccessCanBeWidened(&I, VF))
6592 InterleaveCost = getInterleaveGroupCost(&I, VF);
6595 InstructionCost GatherScatterCost =
6596 isLegalGatherOrScatter(&I, VF)
6597 ? getGatherScatterCost(&I, VF) * NumAccesses
6598 : InstructionCost::getInvalid();
6600 InstructionCost ScalarizationCost =
6601 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6603 // Choose better solution for the current VF,
6604 // write down this decision and use it during vectorization.
6605 InstructionCost Cost;
6606 InstWidening Decision;
6607 if (InterleaveCost <= GatherScatterCost &&
6608 InterleaveCost < ScalarizationCost) {
6609 Decision = CM_Interleave;
6610 Cost = InterleaveCost;
6611 } else if (GatherScatterCost < ScalarizationCost) {
6612 Decision = CM_GatherScatter;
6613 Cost = GatherScatterCost;
6614 } else {
6615 Decision = CM_Scalarize;
6616 Cost = ScalarizationCost;
6618 // If the instruction belongs to an interleave group, the whole group
6619 // receives the same decision. The whole group receives the cost, but
6620 // the cost will actually be assigned to one instruction.
6621 if (auto Group = getInterleavedAccessGroup(&I))
6622 setWideningDecision(Group, VF, Decision, Cost);
6623 else
6624 setWideningDecision(&I, VF, Decision, Cost);
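      // Purely illustrative costs: with InterleaveCost = 8, GatherScatterCost =
      // 12 and ScalarizationCost = 20 the comparison above picks CM_Interleave
      // at cost 8; if interleaving were not possible (invalid cost), the
      // gather/scatter path would win at cost 12.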
6628 // Make sure that any load of address and any other address computation
6629 // remains scalar unless there is gather/scatter support. This avoids
6630 // inevitable extracts into address registers, and also has the benefit of
6631 // activating LSR more, since that pass can't optimize vectorized
6632 // addresses.
6633 if (TTI.prefersVectorizedAddressing())
6634 return;
6636 // Start with all scalar pointer uses.
6637 SmallPtrSet<Instruction *, 8> AddrDefs;
6638 for (BasicBlock *BB : TheLoop->blocks())
6639 for (Instruction &I : *BB) {
6640 Instruction *PtrDef =
6641 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6642 if (PtrDef && TheLoop->contains(PtrDef) &&
6643 getWideningDecision(&I, VF) != CM_GatherScatter)
6644 AddrDefs.insert(PtrDef);
6647 // Add all instructions used to generate the addresses.
6648 SmallVector<Instruction *, 4> Worklist;
6649 append_range(Worklist, AddrDefs);
6650 while (!Worklist.empty()) {
6651 Instruction *I = Worklist.pop_back_val();
6652 for (auto &Op : I->operands())
6653 if (auto *InstOp = dyn_cast<Instruction>(Op))
6654 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6655 AddrDefs.insert(InstOp).second)
6656 Worklist.push_back(InstOp);
6659 for (auto *I : AddrDefs) {
6660 if (isa<LoadInst>(I)) {
6661 // Setting the desired widening decision should ideally be handled by
6662 // the cost functions, but since this involves the task of finding out
6663 // if the loaded register is involved in an address computation, it is
6664 // instead changed here when we know this is the case.
6665 InstWidening Decision = getWideningDecision(I, VF);
6666 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6667 // Scalarize a widened load of address.
6668 setWideningDecision(
6669 I, VF, CM_Scalarize,
6670 (VF.getKnownMinValue() *
6671 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6672 else if (auto Group = getInterleavedAccessGroup(I)) {
6673 // Scalarize an interleave group of address loads.
6674 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6675 if (Instruction *Member = Group->getMember(I))
6676 setWideningDecision(
6677 Member, VF, CM_Scalarize,
6678 (VF.getKnownMinValue() *
6679 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6682 } else
6683 // Make sure I gets scalarized and a cost estimate without
6684 // scalarization overhead.
6685 ForcedScalars[VF].insert(I);
6689 void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6690 assert(!VF.isScalar() &&
6691 "Trying to set a vectorization decision for a scalar VF");
6693 for (BasicBlock *BB : TheLoop->blocks()) {
6694 // For each instruction in the old loop.
6695 for (Instruction &I : *BB) {
6696 CallInst *CI = dyn_cast<CallInst>(&I);
6698 if (!CI)
6699 continue;
6701 InstructionCost ScalarCost = InstructionCost::getInvalid();
6702 InstructionCost VectorCost = InstructionCost::getInvalid();
6703 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6704 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6706 Function *ScalarFunc = CI->getCalledFunction();
6707 Type *ScalarRetTy = CI->getType();
6708 SmallVector<Type *, 4> Tys, ScalarTys;
6709 bool MaskRequired = Legal->isMaskRequired(CI);
6710 for (auto &ArgOp : CI->args())
6711 ScalarTys.push_back(ArgOp->getType());
6713 // Compute corresponding vector type for return value and arguments.
6714 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
6715 for (Type *ScalarTy : ScalarTys)
6716 Tys.push_back(ToVectorTy(ScalarTy, VF));
6718 // An in-loop reduction using an fmuladd intrinsic is a special case;
6719 // we don't want the normal cost for that intrinsic.
6720 if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
6721 if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
6722 setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
6723 getVectorIntrinsicIDForCall(CI, TLI),
6724 std::nullopt, *RedCost);
6725 continue;
6728 // Estimate cost of scalarized vector call. The source operands are
6729 // assumed to be vectors, so we need to extract individual elements from
6730 // them, execute VF scalar calls, and then gather the results into the
6731 // vector return value.
6732 InstructionCost ScalarCallCost =
6733 TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
6735 // Compute costs of unpacking argument values for the scalar calls and
6736 // packing the return values to a vector.
6737 InstructionCost ScalarizationCost =
6738 getScalarizationOverhead(CI, VF, CostKind);
6740 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6742 // Find the cost of vectorizing the call, if we can find a suitable
6743 // vector variant of the function.
6744 bool UsesMask = false;
6745 VFInfo FuncInfo;
6746 Function *VecFunc = nullptr;
6747 // Search through any available variants for one we can use at this VF.
6748 for (VFInfo &Info : VFDatabase::getMappings(*CI)) {
6749 // Must match requested VF.
6750 if (Info.Shape.VF != VF)
6751 continue;
6753 // Must take a mask argument if one is required
6754 if (MaskRequired && !Info.isMasked())
6755 continue;
6757 // Check that all parameter kinds are supported
6758 bool ParamsOk = true;
6759 for (VFParameter Param : Info.Shape.Parameters) {
6760 switch (Param.ParamKind) {
6761 case VFParamKind::Vector:
6762 break;
6763 case VFParamKind::OMP_Uniform: {
6764 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6765 // Make sure the scalar parameter in the loop is invariant.
6766 if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam),
6767 TheLoop))
6768 ParamsOk = false;
6769 break;
6771 case VFParamKind::OMP_Linear: {
6772 Value *ScalarParam = CI->getArgOperand(Param.ParamPos);
6773 // Find the stride for the scalar parameter in this loop and see if
6774 // it matches the stride for the variant.
6775 // TODO: do we need to figure out the cost of an extract to get the
6776 // first lane? Or do we hope that it will be folded away?
6777 ScalarEvolution *SE = PSE.getSE();
6778 const auto *SAR =
6779 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
6781 if (!SAR || SAR->getLoop() != TheLoop) {
6782 ParamsOk = false;
6783 break;
6786 const SCEVConstant *Step =
6787 dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));
6789 if (!Step ||
6790 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6791 ParamsOk = false;
6793 break;
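      // E.g., for a variant declared with a linear step of 4, a scalar argument
      // whose SCEV for this loop is the affine recurrence {%start,+,4} is
      // acceptable, while a step of 2 or a non-affine argument rejects this
      // variant.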
6795 case VFParamKind::GlobalPredicate:
6796 UsesMask = true;
6797 break;
6798 default:
6799 ParamsOk = false;
6800 break;
6804 if (!ParamsOk)
6805 continue;
6807 // Found a suitable candidate, stop here.
6808 VecFunc = CI->getModule()->getFunction(Info.VectorName);
6809 FuncInfo = Info;
6810 break;
6813 // Add in the cost of synthesizing a mask if one wasn't required.
6814 InstructionCost MaskCost = 0;
6815 if (VecFunc && UsesMask && !MaskRequired)
6816 MaskCost = TTI.getShuffleCost(
6817 TargetTransformInfo::SK_Broadcast,
6818 VectorType::get(IntegerType::getInt1Ty(
6819 VecFunc->getFunctionType()->getContext()),
6820 VF));
6822 if (TLI && VecFunc && !CI->isNoBuiltin())
6823 VectorCost =
6824 TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
6826 // Find the cost of an intrinsic; some targets may have instructions that
6827 // perform the operation without needing an actual call.
6828 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6829 if (IID != Intrinsic::not_intrinsic)
6830 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6832 InstructionCost Cost = ScalarCost;
6833 InstWidening Decision = CM_Scalarize;
6835 if (VectorCost <= Cost) {
6836 Cost = VectorCost;
6837 Decision = CM_VectorCall;
6840 if (IntrinsicCost <= Cost) {
6841 Cost = IntrinsicCost;
6842 Decision = CM_IntrinsicCall;
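      // Sketch with invented costs: ScalarCost = 40, VectorCost = 12 and
      // IntrinsicCost = 6 would switch the decision first to CM_VectorCall and
      // then to CM_IntrinsicCall, which is what gets recorded below.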
6845 setCallWideningDecision(CI, VF, Decision, VecFunc, IID,
6846 FuncInfo.getParamIndexForOptionalMask(), Cost);
6851 InstructionCost
6852 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6853 Type *&VectorTy) {
6854 Type *RetTy = I->getType();
6855 if (canTruncateToMinimalBitwidth(I, VF))
6856 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6857 auto SE = PSE.getSE();
6858 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6860 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6861 ElementCount VF) -> bool {
6862 if (VF.isScalar())
6863 return true;
6865 auto Scalarized = InstsToScalarize.find(VF);
6866 assert(Scalarized != InstsToScalarize.end() &&
6867 "VF not yet analyzed for scalarization profitability");
6868 return !Scalarized->second.count(I) &&
6869 llvm::all_of(I->users(), [&](User *U) {
6870 auto *UI = cast<Instruction>(U);
6871 return !Scalarized->second.count(UI);
6874 (void) hasSingleCopyAfterVectorization;
6876 if (isScalarAfterVectorization(I, VF)) {
6877 // With the exception of GEPs and PHIs, after scalarization there should
6878 // only be one copy of the instruction generated in the loop. This is
6879 // because the VF is either 1, or any instructions that need scalarizing
6880 // have already been dealt with by the time we get here. As a result,
6881 // we don't have to multiply the instruction cost by VF.
6882 assert(I->getOpcode() == Instruction::GetElementPtr ||
6883 I->getOpcode() == Instruction::PHI ||
6884 (I->getOpcode() == Instruction::BitCast &&
6885 I->getType()->isPointerTy()) ||
6886 hasSingleCopyAfterVectorization(I, VF));
6887 VectorTy = RetTy;
6888 } else
6889 VectorTy = ToVectorTy(RetTy, VF);
6891 // TODO: We need to estimate the cost of intrinsic calls.
6892 switch (I->getOpcode()) {
6893 case Instruction::GetElementPtr:
6894 // We mark this instruction as zero-cost because the cost of GEPs in
6895 // vectorized code depends on whether the corresponding memory instruction
6896 // is scalarized or not. Therefore, we handle GEPs with the memory
6897 // instruction cost.
6898 return 0;
6899 case Instruction::Br: {
6900 // In cases of scalarized and predicated instructions, there will be VF
6901 // predicated blocks in the vectorized loop. Each branch around these
6902 // blocks also requires an extract of its vector compare's i1 element.
6903 bool ScalarPredicatedBB = false;
6904 BranchInst *BI = cast<BranchInst>(I);
6905 if (VF.isVector() && BI->isConditional() &&
6906 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6907 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6908 ScalarPredicatedBB = true;
6910 if (ScalarPredicatedBB) {
6911 // It is not possible to scalarize a scalable vector with predicated instructions.
6912 if (VF.isScalable())
6913 return InstructionCost::getInvalid();
6914 // Return cost for branches around scalarized and predicated blocks.
6915 auto *Vec_i1Ty =
6916 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6917 return (
6918 TTI.getScalarizationOverhead(
6919 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
6920 /*Insert*/ false, /*Extract*/ true, CostKind) +
6921 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6922 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6923 // The back-edge branch will remain, as will all scalar branches.
6924 return TTI.getCFInstrCost(Instruction::Br, CostKind);
6925 else
6926 // This branch will be eliminated by if-conversion.
6927 return 0;
6928 // Note: We currently assume zero cost for an unconditional branch inside
6929 // a predicated block since it will become a fall-through, although we
6930 // may decide in the future to call TTI for all branches.
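// For illustration of the predicated-block cost above: with VF = 4 and a
// scalarized, predicated block, the cost charged is the scalarization
// overhead of extracting all four i1 lanes of the compare vector plus four
// scalar branches, one per predicated lane.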
6932 case Instruction::PHI: {
6933 auto *Phi = cast<PHINode>(I);
6935 // First-order recurrences are replaced by vector shuffles inside the loop.
6936 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6937 SmallVector<int> Mask(VF.getKnownMinValue());
6938 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
6939 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
6940 cast<VectorType>(VectorTy), Mask, CostKind,
6941 VF.getKnownMinValue() - 1);
6944 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6945 // converted into select instructions. We require N - 1 selects per phi
6946 // node, where N is the number of incoming values.
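// For example, a phi in an if-converted block with three incoming values is
// lowered to two vector selects, so it is charged twice the cost of a vector
// select with an i1 vector condition.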
6947 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6948 return (Phi->getNumIncomingValues() - 1) *
6949 TTI.getCmpSelInstrCost(
6950 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6951 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6952 CmpInst::BAD_ICMP_PREDICATE, CostKind);
6954 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6956 case Instruction::UDiv:
6957 case Instruction::SDiv:
6958 case Instruction::URem:
6959 case Instruction::SRem:
6960 if (VF.isVector() && isPredicatedInst(I)) {
6961 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6962 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6963 ScalarCost : SafeDivisorCost;
6965 // We've proven all lanes safe to speculate, fall through.
6966 [[fallthrough]];
6967 case Instruction::Add:
6968 case Instruction::FAdd:
6969 case Instruction::Sub:
6970 case Instruction::FSub:
6971 case Instruction::Mul:
6972 case Instruction::FMul:
6973 case Instruction::FDiv:
6974 case Instruction::FRem:
6975 case Instruction::Shl:
6976 case Instruction::LShr:
6977 case Instruction::AShr:
6978 case Instruction::And:
6979 case Instruction::Or:
6980 case Instruction::Xor: {
6981 // If we're speculating on the stride being 1, the multiplication may
6982 // fold away. We can generalize this for all operations using the notion
6983 // of neutral elements. (TODO)
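// E.g. (illustrative names) for an address computation such as
// "%offset = mul i64 %i, %stride": once the predicate added by PSE
// guarantees %stride == 1, the multiply folds away, so it is costed as free
// here.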
6984 if (I->getOpcode() == Instruction::Mul &&
6985 (PSE.getSCEV(I->getOperand(0))->isOne() ||
6986 PSE.getSCEV(I->getOperand(1))->isOne()))
6987 return 0;
6989 // Detect reduction patterns
6990 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
6991 return *RedCost;
6993 // Certain instructions can be cheaper to vectorize if they have a constant
6994 // second vector operand. One example of this is shifts on x86.
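// E.g. (illustrative) "shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>" can
// typically be lowered to an immediate-count shift, whereas a shift by a
// vector of variable amounts may need a more expensive per-lane sequence on
// some x86 subtargets; Op2Info lets TTI reflect that difference.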
6995 Value *Op2 = I->getOperand(1);
6996 auto Op2Info = TTI.getOperandInfo(Op2);
6997 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6998 Legal->isInvariant(Op2))
6999 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7001 SmallVector<const Value *, 4> Operands(I->operand_values());
7002 auto InstrCost = TTI.getArithmeticInstrCost(
7003 I->getOpcode(), VectorTy, CostKind,
7004 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7005 Op2Info, Operands, I);
7007 // Some targets can replace frem with vector library calls.
7008 InstructionCost VecCallCost = InstructionCost::getInvalid();
7009 if (I->getOpcode() == Instruction::FRem) {
7010 LibFunc Func;
7011 if (TLI->getLibFunc(I->getOpcode(), I->getType(), Func) &&
7012 TLI->isFunctionVectorizable(TLI->getName(Func), VF)) {
7013 SmallVector<Type *, 4> OpTypes;
7014 for (auto &Op : I->operands())
7015 OpTypes.push_back(Op->getType());
7016 VecCallCost =
7017 TTI.getCallInstrCost(nullptr, VectorTy, OpTypes, CostKind);
7020 return std::min(InstrCost, VecCallCost);
7022 case Instruction::FNeg: {
7023 return TTI.getArithmeticInstrCost(
7024 I->getOpcode(), VectorTy, CostKind,
7025 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7026 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7027 I->getOperand(0), I);
7029 case Instruction::Select: {
7030 SelectInst *SI = cast<SelectInst>(I);
7031 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7032 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7034 const Value *Op0, *Op1;
7035 using namespace llvm::PatternMatch;
7036 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7037 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7038 // select x, y, false --> x & y
7039 // select x, true, y --> x | y
7040 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7041 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7042 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7043 Op1->getType()->getScalarSizeInBits() == 1);
7045 SmallVector<const Value *, 2> Operands{Op0, Op1};
7046 return TTI.getArithmeticInstrCost(
7047 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7048 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7051 Type *CondTy = SI->getCondition()->getType();
7052 if (!ScalarCond)
7053 CondTy = VectorType::get(CondTy, VF);
7055 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7056 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7057 Pred = Cmp->getPredicate();
7058 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7059 CostKind, I);
7061 case Instruction::ICmp:
7062 case Instruction::FCmp: {
7063 Type *ValTy = I->getOperand(0)->getType();
7064 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7065 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7066 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7067 VectorTy = ToVectorTy(ValTy, VF);
7068 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7069 cast<CmpInst>(I)->getPredicate(), CostKind,
7072 case Instruction::Store:
7073 case Instruction::Load: {
7074 ElementCount Width = VF;
7075 if (Width.isVector()) {
7076 InstWidening Decision = getWideningDecision(I, Width);
7077 assert(Decision != CM_Unknown &&
7078 "CM decision should be taken at this point");
7079 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7080 return InstructionCost::getInvalid();
7081 if (Decision == CM_Scalarize)
7082 Width = ElementCount::getFixed(1);
7084 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7085 return getMemoryInstructionCost(I, VF);
7087 case Instruction::BitCast:
7088 if (I->getType()->isPointerTy())
7089 return 0;
7090 [[fallthrough]];
7091 case Instruction::ZExt:
7092 case Instruction::SExt:
7093 case Instruction::FPToUI:
7094 case Instruction::FPToSI:
7095 case Instruction::FPExt:
7096 case Instruction::PtrToInt:
7097 case Instruction::IntToPtr:
7098 case Instruction::SIToFP:
7099 case Instruction::UIToFP:
7100 case Instruction::Trunc:
7101 case Instruction::FPTrunc: {
7102 // Computes the CastContextHint from a Load/Store instruction.
7103 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7104 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7105 "Expected a load or a store!");
7107 if (VF.isScalar() || !TheLoop->contains(I))
7108 return TTI::CastContextHint::Normal;
7110 switch (getWideningDecision(I, VF)) {
7111 case LoopVectorizationCostModel::CM_GatherScatter:
7112 return TTI::CastContextHint::GatherScatter;
7113 case LoopVectorizationCostModel::CM_Interleave:
7114 return TTI::CastContextHint::Interleave;
7115 case LoopVectorizationCostModel::CM_Scalarize:
7116 case LoopVectorizationCostModel::CM_Widen:
7117 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7118 : TTI::CastContextHint::Normal;
7119 case LoopVectorizationCostModel::CM_Widen_Reverse:
7120 return TTI::CastContextHint::Reversed;
7121 case LoopVectorizationCostModel::CM_Unknown:
7122 llvm_unreachable("Instr did not go through cost modelling?");
7123 case LoopVectorizationCostModel::CM_VectorCall:
7124 case LoopVectorizationCostModel::CM_IntrinsicCall:
7125 llvm_unreachable_internal("Instr has invalid widening decision");
7128 llvm_unreachable("Unhandled case!");
7131 unsigned Opcode = I->getOpcode();
7132 TTI::CastContextHint CCH = TTI::CastContextHint::None;
7133 // For Trunc, the context is the only user, which must be a StoreInst.
7134 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7135 if (I->hasOneUse())
7136 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7137 CCH = ComputeCCH(Store);
7139 // For Z/Sext, the context is the operand, which must be a LoadInst.
7140 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7141 Opcode == Instruction::FPExt) {
7142 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7143 CCH = ComputeCCH(Load);
7146 // We optimize the truncation of induction variables having constant
7147 // integer steps. The cost of these truncations is the same as the scalar
7148 // operation.
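// E.g. (illustrative) "%t = trunc i64 %iv to i32" where %iv is an induction
// with a constant step: the vectorizer can generate the truncated induction
// directly, so only the cost of a scalar trunc is charged below.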
7149 if (isOptimizableIVTruncate(I, VF)) {
7150 auto *Trunc = cast<TruncInst>(I);
7151 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7152 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7155 // Detect reduction patterns
7156 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7157 return *RedCost;
7159 Type *SrcScalarTy = I->getOperand(0)->getType();
7160 Type *SrcVecTy =
7161 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7162 if (canTruncateToMinimalBitwidth(I, VF)) {
7163 // This cast is going to be shrunk. This may remove the cast or it might
7164 // turn it into a slightly different cast. For example, if MinBW == 16,
7165 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7167 // Calculate the modified src and dest types.
7168 Type *MinVecTy = VectorTy;
7169 if (Opcode == Instruction::Trunc) {
7170 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7171 VectorTy =
7172 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7173 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7174 // Leave SrcVecTy unchanged - we only shrink the destination element
7175 // type.
7176 VectorTy =
7177 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7181 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7183 case Instruction::Call:
7184 return getVectorCallCost(cast<CallInst>(I), VF);
7185 case Instruction::ExtractValue:
7186 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7187 case Instruction::Alloca:
7188 // We cannot easily widen alloca to a scalable alloca, as
7189 // the result would need to be a vector of pointers.
7190 if (VF.isScalable())
7191 return InstructionCost::getInvalid();
7192 [[fallthrough]];
7193 default:
7194 // This opcode is unknown. Assume that it is the same as 'mul'.
7195 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7196 } // end of switch.
7199 void LoopVectorizationCostModel::collectValuesToIgnore() {
7200 // Ignore ephemeral values.
7201 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7203 // Find all stores to invariant variables. Since they are going to sink
7204 // outside the loop, we do not need to calculate the cost for them.
7205 for (BasicBlock *BB : TheLoop->blocks())
7206 for (Instruction &I : *BB) {
7207 StoreInst *SI;
7208 if ((SI = dyn_cast<StoreInst>(&I)) &&
7209 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7210 ValuesToIgnore.insert(&I);
7213 // Ignore type-promoting instructions we identified during reduction
7214 // detection.
7215 for (const auto &Reduction : Legal->getReductionVars()) {
7216 const RecurrenceDescriptor &RedDes = Reduction.second;
7217 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7218 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7220 // Ignore type-casting instructions we identified during induction
7221 // detection.
7222 for (const auto &Induction : Legal->getInductionVars()) {
7223 const InductionDescriptor &IndDes = Induction.second;
7224 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7225 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7229 void LoopVectorizationCostModel::collectInLoopReductions() {
7230 for (const auto &Reduction : Legal->getReductionVars()) {
7231 PHINode *Phi = Reduction.first;
7232 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7234 // We don't collect reductions that are type promoted (yet).
7235 if (RdxDesc.getRecurrenceType() != Phi->getType())
7236 continue;
7238 // If the target would prefer this reduction to happen "in-loop", then we
7239 // want to record it as such.
7240 unsigned Opcode = RdxDesc.getOpcode();
7241 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7242 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7243 TargetTransformInfo::ReductionFlags()))
7244 continue;
7246 // Check that we can correctly put the reductions into the loop, by
7247 // finding the chain of operations that leads from the phi to the loop
7248 // exit value.
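// Illustrative chain (names made up): for
//   %sum  = phi i32 [ 0, %ph ], [ %add2, %latch ]
//   %add1 = add i32 %sum, %a
//   %add2 = add i32 %add1, %b
// getReductionOpChain returns [%add1, %add2], and the loop below records
// %add1 -> %sum and %add2 -> %add1 in InLoopReductionImmediateChains.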
7249 SmallVector<Instruction *, 4> ReductionOperations =
7250 RdxDesc.getReductionOpChain(Phi, TheLoop);
7251 bool InLoop = !ReductionOperations.empty();
7253 if (InLoop) {
7254 InLoopReductions.insert(Phi);
7255 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7256 Instruction *LastChain = Phi;
7257 for (auto *I : ReductionOperations) {
7258 InLoopReductionImmediateChains[I] = LastChain;
7259 LastChain = I;
7262 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7263 << " reduction for phi: " << *Phi << "\n");
7267 VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7268 DebugLoc DL, const Twine &Name) {
7269 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7270 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7271 return tryInsertInstruction(
7272 new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7275 // This function will select a scalable VF if the target supports scalable
7276 // vectors and a fixed one otherwise.
7277 // TODO: we could return a pair of values that specify the max VF and
7278 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7279 // `buildVPlans(VF, VF)`. We cannot do that yet because VPlan currently
7280 // doesn't have a cost model that can choose which plan to execute if
7281 // more than one is generated.
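// Worked example (assuming a target that reports a known-minimum scalable
// register width of 128 bits and a widest loop type of 32 bits):
// N = 128 / 32 = 4, giving ElementCount::getScalable(4), i.e. a VF of
// "vscale x 4".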
7282 static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7283 LoopVectorizationCostModel &CM) {
7284 unsigned WidestType;
7285 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7287 TargetTransformInfo::RegisterKind RegKind =
7288 TTI.enableScalableVectorization()
7289 ? TargetTransformInfo::RGK_ScalableVector
7290 : TargetTransformInfo::RGK_FixedWidthVector;
7292 TypeSize RegSize = TTI.getRegisterBitWidth(RegKind);
7293 unsigned N = RegSize.getKnownMinValue() / WidestType;
7294 return ElementCount::get(N, RegSize.isScalable());
7297 VectorizationFactor
7298 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7299 ElementCount VF = UserVF;
7300 // Outer loop handling: outer loops may require CFG and instruction level
7301 // transformations before even evaluating whether vectorization is profitable.
7302 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7303 // the vectorization pipeline.
7304 if (!OrigLoop->isInnermost()) {
7305 // If the user doesn't provide a vectorization factor, determine a
7306 // reasonable one.
7307 if (UserVF.isZero()) {
7308 VF = determineVPlanVF(TTI, CM);
7309 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7311 // Make sure we have a VF > 1 for stress testing.
7312 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7313 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7314 << "overriding computed VF.\n");
7315 VF = ElementCount::getFixed(4);
7317 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7318 !ForceTargetSupportsScalableVectors) {
7319 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7320 << "not supported by the target.\n");
7321 reportVectorizationFailure(
7322 "Scalable vectorization requested but not supported by the target",
7323 "the scalable user-specified vectorization width for outer-loop "
7324 "vectorization cannot be used because the target does not support "
7325 "scalable vectors.",
7326 "ScalableVFUnfeasible", ORE, OrigLoop);
7327 return VectorizationFactor::Disabled();
7329 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7330 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7331 "VF needs to be a power of two");
7332 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7333 << "VF " << VF << " to build VPlans.\n");
7334 buildVPlans(VF, VF);
7336 // For VPlan build stress testing, we bail out after VPlan construction.
7337 if (VPlanBuildStressTest)
7338 return VectorizationFactor::Disabled();
7340 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7343 LLVM_DEBUG(
7344 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7345 "VPlan-native path.\n");
7346 return VectorizationFactor::Disabled();
7349 std::optional<VectorizationFactor>
7350 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7351 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7352 CM.collectValuesToIgnore();
7353 CM.collectElementTypesForWidening();
7355 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7356 if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7357 return std::nullopt;
7359 // Invalidate interleave groups if all blocks of the loop will be predicated.
7360 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7361 !useMaskedInterleavedAccesses(TTI)) {
7362 LLVM_DEBUG(
7363 dbgs()
7364 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7365 "which requires masked-interleaved support.\n");
7366 if (CM.InterleaveInfo.invalidateGroups())
7367 // Invalidating interleave groups also requires invalidating all decisions
7368 // based on them, which includes widening decisions and uniform and scalar
7369 // values.
7370 CM.invalidateCostModelingDecisions();
7373 ElementCount MaxUserVF =
7374 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7375 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7376 if (!UserVF.isZero() && UserVFIsLegal) {
7377 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7378 "VF needs to be a power of two");
7379 // Collect the instructions (and their associated costs) that will be more
7380 // profitable to scalarize.
7381 CM.collectInLoopReductions();
7382 if (CM.selectUserVectorizationFactor(UserVF)) {
7383 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7384 buildVPlansWithVPRecipes(UserVF, UserVF);
7385 if (!hasPlanWithVF(UserVF)) {
7386 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7387 << ".\n");
7388 return std::nullopt;
7391 LLVM_DEBUG(printPlans(dbgs()));
7392 return {{UserVF, 0, 0}};
7393 } else
7394 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7395 "InvalidCost", ORE, OrigLoop);
7398 // Populate the set of Vectorization Factor Candidates.
7399 ElementCountSet VFCandidates;
7400 for (auto VF = ElementCount::getFixed(1);
7401 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7402 VFCandidates.insert(VF);
7403 for (auto VF = ElementCount::getScalable(1);
7404 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7405 VFCandidates.insert(VF);
7407 CM.collectInLoopReductions();
7408 for (const auto &VF : VFCandidates) {
7409 // Collect Uniform and Scalar instructions after vectorization with VF.
7410 CM.collectUniformsAndScalars(VF);
7412 // Collect the instructions (and their associated costs) that will be more
7413 // profitable to scalarize.
7414 if (VF.isVector())
7415 CM.collectInstsToScalarize(VF);
7418 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7419 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7421 LLVM_DEBUG(printPlans(dbgs()));
7422 if (!MaxFactors.hasVector())
7423 return VectorizationFactor::Disabled();
7425 // Select the optimal vectorization factor.
7426 VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
7427 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7428 if (!hasPlanWithVF(VF.Width)) {
7429 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7430 << ".\n");
7431 return std::nullopt;
7433 return VF;
7436 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7437 assert(count_if(VPlans,
7438 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7439 1 &&
7440 "Best VF has not a single VPlan.");
7442 for (const VPlanPtr &Plan : VPlans) {
7443 if (Plan->hasVF(VF))
7444 return *Plan.get();
7446 llvm_unreachable("No plan found!");
7449 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7450 SmallVector<Metadata *, 4> MDs;
7451 // Reserve first location for self reference to the LoopID metadata node.
7452 MDs.push_back(nullptr);
7453 bool IsUnrollMetadata = false;
7454 MDNode *LoopID = L->getLoopID();
7455 if (LoopID) {
7456 // First find existing loop unrolling disable metadata.
7457 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7458 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7459 if (MD) {
7460 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7461 IsUnrollMetadata =
7462 S && S->getString().starts_with("llvm.loop.unroll.disable");
7464 MDs.push_back(LoopID->getOperand(i));
7468 if (!IsUnrollMetadata) {
7469 // Add runtime unroll disable metadata.
7470 LLVMContext &Context = L->getHeader()->getContext();
7471 SmallVector<Metadata *, 1> DisableOperands;
7472 DisableOperands.push_back(
7473 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7474 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7475 MDs.push_back(DisableNode);
7476 MDNode *NewLoopID = MDNode::get(Context, MDs);
7477 // Set operand 0 to refer to the loop id itself.
7478 NewLoopID->replaceOperandWith(0, NewLoopID);
7479 L->setLoopID(NewLoopID);
7483 // Check if \p RedResult is a ComputeReductionResult instruction, and if it is,
7484 // create a merge phi node for it and add it to \p ReductionResumeValues.
7485 static void createAndCollectMergePhiForReduction(
7486 VPInstruction *RedResult,
7487 DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7488 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7489 if (!RedResult ||
7490 RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7491 return;
7493 auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
7494 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7496 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7497 Value *FinalValue =
7498 State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
7499 auto *ResumePhi =
7500 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
7502 // TODO: bc.merge.rdx should not be created here, instead it should be
7503 // modeled in VPlan.
7504 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7505 // Create a phi node that merges control-flow from the backedge-taken check
7506 // block and the middle block.
7507 auto *BCBlockPhi = PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
7508 LoopScalarPreHeader->getTerminator());
7510 // If we are fixing reductions in the epilogue loop then we should already
7511 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7512 // we carry over the incoming values correctly.
7513 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
7514 if (Incoming == LoopMiddleBlock)
7515 BCBlockPhi->addIncoming(FinalValue, Incoming);
7516 else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
7517 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
7518 Incoming);
7519 else
7520 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
7523 auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
7524 // TODO: This fixup should instead be modeled in VPlan.
7525 // Fix the scalar loop reduction variable with the incoming reduction sum
7526 // from the vector body and from the backedge value.
7527 int IncomingEdgeBlockIdx =
7528 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
7529 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7530 // Pick the other block.
7531 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7532 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
7533 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7534 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
7536 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7539 std::pair<DenseMap<const SCEV *, Value *>,
7540 DenseMap<const RecurrenceDescriptor *, Value *>>
7541 LoopVectorizationPlanner::executePlan(
7542 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7543 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7544 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7545 assert(BestVPlan.hasVF(BestVF) &&
7546 "Trying to execute plan with unsupported VF");
7547 assert(BestVPlan.hasUF(BestUF) &&
7548 "Trying to execute plan with unsupported UF");
7549 assert(
7550 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7551 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7553 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7554 << '\n');
7556 if (!IsEpilogueVectorization)
7557 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7559 // Perform the actual loop transformation.
7560 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7561 OrigLoop->getHeader()->getContext());
7563 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7564 // before making any changes to the CFG.
7565 if (!BestVPlan.getPreheader()->empty()) {
7566 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7567 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7568 BestVPlan.getPreheader()->execute(&State);
7570 if (!ILV.getTripCount())
7571 ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7572 else
7573 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7574 "count during epilogue vectorization");
7576 // 1. Set up the skeleton for vectorization, including vector pre-header and
7577 // middle block. The vector loop is created during VPlan execution.
7578 Value *CanonicalIVStartValue;
7579 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7580 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7581 : State.ExpandedSCEVs);
7583 // Only use noalias metadata when using memory checks guaranteeing no overlap
7584 // across all iterations.
7585 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7586 std::unique_ptr<LoopVersioning> LVer = nullptr;
7587 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7588 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7590 // We currently don't use LoopVersioning for the actual loop cloning but we
7591 // still use it to add the noalias metadata.
7592 // TODO: Find a better way to re-use LoopVersioning functionality to add
7593 // metadata.
7594 LVer = std::make_unique<LoopVersioning>(
7595 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7596 PSE.getSE());
7597 State.LVer = &*LVer;
7598 State.LVer->prepareNoAliasMetadata();
7601 ILV.collectPoisonGeneratingRecipes(State);
7603 ILV.printDebugTracesAtStart();
7605 //===------------------------------------------------===//
7607 // Notice: any optimization or new instruction that goes
7608 // into the code below should also be implemented in
7609 // the cost-model.
7611 //===------------------------------------------------===//
7613 // 2. Copy and widen instructions from the old loop into the new loop.
7614 BestVPlan.prepareToExecute(ILV.getTripCount(),
7615 ILV.getOrCreateVectorTripCount(nullptr),
7616 CanonicalIVStartValue, State);
7618 BestVPlan.execute(&State);
7620 // 2.5 Collect reduction resume values.
7621 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7622 auto *ExitVPBB =
7623 cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7624 for (VPRecipeBase &R : *ExitVPBB) {
7625 createAndCollectMergePhiForReduction(dyn_cast<VPInstruction>(&R),
7626 ReductionResumeValues, State, OrigLoop,
7627 State.CFG.VPBB2IRBB[ExitVPBB]);
7630 // 2.6. Maintain Loop Hints
7631 // Keep all loop hints from the original loop on the vector loop (we'll
7632 // replace the vectorizer-specific hints below).
7633 MDNode *OrigLoopID = OrigLoop->getLoopID();
7635 std::optional<MDNode *> VectorizedLoopID =
7636 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7637 LLVMLoopVectorizeFollowupVectorized});
7639 VPBasicBlock *HeaderVPBB =
7640 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7641 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7642 if (VectorizedLoopID)
7643 L->setLoopID(*VectorizedLoopID);
7644 else {
7645 // Keep all loop hints from the original loop on the vector loop (we'll
7646 // replace the vectorizer-specific hints below).
7647 if (MDNode *LID = OrigLoop->getLoopID())
7648 L->setLoopID(LID);
7650 LoopVectorizeHints Hints(L, true, *ORE);
7651 Hints.setAlreadyVectorized();
7653 TargetTransformInfo::UnrollingPreferences UP;
7654 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7655 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7656 AddRuntimeUnrollDisableMetaData(L);
7658 // 3. Fix the vectorized code: take care of header phis, live-outs,
7659 // predication, updating analyses.
7660 ILV.fixVectorizedLoop(State, BestVPlan);
7662 ILV.printDebugTracesAtEnd();
7664 return {State.ExpandedSCEVs, ReductionResumeValues};
7667 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7668 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7669 for (const auto &Plan : VPlans)
7670 if (PrintVPlansInDotFormat)
7671 Plan->printDOT(O);
7672 else
7673 Plan->print(O);
7675 #endif
7677 //===--------------------------------------------------------------------===//
7678 // EpilogueVectorizerMainLoop
7679 //===--------------------------------------------------------------------===//
7681 /// This function is partially responsible for generating the control flow
7682 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7683 std::pair<BasicBlock *, Value *>
7684 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7685 const SCEV2ValueTy &ExpandedSCEVs) {
7686 createVectorLoopSkeleton("");
7688 // Generate the code to check the minimum iteration count of the vector
7689 // epilogue (see below).
7690 EPI.EpilogueIterationCountCheck =
7691 emitIterationCountCheck(LoopScalarPreHeader, true);
7692 EPI.EpilogueIterationCountCheck->setName("iter.check");
7694 // Generate the code to check any assumptions that we've made for SCEV
7695 // expressions.
7696 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7698 // Generate the code that checks at runtime if arrays overlap. We put the
7699 // checks into a separate block to make the more common case of few elements
7700 // faster.
7701 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7703 // Generate the iteration count check for the main loop, *after* the check
7704 // for the epilogue loop, so that the path-length is shorter for the case
7705 // that goes directly through the vector epilogue. The longer path length for
7706 // the main loop is compensated for by the gain from vectorizing the larger
7707 // trip count. Note: the branch will get updated later on when we vectorize
7708 // the epilogue.
7709 EPI.MainLoopIterationCountCheck =
7710 emitIterationCountCheck(LoopScalarPreHeader, false);
7712 // Generate the induction variable.
7713 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7715 // Skip induction resume value creation here because they will be created in
7716 // the second pass for the scalar loop. The induction resume values for the
7717 // inductions in the epilogue loop are created before executing the plan for
7718 // the epilogue loop.
7720 return {completeLoopSkeleton(), nullptr};
7723 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7724 LLVM_DEBUG({
7725 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7726 << "Main Loop VF:" << EPI.MainLoopVF
7727 << ", Main Loop UF:" << EPI.MainLoopUF
7728 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7729 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7733 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7734 DEBUG_WITH_TYPE(VerboseDebug, {
7735 dbgs() << "intermediate fn:\n"
7736 << *OrigLoop->getHeader()->getParent() << "\n";
7740 BasicBlock *
7741 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7742 bool ForEpilogue) {
7743 assert(Bypass && "Expected valid bypass basic block.");
7744 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7745 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7746 Value *Count = getTripCount();
7747 // Reuse existing vector loop preheader for TC checks.
7748 // Note that a new preheader block is generated for the vector loop.
7749 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7750 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7752 // Generate code to check if the loop's trip count is less than VF * UF of the
7753 // main vector loop.
7754 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7755 : VF.isVector())
7756 ? ICmpInst::ICMP_ULE
7757 : ICmpInst::ICMP_ULT;
7759 Value *CheckMinIters = Builder.CreateICmp(
7760 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7761 "min.iters.check");
7763 if (!ForEpilogue)
7764 TCCheckBlock->setName("vector.main.loop.iter.check");
7766 // Create new preheader for vector loop.
7767 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7768 DT, LI, nullptr, "vector.ph");
7770 if (ForEpilogue) {
7771 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7772 DT->getNode(Bypass)->getIDom()) &&
7773 "TC check is expected to dominate Bypass");
7775 // Update dominator for Bypass & LoopExit.
7776 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7777 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7778 // For loops with multiple exits, there's no edge from the middle block
7779 // to exit blocks (as the epilogue must run) and thus no need to update
7780 // the immediate dominator of the exit blocks.
7781 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7783 LoopBypassBlocks.push_back(TCCheckBlock);
7785 // Save the trip count so we don't have to regenerate it in the
7786 // vec.epilog.iter.check. This is safe to do because the trip count
7787 // generated here dominates the vector epilog iter check.
7788 EPI.TripCount = Count;
7791 BranchInst &BI =
7792 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7793 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7794 setBranchWeights(BI, MinItersBypassWeights);
7795 ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
7797 return TCCheckBlock;
7800 //===--------------------------------------------------------------------===//
7801 // EpilogueVectorizerEpilogueLoop
7802 //===--------------------------------------------------------------------===//
7804 /// This function is partially responsible for generating the control flow
7805 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7806 std::pair<BasicBlock *, Value *>
7807 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7808 const SCEV2ValueTy &ExpandedSCEVs) {
7809 createVectorLoopSkeleton("vec.epilog.");
7811 // Now, compare the remaining count and if there aren't enough iterations to
7812 // execute the vectorized epilogue, skip to the scalar part.
7813 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7814 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7815 LoopVectorPreHeader =
7816 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7817 LI, nullptr, "vec.epilog.ph");
7818 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7819 VecEpilogueIterationCountCheck);
7821 // Adjust the control flow taking the state info from the main loop
7822 // vectorization into account.
7823 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7824 "expected this to be saved from the previous pass.");
7825 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7826 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7828 DT->changeImmediateDominator(LoopVectorPreHeader,
7829 EPI.MainLoopIterationCountCheck);
7831 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7832 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7834 if (EPI.SCEVSafetyCheck)
7835 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7836 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7837 if (EPI.MemSafetyCheck)
7838 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7839 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7841 DT->changeImmediateDominator(
7842 VecEpilogueIterationCountCheck,
7843 VecEpilogueIterationCountCheck->getSinglePredecessor());
7845 DT->changeImmediateDominator(LoopScalarPreHeader,
7846 EPI.EpilogueIterationCountCheck);
7847 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7848 // If there is an epilogue which must run, there's no edge from the
7849 // middle block to exit blocks and thus no need to update the immediate
7850 // dominator of the exit blocks.
7851 DT->changeImmediateDominator(LoopExitBlock,
7852 EPI.EpilogueIterationCountCheck);
7854 // Keep track of bypass blocks, as they feed start values to the induction and
7855 // reduction phis in the scalar loop preheader.
7856 if (EPI.SCEVSafetyCheck)
7857 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7858 if (EPI.MemSafetyCheck)
7859 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7860 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7862 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7863 // reductions which merge control-flow from the latch block and the middle
7864 // block. Update the incoming values here and move the Phi into the preheader.
7865 SmallVector<PHINode *, 4> PhisInBlock;
7866 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7867 PhisInBlock.push_back(&Phi);
7869 for (PHINode *Phi : PhisInBlock) {
7870 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7871 Phi->replaceIncomingBlockWith(
7872 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7873 VecEpilogueIterationCountCheck);
7875 // If the phi doesn't have an incoming value from the
7876 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7877 // value and also those from other check blocks. This is needed for
7878 // reduction phis only.
7879 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7880 return EPI.EpilogueIterationCountCheck == IncB;
7882 continue;
7883 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7884 if (EPI.SCEVSafetyCheck)
7885 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7886 if (EPI.MemSafetyCheck)
7887 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7890 // Generate a resume induction for the vector epilogue and put it in the
7891 // vector epilogue preheader.
7892 Type *IdxTy = Legal->getWidestInductionType();
7893 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7894 EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7895 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7896 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7897 EPI.MainLoopIterationCountCheck);
7899 // Generate induction resume values. These variables save the new starting
7900 // indexes for the scalar loop. They are used to test if there are any tail
7901 // iterations left once the vector loop has completed.
7902 // Note that when the vectorized epilogue is skipped due to the iteration
7903 // count check, the resume value for the induction variable comes from
7904 // the trip count of the main vector loop, hence passing the AdditionalBypass
7905 // argument.
7906 createInductionResumeValues(ExpandedSCEVs,
7907 {VecEpilogueIterationCountCheck,
7908 EPI.VectorTripCount} /* AdditionalBypass */);
7910 return {completeLoopSkeleton(), EPResumeVal};
7913 BasicBlock *
7914 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7915 BasicBlock *Bypass, BasicBlock *Insert) {
7917 assert(EPI.TripCount &&
7918 "Expected trip count to have been safed in the first pass.");
7919 assert(
7920 (!isa<Instruction>(EPI.TripCount) ||
7921 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7922 "saved trip count does not dominate insertion point.");
7923 Value *TC = EPI.TripCount;
7924 IRBuilder<> Builder(Insert->getTerminator());
7925 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7927 // Generate code to check if the loop's trip count is less than VF * UF of the
7928 // vector epilogue loop.
7929 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7930 ? ICmpInst::ICMP_ULE
7931 : ICmpInst::ICMP_ULT;
7933 Value *CheckMinIters =
7934 Builder.CreateICmp(P, Count,
7935 createStepForVF(Builder, Count->getType(),
7936 EPI.EpilogueVF, EPI.EpilogueUF),
7937 "min.epilog.iters.check");
7939 BranchInst &BI =
7940 *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7941 if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7942 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7943 unsigned EpilogueLoopStep =
7944 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7945 // We assume the remaining `Count` is uniformly distributed in
7946 // [0, MainLoopStep), so the probability that
7947 // `Count < EpilogueLoopStep` holds is
7948 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
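// Worked example (made-up factors): a main loop with VF=8, UF=2 gives
// MainLoopStep = 16; an epilogue with VF=4, UF=1 gives EpilogueLoopStep = 4,
// so EstimatedSkipCount = 4 and the weights below are {4, 12}, i.e. the
// branch that skips the vector epilogue is assumed taken ~25% of the time.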
7949 unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7950 const uint32_t Weights[] = {EstimatedSkipCount,
7951 MainLoopStep - EstimatedSkipCount};
7952 setBranchWeights(BI, Weights);
7954 ReplaceInstWithInst(Insert->getTerminator(), &BI);
7956 LoopBypassBlocks.push_back(Insert);
7957 return Insert;
7960 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7961 LLVM_DEBUG({
7962 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7963 << "Epilogue Loop VF:" << EPI.EpilogueVF
7964 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7968 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7969 DEBUG_WITH_TYPE(VerboseDebug, {
7970 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7974 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7975 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7976 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7977 bool PredicateAtRangeStart = Predicate(Range.Start);
7979 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7980 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7981 Range.End = TmpVF;
7982 break;
7985 return PredicateAtRangeStart;
7988 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7989 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7990 /// of VF's starting at a given VF and extending it as much as possible. Each
7991 /// vectorization decision can potentially shorten this sub-range during
7992 /// buildVPlan().
7993 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7994 ElementCount MaxVF) {
7995 auto MaxVFTimes2 = MaxVF * 2;
7996 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
7997 VFRange SubRange = {VF, MaxVFTimes2};
7998 VPlans.push_back(buildVPlan(SubRange));
7999 VF = SubRange.End;
8003 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8004 VPlan &Plan) {
8005 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8007 // Look for cached value.
8008 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8009 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8010 if (ECEntryIt != EdgeMaskCache.end())
8011 return ECEntryIt->second;
8013 VPValue *SrcMask = getBlockInMask(Src);
8015 // The terminator has to be a branch inst!
8016 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8017 assert(BI && "Unexpected terminator found");
8019 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8020 return EdgeMaskCache[Edge] = SrcMask;
8022 // If source is an exiting block, we know the exit edge is dynamically dead
8023 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8024 // adding uses of an otherwise potentially dead instruction.
8025 if (OrigLoop->isLoopExiting(Src))
8026 return EdgeMaskCache[Edge] = SrcMask;
8028 VPValue *EdgeMask = Plan.getVPValueOrAddLiveIn(BI->getCondition());
8029 assert(EdgeMask && "No Edge Mask found for condition");
8031 if (BI->getSuccessor(0) != Dst)
8032 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8034 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8035 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8036 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8037 // The select version does not introduce new UB if SrcMask is false and
8038 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8039 VPValue *False = Plan.getVPValueOrAddLiveIn(
8040 ConstantInt::getFalse(BI->getCondition()->getType()));
8041 EdgeMask =
8042 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8045 return EdgeMaskCache[Edge] = EdgeMask;
8048 void VPRecipeBuilder::createHeaderMask(VPlan &Plan) {
8049 BasicBlock *Header = OrigLoop->getHeader();
8051 // When not folding the tail, use nullptr to model an all-true mask.
8052 if (!CM.foldTailByMasking()) {
8053 BlockMaskCache[Header] = nullptr;
8054 return;
8057 // Introduce the early-exit compare IV <= BTC to form header block mask.
8058 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8059 // constructing the desired canonical IV in the header block as its first
8060 // non-phi instructions.
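// Worked example (illustrative): with a trip count of 10, BTC = 9. For VF=4,
// the widened canonical IV lanes of the third vector iteration are
// {8, 9, 10, 11}, so the ICMP_ULE against the broadcast BTC yields
// {1, 1, 0, 0}, disabling the two out-of-range lanes.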
8062 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8063 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8064 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8065 HeaderVPBB->insert(IV, NewInsertionPoint);
8067 VPBuilder::InsertPointGuard Guard(Builder);
8068 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8069 VPValue *BlockMask = nullptr;
8070 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8071 BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8072 BlockMaskCache[Header] = BlockMask;
8075 VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8076 // Return the cached value.
8077 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(BB);
8078 assert(BCEntryIt != BlockMaskCache.end() &&
8079 "Trying to access mask for block without one.");
8080 return BCEntryIt->second;
8083 void VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8084 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8085 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8086 assert(OrigLoop->getHeader() != BB &&
8087 "Loop header must have cached block mask");
8089 // All-one mask is modelled as no-mask following the convention for masked
8090 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8091 VPValue *BlockMask = nullptr;
8092 // This is the block mask. We OR the masks of all incoming edges.
8093 for (auto *Predecessor : predecessors(BB)) {
8094 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8095 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8096 BlockMaskCache[BB] = EdgeMask;
8097 return;
8100 if (!BlockMask) { // BlockMask still has its initial nullptr value.
8101 BlockMask = EdgeMask;
8102 continue;
8105 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8108 BlockMaskCache[BB] = BlockMask;
8111 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8112 ArrayRef<VPValue *> Operands,
8113 VFRange &Range,
8114 VPlanPtr &Plan) {
8115 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8116 "Must be called with either a load or store");
8118 auto willWiden = [&](ElementCount VF) -> bool {
8119 LoopVectorizationCostModel::InstWidening Decision =
8120 CM.getWideningDecision(I, VF);
8121 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8122 "CM decision should be taken at this point.");
8123 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8124 return true;
8125 if (CM.isScalarAfterVectorization(I, VF) ||
8126 CM.isProfitableToScalarize(I, VF))
8127 return false;
8128 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8131 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8132 return nullptr;
8134 VPValue *Mask = nullptr;
8135 if (Legal->isMaskRequired(I))
8136 Mask = getBlockInMask(I->getParent());
8138 // Determine if the pointer operand of the access is either consecutive or
8139 // reverse consecutive.
8140 LoopVectorizationCostModel::InstWidening Decision =
8141 CM.getWideningDecision(I, Range.Start);
8142 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8143 bool Consecutive =
8144 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8146 VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
8147 if (Consecutive) {
8148 auto *GEP = dyn_cast<GetElementPtrInst>(
8149 Ptr->getUnderlyingValue()->stripPointerCasts());
8150 auto *VectorPtr = new VPVectorPointerRecipe(
8151 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8152 I->getDebugLoc());
8153 Builder.getInsertBlock()->appendRecipe(VectorPtr);
8154 Ptr = VectorPtr;
8156 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8157 return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
8158 Reverse);
8160 StoreInst *Store = cast<StoreInst>(I);
8161 return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
8162 Consecutive, Reverse);
8165 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8166 /// insert a recipe to expand the step for the induction recipe.
8167 static VPWidenIntOrFpInductionRecipe *
8168 createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8169 VPValue *Start, const InductionDescriptor &IndDesc,
8170 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8171 VFRange &Range) {
8172 assert(IndDesc.getStartValue() ==
8173 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8174 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8175 "step must be loop invariant");
8177 VPValue *Step =
8178 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8179 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8180 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8182 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8183 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8186 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8187 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8189 // Check if this is an integer or fp induction. If so, build the recipe that
8190 // produces its scalar and vector values.
8191 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8192 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
8193 *PSE.getSE(), *OrigLoop, Range);
8195 // Check if this is pointer induction. If so, build the recipe for it.
8196 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8197 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8198 *PSE.getSE());
8199 return new VPWidenPointerInductionRecipe(
8200 Phi, Operands[0], Step, *II,
8201 LoopVectorizationPlanner::getDecisionAndClampRange(
8202 [&](ElementCount VF) {
8203 return CM.isScalarAfterVectorization(Phi, VF);
8205 Range));
8207 return nullptr;
8210 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8211 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8212 // Optimize the special case where the source is a constant integer
8213 // induction variable. Notice that we can only optimize the 'trunc' case
8214 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8215 // (c) other casts depend on pointer size.
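// E.g. (illustrative) for "%t = trunc i64 %iv to i32" where %iv is a primary
// integer induction, the recipe built below widens the induction directly at
// i32 instead of widening %iv to <VF x i64> and then truncating each part.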
8217 // Determine whether \p K is a truncation based on an induction variable that
8218 // can be optimized.
8219 auto isOptimizableIVTruncate =
8220 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8221 return [=](ElementCount VF) -> bool {
8222 return CM.isOptimizableIVTruncate(K, VF);
8226 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8227 isOptimizableIVTruncate(I), Range)) {
8229 auto *Phi = cast<PHINode>(I->getOperand(0));
8230 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8231 VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8232 return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8233 *OrigLoop, Range);
8235 return nullptr;
8238 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8239 ArrayRef<VPValue *> Operands,
8240 VPlanPtr &Plan) {
8241 // If all incoming values are equal, the incoming VPValue can be used directly
8242 // instead of creating a new VPBlendRecipe.
8243 if (llvm::all_equal(Operands))
8244 return Operands[0];
8246 unsigned NumIncoming = Phi->getNumIncomingValues();
8247 // For in-loop reductions, we do not need to create an additional select.
8248 VPValue *InLoopVal = nullptr;
8249 for (unsigned In = 0; In < NumIncoming; In++) {
8250 PHINode *PhiOp =
8251 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8252 if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8253 assert(!InLoopVal && "Found more than one in-loop reduction!");
8254 InLoopVal = Operands[In];
8258 assert((!InLoopVal || NumIncoming == 2) &&
8259 "Found an in-loop reduction for PHI with unexpected number of "
8260 "incoming values");
8261 if (InLoopVal)
8262 return Operands[Operands[0] == InLoopVal ? 1 : 0];
8264 // We know that all PHIs in non-header blocks are converted into selects, so
8265 // we don't have to worry about the insertion order and we can just use the
8266 // builder. At this point we generate the predication tree. There may be
8267 // duplications since this is a simple recursive scan, but future
8268 // optimizations will clean it up.
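// E.g. (illustrative) for "%p = phi [ %a, %bb1 ], [ %b, %bb2 ]" in an
// if-converted block, the loop below produces the operand list
// (%a, mask(bb1->bb), %b, mask(bb2->bb)), and the resulting VPBlendRecipe is
// later lowered to a chain of selects over those edge masks.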
8269 SmallVector<VPValue *, 2> OperandsWithMask;
8271 for (unsigned In = 0; In < NumIncoming; In++) {
8272 VPValue *EdgeMask =
8273 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), *Plan);
8274 assert((EdgeMask || NumIncoming == 1) &&
8275 "Multiple predecessors with one having a full mask");
8276 OperandsWithMask.push_back(Operands[In]);
8277 if (EdgeMask)
8278 OperandsWithMask.push_back(EdgeMask);
8280 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8283 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8284 ArrayRef<VPValue *> Operands,
8285 VFRange &Range,
8286 VPlanPtr &Plan) {
8287 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8288 [this, CI](ElementCount VF) {
8289 return CM.isScalarWithPredication(CI, VF);
8291 Range);
8293 if (IsPredicated)
8294 return nullptr;
8296 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
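// Calls to these intrinsics are not widened here; they fall through to
// handleReplication, which scalarizes them (treating some as uniform for
// scalable VFs).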
8297 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8298 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8299 ID == Intrinsic::pseudoprobe ||
8300 ID == Intrinsic::experimental_noalias_scope_decl))
8301 return nullptr;
8303 SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
8305   // Is it beneficial to perform the intrinsic call compared to the lib call?
8306 bool ShouldUseVectorIntrinsic =
8307 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8308 [&](ElementCount VF) -> bool {
8309 return CM.getCallWideningDecision(CI, VF).Kind ==
8310 LoopVectorizationCostModel::CM_IntrinsicCall;
8312 Range);
8313 if (ShouldUseVectorIntrinsic)
8314 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
8315 CI->getDebugLoc());
8317 Function *Variant = nullptr;
8318 std::optional<unsigned> MaskPos;
8319   // Is it better to call a vectorized version of the function than to scalarize
8320   // the call?
8321 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8322 [&](ElementCount VF) -> bool {
8323 // The following case may be scalarized depending on the VF.
8324         // The flag shows whether we can use a regular call for the vectorized
8325         // version of the instruction.
8327 // If we've found a variant at a previous VF, then stop looking. A
8328 // vectorized variant of a function expects input in a certain shape
8329 // -- basically the number of input registers, the number of lanes
8330 // per register, and whether there's a mask required.
8331 // We store a pointer to the variant in the VPWidenCallRecipe, so
8332 // once we have an appropriate variant it's only valid for that VF.
8333 // This will force a different vplan to be generated for each VF that
8334 // finds a valid variant.
8335 if (Variant)
8336 return false;
8337 LoopVectorizationCostModel::CallWideningDecision Decision =
8338 CM.getCallWideningDecision(CI, VF);
8339 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8340 Variant = Decision.Variant;
8341 MaskPos = Decision.MaskPos;
8342 return true;
8345 return false;
8347 Range);
8348 if (ShouldUseVectorCall) {
8349 if (MaskPos.has_value()) {
8350 // We have 2 cases that would require a mask:
8351 // 1) The block needs to be predicated, either due to a conditional
8352 // in the scalar loop or use of an active lane mask with
8353 // tail-folding, and we use the appropriate mask for the block.
8354 // 2) No mask is required for the block, but the only available
8355 // vector variant at this VF requires a mask, so we synthesize an
8356 // all-true mask.
8357 VPValue *Mask = nullptr;
8358 if (Legal->isMaskRequired(CI))
8359 Mask = getBlockInMask(CI->getParent());
8360 else
8361 Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
8362 IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
8364 Ops.insert(Ops.begin() + *MaskPos, Mask);
8367 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8368 Intrinsic::not_intrinsic, CI->getDebugLoc(),
8369 Variant);
8372 return nullptr;
8375 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8376 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8377 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8378   // The instruction should be widened, unless it is scalar after vectorization,
8379   // scalarization is profitable, or it is predicated.
8380 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8381 return CM.isScalarAfterVectorization(I, VF) ||
8382 CM.isProfitableToScalarize(I, VF) ||
8383 CM.isScalarWithPredication(I, VF);
8385 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8386 Range);
8389 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8390 ArrayRef<VPValue *> Operands,
8391 VPBasicBlock *VPBB, VPlanPtr &Plan) {
8392 switch (I->getOpcode()) {
8393 default:
8394 return nullptr;
8395 case Instruction::SDiv:
8396 case Instruction::UDiv:
8397 case Instruction::SRem:
8398 case Instruction::URem: {
8399 // If not provably safe, use a select to form a safe divisor before widening the
8400 // div/rem operation itself. Otherwise fall through to general handling below.
8401 if (CM.isPredicatedInst(I)) {
8402 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8403 VPValue *Mask = getBlockInMask(I->getParent());
8404 VPValue *One = Plan->getVPValueOrAddLiveIn(
8405 ConstantInt::get(I->getType(), 1u, false));
8406 auto *SafeRHS =
8407 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8408 I->getDebugLoc());
8409 VPBB->appendRecipe(SafeRHS);
8410 Ops[1] = SafeRHS;
8411 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8413 [[fallthrough]];
8415 case Instruction::Add:
8416 case Instruction::And:
8417 case Instruction::AShr:
8418 case Instruction::FAdd:
8419 case Instruction::FCmp:
8420 case Instruction::FDiv:
8421 case Instruction::FMul:
8422 case Instruction::FNeg:
8423 case Instruction::FRem:
8424 case Instruction::FSub:
8425 case Instruction::ICmp:
8426 case Instruction::LShr:
8427 case Instruction::Mul:
8428 case Instruction::Or:
8429 case Instruction::Select:
8430 case Instruction::Shl:
8431 case Instruction::Sub:
8432 case Instruction::Xor:
8433 case Instruction::Freeze:
8434 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
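// Add the value incoming from the loop latch as an operand to each recorded
// header phi recipe, now that all recipes have been created.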
8438 void VPRecipeBuilder::fixHeaderPhis() {
8439 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8440 for (VPHeaderPHIRecipe *R : PhisToFix) {
8441 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8442 VPRecipeBase *IncR =
8443 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8444 R->addOperand(IncR->getVPSingleValue());
8448 VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
8449 VFRange &Range,
8450 VPlan &Plan) {
8451 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8452 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8453 Range);
8455 bool IsPredicated = CM.isPredicatedInst(I);
8457 // Even if the instruction is not marked as uniform, there are certain
8458 // intrinsic calls that can be effectively treated as such, so we check for
8459 // them here. Conservatively, we only do this for scalable vectors, since
8460 // for fixed-width VFs we can always fall back on full scalarization.
8461 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8462 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8463 case Intrinsic::assume:
8464 case Intrinsic::lifetime_start:
8465 case Intrinsic::lifetime_end:
8466       // For scalable vectors, if one of the operands is variant then we still
8467       // want to mark the call as uniform, which will generate one instruction for just
8468 // the first lane of the vector. We can't scalarize the call in the same
8469 // way as for fixed-width vectors because we don't know how many lanes
8470 // there are.
8472 // The reasons for doing it this way for scalable vectors are:
8473       // 1. For the assume intrinsic, generating the instruction for the first
8474       //    lane is still better than not generating any at all. For
8475 // example, the input may be a splat across all lanes.
8476 // 2. For the lifetime start/end intrinsics the pointer operand only
8477 // does anything useful when the input comes from a stack object,
8478 // which suggests it should always be uniform. For non-stack objects
8479 // the effect is to poison the object, which still allows us to
8480 // remove the call.
8481 IsUniform = true;
8482 break;
8483 default:
8484 break;
8487 VPValue *BlockInMask = nullptr;
8488 if (!IsPredicated) {
8489     // If Instr is not predicated, its recipe does not need a mask.
8490 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8491 } else {
8492 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8493 // Instructions marked for predication are replicated and a mask operand is
8494 // added initially. Masked replicate recipes will later be placed under an
8495 // if-then construct to prevent side-effects. Generate recipes to compute
8496 // the block mask for this region.
8497 BlockInMask = getBlockInMask(I->getParent());
8500 auto *Recipe = new VPReplicateRecipe(I, Plan.mapToVPValues(I->operands()),
8501 IsUniform, BlockInMask);
8502 return toVPRecipeResult(Recipe);
8505 VPRecipeOrVPValueTy
8506 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8507 ArrayRef<VPValue *> Operands,
8508 VFRange &Range, VPBasicBlock *VPBB,
8509 VPlanPtr &Plan) {
8510 // First, check for specific widening recipes that deal with inductions, Phi
8511 // nodes, calls and memory operations.
8512 VPRecipeBase *Recipe;
8513 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8514 if (Phi->getParent() != OrigLoop->getHeader())
8515 return tryToBlend(Phi, Operands, Plan);
8517 // Always record recipes for header phis. Later first-order recurrence phis
8518 // can have earlier phis as incoming values.
8519 recordRecipeOf(Phi);
8521 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8522 return toVPRecipeResult(Recipe);
8524 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8525 assert((Legal->isReductionVariable(Phi) ||
8526 Legal->isFixedOrderRecurrence(Phi)) &&
8527 "can only widen reductions and fixed-order recurrences here");
8528 VPValue *StartV = Operands[0];
8529 if (Legal->isReductionVariable(Phi)) {
8530 const RecurrenceDescriptor &RdxDesc =
8531 Legal->getReductionVars().find(Phi)->second;
8532 assert(RdxDesc.getRecurrenceStartValue() ==
8533 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8534 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8535 CM.isInLoopReduction(Phi),
8536 CM.useOrderedReductions(RdxDesc));
8537 } else {
8538 // TODO: Currently fixed-order recurrences are modeled as chains of
8539 // first-order recurrences. If there are no users of the intermediate
8540 // recurrences in the chain, the fixed order recurrence should be modeled
8541 // directly, enabling more efficient codegen.
8542 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8545 // Record the incoming value from the backedge, so we can add the incoming
8546 // value from the backedge after all recipes have been created.
8547 auto *Inc = cast<Instruction>(
8548 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8549 auto RecipeIter = Ingredient2Recipe.find(Inc);
8550 if (RecipeIter == Ingredient2Recipe.end())
8551 recordRecipeOf(Inc);
8553 PhisToFix.push_back(PhiRecipe);
8554 return toVPRecipeResult(PhiRecipe);
8557 if (isa<TruncInst>(Instr) &&
8558 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8559 Range, *Plan)))
8560 return toVPRecipeResult(Recipe);
8562 // All widen recipes below deal only with VF > 1.
8563 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8564 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8565 return nullptr;
8567 if (auto *CI = dyn_cast<CallInst>(Instr))
8568 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
8570 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8571 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8573 if (!shouldWiden(Instr, Range))
8574 return nullptr;
8576 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8577 return toVPRecipeResult(new VPWidenGEPRecipe(
8578 GEP, make_range(Operands.begin(), Operands.end())));
8580 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8581 return toVPRecipeResult(new VPWidenSelectRecipe(
8582 *SI, make_range(Operands.begin(), Operands.end())));
8585 if (auto *CI = dyn_cast<CastInst>(Instr)) {
8586 return toVPRecipeResult(new VPWidenCastRecipe(CI->getOpcode(), Operands[0],
8587 CI->getType(), *CI));
8590 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8593 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8594 ElementCount MaxVF) {
8595 assert(OrigLoop->isInnermost() && "Inner loop expected.");
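// Build VPlans for sub-ranges of [MinVF, MaxVF * 2). Each call below clamps
// SubRange.End to the end of the range covered by the returned plan, and the
// next iteration continues from there.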
8597 auto MaxVFTimes2 = MaxVF * 2;
8598 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
8599 VFRange SubRange = {VF, MaxVFTimes2};
8600 if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8601 // Now optimize the initial VPlan.
8602 if (!Plan->hasVF(ElementCount::getFixed(1)))
8603 VPlanTransforms::truncateToMinimalBitwidths(
8604 *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
8605 VPlanTransforms::optimize(*Plan, *PSE.getSE());
8606 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
8607 VPlans.push_back(std::move(Plan));
8609 VF = SubRange.End;
8613 // Add the canonical IV and branch recipes required to control the
8614 // loop.
8615 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8616 DebugLoc DL) {
8617 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8618 auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
8620 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8621 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8622 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8623 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8624 Header->insert(CanonicalIVPHI, Header->begin());
8626 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8627 // IV by VF * UF.
8628 auto *CanonicalIVIncrement =
8629 new VPInstruction(Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()},
8630 {HasNUW, false}, DL, "index.next");
8631 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8633 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8634 EB->appendRecipe(CanonicalIVIncrement);
8636 // Add the BranchOnCount VPInstruction to the latch.
8637 VPInstruction *BranchBack =
8638 new VPInstruction(VPInstruction::BranchOnCount,
8639 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8640 EB->appendRecipe(BranchBack);
8643 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8644 // original exit block.
8645 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8646 VPlan &Plan) {
8647 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8648 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8649 // Only handle single-exit loops with unique exit blocks for now.
8650 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8651 return;
8653 // Introduce VPUsers modeling the exit values.
8654 for (PHINode &ExitPhi : ExitBB->phis()) {
8655 Value *IncomingValue =
8656 ExitPhi.getIncomingValueForBlock(ExitingBB);
8657 VPValue *V = Plan.getVPValueOrAddLiveIn(IncomingValue);
8658 Plan.addLiveOut(&ExitPhi, V);
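// Build an initial VPlan for the given VF range: create recipes for the
// instructions of the original loop, then apply the recorded interleave-group,
// reduction and mask decisions before returning the plan.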
8662 VPlanPtr
8663 LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8665 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8667 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8669 // ---------------------------------------------------------------------------
8670 // Pre-construction: record ingredients whose recipes we'll need to further
8671 // process after constructing the initial VPlan.
8672 // ---------------------------------------------------------------------------
8674 // For each interleave group which is relevant for this (possibly trimmed)
8675 // Range, add it to the set of groups to be later applied to the VPlan and add
8676 // placeholders for its members' Recipes which we'll be replacing with a
8677 // single VPInterleaveRecipe.
8678 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8679 auto applyIG = [IG, this](ElementCount VF) -> bool {
8680 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8681 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8682 LoopVectorizationCostModel::CM_Interleave);
8683 // For scalable vectors, the only interleave factor currently supported
8684 // is 2 since we require the (de)interleave2 intrinsics instead of
8685 // shufflevectors.
8686 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8687 "Unsupported interleave factor for scalable vectors");
8688 return Result;
8690 if (!getDecisionAndClampRange(applyIG, Range))
8691 continue;
8692 InterleaveGroups.insert(IG);
8693 for (unsigned i = 0; i < IG->getFactor(); i++)
8694 if (Instruction *Member = IG->getMember(i))
8695 RecipeBuilder.recordRecipeOf(Member);
8698 // ---------------------------------------------------------------------------
8699 // Build initial VPlan: Scan the body of the loop in a topological order to
8700 // visit each basic block after having visited its predecessor basic blocks.
8701 // ---------------------------------------------------------------------------
8703 // Create initial VPlan skeleton, having a basic block for the pre-header
8704 // which contains SCEV expansions that need to happen before the CFG is
8705 // modified; a basic block for the vector pre-header, followed by a region for
8706 // the vector loop, followed by the middle basic block. The skeleton vector
8707 // loop region contains a header and latch basic blocks.
8708 VPlanPtr Plan = VPlan::createInitialVPlan(
8709 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8710 *PSE.getSE());
8711 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8712 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8713 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8714 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8715 Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8717   // Don't use getDecisionAndClampRange here, because we don't know the UF,
8718   // so it is better for this function to be conservative rather than to
8719   // split the range up into different VPlans.
8720 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8721 bool IVUpdateMayOverflow = false;
8722 for (ElementCount VF : Range)
8723 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(&CM, VF);
8725 DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8726 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8727 // When not folding the tail, we know that the induction increment will not
8728 // overflow.
8729 bool HasNUW = Style == TailFoldingStyle::None;
8730 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8732 // Scan the body of the loop in a topological order to visit each basic block
8733 // after having visited its predecessor basic blocks.
8734 LoopBlocksDFS DFS(OrigLoop);
8735 DFS.perform(LI);
8737 VPBasicBlock *VPBB = HeaderVPBB;
8738 bool NeedsMasks = CM.foldTailByMasking() ||
8739 any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
8740 return Legal->blockNeedsPredication(BB);
8742 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8743 // Relevant instructions from basic block BB will be grouped into VPRecipe
8744 // ingredients and fill a new VPBasicBlock.
8745 if (VPBB != HeaderVPBB)
8746 VPBB->setName(BB->getName());
8747 Builder.setInsertPoint(VPBB);
8749 if (VPBB == HeaderVPBB)
8750 RecipeBuilder.createHeaderMask(*Plan);
8751 else if (NeedsMasks)
8752 RecipeBuilder.createBlockInMask(BB, *Plan);
8754 // Introduce each ingredient into VPlan.
8755 // TODO: Model and preserve debug intrinsics in VPlan.
8756 for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
8757 Instruction *Instr = &I;
8758 SmallVector<VPValue *, 4> Operands;
8759 auto *Phi = dyn_cast<PHINode>(Instr);
8760 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8761 Operands.push_back(Plan->getVPValueOrAddLiveIn(
8762 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8763 } else {
8764 auto OpRange = Plan->mapToVPValues(Instr->operands());
8765 Operands = {OpRange.begin(), OpRange.end()};
8768       // Invariant stores inside the loop will be deleted, and a single store
8769       // with the final reduction value will be added to the exit block.
8770 StoreInst *SI;
8771 if ((SI = dyn_cast<StoreInst>(&I)) &&
8772 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8773 continue;
8775 auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8776 Instr, Operands, Range, VPBB, Plan);
8777 if (!RecipeOrValue)
8778 RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
8779 // If Instr can be simplified to an existing VPValue, use it.
8780 if (isa<VPValue *>(RecipeOrValue)) {
8781 auto *VPV = cast<VPValue *>(RecipeOrValue);
8782 Plan->addVPValue(Instr, VPV);
8783 // If the re-used value is a recipe, register the recipe for the
8784 // instruction, in case the recipe for Instr needs to be recorded.
8785 if (VPRecipeBase *R = VPV->getDefiningRecipe())
8786 RecipeBuilder.setRecipe(Instr, R);
8787 continue;
8789 // Otherwise, add the new recipe.
8790 VPRecipeBase *Recipe = cast<VPRecipeBase *>(RecipeOrValue);
8791 for (auto *Def : Recipe->definedValues()) {
8792 auto *UV = Def->getUnderlyingValue();
8793 Plan->addVPValue(UV, Def);
8796 RecipeBuilder.setRecipe(Instr, Recipe);
8797 if (isa<VPHeaderPHIRecipe>(Recipe)) {
8798 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8799 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8800 // recipes and need to be moved to the phi section of HeaderVPBB:
8801 // * tail-folding (non-phi recipes computing the header mask are
8802 // introduced earlier than regular header phi recipes, and should appear
8803 // after them)
8804 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8806 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8807 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8808 "unexpected recipe needs moving");
8809 Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8810 } else
8811 VPBB->appendRecipe(Recipe);
8814 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8815 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8818 // After here, VPBB should not be used.
8819 VPBB = nullptr;
8821 if (CM.requiresScalarEpilogue(Range)) {
8822 // No edge from the middle block to the unique exit block has been inserted
8823 // and there is nothing to fix from vector loop; phis should have incoming
8824 // from scalar loop only.
8825 } else
8826 addUsersInExitBlock(HeaderVPBB, OrigLoop, *Plan);
8828 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8829 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8830 "entry block must be set to a VPRegionBlock having a non-empty entry "
8831 "VPBasicBlock");
8832 RecipeBuilder.fixHeaderPhis();
8834 // ---------------------------------------------------------------------------
8835 // Transform initial VPlan: Apply previously taken decisions, in order, to
8836 // bring the VPlan to its final state.
8837 // ---------------------------------------------------------------------------
8839 // Adjust the recipes for any inloop reductions.
8840 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, Range.Start);
8842 // Interleave memory: for each Interleave Group we marked earlier as relevant
8843 // for this VPlan, replace the Recipes widening its memory instructions with a
8844 // single VPInterleaveRecipe at its insertion point.
8845 for (const auto *IG : InterleaveGroups) {
8846 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8847 RecipeBuilder.getRecipe(IG->getInsertPos()));
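    // Collect the values stored by the group's store members, in member order;
    // they become the stored operands of the interleave recipe.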
8848 SmallVector<VPValue *, 4> StoredValues;
8849 for (unsigned i = 0; i < IG->getFactor(); ++i)
8850 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8851 auto *StoreR =
8852 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8853 StoredValues.push_back(StoreR->getStoredValue());
8856 bool NeedsMaskForGaps =
8857 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8858 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8859 Recipe->getMask(), NeedsMaskForGaps);
8860 VPIG->insertBefore(Recipe);
8861 unsigned J = 0;
8862 for (unsigned i = 0; i < IG->getFactor(); ++i)
8863 if (Instruction *Member = IG->getMember(i)) {
8864 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
8865 if (!Member->getType()->isVoidTy()) {
8866 VPValue *OriginalV = MemberR->getVPSingleValue();
8867 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8868 J++;
8870 MemberR->eraseFromParent();
8874 for (ElementCount VF : Range)
8875 Plan->addVF(VF);
8876 Plan->setName("Initial VPlan");
8878 // Replace VPValues for known constant strides guaranteed by predicate scalar
8879 // evolution.
8880 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8881 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
8882 auto *ScevStride = dyn_cast<SCEVConstant>(PSE.getSCEV(StrideV));
8883 // Only handle constant strides for now.
8884 if (!ScevStride)
8885 continue;
8886 Constant *CI = ConstantInt::get(Stride->getType(), ScevStride->getAPInt());
8888 auto *ConstVPV = Plan->getVPValueOrAddLiveIn(CI);
8889 // The versioned value may not be used in the loop directly, so just add a
8890 // new live-in in those cases.
8891 Plan->getVPValueOrAddLiveIn(StrideV)->replaceAllUsesWith(ConstVPV);
8894 // From this point onwards, VPlan-to-VPlan transformations may change the plan
8895   // in ways that make accessing values using original IR values incorrect.
8896 Plan->disableValue2VPValue();
8898 // Sink users of fixed-order recurrence past the recipe defining the previous
8899 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8900 if (!VPlanTransforms::adjustFixedOrderRecurrences(*Plan, Builder))
8901 return nullptr;
8903 if (useActiveLaneMask(Style)) {
8904 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8905 // TailFoldingStyle is visible there.
8906 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8907 bool WithoutRuntimeCheck =
8908 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8909 VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8910 WithoutRuntimeCheck);
8912 return Plan;
8915 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8916 // Outer loop handling: They may require CFG and instruction level
8917 // transformations before even evaluating whether vectorization is profitable.
8918 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8919 // the vectorization pipeline.
8920 assert(!OrigLoop->isInnermost());
8921 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8923 // Create new empty VPlan
8924 auto Plan = VPlan::createInitialVPlan(
8925 createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8926 *PSE.getSE());
8928 // Build hierarchical CFG
8929 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8930 HCFGBuilder.buildHierarchicalCFG();
8932 for (ElementCount VF : Range)
8933 Plan->addVF(VF);
8935 VPlanTransforms::VPInstructionsToVPRecipes(
8936 Plan,
8937 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
8938 *PSE.getSE(), *TLI);
8940 // Remove the existing terminator of the exiting block of the top-most region.
8941 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8942 auto *Term =
8943 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8944 Term->eraseFromParent();
8946 // Tail folding is not supported for outer loops, so the induction increment
8947 // is guaranteed to not wrap.
8948 bool HasNUW = true;
8949 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8950 DebugLoc());
8951 return Plan;
8954 // Adjust the recipes for reductions. For in-loop reductions, the chain of
8955 // instructions leading from the loop exit instr to the phi needs to be converted
8956 // to reductions, with one operand being vector and the other being the scalar
8957 // reduction chain. For other reductions, a select is introduced between the phi
8958 // and live-out recipes when folding the tail.
8960 // A ComputeReductionResult recipe is added to the middle block, also for
8961 // in-loop reductions which compute their result in-loop, because generating
8962 // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8963 void LoopVectorizationPlanner::adjustRecipesForReductions(
8964 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8965 ElementCount MinVF) {
8966 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8967 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8968   // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8969   // sunk outside of the loop keep the same order as they had in the
8970   // original loop.
8971 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8972 for (VPRecipeBase &R : Header->phis()) {
8973 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
8974 ReductionPHIList.emplace_back(ReductionPhi);
8976 bool HasIntermediateStore = false;
8977 stable_sort(ReductionPHIList,
8978 [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8979 const VPReductionPHIRecipe *R2) {
8980 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8981 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8982 HasIntermediateStore |= IS1 || IS2;
8984 // If neither of the recipes has an intermediate store, keep the
8985 // order the same.
8986 if (!IS1 && !IS2)
8987 return false;
8989 // If only one of the recipes has an intermediate store, then
8990 // move it towards the beginning of the list.
8991 if (IS1 && !IS2)
8992 return true;
8994 if (!IS1 && IS2)
8995 return false;
8997 // If both recipes have an intermediate store, then the recipe
8998 // with the later store should be processed earlier. So it
8999 // should go to the beginning of the list.
9000 return DT->dominates(IS2, IS1);
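  // Re-emit the reduction phis in the sorted order within the header's phi
  // section, so reductions with intermediate stores are processed first.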
9003 if (HasIntermediateStore && ReductionPHIList.size() > 1)
9004 for (VPRecipeBase *R : ReductionPHIList)
9005 R->moveBefore(*Header, Header->getFirstNonPhi());
9007 for (VPRecipeBase &R : Header->phis()) {
9008 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9009 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9010 continue;
9012 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9013 RecurKind Kind = RdxDesc.getRecurrenceKind();
9014 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9015 "AnyOf reductions are not allowed for in-loop reductions");
9017 // Collect the chain of "link" recipes for the reduction starting at PhiR.
9018 SetVector<VPSingleDefRecipe *> Worklist;
9019 Worklist.insert(PhiR);
9020 for (unsigned I = 0; I != Worklist.size(); ++I) {
9021 VPSingleDefRecipe *Cur = Worklist[I];
9022 for (VPUser *U : Cur->users()) {
9023 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(U);
9024 if (!UserRecipe) {
9025 assert(isa<VPLiveOut>(U) &&
9026 "U must either be a VPSingleDef or VPLiveOut");
9027 continue;
9029 Worklist.insert(UserRecipe);
9033 // Visit operation "Links" along the reduction chain top-down starting from
9034 // the phi until LoopExitValue. We keep track of the previous item
9035 // (PreviousLink) to tell which of the two operands of a Link will remain
9036 // scalar and which will be reduced. For minmax by select(cmp), Link will be
9037 // the select instructions.
9038 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9039 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9040 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9042 // Index of the first operand which holds a non-mask vector operand.
9043 unsigned IndexOfFirstOperand;
9044 // Recognize a call to the llvm.fmuladd intrinsic.
9045 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9046 VPValue *VecOp;
9047 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9048 if (IsFMulAdd) {
9049 assert(
9050 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9051 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9052 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9053 isa<VPWidenCallRecipe>(CurrentLink)) &&
9054 CurrentLink->getOperand(2) == PreviousLink &&
9055 "expected a call where the previous link is the added operand");
9057 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9058 // need to create an fmul recipe (multiplying the first two operands of
9059 // the fmuladd together) to use as the vector operand for the fadd
9060 // reduction.
9061 VPInstruction *FMulRecipe = new VPInstruction(
9062 Instruction::FMul,
9063 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9064 CurrentLinkI->getFastMathFlags());
9065 LinkVPBB->insert(FMulRecipe, CurrentLink->getIterator());
9066 VecOp = FMulRecipe;
9067 } else {
9068 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9069 if (isa<VPWidenRecipe>(CurrentLink)) {
9070 assert(isa<CmpInst>(CurrentLinkI) &&
9071 "need to have the compare of the select");
9072 continue;
9074 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9075 "must be a select recipe");
9076 IndexOfFirstOperand = 1;
9077 } else {
9078 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9079 "Expected to replace a VPWidenSC");
9080 IndexOfFirstOperand = 0;
9082 // Note that for non-commutable operands (cmp-selects), the semantics of
9083 // the cmp-select are captured in the recurrence kind.
9084 unsigned VecOpId =
9085 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9086 ? IndexOfFirstOperand + 1
9087 : IndexOfFirstOperand;
9088 VecOp = CurrentLink->getOperand(VecOpId);
9089 assert(VecOp != PreviousLink &&
9090 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9091 (VecOpId - IndexOfFirstOperand)) ==
9092 PreviousLink &&
9093 "PreviousLink must be the operand other than VecOp");
9096 BasicBlock *BB = CurrentLinkI->getParent();
9097 VPValue *CondOp = nullptr;
9098 if (CM.blockNeedsPredicationForAnyReason(BB)) {
9099 VPBuilder::InsertPointGuard Guard(Builder);
9100 Builder.setInsertPoint(CurrentLink);
9101 CondOp = RecipeBuilder.getBlockInMask(BB);
9104 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9105 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp);
9106 // Append the recipe to the end of the VPBasicBlock because we need to
9107       // ensure that it comes after all of its inputs, including CondOp.
9108 // Note that this transformation may leave over dead recipes (including
9109 // CurrentLink), which will be cleaned by a later VPlan transform.
9110 LinkVPBB->appendRecipe(RedRecipe);
9111 CurrentLink->replaceAllUsesWith(RedRecipe);
9112 PreviousLink = RedRecipe;
9115 Builder.setInsertPoint(&*LatchVPBB->begin());
9116 for (VPRecipeBase &R :
9117 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9118 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9119 if (!PhiR)
9120 continue;
9122 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9123 // If tail is folded by masking, introduce selects between the phi
9124 // and the live-out instruction of each reduction, at the beginning of the
9125 // dedicated latch block.
9126 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9127 auto *NewExitingVPV = PhiR->getBackedgeValue();
9128 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9129 VPValue *Cond = RecipeBuilder.getBlockInMask(OrigLoop->getHeader());
9130 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9131 "reduction recipe must be defined before latch");
9132 Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9133 std::optional<FastMathFlags> FMFs =
9134 PhiTy->isFloatingPointTy()
9135 ? std::make_optional(RdxDesc.getFastMathFlags())
9136 : std::nullopt;
9137 NewExitingVPV =
9138 Builder.createSelect(Cond, OrigExitingVPV, PhiR, {}, "", FMFs);
9139 OrigExitingVPV->replaceUsesWithIf(NewExitingVPV, [](VPUser &U, unsigned) {
9140 return isa<VPInstruction>(&U) &&
9141 cast<VPInstruction>(&U)->getOpcode() ==
9142 VPInstruction::ComputeReductionResult;
9144 if (PreferPredicatedReductionSelect ||
9145 TTI.preferPredicatedReductionSelect(
9146 PhiR->getRecurrenceDescriptor().getOpcode(), PhiTy,
9147 TargetTransformInfo::ReductionFlags()))
9148 PhiR->setOperand(1, NewExitingVPV);
9151 // If the vector reduction can be performed in a smaller type, we truncate
9152 // then extend the loop exit value to enable InstCombine to evaluate the
9153 // entire expression in the smaller type.
9154 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9155 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9156 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9157 Type *RdxTy = RdxDesc.getRecurrenceType();
9158 auto *Trunc =
9159 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9160 auto *Extnd =
9161 RdxDesc.isSigned()
9162 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9163 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9165 Trunc->insertAfter(NewExitingVPV->getDefiningRecipe());
9166 Extnd->insertAfter(Trunc);
9167 if (PhiR->getOperand(1) == NewExitingVPV)
9168 PhiR->setOperand(1, Extnd->getVPSingleValue());
9169 NewExitingVPV = Extnd;
9172 // We want code in the middle block to appear to execute on the location of
9173 // the scalar loop's latch terminator because: (a) it is all compiler
9174 // generated, (b) these instructions are always executed after evaluating
9175 // the latch conditional branch, and (c) other passes may add new
9176 // predecessors which terminate on this line. This is the easiest way to
9177 // ensure we don't accidentally cause an extra step back into the loop while
9178 // debugging.
9179 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9181 // TODO: At the moment ComputeReductionResult also drives creation of the
9182 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9183 // even for in-loop reductions, until the reduction resume value handling is
9184 // also modeled in VPlan.
9185 auto *FinalReductionResult = new VPInstruction(
9186 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9187 cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
9188 ->appendRecipe(FinalReductionResult);
9189 OrigExitingVPV->replaceUsesWithIf(
9190 FinalReductionResult,
9191 [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });
9194 VPlanTransforms::clearReductionWrapFlags(*Plan);
9197 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9198 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9199 VPSlotTracker &SlotTracker) const {
9200 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9201 IG->getInsertPos()->printAsOperand(O, false);
9202 O << ", ";
9203 getAddr()->printAsOperand(O, SlotTracker);
9204 VPValue *Mask = getMask();
9205 if (Mask) {
9206 O << ", ";
9207 Mask->printAsOperand(O, SlotTracker);
9210 unsigned OpIdx = 0;
9211 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9212 if (!IG->getMember(i))
9213 continue;
9214 if (getNumStoreOperands() > 0) {
9215 O << "\n" << Indent << " store ";
9216 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9217 O << " to index " << i;
9218 } else {
9219 O << "\n" << Indent << " ";
9220 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9221 O << " = load from index " << i;
9223 ++OpIdx;
9226 #endif
9228 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9229 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9230 "Not a pointer induction according to InductionDescriptor!");
9231 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9232 "Unexpected type.");
9234 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9235 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9237 if (onlyScalarsGenerated(State.VF)) {
9238 // This is the normalized GEP that starts counting at zero.
9239 Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9240 CanonicalIV, IndDesc.getStep()->getType());
9241 // Determine the number of scalars we need to generate for each unroll
9242 // iteration. If the instruction is uniform, we only need to generate the
9243 // first lane. Otherwise, we generate all VF values.
9244 bool IsUniform = vputils::onlyFirstLaneUsed(this);
9245 assert((IsUniform || !State.VF.isScalable()) &&
9246 "Cannot scalarize a scalable VF");
9247 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
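    // For each part and lane, compute the scalar pointer as
    // StartValue + (IV + Part * VF + Lane) * Step via emitTransformedIndex.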
9249 for (unsigned Part = 0; Part < State.UF; ++Part) {
9250 Value *PartStart =
9251 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9253 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9254 Value *Idx = State.Builder.CreateAdd(
9255 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9256 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9258 Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
9259 Value *SclrGep = emitTransformedIndex(
9260 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
9261 IndDesc.getKind(), IndDesc.getInductionBinOp());
9262 SclrGep->setName("next.gep");
9263 State.set(this, SclrGep, VPIteration(Part, Lane));
9266 return;
9269 Type *PhiType = IndDesc.getStep()->getType();
9271 // Build a pointer phi
9272 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9273 Type *ScStValueType = ScalarStartValue->getType();
9274 PHINode *NewPointerPhi =
9275 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9277 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9278 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9280 // A pointer induction, performed by using a gep
9281 Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9283 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9284 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9285 Value *NumUnrolledElems =
9286 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
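  // Advance the pointer phi by Step * VF * UF each vector iteration.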
9287 Value *InductionGEP = GetElementPtrInst::Create(
9288 State.Builder.getInt8Ty(), NewPointerPhi,
9289 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9290 InductionLoc);
9291 // Add induction update using an incorrect block temporarily. The phi node
9292 // will be fixed after VPlan execution. Note that at this point the latch
9293 // block cannot be used, as it does not exist yet.
9294 // TODO: Model increment value in VPlan, by turning the recipe into a
9295 // multi-def and a subclass of VPHeaderPHIRecipe.
9296 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9298 // Create UF many actual address geps that use the pointer
9299 // phi as base and a vectorized version of the step value
9300 // (<step*0, ..., step*N>) as offset.
9301 for (unsigned Part = 0; Part < State.UF; ++Part) {
9302 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9303 Value *StartOffsetScalar =
9304 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9305 Value *StartOffset =
9306 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9307 // Create a vector of consecutive numbers from zero to VF.
9308 StartOffset = State.Builder.CreateAdd(
9309 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9311 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9312 "scalar step must be the same across all parts");
9313 Value *GEP = State.Builder.CreateGEP(
9314 State.Builder.getInt8Ty(), NewPointerPhi,
9315 State.Builder.CreateMul(
9316 StartOffset,
9317 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9318 "vector.gep"));
9319 State.set(this, GEP, Part);
9323 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9324 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9326 // Fast-math-flags propagate from the original induction instruction.
9327 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9328 if (FPBinOp)
9329 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9331 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9332 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9333 Value *DerivedIV = emitTransformedIndex(
9334 State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9335 Kind, cast_if_present<BinaryOperator>(FPBinOp));
9336 DerivedIV->setName("offset.idx");
9337 if (TruncResultTy) {
9338 assert(TruncResultTy != DerivedIV->getType() &&
9339 Step->getType()->isIntegerTy() &&
9340 "Truncation requires an integer step");
9341 DerivedIV = State.Builder.CreateTrunc(DerivedIV, TruncResultTy);
9343 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9345 State.set(this, DerivedIV, VPIteration(0, 0));
9348 void VPInterleaveRecipe::execute(VPTransformState &State) {
9349 assert(!State.Instance && "Interleave group being replicated.");
9350 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9351 getStoredValues(), getMask(),
9352 NeedsMaskForGaps);
9355 void VPReductionRecipe::execute(VPTransformState &State) {
9356 assert(!State.Instance && "Reduction being replicated.");
9357 Value *PrevInChain = State.get(getChainOp(), 0);
9358 RecurKind Kind = RdxDesc.getRecurrenceKind();
9359 bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc);
9360 // Propagate the fast-math flags carried by the underlying instruction.
9361 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9362 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
9363 for (unsigned Part = 0; Part < State.UF; ++Part) {
9364 Value *NewVecOp = State.get(getVecOp(), Part);
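    // If the reduction is conditional, blend the vector operand with the
    // reduction identity so that masked-off lanes do not affect the result.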
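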
9365 if (VPValue *Cond = getCondOp()) {
9366 Value *NewCond = State.VF.isVector() ? State.get(Cond, Part)
9367 : State.get(Cond, {Part, 0});
9368 VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
9369 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
9370 Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
9371 RdxDesc.getFastMathFlags());
9372 if (State.VF.isVector()) {
9373 Iden =
9374 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9377 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
9378 NewVecOp = Select;
9380 Value *NewRed;
9381 Value *NextInChain;
9382 if (IsOrdered) {
9383 if (State.VF.isVector())
9384 NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
9385 PrevInChain);
9386 else
9387 NewRed = State.Builder.CreateBinOp(
9388 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
9389 NewVecOp);
9390 PrevInChain = NewRed;
9391 } else {
9392 PrevInChain = State.get(getChainOp(), Part);
9393 NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
9395 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9396 NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
9397 NewRed, PrevInChain);
9398 } else if (IsOrdered)
9399 NextInChain = NewRed;
9400 else
9401 NextInChain = State.Builder.CreateBinOp(
9402 (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
9403 State.set(this, NextInChain, Part);
9407 void VPReplicateRecipe::execute(VPTransformState &State) {
9408 Instruction *UI = getUnderlyingInstr();
9409 if (State.Instance) { // Generate a single instance.
9410 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9411 State.ILV->scalarizeInstruction(UI, this, *State.Instance, State);
9412 // Insert scalar instance packing it into a vector.
9413 if (State.VF.isVector() && shouldPack()) {
9414 // If we're constructing lane 0, initialize to start from poison.
9415 if (State.Instance->Lane.isFirstLane()) {
9416 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9417 Value *Poison = PoisonValue::get(
9418 VectorType::get(UI->getType(), State.VF));
9419 State.set(this, Poison, State.Instance->Part);
9421 State.packScalarIntoVectorValue(this, *State.Instance);
9423 return;
9426 if (IsUniform) {
9427 // If the recipe is uniform across all parts (instead of just per VF), only
9428 // generate a single instance.
9429 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9430 all_of(operands(), [](VPValue *Op) {
9431 return Op->isDefinedOutsideVectorRegions();
9432 })) {
9433 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
9434 if (user_begin() != user_end()) {
9435 for (unsigned Part = 1; Part < State.UF; ++Part)
9436 State.set(this, State.get(this, VPIteration(0, 0)),
9437 VPIteration(Part, 0));
9439 return;
9442     // Uniform within the VF means we need to generate lane 0 only for each
9443 // unrolled copy.
9444 for (unsigned Part = 0; Part < State.UF; ++Part)
9445 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
9446 return;
9449 // A store of a loop varying value to a uniform address only needs the last
9450 // copy of the store.
9451 if (isa<StoreInst>(UI) &&
9452 vputils::isUniformAfterVectorization(getOperand(1))) {
9453 auto Lane = VPLane::getLastLaneForVF(State.VF);
9454 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9455 State);
9456 return;
9459 // Generate scalar instances for all VF lanes of all UF parts.
9460 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9461 const unsigned EndLane = State.VF.getKnownMinValue();
9462 for (unsigned Part = 0; Part < State.UF; ++Part)
9463 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9464 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
9467 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9468 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9470 // Attempt to issue a wide load.
9471 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9472 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9474 assert((LI || SI) && "Invalid Load/Store instruction");
9475 assert((!SI || StoredValue) && "No stored value provided for widened store");
9476 assert((!LI || !StoredValue) && "Stored value provided for widened load");
9478 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9480 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9481 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9482 bool CreateGatherScatter = !isConsecutive();
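  // Non-consecutive accesses are widened as gathers/scatters; consecutive
  // accesses become wide (possibly masked, possibly reversed) loads/stores.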
9484 auto &Builder = State.Builder;
9485 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9486 bool isMaskRequired = getMask();
9487 if (isMaskRequired) {
9488     // Mask reversal is only needed for real (non-null) masks; a null mask
9489     // represents an all-one mask, and its reverse is still an all-one (null) mask.
9490 for (unsigned Part = 0; Part < State.UF; ++Part) {
9491 Value *Mask = State.get(getMask(), Part);
9492 if (isReverse())
9493 Mask = Builder.CreateVectorReverse(Mask, "reverse");
9494 BlockInMaskParts[Part] = Mask;
9498 // Handle Stores:
9499 if (SI) {
9500 State.setDebugLocFrom(SI->getDebugLoc());
9502 for (unsigned Part = 0; Part < State.UF; ++Part) {
9503 Instruction *NewSI = nullptr;
9504 Value *StoredVal = State.get(StoredValue, Part);
9505 if (CreateGatherScatter) {
9506 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9507 Value *VectorGep = State.get(getAddr(), Part);
9508 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9509 MaskPart);
9510 } else {
9511 if (isReverse()) {
9512 // If we store to reverse consecutive memory locations, then we need
9513 // to reverse the order of elements in the stored value.
9514 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9515 // We don't want to update the value in the map as it might be used in
9516 // another expression. So don't call resetVectorValue(StoredVal).
9518 auto *VecPtr = State.get(getAddr(), Part);
9519 if (isMaskRequired)
9520 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9521 BlockInMaskParts[Part]);
9522 else
9523 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9525 State.addMetadata(NewSI, SI);
9527 return;
9530 // Handle loads.
9531 assert(LI && "Must have a load instruction");
9532 State.setDebugLocFrom(LI->getDebugLoc());
9533 for (unsigned Part = 0; Part < State.UF; ++Part) {
9534 Value *NewLI;
9535 if (CreateGatherScatter) {
9536 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9537 Value *VectorGep = State.get(getAddr(), Part);
9538 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9539 nullptr, "wide.masked.gather");
9540 State.addMetadata(NewLI, LI);
9541 } else {
9542 auto *VecPtr = State.get(getAddr(), Part);
9543 if (isMaskRequired)
9544 NewLI = Builder.CreateMaskedLoad(
9545 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9546 PoisonValue::get(DataTy), "wide.masked.load");
9547 else
9548 NewLI =
9549 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9551 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9552 State.addMetadata(NewLI, LI);
9553 if (Reverse)
9554 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9557 State.set(getVPSingleValue(), NewLI, Part);
9561 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9562 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9563 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9564 // for predication.
9565 static ScalarEpilogueLowering getScalarEpilogueLowering(
9566 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9567 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9568 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9569 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9570 // don't look at hints or options, and don't request a scalar epilogue.
9571 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9572 // LoopAccessInfo (due to code dependency and not being able to reliably get
9573 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9574 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9575 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9576 // back to the old way and vectorize with versioning when forced. See D81345.)
9577 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9578 PGSOQueryType::IRPass) &&
9579 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9580 return CM_ScalarEpilogueNotAllowedOptSize;
9582 // 2) If set, obey the directives
9583 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9584 switch (PreferPredicateOverEpilogue) {
9585 case PreferPredicateTy::ScalarEpilogue:
9586 return CM_ScalarEpilogueAllowed;
9587 case PreferPredicateTy::PredicateElseScalarEpilogue:
9588 return CM_ScalarEpilogueNotNeededUsePredicate;
9589 case PreferPredicateTy::PredicateOrDontVectorize:
9590 return CM_ScalarEpilogueNotAllowedUsePredicate;
9594 // 3) If set, obey the hints
9595 switch (Hints.getPredicate()) {
9596 case LoopVectorizeHints::FK_Enabled:
9597 return CM_ScalarEpilogueNotNeededUsePredicate;
9598 case LoopVectorizeHints::FK_Disabled:
9599 return CM_ScalarEpilogueAllowed;
9602 // 4) if the TTI hook indicates this is profitable, request predication.
9603 TailFoldingInfo TFI(TLI, &LVL, IAI);
9604 if (TTI->preferPredicateOverEpilogue(&TFI))
9605 return CM_ScalarEpilogueNotNeededUsePredicate;
9607 return CM_ScalarEpilogueAllowed;
9610 // Process the loop in the VPlan-native vectorization path. This path builds
9611 // VPlan upfront in the vectorization pipeline, which allows applying
9612 // VPlan-to-VPlan transformations from the very beginning without modifying the
9613 // input LLVM IR.
9614 static bool processLoopInVPlanNativePath(
9615 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9616 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9617 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9618 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9619 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9620 LoopVectorizationRequirements &Requirements) {
9622 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9623 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9624 return false;
9626 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9627 Function *F = L->getHeader()->getParent();
9628 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9630 ScalarEpilogueLowering SEL =
9631 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
9633 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9634 &Hints, IAI);
9635 // Use the planner for outer loop vectorization.
9636 // TODO: CM is not used at this point inside the planner. Turn CM into an
9637 // optional argument if we don't need it in the future.
9638 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9639 ORE);
9641 // Get user vectorization factor.
9642 ElementCount UserVF = Hints.getWidth();
9644 CM.collectElementTypesForWidening();
9646 // Plan how to best vectorize, return the best VF and its cost.
9647 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9649 // If we are stress testing VPlan builds, do not attempt to generate vector
9650 // code. Masked vector code generation support will follow soon.
9651 // Also, do not attempt to vectorize if no vector code will be produced.
9652 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9653 return false;
9655 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9658 bool AddBranchWeights =
9659 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9660 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9661 F->getParent()->getDataLayout(), AddBranchWeights);
9662 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9663 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9664 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9665 << L->getHeader()->getParent()->getName() << "\"\n");
9666 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9669 reportVectorization(ORE, L, VF, 1);
9671 // Mark the loop as already vectorized to avoid vectorizing again.
9672 Hints.setAlreadyVectorized();
9673 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9674 return true;
9677 // Emit a remark if there are stores to floats that required a floating point
9678 // extension. If the vectorized loop ends up doing its arithmetic in a wider
9679 // floating-point type, there will be a performance penalty from the conversion
9680 // overhead and from the wider vector elements reducing the number of lanes.
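// For instance (an illustrative C loop), given float *A and float B,
//   A[i] = A[i] * B + 1.0;
// the double literal promotes the float operands, so the stored float value is
// reached through an fpext/fptrunc pair and the fpext below gets the remark.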
9681 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9682 SmallVector<Instruction *, 4> Worklist;
9683 for (BasicBlock *BB : L->getBlocks()) {
9684 for (Instruction &Inst : *BB) {
9685 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9686 if (S->getValueOperand()->getType()->isFloatTy())
9687 Worklist.push_back(S);
9692 // Traverse the floating point stores upwards, searching for floating point
9693 // conversions.
9694 SmallPtrSet<const Instruction *, 4> Visited;
9695 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9696 while (!Worklist.empty()) {
9697 auto *I = Worklist.pop_back_val();
9698 if (!L->contains(I))
9699 continue;
9700 if (!Visited.insert(I).second)
9701 continue;
9703 // Emit a remark if the floating point store required a floating
9704 // point conversion.
9705 // TODO: More work could be done to identify the root cause such as a
9706 // constant or a function return type and point the user to it.
9707 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9708 ORE->emit([&]() {
9709 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9710 I->getDebugLoc(), L->getHeader())
9711 << "floating point conversion changes vector width. "
9712 << "Mixed floating point precision requires an up/down "
9713 << "cast that will negatively impact performance.";
9716 for (Use &Op : I->operands())
9717 if (auto *OpI = dyn_cast<Instruction>(Op))
9718 Worklist.push_back(OpI);
9722 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9723 VectorizationFactor &VF,
9724 std::optional<unsigned> VScale, Loop *L,
9725 ScalarEvolution &SE,
9726 ScalarEpilogueLowering SEL) {
9727 InstructionCost CheckCost = Checks.getCost();
9728 if (!CheckCost.isValid())
9729 return false;
9731 // When only interleaving, the scalar and vector costs will be equal, which in
9732 // turn would lead to a divide by 0. Fall back to the hard threshold.
9733 if (VF.Width.isScalar()) {
9734 if (CheckCost > VectorizeMemoryCheckThreshold) {
9735 LLVM_DEBUG(
9736 dbgs()
9737 << "LV: Interleaving only is not profitable due to runtime checks\n");
9738 return false;
9740 return true;
9743 // The scalar cost should only be 0 when vectorizing with a user specified
// VF/IC. In those cases, runtime checks should always be generated.
9744 double ScalarC = *VF.ScalarCost.getValue();
9745 if (ScalarC == 0)
9746 return true;
9748 // First, compute the minimum iteration count required so that the vector
9749 // loop outperforms the scalar loop.
9750 // The total cost of the scalar loop is
9751 // ScalarC * TC
9752 // where
9753 // * TC is the actual trip count of the loop.
9754 // * ScalarC is the cost of a single scalar iteration.
9756 // The total cost of the vector loop is
9757 // RtC + VecC * (TC / VF) + EpiC
9758 // where
9759 // * RtC is the cost of the generated runtime checks
9760 // * VecC is the cost of a single vector iteration.
9761 // * TC is the actual trip count of the loop
9762 // * VF is the vectorization factor
9763 // * EpiC is the cost of the generated epilogue, including the cost
9764 // of the remaining scalar operations.
9766 // Vectorization is profitable once the total vector cost is less than the
9767 // total scalar cost:
9768 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9770 // Now we can compute the minimum required trip count TC as
9771 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9773 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9774 // the computations are performed on doubles, not integers, and the result
9775 // is rounded up, hence we get an upper estimate of the TC.
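// As an illustration with made-up costs: if RtC = 30, ScalarC = 4, VecC = 12
// and VF = 4, then VecC / VF = 3 and the vector loop only starts to pay off
// once TC > 30 / (4 - 3) = 30 iterations.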
9776 unsigned IntVF = VF.Width.getKnownMinValue();
9777 if (VF.Width.isScalable()) {
9778 unsigned AssumedMinimumVscale = 1;
9779 if (VScale)
9780 AssumedMinimumVscale = *VScale;
9781 IntVF *= AssumedMinimumVscale;
9783 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9784 double RtC = *CheckCost.getValue();
9785 double MinTC1 = RtC / (ScalarC - VecCOverVF);
9787 // Second, compute a minimum iteration count so that the cost of the
9788 // runtime checks is only a fraction of the total scalar loop cost. This
9789 // adds a loop-dependent bound on the overhead incurred if the runtime
9790 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9791 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9792 // cost, compute
9793 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
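// For example, with the illustrative RtC = 30 and ScalarC = 4 above and the
// hard-coded X = 10 below, the checks stay under a tenth of the scalar loop
// cost once TC > 30 * 10 / 4 = 75.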
9794 double MinTC2 = RtC * 10 / ScalarC;
9796 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9797 // epilogue is allowed, choose the next closest multiple of VF. This should
9798 // partly compensate for ignoring the epilogue cost.
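// Continuing the illustrative numbers, max(30, 75) = 75 would be rounded up
// to 76 for IntVF = 4 when a scalar epilogue is allowed.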
9799 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
9800 if (SEL == CM_ScalarEpilogueAllowed)
9801 MinTC = alignTo(MinTC, IntVF);
9802 VF.MinProfitableTripCount = ElementCount::getFixed(MinTC);
9804 LLVM_DEBUG(
9805 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9806 << VF.MinProfitableTripCount << "\n");
9808 // Skip vectorization if the expected trip count is less than the minimum
9809 // required trip count.
9810 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9811 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
9812 VF.MinProfitableTripCount)) {
9813 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9814 "trip count < minimum profitable VF ("
9815 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9816 << ")\n");
9818 return false;
9821 return true;
9824 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9825 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9826 !EnableLoopInterleaving),
9827 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9828 !EnableLoopVectorization) {}
9830 bool LoopVectorizePass::processLoop(Loop *L) {
9831 assert((EnableVPlanNativePath || L->isInnermost()) &&
9832 "VPlan-native path is not enabled. Only process inner loops.");
9834 #ifndef NDEBUG
9835 const std::string DebugLocStr = getDebugLocString(L);
9836 #endif /* NDEBUG */
9838 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9839 << L->getHeader()->getParent()->getName() << "' from "
9840 << DebugLocStr << "\n");
9842 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9844 LLVM_DEBUG(
9845 dbgs() << "LV: Loop hints:"
9846 << " force="
9847 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9848 ? "disabled"
9849 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9850 ? "enabled"
9851 : "?"))
9852 << " width=" << Hints.getWidth()
9853 << " interleave=" << Hints.getInterleave() << "\n");
9855 // Function containing loop
9856 Function *F = L->getHeader()->getParent();
9858 // Looking at the diagnostic output is the only way to determine if a loop
9859 // was vectorized (other than looking at the IR or machine code), so it
9860 // is important to generate an optimization remark for each loop. Most of
9861 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9862 // generated as OptimizationRemark and OptimizationRemarkMissed are less
9863 // verbose and report vectorized loops and unvectorized loops that may
9864 // benefit from vectorization, respectively.
9866 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9867 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9868 return false;
9871 PredicatedScalarEvolution PSE(*SE, *L);
9873 // Check if it is legal to vectorize the loop.
9874 LoopVectorizationRequirements Requirements;
9875 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9876 &Requirements, &Hints, DB, AC, BFI, PSI);
9877 if (!LVL.canVectorize(EnableVPlanNativePath)) {
9878 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9879 Hints.emitRemarkWithHints();
9880 return false;
9883 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9884 // here. They may require CFG and instruction level transformations before
9885 // even evaluating whether vectorization is profitable. Since we cannot modify
9886 // the incoming IR, we need to build VPlan upfront in the vectorization
9887 // pipeline.
9888 if (!L->isInnermost())
9889 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9890 ORE, BFI, PSI, Hints, Requirements);
9892 assert(L->isInnermost() && "Inner loop expected.");
9894 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9895 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9897 // If an override option has been passed in for interleaved accesses, use it.
9898 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9899 UseInterleaved = EnableInterleavedMemAccesses;
9901 // Analyze interleaved memory accesses.
9902 if (UseInterleaved)
9903 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9905 // Check the function attributes and profiles to find out if this function
9906 // should be optimized for size.
9907 ScalarEpilogueLowering SEL =
9908 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI);
9910 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9911 // count by optimizing for size, to minimize overheads.
9912 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9913 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9914 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9915 << "This loop is worth vectorizing only if no scalar "
9916 << "iteration overheads are incurred.");
9917 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9918 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9919 else {
9920 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9921 LLVM_DEBUG(dbgs() << "\n");
9922 // Predicate tail-folded loops are efficient even when the loop
9923 // iteration count is low. However, setting the epilogue policy to
9924 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9925 // with runtime checks. It's more effective to let
9926 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9927 // for the loop.
9928 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9929 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9930 } else {
9931 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9932 "small to consider vectorizing.\n");
9933 reportVectorizationFailure(
9934 "The trip count is below the minial threshold value.",
9935 "loop trip count is too low, avoiding vectorization",
9936 "LowTripCount", ORE, L);
9937 Hints.emitRemarkWithHints();
9938 return false;
9943 // Check the function attributes to see if implicit floats or vectors are
9944 // allowed.
9945 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9946 reportVectorizationFailure(
9947 "Can't vectorize when the NoImplicitFloat attribute is used",
9948 "loop not vectorized due to NoImplicitFloat attribute",
9949 "NoImplicitFloat", ORE, L);
9950 Hints.emitRemarkWithHints();
9951 return false;
9954 // Check if the target supports potentially unsafe FP vectorization.
9955 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9956 // for the target we're vectorizing for, to make sure none of the
9957 // additional fp-math flags can help.
9958 if (Hints.isPotentiallyUnsafe() &&
9959 TTI->isFPVectorizationPotentiallyUnsafe()) {
9960 reportVectorizationFailure(
9961 "Potentially unsafe FP op prevents vectorization",
9962 "loop not vectorized due to unsafe FP support.",
9963 "UnsafeFP", ORE, L);
9964 Hints.emitRemarkWithHints();
9965 return false;
9968 bool AllowOrderedReductions;
9969 // If the flag is set, use that instead and override the TTI behaviour.
9970 if (ForceOrderedReductions.getNumOccurrences() > 0)
9971 AllowOrderedReductions = ForceOrderedReductions;
9972 else
9973 AllowOrderedReductions = TTI->enableOrderedReductions();
9974 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
9975 ORE->emit([&]() {
9976 auto *ExactFPMathInst = Requirements.getExactFPInst();
9977 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9978 ExactFPMathInst->getDebugLoc(),
9979 ExactFPMathInst->getParent())
9980 << "loop not vectorized: cannot prove it is safe to reorder "
9981 "floating-point operations";
9983 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9984 "reorder floating-point operations\n");
9985 Hints.emitRemarkWithHints();
9986 return false;
9989 // Use the cost model.
9990 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9991 F, &Hints, IAI);
9992 // Use the planner for vectorization.
9993 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9994 ORE);
9996 // Get user vectorization factor and interleave count.
9997 ElementCount UserVF = Hints.getWidth();
9998 unsigned UserIC = Hints.getInterleave();
10000 // Plan how to best vectorize, return the best VF and its cost.
10001 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10003 VectorizationFactor VF = VectorizationFactor::Disabled();
10004 unsigned IC = 1;
10006 bool AddBranchWeights =
10007 hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10008 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10009 F->getParent()->getDataLayout(), AddBranchWeights);
10010 if (MaybeVF) {
10011 VF = *MaybeVF;
10012 // Select the interleave count.
10013 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10015 unsigned SelectedIC = std::max(IC, UserIC);
10016 // Optimistically generate runtime checks if they are needed. Drop them if
10017 // they turn out to not be profitable.
10018 if (VF.Width.isVector() || SelectedIC > 1)
10019 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10021 // Check if it is profitable to vectorize with runtime checks.
10022 bool ForceVectorization =
10023 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10024 if (!ForceVectorization &&
10025 !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
10026 *PSE.getSE(), SEL)) {
10027 ORE->emit([&]() {
10028 return OptimizationRemarkAnalysisAliasing(
10029 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10030 L->getHeader())
10031 << "loop not vectorized: cannot prove it is safe to reorder "
10032 "memory operations";
10034 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10035 Hints.emitRemarkWithHints();
10036 return false;
10040 // Identify the diagnostic messages that should be produced.
10041 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10042 bool VectorizeLoop = true, InterleaveLoop = true;
10043 if (VF.Width.isScalar()) {
10044 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10045 VecDiagMsg = std::make_pair(
10046 "VectorizationNotBeneficial",
10047 "the cost-model indicates that vectorization is not beneficial");
10048 VectorizeLoop = false;
10051 if (!MaybeVF && UserIC > 1) {
10052 // Tell the user interleaving was avoided up-front, despite being explicitly
10053 // requested.
10054 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10055 "interleaving should be avoided up front\n");
10056 IntDiagMsg = std::make_pair(
10057 "InterleavingAvoided",
10058 "Ignoring UserIC, because interleaving was avoided up front");
10059 InterleaveLoop = false;
10060 } else if (IC == 1 && UserIC <= 1) {
10061 // Tell the user interleaving is not beneficial.
10062 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10063 IntDiagMsg = std::make_pair(
10064 "InterleavingNotBeneficial",
10065 "the cost-model indicates that interleaving is not beneficial");
10066 InterleaveLoop = false;
10067 if (UserIC == 1) {
10068 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10069 IntDiagMsg.second +=
10070 " and is explicitly disabled or interleave count is set to 1";
10072 } else if (IC > 1 && UserIC == 1) {
10073 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10074 LLVM_DEBUG(
10075 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10076 IntDiagMsg = std::make_pair(
10077 "InterleavingBeneficialButDisabled",
10078 "the cost-model indicates that interleaving is beneficial "
10079 "but is explicitly disabled or interleave count is set to 1");
10080 InterleaveLoop = false;
10083 // Override IC if user provided an interleave count.
10084 IC = UserIC > 0 ? UserIC : IC;
10086 // Emit diagnostic messages, if any.
10087 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10088 if (!VectorizeLoop && !InterleaveLoop) {
10089 // Do not vectorize or interleave the loop.
10090 ORE->emit([&]() {
10091 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10092 L->getStartLoc(), L->getHeader())
10093 << VecDiagMsg.second;
10095 ORE->emit([&]() {
10096 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10097 L->getStartLoc(), L->getHeader())
10098 << IntDiagMsg.second;
10100 return false;
10101 } else if (!VectorizeLoop && InterleaveLoop) {
10102 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10103 ORE->emit([&]() {
10104 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10105 L->getStartLoc(), L->getHeader())
10106 << VecDiagMsg.second;
10108 } else if (VectorizeLoop && !InterleaveLoop) {
10109 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10110 << ") in " << DebugLocStr << '\n');
10111 ORE->emit([&]() {
10112 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10113 L->getStartLoc(), L->getHeader())
10114 << IntDiagMsg.second;
10116 } else if (VectorizeLoop && InterleaveLoop) {
10117 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10118 << ") in " << DebugLocStr << '\n');
10119 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10122 bool DisableRuntimeUnroll = false;
10123 MDNode *OrigLoopID = L->getLoopID();
10125 using namespace ore;
10126 if (!VectorizeLoop) {
10127 assert(IC > 1 && "interleave count should not be 1 or 0");
10128 // If we decided that it is not legal to vectorize the loop, then
10129 // interleave it.
10130 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10131 &CM, BFI, PSI, Checks);
10133 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10134 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10136 ORE->emit([&]() {
10137 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10138 L->getHeader())
10139 << "interleaved loop (interleaved count: "
10140 << NV("InterleaveCount", IC) << ")";
10142 } else {
10143 // If we decided that it is *legal* to vectorize the loop, then do it.
10145 // Consider vectorizing the epilogue too if it's profitable.
10146 VectorizationFactor EpilogueVF =
10147 LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
10148 if (EpilogueVF.Width.isVector()) {
10150 // The first pass vectorizes the main loop and creates a scalar epilogue
10151 // to be vectorized by executing the plan (potentially with a different
10152 // factor) again shortly afterwards.
10153 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10154 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10155 EPI, &LVL, &CM, BFI, PSI, Checks);
10157 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10158 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10159 EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true);
10160 ++LoopsVectorized;
10162 // Second pass vectorizes the epilogue and adjusts the control flow
10163 // edges from the first pass.
10164 EPI.MainLoopVF = EPI.EpilogueVF;
10165 EPI.MainLoopUF = EPI.EpilogueUF;
10166 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10167 ORE, EPI, &LVL, &CM, BFI, PSI,
10168 Checks);
10170 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10171 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10172 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10173 Header->setName("vec.epilog.vector.body");
10175 // Re-use the trip count and steps expanded for the main loop, as
10176 // skeleton creation needs them as values that dominate both the scalar
10177 // and vector epilogue loops.
10178 // TODO: This is a workaround needed for epilogue vectorization and it
10179 // should be removed once induction resume value creation is done
10180 // directly in VPlan.
10181 EpilogILV.setTripCount(MainILV.getTripCount());
10182 for (auto &R : make_early_inc_range(*BestEpiPlan.getPreheader())) {
10183 auto *ExpandR = cast<VPExpandSCEVRecipe>(&R);
10184 auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
10185 ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10186 ExpandR->replaceAllUsesWith(ExpandedVal);
10187 ExpandR->eraseFromParent();
10190 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10191 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10192 // before vectorizing the epilogue loop.
10193 for (VPRecipeBase &R : Header->phis()) {
10194 if (isa<VPCanonicalIVPHIRecipe>(&R))
10195 continue;
10197 Value *ResumeV = nullptr;
10198 // TODO: Move setting of resume values to prepareToExecute.
10199 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10200 ResumeV = ReductionResumeValues
10201 .find(&ReductionPhi->getRecurrenceDescriptor())
10202 ->second;
10203 } else {
10204 // Create induction resume values for both widened pointer and
10205 // integer/fp inductions and update the start value of the induction
10206 // recipes to use the resume value.
10207 PHINode *IndPhi = nullptr;
10208 const InductionDescriptor *ID;
10209 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10210 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10211 ID = &Ind->getInductionDescriptor();
10212 } else {
10213 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10214 IndPhi = WidenInd->getPHINode();
10215 ID = &WidenInd->getInductionDescriptor();
10218 ResumeV = MainILV.createInductionResumeValue(
10219 IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
10220 {EPI.MainLoopIterationCountCheck});
10222 assert(ResumeV && "Must have a resume value");
10223 VPValue *StartVal = BestEpiPlan.getVPValueOrAddLiveIn(ResumeV);
10224 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10227 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10228 DT, true, &ExpandedSCEVs);
10229 ++LoopsEpilogueVectorized;
10231 if (!MainILV.areSafetyChecksAdded())
10232 DisableRuntimeUnroll = true;
10233 } else {
10234 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10235 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10236 PSI, Checks);
10238 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10239 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10240 ++LoopsVectorized;
10242 // Add metadata to disable runtime unrolling of the scalar loop when there
10243 // are no runtime checks about strides and memory. A scalar loop that is
10244 // rarely used is not worth unrolling.
10245 if (!LB.areSafetyChecksAdded())
10246 DisableRuntimeUnroll = true;
10248 // Report the vectorization decision.
10249 reportVectorization(ORE, L, VF, IC);
10252 if (ORE->allowExtraAnalysis(LV_NAME))
10253 checkMixedPrecision(L, ORE);
10256 std::optional<MDNode *> RemainderLoopID =
10257 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10258 LLVMLoopVectorizeFollowupEpilogue});
10259 if (RemainderLoopID) {
10260 L->setLoopID(*RemainderLoopID);
10261 } else {
10262 if (DisableRuntimeUnroll)
10263 AddRuntimeUnrollDisableMetaData(L);
10265 // Mark the loop as already vectorized to avoid vectorizing again.
10266 Hints.setAlreadyVectorized();
10269 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10270 return true;
10273 LoopVectorizeResult LoopVectorizePass::runImpl(
10274 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10275 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10276 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10277 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10278 SE = &SE_;
10279 LI = &LI_;
10280 TTI = &TTI_;
10281 DT = &DT_;
10282 BFI = BFI_;
10283 TLI = TLI_;
10284 AC = &AC_;
10285 LAIs = &LAIs_;
10286 DB = &DB_;
10287 ORE = &ORE_;
10288 PSI = PSI_;
10290 // Don't attempt if
10291 // 1. the target claims to have no vector registers, and
10292 // 2. interleaving won't help ILP.
10294 // The second condition is necessary because, even if the target has no
10295 // vector registers, loop vectorization may still enable scalar
10296 // interleaving.
10297 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10298 TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
10299 return LoopVectorizeResult(false, false);
10301 bool Changed = false, CFGChanged = false;
10303 // The vectorizer requires loops to be in simplified form.
10304 // Since simplification may add new inner loops, it has to run before the
10305 // legality and profitability checks. This means running the loop vectorizer
10306 // will simplify all loops, regardless of whether anything ends up being
10307 // vectorized.
10308 for (const auto &L : *LI)
10309 Changed |= CFGChanged |=
10310 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10312 // Build up a worklist of inner-loops to vectorize. This is necessary as
10313 // the act of vectorizing or partially unrolling a loop creates new loops
10314 // and can invalidate iterators across the loops.
10315 SmallVector<Loop *, 8> Worklist;
10317 for (Loop *L : *LI)
10318 collectSupportedLoops(*L, LI, ORE, Worklist);
10320 LoopsAnalyzed += Worklist.size();
10322 // Now walk the identified inner loops.
10323 while (!Worklist.empty()) {
10324 Loop *L = Worklist.pop_back_val();
10326 // For the inner loops we actually process, form LCSSA to simplify the
10327 // transform.
10328 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10330 Changed |= CFGChanged |= processLoop(L);
10332 if (Changed) {
10333 LAIs->clear();
10335 #ifndef NDEBUG
10336 if (VerifySCEV)
10337 SE->verify();
10338 #endif
10342 // Process each loop nest in the function.
10343 return LoopVectorizeResult(Changed, CFGChanged);
10346 PreservedAnalyses LoopVectorizePass::run(Function &F,
10347 FunctionAnalysisManager &AM) {
10348 auto &LI = AM.getResult<LoopAnalysis>(F);
10349 // There are no loops in the function. Return before computing other expensive
10350 // analyses.
10351 if (LI.empty())
10352 return PreservedAnalyses::all();
10353 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10354 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10355 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10356 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10357 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10358 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10359 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10361 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10362 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10363 ProfileSummaryInfo *PSI =
10364 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10365 BlockFrequencyInfo *BFI = nullptr;
10366 if (PSI && PSI->hasProfileSummary())
10367 BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
10368 LoopVectorizeResult Result =
10369 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10370 if (!Result.MadeAnyChange)
10371 return PreservedAnalyses::all();
10372 PreservedAnalyses PA;
10374 if (isAssignmentTrackingEnabled(*F.getParent())) {
10375 for (auto &BB : F)
10376 RemoveRedundantDbgInstrs(&BB);
10379 // We currently do not preserve loopinfo/dominator analyses with outer loop
10380 // vectorization. Until this is addressed, mark these analyses as preserved
10381 // only for the non-VPlan-native path.
10382 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10383 if (!EnableVPlanNativePath) {
10384 PA.preserve<LoopAnalysis>();
10385 PA.preserve<DominatorTreeAnalysis>();
10386 PA.preserve<ScalarEvolutionAnalysis>();
10389 if (Result.MadeCFGChange) {
10390 // Making CFG changes likely means a loop got vectorized. Indicate that
10391 // extra simplification passes should be run.
10392 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10393 // be run if runtime checks have been added.
10394 AM.getResult<ShouldRunExtraVectorPasses>(F);
10395 PA.preserve<ShouldRunExtraVectorPasses>();
10396 } else {
10397 PA.preserveSet<CFGAnalyses>();
10399 return PA;
10402 void LoopVectorizePass::printPipeline(
10403 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10404 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10405 OS, MapClassName2PassName);
10407 OS << '<';
10408 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10409 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10410 OS << '>';
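// With the default options this renders the pass as roughly
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
// in a textual pass pipeline.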